linux/lib/radix-tree.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2001 Momchil Velikov
   4 * Portions Copyright (C) 2001 Christoph Hellwig
   5 * Copyright (C) 2005 SGI, Christoph Lameter
   6 * Copyright (C) 2006 Nick Piggin
   7 * Copyright (C) 2012 Konstantin Khlebnikov
   8 * Copyright (C) 2016 Intel, Matthew Wilcox
   9 * Copyright (C) 2016 Intel, Ross Zwisler
  10 */
  11
  12#include <linux/bitmap.h>
  13#include <linux/bitops.h>
  14#include <linux/bug.h>
  15#include <linux/cpu.h>
  16#include <linux/errno.h>
  17#include <linux/export.h>
  18#include <linux/idr.h>
  19#include <linux/init.h>
  20#include <linux/kernel.h>
  21#include <linux/kmemleak.h>
  22#include <linux/percpu.h>
  23#include <linux/preempt.h>              /* in_interrupt() */
  24#include <linux/radix-tree.h>
  25#include <linux/rcupdate.h>
  26#include <linux/slab.h>
  27#include <linux/string.h>
  28#include <linux/xarray.h>
  29
  30
  31/*
  32 * Radix tree node cache.
  33 */
  34struct kmem_cache *radix_tree_node_cachep;
  35
  36/*
  37 * The radix tree is variable-height, so an insert operation not only has
  38 * to build the branch to its corresponding item, it also has to build the
  39 * branch to existing items if the size has to be increased (by
  40 * radix_tree_extend).
  41 *
  42 * The worst case is a zero height tree with just a single item at index 0,
  43 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
  44 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
  45 * Hence:
  46 */
  47#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
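/*
 * A worked value, assuming a 64-bit kernel with the default
 * RADIX_TREE_MAP_SHIFT of 6: RADIX_TREE_MAX_PATH is DIV_ROUND_UP(64, 6)
 * == 11, so up to 2 * 11 - 1 == 21 nodes are preloaded per CPU.
 */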
  48
  49/*
  50 * The IDR does not have to be as high as the radix tree since it uses
  51 * signed integers, not unsigned longs.
  52 */
  53#define IDR_INDEX_BITS          (8 /* CHAR_BIT */ * sizeof(int) - 1)
  54#define IDR_MAX_PATH            (DIV_ROUND_UP(IDR_INDEX_BITS, \
  55                                                RADIX_TREE_MAP_SHIFT))
  56#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)
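/*
 * With the same assumptions as above: IDR_INDEX_BITS is 31, so
 * IDR_MAX_PATH is DIV_ROUND_UP(31, 6) == 6 and IDR_PRELOAD_SIZE == 11.
 */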
  57
  58/*
  59 * The IDA is even shorter since it uses a bitmap at the last level.
  60 */
  61#define IDA_INDEX_BITS          (8 * sizeof(int) - 1 - ilog2(IDA_BITMAP_BITS))
  62#define IDA_MAX_PATH            (DIV_ROUND_UP(IDA_INDEX_BITS, \
  63                                                RADIX_TREE_MAP_SHIFT))
  64#define IDA_PRELOAD_SIZE        (IDA_MAX_PATH * 2 - 1)
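/*
 * With the same assumptions, and assuming IDA_BITMAP_BITS == 1024:
 * IDA_INDEX_BITS is 31 - 10 == 21, so IDA_MAX_PATH is 4 and
 * IDA_PRELOAD_SIZE == 7.
 */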
  65
  66/*
  67 * Per-cpu pool of preloaded nodes
  68 */
  69struct radix_tree_preload {
  70        unsigned nr;
  71        /* nodes->parent points to next preallocated node */
  72        struct radix_tree_node *nodes;
  73};
  74static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
  75
  76static inline struct radix_tree_node *entry_to_node(void *ptr)
  77{
  78        return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
  79}
  80
  81static inline void *node_to_entry(void *ptr)
  82{
  83        return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
  84}
  85
  86#define RADIX_TREE_RETRY        XA_RETRY_ENTRY
  87
  88static inline unsigned long
  89get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
  90{
  91        return parent ? slot - parent->slots : 0;
  92}
  93
  94static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
  95                        struct radix_tree_node **nodep, unsigned long index)
  96{
  97        unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
  98        void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);
  99
 100        *nodep = (void *)entry;
 101        return offset;
 102}
 103
 104static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
 105{
 106        return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK);
 107}
 108
 109static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
 110                int offset)
 111{
 112        __set_bit(offset, node->tags[tag]);
 113}
 114
 115static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
 116                int offset)
 117{
 118        __clear_bit(offset, node->tags[tag]);
 119}
 120
 121static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
 122                int offset)
 123{
 124        return test_bit(offset, node->tags[tag]);
 125}
 126
 127static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
 128{
 129        root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
 130}
 131
 132static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
 133{
 134        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
 135}
 136
 137static inline void root_tag_clear_all(struct radix_tree_root *root)
 138{
 139        root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
 140}
 141
 142static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
 143{
 144        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
 145}
 146
 147static inline unsigned root_tags_get(const struct radix_tree_root *root)
 148{
 149        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
 150}
 151
 152static inline bool is_idr(const struct radix_tree_root *root)
 153{
 154        return !!(root->xa_flags & ROOT_IS_IDR);
 155}
 156
 157/*
 158 * Returns 1 if any slot in the node has this tag set.
 159 * Otherwise returns 0.
 160 */
 161static inline int any_tag_set(const struct radix_tree_node *node,
 162                                                        unsigned int tag)
 163{
 164        unsigned idx;
 165        for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
 166                if (node->tags[tag][idx])
 167                        return 1;
 168        }
 169        return 0;
 170}
 171
 172static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
 173{
 174        bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE);
 175}
 176
 177/**
 178 * radix_tree_find_next_bit - find the next set bit in a memory region
 179 *
  180 * @node: where to begin the search
  181 * @tag: the tag index
 182 * @offset: The bitnumber to start searching at
 183 *
 184 * Unrollable variant of find_next_bit() for constant size arrays.
 185 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 186 * Returns next bit offset, or size if nothing found.
 187 */
 188static __always_inline unsigned long
 189radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
 190                         unsigned long offset)
 191{
 192        const unsigned long *addr = node->tags[tag];
 193
 194        if (offset < RADIX_TREE_MAP_SIZE) {
 195                unsigned long tmp;
 196
 197                addr += offset / BITS_PER_LONG;
 198                tmp = *addr >> (offset % BITS_PER_LONG);
 199                if (tmp)
 200                        return __ffs(tmp) + offset;
 201                offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
 202                while (offset < RADIX_TREE_MAP_SIZE) {
 203                        tmp = *++addr;
 204                        if (tmp)
 205                                return __ffs(tmp) + offset;
 206                        offset += BITS_PER_LONG;
 207                }
 208        }
 209        return RADIX_TREE_MAP_SIZE;
 210}
 211
 212static unsigned int iter_offset(const struct radix_tree_iter *iter)
 213{
 214        return iter->index & RADIX_TREE_MAP_MASK;
 215}
 216
 217/*
 218 * The maximum index which can be stored in a radix tree
 219 */
 220static inline unsigned long shift_maxindex(unsigned int shift)
 221{
 222        return (RADIX_TREE_MAP_SIZE << shift) - 1;
 223}
 224
 225static inline unsigned long node_maxindex(const struct radix_tree_node *node)
 226{
 227        return shift_maxindex(node->shift);
 228}
 229
 230static unsigned long next_index(unsigned long index,
 231                                const struct radix_tree_node *node,
 232                                unsigned long offset)
 233{
 234        return (index & ~node_maxindex(node)) + (offset << node->shift);
 235}
 236
 237/*
 238 * This assumes that the caller has performed appropriate preallocation, and
 239 * that the caller has pinned this thread of control to the current CPU.
 240 */
 241static struct radix_tree_node *
 242radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
 243                        struct radix_tree_root *root,
 244                        unsigned int shift, unsigned int offset,
 245                        unsigned int count, unsigned int nr_values)
 246{
 247        struct radix_tree_node *ret = NULL;
 248
 249        /*
 250         * Preload code isn't irq safe and it doesn't make sense to use
 251         * preloading during an interrupt anyway as all the allocations have
 252         * to be atomic. So just do normal allocation when in interrupt.
 253         */
 254        if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
 255                struct radix_tree_preload *rtp;
 256
 257                /*
 258                 * Even if the caller has preloaded, try to allocate from the
 259                 * cache first for the new node to get accounted to the memory
 260                 * cgroup.
 261                 */
 262                ret = kmem_cache_alloc(radix_tree_node_cachep,
 263                                       gfp_mask | __GFP_NOWARN);
 264                if (ret)
 265                        goto out;
 266
 267                /*
 268                 * Provided the caller has preloaded here, we will always
 269                 * succeed in getting a node here (and never reach
 270                 * kmem_cache_alloc)
 271                 */
 272                rtp = this_cpu_ptr(&radix_tree_preloads);
 273                if (rtp->nr) {
 274                        ret = rtp->nodes;
 275                        rtp->nodes = ret->parent;
 276                        rtp->nr--;
 277                }
 278                /*
 279                 * Update the allocation stack trace as this is more useful
 280                 * for debugging.
 281                 */
 282                kmemleak_update_trace(ret);
 283                goto out;
 284        }
 285        ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 286out:
 287        BUG_ON(radix_tree_is_internal_node(ret));
 288        if (ret) {
 289                ret->shift = shift;
 290                ret->offset = offset;
 291                ret->count = count;
 292                ret->nr_values = nr_values;
 293                ret->parent = parent;
 294                ret->array = root;
 295        }
 296        return ret;
 297}
 298
 299void radix_tree_node_rcu_free(struct rcu_head *head)
 300{
 301        struct radix_tree_node *node =
 302                        container_of(head, struct radix_tree_node, rcu_head);
 303
 304        /*
 305         * Must only free zeroed nodes into the slab.  We can be left with
 306         * non-NULL entries by radix_tree_free_nodes, so clear the entries
 307         * and tags here.
 308         */
 309        memset(node->slots, 0, sizeof(node->slots));
 310        memset(node->tags, 0, sizeof(node->tags));
 311        INIT_LIST_HEAD(&node->private_list);
 312
 313        kmem_cache_free(radix_tree_node_cachep, node);
 314}
 315
 316static inline void
 317radix_tree_node_free(struct radix_tree_node *node)
 318{
 319        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 320}
 321
 322/*
 323 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 324 * ensure that the addition of a single element in the tree cannot fail.  On
 325 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 326 * with preemption not disabled.
 327 *
 328 * To make use of this facility, the radix tree must be initialised without
 329 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 330 */
 331static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
 332{
 333        struct radix_tree_preload *rtp;
 334        struct radix_tree_node *node;
 335        int ret = -ENOMEM;
 336
 337        /*
  338         * Nodes preloaded by one cgroup can be used by another cgroup, so
 339         * they should never be accounted to any particular memory cgroup.
 340         */
 341        gfp_mask &= ~__GFP_ACCOUNT;
 342
 343        preempt_disable();
 344        rtp = this_cpu_ptr(&radix_tree_preloads);
 345        while (rtp->nr < nr) {
 346                preempt_enable();
 347                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 348                if (node == NULL)
 349                        goto out;
 350                preempt_disable();
 351                rtp = this_cpu_ptr(&radix_tree_preloads);
 352                if (rtp->nr < nr) {
 353                        node->parent = rtp->nodes;
 354                        rtp->nodes = node;
 355                        rtp->nr++;
 356                } else {
 357                        kmem_cache_free(radix_tree_node_cachep, node);
 358                }
 359        }
 360        ret = 0;
 361out:
 362        return ret;
 363}
 364
 365/*
 366 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 367 * ensure that the addition of a single element in the tree cannot fail.  On
 368 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 369 * with preemption not disabled.
 370 *
 371 * To make use of this facility, the radix tree must be initialised without
 372 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 373 */
 374int radix_tree_preload(gfp_t gfp_mask)
 375{
 376        /* Warn on non-sensical use... */
 377        WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
 378        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
 379}
 380EXPORT_SYMBOL(radix_tree_preload);
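/*
 * Typical calling pattern (a sketch only; the tree, lock and error
 * handling below are illustrative, not mandated by this file):
 *
 *	if (radix_tree_preload(GFP_KERNEL))
 *		return -ENOMEM;
 *	spin_lock(&mylock);
 *	err = radix_tree_insert(&mytree, index, item);
 *	spin_unlock(&mylock);
 *	radix_tree_preload_end();
 */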
 381
 382/*
 383 * The same as above function, except we don't guarantee preloading happens.
 384 * We do it, if we decide it helps. On success, return zero with preemption
 385 * disabled. On error, return -ENOMEM with preemption not disabled.
 386 */
 387int radix_tree_maybe_preload(gfp_t gfp_mask)
 388{
 389        if (gfpflags_allow_blocking(gfp_mask))
 390                return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
 391        /* Preloading doesn't help anything with this gfp mask, skip it */
 392        preempt_disable();
 393        return 0;
 394}
 395EXPORT_SYMBOL(radix_tree_maybe_preload);
 396
 397static unsigned radix_tree_load_root(const struct radix_tree_root *root,
 398                struct radix_tree_node **nodep, unsigned long *maxindex)
 399{
 400        struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
 401
 402        *nodep = node;
 403
 404        if (likely(radix_tree_is_internal_node(node))) {
 405                node = entry_to_node(node);
 406                *maxindex = node_maxindex(node);
 407                return node->shift + RADIX_TREE_MAP_SHIFT;
 408        }
 409
 410        *maxindex = 0;
 411        return 0;
 412}
 413
 414/*
 415 *      Extend a radix tree so it can store key @index.
 416 */
 417static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
 418                                unsigned long index, unsigned int shift)
 419{
 420        void *entry;
 421        unsigned int maxshift;
 422        int tag;
 423
 424        /* Figure out what the shift should be.  */
 425        maxshift = shift;
 426        while (index > shift_maxindex(maxshift))
 427                maxshift += RADIX_TREE_MAP_SHIFT;
 428
 429        entry = rcu_dereference_raw(root->xa_head);
 430        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
 431                goto out;
 432
 433        do {
 434                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
 435                                                        root, shift, 0, 1, 0);
 436                if (!node)
 437                        return -ENOMEM;
 438
 439                if (is_idr(root)) {
 440                        all_tag_set(node, IDR_FREE);
 441                        if (!root_tag_get(root, IDR_FREE)) {
 442                                tag_clear(node, IDR_FREE, 0);
 443                                root_tag_set(root, IDR_FREE);
 444                        }
 445                } else {
 446                        /* Propagate the aggregated tag info to the new child */
 447                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 448                                if (root_tag_get(root, tag))
 449                                        tag_set(node, tag, 0);
 450                        }
 451                }
 452
 453                BUG_ON(shift > BITS_PER_LONG);
 454                if (radix_tree_is_internal_node(entry)) {
 455                        entry_to_node(entry)->parent = node;
 456                } else if (xa_is_value(entry)) {
 457                        /* Moving a value entry root->xa_head to a node */
 458                        node->nr_values = 1;
 459                }
 460                /*
 461                 * entry was already in the radix tree, so we do not need
 462                 * rcu_assign_pointer here
 463                 */
 464                node->slots[0] = (void __rcu *)entry;
 465                entry = node_to_entry(node);
 466                rcu_assign_pointer(root->xa_head, entry);
 467                shift += RADIX_TREE_MAP_SHIFT;
 468        } while (shift <= maxshift);
 469out:
 470        return maxshift + RADIX_TREE_MAP_SHIFT;
 471}
 472
 473/**
 474 *      radix_tree_shrink    -    shrink radix tree to minimum height
  475 *      @root:          radix tree root
 476 */
 477static inline bool radix_tree_shrink(struct radix_tree_root *root)
 478{
 479        bool shrunk = false;
 480
 481        for (;;) {
 482                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
 483                struct radix_tree_node *child;
 484
 485                if (!radix_tree_is_internal_node(node))
 486                        break;
 487                node = entry_to_node(node);
 488
 489                /*
  490         * If the candidate node has more than one child, or its child
  491         * is not at the leftmost slot, we cannot shrink.
 492                 */
 493                if (node->count != 1)
 494                        break;
 495                child = rcu_dereference_raw(node->slots[0]);
 496                if (!child)
 497                        break;
 498
 499                /*
 500                 * For an IDR, we must not shrink entry 0 into the root in
 501                 * case somebody calls idr_replace() with a pointer that
 502                 * appears to be an internal entry
 503                 */
 504                if (!node->shift && is_idr(root))
 505                        break;
 506
 507                if (radix_tree_is_internal_node(child))
 508                        entry_to_node(child)->parent = NULL;
 509
 510                /*
 511                 * We don't need rcu_assign_pointer(), since we are simply
 512                 * moving the node from one part of the tree to another: if it
 513                 * was safe to dereference the old pointer to it
 514                 * (node->slots[0]), it will be safe to dereference the new
 515                 * one (root->xa_head) as far as dependent read barriers go.
 516                 */
 517                root->xa_head = (void __rcu *)child;
 518                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
 519                        root_tag_clear(root, IDR_FREE);
 520
 521                /*
 522                 * We have a dilemma here. The node's slot[0] must not be
 523                 * NULLed in case there are concurrent lookups expecting to
 524                 * find the item. However if this was a bottom-level node,
 525                 * then it may be subject to the slot pointer being visible
  526         * to callers dereferencing it. If the item corresponding to
 527                 * slot[0] is subsequently deleted, these callers would expect
 528                 * their slot to become empty sooner or later.
 529                 *
 530                 * For example, lockless pagecache will look up a slot, deref
 531                 * the page pointer, and if the page has 0 refcount it means it
 532                 * was concurrently deleted from pagecache so try the deref
 533                 * again. Fortunately there is already a requirement for logic
 534                 * to retry the entire slot lookup -- the indirect pointer
 535                 * problem (replacing direct root node with an indirect pointer
 536                 * also results in a stale slot). So tag the slot as indirect
 537                 * to force callers to retry.
 538                 */
 539                node->count = 0;
 540                if (!radix_tree_is_internal_node(child)) {
 541                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
 542                }
 543
 544                WARN_ON_ONCE(!list_empty(&node->private_list));
 545                radix_tree_node_free(node);
 546                shrunk = true;
 547        }
 548
 549        return shrunk;
 550}
 551
 552static bool delete_node(struct radix_tree_root *root,
 553                        struct radix_tree_node *node)
 554{
 555        bool deleted = false;
 556
 557        do {
 558                struct radix_tree_node *parent;
 559
 560                if (node->count) {
 561                        if (node_to_entry(node) ==
 562                                        rcu_dereference_raw(root->xa_head))
 563                                deleted |= radix_tree_shrink(root);
 564                        return deleted;
 565                }
 566
 567                parent = node->parent;
 568                if (parent) {
 569                        parent->slots[node->offset] = NULL;
 570                        parent->count--;
 571                } else {
 572                        /*
 573                         * Shouldn't the tags already have all been cleared
 574                         * by the caller?
 575                         */
 576                        if (!is_idr(root))
 577                                root_tag_clear_all(root);
 578                        root->xa_head = NULL;
 579                }
 580
 581                WARN_ON_ONCE(!list_empty(&node->private_list));
 582                radix_tree_node_free(node);
 583                deleted = true;
 584
 585                node = parent;
 586        } while (node);
 587
 588        return deleted;
 589}
 590
 591/**
 592 *      __radix_tree_create     -       create a slot in a radix tree
 593 *      @root:          radix tree root
 594 *      @index:         index key
 595 *      @nodep:         returns node
 596 *      @slotp:         returns slot
 597 *
 598 *      Create, if necessary, and return the node and slot for an item
 599 *      at position @index in the radix tree @root.
 600 *
 601 *      Until there is more than one item in the tree, no nodes are
 602 *      allocated and @root->xa_head is used as a direct slot instead of
 603 *      pointing to a node, in which case *@nodep will be NULL.
 604 *
 605 *      Returns -ENOMEM, or 0 for success.
 606 */
 607static int __radix_tree_create(struct radix_tree_root *root,
 608                unsigned long index, struct radix_tree_node **nodep,
 609                void __rcu ***slotp)
 610{
 611        struct radix_tree_node *node = NULL, *child;
 612        void __rcu **slot = (void __rcu **)&root->xa_head;
 613        unsigned long maxindex;
 614        unsigned int shift, offset = 0;
 615        unsigned long max = index;
 616        gfp_t gfp = root_gfp_mask(root);
 617
 618        shift = radix_tree_load_root(root, &child, &maxindex);
 619
 620        /* Make sure the tree is high enough.  */
 621        if (max > maxindex) {
 622                int error = radix_tree_extend(root, gfp, max, shift);
 623                if (error < 0)
 624                        return error;
 625                shift = error;
 626                child = rcu_dereference_raw(root->xa_head);
 627        }
 628
 629        while (shift > 0) {
 630                shift -= RADIX_TREE_MAP_SHIFT;
 631                if (child == NULL) {
 632                        /* Have to add a child node.  */
 633                        child = radix_tree_node_alloc(gfp, node, root, shift,
 634                                                        offset, 0, 0);
 635                        if (!child)
 636                                return -ENOMEM;
 637                        rcu_assign_pointer(*slot, node_to_entry(child));
 638                        if (node)
 639                                node->count++;
 640                } else if (!radix_tree_is_internal_node(child))
 641                        break;
 642
 643                /* Go a level down */
 644                node = entry_to_node(child);
 645                offset = radix_tree_descend(node, &child, index);
 646                slot = &node->slots[offset];
 647        }
 648
 649        if (nodep)
 650                *nodep = node;
 651        if (slotp)
 652                *slotp = slot;
 653        return 0;
 654}
 655
 656/*
 657 * Free any nodes below this node.  The tree is presumed to not need
 658 * shrinking, and any user data in the tree is presumed to not need a
 659 * destructor called on it.  If we need to add a destructor, we can
 660 * add that functionality later.  Note that we may not clear tags or
 661 * slots from the tree as an RCU walker may still have a pointer into
 662 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 663 * but we'll still have to clear those in rcu_free.
 664 */
 665static void radix_tree_free_nodes(struct radix_tree_node *node)
 666{
 667        unsigned offset = 0;
 668        struct radix_tree_node *child = entry_to_node(node);
 669
 670        for (;;) {
 671                void *entry = rcu_dereference_raw(child->slots[offset]);
 672                if (xa_is_node(entry) && child->shift) {
 673                        child = entry_to_node(entry);
 674                        offset = 0;
 675                        continue;
 676                }
 677                offset++;
 678                while (offset == RADIX_TREE_MAP_SIZE) {
 679                        struct radix_tree_node *old = child;
 680                        offset = child->offset + 1;
 681                        child = child->parent;
 682                        WARN_ON_ONCE(!list_empty(&old->private_list));
 683                        radix_tree_node_free(old);
 684                        if (old == entry_to_node(node))
 685                                return;
 686                }
 687        }
 688}
 689
 690static inline int insert_entries(struct radix_tree_node *node,
 691                void __rcu **slot, void *item, bool replace)
 692{
 693        if (*slot)
 694                return -EEXIST;
 695        rcu_assign_pointer(*slot, item);
 696        if (node) {
 697                node->count++;
 698                if (xa_is_value(item))
 699                        node->nr_values++;
 700        }
 701        return 1;
 702}
 703
 704/**
  705 *      radix_tree_insert    -    insert into a radix tree
 706 *      @root:          radix tree root
 707 *      @index:         index key
 708 *      @item:          item to insert
 709 *
 710 *      Insert an item into the radix tree at position @index.
 711 */
 712int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
 713                        void *item)
 714{
 715        struct radix_tree_node *node;
 716        void __rcu **slot;
 717        int error;
 718
 719        BUG_ON(radix_tree_is_internal_node(item));
 720
 721        error = __radix_tree_create(root, index, &node, &slot);
 722        if (error)
 723                return error;
 724
 725        error = insert_entries(node, slot, item, false);
 726        if (error < 0)
 727                return error;
 728
 729        if (node) {
 730                unsigned offset = get_slot_offset(node, slot);
 731                BUG_ON(tag_get(node, 0, offset));
 732                BUG_ON(tag_get(node, 1, offset));
 733                BUG_ON(tag_get(node, 2, offset));
 734        } else {
 735                BUG_ON(root_tags_get(root));
 736        }
 737
 738        return 0;
 739}
 740EXPORT_SYMBOL(radix_tree_insert);
 741
 742/**
 743 *      __radix_tree_lookup     -       lookup an item in a radix tree
 744 *      @root:          radix tree root
 745 *      @index:         index key
 746 *      @nodep:         returns node
 747 *      @slotp:         returns slot
 748 *
 749 *      Lookup and return the item at position @index in the radix
 750 *      tree @root.
 751 *
 752 *      Until there is more than one item in the tree, no nodes are
 753 *      allocated and @root->xa_head is used as a direct slot instead of
 754 *      pointing to a node, in which case *@nodep will be NULL.
 755 */
 756void *__radix_tree_lookup(const struct radix_tree_root *root,
 757                          unsigned long index, struct radix_tree_node **nodep,
 758                          void __rcu ***slotp)
 759{
 760        struct radix_tree_node *node, *parent;
 761        unsigned long maxindex;
 762        void __rcu **slot;
 763
 764 restart:
 765        parent = NULL;
 766        slot = (void __rcu **)&root->xa_head;
 767        radix_tree_load_root(root, &node, &maxindex);
 768        if (index > maxindex)
 769                return NULL;
 770
 771        while (radix_tree_is_internal_node(node)) {
 772                unsigned offset;
 773
 774                parent = entry_to_node(node);
 775                offset = radix_tree_descend(parent, &node, index);
 776                slot = parent->slots + offset;
 777                if (node == RADIX_TREE_RETRY)
 778                        goto restart;
 779                if (parent->shift == 0)
 780                        break;
 781        }
 782
 783        if (nodep)
 784                *nodep = parent;
 785        if (slotp)
 786                *slotp = slot;
 787        return node;
 788}
 789
 790/**
 791 *      radix_tree_lookup_slot    -    lookup a slot in a radix tree
 792 *      @root:          radix tree root
 793 *      @index:         index key
 794 *
 795 *      Returns:  the slot corresponding to the position @index in the
 796 *      radix tree @root. This is useful for update-if-exists operations.
 797 *
 798 *      This function can be called under rcu_read_lock iff the slot is not
 799 *      modified by radix_tree_replace_slot, otherwise it must be called
 800 *      exclusive from other writers. Any dereference of the slot must be done
 801 *      using radix_tree_deref_slot.
 802 */
 803void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
 804                                unsigned long index)
 805{
 806        void __rcu **slot;
 807
 808        if (!__radix_tree_lookup(root, index, NULL, &slot))
 809                return NULL;
 810        return slot;
 811}
 812EXPORT_SYMBOL(radix_tree_lookup_slot);
 813
 814/**
 815 *      radix_tree_lookup    -    perform lookup operation on a radix tree
 816 *      @root:          radix tree root
 817 *      @index:         index key
 818 *
 819 *      Lookup the item at the position @index in the radix tree @root.
 820 *
 821 *      This function can be called under rcu_read_lock, however the caller
 822 *      must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 823 *      them safely). No RCU barriers are required to access or modify the
 824 *      returned item, however.
 825 */
 826void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
 827{
 828        return __radix_tree_lookup(root, index, NULL, NULL);
 829}
 830EXPORT_SYMBOL(radix_tree_lookup);
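/*
 * Example of a lockless lookup (a sketch; assumes the caller arranges for
 * the returned item to stay valid, e.g. by freeing items via RCU):
 *
 *	rcu_read_lock();
 *	item = radix_tree_lookup(&mytree, index);
 *	if (item)
 *		use(item);
 *	rcu_read_unlock();
 */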
 831
 832static void replace_slot(void __rcu **slot, void *item,
 833                struct radix_tree_node *node, int count, int values)
 834{
 835        if (node && (count || values)) {
 836                node->count += count;
 837                node->nr_values += values;
 838        }
 839
 840        rcu_assign_pointer(*slot, item);
 841}
 842
 843static bool node_tag_get(const struct radix_tree_root *root,
 844                                const struct radix_tree_node *node,
 845                                unsigned int tag, unsigned int offset)
 846{
 847        if (node)
 848                return tag_get(node, tag, offset);
 849        return root_tag_get(root, tag);
 850}
 851
 852/*
 853 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 854 * free, don't adjust the count, even if it's transitioning between NULL and
 855 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 856 * have empty bits, but it only stores NULL in slots when they're being
 857 * deleted.
 858 */
 859static int calculate_count(struct radix_tree_root *root,
 860                                struct radix_tree_node *node, void __rcu **slot,
 861                                void *item, void *old)
 862{
 863        if (is_idr(root)) {
 864                unsigned offset = get_slot_offset(node, slot);
 865                bool free = node_tag_get(root, node, IDR_FREE, offset);
 866                if (!free)
 867                        return 0;
 868                if (!old)
 869                        return 1;
 870        }
 871        return !!item - !!old;
 872}
 873
 874/**
 875 * __radix_tree_replace         - replace item in a slot
 876 * @root:               radix tree root
 877 * @node:               pointer to tree node
 878 * @slot:               pointer to slot in @node
 879 * @item:               new item to store in the slot.
 880 *
 881 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 882 * across slot lookup and replacement.
 883 */
 884void __radix_tree_replace(struct radix_tree_root *root,
 885                          struct radix_tree_node *node,
 886                          void __rcu **slot, void *item)
 887{
 888        void *old = rcu_dereference_raw(*slot);
 889        int values = !!xa_is_value(item) - !!xa_is_value(old);
 890        int count = calculate_count(root, node, slot, item, old);
 891
 892        /*
 893         * This function supports replacing value entries and
 894         * deleting entries, but that needs accounting against the
 895         * node unless the slot is root->xa_head.
 896         */
 897        WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) &&
 898                        (count || values));
 899        replace_slot(slot, item, node, count, values);
 900
 901        if (!node)
 902                return;
 903
 904        delete_node(root, node);
 905}
 906
 907/**
 908 * radix_tree_replace_slot      - replace item in a slot
 909 * @root:       radix tree root
 910 * @slot:       pointer to slot
 911 * @item:       new item to store in the slot.
 912 *
 913 * For use with radix_tree_lookup_slot() and
 914 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 915 * across slot lookup and replacement.
 916 *
 917 * NOTE: This cannot be used to switch between non-entries (empty slots),
 918 * regular entries, and value entries, as that requires accounting
 919 * inside the radix tree node. When switching from one type of entry or
 920 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
 921 * radix_tree_iter_replace().
 922 */
 923void radix_tree_replace_slot(struct radix_tree_root *root,
 924                             void __rcu **slot, void *item)
 925{
 926        __radix_tree_replace(root, NULL, slot, item);
 927}
 928EXPORT_SYMBOL(radix_tree_replace_slot);
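/*
 * Example of an update-if-exists sequence (a sketch; the caller must hold
 * the tree write locked across both calls, and new_item must be a regular
 * entry as per the NOTE above):
 *
 *	slot = radix_tree_lookup_slot(&mytree, index);
 *	if (slot)
 *		radix_tree_replace_slot(&mytree, slot, new_item);
 */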
 929
 930/**
 931 * radix_tree_iter_replace - replace item in a slot
 932 * @root:       radix tree root
 933 * @slot:       pointer to slot
 934 * @item:       new item to store in the slot.
 935 *
 936 * For use with radix_tree_for_each_slot().
 937 * Caller must hold tree write locked.
 938 */
 939void radix_tree_iter_replace(struct radix_tree_root *root,
 940                                const struct radix_tree_iter *iter,
 941                                void __rcu **slot, void *item)
 942{
 943        __radix_tree_replace(root, iter->node, slot, item);
 944}
 945
 946static void node_tag_set(struct radix_tree_root *root,
 947                                struct radix_tree_node *node,
 948                                unsigned int tag, unsigned int offset)
 949{
 950        while (node) {
 951                if (tag_get(node, tag, offset))
 952                        return;
 953                tag_set(node, tag, offset);
 954                offset = node->offset;
 955                node = node->parent;
 956        }
 957
 958        if (!root_tag_get(root, tag))
 959                root_tag_set(root, tag);
 960}
 961
 962/**
 963 *      radix_tree_tag_set - set a tag on a radix tree node
 964 *      @root:          radix tree root
 965 *      @index:         index key
 966 *      @tag:           tag index
 967 *
 968 *      Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 969 *      corresponding to @index in the radix tree.  From
 970 *      the root all the way down to the leaf node.
 971 *
 972 *      Returns the address of the tagged item.  Setting a tag on a not-present
 973 *      item is a bug.
 974 */
 975void *radix_tree_tag_set(struct radix_tree_root *root,
 976                        unsigned long index, unsigned int tag)
 977{
 978        struct radix_tree_node *node, *parent;
 979        unsigned long maxindex;
 980
 981        radix_tree_load_root(root, &node, &maxindex);
 982        BUG_ON(index > maxindex);
 983
 984        while (radix_tree_is_internal_node(node)) {
 985                unsigned offset;
 986
 987                parent = entry_to_node(node);
 988                offset = radix_tree_descend(parent, &node, index);
 989                BUG_ON(!node);
 990
 991                if (!tag_get(parent, tag, offset))
 992                        tag_set(parent, tag, offset);
 993        }
 994
 995        /* set the root's tag bit */
 996        if (!root_tag_get(root, tag))
 997                root_tag_set(root, tag);
 998
 999        return node;
1000}
1001EXPORT_SYMBOL(radix_tree_tag_set);
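/*
 * Example (a sketch; MY_TAG stands for any tag index below
 * RADIX_TREE_MAX_TAGS): tag an existing entry, then collect every entry
 * carrying that tag:
 *
 *	radix_tree_tag_set(&mytree, index, MY_TAG);
 *	...
 *	n = radix_tree_gang_lookup_tag(&mytree, results, 0,
 *				       ARRAY_SIZE(results), MY_TAG);
 */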
1002
1003static void node_tag_clear(struct radix_tree_root *root,
1004                                struct radix_tree_node *node,
1005                                unsigned int tag, unsigned int offset)
1006{
1007        while (node) {
1008                if (!tag_get(node, tag, offset))
1009                        return;
1010                tag_clear(node, tag, offset);
1011                if (any_tag_set(node, tag))
1012                        return;
1013
1014                offset = node->offset;
1015                node = node->parent;
1016        }
1017
1018        /* clear the root's tag bit */
1019        if (root_tag_get(root, tag))
1020                root_tag_clear(root, tag);
1021}
1022
1023/**
1024 *      radix_tree_tag_clear - clear a tag on a radix tree node
1025 *      @root:          radix tree root
1026 *      @index:         index key
1027 *      @tag:           tag index
1028 *
1029 *      Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
1030 *      corresponding to @index in the radix tree.  If this causes
1031 *      the leaf node to have no tags set then clear the tag in the
1032 *      next-to-leaf node, etc.
1033 *
1034 *      Returns the address of the tagged item on success, else NULL.  ie:
1035 *      has the same return value and semantics as radix_tree_lookup().
1036 */
1037void *radix_tree_tag_clear(struct radix_tree_root *root,
1038                        unsigned long index, unsigned int tag)
1039{
1040        struct radix_tree_node *node, *parent;
1041        unsigned long maxindex;
1042        int uninitialized_var(offset);
1043
1044        radix_tree_load_root(root, &node, &maxindex);
1045        if (index > maxindex)
1046                return NULL;
1047
1048        parent = NULL;
1049
1050        while (radix_tree_is_internal_node(node)) {
1051                parent = entry_to_node(node);
1052                offset = radix_tree_descend(parent, &node, index);
1053        }
1054
1055        if (node)
1056                node_tag_clear(root, parent, tag, offset);
1057
1058        return node;
1059}
1060EXPORT_SYMBOL(radix_tree_tag_clear);
1061
1062/**
 1063 * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
 1064 * @root: radix tree root
 1065 * @iter: iterator state
 1066 * @tag: tag to clear
 1067 */
1068void radix_tree_iter_tag_clear(struct radix_tree_root *root,
1069                        const struct radix_tree_iter *iter, unsigned int tag)
1070{
1071        node_tag_clear(root, iter->node, tag, iter_offset(iter));
1072}
1073
1074/**
1075 * radix_tree_tag_get - get a tag on a radix tree node
1076 * @root:               radix tree root
1077 * @index:              index key
1078 * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
1079 *
1080 * Return values:
1081 *
1082 *  0: tag not present or not set
1083 *  1: tag set
1084 *
1085 * Note that the return value of this function may not be relied on, even if
1086 * the RCU lock is held, unless tag modification and node deletion are excluded
1087 * from concurrency.
1088 */
1089int radix_tree_tag_get(const struct radix_tree_root *root,
1090                        unsigned long index, unsigned int tag)
1091{
1092        struct radix_tree_node *node, *parent;
1093        unsigned long maxindex;
1094
1095        if (!root_tag_get(root, tag))
1096                return 0;
1097
1098        radix_tree_load_root(root, &node, &maxindex);
1099        if (index > maxindex)
1100                return 0;
1101
1102        while (radix_tree_is_internal_node(node)) {
1103                unsigned offset;
1104
1105                parent = entry_to_node(node);
1106                offset = radix_tree_descend(parent, &node, index);
1107
1108                if (!tag_get(parent, tag, offset))
1109                        return 0;
1110                if (node == RADIX_TREE_RETRY)
1111                        break;
1112        }
1113
1114        return 1;
1115}
1116EXPORT_SYMBOL(radix_tree_tag_get);
1117
1118/* Construct iter->tags bit-mask from node->tags[tag] array */
1119static void set_iter_tags(struct radix_tree_iter *iter,
1120                                struct radix_tree_node *node, unsigned offset,
1121                                unsigned tag)
1122{
1123        unsigned tag_long = offset / BITS_PER_LONG;
1124        unsigned tag_bit  = offset % BITS_PER_LONG;
1125
1126        if (!node) {
1127                iter->tags = 1;
1128                return;
1129        }
1130
1131        iter->tags = node->tags[tag][tag_long] >> tag_bit;
1132
1133        /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
1134        if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
1135                /* Pick tags from next element */
1136                if (tag_bit)
1137                        iter->tags |= node->tags[tag][tag_long + 1] <<
1138                                                (BITS_PER_LONG - tag_bit);
1139                /* Clip chunk size, here only BITS_PER_LONG tags */
1140                iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
1141        }
1142}
1143
1144void __rcu **radix_tree_iter_resume(void __rcu **slot,
1145                                        struct radix_tree_iter *iter)
1146{
1147        slot++;
1148        iter->index = __radix_tree_iter_add(iter, 1);
1149        iter->next_index = iter->index;
1150        iter->tags = 0;
1151        return NULL;
1152}
1153EXPORT_SYMBOL(radix_tree_iter_resume);
1154
1155/**
1156 * radix_tree_next_chunk - find next chunk of slots for iteration
1157 *
1158 * @root:       radix tree root
1159 * @iter:       iterator state
1160 * @flags:      RADIX_TREE_ITER_* flags and tag index
1161 * Returns:     pointer to chunk first slot, or NULL if iteration is over
1162 */
1163void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
1164                             struct radix_tree_iter *iter, unsigned flags)
1165{
1166        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
1167        struct radix_tree_node *node, *child;
1168        unsigned long index, offset, maxindex;
1169
1170        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
1171                return NULL;
1172
1173        /*
1174         * Catch next_index overflow after ~0UL. iter->index never overflows
1175         * during iterating; it can be zero only at the beginning.
1176         * And we cannot overflow iter->next_index in a single step,
1177         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
1178         *
 1179         * This condition is also used by radix_tree_next_slot() to stop
1180         * contiguous iterating, and forbid switching to the next chunk.
1181         */
1182        index = iter->next_index;
1183        if (!index && iter->index)
1184                return NULL;
1185
1186 restart:
1187        radix_tree_load_root(root, &child, &maxindex);
1188        if (index > maxindex)
1189                return NULL;
1190        if (!child)
1191                return NULL;
1192
1193        if (!radix_tree_is_internal_node(child)) {
1194                /* Single-slot tree */
1195                iter->index = index;
1196                iter->next_index = maxindex + 1;
1197                iter->tags = 1;
1198                iter->node = NULL;
1199                return (void __rcu **)&root->xa_head;
1200        }
1201
1202        do {
1203                node = entry_to_node(child);
1204                offset = radix_tree_descend(node, &child, index);
1205
1206                if ((flags & RADIX_TREE_ITER_TAGGED) ?
1207                                !tag_get(node, tag, offset) : !child) {
1208                        /* Hole detected */
1209                        if (flags & RADIX_TREE_ITER_CONTIG)
1210                                return NULL;
1211
1212                        if (flags & RADIX_TREE_ITER_TAGGED)
1213                                offset = radix_tree_find_next_bit(node, tag,
1214                                                offset + 1);
1215                        else
1216                                while (++offset < RADIX_TREE_MAP_SIZE) {
1217                                        void *slot = rcu_dereference_raw(
1218                                                        node->slots[offset]);
1219                                        if (slot)
1220                                                break;
1221                                }
1222                        index &= ~node_maxindex(node);
1223                        index += offset << node->shift;
1224                        /* Overflow after ~0UL */
1225                        if (!index)
1226                                return NULL;
1227                        if (offset == RADIX_TREE_MAP_SIZE)
1228                                goto restart;
1229                        child = rcu_dereference_raw(node->slots[offset]);
1230                }
1231
1232                if (!child)
1233                        goto restart;
1234                if (child == RADIX_TREE_RETRY)
1235                        break;
1236        } while (node->shift && radix_tree_is_internal_node(child));
1237
1238        /* Update the iterator state */
1239        iter->index = (index &~ node_maxindex(node)) | offset;
1240        iter->next_index = (index | node_maxindex(node)) + 1;
1241        iter->node = node;
1242
1243        if (flags & RADIX_TREE_ITER_TAGGED)
1244                set_iter_tags(iter, node, offset, tag);
1245
1246        return node->slots + offset;
1247}
1248EXPORT_SYMBOL(radix_tree_next_chunk);
1249
1250/**
1251 *      radix_tree_gang_lookup - perform multiple lookup on a radix tree
1252 *      @root:          radix tree root
1253 *      @results:       where the results of the lookup are placed
1254 *      @first_index:   start the lookup from this key
1255 *      @max_items:     place up to this many items at *results
1256 *
1257 *      Performs an index-ascending scan of the tree for present items.  Places
1258 *      them at *@results and returns the number of items which were placed at
1259 *      *@results.
1260 *
1261 *      The implementation is naive.
1262 *
1263 *      Like radix_tree_lookup, radix_tree_gang_lookup may be called under
1264 *      rcu_read_lock. In this case, rather than the returned results being
1265 *      an atomic snapshot of the tree at a single point in time, the
1266 *      semantics of an RCU protected gang lookup are as though multiple
1267 *      radix_tree_lookups have been issued in individual locks, and results
1268 *      stored in 'results'.
1269 */
1270unsigned int
1271radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
1272                        unsigned long first_index, unsigned int max_items)
1273{
1274        struct radix_tree_iter iter;
1275        void __rcu **slot;
1276        unsigned int ret = 0;
1277
1278        if (unlikely(!max_items))
1279                return 0;
1280
1281        radix_tree_for_each_slot(slot, root, &iter, first_index) {
1282                results[ret] = rcu_dereference_raw(*slot);
1283                if (!results[ret])
1284                        continue;
1285                if (radix_tree_is_internal_node(results[ret])) {
1286                        slot = radix_tree_iter_retry(&iter);
1287                        continue;
1288                }
1289                if (++ret == max_items)
1290                        break;
1291        }
1292
1293        return ret;
1294}
1295EXPORT_SYMBOL(radix_tree_gang_lookup);
1296
1297/**
1298 *      radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
1299 *                                   based on a tag
1300 *      @root:          radix tree root
1301 *      @results:       where the results of the lookup are placed
1302 *      @first_index:   start the lookup from this key
1303 *      @max_items:     place up to this many items at *results
1304 *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
1305 *
1306 *      Performs an index-ascending scan of the tree for present items which
1307 *      have the tag indexed by @tag set.  Places the items at *@results and
1308 *      returns the number of items which were placed at *@results.
1309 */
1310unsigned int
1311radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
1312                unsigned long first_index, unsigned int max_items,
1313                unsigned int tag)
1314{
1315        struct radix_tree_iter iter;
1316        void __rcu **slot;
1317        unsigned int ret = 0;
1318
1319        if (unlikely(!max_items))
1320                return 0;
1321
1322        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
1323                results[ret] = rcu_dereference_raw(*slot);
1324                if (!results[ret])
1325                        continue;
1326                if (radix_tree_is_internal_node(results[ret])) {
1327                        slot = radix_tree_iter_retry(&iter);
1328                        continue;
1329                }
1330                if (++ret == max_items)
1331                        break;
1332        }
1333
1334        return ret;
1335}
1336EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
1337
1338/**
1339 *      radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
1340 *                                        radix tree based on a tag
1341 *      @root:          radix tree root
1342 *      @results:       where the results of the lookup are placed
1343 *      @first_index:   start the lookup from this key
1344 *      @max_items:     place up to this many items at *results
1345 *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
1346 *
1347 *      Performs an index-ascending scan of the tree for present items which
1348 *      have the tag indexed by @tag set.  Places the slots at *@results and
1349 *      returns the number of slots which were placed at *@results.
1350 */
1351unsigned int
1352radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
1353                void __rcu ***results, unsigned long first_index,
1354                unsigned int max_items, unsigned int tag)
1355{
1356        struct radix_tree_iter iter;
1357        void __rcu **slot;
1358        unsigned int ret = 0;
1359
1360        if (unlikely(!max_items))
1361                return 0;
1362
1363        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
1364                results[ret] = slot;
1365                if (++ret == max_items)
1366                        break;
1367        }
1368
1369        return ret;
1370}
1371EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1372
1373static bool __radix_tree_delete(struct radix_tree_root *root,
1374                                struct radix_tree_node *node, void __rcu **slot)
1375{
1376        void *old = rcu_dereference_raw(*slot);
1377        int values = xa_is_value(old) ? -1 : 0;
1378        unsigned offset = get_slot_offset(node, slot);
1379        int tag;
1380
1381        if (is_idr(root))
1382                node_tag_set(root, node, IDR_FREE, offset);
1383        else
1384                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1385                        node_tag_clear(root, node, tag, offset);
1386
1387        replace_slot(slot, NULL, node, -1, values);
1388        return node && delete_node(root, node);
1389}
1390
1391/**
1392 * radix_tree_iter_delete - delete the entry at this iterator position
1393 * @root: radix tree root
1394 * @iter: iterator state
1395 * @slot: pointer to slot
1396 *
1397 * Delete the entry at the position currently pointed to by the iterator.
1398 * This may result in the current node being freed; if it is, the iterator
1399 * is advanced so that it will not reference the freed memory.  This
1400 * function may be called without any locking if there are no other threads
1401 * which can access this tree.
1402 */
1403void radix_tree_iter_delete(struct radix_tree_root *root,
1404                                struct radix_tree_iter *iter, void __rcu **slot)
1405{
1406        if (__radix_tree_delete(root, iter->node, slot))
1407                iter->index = iter->next_index;
1408}
1409EXPORT_SYMBOL(radix_tree_iter_delete);
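/*
 * Example (a sketch): empty a tree from a context where no other threads
 * can access it:
 *
 *	radix_tree_for_each_slot(slot, &mytree, &iter, 0)
 *		radix_tree_iter_delete(&mytree, &iter, slot);
 */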
1410
1411/**
1412 * radix_tree_delete_item - delete an item from a radix tree
1413 * @root: radix tree root
1414 * @index: index key
1415 * @item: expected item
1416 *
1417 * Remove @item at @index from the radix tree rooted at @root.
1418 *
1419 * Return: the deleted entry, or %NULL if it was not present
1420 * or the entry at the given @index was not @item.
1421 */
1422void *radix_tree_delete_item(struct radix_tree_root *root,
1423                             unsigned long index, void *item)
1424{
1425        struct radix_tree_node *node = NULL;
1426        void __rcu **slot = NULL;
1427        void *entry;
1428
1429        entry = __radix_tree_lookup(root, index, &node, &slot);
1430        if (!slot)
1431                return NULL;
1432        if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE,
1433                                                get_slot_offset(node, slot))))
1434                return NULL;
1435
1436        if (item && entry != item)
1437                return NULL;
1438
1439        __radix_tree_delete(root, node, slot);
1440
1441        return entry;
1442}
1443EXPORT_SYMBOL(radix_tree_delete_item);
1444
1445/**
1446 * radix_tree_delete - delete an entry from a radix tree
1447 * @root: radix tree root
1448 * @index: index key
1449 *
1450 * Remove the entry at @index from the radix tree rooted at @root.
1451 *
1452 * Return: The deleted entry, or %NULL if it was not present.
1453 */
1454void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1455{
1456        return radix_tree_delete_item(root, index, NULL);
1457}
1458EXPORT_SYMBOL(radix_tree_delete);
1459
1460/**
1461 *      radix_tree_tagged - test whether any items in the tree are tagged
1462 *      @root:          radix tree root
1463 *      @tag:           tag to test
1464 */
1465int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
1466{
1467        return root_tag_get(root, tag);
1468}
1469EXPORT_SYMBOL(radix_tree_tagged);
1470
1471/**
1472 * idr_preload - preload for idr_alloc()
1473 * @gfp_mask: allocation mask to use for preloading
1474 *
1475 * Preallocate memory to use for the next call to idr_alloc().  This function
1476 * returns with preemption disabled.  It will be enabled by idr_preload_end().
1477 */
1478void idr_preload(gfp_t gfp_mask)
1479{
1480        if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
1481                preempt_disable();
1482}
1483EXPORT_SYMBOL(idr_preload);
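/*
 * Typical usage (a sketch, mirroring the radix_tree_preload() pattern
 * above; the lock is illustrative):
 *
 *	idr_preload(GFP_KERNEL);
 *	spin_lock(&mylock);
 *	id = idr_alloc(&myidr, ptr, 0, 0, GFP_NOWAIT);
 *	spin_unlock(&mylock);
 *	idr_preload_end();
 */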
1484
1485void __rcu **idr_get_free(struct radix_tree_root *root,
1486                              struct radix_tree_iter *iter, gfp_t gfp,
1487                              unsigned long max)
1488{
1489        struct radix_tree_node *node = NULL, *child;
1490        void __rcu **slot = (void __rcu **)&root->xa_head;
1491        unsigned long maxindex, start = iter->next_index;
1492        unsigned int shift, offset = 0;
1493
1494 grow:
1495        shift = radix_tree_load_root(root, &child, &maxindex);
1496        if (!radix_tree_tagged(root, IDR_FREE))
1497                start = max(start, maxindex + 1);
1498        if (start > max)
1499                return ERR_PTR(-ENOSPC);
1500
1501        if (start > maxindex) {
1502                int error = radix_tree_extend(root, gfp, start, shift);
1503                if (error < 0)
1504                        return ERR_PTR(error);
1505                shift = error;
1506                child = rcu_dereference_raw(root->xa_head);
1507        }
1508        if (start == 0 && shift == 0)
1509                shift = RADIX_TREE_MAP_SHIFT;
1510
1511        while (shift) {
1512                shift -= RADIX_TREE_MAP_SHIFT;
1513                if (child == NULL) {
1514                        /* Have to add a child node.  */
1515                        child = radix_tree_node_alloc(gfp, node, root, shift,
1516                                                        offset, 0, 0);
1517                        if (!child)
1518                                return ERR_PTR(-ENOMEM);
1519                        all_tag_set(child, IDR_FREE);
1520                        rcu_assign_pointer(*slot, node_to_entry(child));
1521                        if (node)
1522                                node->count++;
1523                } else if (!radix_tree_is_internal_node(child))
1524                        break;
1525
1526                node = entry_to_node(child);
1527                offset = radix_tree_descend(node, &child, start);
1528                if (!tag_get(node, IDR_FREE, offset)) {
1529                        offset = radix_tree_find_next_bit(node, IDR_FREE,
1530                                                        offset + 1);
1531                        start = next_index(start, node, offset);
1532                        if (start > max)
1533                                return ERR_PTR(-ENOSPC);
1534                        while (offset == RADIX_TREE_MAP_SIZE) {
1535                                offset = node->offset + 1;
1536                                node = node->parent;
1537                                if (!node)
1538                                        goto grow;
1539                                shift = node->shift;
1540                        }
1541                        child = rcu_dereference_raw(node->slots[offset]);
1542                }
1543                slot = &node->slots[offset];
1544        }
1545
1546        iter->index = start;
1547        if (node)
1548                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
1549        else
1550                iter->next_index = 1;
1551        iter->node = node;
1552        set_iter_tags(iter, node, offset, IDR_FREE);
1553
1554        return slot;
1555}
1556
1557/**
1558 * idr_destroy - release all internal memory from an IDR
1559 * @idr: idr handle
1560 *
1561 * After this function is called, the IDR is empty, and may be reused or
1562 * the data structure containing it may be freed.
1563 *
1564 * A typical clean-up sequence for objects stored in an idr tree will use
1565 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
1566 * free the memory used to keep track of those objects.
1567 */
1568void idr_destroy(struct idr *idr)
1569{
1570        struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
1571        if (radix_tree_is_internal_node(node))
1572                radix_tree_free_nodes(node);
1573        idr->idr_rt.xa_head = NULL;
1574        root_tag_set(&idr->idr_rt, IDR_FREE);
1575}
1576EXPORT_SYMBOL(idr_destroy);
1577
1578static void
1579radix_tree_node_ctor(void *arg)
1580{
1581        struct radix_tree_node *node = arg;
1582
1583        memset(node, 0, sizeof(*node));
1584        INIT_LIST_HEAD(&node->private_list);
1585}
1586
1587static int radix_tree_cpu_dead(unsigned int cpu)
1588{
1589        struct radix_tree_preload *rtp;
1590        struct radix_tree_node *node;
1591
1592        /* Free per-cpu pool of preloaded nodes */
1593        rtp = &per_cpu(radix_tree_preloads, cpu);
1594        while (rtp->nr) {
1595                node = rtp->nodes;
1596                rtp->nodes = node->parent;
1597                kmem_cache_free(radix_tree_node_cachep, node);
1598                rtp->nr--;
1599        }
1600        return 0;
1601}
1602
1603void __init radix_tree_init(void)
1604{
1605        int ret;
1606
1607        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
1608        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
1609        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
1610        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1611                        sizeof(struct radix_tree_node), 0,
1612                        SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1613                        radix_tree_node_ctor);
1614        ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
1615                                        NULL, radix_tree_cpu_dead);
1616        WARN_ON(ret < 0);
1617}
1618