linux/lib/radix-tree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2001 Momchil Velikov
   3 * Portions Copyright (C) 2001 Christoph Hellwig
   4 * Copyright (C) 2005 SGI, Christoph Lameter
   5 * Copyright (C) 2006 Nick Piggin
   6 *
   7 * This program is free software; you can redistribute it and/or
   8 * modify it under the terms of the GNU General Public License as
   9 * published by the Free Software Foundation; either version 2, or (at
  10 * your option) any later version.
  11 *
  12 * This program is distributed in the hope that it will be useful, but
  13 * WITHOUT ANY WARRANTY; without even the implied warranty of
  14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 * General Public License for more details.
  16 *
  17 * You should have received a copy of the GNU General Public License
  18 * along with this program; if not, write to the Free Software
  19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20 */
  21
  22#include <linux/errno.h>
  23#include <linux/init.h>
  24#include <linux/kernel.h>
  25#include <linux/module.h>
  26#include <linux/radix-tree.h>
  27#include <linux/percpu.h>
  28#include <linux/slab.h>
  29#include <linux/notifier.h>
  30#include <linux/cpu.h>
  31#include <linux/gfp.h>
  32#include <linux/string.h>
  33#include <linux/bitops.h>
  34#include <linux/rcupdate.h>
  35
  36
  37#ifdef __KERNEL__
  38#define RADIX_TREE_MAP_SHIFT    (CONFIG_BASE_SMALL ? 4 : 6)
  39#else
  40#define RADIX_TREE_MAP_SHIFT    3       /* For more stressful testing */
  41#endif
  42
  43#define RADIX_TREE_MAP_SIZE     (1UL << RADIX_TREE_MAP_SHIFT)
  44#define RADIX_TREE_MAP_MASK     (RADIX_TREE_MAP_SIZE-1)
  45
  46#define RADIX_TREE_TAG_LONGS    \
  47        ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
  48
  49struct radix_tree_node {
  50        unsigned int    height;         /* Height from the bottom */
  51        unsigned int    count;
  52        struct rcu_head rcu_head;
  53        void            *slots[RADIX_TREE_MAP_SIZE];
  54        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
  55};
  56
  57struct radix_tree_path {
  58        struct radix_tree_node *node;
  59        int offset;
  60};
  61
  62#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
  63#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
  64                                          RADIX_TREE_MAP_SHIFT))
  65
  66/*
  67 * The height_to_maxindex array needs to be one deeper than the maximum
  68 * path as height 0 holds only 1 entry.
  69 */
  70static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1] __read_mostly;
  71
  72/*
  73 * Radix tree node cache.
  74 */
  75static struct kmem_cache *radix_tree_node_cachep;
  76
  77/*
  78 * Per-cpu pool of preloaded nodes
  79 */
  80struct radix_tree_preload {
  81        int nr;
  82        struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
  83};
  84static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
  85
  86static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
  87{
  88        return root->gfp_mask & __GFP_BITS_MASK;
  89}
  90
  91static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
  92                int offset)
  93{
  94        __set_bit(offset, node->tags[tag]);
  95}
  96
  97static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
  98                int offset)
  99{
 100        __clear_bit(offset, node->tags[tag]);
 101}
 102
 103static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
 104                int offset)
 105{
 106        return test_bit(offset, node->tags[tag]);
 107}
 108
 109static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
 110{
 111        root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
 112}
 113
 114static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
 115{
 116        root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
 117}
 118
 119static inline void root_tag_clear_all(struct radix_tree_root *root)
 120{
 121        root->gfp_mask &= __GFP_BITS_MASK;
 122}
 123
 124static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
 125{
 126        return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
 127}
 128
 129/*
 130 * Returns 1 if any slot in the node has this tag set.
 131 * Otherwise returns 0.
 132 */
 133static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
 134{
 135        int idx;
 136        for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
 137                if (node->tags[tag][idx])
 138                        return 1;
 139        }
 140        return 0;
 141}
 142/*
 143 * This assumes that the caller has performed appropriate preallocation, and
 144 * that the caller has pinned this thread of control to the current CPU.
 145 */
 146static struct radix_tree_node *
 147radix_tree_node_alloc(struct radix_tree_root *root)
 148{
 149        struct radix_tree_node *ret = NULL;
 150        gfp_t gfp_mask = root_gfp_mask(root);
 151
 152        if (!(gfp_mask & __GFP_WAIT)) {
 153                struct radix_tree_preload *rtp;
 154
 155                /*
 156                 * Provided the caller has preloaded here, we will always
 157                 * succeed in getting a node here (and never reach
 158                 * kmem_cache_alloc)
 159                 */
 160                rtp = &__get_cpu_var(radix_tree_preloads);
 161                if (rtp->nr) {
 162                        ret = rtp->nodes[rtp->nr - 1];
 163                        rtp->nodes[rtp->nr - 1] = NULL;
 164                        rtp->nr--;
 165                }
 166        }
 167        if (ret == NULL)
 168                ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 169
 170        BUG_ON(radix_tree_is_indirect_ptr(ret));
 171        return ret;
 172}
 173
 174static void radix_tree_node_rcu_free(struct rcu_head *head)
 175{
 176        struct radix_tree_node *node =
 177                        container_of(head, struct radix_tree_node, rcu_head);
 178
 179        /*
 180         * must only free zeroed nodes into the slab. radix_tree_shrink
 181         * can leave us with a non-NULL entry in the first slot, so clear
 182         * that here to make sure.
 183         */
 184        tag_clear(node, 0, 0);
 185        tag_clear(node, 1, 0);
 186        node->slots[0] = NULL;
 187        node->count = 0;
 188
 189        kmem_cache_free(radix_tree_node_cachep, node);
 190}
 191
 192static inline void
 193radix_tree_node_free(struct radix_tree_node *node)
 194{
 195        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
 196}
 197
 198/*
 199 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 200 * ensure that the addition of a single element in the tree cannot fail.  On
 201 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 202 * with preemption not disabled.
 203 *
 204 * To make use of this facility, the radix tree must be initialised without
 205 * __GFP_WAIT being passed to INIT_RADIX_TREE().
 206 */
 207int radix_tree_preload(gfp_t gfp_mask)
 208{
 209        struct radix_tree_preload *rtp;
 210        struct radix_tree_node *node;
 211        int ret = -ENOMEM;
 212
 213        preempt_disable();
 214        rtp = &__get_cpu_var(radix_tree_preloads);
 215        while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
 216                preempt_enable();
 217                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 218                if (node == NULL)
 219                        goto out;
 220                preempt_disable();
 221                rtp = &__get_cpu_var(radix_tree_preloads);
 222                if (rtp->nr < ARRAY_SIZE(rtp->nodes))
 223                        rtp->nodes[rtp->nr++] = node;
 224                else
 225                        kmem_cache_free(radix_tree_node_cachep, node);
 226        }
 227        ret = 0;
 228out:
 229        return ret;
 230}
 231EXPORT_SYMBOL(radix_tree_preload);
 232
 233/*
 234 *      Return the maximum key which can be store into a
 235 *      radix tree with height HEIGHT.
 236 */
 237static inline unsigned long radix_tree_maxindex(unsigned int height)
 238{
 239        return height_to_maxindex[height];
 240}
 241
 242/*
 243 *      Extend a radix tree so it can store key @index.
 244 */
 245static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 246{
 247        struct radix_tree_node *node;
 248        unsigned int height;
 249        int tag;
 250
 251        /* Figure out what the height should be.  */
 252        height = root->height + 1;
 253        while (index > radix_tree_maxindex(height))
 254                height++;
 255
 256        if (root->rnode == NULL) {
 257                root->height = height;
 258                goto out;
 259        }
 260
 261        do {
 262                unsigned int newheight;
 263                if (!(node = radix_tree_node_alloc(root)))
 264                        return -ENOMEM;
 265
 266                /* Increase the height.  */
 267                node->slots[0] = radix_tree_indirect_to_ptr(root->rnode);
 268
 269                /* Propagate the aggregated tag info into the new root */
 270                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 271                        if (root_tag_get(root, tag))
 272                                tag_set(node, tag, 0);
 273                }
 274
 275                newheight = root->height+1;
 276                node->height = newheight;
 277                node->count = 1;
 278                node = radix_tree_ptr_to_indirect(node);
 279                rcu_assign_pointer(root->rnode, node);
 280                root->height = newheight;
 281        } while (height > root->height);
 282out:
 283        return 0;
 284}
 285
 286/**
 287 *      radix_tree_insert    -    insert into a radix tree
 288 *      @root:          radix tree root
 289 *      @index:         index key
 290 *      @item:          item to insert
 291 *
 292 *      Insert an item into the radix tree at position @index.
 293 */
 294int radix_tree_insert(struct radix_tree_root *root,
 295                        unsigned long index, void *item)
 296{
 297        struct radix_tree_node *node = NULL, *slot;
 298        unsigned int height, shift;
 299        int offset;
 300        int error;
 301
 302        BUG_ON(radix_tree_is_indirect_ptr(item));
 303
 304        /* Make sure the tree is high enough.  */
 305        if (index > radix_tree_maxindex(root->height)) {
 306                error = radix_tree_extend(root, index);
 307                if (error)
 308                        return error;
 309        }
 310
 311        slot = radix_tree_indirect_to_ptr(root->rnode);
 312
 313        height = root->height;
 314        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 315
 316        offset = 0;                     /* uninitialised var warning */
 317        while (height > 0) {
 318                if (slot == NULL) {
 319                        /* Have to add a child node.  */
 320                        if (!(slot = radix_tree_node_alloc(root)))
 321                                return -ENOMEM;
 322                        slot->height = height;
 323                        if (node) {
 324                                rcu_assign_pointer(node->slots[offset], slot);
 325                                node->count++;
 326                        } else
 327                                rcu_assign_pointer(root->rnode,
 328                                        radix_tree_ptr_to_indirect(slot));
 329                }
 330
 331                /* Go a level down */
 332                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 333                node = slot;
 334                slot = node->slots[offset];
 335                shift -= RADIX_TREE_MAP_SHIFT;
 336                height--;
 337        }
 338
 339        if (slot != NULL)
 340                return -EEXIST;
 341
 342        if (node) {
 343                node->count++;
 344                rcu_assign_pointer(node->slots[offset], item);
 345                BUG_ON(tag_get(node, 0, offset));
 346                BUG_ON(tag_get(node, 1, offset));
 347        } else {
 348                rcu_assign_pointer(root->rnode, item);
 349                BUG_ON(root_tag_get(root, 0));
 350                BUG_ON(root_tag_get(root, 1));
 351        }
 352
 353        return 0;
 354}
 355EXPORT_SYMBOL(radix_tree_insert);
 356
 357/*
 358 * is_slot == 1 : search for the slot.
 359 * is_slot == 0 : search for the node.
 360 */
 361static void *radix_tree_lookup_element(struct radix_tree_root *root,
 362                                unsigned long index, int is_slot)
 363{
 364        unsigned int height, shift;
 365        struct radix_tree_node *node, **slot;
 366
 367        node = rcu_dereference(root->rnode);
 368        if (node == NULL)
 369                return NULL;
 370
 371        if (!radix_tree_is_indirect_ptr(node)) {
 372                if (index > 0)
 373                        return NULL;
 374                return is_slot ? (void *)&root->rnode : node;
 375        }
 376        node = radix_tree_indirect_to_ptr(node);
 377
 378        height = node->height;
 379        if (index > radix_tree_maxindex(height))
 380                return NULL;
 381
 382        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 383
 384        do {
 385                slot = (struct radix_tree_node **)
 386                        (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK));
 387                node = rcu_dereference(*slot);
 388                if (node == NULL)
 389                        return NULL;
 390
 391                shift -= RADIX_TREE_MAP_SHIFT;
 392                height--;
 393        } while (height > 0);
 394
 395        return is_slot ? (void *)slot:node;
 396}
 397
 398/**
 399 *      radix_tree_lookup_slot    -    lookup a slot in a radix tree
 400 *      @root:          radix tree root
 401 *      @index:         index key
 402 *
 403 *      Returns:  the slot corresponding to the position @index in the
 404 *      radix tree @root. This is useful for update-if-exists operations.
 405 *
 406 *      This function can be called under rcu_read_lock iff the slot is not
 407 *      modified by radix_tree_replace_slot, otherwise it must be called
 408 *      exclusive from other writers. Any dereference of the slot must be done
 409 *      using radix_tree_deref_slot.
 410 */
 411void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
 412{
 413        return (void **)radix_tree_lookup_element(root, index, 1);
 414}
 415EXPORT_SYMBOL(radix_tree_lookup_slot);
 416
 417/**
 418 *      radix_tree_lookup    -    perform lookup operation on a radix tree
 419 *      @root:          radix tree root
 420 *      @index:         index key
 421 *
 422 *      Lookup the item at the position @index in the radix tree @root.
 423 *
 424 *      This function can be called under rcu_read_lock, however the caller
 425 *      must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 426 *      them safely). No RCU barriers are required to access or modify the
 427 *      returned item, however.
 428 */
 429void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 430{
 431        return radix_tree_lookup_element(root, index, 0);
 432}
 433EXPORT_SYMBOL(radix_tree_lookup);
 434
 435/**
 436 *      radix_tree_tag_set - set a tag on a radix tree node
 437 *      @root:          radix tree root
 438 *      @index:         index key
 439 *      @tag:           tag index
 440 *
 441 *      Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 442 *      corresponding to @index in the radix tree.  From
 443 *      the root all the way down to the leaf node.
 444 *
 445 *      Returns the address of the tagged item.   Setting a tag on a not-present
 446 *      item is a bug.
 447 */
 448void *radix_tree_tag_set(struct radix_tree_root *root,
 449                        unsigned long index, unsigned int tag)
 450{
 451        unsigned int height, shift;
 452        struct radix_tree_node *slot;
 453
 454        height = root->height;
 455        BUG_ON(index > radix_tree_maxindex(height));
 456
 457        slot = radix_tree_indirect_to_ptr(root->rnode);
 458        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 459
 460        while (height > 0) {
 461                int offset;
 462
 463                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 464                if (!tag_get(slot, tag, offset))
 465                        tag_set(slot, tag, offset);
 466                slot = slot->slots[offset];
 467                BUG_ON(slot == NULL);
 468                shift -= RADIX_TREE_MAP_SHIFT;
 469                height--;
 470        }
 471
 472        /* set the root's tag bit */
 473        if (slot && !root_tag_get(root, tag))
 474                root_tag_set(root, tag);
 475
 476        return slot;
 477}
 478EXPORT_SYMBOL(radix_tree_tag_set);
 479
 480/**
 481 *      radix_tree_tag_clear - clear a tag on a radix tree node
 482 *      @root:          radix tree root
 483 *      @index:         index key
 484 *      @tag:           tag index
 485 *
 486 *      Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 487 *      corresponding to @index in the radix tree.  If
 488 *      this causes the leaf node to have no tags set then clear the tag in the
 489 *      next-to-leaf node, etc.
 490 *
 491 *      Returns the address of the tagged item on success, else NULL.  ie:
 492 *      has the same return value and semantics as radix_tree_lookup().
 493 */
 494void *radix_tree_tag_clear(struct radix_tree_root *root,
 495                        unsigned long index, unsigned int tag)
 496{
 497        /*
 498         * The radix tree path needs to be one longer than the maximum path
 499         * since the "list" is null terminated.
 500         */
 501        struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
 502        struct radix_tree_node *slot = NULL;
 503        unsigned int height, shift;
 504
 505        height = root->height;
 506        if (index > radix_tree_maxindex(height))
 507                goto out;
 508
 509        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 510        pathp->node = NULL;
 511        slot = radix_tree_indirect_to_ptr(root->rnode);
 512
 513        while (height > 0) {
 514                int offset;
 515
 516                if (slot == NULL)
 517                        goto out;
 518
 519                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 520                pathp[1].offset = offset;
 521                pathp[1].node = slot;
 522                slot = slot->slots[offset];
 523                pathp++;
 524                shift -= RADIX_TREE_MAP_SHIFT;
 525                height--;
 526        }
 527
 528        if (slot == NULL)
 529                goto out;
 530
 531        while (pathp->node) {
 532                if (!tag_get(pathp->node, tag, pathp->offset))
 533                        goto out;
 534                tag_clear(pathp->node, tag, pathp->offset);
 535                if (any_tag_set(pathp->node, tag))
 536                        goto out;
 537                pathp--;
 538        }
 539
 540        /* clear the root's tag bit */
 541        if (root_tag_get(root, tag))
 542                root_tag_clear(root, tag);
 543
 544out:
 545        return slot;
 546}
 547EXPORT_SYMBOL(radix_tree_tag_clear);
 548
 549/**
 550 * radix_tree_tag_get - get a tag on a radix tree node
 551 * @root:               radix tree root
 552 * @index:              index key
 553 * @tag:                tag index (< RADIX_TREE_MAX_TAGS)
 554 *
 555 * Return values:
 556 *
 557 *  0: tag not present or not set
 558 *  1: tag set
 559 */
 560int radix_tree_tag_get(struct radix_tree_root *root,
 561                        unsigned long index, unsigned int tag)
 562{
 563        unsigned int height, shift;
 564        struct radix_tree_node *node;
 565        int saw_unset_tag = 0;
 566
 567        /* check the root's tag bit */
 568        if (!root_tag_get(root, tag))
 569                return 0;
 570
 571        node = rcu_dereference(root->rnode);
 572        if (node == NULL)
 573                return 0;
 574
 575        if (!radix_tree_is_indirect_ptr(node))
 576                return (index == 0);
 577        node = radix_tree_indirect_to_ptr(node);
 578
 579        height = node->height;
 580        if (index > radix_tree_maxindex(height))
 581                return 0;
 582
 583        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 584
 585        for ( ; ; ) {
 586                int offset;
 587
 588                if (node == NULL)
 589                        return 0;
 590
 591                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 592
 593                /*
 594                 * This is just a debug check.  Later, we can bale as soon as
 595                 * we see an unset tag.
 596                 */
 597                if (!tag_get(node, tag, offset))
 598                        saw_unset_tag = 1;
 599                if (height == 1) {
 600                        int ret = tag_get(node, tag, offset);
 601
 602                        BUG_ON(ret && saw_unset_tag);
 603                        return !!ret;
 604                }
 605                node = rcu_dereference(node->slots[offset]);
 606                shift -= RADIX_TREE_MAP_SHIFT;
 607                height--;
 608        }
 609}
 610EXPORT_SYMBOL(radix_tree_tag_get);
 611
 612/**
 613 *      radix_tree_next_hole    -    find the next hole (not-present entry)
 614 *      @root:          tree root
 615 *      @index:         index key
 616 *      @max_scan:      maximum range to search
 617 *
 618 *      Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
 619 *      indexed hole.
 620 *
 621 *      Returns: the index of the hole if found, otherwise returns an index
 622 *      outside of the set specified (in which case 'return - index >= max_scan'
 623 *      will be true). In rare cases of index wrap-around, 0 will be returned.
 624 *
 625 *      radix_tree_next_hole may be called under rcu_read_lock. However, like
 626 *      radix_tree_gang_lookup, this will not atomically search a snapshot of
 627 *      the tree at a single point in time. For example, if a hole is created
 628 *      at index 5, then subsequently a hole is created at index 10,
 629 *      radix_tree_next_hole covering both indexes may return 10 if called
 630 *      under rcu_read_lock.
 631 */
 632unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 633                                unsigned long index, unsigned long max_scan)
 634{
 635        unsigned long i;
 636
 637        for (i = 0; i < max_scan; i++) {
 638                if (!radix_tree_lookup(root, index))
 639                        break;
 640                index++;
 641                if (index == 0)
 642                        break;
 643        }
 644
 645        return index;
 646}
 647EXPORT_SYMBOL(radix_tree_next_hole);
 648
 649/**
 650 *      radix_tree_prev_hole    -    find the prev hole (not-present entry)
 651 *      @root:          tree root
 652 *      @index:         index key
 653 *      @max_scan:      maximum range to search
 654 *
 655 *      Search backwards in the range [max(index-max_scan+1, 0), index]
 656 *      for the first hole.
 657 *
 658 *      Returns: the index of the hole if found, otherwise returns an index
 659 *      outside of the set specified (in which case 'index - return >= max_scan'
 660 *      will be true). In rare cases of wrap-around, LONG_MAX will be returned.
 661 *
 662 *      radix_tree_next_hole may be called under rcu_read_lock. However, like
 663 *      radix_tree_gang_lookup, this will not atomically search a snapshot of
 664 *      the tree at a single point in time. For example, if a hole is created
 665 *      at index 10, then subsequently a hole is created at index 5,
 666 *      radix_tree_prev_hole covering both indexes may return 5 if called under
 667 *      rcu_read_lock.
 668 */
 669unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 670                                   unsigned long index, unsigned long max_scan)
 671{
 672        unsigned long i;
 673
 674        for (i = 0; i < max_scan; i++) {
 675                if (!radix_tree_lookup(root, index))
 676                        break;
 677                index--;
 678                if (index == LONG_MAX)
 679                        break;
 680        }
 681
 682        return index;
 683}
 684EXPORT_SYMBOL(radix_tree_prev_hole);
 685
 686static unsigned int
 687__lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
 688        unsigned int max_items, unsigned long *next_index)
 689{
 690        unsigned int nr_found = 0;
 691        unsigned int shift, height;
 692        unsigned long i;
 693
 694        height = slot->height;
 695        if (height == 0)
 696                goto out;
 697        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 698
 699        for ( ; height > 1; height--) {
 700                i = (index >> shift) & RADIX_TREE_MAP_MASK;
 701                for (;;) {
 702                        if (slot->slots[i] != NULL)
 703                                break;
 704                        index &= ~((1UL << shift) - 1);
 705                        index += 1UL << shift;
 706                        if (index == 0)
 707                                goto out;       /* 32-bit wraparound */
 708                        i++;
 709                        if (i == RADIX_TREE_MAP_SIZE)
 710                                goto out;
 711                }
 712
 713                shift -= RADIX_TREE_MAP_SHIFT;
 714                slot = rcu_dereference(slot->slots[i]);
 715                if (slot == NULL)
 716                        goto out;
 717        }
 718
 719        /* Bottom level: grab some items */
 720        for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
 721                index++;
 722                if (slot->slots[i]) {
 723                        results[nr_found++] = &(slot->slots[i]);
 724                        if (nr_found == max_items)
 725                                goto out;
 726                }
 727        }
 728out:
 729        *next_index = index;
 730        return nr_found;
 731}
 732
 733/**
 734 *      radix_tree_gang_lookup - perform multiple lookup on a radix tree
 735 *      @root:          radix tree root
 736 *      @results:       where the results of the lookup are placed
 737 *      @first_index:   start the lookup from this key
 738 *      @max_items:     place up to this many items at *results
 739 *
 740 *      Performs an index-ascending scan of the tree for present items.  Places
 741 *      them at *@results and returns the number of items which were placed at
 742 *      *@results.
 743 *
 744 *      The implementation is naive.
 745 *
 746 *      Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 747 *      rcu_read_lock. In this case, rather than the returned results being
 748 *      an atomic snapshot of the tree at a single point in time, the semantics
 749 *      of an RCU protected gang lookup are as though multiple radix_tree_lookups
 750 *      have been issued in individual locks, and results stored in 'results'.
 751 */
 752unsigned int
 753radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 754                        unsigned long first_index, unsigned int max_items)
 755{
 756        unsigned long max_index;
 757        struct radix_tree_node *node;
 758        unsigned long cur_index = first_index;
 759        unsigned int ret;
 760
 761        node = rcu_dereference(root->rnode);
 762        if (!node)
 763                return 0;
 764
 765        if (!radix_tree_is_indirect_ptr(node)) {
 766                if (first_index > 0)
 767                        return 0;
 768                results[0] = node;
 769                return 1;
 770        }
 771        node = radix_tree_indirect_to_ptr(node);
 772
 773        max_index = radix_tree_maxindex(node->height);
 774
 775        ret = 0;
 776        while (ret < max_items) {
 777                unsigned int nr_found, slots_found, i;
 778                unsigned long next_index;       /* Index of next search */
 779
 780                if (cur_index > max_index)
 781                        break;
 782                slots_found = __lookup(node, (void ***)results + ret, cur_index,
 783                                        max_items - ret, &next_index);
 784                nr_found = 0;
 785                for (i = 0; i < slots_found; i++) {
 786                        struct radix_tree_node *slot;
 787                        slot = *(((void ***)results)[ret + i]);
 788                        if (!slot)
 789                                continue;
 790                        results[ret + nr_found] = rcu_dereference(slot);
 791                        nr_found++;
 792                }
 793                ret += nr_found;
 794                if (next_index == 0)
 795                        break;
 796                cur_index = next_index;
 797        }
 798
 799        return ret;
 800}
 801EXPORT_SYMBOL(radix_tree_gang_lookup);
 802
 803/**
 804 *      radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree
 805 *      @root:          radix tree root
 806 *      @results:       where the results of the lookup are placed
 807 *      @first_index:   start the lookup from this key
 808 *      @max_items:     place up to this many items at *results
 809 *
 810 *      Performs an index-ascending scan of the tree for present items.  Places
 811 *      their slots at *@results and returns the number of items which were
 812 *      placed at *@results.
 813 *
 814 *      The implementation is naive.
 815 *
 816 *      Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must
 817 *      be dereferenced with radix_tree_deref_slot, and if using only RCU
 818 *      protection, radix_tree_deref_slot may fail requiring a retry.
 819 */
 820unsigned int
 821radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
 822                        unsigned long first_index, unsigned int max_items)
 823{
 824        unsigned long max_index;
 825        struct radix_tree_node *node;
 826        unsigned long cur_index = first_index;
 827        unsigned int ret;
 828
 829        node = rcu_dereference(root->rnode);
 830        if (!node)
 831                return 0;
 832
 833        if (!radix_tree_is_indirect_ptr(node)) {
 834                if (first_index > 0)
 835                        return 0;
 836                results[0] = (void **)&root->rnode;
 837                return 1;
 838        }
 839        node = radix_tree_indirect_to_ptr(node);
 840
 841        max_index = radix_tree_maxindex(node->height);
 842
 843        ret = 0;
 844        while (ret < max_items) {
 845                unsigned int slots_found;
 846                unsigned long next_index;       /* Index of next search */
 847
 848                if (cur_index > max_index)
 849                        break;
 850                slots_found = __lookup(node, results + ret, cur_index,
 851                                        max_items - ret, &next_index);
 852                ret += slots_found;
 853                if (next_index == 0)
 854                        break;
 855                cur_index = next_index;
 856        }
 857
 858        return ret;
 859}
 860EXPORT_SYMBOL(radix_tree_gang_lookup_slot);
 861
 862/*
 863 * FIXME: the two tag_get()s here should use find_next_bit() instead of
 864 * open-coding the search.
 865 */
 866static unsigned int
 867__lookup_tag(struct radix_tree_node *slot, void ***results, unsigned long index,
 868        unsigned int max_items, unsigned long *next_index, unsigned int tag)
 869{
 870        unsigned int nr_found = 0;
 871        unsigned int shift, height;
 872
 873        height = slot->height;
 874        if (height == 0)
 875                goto out;
 876        shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 877
 878        while (height > 0) {
 879                unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK ;
 880
 881                for (;;) {
 882                        if (tag_get(slot, tag, i))
 883                                break;
 884                        index &= ~((1UL << shift) - 1);
 885                        index += 1UL << shift;
 886                        if (index == 0)
 887                                goto out;       /* 32-bit wraparound */
 888                        i++;
 889                        if (i == RADIX_TREE_MAP_SIZE)
 890                                goto out;
 891                }
 892                height--;
 893                if (height == 0) {      /* Bottom level: grab some items */
 894                        unsigned long j = index & RADIX_TREE_MAP_MASK;
 895
 896                        for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
 897                                index++;
 898                                if (!tag_get(slot, tag, j))
 899                                        continue;
 900                                /*
 901                                 * Even though the tag was found set, we need to
 902                                 * recheck that we have a non-NULL node, because
 903                                 * if this lookup is lockless, it may have been
 904                                 * subsequently deleted.
 905                                 *
 906                                 * Similar care must be taken in any place that
 907                                 * lookup ->slots[x] without a lock (ie. can't
 908                                 * rely on its value remaining the same).
 909                                 */
 910                                if (slot->slots[j]) {
 911                                        results[nr_found++] = &(slot->slots[j]);
 912                                        if (nr_found == max_items)
 913                                                goto out;
 914                                }
 915                        }
 916                }
 917                shift -= RADIX_TREE_MAP_SHIFT;
 918                slot = rcu_dereference(slot->slots[i]);
 919                if (slot == NULL)
 920                        break;
 921        }
 922out:
 923        *next_index = index;
 924        return nr_found;
 925}
 926
 927/**
 928 *      radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
 929 *                                   based on a tag
 930 *      @root:          radix tree root
 931 *      @results:       where the results of the lookup are placed
 932 *      @first_index:   start the lookup from this key
 933 *      @max_items:     place up to this many items at *results
 934 *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
 935 *
 936 *      Performs an index-ascending scan of the tree for present items which
 937 *      have the tag indexed by @tag set.  Places the items at *@results and
 938 *      returns the number of items which were placed at *@results.
 939 */
 940unsigned int
 941radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 942                unsigned long first_index, unsigned int max_items,
 943                unsigned int tag)
 944{
 945        struct radix_tree_node *node;
 946        unsigned long max_index;
 947        unsigned long cur_index = first_index;
 948        unsigned int ret;
 949
 950        /* check the root's tag bit */
 951        if (!root_tag_get(root, tag))
 952                return 0;
 953
 954        node = rcu_dereference(root->rnode);
 955        if (!node)
 956                return 0;
 957
 958        if (!radix_tree_is_indirect_ptr(node)) {
 959                if (first_index > 0)
 960                        return 0;
 961                results[0] = node;
 962                return 1;
 963        }
 964        node = radix_tree_indirect_to_ptr(node);
 965
 966        max_index = radix_tree_maxindex(node->height);
 967
 968        ret = 0;
 969        while (ret < max_items) {
 970                unsigned int nr_found, slots_found, i;
 971                unsigned long next_index;       /* Index of next search */
 972
 973                if (cur_index > max_index)
 974                        break;
 975                slots_found = __lookup_tag(node, (void ***)results + ret,
 976                                cur_index, max_items - ret, &next_index, tag);
 977                nr_found = 0;
 978                for (i = 0; i < slots_found; i++) {
 979                        struct radix_tree_node *slot;
 980                        slot = *(((void ***)results)[ret + i]);
 981                        if (!slot)
 982                                continue;
 983                        results[ret + nr_found] = rcu_dereference(slot);
 984                        nr_found++;
 985                }
 986                ret += nr_found;
 987                if (next_index == 0)
 988                        break;
 989                cur_index = next_index;
 990        }
 991
 992        return ret;
 993}
 994EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
 995
 996/**
 997 *      radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a
 998 *                                        radix tree based on a tag
 999 *      @root:          radix tree root
1000 *      @results:       where the results of the lookup are placed
1001 *      @first_index:   start the lookup from this key
1002 *      @max_items:     place up to this many items at *results
1003 *      @tag:           the tag index (< RADIX_TREE_MAX_TAGS)
1004 *
1005 *      Performs an index-ascending scan of the tree for present items which
1006 *      have the tag indexed by @tag set.  Places the slots at *@results and
1007 *      returns the number of slots which were placed at *@results.
1008 */
1009unsigned int
1010radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1011                unsigned long first_index, unsigned int max_items,
1012                unsigned int tag)
1013{
1014        struct radix_tree_node *node;
1015        unsigned long max_index;
1016        unsigned long cur_index = first_index;
1017        unsigned int ret;
1018
1019        /* check the root's tag bit */
1020        if (!root_tag_get(root, tag))
1021                return 0;
1022
1023        node = rcu_dereference(root->rnode);
1024        if (!node)
1025                return 0;
1026
1027        if (!radix_tree_is_indirect_ptr(node)) {
1028                if (first_index > 0)
1029                        return 0;
1030                results[0] = (void **)&root->rnode;
1031                return 1;
1032        }
1033        node = radix_tree_indirect_to_ptr(node);
1034
1035        max_index = radix_tree_maxindex(node->height);
1036
1037        ret = 0;
1038        while (ret < max_items) {
1039                unsigned int slots_found;
1040                unsigned long next_index;       /* Index of next search */
1041
1042                if (cur_index > max_index)
1043                        break;
1044                slots_found = __lookup_tag(node, results + ret,
1045                                cur_index, max_items - ret, &next_index, tag);
1046                ret += slots_found;
1047                if (next_index == 0)
1048                        break;
1049                cur_index = next_index;
1050        }
1051
1052        return ret;
1053}
1054EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1055
1056
1057/**
1058 *      radix_tree_shrink    -    shrink height of a radix tree to minimal
1059 *      @root           radix tree root
1060 */
1061static inline void radix_tree_shrink(struct radix_tree_root *root)
1062{
1063        /* try to shrink tree height */
1064        while (root->height > 0) {
1065                struct radix_tree_node *to_free = root->rnode;
1066                void *newptr;
1067
1068                BUG_ON(!radix_tree_is_indirect_ptr(to_free));
1069                to_free = radix_tree_indirect_to_ptr(to_free);
1070
1071                /*
1072                 * The candidate node has more than one child, or its child
1073                 * is not at the leftmost slot, we cannot shrink.
1074                 */
1075                if (to_free->count != 1)
1076                        break;
1077                if (!to_free->slots[0])
1078                        break;
1079
1080                /*
1081                 * We don't need rcu_assign_pointer(), since we are simply
1082                 * moving the node from one part of the tree to another. If
1083                 * it was safe to dereference the old pointer to it
1084                 * (to_free->slots[0]), it will be safe to dereference the new
1085                 * one (root->rnode).
1086                 */
1087                newptr = to_free->slots[0];
1088                if (root->height > 1)
1089                        newptr = radix_tree_ptr_to_indirect(newptr);
1090                root->rnode = newptr;
1091                root->height--;
1092                radix_tree_node_free(to_free);
1093        }
1094}
1095
1096/**
1097 *      radix_tree_delete    -    delete an item from a radix tree
1098 *      @root:          radix tree root
1099 *      @index:         index key
1100 *
1101 *      Remove the item at @index from the radix tree rooted at @root.
1102 *
1103 *      Returns the address of the deleted item, or NULL if it was not present.
1104 */
1105void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1106{
1107        /*
1108         * The radix tree path needs to be one longer than the maximum path
1109         * since the "list" is null terminated.
1110         */
1111        struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
1112        struct radix_tree_node *slot = NULL;
1113        struct radix_tree_node *to_free;
1114        unsigned int height, shift;
1115        int tag;
1116        int offset;
1117
1118        height = root->height;
1119        if (index > radix_tree_maxindex(height))
1120                goto out;
1121
1122        slot = root->rnode;
1123        if (height == 0) {
1124                root_tag_clear_all(root);
1125                root->rnode = NULL;
1126                goto out;
1127        }
1128        slot = radix_tree_indirect_to_ptr(slot);
1129
1130        shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
1131        pathp->node = NULL;
1132
1133        do {
1134                if (slot == NULL)
1135                        goto out;
1136
1137                pathp++;
1138                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
1139                pathp->offset = offset;
1140                pathp->node = slot;
1141                slot = slot->slots[offset];
1142                shift -= RADIX_TREE_MAP_SHIFT;
1143                height--;
1144        } while (height > 0);
1145
1146        if (slot == NULL)
1147                goto out;
1148
1149        /*
1150         * Clear all tags associated with the just-deleted item
1151         */
1152        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
1153                if (tag_get(pathp->node, tag, pathp->offset))
1154                        radix_tree_tag_clear(root, index, tag);
1155        }
1156
1157        to_free = NULL;
1158        /* Now free the nodes we do not need anymore */
1159        while (pathp->node) {
1160                pathp->node->slots[pathp->offset] = NULL;
1161                pathp->node->count--;
1162                /*
1163                 * Queue the node for deferred freeing after the
1164                 * last reference to it disappears (set NULL, above).
1165                 */
1166                if (to_free)
1167                        radix_tree_node_free(to_free);
1168
1169                if (pathp->node->count) {
1170                        if (pathp->node ==
1171                                        radix_tree_indirect_to_ptr(root->rnode))
1172                                radix_tree_shrink(root);
1173                        goto out;
1174                }
1175
1176                /* Node with zero slots in use so free it */
1177                to_free = pathp->node;
1178                pathp--;
1179
1180        }
1181        root_tag_clear_all(root);
1182        root->height = 0;
1183        root->rnode = NULL;
1184        if (to_free)
1185                radix_tree_node_free(to_free);
1186
1187out:
1188        return slot;
1189}
1190EXPORT_SYMBOL(radix_tree_delete);
1191
1192/**
1193 *      radix_tree_tagged - test whether any items in the tree are tagged
1194 *      @root:          radix tree root
1195 *      @tag:           tag to test
1196 */
1197int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
1198{
1199        return root_tag_get(root, tag);
1200}
1201EXPORT_SYMBOL(radix_tree_tagged);
1202
1203static void
1204radix_tree_node_ctor(void *node)
1205{
1206        memset(node, 0, sizeof(struct radix_tree_node));
1207}
1208
1209static __init unsigned long __maxindex(unsigned int height)
1210{
1211        unsigned int width = height * RADIX_TREE_MAP_SHIFT;
1212        int shift = RADIX_TREE_INDEX_BITS - width;
1213
1214        if (shift < 0)
1215                return ~0UL;
1216        if (shift >= BITS_PER_LONG)
1217                return 0UL;
1218        return ~0UL >> shift;
1219}
1220
1221static __init void radix_tree_init_maxindex(void)
1222{
1223        unsigned int i;
1224
1225        for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
1226                height_to_maxindex[i] = __maxindex(i);
1227}
1228
1229static int radix_tree_callback(struct notifier_block *nfb,
1230                            unsigned long action,
1231                            void *hcpu)
1232{
1233       int cpu = (long)hcpu;
1234       struct radix_tree_preload *rtp;
1235
1236       /* Free per-cpu pool of perloaded nodes */
1237       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
1238               rtp = &per_cpu(radix_tree_preloads, cpu);
1239               while (rtp->nr) {
1240                       kmem_cache_free(radix_tree_node_cachep,
1241                                       rtp->nodes[rtp->nr-1]);
1242                       rtp->nodes[rtp->nr-1] = NULL;
1243                       rtp->nr--;
1244               }
1245       }
1246       return NOTIFY_OK;
1247}
1248
1249void __init radix_tree_init(void)
1250{
1251        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
1252                        sizeof(struct radix_tree_node), 0,
1253                        SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1254                        radix_tree_node_ctor);
1255        radix_tree_init_maxindex();
1256        hotcpu_notifier(radix_tree_callback, 0);
1257}
1258