linux/lib/rhashtable.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Resizable, Scalable, Concurrent Hash Table
   4 *
   5 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
   6 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
   7 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
   8 *
   9 * Code partially derived from nft_hash
  10 * Rewritten with rehash code from br_multicast plus single list
  11 * pointer as suggested by Josh Triplett
  12 */
  13
  14#include <linux/atomic.h>
  15#include <linux/kernel.h>
  16#include <linux/init.h>
  17#include <linux/log2.h>
  18#include <linux/sched.h>
  19#include <linux/rculist.h>
  20#include <linux/slab.h>
  21#include <linux/vmalloc.h>
  22#include <linux/mm.h>
  23#include <linux/jhash.h>
  24#include <linux/random.h>
  25#include <linux/rhashtable.h>
  26#include <linux/err.h>
  27#include <linux/export.h>
  28
  29#define HASH_DEFAULT_SIZE       64UL
  30#define HASH_MIN_SIZE           4U
  31
  32union nested_table {
  33        union nested_table __rcu *table;
  34        struct rhash_lock_head *bucket;
  35};
  36
  37static u32 head_hashfn(struct rhashtable *ht,
  38                       const struct bucket_table *tbl,
  39                       const struct rhash_head *he)
  40{
  41        return rht_head_hashfn(ht, tbl, he, ht->p);
  42}
  43
  44#ifdef CONFIG_PROVE_LOCKING
  45#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
  46
  47int lockdep_rht_mutex_is_held(struct rhashtable *ht)
  48{
  49        return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
  50}
  51EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
  52
  53int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
  54{
  55        if (!debug_locks)
  56                return 1;
  57        if (unlikely(tbl->nest))
  58                return 1;
  59        return bit_spin_is_locked(0, (unsigned long *)&tbl->buckets[hash]);
  60}
  61EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
  62#else
  63#define ASSERT_RHT_MUTEX(HT)
  64#endif
  65
  66static void nested_table_free(union nested_table *ntbl, unsigned int size)
  67{
  68        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
  69        const unsigned int len = 1 << shift;
  70        unsigned int i;
  71
  72        ntbl = rcu_dereference_raw(ntbl->table);
  73        if (!ntbl)
  74                return;
  75
  76        if (size > len) {
  77                size >>= shift;
  78                for (i = 0; i < len; i++)
  79                        nested_table_free(ntbl + i, size);
  80        }
  81
  82        kfree(ntbl);
  83}
  84
  85static void nested_bucket_table_free(const struct bucket_table *tbl)
  86{
  87        unsigned int size = tbl->size >> tbl->nest;
  88        unsigned int len = 1 << tbl->nest;
  89        union nested_table *ntbl;
  90        unsigned int i;
  91
  92        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
  93
  94        for (i = 0; i < len; i++)
  95                nested_table_free(ntbl + i, size);
  96
  97        kfree(ntbl);
  98}
  99
 100static void bucket_table_free(const struct bucket_table *tbl)
 101{
 102        if (tbl->nest)
 103                nested_bucket_table_free(tbl);
 104
 105        kvfree(tbl);
 106}
 107
 108static void bucket_table_free_rcu(struct rcu_head *head)
 109{
 110        bucket_table_free(container_of(head, struct bucket_table, rcu));
 111}
 112
 113static union nested_table *nested_table_alloc(struct rhashtable *ht,
 114                                              union nested_table __rcu **prev,
 115                                              bool leaf)
 116{
 117        union nested_table *ntbl;
 118        int i;
 119
 120        ntbl = rcu_dereference(*prev);
 121        if (ntbl)
 122                return ntbl;
 123
 124        ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
 125
 126        if (ntbl && leaf) {
 127                for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
 128                        INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
 129        }
 130
 131        if (cmpxchg((union nested_table **)prev, NULL, ntbl) == NULL)
 132                return ntbl;
 133        /* Raced with another thread. */
 134        kfree(ntbl);
 135        return rcu_dereference(*prev);
 136}
 137
 138static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
 139                                                      size_t nbuckets,
 140                                                      gfp_t gfp)
 141{
 142        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
 143        struct bucket_table *tbl;
 144        size_t size;
 145
 146        if (nbuckets < (1 << (shift + 1)))
 147                return NULL;
 148
 149        size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
 150
 151        tbl = kzalloc(size, gfp);
 152        if (!tbl)
 153                return NULL;
 154
 155        if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
 156                                false)) {
 157                kfree(tbl);
 158                return NULL;
 159        }
 160
 161        tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
 162
 163        return tbl;
 164}
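
/*
 * Worked example of the nested-table geometry above (editorial note, not
 * part of the original source): with 4 KiB pages and 8-byte pointers,
 * shift = PAGE_SHIFT - ilog2(sizeof(void *)) = 12 - 3 = 9, so one page of
 * union nested_table holds 2^9 = 512 slots and nesting is only used for
 * nbuckets >= 2^10.  For nbuckets = 2^20, tbl->nest = (20 - 1) % 9 + 1 = 2,
 * i.e. the top level consumes two hash bits and the remaining 18 bits are
 * resolved by two further levels of nine bits each.
 */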
 165
 166static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 167                                               size_t nbuckets,
 168                                               gfp_t gfp)
 169{
 170        struct bucket_table *tbl = NULL;
 171        size_t size;
 172        int i;
 173        static struct lock_class_key __key;
 174
 175        tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
 176
 177        size = nbuckets;
 178
 179        if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
 180                tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
 181                nbuckets = 0;
 182        }
 183
 184        if (tbl == NULL)
 185                return NULL;
 186
 187        lockdep_init_map(&tbl->dep_map, "rhashtable_bucket", &__key, 0);
 188
 189        tbl->size = size;
 190
 191        rcu_head_init(&tbl->rcu);
 192        INIT_LIST_HEAD(&tbl->walkers);
 193
 194        tbl->hash_rnd = get_random_u32();
 195
 196        for (i = 0; i < nbuckets; i++)
 197                INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
 198
 199        return tbl;
 200}
 201
 202static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
 203                                                  struct bucket_table *tbl)
 204{
 205        struct bucket_table *new_tbl;
 206
 207        do {
 208                new_tbl = tbl;
 209                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 210        } while (tbl);
 211
 212        return new_tbl;
 213}
 214
 215static int rhashtable_rehash_one(struct rhashtable *ht,
 216                                 struct rhash_lock_head **bkt,
 217                                 unsigned int old_hash)
 218{
 219        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 220        struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
 221        int err = -EAGAIN;
 222        struct rhash_head *head, *next, *entry;
 223        struct rhash_head __rcu **pprev = NULL;
 224        unsigned int new_hash;
 225
 226        if (new_tbl->nest)
 227                goto out;
 228
 229        err = -ENOENT;
 230
 231        rht_for_each_from(entry, rht_ptr(bkt, old_tbl, old_hash),
 232                          old_tbl, old_hash) {
 233                err = 0;
 234                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 235
 236                if (rht_is_a_nulls(next))
 237                        break;
 238
 239                pprev = &entry->next;
 240        }
 241
 242        if (err)
 243                goto out;
 244
 245        new_hash = head_hashfn(ht, new_tbl, entry);
 246
 247        rht_lock_nested(new_tbl, &new_tbl->buckets[new_hash], SINGLE_DEPTH_NESTING);
 248
 249        head = rht_ptr(new_tbl->buckets + new_hash, new_tbl, new_hash);
 250
 251        RCU_INIT_POINTER(entry->next, head);
 252
 253        rht_assign_unlock(new_tbl, &new_tbl->buckets[new_hash], entry);
 254
 255        if (pprev)
 256                rcu_assign_pointer(*pprev, next);
 257        else
  258                /* Need to preserve the bit lock. */
 259                rht_assign_locked(bkt, next);
 260
 261out:
 262        return err;
 263}
 264
 265static int rhashtable_rehash_chain(struct rhashtable *ht,
 266                                    unsigned int old_hash)
 267{
 268        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 269        struct rhash_lock_head **bkt = rht_bucket_var(old_tbl, old_hash);
 270        int err;
 271
 272        if (!bkt)
 273                return 0;
 274        rht_lock(old_tbl, bkt);
 275
 276        while (!(err = rhashtable_rehash_one(ht, bkt, old_hash)))
 277                ;
 278
 279        if (err == -ENOENT)
 280                err = 0;
 281        rht_unlock(old_tbl, bkt);
 282
 283        return err;
 284}
 285
 286static int rhashtable_rehash_attach(struct rhashtable *ht,
 287                                    struct bucket_table *old_tbl,
 288                                    struct bucket_table *new_tbl)
 289{
 290        /* Make insertions go into the new, empty table right away. Deletions
 291         * and lookups will be attempted in both tables until we synchronize.
 292         * As cmpxchg() provides strong barriers, we do not need
 293         * rcu_assign_pointer().
 294         */
 295
 296        if (cmpxchg((struct bucket_table **)&old_tbl->future_tbl, NULL,
 297                    new_tbl) != NULL)
 298                return -EEXIST;
 299
 300        return 0;
 301}
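
/*
 * Reader-side sketch (editorial illustration, not part of the original
 * source) of how the future_tbl link installed above is consumed: a lookup
 * that misses in one table follows future_tbl and retries, in the spirit of
 * __rhashtable_lookup() in <linux/rhashtable.h>.  search_one_bucket() is a
 * hypothetical helper standing in for the usual bucket scan.
 *
 *	tbl = rht_dereference_rcu(ht->tbl, ht);
 *	do {
 *		obj = search_one_bucket(ht, tbl, key);
 *		if (obj)
 *			break;
 *		smp_rmb();	// ensure we see newly attached tables
 *	} while ((tbl = rht_dereference_rcu(tbl->future_tbl, ht)) != NULL);
 */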
 302
 303static int rhashtable_rehash_table(struct rhashtable *ht)
 304{
 305        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 306        struct bucket_table *new_tbl;
 307        struct rhashtable_walker *walker;
 308        unsigned int old_hash;
 309        int err;
 310
 311        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 312        if (!new_tbl)
 313                return 0;
 314
 315        for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
 316                err = rhashtable_rehash_chain(ht, old_hash);
 317                if (err)
 318                        return err;
 319                cond_resched();
 320        }
 321
 322        /* Publish the new table pointer. */
 323        rcu_assign_pointer(ht->tbl, new_tbl);
 324
 325        spin_lock(&ht->lock);
 326        list_for_each_entry(walker, &old_tbl->walkers, list)
 327                walker->tbl = NULL;
 328
 329        /* Wait for readers. All new readers will see the new
 330         * table, and thus no references to the old table will
 331         * remain.
 332         * We do this inside the locked region so that
 333         * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
 334         * to check if it should not re-link the table.
 335         */
 336        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
 337        spin_unlock(&ht->lock);
 338
 339        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 340}
 341
 342static int rhashtable_rehash_alloc(struct rhashtable *ht,
 343                                   struct bucket_table *old_tbl,
 344                                   unsigned int size)
 345{
 346        struct bucket_table *new_tbl;
 347        int err;
 348
 349        ASSERT_RHT_MUTEX(ht);
 350
 351        new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
 352        if (new_tbl == NULL)
 353                return -ENOMEM;
 354
 355        err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
 356        if (err)
 357                bucket_table_free(new_tbl);
 358
 359        return err;
 360}
 361
 362/**
 363 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 364 * @ht:         the hash table to shrink
 365 *
  366 * This function shrinks the hash table to fit, i.e., to the smallest
  367 * size that would not cause it to expand right away automatically.
 368 *
 369 * The caller must ensure that no concurrent resizing occurs by holding
 370 * ht->mutex.
 371 *
  372 * It is however valid to have concurrent lookups and traversals if they
  373 * are RCU protected.
  374 *
  375 * It is also valid to have concurrent insertions and deletions protected
  376 * by per-bucket locks.
 377 */
 378static int rhashtable_shrink(struct rhashtable *ht)
 379{
 380        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 381        unsigned int nelems = atomic_read(&ht->nelems);
 382        unsigned int size = 0;
 383
 384        if (nelems)
 385                size = roundup_pow_of_two(nelems * 3 / 2);
 386        if (size < ht->p.min_size)
 387                size = ht->p.min_size;
 388
 389        if (old_tbl->size <= size)
 390                return 0;
 391
 392        if (rht_dereference(old_tbl->future_tbl, ht))
 393                return -EEXIST;
 394
 395        return rhashtable_rehash_alloc(ht, old_tbl, size);
 396}
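
/*
 * Worked sizing example (editorial note, not part of the original source):
 * with nelems = 500 the target is roundup_pow_of_two(500 * 3 / 2) =
 * roundup_pow_of_two(750) = 1024 buckets, so a 2048-bucket table shrinks to
 * 1024 while a 1024-bucket table is already small enough and is left alone.
 * The 3/2 factor keeps the shrunk table below the 75% growth threshold
 * (500 < 768).
 */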
 397
 398static void rht_deferred_worker(struct work_struct *work)
 399{
 400        struct rhashtable *ht;
 401        struct bucket_table *tbl;
 402        int err = 0;
 403
 404        ht = container_of(work, struct rhashtable, run_work);
 405        mutex_lock(&ht->mutex);
 406
 407        tbl = rht_dereference(ht->tbl, ht);
 408        tbl = rhashtable_last_table(ht, tbl);
 409
 410        if (rht_grow_above_75(ht, tbl))
 411                err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
 412        else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
 413                err = rhashtable_shrink(ht);
 414        else if (tbl->nest)
 415                err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 416
 417        if (!err || err == -EEXIST) {
 418                int nerr;
 419
 420                nerr = rhashtable_rehash_table(ht);
 421                err = err ?: nerr;
 422        }
 423
 424        mutex_unlock(&ht->mutex);
 425
 426        if (err)
 427                schedule_work(&ht->run_work);
 428}
 429
 430static int rhashtable_insert_rehash(struct rhashtable *ht,
 431                                    struct bucket_table *tbl)
 432{
 433        struct bucket_table *old_tbl;
 434        struct bucket_table *new_tbl;
 435        unsigned int size;
 436        int err;
 437
 438        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 439
 440        size = tbl->size;
 441
 442        err = -EBUSY;
 443
 444        if (rht_grow_above_75(ht, tbl))
 445                size *= 2;
 446        /* Do not schedule more than one rehash */
 447        else if (old_tbl != tbl)
 448                goto fail;
 449
 450        err = -ENOMEM;
 451
 452        new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
 453        if (new_tbl == NULL)
 454                goto fail;
 455
 456        err = rhashtable_rehash_attach(ht, tbl, new_tbl);
 457        if (err) {
 458                bucket_table_free(new_tbl);
 459                if (err == -EEXIST)
 460                        err = 0;
 461        } else
 462                schedule_work(&ht->run_work);
 463
 464        return err;
 465
 466fail:
 467        /* Do not fail the insert if someone else did a rehash. */
 468        if (likely(rcu_access_pointer(tbl->future_tbl)))
 469                return 0;
 470
 471        /* Schedule async rehash to retry allocation in process context. */
 472        if (err == -ENOMEM)
 473                schedule_work(&ht->run_work);
 474
 475        return err;
 476}
 477
 478static void *rhashtable_lookup_one(struct rhashtable *ht,
 479                                   struct rhash_lock_head **bkt,
 480                                   struct bucket_table *tbl, unsigned int hash,
 481                                   const void *key, struct rhash_head *obj)
 482{
 483        struct rhashtable_compare_arg arg = {
 484                .ht = ht,
 485                .key = key,
 486        };
 487        struct rhash_head __rcu **pprev = NULL;
 488        struct rhash_head *head;
 489        int elasticity;
 490
 491        elasticity = RHT_ELASTICITY;
 492        rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) {
 493                struct rhlist_head *list;
 494                struct rhlist_head *plist;
 495
 496                elasticity--;
 497                if (!key ||
 498                    (ht->p.obj_cmpfn ?
 499                     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
 500                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
 501                        pprev = &head->next;
 502                        continue;
 503                }
 504
 505                if (!ht->rhlist)
 506                        return rht_obj(ht, head);
 507
 508                list = container_of(obj, struct rhlist_head, rhead);
 509                plist = container_of(head, struct rhlist_head, rhead);
 510
 511                RCU_INIT_POINTER(list->next, plist);
 512                head = rht_dereference_bucket(head->next, tbl, hash);
 513                RCU_INIT_POINTER(list->rhead.next, head);
 514                if (pprev)
 515                        rcu_assign_pointer(*pprev, obj);
 516                else
 517                        /* Need to preserve the bit lock */
 518                        rht_assign_locked(bkt, obj);
 519
 520                return NULL;
 521        }
 522
 523        if (elasticity <= 0)
 524                return ERR_PTR(-EAGAIN);
 525
 526        return ERR_PTR(-ENOENT);
 527}
 528
 529static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 530                                                  struct rhash_lock_head **bkt,
 531                                                  struct bucket_table *tbl,
 532                                                  unsigned int hash,
 533                                                  struct rhash_head *obj,
 534                                                  void *data)
 535{
 536        struct bucket_table *new_tbl;
 537        struct rhash_head *head;
 538
 539        if (!IS_ERR_OR_NULL(data))
 540                return ERR_PTR(-EEXIST);
 541
 542        if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
 543                return ERR_CAST(data);
 544
 545        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 546        if (new_tbl)
 547                return new_tbl;
 548
 549        if (PTR_ERR(data) != -ENOENT)
 550                return ERR_CAST(data);
 551
 552        if (unlikely(rht_grow_above_max(ht, tbl)))
 553                return ERR_PTR(-E2BIG);
 554
 555        if (unlikely(rht_grow_above_100(ht, tbl)))
 556                return ERR_PTR(-EAGAIN);
 557
 558        head = rht_ptr(bkt, tbl, hash);
 559
 560        RCU_INIT_POINTER(obj->next, head);
 561        if (ht->rhlist) {
 562                struct rhlist_head *list;
 563
 564                list = container_of(obj, struct rhlist_head, rhead);
 565                RCU_INIT_POINTER(list->next, NULL);
 566        }
 567
 568        /* bkt is always the head of the list, so it holds
 569         * the lock, which we need to preserve
 570         */
 571        rht_assign_locked(bkt, obj);
 572
 573        atomic_inc(&ht->nelems);
 574        if (rht_grow_above_75(ht, tbl))
 575                schedule_work(&ht->run_work);
 576
 577        return NULL;
 578}
 579
 580static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 581                                   struct rhash_head *obj)
 582{
 583        struct bucket_table *new_tbl;
 584        struct bucket_table *tbl;
 585        struct rhash_lock_head **bkt;
 586        unsigned int hash;
 587        void *data;
 588
 589        new_tbl = rcu_dereference(ht->tbl);
 590
 591        do {
 592                tbl = new_tbl;
 593                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
 594                if (rcu_access_pointer(tbl->future_tbl))
 595                        /* Failure is OK */
 596                        bkt = rht_bucket_var(tbl, hash);
 597                else
 598                        bkt = rht_bucket_insert(ht, tbl, hash);
 599                if (bkt == NULL) {
 600                        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 601                        data = ERR_PTR(-EAGAIN);
 602                } else {
 603                        rht_lock(tbl, bkt);
 604                        data = rhashtable_lookup_one(ht, bkt, tbl,
 605                                                     hash, key, obj);
 606                        new_tbl = rhashtable_insert_one(ht, bkt, tbl,
 607                                                        hash, obj, data);
 608                        if (PTR_ERR(new_tbl) != -EEXIST)
 609                                data = ERR_CAST(new_tbl);
 610
 611                        rht_unlock(tbl, bkt);
 612                }
 613        } while (!IS_ERR_OR_NULL(new_tbl));
 614
 615        if (PTR_ERR(data) == -EAGAIN)
 616                data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
 617                               -EAGAIN);
 618
 619        return data;
 620}
 621
 622void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 623                             struct rhash_head *obj)
 624{
 625        void *data;
 626
 627        do {
 628                rcu_read_lock();
 629                data = rhashtable_try_insert(ht, key, obj);
 630                rcu_read_unlock();
 631        } while (PTR_ERR(data) == -EAGAIN);
 632
 633        return data;
 634}
 635EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
 636
 637/**
 638 * rhashtable_walk_enter - Initialise an iterator
 639 * @ht:         Table to walk over
 640 * @iter:       Hash table Iterator
 641 *
 642 * This function prepares a hash table walk.
 643 *
 644 * Note that if you restart a walk after rhashtable_walk_stop you
 645 * may see the same object twice.  Also, you may miss objects if
 646 * there are removals in between rhashtable_walk_stop and the next
 647 * call to rhashtable_walk_start.
 648 *
 649 * For a completely stable walk you should construct your own data
 650 * structure outside the hash table.
 651 *
 652 * This function may be called from any process context, including
 653 * non-preemptable context, but cannot be called from softirq or
 654 * hardirq context.
 655 *
 656 * You must call rhashtable_walk_exit after this function returns.
 657 */
 658void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
 659{
 660        iter->ht = ht;
 661        iter->p = NULL;
 662        iter->slot = 0;
 663        iter->skip = 0;
 664        iter->end_of_table = 0;
 665
 666        spin_lock(&ht->lock);
 667        iter->walker.tbl =
 668                rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
 669        list_add(&iter->walker.list, &iter->walker.tbl->walkers);
 670        spin_unlock(&ht->lock);
 671}
 672EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
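
/*
 * Illustrative walker (editorial sketch, not part of the original source):
 * the enter/start/next/stop/exit sequence described above, tolerating the
 * -EAGAIN that a concurrent resize may inject.  The callback and its
 * argument are hypothetical; cb runs under the RCU read lock held between
 * rhashtable_walk_start() and rhashtable_walk_stop() and must not sleep.
 */
static void example_rht_walk(struct rhashtable *ht,
                             void (*cb)(void *obj, void *arg), void *arg)
{
        struct rhashtable_iter iter;
        void *obj;

        rhashtable_walk_enter(ht, &iter);
        rhashtable_walk_start(&iter);

        while ((obj = rhashtable_walk_next(&iter)) != NULL) {
                if (IS_ERR(obj)) {
                        /* -EAGAIN means the iterator was rewound by a resize. */
                        if (PTR_ERR(obj) == -EAGAIN)
                                continue;
                        break;
                }
                cb(obj, arg);
        }

        rhashtable_walk_stop(&iter);
        rhashtable_walk_exit(&iter);
}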
 673
 674/**
 675 * rhashtable_walk_exit - Free an iterator
 676 * @iter:       Hash table Iterator
 677 *
 678 * This function frees resources allocated by rhashtable_walk_enter.
 679 */
 680void rhashtable_walk_exit(struct rhashtable_iter *iter)
 681{
 682        spin_lock(&iter->ht->lock);
 683        if (iter->walker.tbl)
 684                list_del(&iter->walker.list);
 685        spin_unlock(&iter->ht->lock);
 686}
 687EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 688
 689/**
 690 * rhashtable_walk_start_check - Start a hash table walk
 691 * @iter:       Hash table iterator
 692 *
 693 * Start a hash table walk at the current iterator position.  Note that we take
 694 * the RCU lock in all cases including when we return an error.  So you must
 695 * always call rhashtable_walk_stop to clean up.
 696 *
 697 * Returns zero if successful.
 698 *
  699 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
  700 * will rewind to the beginning and you may use it immediately
  701 * by calling rhashtable_walk_next.
 702 *
 703 * rhashtable_walk_start is defined as an inline variant that returns
 704 * void. This is preferred in cases where the caller would ignore
 705 * resize events and always continue.
 706 */
 707int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 708        __acquires(RCU)
 709{
 710        struct rhashtable *ht = iter->ht;
 711        bool rhlist = ht->rhlist;
 712
 713        rcu_read_lock();
 714
 715        spin_lock(&ht->lock);
 716        if (iter->walker.tbl)
 717                list_del(&iter->walker.list);
 718        spin_unlock(&ht->lock);
 719
 720        if (iter->end_of_table)
 721                return 0;
 722        if (!iter->walker.tbl) {
 723                iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 724                iter->slot = 0;
 725                iter->skip = 0;
 726                return -EAGAIN;
 727        }
 728
 729        if (iter->p && !rhlist) {
 730                /*
 731                 * We need to validate that 'p' is still in the table, and
 732                 * if so, update 'skip'
 733                 */
 734                struct rhash_head *p;
 735                int skip = 0;
 736                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
 737                        skip++;
 738                        if (p == iter->p) {
 739                                iter->skip = skip;
 740                                goto found;
 741                        }
 742                }
 743                iter->p = NULL;
 744        } else if (iter->p && rhlist) {
 745                /* Need to validate that 'list' is still in the table, and
 746                 * if so, update 'skip' and 'p'.
 747                 */
 748                struct rhash_head *p;
 749                struct rhlist_head *list;
 750                int skip = 0;
 751                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
 752                        for (list = container_of(p, struct rhlist_head, rhead);
 753                             list;
 754                             list = rcu_dereference(list->next)) {
 755                                skip++;
 756                                if (list == iter->list) {
 757                                        iter->p = p;
 758                                        iter->skip = skip;
 759                                        goto found;
 760                                }
 761                        }
 762                }
 763                iter->p = NULL;
 764        }
 765found:
 766        return 0;
 767}
 768EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
 769
 770/**
 771 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 772 * one in case of a new walk).
 773 *
 774 * @iter:       Hash table iterator
 775 *
 776 * Returns the found object or NULL when the end of the table is reached.
 777 *
  778 * Returns -EAGAIN if a resize event occurred.
 779 */
 780static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
 781{
 782        struct bucket_table *tbl = iter->walker.tbl;
 783        struct rhlist_head *list = iter->list;
 784        struct rhashtable *ht = iter->ht;
 785        struct rhash_head *p = iter->p;
 786        bool rhlist = ht->rhlist;
 787
 788        if (!tbl)
 789                return NULL;
 790
 791        for (; iter->slot < tbl->size; iter->slot++) {
 792                int skip = iter->skip;
 793
 794                rht_for_each_rcu(p, tbl, iter->slot) {
 795                        if (rhlist) {
 796                                list = container_of(p, struct rhlist_head,
 797                                                    rhead);
 798                                do {
 799                                        if (!skip)
 800                                                goto next;
 801                                        skip--;
 802                                        list = rcu_dereference(list->next);
 803                                } while (list);
 804
 805                                continue;
 806                        }
 807                        if (!skip)
 808                                break;
 809                        skip--;
 810                }
 811
 812next:
 813                if (!rht_is_a_nulls(p)) {
 814                        iter->skip++;
 815                        iter->p = p;
 816                        iter->list = list;
 817                        return rht_obj(ht, rhlist ? &list->rhead : p);
 818                }
 819
 820                iter->skip = 0;
 821        }
 822
 823        iter->p = NULL;
 824
 825        /* Ensure we see any new tables. */
 826        smp_rmb();
 827
 828        iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 829        if (iter->walker.tbl) {
 830                iter->slot = 0;
 831                iter->skip = 0;
 832                return ERR_PTR(-EAGAIN);
 833        } else {
 834                iter->end_of_table = true;
 835        }
 836
 837        return NULL;
 838}
 839
 840/**
 841 * rhashtable_walk_next - Return the next object and advance the iterator
 842 * @iter:       Hash table iterator
 843 *
 844 * Note that you must call rhashtable_walk_stop when you are finished
 845 * with the walk.
 846 *
 847 * Returns the next object or NULL when the end of the table is reached.
 848 *
  849 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
  850 * will rewind to the beginning and you may continue to use it.
 851 */
 852void *rhashtable_walk_next(struct rhashtable_iter *iter)
 853{
 854        struct rhlist_head *list = iter->list;
 855        struct rhashtable *ht = iter->ht;
 856        struct rhash_head *p = iter->p;
 857        bool rhlist = ht->rhlist;
 858
 859        if (p) {
 860                if (!rhlist || !(list = rcu_dereference(list->next))) {
 861                        p = rcu_dereference(p->next);
 862                        list = container_of(p, struct rhlist_head, rhead);
 863                }
 864                if (!rht_is_a_nulls(p)) {
 865                        iter->skip++;
 866                        iter->p = p;
 867                        iter->list = list;
 868                        return rht_obj(ht, rhlist ? &list->rhead : p);
 869                }
 870
  871                /* At the end of this slot, switch to the next one and then
  872                 * find the next entry from that point.
  873                 */
 874                iter->skip = 0;
 875                iter->slot++;
 876        }
 877
 878        return __rhashtable_walk_find_next(iter);
 879}
 880EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 881
 882/**
 883 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 884 * @iter:       Hash table iterator
 885 *
 886 * Returns the next object or NULL when the end of the table is reached.
 887 *
  888 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
  889 * will rewind to the beginning and you may continue to use it.
 890 */
 891void *rhashtable_walk_peek(struct rhashtable_iter *iter)
 892{
 893        struct rhlist_head *list = iter->list;
 894        struct rhashtable *ht = iter->ht;
 895        struct rhash_head *p = iter->p;
 896
 897        if (p)
 898                return rht_obj(ht, ht->rhlist ? &list->rhead : p);
 899
 900        /* No object found in current iter, find next one in the table. */
 901
 902        if (iter->skip) {
 903                /* A nonzero skip value points to the next entry in the table
  904                 * beyond the last one that was found. Decrement skip so
 905                 * we find the current value. __rhashtable_walk_find_next
 906                 * will restore the original value of skip assuming that
 907                 * the table hasn't changed.
 908                 */
 909                iter->skip--;
 910        }
 911
 912        return __rhashtable_walk_find_next(iter);
 913}
 914EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
 915
 916/**
 917 * rhashtable_walk_stop - Finish a hash table walk
 918 * @iter:       Hash table iterator
 919 *
 920 * Finish a hash table walk.  Does not reset the iterator to the start of the
 921 * hash table.
 922 */
 923void rhashtable_walk_stop(struct rhashtable_iter *iter)
 924        __releases(RCU)
 925{
 926        struct rhashtable *ht;
 927        struct bucket_table *tbl = iter->walker.tbl;
 928
 929        if (!tbl)
 930                goto out;
 931
 932        ht = iter->ht;
 933
 934        spin_lock(&ht->lock);
 935        if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
 936                /* This bucket table is being freed, don't re-link it. */
 937                iter->walker.tbl = NULL;
 938        else
 939                list_add(&iter->walker.list, &tbl->walkers);
 940        spin_unlock(&ht->lock);
 941
 942out:
 943        rcu_read_unlock();
 944}
 945EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
 946
 947static size_t rounded_hashtable_size(const struct rhashtable_params *params)
 948{
 949        size_t retsize;
 950
 951        if (params->nelem_hint)
 952                retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
 953                              (unsigned long)params->min_size);
 954        else
 955                retsize = max(HASH_DEFAULT_SIZE,
 956                              (unsigned long)params->min_size);
 957
 958        return retsize;
 959}
 960
 961static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
 962{
 963        return jhash2(key, length, seed);
 964}
 965
 966/**
 967 * rhashtable_init - initialize a new hash table
 968 * @ht:         hash table to be initialized
 969 * @params:     configuration parameters
 970 *
 971 * Initializes a new hash table based on the provided configuration
 972 * parameters. A table can be configured either with a variable or
 973 * fixed length key:
 974 *
 975 * Configuration Example 1: Fixed length keys
 976 * struct test_obj {
 977 *      int                     key;
 978 *      void *                  my_member;
 979 *      struct rhash_head       node;
 980 * };
 981 *
 982 * struct rhashtable_params params = {
 983 *      .head_offset = offsetof(struct test_obj, node),
 984 *      .key_offset = offsetof(struct test_obj, key),
 985 *      .key_len = sizeof(int),
 986 *      .hashfn = jhash,
 987 * };
 988 *
 989 * Configuration Example 2: Variable length keys
 990 * struct test_obj {
 991 *      [...]
 992 *      struct rhash_head       node;
 993 * };
 994 *
 995 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 996 * {
 997 *      struct test_obj *obj = data;
 998 *
 999 *      return [... hash ...];
1000 * }
1001 *
1002 * struct rhashtable_params params = {
1003 *      .head_offset = offsetof(struct test_obj, node),
1004 *      .hashfn = jhash,
1005 *      .obj_hashfn = my_hash_fn,
1006 * };
1007 */
1008int rhashtable_init(struct rhashtable *ht,
1009                    const struct rhashtable_params *params)
1010{
1011        struct bucket_table *tbl;
1012        size_t size;
1013
1014        if ((!params->key_len && !params->obj_hashfn) ||
1015            (params->obj_hashfn && !params->obj_cmpfn))
1016                return -EINVAL;
1017
1018        memset(ht, 0, sizeof(*ht));
1019        mutex_init(&ht->mutex);
1020        spin_lock_init(&ht->lock);
1021        memcpy(&ht->p, params, sizeof(*params));
1022
1023        if (params->min_size)
1024                ht->p.min_size = roundup_pow_of_two(params->min_size);
1025
1026        /* Cap total entries at 2^31 to avoid nelems overflow. */
1027        ht->max_elems = 1u << 31;
1028
1029        if (params->max_size) {
1030                ht->p.max_size = rounddown_pow_of_two(params->max_size);
1031                if (ht->p.max_size < ht->max_elems / 2)
1032                        ht->max_elems = ht->p.max_size * 2;
1033        }
1034
1035        ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1036
1037        size = rounded_hashtable_size(&ht->p);
1038
1039        ht->key_len = ht->p.key_len;
1040        if (!params->hashfn) {
1041                ht->p.hashfn = jhash;
1042
1043                if (!(ht->key_len & (sizeof(u32) - 1))) {
1044                        ht->key_len /= sizeof(u32);
1045                        ht->p.hashfn = rhashtable_jhash2;
1046                }
1047        }
1048
1049        /*
 1050         * This is API initialization and thus we need to guarantee the
 1051         * initial rhashtable allocation. Upon failure, retry at the
 1052         * smallest possible size with __GFP_NOFAIL semantics.
1053         */
1054        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
1055        if (unlikely(tbl == NULL)) {
1056                size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1057                tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
1058        }
1059
1060        atomic_set(&ht->nelems, 0);
1061
1062        RCU_INIT_POINTER(ht->tbl, tbl);
1063
1064        INIT_WORK(&ht->run_work, rht_deferred_worker);
1065
1066        return 0;
1067}
1068EXPORT_SYMBOL_GPL(rhashtable_init);
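
/*
 * Usage sketch for the fixed-length-key configuration above (editorial
 * illustration, not part of the original source).  "struct test_obj",
 * "test_params" and example_rht_usage() are hypothetical names, the table
 * is assumed to have been set up with rhashtable_init(ht, &test_params),
 * and error handling is abbreviated.
 */
struct test_obj {
        int                     key;
        struct rhash_head       node;
};

static const struct rhashtable_params test_params = {
        .head_offset = offsetof(struct test_obj, node),
        .key_offset = offsetof(struct test_obj, key),
        .key_len = sizeof(int),
        .automatic_shrinking = true,
};

static int example_rht_usage(struct rhashtable *ht, int key)
{
        struct test_obj *obj, *found;
        int err;

        obj = kzalloc(sizeof(*obj), GFP_KERNEL);
        if (!obj)
                return -ENOMEM;
        obj->key = key;

        /* Fails with -EEXIST if the key is already present. */
        err = rhashtable_lookup_insert_fast(ht, &obj->node, test_params);
        if (err) {
                kfree(obj);
                return err;
        }

        /* rhashtable_lookup() requires the RCU read lock. */
        rcu_read_lock();
        found = rhashtable_lookup(ht, &key, test_params);
        rcu_read_unlock();

        /* Removal is by object; concurrent RCU readers may still see it,
         * so real users typically defer freeing, e.g. with kfree_rcu().
         */
        if (found)
                err = rhashtable_remove_fast(ht, &found->node, test_params);
        kfree(obj);
        return err;
}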
1069
1070/**
1071 * rhltable_init - initialize a new hash list table
1072 * @hlt:        hash list table to be initialized
1073 * @params:     configuration parameters
1074 *
1075 * Initializes a new hash list table.
1076 *
1077 * See documentation for rhashtable_init.
1078 */
1079int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
1080{
1081        int err;
1082
1083        err = rhashtable_init(&hlt->ht, params);
1084        hlt->ht.rhlist = true;
1085        return err;
1086}
1087EXPORT_SYMBOL_GPL(rhltable_init);
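
/*
 * Usage sketch for a hash list table (editorial illustration, not part of
 * the original source): duplicate keys are allowed and a lookup returns the
 * chain of all matching entries.  "struct test_rhl_obj", "test_rhl_params"
 * and example_rhl_usage() are hypothetical names; the table is assumed to
 * have been set up with rhltable_init(hlt, &test_rhl_params).
 */
struct test_rhl_obj {
        int                     key;
        struct rhlist_head      list_node;
};

static const struct rhashtable_params test_rhl_params = {
        .head_offset = offsetof(struct test_rhl_obj, list_node),
        .key_offset = offsetof(struct test_rhl_obj, key),
        .key_len = sizeof(int),
};

static void example_rhl_usage(struct rhltable *hlt, struct test_rhl_obj *obj)
{
        struct rhlist_head *list, *pos;
        struct test_rhl_obj *cur;

        /* Duplicate keys do not fail with -EEXIST; they simply chain up. */
        if (rhltable_insert(hlt, &obj->list_node, test_rhl_params))
                return;

        /* Visit every object hashed under the same key. */
        rcu_read_lock();
        list = rhltable_lookup(hlt, &obj->key, test_rhl_params);
        rhl_for_each_entry_rcu(cur, pos, list, list_node) {
                /* cur points at one matching test_rhl_obj */
        }
        rcu_read_unlock();
}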
1088
1089static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
1090                                void (*free_fn)(void *ptr, void *arg),
1091                                void *arg)
1092{
1093        struct rhlist_head *list;
1094
1095        if (!ht->rhlist) {
1096                free_fn(rht_obj(ht, obj), arg);
1097                return;
1098        }
1099
1100        list = container_of(obj, struct rhlist_head, rhead);
1101        do {
1102                obj = &list->rhead;
1103                list = rht_dereference(list->next, ht);
1104                free_fn(rht_obj(ht, obj), arg);
1105        } while (list);
1106}
1107
1108/**
1109 * rhashtable_free_and_destroy - free elements and destroy hash table
1110 * @ht:         the hash table to destroy
1111 * @free_fn:    callback to release resources of element
1112 * @arg:        pointer passed to free_fn
1113 *
 1114 * Stops any pending async resize. If defined, invokes free_fn for each
 1115 * element to release its resources. Please note that RCU protected
 1116 * readers may still be accessing the elements, so resources must be
 1117 * released in a compatible manner. Then frees the bucket array.
 1118 *
 1119 * This function may sleep while waiting for an async resize to
 1120 * complete. The caller is responsible for ensuring that no further
 1121 * write operations occur in parallel.
1122 */
1123void rhashtable_free_and_destroy(struct rhashtable *ht,
1124                                 void (*free_fn)(void *ptr, void *arg),
1125                                 void *arg)
1126{
1127        struct bucket_table *tbl, *next_tbl;
1128        unsigned int i;
1129
1130        cancel_work_sync(&ht->run_work);
1131
1132        mutex_lock(&ht->mutex);
1133        tbl = rht_dereference(ht->tbl, ht);
1134restart:
1135        if (free_fn) {
1136                for (i = 0; i < tbl->size; i++) {
1137                        struct rhash_head *pos, *next;
1138
1139                        cond_resched();
1140                        for (pos = rht_ptr_exclusive(rht_bucket(tbl, i)),
1141                             next = !rht_is_a_nulls(pos) ?
1142                                        rht_dereference(pos->next, ht) : NULL;
1143                             !rht_is_a_nulls(pos);
1144                             pos = next,
1145                             next = !rht_is_a_nulls(pos) ?
1146                                        rht_dereference(pos->next, ht) : NULL)
1147                                rhashtable_free_one(ht, pos, free_fn, arg);
1148                }
1149        }
1150
1151        next_tbl = rht_dereference(tbl->future_tbl, ht);
1152        bucket_table_free(tbl);
1153        if (next_tbl) {
1154                tbl = next_tbl;
1155                goto restart;
1156        }
1157        mutex_unlock(&ht->mutex);
1158}
1159EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
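
/*
 * Teardown sketch (editorial illustration, not part of the original
 * source): free_fn is handed the object pointer computed via head_offset,
 * i.e. the user structure itself, so objects that were individually
 * kmalloc'ed can simply be kfree'd.  Names are hypothetical.
 */
static void example_free_fn(void *ptr, void *arg)
{
        kfree(ptr);
}

static void example_teardown(struct rhashtable *ht)
{
        /* No insertions or removals may run in parallel with this. */
        rhashtable_free_and_destroy(ht, example_free_fn, NULL);
}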
1160
1161void rhashtable_destroy(struct rhashtable *ht)
1162{
1163        return rhashtable_free_and_destroy(ht, NULL, NULL);
1164}
1165EXPORT_SYMBOL_GPL(rhashtable_destroy);
1166
1167struct rhash_lock_head **__rht_bucket_nested(const struct bucket_table *tbl,
1168                                             unsigned int hash)
1169{
1170        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1171        unsigned int index = hash & ((1 << tbl->nest) - 1);
1172        unsigned int size = tbl->size >> tbl->nest;
1173        unsigned int subhash = hash;
1174        union nested_table *ntbl;
1175
1176        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
1177        ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
1178        subhash >>= tbl->nest;
1179
1180        while (ntbl && size > (1 << shift)) {
1181                index = subhash & ((1 << shift) - 1);
1182                ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
1183                                                  tbl, hash);
1184                size >>= shift;
1185                subhash >>= shift;
1186        }
1187
1188        if (!ntbl)
1189                return NULL;
1190
1191        return &ntbl[subhash].bucket;
1192
1193}
1194EXPORT_SYMBOL_GPL(__rht_bucket_nested);
1195
1196struct rhash_lock_head **rht_bucket_nested(const struct bucket_table *tbl,
1197                                           unsigned int hash)
1198{
1199        static struct rhash_lock_head *rhnull;
1200
1201        if (!rhnull)
1202                INIT_RHT_NULLS_HEAD(rhnull);
1203        return __rht_bucket_nested(tbl, hash) ?: &rhnull;
1204}
1205EXPORT_SYMBOL_GPL(rht_bucket_nested);
1206
1207struct rhash_lock_head **rht_bucket_nested_insert(struct rhashtable *ht,
1208                                                  struct bucket_table *tbl,
1209                                                  unsigned int hash)
1210{
1211        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1212        unsigned int index = hash & ((1 << tbl->nest) - 1);
1213        unsigned int size = tbl->size >> tbl->nest;
1214        union nested_table *ntbl;
1215
1216        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
1217        hash >>= tbl->nest;
1218        ntbl = nested_table_alloc(ht, &ntbl[index].table,
1219                                  size <= (1 << shift));
1220
1221        while (ntbl && size > (1 << shift)) {
1222                index = hash & ((1 << shift) - 1);
1223                size >>= shift;
1224                hash >>= shift;
1225                ntbl = nested_table_alloc(ht, &ntbl[index].table,
1226                                          size <= (1 << shift));
1227        }
1228
1229        if (!ntbl)
1230                return NULL;
1231
1232        return &ntbl[hash].bucket;
1233
1234}
1235EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
1236