linux/lib/rhashtable.c
   1/*
   2 * Resizable, Scalable, Concurrent Hash Table
   3 *
   4 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
   5 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
   6 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
   7 *
   8 * Code partially derived from nft_hash
   9 * Rewritten with rehash code from br_multicast plus single list
  10 * pointer as suggested by Josh Triplett
  11 *
  12 * This program is free software; you can redistribute it and/or modify
  13 * it under the terms of the GNU General Public License version 2 as
  14 * published by the Free Software Foundation.
  15 */
  16
  17#include <linux/atomic.h>
  18#include <linux/kernel.h>
  19#include <linux/init.h>
  20#include <linux/log2.h>
  21#include <linux/sched.h>
  22#include <linux/rculist.h>
  23#include <linux/slab.h>
  24#include <linux/vmalloc.h>
  25#include <linux/mm.h>
  26#include <linux/jhash.h>
  27#include <linux/random.h>
  28#include <linux/rhashtable.h>
  29#include <linux/err.h>
  30#include <linux/export.h>
  31
  32#define HASH_DEFAULT_SIZE       64UL
  33#define HASH_MIN_SIZE           4U
  34#define BUCKET_LOCKS_PER_CPU    32UL
  35
  36union nested_table {
  37        union nested_table __rcu *table;
  38        struct rhash_head __rcu *bucket;
  39};
  40
  41static u32 head_hashfn(struct rhashtable *ht,
  42                       const struct bucket_table *tbl,
  43                       const struct rhash_head *he)
  44{
  45        return rht_head_hashfn(ht, tbl, he, ht->p);
  46}
  47
  48#ifdef CONFIG_PROVE_LOCKING
  49#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
  50
  51int lockdep_rht_mutex_is_held(struct rhashtable *ht)
  52{
  53        return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
  54}
  55EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
  56
  57int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
  58{
  59        spinlock_t *lock = rht_bucket_lock(tbl, hash);
  60
  61        return (debug_locks) ? lockdep_is_held(lock) : 1;
  62}
  63EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
  64#else
  65#define ASSERT_RHT_MUTEX(HT)
  66#endif
  67
  68static void nested_table_free(union nested_table *ntbl, unsigned int size)
  69{
  70        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
  71        const unsigned int len = 1 << shift;
  72        unsigned int i;
  73
  74        ntbl = rcu_dereference_raw(ntbl->table);
  75        if (!ntbl)
  76                return;
  77
  78        if (size > len) {
  79                size >>= shift;
  80                for (i = 0; i < len; i++)
  81                        nested_table_free(ntbl + i, size);
  82        }
  83
  84        kfree(ntbl);
  85}
  86
  87static void nested_bucket_table_free(const struct bucket_table *tbl)
  88{
  89        unsigned int size = tbl->size >> tbl->nest;
  90        unsigned int len = 1 << tbl->nest;
  91        union nested_table *ntbl;
  92        unsigned int i;
  93
  94        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
  95
  96        for (i = 0; i < len; i++)
  97                nested_table_free(ntbl + i, size);
  98
  99        kfree(ntbl);
 100}
 101
 102static void bucket_table_free(const struct bucket_table *tbl)
 103{
 104        if (tbl->nest)
 105                nested_bucket_table_free(tbl);
 106
 107        free_bucket_spinlocks(tbl->locks);
 108        kvfree(tbl);
 109}
 110
 111static void bucket_table_free_rcu(struct rcu_head *head)
 112{
 113        bucket_table_free(container_of(head, struct bucket_table, rcu));
 114}
 115
 116static union nested_table *nested_table_alloc(struct rhashtable *ht,
 117                                              union nested_table __rcu **prev,
 118                                              bool leaf)
 119{
 120        union nested_table *ntbl;
 121        int i;
 122
 123        ntbl = rcu_dereference(*prev);
 124        if (ntbl)
 125                return ntbl;
 126
 127        ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
 128
 129        if (ntbl && leaf) {
 130                for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
 131                        INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
 132        }
 133
 134        rcu_assign_pointer(*prev, ntbl);
 135
 136        return ntbl;
 137}
 138
 139static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
 140                                                      size_t nbuckets,
 141                                                      gfp_t gfp)
 142{
 143        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
 144        struct bucket_table *tbl;
 145        size_t size;
 146
 147        if (nbuckets < (1 << (shift + 1)))
 148                return NULL;
 149
 150        size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
 151
 152        tbl = kzalloc(size, gfp);
 153        if (!tbl)
 154                return NULL;
 155
 156        if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
 157                                false)) {
 158                kfree(tbl);
 159                return NULL;
 160        }
 161
 162        tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;
 163
 164        return tbl;
 165}
 166
 167static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 168                                               size_t nbuckets,
 169                                               gfp_t gfp)
 170{
 171        struct bucket_table *tbl = NULL;
 172        size_t size, max_locks;
 173        int i;
 174
 175        size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
 176        tbl = kvzalloc(size, gfp);
 177
 178        size = nbuckets;
 179
 180        if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
 181                tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
 182                nbuckets = 0;
 183        }
 184
 185        if (tbl == NULL)
 186                return NULL;
 187
 188        tbl->size = size;
 189
 190        max_locks = size >> 1;
 191        if (tbl->nest)
 192                max_locks = min_t(size_t, max_locks, 1U << tbl->nest);
 193
 194        if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks,
 195                                   ht->p.locks_mul, gfp) < 0) {
 196                bucket_table_free(tbl);
 197                return NULL;
 198        }
 199
 200        INIT_LIST_HEAD(&tbl->walkers);
 201
 202        tbl->hash_rnd = get_random_u32();
 203
 204        for (i = 0; i < nbuckets; i++)
 205                INIT_RHT_NULLS_HEAD(tbl->buckets[i]);
 206
 207        return tbl;
 208}
 209
 210static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
 211                                                  struct bucket_table *tbl)
 212{
 213        struct bucket_table *new_tbl;
 214
 215        do {
 216                new_tbl = tbl;
 217                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 218        } while (tbl);
 219
 220        return new_tbl;
 221}
 222
 223static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 224{
 225        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 226        struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
 227        struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
 228        int err = -EAGAIN;
 229        struct rhash_head *head, *next, *entry;
 230        spinlock_t *new_bucket_lock;
 231        unsigned int new_hash;
 232
 233        if (new_tbl->nest)
 234                goto out;
 235
 236        err = -ENOENT;
 237
 238        rht_for_each(entry, old_tbl, old_hash) {
 239                err = 0;
 240                next = rht_dereference_bucket(entry->next, old_tbl, old_hash);
 241
 242                if (rht_is_a_nulls(next))
 243                        break;
 244
 245                pprev = &entry->next;
 246        }
 247
 248        if (err)
 249                goto out;
 250
 251        new_hash = head_hashfn(ht, new_tbl, entry);
 252
 253        new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
 254
 255        spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
 256        head = rht_dereference_bucket(new_tbl->buckets[new_hash],
 257                                      new_tbl, new_hash);
 258
 259        RCU_INIT_POINTER(entry->next, head);
 260
 261        rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
 262        spin_unlock(new_bucket_lock);
 263
 264        rcu_assign_pointer(*pprev, next);
 265
 266out:
 267        return err;
 268}
 269
 270static int rhashtable_rehash_chain(struct rhashtable *ht,
 271                                    unsigned int old_hash)
 272{
 273        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 274        spinlock_t *old_bucket_lock;
 275        int err;
 276
 277        old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);
 278
 279        spin_lock_bh(old_bucket_lock);
 280        while (!(err = rhashtable_rehash_one(ht, old_hash)))
 281                ;
 282
 283        if (err == -ENOENT) {
 284                old_tbl->rehash++;
 285                err = 0;
 286        }
 287        spin_unlock_bh(old_bucket_lock);
 288
 289        return err;
 290}
 291
 292static int rhashtable_rehash_attach(struct rhashtable *ht,
 293                                    struct bucket_table *old_tbl,
 294                                    struct bucket_table *new_tbl)
 295{
 296        /* Make insertions go into the new, empty table right away. Deletions
 297         * and lookups will be attempted in both tables until we synchronize.
 298         * As cmpxchg() provides strong barriers, we do not need
 299         * rcu_assign_pointer().
 300         */
 301
 302        if (cmpxchg(&old_tbl->future_tbl, NULL, new_tbl) != NULL)
 303                return -EEXIST;
 304
 305        return 0;
 306}
 307
 308static int rhashtable_rehash_table(struct rhashtable *ht)
 309{
 310        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 311        struct bucket_table *new_tbl;
 312        struct rhashtable_walker *walker;
 313        unsigned int old_hash;
 314        int err;
 315
 316        new_tbl = rht_dereference(old_tbl->future_tbl, ht);
 317        if (!new_tbl)
 318                return 0;
 319
 320        for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
 321                err = rhashtable_rehash_chain(ht, old_hash);
 322                if (err)
 323                        return err;
 324                cond_resched();
 325        }
 326
 327        /* Publish the new table pointer. */
 328        rcu_assign_pointer(ht->tbl, new_tbl);
 329
 330        spin_lock(&ht->lock);
 331        list_for_each_entry(walker, &old_tbl->walkers, list)
 332                walker->tbl = NULL;
 333        spin_unlock(&ht->lock);
 334
 335        /* Wait for readers. All new readers will see the new
 336         * table, and thus no references to the old table will
 337         * remain.
 338         */
 339        call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
 340
 341        return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 342}
 343
 344static int rhashtable_rehash_alloc(struct rhashtable *ht,
 345                                   struct bucket_table *old_tbl,
 346                                   unsigned int size)
 347{
 348        struct bucket_table *new_tbl;
 349        int err;
 350
 351        ASSERT_RHT_MUTEX(ht);
 352
 353        new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
 354        if (new_tbl == NULL)
 355                return -ENOMEM;
 356
 357        err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
 358        if (err)
 359                bucket_table_free(new_tbl);
 360
 361        return err;
 362}
 363
 364/**
 365 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 366 * @ht:         the hash table to shrink
 367 *
  368 * This function shrinks the hash table to fit, i.e., to the smallest
  369 * size that would not cause it to expand right away automatically.
 370 *
 371 * The caller must ensure that no concurrent resizing occurs by holding
 372 * ht->mutex.
 373 *
  374 * It is however valid to have concurrent lookups and traversals as long
  375 * as they are RCU protected.
  376 *
  377 * It is also valid to have concurrent insertions and deletions protected
  378 * by the per bucket locks.
 379 */
 380static int rhashtable_shrink(struct rhashtable *ht)
 381{
 382        struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 383        unsigned int nelems = atomic_read(&ht->nelems);
 384        unsigned int size = 0;
 385
 386        if (nelems)
 387                size = roundup_pow_of_two(nelems * 3 / 2);
 388        if (size < ht->p.min_size)
 389                size = ht->p.min_size;
 390
 391        if (old_tbl->size <= size)
 392                return 0;
 393
 394        if (rht_dereference(old_tbl->future_tbl, ht))
 395                return -EEXIST;
 396
 397        return rhashtable_rehash_alloc(ht, old_tbl, size);
 398}
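
/* A worked example of the sizing rule above (illustrative figures, not taken
 * from this file): with automatic shrinking enabled and nelems down to 500,
 * size = roundup_pow_of_two(500 * 3 / 2) = roundup_pow_of_two(750) = 1024.
 * Because the chosen size is at least 3/2 of nelems, the load factor of the
 * shrunk table is about 2/3 or less, which stays below the 75% threshold
 * checked by rht_grow_above_75(), so shrinking never triggers an immediate
 * expansion.
 */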
 399
 400static void rht_deferred_worker(struct work_struct *work)
 401{
 402        struct rhashtable *ht;
 403        struct bucket_table *tbl;
 404        int err = 0;
 405
 406        ht = container_of(work, struct rhashtable, run_work);
 407        mutex_lock(&ht->mutex);
 408
 409        tbl = rht_dereference(ht->tbl, ht);
 410        tbl = rhashtable_last_table(ht, tbl);
 411
 412        if (rht_grow_above_75(ht, tbl))
 413                err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
 414        else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
 415                err = rhashtable_shrink(ht);
 416        else if (tbl->nest)
 417                err = rhashtable_rehash_alloc(ht, tbl, tbl->size);
 418
 419        if (!err)
 420                err = rhashtable_rehash_table(ht);
 421
 422        mutex_unlock(&ht->mutex);
 423
 424        if (err)
 425                schedule_work(&ht->run_work);
 426}
 427
 428static int rhashtable_insert_rehash(struct rhashtable *ht,
 429                                    struct bucket_table *tbl)
 430{
 431        struct bucket_table *old_tbl;
 432        struct bucket_table *new_tbl;
 433        unsigned int size;
 434        int err;
 435
 436        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 437
 438        size = tbl->size;
 439
 440        err = -EBUSY;
 441
 442        if (rht_grow_above_75(ht, tbl))
 443                size *= 2;
 444        /* Do not schedule more than one rehash */
 445        else if (old_tbl != tbl)
 446                goto fail;
 447
 448        err = -ENOMEM;
 449
 450        new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
 451        if (new_tbl == NULL)
 452                goto fail;
 453
 454        err = rhashtable_rehash_attach(ht, tbl, new_tbl);
 455        if (err) {
 456                bucket_table_free(new_tbl);
 457                if (err == -EEXIST)
 458                        err = 0;
 459        } else
 460                schedule_work(&ht->run_work);
 461
 462        return err;
 463
 464fail:
 465        /* Do not fail the insert if someone else did a rehash. */
 466        if (likely(rcu_access_pointer(tbl->future_tbl)))
 467                return 0;
 468
 469        /* Schedule async rehash to retry allocation in process context. */
 470        if (err == -ENOMEM)
 471                schedule_work(&ht->run_work);
 472
 473        return err;
 474}
 475
 476static void *rhashtable_lookup_one(struct rhashtable *ht,
 477                                   struct bucket_table *tbl, unsigned int hash,
 478                                   const void *key, struct rhash_head *obj)
 479{
 480        struct rhashtable_compare_arg arg = {
 481                .ht = ht,
 482                .key = key,
 483        };
 484        struct rhash_head __rcu **pprev;
 485        struct rhash_head *head;
 486        int elasticity;
 487
 488        elasticity = RHT_ELASTICITY;
 489        pprev = rht_bucket_var(tbl, hash);
 490        rht_for_each_continue(head, *pprev, tbl, hash) {
 491                struct rhlist_head *list;
 492                struct rhlist_head *plist;
 493
 494                elasticity--;
 495                if (!key ||
 496                    (ht->p.obj_cmpfn ?
 497                     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
 498                     rhashtable_compare(&arg, rht_obj(ht, head)))) {
 499                        pprev = &head->next;
 500                        continue;
 501                }
 502
 503                if (!ht->rhlist)
 504                        return rht_obj(ht, head);
 505
 506                list = container_of(obj, struct rhlist_head, rhead);
 507                plist = container_of(head, struct rhlist_head, rhead);
 508
 509                RCU_INIT_POINTER(list->next, plist);
 510                head = rht_dereference_bucket(head->next, tbl, hash);
 511                RCU_INIT_POINTER(list->rhead.next, head);
 512                rcu_assign_pointer(*pprev, obj);
 513
 514                return NULL;
 515        }
 516
 517        if (elasticity <= 0)
 518                return ERR_PTR(-EAGAIN);
 519
 520        return ERR_PTR(-ENOENT);
 521}
 522
 523static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 524                                                  struct bucket_table *tbl,
 525                                                  unsigned int hash,
 526                                                  struct rhash_head *obj,
 527                                                  void *data)
 528{
 529        struct rhash_head __rcu **pprev;
 530        struct bucket_table *new_tbl;
 531        struct rhash_head *head;
 532
 533        if (!IS_ERR_OR_NULL(data))
 534                return ERR_PTR(-EEXIST);
 535
 536        if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
 537                return ERR_CAST(data);
 538
 539        new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 540        if (new_tbl)
 541                return new_tbl;
 542
 543        if (PTR_ERR(data) != -ENOENT)
 544                return ERR_CAST(data);
 545
 546        if (unlikely(rht_grow_above_max(ht, tbl)))
 547                return ERR_PTR(-E2BIG);
 548
 549        if (unlikely(rht_grow_above_100(ht, tbl)))
 550                return ERR_PTR(-EAGAIN);
 551
 552        pprev = rht_bucket_insert(ht, tbl, hash);
 553        if (!pprev)
 554                return ERR_PTR(-ENOMEM);
 555
 556        head = rht_dereference_bucket(*pprev, tbl, hash);
 557
 558        RCU_INIT_POINTER(obj->next, head);
 559        if (ht->rhlist) {
 560                struct rhlist_head *list;
 561
 562                list = container_of(obj, struct rhlist_head, rhead);
 563                RCU_INIT_POINTER(list->next, NULL);
 564        }
 565
 566        rcu_assign_pointer(*pprev, obj);
 567
 568        atomic_inc(&ht->nelems);
 569        if (rht_grow_above_75(ht, tbl))
 570                schedule_work(&ht->run_work);
 571
 572        return NULL;
 573}
 574
 575static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 576                                   struct rhash_head *obj)
 577{
 578        struct bucket_table *new_tbl;
 579        struct bucket_table *tbl;
 580        unsigned int hash;
 581        spinlock_t *lock;
 582        void *data;
 583
 584        tbl = rcu_dereference(ht->tbl);
 585
 586        /* All insertions must grab the oldest table containing
 587         * the hashed bucket that is yet to be rehashed.
 588         */
 589        for (;;) {
 590                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
 591                lock = rht_bucket_lock(tbl, hash);
 592                spin_lock_bh(lock);
 593
 594                if (tbl->rehash <= hash)
 595                        break;
 596
 597                spin_unlock_bh(lock);
 598                tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 599        }
 600
 601        data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 602        new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
 603        if (PTR_ERR(new_tbl) != -EEXIST)
 604                data = ERR_CAST(new_tbl);
 605
 606        while (!IS_ERR_OR_NULL(new_tbl)) {
 607                tbl = new_tbl;
 608                hash = rht_head_hashfn(ht, tbl, obj, ht->p);
 609                spin_lock_nested(rht_bucket_lock(tbl, hash),
 610                                 SINGLE_DEPTH_NESTING);
 611
 612                data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 613                new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
 614                if (PTR_ERR(new_tbl) != -EEXIST)
 615                        data = ERR_CAST(new_tbl);
 616
 617                spin_unlock(rht_bucket_lock(tbl, hash));
 618        }
 619
 620        spin_unlock_bh(lock);
 621
 622        if (PTR_ERR(data) == -EAGAIN)
 623                data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
 624                               -EAGAIN);
 625
 626        return data;
 627}
 628
 629void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 630                             struct rhash_head *obj)
 631{
 632        void *data;
 633
 634        do {
 635                rcu_read_lock();
 636                data = rhashtable_try_insert(ht, key, obj);
 637                rcu_read_unlock();
 638        } while (PTR_ERR(data) == -EAGAIN);
 639
 640        return data;
 641}
 642EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
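
/* A minimal usage sketch (not part of this file): callers normally go through
 * the inline fast path helpers in <linux/rhashtable.h>, which fall back to
 * rhashtable_insert_slow() while a resize is in progress. The object and
 * parameter names below are illustrative assumptions.
 *
 *	struct test_obj {
 *		int			key;
 *		struct rhash_head	node;
 *	};
 *
 *	static const struct rhashtable_params test_params = {
 *		.head_offset	= offsetof(struct test_obj, node),
 *		.key_offset	= offsetof(struct test_obj, key),
 *		.key_len	= sizeof(int),
 *		.automatic_shrinking = true,
 *	};
 *
 *	static int add_obj(struct rhashtable *ht, struct test_obj *obj)
 *	{
 *		// Rejects duplicate keys with -EEXIST.
 *		return rhashtable_lookup_insert_fast(ht, &obj->node, test_params);
 *	}
 *
 *	static struct test_obj *find_obj(struct rhashtable *ht, int key)
 *	{
 *		// Takes the RCU read lock internally; the returned object is
 *		// only guaranteed to stay around while the caller itself holds
 *		// an RCU read lock.
 *		return rhashtable_lookup_fast(ht, &key, test_params);
 *	}
 */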
 643
 644/**
 645 * rhashtable_walk_enter - Initialise an iterator
 646 * @ht:         Table to walk over
  647 * @iter:       Hash table iterator
 648 *
 649 * This function prepares a hash table walk.
 650 *
 651 * Note that if you restart a walk after rhashtable_walk_stop you
 652 * may see the same object twice.  Also, you may miss objects if
 653 * there are removals in between rhashtable_walk_stop and the next
 654 * call to rhashtable_walk_start.
 655 *
 656 * For a completely stable walk you should construct your own data
 657 * structure outside the hash table.
 658 *
 659 * This function may be called from any process context, including
 660 * non-preemptable context, but cannot be called from softirq or
 661 * hardirq context.
 662 *
 663 * You must call rhashtable_walk_exit after this function returns.
 664 */
 665void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
 666{
 667        iter->ht = ht;
 668        iter->p = NULL;
 669        iter->slot = 0;
 670        iter->skip = 0;
 671        iter->end_of_table = 0;
 672
 673        spin_lock(&ht->lock);
 674        iter->walker.tbl =
 675                rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
 676        list_add(&iter->walker.list, &iter->walker.tbl->walkers);
 677        spin_unlock(&ht->lock);
 678}
 679EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
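
/* A sketch of the canonical walk loop these iterator functions are meant for;
 * handle_obj() and the object type are illustrative assumptions.
 *
 *	struct rhashtable_iter iter;
 *	struct test_obj *obj;
 *
 *	rhashtable_walk_enter(ht, &iter);
 *	rhashtable_walk_start(&iter);
 *
 *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
 *		if (IS_ERR(obj)) {
 *			if (PTR_ERR(obj) == -EAGAIN)
 *				continue;	// resize: iterator rewound
 *			break;
 *		}
 *		handle_obj(obj);	// must not sleep: RCU read side held
 *	}
 *
 *	rhashtable_walk_stop(&iter);
 *	rhashtable_walk_exit(&iter);
 */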
 680
 681/**
 682 * rhashtable_walk_exit - Free an iterator
  683 * @iter:       Hash table iterator
 684 *
  685 * This function frees resources allocated by rhashtable_walk_enter.
 686 */
 687void rhashtable_walk_exit(struct rhashtable_iter *iter)
 688{
 689        spin_lock(&iter->ht->lock);
 690        if (iter->walker.tbl)
 691                list_del(&iter->walker.list);
 692        spin_unlock(&iter->ht->lock);
 693}
 694EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 695
 696/**
 697 * rhashtable_walk_start_check - Start a hash table walk
 698 * @iter:       Hash table iterator
 699 *
 700 * Start a hash table walk at the current iterator position.  Note that we take
 701 * the RCU lock in all cases including when we return an error.  So you must
 702 * always call rhashtable_walk_stop to clean up.
 703 *
 704 * Returns zero if successful.
 705 *
  706 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
 707 * will rewind back to the beginning and you may use it immediately
 708 * by calling rhashtable_walk_next.
 709 *
 710 * rhashtable_walk_start is defined as an inline variant that returns
 711 * void. This is preferred in cases where the caller would ignore
 712 * resize events and always continue.
 713 */
 714int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 715        __acquires(RCU)
 716{
 717        struct rhashtable *ht = iter->ht;
 718        bool rhlist = ht->rhlist;
 719
 720        rcu_read_lock();
 721
 722        spin_lock(&ht->lock);
 723        if (iter->walker.tbl)
 724                list_del(&iter->walker.list);
 725        spin_unlock(&ht->lock);
 726
 727        if (iter->end_of_table)
 728                return 0;
 729        if (!iter->walker.tbl) {
 730                iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
 731                iter->slot = 0;
 732                iter->skip = 0;
 733                return -EAGAIN;
 734        }
 735
 736        if (iter->p && !rhlist) {
 737                /*
 738                 * We need to validate that 'p' is still in the table, and
 739                 * if so, update 'skip'
 740                 */
 741                struct rhash_head *p;
 742                int skip = 0;
 743                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
 744                        skip++;
 745                        if (p == iter->p) {
 746                                iter->skip = skip;
 747                                goto found;
 748                        }
 749                }
 750                iter->p = NULL;
 751        } else if (iter->p && rhlist) {
 752                /* Need to validate that 'list' is still in the table, and
 753                 * if so, update 'skip' and 'p'.
 754                 */
 755                struct rhash_head *p;
 756                struct rhlist_head *list;
 757                int skip = 0;
 758                rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
 759                        for (list = container_of(p, struct rhlist_head, rhead);
 760                             list;
 761                             list = rcu_dereference(list->next)) {
 762                                skip++;
 763                                if (list == iter->list) {
 764                                        iter->p = p;
 765                                        iter->skip = skip;
 766                                        goto found;
 767                                }
 768                        }
 769                }
 770                iter->p = NULL;
 771        }
 772found:
 773        return 0;
 774}
 775EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
 776
 777/**
 778 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 779 * one in case of a new walk).
 780 *
 781 * @iter:       Hash table iterator
 782 *
 783 * Returns the found object or NULL when the end of the table is reached.
 784 *
  785 * Returns -EAGAIN if a resize event occurred.
 786 */
 787static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
 788{
 789        struct bucket_table *tbl = iter->walker.tbl;
 790        struct rhlist_head *list = iter->list;
 791        struct rhashtable *ht = iter->ht;
 792        struct rhash_head *p = iter->p;
 793        bool rhlist = ht->rhlist;
 794
 795        if (!tbl)
 796                return NULL;
 797
 798        for (; iter->slot < tbl->size; iter->slot++) {
 799                int skip = iter->skip;
 800
 801                rht_for_each_rcu(p, tbl, iter->slot) {
 802                        if (rhlist) {
 803                                list = container_of(p, struct rhlist_head,
 804                                                    rhead);
 805                                do {
 806                                        if (!skip)
 807                                                goto next;
 808                                        skip--;
 809                                        list = rcu_dereference(list->next);
 810                                } while (list);
 811
 812                                continue;
 813                        }
 814                        if (!skip)
 815                                break;
 816                        skip--;
 817                }
 818
 819next:
 820                if (!rht_is_a_nulls(p)) {
 821                        iter->skip++;
 822                        iter->p = p;
 823                        iter->list = list;
 824                        return rht_obj(ht, rhlist ? &list->rhead : p);
 825                }
 826
 827                iter->skip = 0;
 828        }
 829
 830        iter->p = NULL;
 831
 832        /* Ensure we see any new tables. */
 833        smp_rmb();
 834
 835        iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
 836        if (iter->walker.tbl) {
 837                iter->slot = 0;
 838                iter->skip = 0;
 839                return ERR_PTR(-EAGAIN);
 840        } else {
 841                iter->end_of_table = true;
 842        }
 843
 844        return NULL;
 845}
 846
 847/**
 848 * rhashtable_walk_next - Return the next object and advance the iterator
 849 * @iter:       Hash table iterator
 850 *
 851 * Note that you must call rhashtable_walk_stop when you are finished
 852 * with the walk.
 853 *
 854 * Returns the next object or NULL when the end of the table is reached.
 855 *
  856 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
 857 * will rewind back to the beginning and you may continue to use it.
 858 */
 859void *rhashtable_walk_next(struct rhashtable_iter *iter)
 860{
 861        struct rhlist_head *list = iter->list;
 862        struct rhashtable *ht = iter->ht;
 863        struct rhash_head *p = iter->p;
 864        bool rhlist = ht->rhlist;
 865
 866        if (p) {
 867                if (!rhlist || !(list = rcu_dereference(list->next))) {
 868                        p = rcu_dereference(p->next);
 869                        list = container_of(p, struct rhlist_head, rhead);
 870                }
 871                if (!rht_is_a_nulls(p)) {
 872                        iter->skip++;
 873                        iter->p = p;
 874                        iter->list = list;
 875                        return rht_obj(ht, rhlist ? &list->rhead : p);
 876                }
 877
 878                /* At the end of this slot, switch to next one and then find
  879                  * At the end of this slot, switch to the next one and then find
 880                 */
 881                iter->skip = 0;
 882                iter->slot++;
 883        }
 884
 885        return __rhashtable_walk_find_next(iter);
 886}
 887EXPORT_SYMBOL_GPL(rhashtable_walk_next);
 888
 889/**
 890 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 891 * @iter:       Hash table iterator
 892 *
 893 * Returns the next object or NULL when the end of the table is reached.
 894 *
  895 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
 896 * will rewind back to the beginning and you may continue to use it.
 897 */
 898void *rhashtable_walk_peek(struct rhashtable_iter *iter)
 899{
 900        struct rhlist_head *list = iter->list;
 901        struct rhashtable *ht = iter->ht;
 902        struct rhash_head *p = iter->p;
 903
 904        if (p)
 905                return rht_obj(ht, ht->rhlist ? &list->rhead : p);
 906
 907        /* No object found in current iter, find next one in the table. */
 908
 909        if (iter->skip) {
 910                /* A nonzero skip value points to the next entry in the table
  911                  * beyond the last one that was found. Decrement skip so
 912                 * we find the current value. __rhashtable_walk_find_next
 913                 * will restore the original value of skip assuming that
 914                 * the table hasn't changed.
 915                 */
 916                iter->skip--;
 917        }
 918
 919        return __rhashtable_walk_find_next(iter);
 920}
 921EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
 922
 923/**
 924 * rhashtable_walk_stop - Finish a hash table walk
 925 * @iter:       Hash table iterator
 926 *
 927 * Finish a hash table walk.  Does not reset the iterator to the start of the
 928 * hash table.
 929 */
 930void rhashtable_walk_stop(struct rhashtable_iter *iter)
 931        __releases(RCU)
 932{
 933        struct rhashtable *ht;
 934        struct bucket_table *tbl = iter->walker.tbl;
 935
 936        if (!tbl)
 937                goto out;
 938
 939        ht = iter->ht;
 940
 941        spin_lock(&ht->lock);
 942        if (tbl->rehash < tbl->size)
 943                list_add(&iter->walker.list, &tbl->walkers);
 944        else
 945                iter->walker.tbl = NULL;
 946        spin_unlock(&ht->lock);
 947
 948out:
 949        rcu_read_unlock();
 950}
 951EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
 952
 953static size_t rounded_hashtable_size(const struct rhashtable_params *params)
 954{
 955        size_t retsize;
 956
 957        if (params->nelem_hint)
 958                retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
 959                              (unsigned long)params->min_size);
 960        else
 961                retsize = max(HASH_DEFAULT_SIZE,
 962                              (unsigned long)params->min_size);
 963
 964        return retsize;
 965}
 966
 967static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
 968{
 969        return jhash2(key, length, seed);
 970}
 971
 972/**
 973 * rhashtable_init - initialize a new hash table
 974 * @ht:         hash table to be initialized
 975 * @params:     configuration parameters
 976 *
 977 * Initializes a new hash table based on the provided configuration
 978 * parameters. A table can be configured either with a variable or
 979 * fixed length key:
 980 *
 981 * Configuration Example 1: Fixed length keys
 982 * struct test_obj {
 983 *      int                     key;
 984 *      void *                  my_member;
 985 *      struct rhash_head       node;
 986 * };
 987 *
 988 * struct rhashtable_params params = {
 989 *      .head_offset = offsetof(struct test_obj, node),
 990 *      .key_offset = offsetof(struct test_obj, key),
 991 *      .key_len = sizeof(int),
 992 *      .hashfn = jhash,
 993 * };
 994 *
 995 * Configuration Example 2: Variable length keys
 996 * struct test_obj {
 997 *      [...]
 998 *      struct rhash_head       node;
 999 * };
1000 *
1001 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
1002 * {
1003 *      struct test_obj *obj = data;
1004 *
1005 *      return [... hash ...];
1006 * }
1007 *
1008 * struct rhashtable_params params = {
1009 *      .head_offset = offsetof(struct test_obj, node),
1010 *      .hashfn = jhash,
1011 *      .obj_hashfn = my_hash_fn,
1012 * };
1013 */
1014int rhashtable_init(struct rhashtable *ht,
1015                    const struct rhashtable_params *params)
1016{
1017        struct bucket_table *tbl;
1018        size_t size;
1019
1020        if ((!params->key_len && !params->obj_hashfn) ||
1021            (params->obj_hashfn && !params->obj_cmpfn))
1022                return -EINVAL;
1023
1024        memset(ht, 0, sizeof(*ht));
1025        mutex_init(&ht->mutex);
1026        spin_lock_init(&ht->lock);
1027        memcpy(&ht->p, params, sizeof(*params));
1028
1029        if (params->min_size)
1030                ht->p.min_size = roundup_pow_of_two(params->min_size);
1031
1032        /* Cap total entries at 2^31 to avoid nelems overflow. */
1033        ht->max_elems = 1u << 31;
1034
1035        if (params->max_size) {
1036                ht->p.max_size = rounddown_pow_of_two(params->max_size);
1037                if (ht->p.max_size < ht->max_elems / 2)
1038                        ht->max_elems = ht->p.max_size * 2;
1039        }
1040
1041        ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1042
1043        size = rounded_hashtable_size(&ht->p);
1044
1045        if (params->locks_mul)
1046                ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
1047        else
1048                ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
1049
1050        ht->key_len = ht->p.key_len;
1051        if (!params->hashfn) {
1052                ht->p.hashfn = jhash;
1053
1054                if (!(ht->key_len & (sizeof(u32) - 1))) {
1055                        ht->key_len /= sizeof(u32);
1056                        ht->p.hashfn = rhashtable_jhash2;
1057                }
1058        }
1059
1060        /*
1061         * This is api initialization and thus we need to guarantee the
1062         * initial rhashtable allocation. Upon failure, retry with the
1063         * smallest possible size with __GFP_NOFAIL semantics.
1064         */
1065        tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
1066        if (unlikely(tbl == NULL)) {
1067                size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
1068                tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
1069        }
1070
1071        atomic_set(&ht->nelems, 0);
1072
1073        RCU_INIT_POINTER(ht->tbl, tbl);
1074
1075        INIT_WORK(&ht->run_work, rht_deferred_worker);
1076
1077        return 0;
1078}
1079EXPORT_SYMBOL_GPL(rhashtable_init);
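
/* A minimal lifecycle sketch using the fixed length key parameters from
 * Configuration Example 1 above; the function and table names are
 * illustrative, and error handling is reduced to the essentials.
 *
 *	static struct rhashtable my_objects;
 *
 *	static int __init example_init(void)
 *	{
 *		return rhashtable_init(&my_objects, &params);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		// Use rhashtable_free_and_destroy() instead if elements are
 *		// still present and need to be freed.
 *		rhashtable_destroy(&my_objects);
 *	}
 */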
1080
1081/**
1082 * rhltable_init - initialize a new hash list table
1083 * @hlt:        hash list table to be initialized
1084 * @params:     configuration parameters
1085 *
1086 * Initializes a new hash list table.
1087 *
1088 * See documentation for rhashtable_init.
1089 */
1090int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
1091{
1092        int err;
1093
1094        err = rhashtable_init(&hlt->ht, params);
1095        hlt->ht.rhlist = true;
1096        return err;
1097}
1098EXPORT_SYMBOL_GPL(rhltable_init);
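
/* A brief sketch of how a hash list table differs in use: objects embed a
 * struct rhlist_head, duplicate keys are allowed, and a lookup returns a
 * chain that is walked with rhl_for_each_entry_rcu(). The names below are
 * illustrative assumptions.
 *
 *	struct test_lobj {
 *		int			key;
 *		struct rhlist_head	node;
 *	};
 *
 *	// insertion (params set up as for rhashtable_init):
 *	err = rhltable_insert(&hlt, &obj->node, params);
 *
 *	// look up all objects sharing a key:
 *	struct rhlist_head *list, *pos;
 *	struct test_lobj *lobj;
 *
 *	rcu_read_lock();
 *	list = rhltable_lookup(&hlt, &key, params);
 *	rhl_for_each_entry_rcu(lobj, pos, list, node)
 *		handle_obj(lobj);	// hypothetical per-object handler
 *	rcu_read_unlock();
 */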
1099
1100static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
1101                                void (*free_fn)(void *ptr, void *arg),
1102                                void *arg)
1103{
1104        struct rhlist_head *list;
1105
1106        if (!ht->rhlist) {
1107                free_fn(rht_obj(ht, obj), arg);
1108                return;
1109        }
1110
1111        list = container_of(obj, struct rhlist_head, rhead);
1112        do {
1113                obj = &list->rhead;
1114                list = rht_dereference(list->next, ht);
1115                free_fn(rht_obj(ht, obj), arg);
1116        } while (list);
1117}
1118
1119/**
1120 * rhashtable_free_and_destroy - free elements and destroy hash table
1121 * @ht:         the hash table to destroy
1122 * @free_fn:    callback to release resources of element
1123 * @arg:        pointer passed to free_fn
1124 *
 1125 * Stops any pending async resize. If defined, invokes free_fn for each
 1126 * element to release its resources. Please note that RCU protected
 1127 * readers may still be accessing the elements; resources must therefore
 1128 * be released in an RCU compatible manner. Then frees the bucket array.
1129 *
 1130 * This function may sleep while waiting for an async resize
 1131 * to complete. The caller must ensure that no further write operations
 1132 * occur in parallel.
1133 */
1134void rhashtable_free_and_destroy(struct rhashtable *ht,
1135                                 void (*free_fn)(void *ptr, void *arg),
1136                                 void *arg)
1137{
1138        struct bucket_table *tbl, *next_tbl;
1139        unsigned int i;
1140
1141        cancel_work_sync(&ht->run_work);
1142
1143        mutex_lock(&ht->mutex);
1144        tbl = rht_dereference(ht->tbl, ht);
1145restart:
1146        if (free_fn) {
1147                for (i = 0; i < tbl->size; i++) {
1148                        struct rhash_head *pos, *next;
1149
1150                        cond_resched();
1151                        for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
1152                             next = !rht_is_a_nulls(pos) ?
1153                                        rht_dereference(pos->next, ht) : NULL;
1154                             !rht_is_a_nulls(pos);
1155                             pos = next,
1156                             next = !rht_is_a_nulls(pos) ?
1157                                        rht_dereference(pos->next, ht) : NULL)
1158                                rhashtable_free_one(ht, pos, free_fn, arg);
1159                }
1160        }
1161
1162        next_tbl = rht_dereference(tbl->future_tbl, ht);
1163        bucket_table_free(tbl);
1164        if (next_tbl) {
1165                tbl = next_tbl;
1166                goto restart;
1167        }
1168        mutex_unlock(&ht->mutex);
1169}
1170EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
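
/* A short teardown sketch for a table whose elements were individually
 * kmalloc'ed; free_test_obj() is an illustrative callback and ht is the
 * table initialised earlier.
 *
 *	static void free_test_obj(void *ptr, void *arg)
 *	{
 *		// ptr is the element itself (as returned by rht_obj()),
 *		// not its embedded rhash_head.
 *		kfree(ptr);
 *	}
 *
 *	rhashtable_free_and_destroy(&ht, free_test_obj, NULL);
 */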
1171
1172void rhashtable_destroy(struct rhashtable *ht)
1173{
1174        return rhashtable_free_and_destroy(ht, NULL, NULL);
1175}
1176EXPORT_SYMBOL_GPL(rhashtable_destroy);
1177
1178struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
1179                                            unsigned int hash)
1180{
1181        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1182        static struct rhash_head __rcu *rhnull =
1183                (struct rhash_head __rcu *)NULLS_MARKER(0);
1184        unsigned int index = hash & ((1 << tbl->nest) - 1);
1185        unsigned int size = tbl->size >> tbl->nest;
1186        unsigned int subhash = hash;
1187        union nested_table *ntbl;
1188
1189        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
1190        ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
1191        subhash >>= tbl->nest;
1192
1193        while (ntbl && size > (1 << shift)) {
1194                index = subhash & ((1 << shift) - 1);
1195                ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
1196                                                  tbl, hash);
1197                size >>= shift;
1198                subhash >>= shift;
1199        }
1200
1201        if (!ntbl)
1202                return &rhnull;
1203
1204        return &ntbl[subhash].bucket;
1205
1206}
1207EXPORT_SYMBOL_GPL(rht_bucket_nested);
1208
1209struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
1210                                                   struct bucket_table *tbl,
1211                                                   unsigned int hash)
1212{
1213        const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
1214        unsigned int index = hash & ((1 << tbl->nest) - 1);
1215        unsigned int size = tbl->size >> tbl->nest;
1216        union nested_table *ntbl;
1217
1218        ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
1219        hash >>= tbl->nest;
1220        ntbl = nested_table_alloc(ht, &ntbl[index].table,
1221                                  size <= (1 << shift));
1222
1223        while (ntbl && size > (1 << shift)) {
1224                index = hash & ((1 << shift) - 1);
1225                size >>= shift;
1226                hash >>= shift;
1227                ntbl = nested_table_alloc(ht, &ntbl[index].table,
1228                                          size <= (1 << shift));
1229        }
1230
1231        if (!ntbl)
1232                return NULL;
1233
1234        return &ntbl[hash].bucket;
1235
1236}
1237EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);
1238