linux/lib/rhashtable.c
   1/*
   2 * Resizable, Scalable, Concurrent Hash Table
   3 *
   4 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
   5 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
   6 *
   7 * Based on the following paper:
   8 * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf
   9 *
  10 * Code partially derived from nft_hash
  11 *
  12 * This program is free software; you can redistribute it and/or modify
  13 * it under the terms of the GNU General Public License version 2 as
  14 * published by the Free Software Foundation.
  15 */
  16
  17#include <linux/kernel.h>
  18#include <linux/init.h>
  19#include <linux/log2.h>
  20#include <linux/sched.h>
  21#include <linux/slab.h>
  22#include <linux/vmalloc.h>
  23#include <linux/mm.h>
  24#include <linux/jhash.h>
  25#include <linux/random.h>
  26#include <linux/rhashtable.h>
  27#include <linux/err.h>
  28
  29#define HASH_DEFAULT_SIZE       64UL
  30#define HASH_MIN_SIZE           4UL
  31#define BUCKET_LOCKS_PER_CPU   128UL
  32
  33/* Base bits plus 1 bit for nulls marker */
  34#define HASH_RESERVED_SPACE     (RHT_BASE_BITS + 1)
  35
  36enum {
  37        RHT_LOCK_NORMAL,
  38        RHT_LOCK_NESTED,
  39};
  40
  41/* The bucket lock is selected based on the hash and protects mutations
  42 * on a group of hash buckets.
  43 *
  44 * A maximum of tbl->size/2 bucket locks is allocated. This ensures that
   45 * a single lock always covers both buckets which may both contain
   46 * entries which link to the same bucket of the old table during resizing.
   47 * This simplifies the locking, as holding the bucket lock in both
   48 * tables during a resize always guarantees protection.
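      *
      * For example (illustrative numbers only, not from the original text):
      * when expanding from 4 to 8 buckets, the entries of old bucket 2 end
      * up split across new buckets 2 and 6. With at most 8/2 = 4 bucket
      * locks, (2 & locks_mask) equals (6 & locks_mask), so a single lock in
      * the new table covers both halves of the old chain while it is being
      * unzipped.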
  49 *
  50 * IMPORTANT: When holding the bucket lock of both the old and new table
  51 * during expansions and shrinking, the old bucket lock must always be
  52 * acquired first.
  53 */
  54static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash)
  55{
  56        return &tbl->locks[hash & tbl->locks_mask];
  57}
  58
  59static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he)
  60{
  61        return (void *) he - ht->p.head_offset;
  62}
  63
  64static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
  65{
  66        return hash & (tbl->size - 1);
  67}
  68
  69static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
  70{
  71        u32 hash;
  72
  73        if (unlikely(!ht->p.key_len))
  74                hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
  75        else
  76                hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
  77                                    ht->p.hash_rnd);
  78
  79        return hash >> HASH_RESERVED_SPACE;
  80}
  81
  82static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
  83{
  84        return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
  85}
  86
  87static u32 head_hashfn(const struct rhashtable *ht,
  88                       const struct bucket_table *tbl,
  89                       const struct rhash_head *he)
  90{
  91        return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he)));
  92}
  93
  94#ifdef CONFIG_PROVE_LOCKING
  95static void debug_dump_buckets(const struct rhashtable *ht,
  96                               const struct bucket_table *tbl)
  97{
  98        struct rhash_head *he;
  99        unsigned int i, hash;
 100
 101        for (i = 0; i < tbl->size; i++) {
 102                pr_warn(" [Bucket %d] ", i);
 103                rht_for_each_rcu(he, tbl, i) {
 104                        hash = head_hashfn(ht, tbl, he);
 105                        pr_cont("[hash = %#x, lock = %p] ",
 106                                hash, bucket_lock(tbl, hash));
 107                }
 108                pr_cont("\n");
 109        }
 110
 111}
 112
 113static void debug_dump_table(struct rhashtable *ht,
 114                             const struct bucket_table *tbl,
 115                             unsigned int hash)
 116{
 117        struct bucket_table *old_tbl, *future_tbl;
 118
 119        pr_emerg("BUG: lock for hash %#x in table %p not held\n",
 120                 hash, tbl);
 121
 122        rcu_read_lock();
 123        future_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 124        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 125        if (future_tbl != old_tbl) {
 126                pr_warn("Future table %p (size: %zd)\n",
 127                        future_tbl, future_tbl->size);
 128                debug_dump_buckets(ht, future_tbl);
 129        }
 130
 131        pr_warn("Table %p (size: %zd)\n", old_tbl, old_tbl->size);
 132        debug_dump_buckets(ht, old_tbl);
 133
 134        rcu_read_unlock();
 135}
 136
 137#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))
 138#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)                               \
 139        do {                                                            \
 140                if (unlikely(!lockdep_rht_bucket_is_held(TBL, HASH))) { \
 141                        debug_dump_table(HT, TBL, HASH);                \
 142                        BUG();                                          \
 143                }                                                       \
 144        } while (0)
 145
 146int lockdep_rht_mutex_is_held(struct rhashtable *ht)
 147{
 148        return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
 149}
 150EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);
 151
 152int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
 153{
 154        spinlock_t *lock = bucket_lock(tbl, hash);
 155
 156        return (debug_locks) ? lockdep_is_held(lock) : 1;
 157}
 158EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
 159#else
 160#define ASSERT_RHT_MUTEX(HT)
 161#define ASSERT_BUCKET_LOCK(HT, TBL, HASH)
 162#endif
 163
 164
 165static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n)
 166{
 167        struct rhash_head __rcu **pprev;
 168
 169        for (pprev = &tbl->buckets[n];
 170             !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n));
 171             pprev = &rht_dereference_bucket(*pprev, tbl, n)->next)
 172                ;
 173
 174        return pprev;
 175}
 176
 177static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl)
 178{
 179        unsigned int i, size;
 180#if defined(CONFIG_PROVE_LOCKING)
 181        unsigned int nr_pcpus = 2;
 182#else
 183        unsigned int nr_pcpus = num_possible_cpus();
 184#endif
 185
 186        nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
 187        size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);
 188
 189        /* Never allocate more than 0.5 locks per bucket */
 190        size = min_t(unsigned int, size, tbl->size >> 1);
 191
 192        if (sizeof(spinlock_t) != 0) {
 193#ifdef CONFIG_NUMA
 194                if (size * sizeof(spinlock_t) > PAGE_SIZE)
 195                        tbl->locks = vmalloc(size * sizeof(spinlock_t));
 196                else
 197#endif
 198                tbl->locks = kmalloc_array(size, sizeof(spinlock_t),
 199                                           GFP_KERNEL);
 200                if (!tbl->locks)
 201                        return -ENOMEM;
 202                for (i = 0; i < size; i++)
 203                        spin_lock_init(&tbl->locks[i]);
 204        }
 205        tbl->locks_mask = size - 1;
 206
 207        return 0;
 208}
 209
 210static void bucket_table_free(const struct bucket_table *tbl)
 211{
 212        if (tbl)
 213                kvfree(tbl->locks);
 214
 215        kvfree(tbl);
 216}
 217
 218static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 219                                               size_t nbuckets)
 220{
 221        struct bucket_table *tbl = NULL;
 222        size_t size;
 223        int i;
 224
 225        size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
 226        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
 227                tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
 228        if (tbl == NULL)
 229                tbl = vzalloc(size);
 230        if (tbl == NULL)
 231                return NULL;
 232
 233        tbl->size = nbuckets;
 234
 235        if (alloc_bucket_locks(ht, tbl) < 0) {
 236                bucket_table_free(tbl);
 237                return NULL;
 238        }
 239
 240        for (i = 0; i < nbuckets; i++)
 241                INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);
 242
 243        return tbl;
 244}
 245
 246/**
 247 * rht_grow_above_75 - returns true if nelems > 0.75 * table-size
 248 * @ht:         hash table
 249 * @new_size:   new table size
 250 */
 251static bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size)
 252{
 253        /* Expand table when exceeding 75% load */
 254        return atomic_read(&ht->nelems) > (new_size / 4 * 3) &&
 255               (!ht->p.max_shift || atomic_read(&ht->shift) < ht->p.max_shift);
 256}
 257
 258/**
 259 * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size
 260 * @ht:         hash table
 261 * @new_size:   new table size
 262 */
 263static bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size)
 264{
 265        /* Shrink table beneath 30% load */
 266        return atomic_read(&ht->nelems) < (new_size * 3 / 10) &&
 267               (atomic_read(&ht->shift) > ht->p.min_shift);
 268}
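
     /* Worked example (illustrative numbers only): with a 64 bucket table the
      * deferred worker expands once nelems exceeds 64 / 4 * 3 = 48 (75%) and
      * shrinks once nelems drops below 64 * 3 / 10 = 19 (30%), subject to the
      * max_shift and min_shift limits checked above.
      */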
 269
 270static void lock_buckets(struct bucket_table *new_tbl,
 271                         struct bucket_table *old_tbl, unsigned int hash)
 272        __acquires(old_bucket_lock)
 273{
 274        spin_lock_bh(bucket_lock(old_tbl, hash));
 275        if (new_tbl != old_tbl)
 276                spin_lock_bh_nested(bucket_lock(new_tbl, hash),
 277                                    RHT_LOCK_NESTED);
 278}
 279
 280static void unlock_buckets(struct bucket_table *new_tbl,
 281                           struct bucket_table *old_tbl, unsigned int hash)
 282        __releases(old_bucket_lock)
 283{
 284        if (new_tbl != old_tbl)
 285                spin_unlock_bh(bucket_lock(new_tbl, hash));
 286        spin_unlock_bh(bucket_lock(old_tbl, hash));
 287}
 288
 289/**
  290 * Unlink entries in the old bucket which hash to a different new bucket.
  291 *
  292 * Returns true if the old bucket still holds entries and another pass is needed.
 293 */
 294static bool hashtable_chain_unzip(struct rhashtable *ht,
 295                                  const struct bucket_table *new_tbl,
 296                                  struct bucket_table *old_tbl,
 297                                  size_t old_hash)
 298{
 299        struct rhash_head *he, *p, *next;
 300        unsigned int new_hash, new_hash2;
 301
 302        ASSERT_BUCKET_LOCK(ht, old_tbl, old_hash);
 303
 304        /* Old bucket empty, no work needed. */
 305        p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 306                                   old_hash);
 307        if (rht_is_a_nulls(p))
 308                return false;
 309
 310        new_hash = head_hashfn(ht, new_tbl, p);
 311        ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
 312
  313        /* Advance p along the old bucket chain while the next node
  314         * still hashes to the same new bucket as the chain head, so
  315         * that p ends up at the last node of that initial run.
 316         */
 317        rht_for_each_continue(he, p->next, old_tbl, old_hash) {
 318                new_hash2 = head_hashfn(ht, new_tbl, he);
 319                ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash2);
 320
 321                if (new_hash != new_hash2)
 322                        break;
 323                p = he;
 324        }
 325        rcu_assign_pointer(old_tbl->buckets[old_hash], p->next);
 326
  327        /* Find the subsequent node which does hash to the same new
  328         * bucket as p, or the nulls marker if no such node exists.
 329         */
 330        INIT_RHT_NULLS_HEAD(next, ht, old_hash);
 331        if (!rht_is_a_nulls(he)) {
 332                rht_for_each_continue(he, he->next, old_tbl, old_hash) {
 333                        if (head_hashfn(ht, new_tbl, he) == new_hash) {
 334                                next = he;
 335                                break;
 336                        }
 337                }
 338        }
 339
 340        /* Set p's next pointer to that subsequent node pointer,
 341         * bypassing the nodes which do not hash to p's bucket
 342         */
 343        rcu_assign_pointer(p->next, next);
 344
 345        p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl,
 346                                   old_hash);
 347
 348        return !rht_is_a_nulls(p);
 349}
 350
 351static void link_old_to_new(struct rhashtable *ht, struct bucket_table *new_tbl,
 352                            unsigned int new_hash, struct rhash_head *entry)
 353{
 354        ASSERT_BUCKET_LOCK(ht, new_tbl, new_hash);
 355
 356        rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry);
 357}
 358
 359/**
 360 * rhashtable_expand - Expand hash table while allowing concurrent lookups
 361 * @ht:         the hash table to expand
 362 *
 363 * A secondary bucket array is allocated and the hash entries are migrated
 364 * while keeping them on both lists until the end of the RCU grace period.
 365 *
 366 * This function may only be called in a context where it is safe to call
 367 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
 368 *
 369 * The caller must ensure that no concurrent resizing occurs by holding
 370 * ht->mutex.
 371 *
 372 * It is valid to have concurrent insertions and deletions protected by per
 373 * bucket locks or concurrent RCU protected lookups and traversals.
 374 */
 375int rhashtable_expand(struct rhashtable *ht)
 376{
 377        struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
 378        struct rhash_head *he;
 379        unsigned int new_hash, old_hash;
 380        bool complete = false;
 381
 382        ASSERT_RHT_MUTEX(ht);
 383
 384        new_tbl = bucket_table_alloc(ht, old_tbl->size * 2);
 385        if (new_tbl == NULL)
 386                return -ENOMEM;
 387
 388        atomic_inc(&ht->shift);
 389
 390        /* Make insertions go into the new, empty table right away. Deletions
 391         * and lookups will be attempted in both tables until we synchronize.
  392         * The synchronize_rcu() guarantees that all later insertions see the
  393         * new table, so no new additions go into the old table while we relink.
 394         */
 395        rcu_assign_pointer(ht->future_tbl, new_tbl);
 396        synchronize_rcu();
 397
 398        /* For each new bucket, search the corresponding old bucket for the
 399         * first entry that hashes to the new bucket, and link the end of
 400         * newly formed bucket chain (containing entries added to future
 401         * table) to that entry. Since all the entries which will end up in
 402         * the new bucket appear in the same old bucket, this constructs an
 403         * entirely valid new hash table, but with multiple buckets
 404         * "zipped" together into a single imprecise chain.
 405         */
 406        for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 407                old_hash = rht_bucket_index(old_tbl, new_hash);
 408                lock_buckets(new_tbl, old_tbl, new_hash);
 409                rht_for_each(he, old_tbl, old_hash) {
 410                        if (head_hashfn(ht, new_tbl, he) == new_hash) {
 411                                link_old_to_new(ht, new_tbl, new_hash, he);
 412                                break;
 413                        }
 414                }
 415                unlock_buckets(new_tbl, old_tbl, new_hash);
 416                cond_resched();
 417        }
 418
 419        /* Unzip interleaved hash chains */
 420        while (!complete && !ht->being_destroyed) {
 421                /* Wait for readers. All new readers will see the new
 422                 * table, and thus no references to the old table will
 423                 * remain.
 424                 */
 425                synchronize_rcu();
 426
 427                /* For each bucket in the old table (each of which
 428                 * contains items from multiple buckets of the new
 429                 * table): ...
 430                 */
 431                complete = true;
 432                for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
 433                        lock_buckets(new_tbl, old_tbl, old_hash);
 434
 435                        if (hashtable_chain_unzip(ht, new_tbl, old_tbl,
 436                                                  old_hash))
 437                                complete = false;
 438
 439                        unlock_buckets(new_tbl, old_tbl, old_hash);
 440                        cond_resched();
 441                }
 442        }
 443
 444        rcu_assign_pointer(ht->tbl, new_tbl);
 445        synchronize_rcu();
 446
 447        bucket_table_free(old_tbl);
 448        return 0;
 449}
 450EXPORT_SYMBOL_GPL(rhashtable_expand);
 451
 452/**
 453 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 454 * @ht:         the hash table to shrink
 455 *
 456 * This function may only be called in a context where it is safe to call
 457 * synchronize_rcu(), e.g. not within a rcu_read_lock() section.
 458 *
 459 * The caller must ensure that no concurrent resizing occurs by holding
 460 * ht->mutex.
 461 *
  462 * The caller must ensure that no concurrent table resize takes place.
 463 * It is however valid to have concurrent lookups if they are RCU protected.
 464 *
 465 * It is valid to have concurrent insertions and deletions protected by per
 466 * bucket locks or concurrent RCU protected lookups and traversals.
 467 */
 468int rhashtable_shrink(struct rhashtable *ht)
 469{
 470        struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht);
 471        unsigned int new_hash;
 472
 473        ASSERT_RHT_MUTEX(ht);
 474
 475        new_tbl = bucket_table_alloc(ht, tbl->size / 2);
 476        if (new_tbl == NULL)
 477                return -ENOMEM;
 478
 479        rcu_assign_pointer(ht->future_tbl, new_tbl);
 480        synchronize_rcu();
 481
 482        /* Link the first entry in the old bucket to the end of the
 483         * bucket in the new table. As entries are concurrently being
 484         * added to the new table, lock down the new bucket. As we
 485         * always divide the size in half when shrinking, each bucket
 486         * in the new table maps to exactly two buckets in the old
 487         * table.
 488         */
 489        for (new_hash = 0; new_hash < new_tbl->size; new_hash++) {
 490                lock_buckets(new_tbl, tbl, new_hash);
 491
 492                rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 493                                   tbl->buckets[new_hash]);
 494                ASSERT_BUCKET_LOCK(ht, tbl, new_hash + new_tbl->size);
 495                rcu_assign_pointer(*bucket_tail(new_tbl, new_hash),
 496                                   tbl->buckets[new_hash + new_tbl->size]);
 497
 498                unlock_buckets(new_tbl, tbl, new_hash);
 499                cond_resched();
 500        }
 501
 502        /* Publish the new, valid hash table */
 503        rcu_assign_pointer(ht->tbl, new_tbl);
 504        atomic_dec(&ht->shift);
 505
 506        /* Wait for readers. No new readers will have references to the
 507         * old hash table.
 508         */
 509        synchronize_rcu();
 510
 511        bucket_table_free(tbl);
 512
 513        return 0;
 514}
 515EXPORT_SYMBOL_GPL(rhashtable_shrink);
 516
 517static void rht_deferred_worker(struct work_struct *work)
 518{
 519        struct rhashtable *ht;
 520        struct bucket_table *tbl;
 521        struct rhashtable_walker *walker;
 522
 523        ht = container_of(work, struct rhashtable, run_work);
 524        mutex_lock(&ht->mutex);
 525        if (ht->being_destroyed)
 526                goto unlock;
 527
 528        tbl = rht_dereference(ht->tbl, ht);
 529
 530        list_for_each_entry(walker, &ht->walkers, list)
 531                walker->resize = true;
 532
 533        if (rht_grow_above_75(ht, tbl->size))
 534                rhashtable_expand(ht);
 535        else if (rht_shrink_below_30(ht, tbl->size))
 536                rhashtable_shrink(ht);
 537unlock:
 538        mutex_unlock(&ht->mutex);
 539}
 540
 541static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj,
 542                                struct bucket_table *tbl,
 543                                const struct bucket_table *old_tbl, u32 hash)
 544{
 545        bool no_resize_running = tbl == old_tbl;
 546        struct rhash_head *head;
 547
 548        hash = rht_bucket_index(tbl, hash);
 549        head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 550
 551        ASSERT_BUCKET_LOCK(ht, tbl, hash);
 552
 553        if (rht_is_a_nulls(head))
 554                INIT_RHT_NULLS_HEAD(obj->next, ht, hash);
 555        else
 556                RCU_INIT_POINTER(obj->next, head);
 557
 558        rcu_assign_pointer(tbl->buckets[hash], obj);
 559
 560        atomic_inc(&ht->nelems);
 561        if (no_resize_running && rht_grow_above_75(ht, tbl->size))
 562                schedule_work(&ht->run_work);
 563}
 564
 565/**
 566 * rhashtable_insert - insert object into hash table
 567 * @ht:         hash table
 568 * @obj:        pointer to hash head inside object
 569 *
  570 * Will take a per bucket spinlock to protect against concurrent mutations
 571 * on the same bucket. Multiple insertions may occur in parallel unless
 572 * they map to the same bucket lock.
 573 *
 574 * It is safe to call this function from atomic context.
 575 *
  576 * Will trigger an automatic deferred table resizing if the table
  577 * utilization grows beyond the 75% watermark checked by the internal
  578 * rht_grow_above_75() helper.
 579 */
 580void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
 581{
 582        struct bucket_table *tbl, *old_tbl;
 583        unsigned hash;
 584
 585        rcu_read_lock();
 586
 587        tbl = rht_dereference_rcu(ht->future_tbl, ht);
 588        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 589        hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 590
 591        lock_buckets(tbl, old_tbl, hash);
 592        __rhashtable_insert(ht, obj, tbl, old_tbl, hash);
 593        unlock_buckets(tbl, old_tbl, hash);
 594
 595        rcu_read_unlock();
 596}
 597EXPORT_SYMBOL_GPL(rhashtable_insert);
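
     /* A minimal usage sketch (hypothetical caller; "my_ht" and "struct
      * test_obj" are assumptions, laid out as in the rhashtable_init()
      * example further below):
      *
      *	struct test_obj *obj = kmalloc(sizeof(*obj), GFP_KERNEL);
      *
      *	if (obj) {
      *		obj->key = 42;
      *		rhashtable_insert(&my_ht, &obj->node);
      *	}
      */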
 598
 599/**
 600 * rhashtable_remove - remove object from hash table
 601 * @ht:         hash table
 602 * @obj:        pointer to hash head inside object
 603 *
  604 * Since the hash chain is singly linked, the removal operation needs to
  605 * walk the bucket chain upon removal. The removal operation is thus
  606 * considerably slower if the hash table is not correctly sized.
  607 *
  608 * Will automatically schedule a deferred rhashtable_shrink() if the
  609 * table utilization drops below the 30% watermark.
 610 *
 611 * The caller must ensure that no concurrent table mutations occur. It is
 612 * however valid to have concurrent lookups if they are RCU protected.
 613 */
 614bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj)
 615{
 616        struct bucket_table *tbl, *new_tbl, *old_tbl;
 617        struct rhash_head __rcu **pprev;
 618        struct rhash_head *he, *he2;
 619        unsigned int hash, new_hash;
 620        bool ret = false;
 621
 622        rcu_read_lock();
 623        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 624        tbl = new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 625        new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 626
 627        lock_buckets(new_tbl, old_tbl, new_hash);
 628restart:
 629        hash = rht_bucket_index(tbl, new_hash);
 630        pprev = &tbl->buckets[hash];
 631        rht_for_each(he, tbl, hash) {
 632                if (he != obj) {
 633                        pprev = &he->next;
 634                        continue;
 635                }
 636
 637                ASSERT_BUCKET_LOCK(ht, tbl, hash);
 638
 639                if (old_tbl->size > new_tbl->size && tbl == old_tbl &&
 640                    !rht_is_a_nulls(obj->next) &&
 641                    head_hashfn(ht, tbl, obj->next) != hash) {
 642                        rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash));
 643                } else if (unlikely(old_tbl->size < new_tbl->size && tbl == new_tbl)) {
 644                        rht_for_each_continue(he2, obj->next, tbl, hash) {
 645                                if (head_hashfn(ht, tbl, he2) == hash) {
 646                                        rcu_assign_pointer(*pprev, he2);
 647                                        goto found;
 648                                }
 649                        }
 650
 651                        rcu_assign_pointer(*pprev, (struct rhash_head *) rht_marker(ht, hash));
 652                } else {
 653                        rcu_assign_pointer(*pprev, obj->next);
 654                }
 655
 656found:
 657                ret = true;
 658                break;
 659        }
 660
 661        /* The entry may be linked in either 'tbl', 'future_tbl', or both.
 662         * 'future_tbl' only exists for a short period of time during
 663         * resizing. Thus traversing both is fine and the added cost is
  664         * only incurred while a resize is in flight.
 665         */
 666        if (tbl != old_tbl) {
 667                tbl = old_tbl;
 668                goto restart;
 669        }
 670
 671        unlock_buckets(new_tbl, old_tbl, new_hash);
 672
 673        if (ret) {
 674                bool no_resize_running = new_tbl == old_tbl;
 675
 676                atomic_dec(&ht->nelems);
 677                if (no_resize_running && rht_shrink_below_30(ht, new_tbl->size))
 678                        schedule_work(&ht->run_work);
 679        }
 680
 681        rcu_read_unlock();
 682
 683        return ret;
 684}
 685EXPORT_SYMBOL_GPL(rhashtable_remove);
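
     /* A minimal removal sketch (hypothetical caller; assumes the object also
      * embeds a struct rcu_head named "rcu_head" so it can be freed once all
      * RCU readers are done with it):
      *
      *	if (rhashtable_remove(&my_ht, &obj->node))
      *		kfree_rcu(obj, rcu_head);
      */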
 686
 687struct rhashtable_compare_arg {
 688        struct rhashtable *ht;
 689        const void *key;
 690};
 691
 692static bool rhashtable_compare(void *ptr, void *arg)
 693{
 694        struct rhashtable_compare_arg *x = arg;
 695        struct rhashtable *ht = x->ht;
 696
 697        return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len);
 698}
 699
 700/**
 701 * rhashtable_lookup - lookup key in hash table
 702 * @ht:         hash table
 703 * @key:        pointer to key
 704 *
 705 * Computes the hash value for the key and traverses the bucket chain looking
  706 * for an entry with an identical key. The first matching entry is returned.
  707 *
  708 * This lookup function may only be used for fixed key hash tables (key_len
 709 * parameter set). It will BUG() if used inappropriately.
 710 *
 711 * Lookups may occur in parallel with hashtable mutations and resizing.
 712 */
 713void *rhashtable_lookup(struct rhashtable *ht, const void *key)
 714{
 715        struct rhashtable_compare_arg arg = {
 716                .ht = ht,
 717                .key = key,
 718        };
 719
 720        BUG_ON(!ht->p.key_len);
 721
 722        return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg);
 723}
 724EXPORT_SYMBOL_GPL(rhashtable_lookup);
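
     /* A minimal lookup sketch (hypothetical caller with the fixed length
      * integer key from Configuration Example 1 below); the returned object
      * may only be used while the RCU read lock is held:
      *
      *	int key = 42;
      *	struct test_obj *obj;
      *
      *	rcu_read_lock();
      *	obj = rhashtable_lookup(&my_ht, &key);
      *	if (obj)
      *		pr_debug("found key %d\n", obj->key);
      *	rcu_read_unlock();
      */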
 725
 726/**
 727 * rhashtable_lookup_compare - search hash table with compare function
 728 * @ht:         hash table
 729 * @key:        the pointer to the key
 730 * @compare:    compare function, must return true on match
 731 * @arg:        argument passed on to compare function
 732 *
 733 * Traverses the bucket chain behind the provided hash value and calls the
 734 * specified compare function for each entry.
 735 *
 736 * Lookups may occur in parallel with hashtable mutations and resizing.
 737 *
 738 * Returns the first entry on which the compare function returned true.
 739 */
 740void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key,
 741                                bool (*compare)(void *, void *), void *arg)
 742{
 743        const struct bucket_table *tbl, *old_tbl;
 744        struct rhash_head *he;
 745        u32 hash;
 746
 747        rcu_read_lock();
 748
 749        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 750        tbl = rht_dereference_rcu(ht->future_tbl, ht);
 751        hash = key_hashfn(ht, key, ht->p.key_len);
 752restart:
 753        rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) {
 754                if (!compare(rht_obj(ht, he), arg))
 755                        continue;
 756                rcu_read_unlock();
 757                return rht_obj(ht, he);
 758        }
 759
 760        if (unlikely(tbl != old_tbl)) {
 761                tbl = old_tbl;
 762                goto restart;
 763        }
 764        rcu_read_unlock();
 765
 766        return NULL;
 767}
 768EXPORT_SYMBOL_GPL(rhashtable_lookup_compare);
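
     /* A sketch of a caller supplied compare function (hypothetical; it simply
      * re-implements the memcmp() based matching for an int key, and the key
      * passed in must still be the configured key so it hashes correctly):
      *
      *	static bool my_cmp(void *ptr, void *arg)
      *	{
      *		const struct test_obj *obj = ptr;
      *
      *		return obj->key == *(int *)arg;
      *	}
      *
      *	obj = rhashtable_lookup_compare(&my_ht, &key, my_cmp, &key);
      */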
 769
 770/**
 771 * rhashtable_lookup_insert - lookup and insert object into hash table
 772 * @ht:         hash table
 773 * @obj:        pointer to hash head inside object
 774 *
 775 * Locks down the bucket chain in both the old and new table if a resize
 776 * is in progress to ensure that writers can't remove from the old table
 777 * and can't insert to the new table during the atomic operation of search
 778 * and insertion. Searches for duplicates in both the old and new table if
 779 * a resize is in progress.
 780 *
  781 * This lookup function may only be used for fixed key hash tables (key_len
 782 * parameter set). It will BUG() if used inappropriately.
 783 *
 784 * It is safe to call this function from atomic context.
 785 *
  786 * Will trigger an automatic deferred table resizing if the table
  787 * utilization grows beyond the 75% watermark checked by the internal
  788 * rht_grow_above_75() helper.
 789 */
 790bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj)
 791{
 792        struct rhashtable_compare_arg arg = {
 793                .ht = ht,
 794                .key = rht_obj(ht, obj) + ht->p.key_offset,
 795        };
 796
 797        BUG_ON(!ht->p.key_len);
 798
 799        return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare,
 800                                                &arg);
 801}
 802EXPORT_SYMBOL_GPL(rhashtable_lookup_insert);
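
     /* A minimal sketch (hypothetical caller): a false return means an entry
      * with the same key already exists, so the new object is discarded:
      *
      *	if (!rhashtable_lookup_insert(&my_ht, &obj->node)) {
      *		kfree(obj);
      *		return -EEXIST;
      *	}
      */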
 803
 804/**
 805 * rhashtable_lookup_compare_insert - search and insert object to hash table
 806 *                                    with compare function
 807 * @ht:         hash table
 808 * @obj:        pointer to hash head inside object
 809 * @compare:    compare function, must return true on match
 810 * @arg:        argument passed on to compare function
 811 *
 812 * Locks down the bucket chain in both the old and new table if a resize
 813 * is in progress to ensure that writers can't remove from the old table
 814 * and can't insert to the new table during the atomic operation of search
 815 * and insertion. Searches for duplicates in both the old and new table if
 816 * a resize is in progress.
 817 *
 818 * Lookups may occur in parallel with hashtable mutations and resizing.
 819 *
  820 * Will trigger an automatic deferred table resizing if the table
  821 * utilization grows beyond the 75% watermark checked by the internal
  822 * rht_grow_above_75() helper.
 823 */
 824bool rhashtable_lookup_compare_insert(struct rhashtable *ht,
 825                                      struct rhash_head *obj,
 826                                      bool (*compare)(void *, void *),
 827                                      void *arg)
 828{
 829        struct bucket_table *new_tbl, *old_tbl;
 830        u32 new_hash;
 831        bool success = true;
 832
 833        BUG_ON(!ht->p.key_len);
 834
 835        rcu_read_lock();
 836        old_tbl = rht_dereference_rcu(ht->tbl, ht);
 837        new_tbl = rht_dereference_rcu(ht->future_tbl, ht);
 838        new_hash = obj_raw_hashfn(ht, rht_obj(ht, obj));
 839
 840        lock_buckets(new_tbl, old_tbl, new_hash);
 841
 842        if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset,
 843                                      compare, arg)) {
 844                success = false;
 845                goto exit;
 846        }
 847
 848        __rhashtable_insert(ht, obj, new_tbl, old_tbl, new_hash);
 849
 850exit:
 851        unlock_buckets(new_tbl, old_tbl, new_hash);
 852        rcu_read_unlock();
 853
 854        return success;
 855}
 856EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert);
 857
 858/**
 859 * rhashtable_walk_init - Initialise an iterator
 860 * @ht:         Table to walk over
 861 * @iter:       Hash table Iterator
 862 *
 863 * This function prepares a hash table walk.
 864 *
 865 * Note that if you restart a walk after rhashtable_walk_stop you
 866 * may see the same object twice.  Also, you may miss objects if
 867 * there are removals in between rhashtable_walk_stop and the next
 868 * call to rhashtable_walk_start.
 869 *
 870 * For a completely stable walk you should construct your own data
 871 * structure outside the hash table.
 872 *
 873 * This function may sleep so you must not call it from interrupt
 874 * context or with spin locks held.
 875 *
 876 * You must call rhashtable_walk_exit if this function returns
 877 * successfully.
 878 */
 879int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter)
 880{
 881        iter->ht = ht;
 882        iter->p = NULL;
 883        iter->slot = 0;
 884        iter->skip = 0;
 885
 886        iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL);
 887        if (!iter->walker)
 888                return -ENOMEM;
 889
 890        INIT_LIST_HEAD(&iter->walker->list);
 891        iter->walker->resize = false;
 892
 893        mutex_lock(&ht->mutex);
 894        list_add(&iter->walker->list, &ht->walkers);
 895        mutex_unlock(&ht->mutex);
 896
 897        return 0;
 898}
 899EXPORT_SYMBOL_GPL(rhashtable_walk_init);
 900
 901/**
 902 * rhashtable_walk_exit - Free an iterator
 903 * @iter:       Hash table Iterator
 904 *
 905 * This function frees resources allocated by rhashtable_walk_init.
 906 */
 907void rhashtable_walk_exit(struct rhashtable_iter *iter)
 908{
 909        mutex_lock(&iter->ht->mutex);
 910        list_del(&iter->walker->list);
 911        mutex_unlock(&iter->ht->mutex);
 912        kfree(iter->walker);
 913}
 914EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
 915
 916/**
 917 * rhashtable_walk_start - Start a hash table walk
 918 * @iter:       Hash table iterator
 919 *
 920 * Start a hash table walk.  Note that we take the RCU lock in all
 921 * cases including when we return an error.  So you must always call
 922 * rhashtable_walk_stop to clean up.
 923 *
 924 * Returns zero if successful.
 925 *
  926 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
 927 * will rewind back to the beginning and you may use it immediately
 928 * by calling rhashtable_walk_next.
 929 */
 930int rhashtable_walk_start(struct rhashtable_iter *iter)
 931{
 932        rcu_read_lock();
 933
 934        if (iter->walker->resize) {
 935                iter->slot = 0;
 936                iter->skip = 0;
 937                iter->walker->resize = false;
 938                return -EAGAIN;
 939        }
 940
 941        return 0;
 942}
 943EXPORT_SYMBOL_GPL(rhashtable_walk_start);
 944
 945/**
 946 * rhashtable_walk_next - Return the next object and advance the iterator
 947 * @iter:       Hash table iterator
 948 *
 949 * Note that you must call rhashtable_walk_stop when you are finished
 950 * with the walk.
 951 *
 952 * Returns the next object or NULL when the end of the table is reached.
 953 *
  954 * Returns -EAGAIN if a resize event occurred.  Note that the iterator
 955 * will rewind back to the beginning and you may continue to use it.
 956 */
 957void *rhashtable_walk_next(struct rhashtable_iter *iter)
 958{
 959        const struct bucket_table *tbl;
 960        struct rhashtable *ht = iter->ht;
 961        struct rhash_head *p = iter->p;
 962        void *obj = NULL;
 963
 964        tbl = rht_dereference_rcu(ht->tbl, ht);
 965
 966        if (p) {
 967                p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot);
 968                goto next;
 969        }
 970
 971        for (; iter->slot < tbl->size; iter->slot++) {
 972                int skip = iter->skip;
 973
 974                rht_for_each_rcu(p, tbl, iter->slot) {
 975                        if (!skip)
 976                                break;
 977                        skip--;
 978                }
 979
 980next:
 981                if (!rht_is_a_nulls(p)) {
 982                        iter->skip++;
 983                        iter->p = p;
 984                        obj = rht_obj(ht, p);
 985                        goto out;
 986                }
 987
 988                iter->skip = 0;
 989        }
 990
 991        iter->p = NULL;
 992
 993out:
 994        if (iter->walker->resize) {
 995                iter->p = NULL;
 996                iter->slot = 0;
 997                iter->skip = 0;
 998                iter->walker->resize = false;
 999                return ERR_PTR(-EAGAIN);
1000        }
1001
1002        return obj;
1003}
1004EXPORT_SYMBOL_GPL(rhashtable_walk_next);
1005
1006/**
1007 * rhashtable_walk_stop - Finish a hash table walk
1008 * @iter:       Hash table iterator
1009 *
1010 * Finish a hash table walk.
1011 */
1012void rhashtable_walk_stop(struct rhashtable_iter *iter)
1013{
1014        rcu_read_unlock();
1015        iter->p = NULL;
1016}
1017EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
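
     /* A minimal iteration sketch tying the walker calls together
      * (hypothetical caller; the -EAGAIN cases merely rewind the iterator,
      * so objects may be seen twice as noted in rhashtable_walk_init()):
      *
      *	struct rhashtable_iter iter;
      *	struct test_obj *obj;
      *
      *	if (rhashtable_walk_init(&my_ht, &iter))
      *		return;
      *	rhashtable_walk_start(&iter);
      *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
      *		if (IS_ERR(obj))
      *			continue;
      *		pr_debug("visiting key %d\n", obj->key);
      *	}
      *	rhashtable_walk_stop(&iter);
      *	rhashtable_walk_exit(&iter);
      */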
1018
1019static size_t rounded_hashtable_size(struct rhashtable_params *params)
1020{
1021        return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
1022                   1UL << params->min_shift);
1023}
1024
1025/**
1026 * rhashtable_init - initialize a new hash table
1027 * @ht:         hash table to be initialized
1028 * @params:     configuration parameters
1029 *
1030 * Initializes a new hash table based on the provided configuration
1031 * parameters. A table can be configured either with a variable or
1032 * fixed length key:
1033 *
1034 * Configuration Example 1: Fixed length keys
1035 * struct test_obj {
1036 *      int                     key;
1037 *      void *                  my_member;
1038 *      struct rhash_head       node;
1039 * };
1040 *
1041 * struct rhashtable_params params = {
1042 *      .head_offset = offsetof(struct test_obj, node),
1043 *      .key_offset = offsetof(struct test_obj, key),
1044 *      .key_len = sizeof(int),
1045 *      .hashfn = jhash,
1046 *      .nulls_base = (1U << RHT_BASE_SHIFT),
1047 * };
1048 *
1049 * Configuration Example 2: Variable length keys
1050 * struct test_obj {
1051 *      [...]
1052 *      struct rhash_head       node;
1053 * };
1054 *
1055 * u32 my_hash_fn(const void *data, u32 seed)
1056 * {
1057 *      struct test_obj *obj = data;
1058 *
1059 *      return [... hash ...];
1060 * }
1061 *
1062 * struct rhashtable_params params = {
1063 *      .head_offset = offsetof(struct test_obj, node),
1064 *      .hashfn = jhash,
1065 *      .obj_hashfn = my_hash_fn,
1066 * };
1067 */
1068int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
1069{
1070        struct bucket_table *tbl;
1071        size_t size;
1072
1073        size = HASH_DEFAULT_SIZE;
1074
1075        if ((params->key_len && !params->hashfn) ||
1076            (!params->key_len && !params->obj_hashfn))
1077                return -EINVAL;
1078
1079        if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
1080                return -EINVAL;
1081
1082        params->min_shift = max_t(size_t, params->min_shift,
1083                                  ilog2(HASH_MIN_SIZE));
1084
1085        if (params->nelem_hint)
1086                size = rounded_hashtable_size(params);
1087
1088        memset(ht, 0, sizeof(*ht));
1089        mutex_init(&ht->mutex);
1090        memcpy(&ht->p, params, sizeof(*params));
1091        INIT_LIST_HEAD(&ht->walkers);
1092
1093        if (params->locks_mul)
1094                ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
1095        else
1096                ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;
1097
1098        tbl = bucket_table_alloc(ht, size);
1099        if (tbl == NULL)
1100                return -ENOMEM;
1101
1102        atomic_set(&ht->nelems, 0);
1103        atomic_set(&ht->shift, ilog2(tbl->size));
1104        RCU_INIT_POINTER(ht->tbl, tbl);
1105        RCU_INIT_POINTER(ht->future_tbl, tbl);
1106
1107        if (!ht->p.hash_rnd)
1108                get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
1109
1110        INIT_WORK(&ht->run_work, rht_deferred_worker);
1111
1112        return 0;
1113}
1114EXPORT_SYMBOL_GPL(rhashtable_init);
1115
1116/**
1117 * rhashtable_destroy - destroy hash table
1118 * @ht:         the hash table to destroy
1119 *
 1120 * Frees the bucket array. This function is not RCU safe, therefore the caller
 1121 * has to make sure that no resizing may happen by unpublishing the hashtable
 1122 * and waiting for an RCU grace period before releasing the bucket array.
1123 */
1124void rhashtable_destroy(struct rhashtable *ht)
1125{
1126        ht->being_destroyed = true;
1127
1128        cancel_work_sync(&ht->run_work);
1129
1130        mutex_lock(&ht->mutex);
1131        bucket_table_free(rht_dereference(ht->tbl, ht));
1132        mutex_unlock(&ht->mutex);
1133}
1134EXPORT_SYMBOL_GPL(rhashtable_destroy);
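
     /* A minimal lifecycle sketch (hypothetical caller; "params" is set up as
      * in Configuration Example 1 of rhashtable_init() above):
      *
      *	static struct rhashtable my_ht;
      *
      *	int err = rhashtable_init(&my_ht, &params);
      *
      *	if (err)
      *		return err;
      *	...insert, lookup and remove objects...
      *	rhashtable_destroy(&my_ht);
      */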
1135