linux/kernel/bpf/hashtab.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   3 * Copyright (c) 2016 Facebook
   4 */
   5#include <linux/bpf.h>
   6#include <linux/btf.h>
   7#include <linux/jhash.h>
   8#include <linux/filter.h>
   9#include <linux/rculist_nulls.h>
  10#include <linux/random.h>
  11#include <uapi/linux/btf.h>
  12#include <linux/rcupdate_trace.h>
  13#include "percpu_freelist.h"
  14#include "bpf_lru_list.h"
  15#include "map_in_map.h"
  16
  17#define HTAB_CREATE_FLAG_MASK                                           \
  18        (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
  19         BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
  20
  21#define BATCH_OPS(_name)                        \
  22        .map_lookup_batch =                     \
  23        _name##_map_lookup_batch,               \
  24        .map_lookup_and_delete_batch =          \
  25        _name##_map_lookup_and_delete_batch,    \
  26        .map_update_batch =                     \
  27        generic_map_update_batch,               \
  28        .map_delete_batch =                     \
  29        generic_map_delete_batch
  30
  31/*
  32 * The bucket lock has two protection scopes:
  33 *
  34 * 1) Serializing concurrent operations from BPF programs on different
  35 *    CPUs
  36 *
  37 * 2) Serializing concurrent operations from BPF programs and sys_bpf()
  38 *
  39 * BPF programs can execute in any context including perf, kprobes and
  40 * tracing. As there are almost no limits where perf, kprobes and tracing
  41 * can be invoked from, the lock operations need to be protected against
  42 * deadlocks. Deadlocks can be caused by recursion and by an invocation in
  43 * the lock held section when functions which acquire this lock are invoked
  44 * from sys_bpf(). BPF recursion is prevented by incrementing the per CPU
  45 * variable bpf_prog_active, which prevents BPF programs attached to perf
  46 * events, kprobes and tracing from being invoked before the prior invocation
  47 * from one of these contexts has completed. sys_bpf() uses the same mechanism
  48 * by pinning the task to the current CPU and incrementing the recursion
  49 * protection across the map operation.
  50 *
  51 * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
  52 * operations like memory allocations (even with GFP_ATOMIC) from atomic
  53 * contexts. This is required because even with GFP_ATOMIC the memory
  54 * allocator calls into code paths which acquire locks with long held lock
  55 * sections. To ensure deterministic behaviour, these locks are regular
  56 * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
  57 * true atomic contexts on an RT kernel are the low level hardware
  58 * handling, scheduling, low level interrupt handling, NMIs etc. None of
  59 * these contexts should ever do memory allocations.
  60 *
  61 * As regular device interrupt handlers and soft interrupts are forced into
  62 * thread context, the existing code which does
  63 *   spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
  64 * just works.
  65 *
  66 * In theory the BPF locks could be converted to regular spinlocks as well,
  67 * but the bucket locks and percpu_freelist locks can be taken from
  68 * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
  69 * atomic contexts even on RT. These mechanisms require preallocated maps,
  70 * so there is no need to invoke memory allocations within the lock held
  71 * sections.
  72 *
  73 * BPF maps which need dynamic allocation are only used from (forced)
  74 * thread context on RT and can therefore use regular spinlocks which in
  75 * turn allows memory allocations to be invoked from the lock held section.
  76 *
  77 * On a non RT kernel this distinction is neither possible nor required.
  78 * spinlock maps to raw_spinlock and the extra code is optimized out by the
  79 * compiler.
  80 */
  81struct bucket {
  82        struct hlist_nulls_head head;
  83        union {
  84                raw_spinlock_t raw_lock;
  85                spinlock_t     lock;
  86        };
  87};
  88
  89#define HASHTAB_MAP_LOCK_COUNT 8
  90#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
  91
  92struct bpf_htab {
  93        struct bpf_map map;
  94        struct bucket *buckets;
  95        void *elems;
  96        union {
  97                struct pcpu_freelist freelist;
  98                struct bpf_lru lru;
  99        };
 100        struct htab_elem *__percpu *extra_elems;
 101        atomic_t count; /* number of elements in this hashtable */
 102        u32 n_buckets;  /* number of hash buckets */
 103        u32 elem_size;  /* size of each element in bytes */
 104        u32 hashrnd;
 105        struct lock_class_key lockdep_key;
 106        int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
 107};
 108
 109/* each htab element is struct htab_elem + key + value */
 110struct htab_elem {
 111        union {
 112                struct hlist_nulls_node hash_node;
 113                struct {
 114                        void *padding;
 115                        union {
 116                                struct bpf_htab *htab;
 117                                struct pcpu_freelist_node fnode;
 118                                struct htab_elem *batch_flink;
 119                        };
 120                };
 121        };
 122        union {
 123                struct rcu_head rcu;
 124                struct bpf_lru_node lru_node;
 125        };
 126        u32 hash;
 127        char key[] __aligned(8);
 128};
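
/*
 * Illustrative element layout, pieced together from the elem_size
 * computation in htab_map_alloc() and from the accessors below (a sketch,
 * not a definition the code relies on):
 *
 *	struct htab_elem	list/freelist linkage, rcu/lru, hash
 *	key[key_size]		starts at l->key
 *	padding			up to round_up(key_size, 8)
 *	value			regular maps: l->key + round_up(key_size, 8)
 *
 * Per-cpu maps do not embed the value here; they store a void __percpu *
 * at l->key + key_size instead (see htab_elem_set_ptr()).
 */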
 129
 130static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 131{
 132        return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 133}
 134
 135static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
 136{
 137        return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
 138}
 139
 140static void htab_init_buckets(struct bpf_htab *htab)
 141{
 142        unsigned i;
 143
 144        for (i = 0; i < htab->n_buckets; i++) {
 145                INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 146                if (htab_use_raw_lock(htab)) {
 147                        raw_spin_lock_init(&htab->buckets[i].raw_lock);
 148                        lockdep_set_class(&htab->buckets[i].raw_lock,
 149                                          &htab->lockdep_key);
 150                } else {
 151                        spin_lock_init(&htab->buckets[i].lock);
 152                        lockdep_set_class(&htab->buckets[i].lock,
 153                                          &htab->lockdep_key);
 154                }
 155                cond_resched();
 156        }
 157}
 158
 159static inline int htab_lock_bucket(const struct bpf_htab *htab,
 160                                   struct bucket *b, u32 hash,
 161                                   unsigned long *pflags)
 162{
 163        unsigned long flags;
 164
 165        hash = hash & HASHTAB_MAP_LOCK_MASK;
 166
 167        migrate_disable();
 168        if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
 169                __this_cpu_dec(*(htab->map_locked[hash]));
 170                migrate_enable();
 171                return -EBUSY;
 172        }
 173
 174        if (htab_use_raw_lock(htab))
 175                raw_spin_lock_irqsave(&b->raw_lock, flags);
 176        else
 177                spin_lock_irqsave(&b->lock, flags);
 178        *pflags = flags;
 179
 180        return 0;
 181}
 182
 183static inline void htab_unlock_bucket(const struct bpf_htab *htab,
 184                                      struct bucket *b, u32 hash,
 185                                      unsigned long flags)
 186{
 187        hash = hash & HASHTAB_MAP_LOCK_MASK;
 188        if (htab_use_raw_lock(htab))
 189                raw_spin_unlock_irqrestore(&b->raw_lock, flags);
 190        else
 191                spin_unlock_irqrestore(&b->lock, flags);
 192        __this_cpu_dec(*(htab->map_locked[hash]));
 193        migrate_enable();
 194}
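
/*
 * A minimal usage sketch for the two helpers above; the update and delete
 * paths later in this file all follow this shape:
 *
 *	ret = htab_lock_bucket(htab, b, hash, &flags);
 *	if (ret)
 *		return ret;		(-EBUSY: re-entered on this CPU)
 *	... modify the bucket's hlist ...
 *	htab_unlock_bucket(htab, b, hash, flags);
 *
 * The per-cpu map_locked[hash & HASHTAB_MAP_LOCK_MASK] counter turns a
 * re-entrant attempt (e.g. a tracing program firing while the bucket lock
 * is held on the same CPU) into -EBUSY instead of a deadlock.
 */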
 195
 196static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
 197
 198static bool htab_is_lru(const struct bpf_htab *htab)
 199{
 200        return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
 201                htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 202}
 203
 204static bool htab_is_percpu(const struct bpf_htab *htab)
 205{
 206        return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 207                htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 208}
 209
 210static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 211                                     void __percpu *pptr)
 212{
 213        *(void __percpu **)(l->key + key_size) = pptr;
 214}
 215
 216static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
 217{
 218        return *(void __percpu **)(l->key + key_size);
 219}
 220
 221static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
 222{
 223        return *(void **)(l->key + roundup(map->key_size, 8));
 224}
 225
 226static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 227{
 228        return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
 229}
 230
 231static bool htab_has_extra_elems(struct bpf_htab *htab)
 232{
 233        return !htab_is_percpu(htab) && !htab_is_lru(htab);
 234}
 235
 236static void htab_free_prealloced_timers(struct bpf_htab *htab)
 237{
 238        u32 num_entries = htab->map.max_entries;
 239        int i;
 240
 241        if (likely(!map_value_has_timer(&htab->map)))
 242                return;
 243        if (htab_has_extra_elems(htab))
 244                num_entries += num_possible_cpus();
 245
 246        for (i = 0; i < num_entries; i++) {
 247                struct htab_elem *elem;
 248
 249                elem = get_htab_elem(htab, i);
 250                bpf_timer_cancel_and_free(elem->key +
 251                                          round_up(htab->map.key_size, 8) +
 252                                          htab->map.timer_off);
 253                cond_resched();
 254        }
 255}
 256
 257static void htab_free_elems(struct bpf_htab *htab)
 258{
 259        int i;
 260
 261        if (!htab_is_percpu(htab))
 262                goto free_elems;
 263
 264        for (i = 0; i < htab->map.max_entries; i++) {
 265                void __percpu *pptr;
 266
 267                pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
 268                                         htab->map.key_size);
 269                free_percpu(pptr);
 270                cond_resched();
 271        }
 272free_elems:
 273        bpf_map_area_free(htab->elems);
 274}
 275
 276/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
 277 * (bucket_lock). If both locks need to be acquired together, the lock
 278 * order is always lru_lock -> bucket_lock and this only happens in
 279 * bpf_lru_list.c logic. For example, certain code paths of
 280 * bpf_lru_pop_free(), which is called by prealloc_lru_pop(),
 281 * will acquire lru_lock first, followed by bucket_lock.
 282 *
 283 * In hashtab.c, to avoid deadlock, lock acquisition of
 284 * bucket_lock followed by lru_lock is not allowed. In such cases,
 285 * bucket_lock needs to be released first before acquiring lru_lock.
 286 */
 287static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 288                                          u32 hash)
 289{
 290        struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
 291        struct htab_elem *l;
 292
 293        if (node) {
 294                u32 key_size = htab->map.key_size;
 295
 296                l = container_of(node, struct htab_elem, lru_node);
 297                memcpy(l->key, key, key_size);
 298                check_and_init_map_value(&htab->map,
 299                                         l->key + round_up(key_size, 8));
 300                return l;
 301        }
 302
 303        return NULL;
 304}
 305
 306static int prealloc_init(struct bpf_htab *htab)
 307{
 308        u32 num_entries = htab->map.max_entries;
 309        int err = -ENOMEM, i;
 310
 311        if (htab_has_extra_elems(htab))
 312                num_entries += num_possible_cpus();
 313
 314        htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
 315                                         htab->map.numa_node);
 316        if (!htab->elems)
 317                return -ENOMEM;
 318
 319        if (!htab_is_percpu(htab))
 320                goto skip_percpu_elems;
 321
 322        for (i = 0; i < num_entries; i++) {
 323                u32 size = round_up(htab->map.value_size, 8);
 324                void __percpu *pptr;
 325
 326                pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
 327                                            GFP_USER | __GFP_NOWARN);
 328                if (!pptr)
 329                        goto free_elems;
 330                htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
 331                                  pptr);
 332                cond_resched();
 333        }
 334
 335skip_percpu_elems:
 336        if (htab_is_lru(htab))
 337                err = bpf_lru_init(&htab->lru,
 338                                   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
 339                                   offsetof(struct htab_elem, hash) -
 340                                   offsetof(struct htab_elem, lru_node),
 341                                   htab_lru_map_delete_node,
 342                                   htab);
 343        else
 344                err = pcpu_freelist_init(&htab->freelist);
 345
 346        if (err)
 347                goto free_elems;
 348
 349        if (htab_is_lru(htab))
 350                bpf_lru_populate(&htab->lru, htab->elems,
 351                                 offsetof(struct htab_elem, lru_node),
 352                                 htab->elem_size, num_entries);
 353        else
 354                pcpu_freelist_populate(&htab->freelist,
 355                                       htab->elems + offsetof(struct htab_elem, fnode),
 356                                       htab->elem_size, num_entries);
 357
 358        return 0;
 359
 360free_elems:
 361        htab_free_elems(htab);
 362        return err;
 363}
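
/*
 * Note: after prealloc_init() the element area is one flat allocation,
 * carved up as get_htab_elem(htab, i) == htab->elems + i * elem_size,
 * with num_possible_cpus() extra slots appended for maps that use
 * extra_elems (see htab_has_extra_elems() and alloc_extra_elems()).
 */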
 364
 365static void prealloc_destroy(struct bpf_htab *htab)
 366{
 367        htab_free_elems(htab);
 368
 369        if (htab_is_lru(htab))
 370                bpf_lru_destroy(&htab->lru);
 371        else
 372                pcpu_freelist_destroy(&htab->freelist);
 373}
 374
 375static int alloc_extra_elems(struct bpf_htab *htab)
 376{
 377        struct htab_elem *__percpu *pptr, *l_new;
 378        struct pcpu_freelist_node *l;
 379        int cpu;
 380
 381        pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
 382                                    GFP_USER | __GFP_NOWARN);
 383        if (!pptr)
 384                return -ENOMEM;
 385
 386        for_each_possible_cpu(cpu) {
 387                l = pcpu_freelist_pop(&htab->freelist);
 388                /* pop will succeed, since prealloc_init()
 389                 * preallocated extra num_possible_cpus elements
 390                 */
 391                l_new = container_of(l, struct htab_elem, fnode);
 392                *per_cpu_ptr(pptr, cpu) = l_new;
 393        }
 394        htab->extra_elems = pptr;
 395        return 0;
 396}
 397
 398/* Called from syscall */
 399static int htab_map_alloc_check(union bpf_attr *attr)
 400{
 401        bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 402                       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 403        bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 404                    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 405        /* percpu_lru means each cpu has its own LRU list.
 406         * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 407         * the map's value itself is percpu.  percpu_lru has
 408         * nothing to do with the map's value.
 409         */
 410        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 411        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 412        bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED);
 413        int numa_node = bpf_map_attr_numa_node(attr);
 414
 415        BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
 416                     offsetof(struct htab_elem, hash_node.pprev));
 417        BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 418                     offsetof(struct htab_elem, hash_node.pprev));
 419
 420        if (lru && !bpf_capable())
 421                /* LRU implementation is much more complicated than other
 422                 * maps.  Hence, limit to CAP_BPF.
 423                 */
 424                return -EPERM;
 425
 426        if (zero_seed && !capable(CAP_SYS_ADMIN))
 427                /* Guard against local DoS, and discourage production use. */
 428                return -EPERM;
 429
 430        if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
 431            !bpf_map_flags_access_ok(attr->map_flags))
 432                return -EINVAL;
 433
 434        if (!lru && percpu_lru)
 435                return -EINVAL;
 436
 437        if (lru && !prealloc)
 438                return -ENOTSUPP;
 439
 440        if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
 441                return -EINVAL;
 442
 443        /* check sanity of attributes.
 444         * value_size == 0 may be allowed in the future to use map as a set
 445         */
 446        if (attr->max_entries == 0 || attr->key_size == 0 ||
 447            attr->value_size == 0)
 448                return -EINVAL;
 449
 450        if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
 451           sizeof(struct htab_elem))
 452                /* if key_size + value_size is bigger, the user space won't be
 453                 * able to access the elements via bpf syscall. This check
 454                 * also makes sure that the elem_size doesn't overflow and it's
 455                 * kmalloc-able later in htab_map_update_elem()
 456                 */
 457                return -E2BIG;
 458
 459        return 0;
 460}
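
/*
 * A map definition that satisfies the checks above might look like this
 * from the BPF program side (illustrative only; assumes libbpf's
 * SEC/__uint/__type conventions, and "my_hash" is a made-up name):
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_HASH);
 *		__uint(max_entries, 1024);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} my_hash SEC(".maps");
 */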
 461
 462static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 463{
 464        bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 465                       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 466        bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 467                    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 468        /* percpu_lru means each cpu has its own LRU list.
 469         * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 470         * the map's value itself is percpu.  percpu_lru has
 471         * nothing to do with the map's value.
 472         */
 473        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 474        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 475        struct bpf_htab *htab;
 476        int err, i;
 477
 478        htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
 479        if (!htab)
 480                return ERR_PTR(-ENOMEM);
 481
 482        lockdep_register_key(&htab->lockdep_key);
 483
 484        bpf_map_init_from_attr(&htab->map, attr);
 485
 486        if (percpu_lru) {
 487                /* ensure each CPU's lru list has >=1 elements.
 488                 * since we are at it, make each lru list have the same
 489                 * number of elements.
 490                 */
 491                htab->map.max_entries = roundup(attr->max_entries,
 492                                                num_possible_cpus());
 493                if (htab->map.max_entries < attr->max_entries)
 494                        htab->map.max_entries = rounddown(attr->max_entries,
 495                                                          num_possible_cpus());
 496        }
 497
 498        /* hash table size must be power of 2 */
 499        htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 500
 501        htab->elem_size = sizeof(struct htab_elem) +
 502                          round_up(htab->map.key_size, 8);
 503        if (percpu)
 504                htab->elem_size += sizeof(void *);
 505        else
 506                htab->elem_size += round_up(htab->map.value_size, 8);
 507
 508        err = -E2BIG;
 509        /* prevent zero size kmalloc and check for u32 overflow */
 510        if (htab->n_buckets == 0 ||
 511            htab->n_buckets > U32_MAX / sizeof(struct bucket))
 512                goto free_htab;
 513
 514        err = -ENOMEM;
 515        htab->buckets = bpf_map_area_alloc(htab->n_buckets *
 516                                           sizeof(struct bucket),
 517                                           htab->map.numa_node);
 518        if (!htab->buckets)
 519                goto free_htab;
 520
 521        for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
 522                htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
 523                                                           sizeof(int),
 524                                                           sizeof(int),
 525                                                           GFP_USER);
 526                if (!htab->map_locked[i])
 527                        goto free_map_locked;
 528        }
 529
 530        if (htab->map.map_flags & BPF_F_ZERO_SEED)
 531                htab->hashrnd = 0;
 532        else
 533                htab->hashrnd = get_random_int();
 534
 535        htab_init_buckets(htab);
 536
 537        if (prealloc) {
 538                err = prealloc_init(htab);
 539                if (err)
 540                        goto free_map_locked;
 541
 542                if (!percpu && !lru) {
 543                        /* lru itself can remove the least used element, so
 544                         * there is no need for an extra elem during map_update.
 545                         */
 546                        err = alloc_extra_elems(htab);
 547                        if (err)
 548                                goto free_prealloc;
 549                }
 550        }
 551
 552        return &htab->map;
 553
 554free_prealloc:
 555        prealloc_destroy(htab);
 556free_map_locked:
 557        for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
 558                free_percpu(htab->map_locked[i]);
 559        bpf_map_area_free(htab->buckets);
 560free_htab:
 561        lockdep_unregister_key(&htab->lockdep_key);
 562        kfree(htab);
 563        return ERR_PTR(err);
 564}
 565
 566static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
 567{
 568        return jhash(key, key_len, hashrnd);
 569}
 570
 571static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 572{
 573        return &htab->buckets[hash & (htab->n_buckets - 1)];
 574}
 575
 576static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 577{
 578        return &__select_bucket(htab, hash)->head;
 579}
 580
 581/* this lookup function can only be called with bucket lock taken */
 582static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 583                                         void *key, u32 key_size)
 584{
 585        struct hlist_nulls_node *n;
 586        struct htab_elem *l;
 587
 588        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 589                if (l->hash == hash && !memcmp(&l->key, key, key_size))
 590                        return l;
 591
 592        return NULL;
 593}
 594
 595/* can be called without bucket lock. it will repeat the loop in
 596 * the unlikely event when elements moved from one bucket into another
 597 * while the linked list is being walked
 598 */
 599static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
 600                                               u32 hash, void *key,
 601                                               u32 key_size, u32 n_buckets)
 602{
 603        struct hlist_nulls_node *n;
 604        struct htab_elem *l;
 605
 606again:
 607        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 608                if (l->hash == hash && !memcmp(&l->key, key, key_size))
 609                        return l;
 610
 611        if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
 612                goto again;
 613
 614        return NULL;
 615}
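
/*
 * The retry above works because every bucket's nulls value is its own
 * index (see htab_init_buckets()). Preallocated elements can be recycled
 * through the freelist and reappear in another bucket without waiting for
 * an RCU grace period, so a lockless walker may drift into a different
 * chain; if the walk then terminates on a nulls marker whose value is not
 * (hash & (n_buckets - 1)), the walker simply starts over.
 */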
 616
 617/* Called from syscall or from eBPF program directly, so
 618 * arguments have to match bpf_map_lookup_elem() exactly.
 619 * The return value is adjusted by BPF instructions
 620 * in htab_map_gen_lookup().
 621 */
 622static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 623{
 624        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 625        struct hlist_nulls_head *head;
 626        struct htab_elem *l;
 627        u32 hash, key_size;
 628
 629        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
 630                     !rcu_read_lock_bh_held());
 631
 632        key_size = map->key_size;
 633
 634        hash = htab_map_hash(key, key_size, htab->hashrnd);
 635
 636        head = select_bucket(htab, hash);
 637
 638        l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 639
 640        return l;
 641}
 642
 643static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 644{
 645        struct htab_elem *l = __htab_map_lookup_elem(map, key);
 646
 647        if (l)
 648                return l->key + round_up(map->key_size, 8);
 649
 650        return NULL;
 651}
 652
 653/* inline bpf_map_lookup_elem() call.
 654 * Instead of:
 655 * bpf_prog
 656 *   bpf_map_lookup_elem
 657 *     map->ops->map_lookup_elem
 658 *       htab_map_lookup_elem
 659 *         __htab_map_lookup_elem
 660 * do:
 661 * bpf_prog
 662 *   __htab_map_lookup_elem
 663 */
 664static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 665{
 666        struct bpf_insn *insn = insn_buf;
 667        const int ret = BPF_REG_0;
 668
 669        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 670                     (void *(*)(struct bpf_map *map, void *key))NULL));
 671        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 672        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 673        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 674                                offsetof(struct htab_elem, key) +
 675                                round_up(map->key_size, 8));
 676        return insn - insn_buf;
 677}
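
/*
 * Roughly, the instructions emitted above are equivalent to:
 *
 *	elem = __htab_map_lookup_elem(map, key);
 *	if (elem)
 *		elem = (void *)elem + offsetof(struct htab_elem, key) +
 *		       round_up(map->key_size, 8);
 *	return elem;
 */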
 678
 679static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
 680                                                        void *key, const bool mark)
 681{
 682        struct htab_elem *l = __htab_map_lookup_elem(map, key);
 683
 684        if (l) {
 685                if (mark)
 686                        bpf_lru_node_set_ref(&l->lru_node);
 687                return l->key + round_up(map->key_size, 8);
 688        }
 689
 690        return NULL;
 691}
 692
 693static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 694{
 695        return __htab_lru_map_lookup_elem(map, key, true);
 696}
 697
 698static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
 699{
 700        return __htab_lru_map_lookup_elem(map, key, false);
 701}
 702
 703static int htab_lru_map_gen_lookup(struct bpf_map *map,
 704                                   struct bpf_insn *insn_buf)
 705{
 706        struct bpf_insn *insn = insn_buf;
 707        const int ret = BPF_REG_0;
 708        const int ref_reg = BPF_REG_1;
 709
 710        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 711                     (void *(*)(struct bpf_map *map, void *key))NULL));
 712        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 713        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 714        *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 715                              offsetof(struct htab_elem, lru_node) +
 716                              offsetof(struct bpf_lru_node, ref));
 717        *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
 718        *insn++ = BPF_ST_MEM(BPF_B, ret,
 719                             offsetof(struct htab_elem, lru_node) +
 720                             offsetof(struct bpf_lru_node, ref),
 721                             1);
 722        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 723                                offsetof(struct htab_elem, key) +
 724                                round_up(map->key_size, 8));
 725        return insn - insn_buf;
 726}
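
/*
 * Same as htab_map_gen_lookup() above, except that on a hit the emitted
 * code also sets lru_node.ref to 1 if it is not already set before
 * returning the value pointer, i.e. the inlined equivalent of
 * bpf_lru_node_set_ref().
 */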
 727
 728static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
 729{
 730        if (unlikely(map_value_has_timer(&htab->map)))
 731                bpf_timer_cancel_and_free(elem->key +
 732                                          round_up(htab->map.key_size, 8) +
 733                                          htab->map.timer_off);
 734}
 735
 736/* It is called from the bpf_lru_list when the LRU needs to delete
 737 * older elements from the htab.
 738 */
 739static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 740{
 741        struct bpf_htab *htab = (struct bpf_htab *)arg;
 742        struct htab_elem *l = NULL, *tgt_l;
 743        struct hlist_nulls_head *head;
 744        struct hlist_nulls_node *n;
 745        unsigned long flags;
 746        struct bucket *b;
 747        int ret;
 748
 749        tgt_l = container_of(node, struct htab_elem, lru_node);
 750        b = __select_bucket(htab, tgt_l->hash);
 751        head = &b->head;
 752
 753        ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
 754        if (ret)
 755                return false;
 756
 757        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 758                if (l == tgt_l) {
 759                        hlist_nulls_del_rcu(&l->hash_node);
 760                        check_and_free_timer(htab, l);
 761                        break;
 762                }
 763
 764        htab_unlock_bucket(htab, b, tgt_l->hash, flags);
 765
 766        return l == tgt_l;
 767}
 768
 769/* Called from syscall */
 770static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 771{
 772        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 773        struct hlist_nulls_head *head;
 774        struct htab_elem *l, *next_l;
 775        u32 hash, key_size;
 776        int i = 0;
 777
 778        WARN_ON_ONCE(!rcu_read_lock_held());
 779
 780        key_size = map->key_size;
 781
 782        if (!key)
 783                goto find_first_elem;
 784
 785        hash = htab_map_hash(key, key_size, htab->hashrnd);
 786
 787        head = select_bucket(htab, hash);
 788
 789        /* lookup the key */
 790        l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 791
 792        if (!l)
 793                goto find_first_elem;
 794
 795        /* key was found, get next key in the same bucket */
 796        next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 797                                  struct htab_elem, hash_node);
 798
 799        if (next_l) {
 800                /* if next elem in this hash list is non-zero, just return it */
 801                memcpy(next_key, next_l->key, key_size);
 802                return 0;
 803        }
 804
 805        /* no more elements in this hash list, go to the next bucket */
 806        i = hash & (htab->n_buckets - 1);
 807        i++;
 808
 809find_first_elem:
 810        /* iterate over buckets */
 811        for (; i < htab->n_buckets; i++) {
 812                head = select_bucket(htab, i);
 813
 814                /* pick first element in the bucket */
 815                next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 816                                          struct htab_elem, hash_node);
 817                if (next_l) {
 818                        /* if it's not empty, just return it */
 819                        memcpy(next_key, next_l->key, key_size);
 820                        return 0;
 821                }
 822        }
 823
 824        /* iterated over all buckets and all elements */
 825        return -ENOENT;
 826}
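
/*
 * From user space this is normally driven in a loop (illustrative, using
 * libbpf's bpf_map_get_next_key() wrapper; error handling trimmed):
 *
 *	__u32 key, next_key;
 *	void *prev = NULL;
 *
 *	while (!bpf_map_get_next_key(map_fd, prev, &next_key)) {
 *		... use next_key ...
 *		key = next_key;
 *		prev = &key;
 *	}
 *
 * The loop ends once all buckets are exhausted and -ENOENT is returned.
 */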
 827
 828static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 829{
 830        if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 831                free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 832        check_and_free_timer(htab, l);
 833        kfree(l);
 834}
 835
 836static void htab_elem_free_rcu(struct rcu_head *head)
 837{
 838        struct htab_elem *l = container_of(head, struct htab_elem, rcu);
 839        struct bpf_htab *htab = l->htab;
 840
 841        htab_elem_free(htab, l);
 842}
 843
 844static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
 845{
 846        struct bpf_map *map = &htab->map;
 847        void *ptr;
 848
 849        if (map->ops->map_fd_put_ptr) {
 850                ptr = fd_htab_map_get_ptr(map, l);
 851                map->ops->map_fd_put_ptr(ptr);
 852        }
 853}
 854
 855static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 856{
 857        htab_put_fd_value(htab, l);
 858
 859        if (htab_is_prealloc(htab)) {
 860                check_and_free_timer(htab, l);
 861                __pcpu_freelist_push(&htab->freelist, &l->fnode);
 862        } else {
 863                atomic_dec(&htab->count);
 864                l->htab = htab;
 865                call_rcu(&l->rcu, htab_elem_free_rcu);
 866        }
 867}
 868
 869static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 870                            void *value, bool onallcpus)
 871{
 872        if (!onallcpus) {
 873                /* copy true value_size bytes */
 874                memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
 875        } else {
 876                u32 size = round_up(htab->map.value_size, 8);
 877                int off = 0, cpu;
 878
 879                for_each_possible_cpu(cpu) {
 880                        bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
 881                                        value + off, size);
 882                        off += size;
 883                }
 884        }
 885}
 886
 887static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
 888                            void *value, bool onallcpus)
 889{
 890        /* When using prealloc and not setting the initial value on all cpus,
 891         * zero-fill element values for other cpus (just as what happens when
 892 * not using prealloc). Otherwise, the bpf program has no way to ensure
 893 * known initial values for cpus other than the current one
 894 * (onallcpus is always false when coming from a bpf prog).
 895         */
 896        if (htab_is_prealloc(htab) && !onallcpus) {
 897                u32 size = round_up(htab->map.value_size, 8);
 898                int current_cpu = raw_smp_processor_id();
 899                int cpu;
 900
 901                for_each_possible_cpu(cpu) {
 902                        if (cpu == current_cpu)
 903                                bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
 904                                                size);
 905                        else
 906                                memset(per_cpu_ptr(pptr, cpu), 0, size);
 907                }
 908        } else {
 909                pcpu_copy_value(htab, pptr, value, onallcpus);
 910        }
 911}
 912
 913static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 914{
 915        return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
 916               BITS_PER_LONG == 64;
 917}
 918
 919static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 920                                         void *value, u32 key_size, u32 hash,
 921                                         bool percpu, bool onallcpus,
 922                                         struct htab_elem *old_elem)
 923{
 924        u32 size = htab->map.value_size;
 925        bool prealloc = htab_is_prealloc(htab);
 926        struct htab_elem *l_new, **pl_new;
 927        void __percpu *pptr;
 928
 929        if (prealloc) {
 930                if (old_elem) {
 931                        /* if we're updating the existing element,
 932                         * use per-cpu extra elems to avoid freelist_pop/push
 933                         */
 934                        pl_new = this_cpu_ptr(htab->extra_elems);
 935                        l_new = *pl_new;
 936                        htab_put_fd_value(htab, old_elem);
 937                        *pl_new = old_elem;
 938                } else {
 939                        struct pcpu_freelist_node *l;
 940
 941                        l = __pcpu_freelist_pop(&htab->freelist);
 942                        if (!l)
 943                                return ERR_PTR(-E2BIG);
 944                        l_new = container_of(l, struct htab_elem, fnode);
 945                }
 946        } else {
 947                if (atomic_inc_return(&htab->count) > htab->map.max_entries)
 948                        if (!old_elem) {
 949                                /* when map is full and update() is replacing
 950                                 * old element, it's ok to allocate, since
 951                                 * old element will be freed immediately.
 952                                 * Otherwise return an error
 953                                 */
 954                                l_new = ERR_PTR(-E2BIG);
 955                                goto dec_count;
 956                        }
 957                l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
 958                                             GFP_ATOMIC | __GFP_NOWARN,
 959                                             htab->map.numa_node);
 960                if (!l_new) {
 961                        l_new = ERR_PTR(-ENOMEM);
 962                        goto dec_count;
 963                }
 964                check_and_init_map_value(&htab->map,
 965                                         l_new->key + round_up(key_size, 8));
 966        }
 967
 968        memcpy(l_new->key, key, key_size);
 969        if (percpu) {
 970                size = round_up(size, 8);
 971                if (prealloc) {
 972                        pptr = htab_elem_get_ptr(l_new, key_size);
 973                } else {
 974                        /* alloc_percpu zero-fills */
 975                        pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
 976                                                    GFP_ATOMIC | __GFP_NOWARN);
 977                        if (!pptr) {
 978                                kfree(l_new);
 979                                l_new = ERR_PTR(-ENOMEM);
 980                                goto dec_count;
 981                        }
 982                }
 983
 984                pcpu_init_value(htab, pptr, value, onallcpus);
 985
 986                if (!prealloc)
 987                        htab_elem_set_ptr(l_new, key_size, pptr);
 988        } else if (fd_htab_map_needs_adjust(htab)) {
 989                size = round_up(size, 8);
 990                memcpy(l_new->key + round_up(key_size, 8), value, size);
 991        } else {
 992                copy_map_value(&htab->map,
 993                               l_new->key + round_up(key_size, 8),
 994                               value);
 995        }
 996
 997        l_new->hash = hash;
 998        return l_new;
 999dec_count:
1000        atomic_dec(&htab->count);
1001        return l_new;
1002}
1003
1004static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
1005                       u64 map_flags)
1006{
1007        if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
1008                /* elem already exists */
1009                return -EEXIST;
1010
1011        if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
1012                /* elem doesn't exist, cannot update it */
1013                return -ENOENT;
1014
1015        return 0;
1016}
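
/*
 * In other words: BPF_NOEXIST only succeeds when no element with the key
 * exists, BPF_EXIST only succeeds when one does, and BPF_ANY (0) accepts
 * both cases. BPF_F_LOCK is masked out because it changes how the value
 * is copied, not whether the update is allowed.
 */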
1017
1018/* Called from syscall or from eBPF program */
1019static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
1020                                u64 map_flags)
1021{
1022        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1023        struct htab_elem *l_new = NULL, *l_old;
1024        struct hlist_nulls_head *head;
1025        unsigned long flags;
1026        struct bucket *b;
1027        u32 key_size, hash;
1028        int ret;
1029
1030        if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST))
1031                /* unknown flags */
1032                return -EINVAL;
1033
1034        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1035                     !rcu_read_lock_bh_held());
1036
1037        key_size = map->key_size;
1038
1039        hash = htab_map_hash(key, key_size, htab->hashrnd);
1040
1041        b = __select_bucket(htab, hash);
1042        head = &b->head;
1043
1044        if (unlikely(map_flags & BPF_F_LOCK)) {
1045                if (unlikely(!map_value_has_spin_lock(map)))
1046                        return -EINVAL;
1047                /* find an element without taking the bucket lock */
1048                l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
1049                                              htab->n_buckets);
1050                ret = check_flags(htab, l_old, map_flags);
1051                if (ret)
1052                        return ret;
1053                if (l_old) {
1054                        /* grab the element lock and update value in place */
1055                        copy_map_value_locked(map,
1056                                              l_old->key + round_up(key_size, 8),
1057                                              value, false);
1058                        return 0;
1059                }
1060                /* fall through, grab the bucket lock and lookup again.
1061                 * 99.9% chance that the element won't be found,
1062                 * but second lookup under lock has to be done.
1063                 */
1064        }
1065
1066        ret = htab_lock_bucket(htab, b, hash, &flags);
1067        if (ret)
1068                return ret;
1069
1070        l_old = lookup_elem_raw(head, hash, key, key_size);
1071
1072        ret = check_flags(htab, l_old, map_flags);
1073        if (ret)
1074                goto err;
1075
1076        if (unlikely(l_old && (map_flags & BPF_F_LOCK))) {
1077                /* first lookup without the bucket lock didn't find the element,
1078                 * but second lookup with the bucket lock found it.
1079                 * This case is highly unlikely, but has to be dealt with:
1080                 * grab the element lock in addition to the bucket lock
1081                 * and update element in place
1082                 */
1083                copy_map_value_locked(map,
1084                                      l_old->key + round_up(key_size, 8),
1085                                      value, false);
1086                ret = 0;
1087                goto err;
1088        }
1089
1090        l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
1091                                l_old);
1092        if (IS_ERR(l_new)) {
1093                /* all pre-allocated elements are in use or memory exhausted */
1094                ret = PTR_ERR(l_new);
1095                goto err;
1096        }
1097
1098        /* add new element to the head of the list, so that
1099         * concurrent search will find it before old elem
1100         */
1101        hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1102        if (l_old) {
1103                hlist_nulls_del_rcu(&l_old->hash_node);
1104                if (!htab_is_prealloc(htab))
1105                        free_htab_elem(htab, l_old);
1106                else
1107                        check_and_free_timer(htab, l_old);
1108        }
1109        ret = 0;
1110err:
1111        htab_unlock_bucket(htab, b, hash, flags);
1112        return ret;
1113}
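
/*
 * Typical caller's view of the function above, from a BPF program
 * (illustrative; "my_hash" is the made-up map from the earlier sketch):
 *
 *	__u32 key = 1;
 *	__u64 val = 42;
 *	long err;
 *
 *	err = bpf_map_update_elem(&my_hash, &key, &val, BPF_ANY);
 *	if (err)
 *		... -E2BIG, -ENOMEM, -EEXIST/-ENOENT or -EBUSY ...
 */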
1114
1115static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
1116{
1117        check_and_free_timer(htab, elem);
1118        bpf_lru_push_free(&htab->lru, &elem->lru_node);
1119}
1120
1121static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
1122                                    u64 map_flags)
1123{
1124        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1125        struct htab_elem *l_new, *l_old = NULL;
1126        struct hlist_nulls_head *head;
1127        unsigned long flags;
1128        struct bucket *b;
1129        u32 key_size, hash;
1130        int ret;
1131
1132        if (unlikely(map_flags > BPF_EXIST))
1133                /* unknown flags */
1134                return -EINVAL;
1135
1136        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1137                     !rcu_read_lock_bh_held());
1138
1139        key_size = map->key_size;
1140
1141        hash = htab_map_hash(key, key_size, htab->hashrnd);
1142
1143        b = __select_bucket(htab, hash);
1144        head = &b->head;
1145
1146        /* For LRU, we need to alloc before taking bucket's
1147         * spinlock because getting free nodes from LRU may need
1148         * to remove older elements from htab and this removal
1149         * operation will need a bucket lock.
1150         */
1151        l_new = prealloc_lru_pop(htab, key, hash);
1152        if (!l_new)
1153                return -ENOMEM;
1154        copy_map_value(&htab->map,
1155                       l_new->key + round_up(map->key_size, 8), value);
1156
1157        ret = htab_lock_bucket(htab, b, hash, &flags);
1158        if (ret)
1159                return ret;
1160
1161        l_old = lookup_elem_raw(head, hash, key, key_size);
1162
1163        ret = check_flags(htab, l_old, map_flags);
1164        if (ret)
1165                goto err;
1166
1167        /* add new element to the head of the list, so that
1168         * concurrent search will find it before old elem
1169         */
1170        hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1171        if (l_old) {
1172                bpf_lru_node_set_ref(&l_new->lru_node);
1173                hlist_nulls_del_rcu(&l_old->hash_node);
1174        }
1175        ret = 0;
1176
1177err:
1178        htab_unlock_bucket(htab, b, hash, flags);
1179
1180        if (ret)
1181                htab_lru_push_free(htab, l_new);
1182        else if (l_old)
1183                htab_lru_push_free(htab, l_old);
1184
1185        return ret;
1186}
1187
1188static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1189                                         void *value, u64 map_flags,
1190                                         bool onallcpus)
1191{
1192        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1193        struct htab_elem *l_new = NULL, *l_old;
1194        struct hlist_nulls_head *head;
1195        unsigned long flags;
1196        struct bucket *b;
1197        u32 key_size, hash;
1198        int ret;
1199
1200        if (unlikely(map_flags > BPF_EXIST))
1201                /* unknown flags */
1202                return -EINVAL;
1203
1204        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1205                     !rcu_read_lock_bh_held());
1206
1207        key_size = map->key_size;
1208
1209        hash = htab_map_hash(key, key_size, htab->hashrnd);
1210
1211        b = __select_bucket(htab, hash);
1212        head = &b->head;
1213
1214        ret = htab_lock_bucket(htab, b, hash, &flags);
1215        if (ret)
1216                return ret;
1217
1218        l_old = lookup_elem_raw(head, hash, key, key_size);
1219
1220        ret = check_flags(htab, l_old, map_flags);
1221        if (ret)
1222                goto err;
1223
1224        if (l_old) {
1225                /* per-cpu hash map can update value in-place */
1226                pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1227                                value, onallcpus);
1228        } else {
1229                l_new = alloc_htab_elem(htab, key, value, key_size,
1230                                        hash, true, onallcpus, NULL);
1231                if (IS_ERR(l_new)) {
1232                        ret = PTR_ERR(l_new);
1233                        goto err;
1234                }
1235                hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1236        }
1237        ret = 0;
1238err:
1239        htab_unlock_bucket(htab, b, hash, flags);
1240        return ret;
1241}
1242
1243static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1244                                             void *value, u64 map_flags,
1245                                             bool onallcpus)
1246{
1247        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1248        struct htab_elem *l_new = NULL, *l_old;
1249        struct hlist_nulls_head *head;
1250        unsigned long flags;
1251        struct bucket *b;
1252        u32 key_size, hash;
1253        int ret;
1254
1255        if (unlikely(map_flags > BPF_EXIST))
1256                /* unknown flags */
1257                return -EINVAL;
1258
1259        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1260                     !rcu_read_lock_bh_held());
1261
1262        key_size = map->key_size;
1263
1264        hash = htab_map_hash(key, key_size, htab->hashrnd);
1265
1266        b = __select_bucket(htab, hash);
1267        head = &b->head;
1268
1269        /* For LRU, we need to alloc before taking bucket's
1270         * spinlock because LRU's elem alloc may need
1271         * to remove older elem from htab and this removal
1272         * operation will need a bucket lock.
1273         */
1274        if (map_flags != BPF_EXIST) {
1275                l_new = prealloc_lru_pop(htab, key, hash);
1276                if (!l_new)
1277                        return -ENOMEM;
1278        }
1279
1280        ret = htab_lock_bucket(htab, b, hash, &flags);
1281        if (ret)
1282                return ret;
1283
1284        l_old = lookup_elem_raw(head, hash, key, key_size);
1285
1286        ret = check_flags(htab, l_old, map_flags);
1287        if (ret)
1288                goto err;
1289
1290        if (l_old) {
1291                bpf_lru_node_set_ref(&l_old->lru_node);
1292
1293                /* per-cpu hash map can update value in-place */
1294                pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1295                                value, onallcpus);
1296        } else {
1297                pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
1298                                value, onallcpus);
1299                hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1300                l_new = NULL;
1301        }
1302        ret = 0;
1303err:
1304        htab_unlock_bucket(htab, b, hash, flags);
1305        if (l_new)
1306                bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1307        return ret;
1308}
1309
1310static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1311                                       void *value, u64 map_flags)
1312{
1313        return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1314}
1315
1316static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1317                                           void *value, u64 map_flags)
1318{
1319        return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1320                                                 false);
1321}
1322
1323/* Called from syscall or from eBPF program */
1324static int htab_map_delete_elem(struct bpf_map *map, void *key)
1325{
1326        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1327        struct hlist_nulls_head *head;
1328        struct bucket *b;
1329        struct htab_elem *l;
1330        unsigned long flags;
1331        u32 hash, key_size;
1332        int ret;
1333
1334        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1335                     !rcu_read_lock_bh_held());
1336
1337        key_size = map->key_size;
1338
1339        hash = htab_map_hash(key, key_size, htab->hashrnd);
1340        b = __select_bucket(htab, hash);
1341        head = &b->head;
1342
1343        ret = htab_lock_bucket(htab, b, hash, &flags);
1344        if (ret)
1345                return ret;
1346
1347        l = lookup_elem_raw(head, hash, key, key_size);
1348
1349        if (l) {
1350                hlist_nulls_del_rcu(&l->hash_node);
1351                free_htab_elem(htab, l);
1352        } else {
1353                ret = -ENOENT;
1354        }
1355
1356        htab_unlock_bucket(htab, b, hash, flags);
1357        return ret;
1358}
1359
1360static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1361{
1362        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1363        struct hlist_nulls_head *head;
1364        struct bucket *b;
1365        struct htab_elem *l;
1366        unsigned long flags;
1367        u32 hash, key_size;
1368        int ret;
1369
1370        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
1371                     !rcu_read_lock_bh_held());
1372
1373        key_size = map->key_size;
1374
1375        hash = htab_map_hash(key, key_size, htab->hashrnd);
1376        b = __select_bucket(htab, hash);
1377        head = &b->head;
1378
1379        ret = htab_lock_bucket(htab, b, hash, &flags);
1380        if (ret)
1381                return ret;
1382
1383        l = lookup_elem_raw(head, hash, key, key_size);
1384
1385        if (l)
1386                hlist_nulls_del_rcu(&l->hash_node);
1387        else
1388                ret = -ENOENT;
1389
1390        htab_unlock_bucket(htab, b, hash, flags);
1391        if (l)
1392                htab_lru_push_free(htab, l);
1393        return ret;
1394}
1395
1396static void delete_all_elements(struct bpf_htab *htab)
1397{
1398        int i;
1399
1400        for (i = 0; i < htab->n_buckets; i++) {
1401                struct hlist_nulls_head *head = select_bucket(htab, i);
1402                struct hlist_nulls_node *n;
1403                struct htab_elem *l;
1404
1405                hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1406                        hlist_nulls_del_rcu(&l->hash_node);
1407                        htab_elem_free(htab, l);
1408                }
1409        }
1410}
1411
1412static void htab_free_malloced_timers(struct bpf_htab *htab)
1413{
1414        int i;
1415
1416        rcu_read_lock();
1417        for (i = 0; i < htab->n_buckets; i++) {
1418                struct hlist_nulls_head *head = select_bucket(htab, i);
1419                struct hlist_nulls_node *n;
1420                struct htab_elem *l;
1421
1422                hlist_nulls_for_each_entry(l, n, head, hash_node)
1423                        check_and_free_timer(htab, l);
1424                cond_resched_rcu();
1425        }
1426        rcu_read_unlock();
1427}
1428
1429static void htab_map_free_timers(struct bpf_map *map)
1430{
1431        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1432
1433        if (likely(!map_value_has_timer(&htab->map)))
1434                return;
1435        if (!htab_is_prealloc(htab))
1436                htab_free_malloced_timers(htab);
1437        else
1438                htab_free_prealloced_timers(htab);
1439}
1440
1441/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1442static void htab_map_free(struct bpf_map *map)
1443{
1444        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1445        int i;
1446
1447        /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback.
1448         * bpf_free_used_maps() is called after bpf prog is no longer executing.
1449         * There is no need to synchronize_rcu() here to protect map elements.
1450         */
1451
1452        /* some of free_htab_elem() callbacks for elements of this map may
1453         * not have executed. Wait for them.
1454         */
1455        rcu_barrier();
1456        if (!htab_is_prealloc(htab))
1457                delete_all_elements(htab);
1458        else
1459                prealloc_destroy(htab);
1460
1461        free_percpu(htab->extra_elems);
1462        bpf_map_area_free(htab->buckets);
1463        for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
1464                free_percpu(htab->map_locked[i]);
1465        lockdep_unregister_key(&htab->lockdep_key);
1466        kfree(htab);
1467}
1468
1469static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
1470                                   struct seq_file *m)
1471{
1472        void *value;
1473
1474        rcu_read_lock();
1475
1476        value = htab_map_lookup_elem(map, key);
1477        if (!value) {
1478                rcu_read_unlock();
1479                return;
1480        }
1481
1482        btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
1483        seq_puts(m, ": ");
1484        btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
1485        seq_puts(m, "\n");
1486
1487        rcu_read_unlock();
1488}
1489
1490static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1491                                             void *value, bool is_lru_map,
1492                                             bool is_percpu, u64 flags)
1493{
1494        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1495        struct hlist_nulls_head *head;
1496        unsigned long bflags;
1497        struct htab_elem *l;
1498        u32 hash, key_size;
1499        struct bucket *b;
1500        int ret;
1501
1502        key_size = map->key_size;
1503
1504        hash = htab_map_hash(key, key_size, htab->hashrnd);
1505        b = __select_bucket(htab, hash);
1506        head = &b->head;
1507
1508        ret = htab_lock_bucket(htab, b, hash, &bflags);
1509        if (ret)
1510                return ret;
1511
1512        l = lookup_elem_raw(head, hash, key, key_size);
1513        if (!l) {
1514                ret = -ENOENT;
1515        } else {
1516                if (is_percpu) {
1517                        u32 roundup_value_size = round_up(map->value_size, 8);
1518                        void __percpu *pptr;
1519                        int off = 0, cpu;
1520
1521                        pptr = htab_elem_get_ptr(l, key_size);
1522                        for_each_possible_cpu(cpu) {
1523                                bpf_long_memcpy(value + off,
1524                                                per_cpu_ptr(pptr, cpu),
1525                                                roundup_value_size);
1526                                off += roundup_value_size;
1527                        }
1528                } else {
1529                        u32 roundup_key_size = round_up(map->key_size, 8);
1530
1531                        if (flags & BPF_F_LOCK)
1532                                copy_map_value_locked(map, value, l->key +
1533                                                      roundup_key_size,
1534                                                      true);
1535                        else
1536                                copy_map_value(map, value, l->key +
1537                                               roundup_key_size);
1538                        check_and_init_map_value(map, value);
1539                }
1540
1541                hlist_nulls_del_rcu(&l->hash_node);
1542                if (!is_lru_map)
1543                        free_htab_elem(htab, l);
1544        }
1545
1546        htab_unlock_bucket(htab, b, hash, bflags);
1547
1548        if (is_lru_map && l)
1549                htab_lru_push_free(htab, l);
1550
1551        return ret;
1552}
1553
1554static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1555                                           void *value, u64 flags)
1556{
1557        return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
1558                                                 flags);
1559}
1560
1561static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1562                                                  void *key, void *value,
1563                                                  u64 flags)
1564{
1565        return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
1566                                                 flags);
1567}
1568
1569static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
1570                                               void *value, u64 flags)
1571{
1572        return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
1573                                                 flags);
1574}
1575
1576static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map,
1577                                                      void *key, void *value,
1578                                                      u64 flags)
1579{
1580        return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
1581                                                 flags);
1582}
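
/*
 * A minimal userspace sketch of reaching the four wrappers above through
 * BPF_MAP_LOOKUP_AND_DELETE_ELEM via libbpf.  It assumes a plain
 * (non-percpu) hash with u32 keys and u64 values; 'map_fd' and 'key' are
 * placeholders supplied by the caller:
 *
 *	__u64 val;
 *
 *	// Returns 0 on success, or an ENOENT-style error when the key
 *	// is not present.
 *	if (!bpf_map_lookup_and_delete_elem(map_fd, &key, &val))
 *		printf("popped %u -> %llu\n", key, val);
 */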
1583
1584static int
1585__htab_map_lookup_and_delete_batch(struct bpf_map *map,
1586                                   const union bpf_attr *attr,
1587                                   union bpf_attr __user *uattr,
1588                                   bool do_delete, bool is_lru_map,
1589                                   bool is_percpu)
1590{
1591        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1592        u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
1593        void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
1594        void __user *uvalues = u64_to_user_ptr(attr->batch.values);
1595        void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
1596        void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1597        u32 batch, max_count, size, bucket_size;
1598        struct htab_elem *node_to_free = NULL;
1599        u64 elem_map_flags, map_flags;
1600        struct hlist_nulls_head *head;
1601        struct hlist_nulls_node *n;
1602        unsigned long flags = 0;
1603        bool locked = false;
1604        struct htab_elem *l;
1605        struct bucket *b;
1606        int ret = 0;
1607
1608        elem_map_flags = attr->batch.elem_flags;
1609        if ((elem_map_flags & ~BPF_F_LOCK) ||
1610            ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
1611                return -EINVAL;
1612
1613        map_flags = attr->batch.flags;
1614        if (map_flags)
1615                return -EINVAL;
1616
1617        max_count = attr->batch.count;
1618        if (!max_count)
1619                return 0;
1620
1621        if (put_user(0, &uattr->batch.count))
1622                return -EFAULT;
1623
1624        batch = 0;
1625        if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1626                return -EFAULT;
1627
1628        if (batch >= htab->n_buckets)
1629                return -ENOENT;
1630
1631        key_size = htab->map.key_size;
1632        roundup_key_size = round_up(htab->map.key_size, 8);
1633        value_size = htab->map.value_size;
1634        size = round_up(value_size, 8);
1635        if (is_percpu)
1636                value_size = size * num_possible_cpus();
1637        total = 0;
1638        /* While experimenting with hash tables with sizes ranging from 10 to
1639         * 1000, it was observed that a bucket can have up to 5 entries.
1640         */
1641        bucket_size = 5;
1642
1643alloc:
1644        /* We cannot do copy_from_user() or copy_to_user() while holding
1645         * the RCU read lock. Allocate enough space here.
1646         */
1647        keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
1648        values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1649        if (!keys || !values) {
1650                ret = -ENOMEM;
1651                goto after_loop;
1652        }
1653
1654again:
1655        bpf_disable_instrumentation();
1656        rcu_read_lock();
1657again_nocopy:
1658        dst_key = keys;
1659        dst_val = values;
1660        b = &htab->buckets[batch];
1661        head = &b->head;
1662        /* do not grab the lock unless we need it (bucket_cnt > 0). */
1663        if (locked) {
1664                ret = htab_lock_bucket(htab, b, batch, &flags);
1665                if (ret)
1666                        goto next_batch;
1667        }
1668
1669        bucket_cnt = 0;
1670        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
1671                bucket_cnt++;
1672
1673        if (bucket_cnt && !locked) {
1674                locked = true;
1675                goto again_nocopy;
1676        }
1677
1678        if (bucket_cnt > (max_count - total)) {
1679                if (total == 0)
1680                        ret = -ENOSPC;
1681                /* Note that since bucket_cnt > 0 here, it is implicit
1682                 * that the lock was grabbed, so release it.
1683                 */
1684                htab_unlock_bucket(htab, b, batch, flags);
1685                rcu_read_unlock();
1686                bpf_enable_instrumentation();
1687                goto after_loop;
1688        }
1689
1690        if (bucket_cnt > bucket_size) {
1691                bucket_size = bucket_cnt;
1692                /* Note that since bucket_cnt > 0 here, it is implicit
1693                 * that the lock was grabbed, so release it.
1694                 */
1695                htab_unlock_bucket(htab, b, batch, flags);
1696                rcu_read_unlock();
1697                bpf_enable_instrumentation();
1698                kvfree(keys);
1699                kvfree(values);
1700                goto alloc;
1701        }
1702
1703        /* The next block is only safe to run if the bucket lock is held */
1704        if (!locked)
1705                goto next_batch;
1706
1707        hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1708                memcpy(dst_key, l->key, key_size);
1709
1710                if (is_percpu) {
1711                        int off = 0, cpu;
1712                        void __percpu *pptr;
1713
1714                        pptr = htab_elem_get_ptr(l, map->key_size);
1715                        for_each_possible_cpu(cpu) {
1716                                bpf_long_memcpy(dst_val + off,
1717                                                per_cpu_ptr(pptr, cpu), size);
1718                                off += size;
1719                        }
1720                } else {
1721                        value = l->key + roundup_key_size;
1722                        if (elem_map_flags & BPF_F_LOCK)
1723                                copy_map_value_locked(map, dst_val, value,
1724                                                      true);
1725                        else
1726                                copy_map_value(map, dst_val, value);
1727                        check_and_init_map_value(map, dst_val);
1728                }
1729                if (do_delete) {
1730                        hlist_nulls_del_rcu(&l->hash_node);
1731
1732                        /* bpf_lru_push_free() will acquire lru_lock, which
1733                         * may cause deadlock. See comments in function
1734                         * prealloc_lru_pop(). Let us do bpf_lru_push_free()
1735                         * after releasing the bucket lock.
1736                         */
1737                        if (is_lru_map) {
1738                                l->batch_flink = node_to_free;
1739                                node_to_free = l;
1740                        } else {
1741                                free_htab_elem(htab, l);
1742                        }
1743                }
1744                dst_key += key_size;
1745                dst_val += value_size;
1746        }
1747
1748        htab_unlock_bucket(htab, b, batch, flags);
1749        locked = false;
1750
1751        while (node_to_free) {
1752                l = node_to_free;
1753                node_to_free = node_to_free->batch_flink;
1754                htab_lru_push_free(htab, l);
1755        }
1756
1757next_batch:
1758        /* If we are not copying data, we can go to the next bucket and
1759         * avoid dropping the RCU read lock.
1760         */
1761        if (!bucket_cnt && (batch + 1 < htab->n_buckets)) {
1762                batch++;
1763                goto again_nocopy;
1764        }
1765
1766        rcu_read_unlock();
1767        bpf_enable_instrumentation();
1768        if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
1769            key_size * bucket_cnt) ||
1770            copy_to_user(uvalues + total * value_size, values,
1771            value_size * bucket_cnt))) {
1772                ret = -EFAULT;
1773                goto after_loop;
1774        }
1775
1776        total += bucket_cnt;
1777        batch++;
1778        if (batch >= htab->n_buckets) {
1779                ret = -ENOENT;
1780                goto after_loop;
1781        }
1782        goto again;
1783
1784after_loop:
1785        if (ret == -EFAULT)
1786                goto out;
1787
1788        /* copy # of entries and next batch */
1789        ubatch = u64_to_user_ptr(attr->batch.out_batch);
1790        if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1791            put_user(total, &uattr->batch.count))
1792                ret = -EFAULT;
1793
1794out:
1795        kvfree(keys);
1796        kvfree(values);
1797        return ret;
1798}
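
/*
 * A hedged userspace sketch of how the batch interface above is usually
 * driven through libbpf's bpf_map_lookup_batch().  The buffer sizes,
 * 'map_fd' and use() are placeholder assumptions, and the key/value
 * types assume a u32 -> u64 map:
 *
 *	__u32 batch, count, i;
 *	void *in = NULL;		// NULL starts from the first bucket
 *	__u32 keys[128];
 *	__u64 vals[128];
 *	int err;
 *
 *	do {
 *		count = 128;
 *		err = bpf_map_lookup_batch(map_fd, in, &batch,
 *					   keys, vals, &count, NULL);
 *		// 'count' now holds the number of copied elements; the
 *		// kernel reports ENOENT once every bucket was visited,
 *		// possibly together with a final partial batch.
 *		for (i = 0; i < count; i++)
 *			use(keys[i], vals[i]);
 *		in = &batch;		// resume from the returned token
 *	} while (!err);
 */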
1799
1800static int
1801htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1802                             union bpf_attr __user *uattr)
1803{
1804        return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1805                                                  false, true);
1806}
1807
1808static int
1809htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1810                                        const union bpf_attr *attr,
1811                                        union bpf_attr __user *uattr)
1812{
1813        return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1814                                                  false, true);
1815}
1816
1817static int
1818htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1819                      union bpf_attr __user *uattr)
1820{
1821        return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1822                                                  false, false);
1823}
1824
1825static int
1826htab_map_lookup_and_delete_batch(struct bpf_map *map,
1827                                 const union bpf_attr *attr,
1828                                 union bpf_attr __user *uattr)
1829{
1830        return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1831                                                  false, false);
1832}
1833
1834static int
1835htab_lru_percpu_map_lookup_batch(struct bpf_map *map,
1836                                 const union bpf_attr *attr,
1837                                 union bpf_attr __user *uattr)
1838{
1839        return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1840                                                  true, true);
1841}
1842
1843static int
1844htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map,
1845                                            const union bpf_attr *attr,
1846                                            union bpf_attr __user *uattr)
1847{
1848        return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1849                                                  true, true);
1850}
1851
1852static int
1853htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
1854                          union bpf_attr __user *uattr)
1855{
1856        return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
1857                                                  true, false);
1858}
1859
1860static int
1861htab_lru_map_lookup_and_delete_batch(struct bpf_map *map,
1862                                     const union bpf_attr *attr,
1863                                     union bpf_attr __user *uattr)
1864{
1865        return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
1866                                                  true, false);
1867}
1868
1869struct bpf_iter_seq_hash_map_info {
1870        struct bpf_map *map;
1871        struct bpf_htab *htab;
1872        void *percpu_value_buf; /* non-NULL means percpu hash */
1873        u32 bucket_id;
1874        u32 skip_elems;
1875};
1876
1877static struct htab_elem *
1878bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
1879                           struct htab_elem *prev_elem)
1880{
1881        const struct bpf_htab *htab = info->htab;
1882        u32 skip_elems = info->skip_elems;
1883        u32 bucket_id = info->bucket_id;
1884        struct hlist_nulls_head *head;
1885        struct hlist_nulls_node *n;
1886        struct htab_elem *elem;
1887        struct bucket *b;
1888        u32 i, count;
1889
1890        if (bucket_id >= htab->n_buckets)
1891                return NULL;
1892
1893        /* try to find next elem in the same bucket */
1894        if (prev_elem) {
1895                /* no update/deletion on this bucket, prev_elem should still be
1896                 * valid and we won't skip elements.
1897                 */
1898                n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node));
1899                elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node);
1900                if (elem)
1901                        return elem;
1902
1903                /* not found, unlock and go to the next bucket */
1904                b = &htab->buckets[bucket_id++];
1905                rcu_read_unlock();
1906                skip_elems = 0;
1907        }
1908
1909        for (i = bucket_id; i < htab->n_buckets; i++) {
1910                b = &htab->buckets[i];
1911                rcu_read_lock();
1912
1913                count = 0;
1914                head = &b->head;
1915                hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
1916                        if (count >= skip_elems) {
1917                                info->bucket_id = i;
1918                                info->skip_elems = count;
1919                                return elem;
1920                        }
1921                        count++;
1922                }
1923
1924                rcu_read_unlock();
1925                skip_elems = 0;
1926        }
1927
1928        info->bucket_id = i;
1929        info->skip_elems = 0;
1930        return NULL;
1931}
1932
1933static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos)
1934{
1935        struct bpf_iter_seq_hash_map_info *info = seq->private;
1936        struct htab_elem *elem;
1937
1938        elem = bpf_hash_map_seq_find_next(info, NULL);
1939        if (!elem)
1940                return NULL;
1941
1942        if (*pos == 0)
1943                ++*pos;
1944        return elem;
1945}
1946
1947static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1948{
1949        struct bpf_iter_seq_hash_map_info *info = seq->private;
1950
1951        ++*pos;
1952        ++info->skip_elems;
1953        return bpf_hash_map_seq_find_next(info, v);
1954}
1955
1956static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
1957{
1958        struct bpf_iter_seq_hash_map_info *info = seq->private;
1959        u32 roundup_key_size, roundup_value_size;
1960        struct bpf_iter__bpf_map_elem ctx = {};
1961        struct bpf_map *map = info->map;
1962        struct bpf_iter_meta meta;
1963        int ret = 0, off = 0, cpu;
1964        struct bpf_prog *prog;
1965        void __percpu *pptr;
1966
1967        meta.seq = seq;
1968        prog = bpf_iter_get_info(&meta, elem == NULL);
1969        if (prog) {
1970                ctx.meta = &meta;
1971                ctx.map = info->map;
1972                if (elem) {
1973                        roundup_key_size = round_up(map->key_size, 8);
1974                        ctx.key = elem->key;
1975                        if (!info->percpu_value_buf) {
1976                                ctx.value = elem->key + roundup_key_size;
1977                        } else {
1978                                roundup_value_size = round_up(map->value_size, 8);
1979                                pptr = htab_elem_get_ptr(elem, map->key_size);
1980                                for_each_possible_cpu(cpu) {
1981                                        bpf_long_memcpy(info->percpu_value_buf + off,
1982                                                        per_cpu_ptr(pptr, cpu),
1983                                                        roundup_value_size);
1984                                        off += roundup_value_size;
1985                                }
1986                                ctx.value = info->percpu_value_buf;
1987                        }
1988                }
1989                ret = bpf_iter_run_prog(prog, &ctx);
1990        }
1991
1992        return ret;
1993}
1994
1995static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
1996{
1997        return __bpf_hash_map_seq_show(seq, v);
1998}
1999
2000static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
2001{
2002        if (!v)
2003                (void)__bpf_hash_map_seq_show(seq, NULL);
2004        else
2005                rcu_read_unlock();
2006}
2007
2008static int bpf_iter_init_hash_map(void *priv_data,
2009                                  struct bpf_iter_aux_info *aux)
2010{
2011        struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2012        struct bpf_map *map = aux->map;
2013        void *value_buf;
2014        u32 buf_size;
2015
2016        if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
2017            map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2018                buf_size = round_up(map->value_size, 8) * num_possible_cpus();
2019                value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2020                if (!value_buf)
2021                        return -ENOMEM;
2022
2023                seq_info->percpu_value_buf = value_buf;
2024        }
2025
2026        seq_info->map = map;
2027        seq_info->htab = container_of(map, struct bpf_htab, map);
2028        return 0;
2029}
2030
2031static void bpf_iter_fini_hash_map(void *priv_data)
2032{
2033        struct bpf_iter_seq_hash_map_info *seq_info = priv_data;
2034
2035        kfree(seq_info->percpu_value_buf);
2036}
2037
2038static const struct seq_operations bpf_hash_map_seq_ops = {
2039        .start  = bpf_hash_map_seq_start,
2040        .next   = bpf_hash_map_seq_next,
2041        .stop   = bpf_hash_map_seq_stop,
2042        .show   = bpf_hash_map_seq_show,
2043};
2044
2045static const struct bpf_iter_seq_info iter_seq_info = {
2046        .seq_ops                = &bpf_hash_map_seq_ops,
2047        .init_seq_private       = bpf_iter_init_hash_map,
2048        .fini_seq_private       = bpf_iter_fini_hash_map,
2049        .seq_priv_size          = sizeof(struct bpf_iter_seq_hash_map_info),
2050};
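
/*
 * The seq_info above is what backs "iter/bpf_map_elem" BPF iterators for
 * hash maps.  A hedged program-side sketch mirroring the ctx fields
 * filled in by __bpf_hash_map_seq_show(); names are illustrative, the
 * key/value types assume a u32 -> u64 map, and both pointers are NULL on
 * the final call made from bpf_hash_map_seq_stop():
 *
 *	SEC("iter/bpf_map_elem")
 *	int dump_elem(struct bpf_iter__bpf_map_elem *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		__u32 *key = ctx->key;
 *		__u64 *val = ctx->value;
 *
 *		if (!key || !val)	// post-iteration call, no element
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "%u: %llu\n", *key, *val);
 *		return 0;
 *	}
 */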
2051
2052static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
2053                                  void *callback_ctx, u64 flags)
2054{
2055        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2056        struct hlist_nulls_head *head;
2057        struct hlist_nulls_node *n;
2058        struct htab_elem *elem;
2059        u32 roundup_key_size;
2060        int i, num_elems = 0;
2061        void __percpu *pptr;
2062        struct bucket *b;
2063        void *key, *val;
2064        bool is_percpu;
2065        u64 ret = 0;
2066
2067        if (flags != 0)
2068                return -EINVAL;
2069
2070        is_percpu = htab_is_percpu(htab);
2071
2072        roundup_key_size = round_up(map->key_size, 8);
2073        /* disable migration so the percpu value prepared here will be the
2074         * same as the one seen by the bpf program via bpf_map_lookup_elem().
2075         */
2076        if (is_percpu)
2077                migrate_disable();
2078        for (i = 0; i < htab->n_buckets; i++) {
2079                b = &htab->buckets[i];
2080                rcu_read_lock();
2081                head = &b->head;
2082                hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
2083                        key = elem->key;
2084                        if (is_percpu) {
2085                                /* current cpu value for percpu map */
2086                                pptr = htab_elem_get_ptr(elem, map->key_size);
2087                                val = this_cpu_ptr(pptr);
2088                        } else {
2089                                val = elem->key + roundup_key_size;
2090                        }
2091                        num_elems++;
2092                        ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
2093                                        (u64)(long)key, (u64)(long)val,
2094                                        (u64)(long)callback_ctx, 0);
2095                        /* return value: 0 - continue, 1 - stop and return */
2096                        if (ret) {
2097                                rcu_read_unlock();
2098                                goto out;
2099                        }
2100                }
2101                rcu_read_unlock();
2102        }
2103out:
2104        if (is_percpu)
2105                migrate_enable();
2106        return num_elems;
2107}
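
/*
 * A hedged BPF-program-side sketch of the callback contract implemented
 * above: bpf_for_each_map_elem() invokes the callback with (map, key,
 * value, callback_ctx), and the callback returns 0 to continue or 1 to
 * stop the walk.  Map and section names are placeholders, assuming u32
 * keys and u64 values:
 *
 *	struct cb_ctx { __u64 sum; };
 *
 *	static long sum_values(struct bpf_map *map, __u32 *key,
 *			       __u64 *val, struct cb_ctx *ctx)
 *	{
 *		ctx->sum += *val;
 *		return 0;	// keep iterating
 *	}
 *
 *	SEC("tc")
 *	int sum_map(struct __sk_buff *skb)
 *	{
 *		struct cb_ctx ctx = { .sum = 0 };
 *
 *		bpf_for_each_map_elem(&my_hash, sum_values, &ctx, 0);
 *		return 0;
 *	}
 */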
2108
2109static int htab_map_btf_id;
2110const struct bpf_map_ops htab_map_ops = {
2111        .map_meta_equal = bpf_map_meta_equal,
2112        .map_alloc_check = htab_map_alloc_check,
2113        .map_alloc = htab_map_alloc,
2114        .map_free = htab_map_free,
2115        .map_get_next_key = htab_map_get_next_key,
2116        .map_release_uref = htab_map_free_timers,
2117        .map_lookup_elem = htab_map_lookup_elem,
2118        .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
2119        .map_update_elem = htab_map_update_elem,
2120        .map_delete_elem = htab_map_delete_elem,
2121        .map_gen_lookup = htab_map_gen_lookup,
2122        .map_seq_show_elem = htab_map_seq_show_elem,
2123        .map_set_for_each_callback_args = map_set_for_each_callback_args,
2124        .map_for_each_callback = bpf_for_each_hash_elem,
2125        BATCH_OPS(htab),
2126        .map_btf_name = "bpf_htab",
2127        .map_btf_id = &htab_map_btf_id,
2128        .iter_seq_info = &iter_seq_info,
2129};
2130
2131static int htab_lru_map_btf_id;
2132const struct bpf_map_ops htab_lru_map_ops = {
2133        .map_meta_equal = bpf_map_meta_equal,
2134        .map_alloc_check = htab_map_alloc_check,
2135        .map_alloc = htab_map_alloc,
2136        .map_free = htab_map_free,
2137        .map_get_next_key = htab_map_get_next_key,
2138        .map_release_uref = htab_map_free_timers,
2139        .map_lookup_elem = htab_lru_map_lookup_elem,
2140        .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
2141        .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
2142        .map_update_elem = htab_lru_map_update_elem,
2143        .map_delete_elem = htab_lru_map_delete_elem,
2144        .map_gen_lookup = htab_lru_map_gen_lookup,
2145        .map_seq_show_elem = htab_map_seq_show_elem,
2146        .map_set_for_each_callback_args = map_set_for_each_callback_args,
2147        .map_for_each_callback = bpf_for_each_hash_elem,
2148        BATCH_OPS(htab_lru),
2149        .map_btf_name = "bpf_htab",
2150        .map_btf_id = &htab_lru_map_btf_id,
2151        .iter_seq_info = &iter_seq_info,
2152};
2153
2154/* Called from eBPF program */
2155static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2156{
2157        struct htab_elem *l = __htab_map_lookup_elem(map, key);
2158
2159        if (l)
2160                return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2161        else
2162                return NULL;
2163}
2164
2165static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
2166{
2167        struct htab_elem *l = __htab_map_lookup_elem(map, key);
2168
2169        if (l) {
2170                bpf_lru_node_set_ref(&l->lru_node);
2171                return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
2172        }
2173
2174        return NULL;
2175}
2176
2177int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
2178{
2179        struct htab_elem *l;
2180        void __percpu *pptr;
2181        int ret = -ENOENT;
2182        int cpu, off = 0;
2183        u32 size;
2184
2185        /* per_cpu areas are zero-filled and bpf programs can only
2186         * access 'value_size' bytes of them, so copying the rounded
2187         * areas will not leak any kernel data.
2188         */
2189        size = round_up(map->value_size, 8);
2190        rcu_read_lock();
2191        l = __htab_map_lookup_elem(map, key);
2192        if (!l)
2193                goto out;
2194        /* We do not mark the LRU map element here, in order not to mess
2195         * up the eviction heuristics when user space does a map walk.
2196         */
2197        pptr = htab_elem_get_ptr(l, map->key_size);
2198        for_each_possible_cpu(cpu) {
2199                bpf_long_memcpy(value + off,
2200                                per_cpu_ptr(pptr, cpu), size);
2201                off += size;
2202        }
2203        ret = 0;
2204out:
2205        rcu_read_unlock();
2206        return ret;
2207}
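
/*
 * From userspace, a lookup on a percpu hash goes through the copy above:
 * the value buffer must hold one 8-byte-aligned slot per possible CPU.
 * A minimal sketch with libbpf, assuming u64 values; 'map_fd' and 'key'
 * are placeholders supplied by the caller:
 *
 *	int cpu, ncpus = libbpf_num_possible_cpus();
 *	__u64 *vals = calloc(ncpus, sizeof(*vals));	// 8-byte slots
 *
 *	if (!bpf_map_lookup_elem(map_fd, &key, vals))
 *		for (cpu = 0; cpu < ncpus; cpu++)
 *			printf("cpu%d: %llu\n", cpu, vals[cpu]);
 *	free(vals);
 */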
2208
2209int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
2210                           u64 map_flags)
2211{
2212        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2213        int ret;
2214
2215        rcu_read_lock();
2216        if (htab_is_lru(htab))
2217                ret = __htab_lru_percpu_map_update_elem(map, key, value,
2218                                                        map_flags, true);
2219        else
2220                ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
2221                                                    true);
2222        rcu_read_unlock();
2223
2224        return ret;
2225}
2226
2227static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
2228                                          struct seq_file *m)
2229{
2230        struct htab_elem *l;
2231        void __percpu *pptr;
2232        int cpu;
2233
2234        rcu_read_lock();
2235
2236        l = __htab_map_lookup_elem(map, key);
2237        if (!l) {
2238                rcu_read_unlock();
2239                return;
2240        }
2241
2242        btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
2243        seq_puts(m, ": {\n");
2244        pptr = htab_elem_get_ptr(l, map->key_size);
2245        for_each_possible_cpu(cpu) {
2246                seq_printf(m, "\tcpu%d: ", cpu);
2247                btf_type_seq_show(map->btf, map->btf_value_type_id,
2248                                  per_cpu_ptr(pptr, cpu), m);
2249                seq_puts(m, "\n");
2250        }
2251        seq_puts(m, "}\n");
2252
2253        rcu_read_unlock();
2254}
2255
2256static int htab_percpu_map_btf_id;
2257const struct bpf_map_ops htab_percpu_map_ops = {
2258        .map_meta_equal = bpf_map_meta_equal,
2259        .map_alloc_check = htab_map_alloc_check,
2260        .map_alloc = htab_map_alloc,
2261        .map_free = htab_map_free,
2262        .map_get_next_key = htab_map_get_next_key,
2263        .map_lookup_elem = htab_percpu_map_lookup_elem,
2264        .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
2265        .map_update_elem = htab_percpu_map_update_elem,
2266        .map_delete_elem = htab_map_delete_elem,
2267        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2268        .map_set_for_each_callback_args = map_set_for_each_callback_args,
2269        .map_for_each_callback = bpf_for_each_hash_elem,
2270        BATCH_OPS(htab_percpu),
2271        .map_btf_name = "bpf_htab",
2272        .map_btf_id = &htab_percpu_map_btf_id,
2273        .iter_seq_info = &iter_seq_info,
2274};
2275
2276static int htab_lru_percpu_map_btf_id;
2277const struct bpf_map_ops htab_lru_percpu_map_ops = {
2278        .map_meta_equal = bpf_map_meta_equal,
2279        .map_alloc_check = htab_map_alloc_check,
2280        .map_alloc = htab_map_alloc,
2281        .map_free = htab_map_free,
2282        .map_get_next_key = htab_map_get_next_key,
2283        .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
2284        .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem,
2285        .map_update_elem = htab_lru_percpu_map_update_elem,
2286        .map_delete_elem = htab_lru_map_delete_elem,
2287        .map_seq_show_elem = htab_percpu_map_seq_show_elem,
2288        .map_set_for_each_callback_args = map_set_for_each_callback_args,
2289        .map_for_each_callback = bpf_for_each_hash_elem,
2290        BATCH_OPS(htab_lru_percpu),
2291        .map_btf_name = "bpf_htab",
2292        .map_btf_id = &htab_lru_percpu_map_btf_id,
2293        .iter_seq_info = &iter_seq_info,
2294};
2295
2296static int fd_htab_map_alloc_check(union bpf_attr *attr)
2297{
2298        if (attr->value_size != sizeof(u32))
2299                return -EINVAL;
2300        return htab_map_alloc_check(attr);
2301}
2302
2303static void fd_htab_map_free(struct bpf_map *map)
2304{
2305        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
2306        struct hlist_nulls_node *n;
2307        struct hlist_nulls_head *head;
2308        struct htab_elem *l;
2309        int i;
2310
2311        for (i = 0; i < htab->n_buckets; i++) {
2312                head = select_bucket(htab, i);
2313
2314                hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
2315                        void *ptr = fd_htab_map_get_ptr(map, l);
2316
2317                        map->ops->map_fd_put_ptr(ptr);
2318                }
2319        }
2320
2321        htab_map_free(map);
2322}
2323
2324/* only called from syscall */
2325int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
2326{
2327        void **ptr;
2328        int ret = 0;
2329
2330        if (!map->ops->map_fd_sys_lookup_elem)
2331                return -ENOTSUPP;
2332
2333        rcu_read_lock();
2334        ptr = htab_map_lookup_elem(map, key);
2335        if (ptr)
2336                *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
2337        else
2338                ret = -ENOENT;
2339        rcu_read_unlock();
2340
2341        return ret;
2342}
2343
2344/* only called from syscall */
2345int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
2346                                void *key, void *value, u64 map_flags)
2347{
2348        void *ptr;
2349        int ret;
2350        u32 ufd = *(u32 *)value;
2351
2352        ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2353        if (IS_ERR(ptr))
2354                return PTR_ERR(ptr);
2355
2356        ret = htab_map_update_elem(map, key, &ptr, map_flags);
2357        if (ret)
2358                map->ops->map_fd_put_ptr(ptr);
2359
2360        return ret;
2361}
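
/*
 * A minimal syscall-side sketch for fd-based hash maps such as
 * hash-of-maps: updates store an inner map *fd*, while lookups return the
 * inner map *id* resolved by map_fd_sys_lookup_elem().  'outer_fd',
 * 'inner_fd' and the u32 key are assumptions of the example:
 *
 *	__u32 key = 0, inner_id;
 *
 *	bpf_map_update_elem(outer_fd, &key, &inner_fd, BPF_ANY);
 *	bpf_map_lookup_elem(outer_fd, &key, &inner_id);
 *	// If an fd is needed again, the id can be converted back with
 *	// bpf_map_get_fd_by_id(inner_id).
 */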
2362
2363static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
2364{
2365        struct bpf_map *map, *inner_map_meta;
2366
2367        inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
2368        if (IS_ERR(inner_map_meta))
2369                return inner_map_meta;
2370
2371        map = htab_map_alloc(attr);
2372        if (IS_ERR(map)) {
2373                bpf_map_meta_free(inner_map_meta);
2374                return map;
2375        }
2376
2377        map->inner_map_meta = inner_map_meta;
2378
2379        return map;
2380}
2381
2382static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
2383{
2384        struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
2385
2386        if (!inner_map)
2387                return NULL;
2388
2389        return READ_ONCE(*inner_map);
2390}
2391
2392static int htab_of_map_gen_lookup(struct bpf_map *map,
2393                                  struct bpf_insn *insn_buf)
2394{
2395        struct bpf_insn *insn = insn_buf;
2396        const int ret = BPF_REG_0;
2397
2398        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
2399                     (void *(*)(struct bpf_map *map, void *key))NULL));
2400        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
2401        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
2402        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
2403                                offsetof(struct htab_elem, key) +
2404                                round_up(map->key_size, 8));
2405        *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
2406
2407        return insn - insn_buf;
2408}
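
/*
 * The instruction sequence emitted above inlines, roughly, the following
 * C for the program-side lookup on a hash-of-maps (a sketch of the
 * generated logic, using hypothetical local names):
 *
 *	l = __htab_map_lookup_elem(map, key);
 *	if (!l)
 *		return NULL;
 *	return *(void **)(l->key + round_up(map->key_size, 8));
 */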
2409
2410static void htab_of_map_free(struct bpf_map *map)
2411{
2412        bpf_map_meta_free(map->inner_map_meta);
2413        fd_htab_map_free(map);
2414}
2415
2416static int htab_of_maps_map_btf_id;
2417const struct bpf_map_ops htab_of_maps_map_ops = {
2418        .map_alloc_check = fd_htab_map_alloc_check,
2419        .map_alloc = htab_of_map_alloc,
2420        .map_free = htab_of_map_free,
2421        .map_get_next_key = htab_map_get_next_key,
2422        .map_lookup_elem = htab_of_map_lookup_elem,
2423        .map_delete_elem = htab_map_delete_elem,
2424        .map_fd_get_ptr = bpf_map_fd_get_ptr,
2425        .map_fd_put_ptr = bpf_map_fd_put_ptr,
2426        .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
2427        .map_gen_lookup = htab_of_map_gen_lookup,
2428        .map_check_btf = map_check_no_btf,
2429        .map_btf_name = "bpf_htab",
2430        .map_btf_id = &htab_of_maps_map_btf_id,
2431};
2432