linux/kernel/bpf/hashtab.c
<<
>>
Prefs
   1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
   2 * Copyright (c) 2016 Facebook
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of version 2 of the GNU General Public
   6 * License as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful, but
   9 * WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11 * General Public License for more details.
  12 */
  13#include <linux/bpf.h>
  14#include <linux/jhash.h>
  15#include <linux/filter.h>
  16#include <linux/rculist_nulls.h>
  17#include <linux/random.h>
  18#include "percpu_freelist.h"
  19#include "bpf_lru_list.h"
  20#include "map_in_map.h"
  21
  22#define HTAB_CREATE_FLAG_MASK                                           \
  23        (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
  24         BPF_F_RDONLY | BPF_F_WRONLY)
  25
  26struct bucket {
  27        struct hlist_nulls_head head;
  28        raw_spinlock_t lock;
  29};
  30
  31struct bpf_htab {
  32        struct bpf_map map;
  33        struct bucket *buckets;
  34        void *elems;
  35        union {
  36                struct pcpu_freelist freelist;
  37                struct bpf_lru lru;
  38        };
  39        struct htab_elem *__percpu *extra_elems;
  40        atomic_t count; /* number of elements in this hashtable */
  41        u32 n_buckets;  /* number of hash buckets */
  42        u32 elem_size;  /* size of each element in bytes */
  43        u32 hashrnd;
  44};
  45
  46/* each htab element is struct htab_elem + key + value */
  47struct htab_elem {
  48        union {
  49                struct hlist_nulls_node hash_node;
  50                struct {
  51                        void *padding;
  52                        union {
  53                                struct bpf_htab *htab;
  54                                struct pcpu_freelist_node fnode;
  55                        };
  56                };
  57        };
  58        union {
  59                struct rcu_head rcu;
  60                struct bpf_lru_node lru_node;
  61        };
  62        u32 hash;
  63        char key[0] __aligned(8);
  64};
  65
  66static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
  67
  68static bool htab_is_lru(const struct bpf_htab *htab)
  69{
  70        return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
  71                htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
  72}
  73
  74static bool htab_is_percpu(const struct bpf_htab *htab)
  75{
  76        return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
  77                htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
  78}
  79
  80static bool htab_is_prealloc(const struct bpf_htab *htab)
  81{
  82        return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
  83}
  84
  85static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
  86                                     void __percpu *pptr)
  87{
  88        *(void __percpu **)(l->key + key_size) = pptr;
  89}
  90
  91static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
  92{
  93        return *(void __percpu **)(l->key + key_size);
  94}
  95
  96static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
  97{
  98        return *(void **)(l->key + roundup(map->key_size, 8));
  99}
 100
 101static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
 102{
 103        return (struct htab_elem *) (htab->elems + i * htab->elem_size);
 104}
 105
 106static void htab_free_elems(struct bpf_htab *htab)
 107{
 108        int i;
 109
 110        if (!htab_is_percpu(htab))
 111                goto free_elems;
 112
 113        for (i = 0; i < htab->map.max_entries; i++) {
 114                void __percpu *pptr;
 115
 116                pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
 117                                         htab->map.key_size);
 118                free_percpu(pptr);
 119                cond_resched();
 120        }
 121free_elems:
 122        bpf_map_area_free(htab->elems);
 123}
 124
 125static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 126                                          u32 hash)
 127{
 128        struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
 129        struct htab_elem *l;
 130
 131        if (node) {
 132                l = container_of(node, struct htab_elem, lru_node);
 133                memcpy(l->key, key, htab->map.key_size);
 134                return l;
 135        }
 136
 137        return NULL;
 138}
 139
 140static int prealloc_init(struct bpf_htab *htab)
 141{
 142        u32 num_entries = htab->map.max_entries;
 143        int err = -ENOMEM, i;
 144
 145        if (!htab_is_percpu(htab) && !htab_is_lru(htab))
 146                num_entries += num_possible_cpus();
 147
 148        htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
 149                                         htab->map.numa_node);
 150        if (!htab->elems)
 151                return -ENOMEM;
 152
 153        if (!htab_is_percpu(htab))
 154                goto skip_percpu_elems;
 155
 156        for (i = 0; i < num_entries; i++) {
 157                u32 size = round_up(htab->map.value_size, 8);
 158                void __percpu *pptr;
 159
 160                pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
 161                if (!pptr)
 162                        goto free_elems;
 163                htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
 164                                  pptr);
 165                cond_resched();
 166        }
 167
 168skip_percpu_elems:
 169        if (htab_is_lru(htab))
 170                err = bpf_lru_init(&htab->lru,
 171                                   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
 172                                   offsetof(struct htab_elem, hash) -
 173                                   offsetof(struct htab_elem, lru_node),
 174                                   htab_lru_map_delete_node,
 175                                   htab);
 176        else
 177                err = pcpu_freelist_init(&htab->freelist);
 178
 179        if (err)
 180                goto free_elems;
 181
 182        if (htab_is_lru(htab))
 183                bpf_lru_populate(&htab->lru, htab->elems,
 184                                 offsetof(struct htab_elem, lru_node),
 185                                 htab->elem_size, num_entries);
 186        else
 187                pcpu_freelist_populate(&htab->freelist,
 188                                       htab->elems + offsetof(struct htab_elem, fnode),
 189                                       htab->elem_size, num_entries);
 190
 191        return 0;
 192
 193free_elems:
 194        htab_free_elems(htab);
 195        return err;
 196}
 197
 198static void prealloc_destroy(struct bpf_htab *htab)
 199{
 200        htab_free_elems(htab);
 201
 202        if (htab_is_lru(htab))
 203                bpf_lru_destroy(&htab->lru);
 204        else
 205                pcpu_freelist_destroy(&htab->freelist);
 206}
 207
 208static int alloc_extra_elems(struct bpf_htab *htab)
 209{
 210        struct htab_elem *__percpu *pptr, *l_new;
 211        struct pcpu_freelist_node *l;
 212        int cpu;
 213
 214        pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
 215                                  GFP_USER | __GFP_NOWARN);
 216        if (!pptr)
 217                return -ENOMEM;
 218
 219        for_each_possible_cpu(cpu) {
 220                l = pcpu_freelist_pop(&htab->freelist);
 221                /* pop will succeed, since prealloc_init()
 222                 * preallocated extra num_possible_cpus elements
 223                 */
 224                l_new = container_of(l, struct htab_elem, fnode);
 225                *per_cpu_ptr(pptr, cpu) = l_new;
 226        }
 227        htab->extra_elems = pptr;
 228        return 0;
 229}
 230
 231/* Called from syscall */
 232static int htab_map_alloc_check(union bpf_attr *attr)
 233{
 234        bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 235                       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 236        bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 237                    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 238        /* percpu_lru means each cpu has its own LRU list.
 239         * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 240         * the map's value itself is percpu.  percpu_lru has
 241         * nothing to do with the map's value.
 242         */
 243        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 244        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 245        int numa_node = bpf_map_attr_numa_node(attr);
 246
 247        BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
 248                     offsetof(struct htab_elem, hash_node.pprev));
 249        BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 250                     offsetof(struct htab_elem, hash_node.pprev));
 251
 252        if (lru && !capable(CAP_SYS_ADMIN))
 253                /* LRU implementation is much complicated than other
 254                 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
 255                 */
 256                return -EPERM;
 257
 258        if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
 259                /* reserved bits should not be used */
 260                return -EINVAL;
 261
 262        if (!lru && percpu_lru)
 263                return -EINVAL;
 264
 265        if (lru && !prealloc)
 266                return -ENOTSUPP;
 267
 268        if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
 269                return -EINVAL;
 270
 271        /* check sanity of attributes.
 272         * value_size == 0 may be allowed in the future to use map as a set
 273         */
 274        if (attr->max_entries == 0 || attr->key_size == 0 ||
 275            attr->value_size == 0)
 276                return -EINVAL;
 277
 278        if (attr->key_size > MAX_BPF_STACK)
 279                /* eBPF programs initialize keys on stack, so they cannot be
 280                 * larger than max stack size
 281                 */
 282                return -E2BIG;
 283
 284        if (attr->value_size >= KMALLOC_MAX_SIZE -
 285            MAX_BPF_STACK - sizeof(struct htab_elem))
 286                /* if value_size is bigger, the user space won't be able to
 287                 * access the elements via bpf syscall. This check also makes
 288                 * sure that the elem_size doesn't overflow and it's
 289                 * kmalloc-able later in htab_map_update_elem()
 290                 */
 291                return -E2BIG;
 292
 293        return 0;
 294}
 295
 296static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 297{
 298        bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 299                       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 300        bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
 301                    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
 302        /* percpu_lru means each cpu has its own LRU list.
 303         * it is different from BPF_MAP_TYPE_PERCPU_HASH where
 304         * the map's value itself is percpu.  percpu_lru has
 305         * nothing to do with the map's value.
 306         */
 307        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
 308        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
 309        struct bpf_htab *htab;
 310        int err, i;
 311        u64 cost;
 312
 313        htab = kzalloc(sizeof(*htab), GFP_USER);
 314        if (!htab)
 315                return ERR_PTR(-ENOMEM);
 316
 317        bpf_map_init_from_attr(&htab->map, attr);
 318
 319        if (percpu_lru) {
 320                /* ensure each CPU's lru list has >=1 elements.
 321                 * since we are at it, make each lru list has the same
 322                 * number of elements.
 323                 */
 324                htab->map.max_entries = roundup(attr->max_entries,
 325                                                num_possible_cpus());
 326                if (htab->map.max_entries < attr->max_entries)
 327                        htab->map.max_entries = rounddown(attr->max_entries,
 328                                                          num_possible_cpus());
 329        }
 330
 331        /* hash table size must be power of 2 */
 332        htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
 333
 334        htab->elem_size = sizeof(struct htab_elem) +
 335                          round_up(htab->map.key_size, 8);
 336        if (percpu)
 337                htab->elem_size += sizeof(void *);
 338        else
 339                htab->elem_size += round_up(htab->map.value_size, 8);
 340
 341        err = -E2BIG;
 342        /* prevent zero size kmalloc and check for u32 overflow */
 343        if (htab->n_buckets == 0 ||
 344            htab->n_buckets > U32_MAX / sizeof(struct bucket))
 345                goto free_htab;
 346
 347        cost = (u64) htab->n_buckets * sizeof(struct bucket) +
 348               (u64) htab->elem_size * htab->map.max_entries;
 349
 350        if (percpu)
 351                cost += (u64) round_up(htab->map.value_size, 8) *
 352                        num_possible_cpus() * htab->map.max_entries;
 353        else
 354               cost += (u64) htab->elem_size * num_possible_cpus();
 355
 356        if (cost >= U32_MAX - PAGE_SIZE)
 357                /* make sure page count doesn't overflow */
 358                goto free_htab;
 359
 360        htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 361
 362        /* if map size is larger than memlock limit, reject it early */
 363        err = bpf_map_precharge_memlock(htab->map.pages);
 364        if (err)
 365                goto free_htab;
 366
 367        err = -ENOMEM;
 368        htab->buckets = bpf_map_area_alloc(htab->n_buckets *
 369                                           sizeof(struct bucket),
 370                                           htab->map.numa_node);
 371        if (!htab->buckets)
 372                goto free_htab;
 373
 374        htab->hashrnd = get_random_int();
 375        for (i = 0; i < htab->n_buckets; i++) {
 376                INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
 377                raw_spin_lock_init(&htab->buckets[i].lock);
 378        }
 379
 380        if (prealloc) {
 381                err = prealloc_init(htab);
 382                if (err)
 383                        goto free_buckets;
 384
 385                if (!percpu && !lru) {
 386                        /* lru itself can remove the least used element, so
 387                         * there is no need for an extra elem during map_update.
 388                         */
 389                        err = alloc_extra_elems(htab);
 390                        if (err)
 391                                goto free_prealloc;
 392                }
 393        }
 394
 395        return &htab->map;
 396
 397free_prealloc:
 398        prealloc_destroy(htab);
 399free_buckets:
 400        bpf_map_area_free(htab->buckets);
 401free_htab:
 402        kfree(htab);
 403        return ERR_PTR(err);
 404}
 405
 406static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
 407{
 408        return jhash(key, key_len, hashrnd);
 409}
 410
 411static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 412{
 413        return &htab->buckets[hash & (htab->n_buckets - 1)];
 414}
 415
 416static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
 417{
 418        return &__select_bucket(htab, hash)->head;
 419}
 420
 421/* this lookup function can only be called with bucket lock taken */
 422static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
 423                                         void *key, u32 key_size)
 424{
 425        struct hlist_nulls_node *n;
 426        struct htab_elem *l;
 427
 428        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 429                if (l->hash == hash && !memcmp(&l->key, key, key_size))
 430                        return l;
 431
 432        return NULL;
 433}
 434
 435/* can be called without bucket lock. it will repeat the loop in
 436 * the unlikely event when elements moved from one bucket into another
 437 * while link list is being walked
 438 */
 439static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
 440                                               u32 hash, void *key,
 441                                               u32 key_size, u32 n_buckets)
 442{
 443        struct hlist_nulls_node *n;
 444        struct htab_elem *l;
 445
 446again:
 447        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 448                if (l->hash == hash && !memcmp(&l->key, key, key_size))
 449                        return l;
 450
 451        if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
 452                goto again;
 453
 454        return NULL;
 455}
 456
 457/* Called from syscall or from eBPF program directly, so
 458 * arguments have to match bpf_map_lookup_elem() exactly.
 459 * The return value is adjusted by BPF instructions
 460 * in htab_map_gen_lookup().
 461 */
 462static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 463{
 464        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 465        struct hlist_nulls_head *head;
 466        struct htab_elem *l;
 467        u32 hash, key_size;
 468
 469        /* Must be called with rcu_read_lock. */
 470        WARN_ON_ONCE(!rcu_read_lock_held());
 471
 472        key_size = map->key_size;
 473
 474        hash = htab_map_hash(key, key_size, htab->hashrnd);
 475
 476        head = select_bucket(htab, hash);
 477
 478        l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 479
 480        return l;
 481}
 482
 483static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 484{
 485        struct htab_elem *l = __htab_map_lookup_elem(map, key);
 486
 487        if (l)
 488                return l->key + round_up(map->key_size, 8);
 489
 490        return NULL;
 491}
 492
 493/* inline bpf_map_lookup_elem() call.
 494 * Instead of:
 495 * bpf_prog
 496 *   bpf_map_lookup_elem
 497 *     map->ops->map_lookup_elem
 498 *       htab_map_lookup_elem
 499 *         __htab_map_lookup_elem
 500 * do:
 501 * bpf_prog
 502 *   __htab_map_lookup_elem
 503 */
 504static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
 505{
 506        struct bpf_insn *insn = insn_buf;
 507        const int ret = BPF_REG_0;
 508
 509        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 510                     (void *(*)(struct bpf_map *map, void *key))NULL));
 511        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 512        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
 513        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 514                                offsetof(struct htab_elem, key) +
 515                                round_up(map->key_size, 8));
 516        return insn - insn_buf;
 517}
 518
 519static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
 520{
 521        struct htab_elem *l = __htab_map_lookup_elem(map, key);
 522
 523        if (l) {
 524                bpf_lru_node_set_ref(&l->lru_node);
 525                return l->key + round_up(map->key_size, 8);
 526        }
 527
 528        return NULL;
 529}
 530
 531static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 532                                   struct bpf_insn *insn_buf)
 533{
 534        struct bpf_insn *insn = insn_buf;
 535        const int ret = BPF_REG_0;
 536        const int ref_reg = BPF_REG_1;
 537
 538        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
 539                     (void *(*)(struct bpf_map *map, void *key))NULL));
 540        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
 541        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
 542        *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
 543                              offsetof(struct htab_elem, lru_node) +
 544                              offsetof(struct bpf_lru_node, ref));
 545        *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
 546        *insn++ = BPF_ST_MEM(BPF_B, ret,
 547                             offsetof(struct htab_elem, lru_node) +
 548                             offsetof(struct bpf_lru_node, ref),
 549                             1);
 550        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
 551                                offsetof(struct htab_elem, key) +
 552                                round_up(map->key_size, 8));
 553        return insn - insn_buf;
 554}
 555
 556/* It is called from the bpf_lru_list when the LRU needs to delete
 557 * older elements from the htab.
 558 */
 559static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
 560{
 561        struct bpf_htab *htab = (struct bpf_htab *)arg;
 562        struct htab_elem *l = NULL, *tgt_l;
 563        struct hlist_nulls_head *head;
 564        struct hlist_nulls_node *n;
 565        unsigned long flags;
 566        struct bucket *b;
 567
 568        tgt_l = container_of(node, struct htab_elem, lru_node);
 569        b = __select_bucket(htab, tgt_l->hash);
 570        head = &b->head;
 571
 572        raw_spin_lock_irqsave(&b->lock, flags);
 573
 574        hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
 575                if (l == tgt_l) {
 576                        hlist_nulls_del_rcu(&l->hash_node);
 577                        break;
 578                }
 579
 580        raw_spin_unlock_irqrestore(&b->lock, flags);
 581
 582        return l == tgt_l;
 583}
 584
 585/* Called from syscall */
 586static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 587{
 588        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 589        struct hlist_nulls_head *head;
 590        struct htab_elem *l, *next_l;
 591        u32 hash, key_size;
 592        int i = 0;
 593
 594        WARN_ON_ONCE(!rcu_read_lock_held());
 595
 596        key_size = map->key_size;
 597
 598        if (!key)
 599                goto find_first_elem;
 600
 601        hash = htab_map_hash(key, key_size, htab->hashrnd);
 602
 603        head = select_bucket(htab, hash);
 604
 605        /* lookup the key */
 606        l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
 607
 608        if (!l)
 609                goto find_first_elem;
 610
 611        /* key was found, get next key in the same bucket */
 612        next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
 613                                  struct htab_elem, hash_node);
 614
 615        if (next_l) {
 616                /* if next elem in this hash list is non-zero, just return it */
 617                memcpy(next_key, next_l->key, key_size);
 618                return 0;
 619        }
 620
 621        /* no more elements in this hash list, go to the next bucket */
 622        i = hash & (htab->n_buckets - 1);
 623        i++;
 624
 625find_first_elem:
 626        /* iterate over buckets */
 627        for (; i < htab->n_buckets; i++) {
 628                head = select_bucket(htab, i);
 629
 630                /* pick first element in the bucket */
 631                next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
 632                                          struct htab_elem, hash_node);
 633                if (next_l) {
 634                        /* if it's not empty, just return it */
 635                        memcpy(next_key, next_l->key, key_size);
 636                        return 0;
 637                }
 638        }
 639
 640        /* iterated over all buckets and all elements */
 641        return -ENOENT;
 642}
 643
 644static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 645{
 646        if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 647                free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 648        kfree(l);
 649}
 650
 651static void htab_elem_free_rcu(struct rcu_head *head)
 652{
 653        struct htab_elem *l = container_of(head, struct htab_elem, rcu);
 654        struct bpf_htab *htab = l->htab;
 655
 656        /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
 657         * we're calling kfree, otherwise deadlock is possible if kprobes
 658         * are placed somewhere inside of slub
 659         */
 660        preempt_disable();
 661        __this_cpu_inc(bpf_prog_active);
 662        htab_elem_free(htab, l);
 663        __this_cpu_dec(bpf_prog_active);
 664        preempt_enable();
 665}
 666
 667static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 668{
 669        struct bpf_map *map = &htab->map;
 670
 671        if (map->ops->map_fd_put_ptr) {
 672                void *ptr = fd_htab_map_get_ptr(map, l);
 673
 674                map->ops->map_fd_put_ptr(ptr);
 675        }
 676
 677        if (htab_is_prealloc(htab)) {
 678                pcpu_freelist_push(&htab->freelist, &l->fnode);
 679        } else {
 680                atomic_dec(&htab->count);
 681                l->htab = htab;
 682                call_rcu(&l->rcu, htab_elem_free_rcu);
 683        }
 684}
 685
 686static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 687                            void *value, bool onallcpus)
 688{
 689        if (!onallcpus) {
 690                /* copy true value_size bytes */
 691                memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
 692        } else {
 693                u32 size = round_up(htab->map.value_size, 8);
 694                int off = 0, cpu;
 695
 696                for_each_possible_cpu(cpu) {
 697                        bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
 698                                        value + off, size);
 699                        off += size;
 700                }
 701        }
 702}
 703
 704static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 705{
 706        return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
 707               BITS_PER_LONG == 64;
 708}
 709
 710static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
 711{
 712        u32 size = htab->map.value_size;
 713
 714        if (percpu || fd_htab_map_needs_adjust(htab))
 715                size = round_up(size, 8);
 716        return size;
 717}
 718
 719static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 720                                         void *value, u32 key_size, u32 hash,
 721                                         bool percpu, bool onallcpus,
 722                                         struct htab_elem *old_elem)
 723{
 724        u32 size = htab_size_value(htab, percpu);
 725        bool prealloc = htab_is_prealloc(htab);
 726        struct htab_elem *l_new, **pl_new;
 727        void __percpu *pptr;
 728
 729        if (prealloc) {
 730                if (old_elem) {
 731                        /* if we're updating the existing element,
 732                         * use per-cpu extra elems to avoid freelist_pop/push
 733                         */
 734                        pl_new = this_cpu_ptr(htab->extra_elems);
 735                        l_new = *pl_new;
 736                        *pl_new = old_elem;
 737                } else {
 738                        struct pcpu_freelist_node *l;
 739
 740                        l = pcpu_freelist_pop(&htab->freelist);
 741                        if (!l)
 742                                return ERR_PTR(-E2BIG);
 743                        l_new = container_of(l, struct htab_elem, fnode);
 744                }
 745        } else {
 746                if (atomic_inc_return(&htab->count) > htab->map.max_entries)
 747                        if (!old_elem) {
 748                                /* when map is full and update() is replacing
 749                                 * old element, it's ok to allocate, since
 750                                 * old element will be freed immediately.
 751                                 * Otherwise return an error
 752                                 */
 753                                l_new = ERR_PTR(-E2BIG);
 754                                goto dec_count;
 755                        }
 756                l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
 757                                     htab->map.numa_node);
 758                if (!l_new) {
 759                        l_new = ERR_PTR(-ENOMEM);
 760                        goto dec_count;
 761                }
 762        }
 763
 764        memcpy(l_new->key, key, key_size);
 765        if (percpu) {
 766                if (prealloc) {
 767                        pptr = htab_elem_get_ptr(l_new, key_size);
 768                } else {
 769                        /* alloc_percpu zero-fills */
 770                        pptr = __alloc_percpu_gfp(size, 8,
 771                                                  GFP_ATOMIC | __GFP_NOWARN);
 772                        if (!pptr) {
 773                                kfree(l_new);
 774                                l_new = ERR_PTR(-ENOMEM);
 775                                goto dec_count;
 776                        }
 777                }
 778
 779                pcpu_copy_value(htab, pptr, value, onallcpus);
 780
 781                if (!prealloc)
 782                        htab_elem_set_ptr(l_new, key_size, pptr);
 783        } else {
 784                memcpy(l_new->key + round_up(key_size, 8), value, size);
 785        }
 786
 787        l_new->hash = hash;
 788        return l_new;
 789dec_count:
 790        atomic_dec(&htab->count);
 791        return l_new;
 792}
 793
 794static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 795                       u64 map_flags)
 796{
 797        if (l_old && map_flags == BPF_NOEXIST)
 798                /* elem already exists */
 799                return -EEXIST;
 800
 801        if (!l_old && map_flags == BPF_EXIST)
 802                /* elem doesn't exist, cannot update it */
 803                return -ENOENT;
 804
 805        return 0;
 806}
 807
 808/* Called from syscall or from eBPF program */
 809static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 810                                u64 map_flags)
 811{
 812        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 813        struct htab_elem *l_new = NULL, *l_old;
 814        struct hlist_nulls_head *head;
 815        unsigned long flags;
 816        struct bucket *b;
 817        u32 key_size, hash;
 818        int ret;
 819
 820        if (unlikely(map_flags > BPF_EXIST))
 821                /* unknown flags */
 822                return -EINVAL;
 823
 824        WARN_ON_ONCE(!rcu_read_lock_held());
 825
 826        key_size = map->key_size;
 827
 828        hash = htab_map_hash(key, key_size, htab->hashrnd);
 829
 830        b = __select_bucket(htab, hash);
 831        head = &b->head;
 832
 833        /* bpf_map_update_elem() can be called in_irq() */
 834        raw_spin_lock_irqsave(&b->lock, flags);
 835
 836        l_old = lookup_elem_raw(head, hash, key, key_size);
 837
 838        ret = check_flags(htab, l_old, map_flags);
 839        if (ret)
 840                goto err;
 841
 842        l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
 843                                l_old);
 844        if (IS_ERR(l_new)) {
 845                /* all pre-allocated elements are in use or memory exhausted */
 846                ret = PTR_ERR(l_new);
 847                goto err;
 848        }
 849
 850        /* add new element to the head of the list, so that
 851         * concurrent search will find it before old elem
 852         */
 853        hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 854        if (l_old) {
 855                hlist_nulls_del_rcu(&l_old->hash_node);
 856                if (!htab_is_prealloc(htab))
 857                        free_htab_elem(htab, l_old);
 858        }
 859        ret = 0;
 860err:
 861        raw_spin_unlock_irqrestore(&b->lock, flags);
 862        return ret;
 863}
 864
 865static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
 866                                    u64 map_flags)
 867{
 868        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 869        struct htab_elem *l_new, *l_old = NULL;
 870        struct hlist_nulls_head *head;
 871        unsigned long flags;
 872        struct bucket *b;
 873        u32 key_size, hash;
 874        int ret;
 875
 876        if (unlikely(map_flags > BPF_EXIST))
 877                /* unknown flags */
 878                return -EINVAL;
 879
 880        WARN_ON_ONCE(!rcu_read_lock_held());
 881
 882        key_size = map->key_size;
 883
 884        hash = htab_map_hash(key, key_size, htab->hashrnd);
 885
 886        b = __select_bucket(htab, hash);
 887        head = &b->head;
 888
 889        /* For LRU, we need to alloc before taking bucket's
 890         * spinlock because getting free nodes from LRU may need
 891         * to remove older elements from htab and this removal
 892         * operation will need a bucket lock.
 893         */
 894        l_new = prealloc_lru_pop(htab, key, hash);
 895        if (!l_new)
 896                return -ENOMEM;
 897        memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
 898
 899        /* bpf_map_update_elem() can be called in_irq() */
 900        raw_spin_lock_irqsave(&b->lock, flags);
 901
 902        l_old = lookup_elem_raw(head, hash, key, key_size);
 903
 904        ret = check_flags(htab, l_old, map_flags);
 905        if (ret)
 906                goto err;
 907
 908        /* add new element to the head of the list, so that
 909         * concurrent search will find it before old elem
 910         */
 911        hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 912        if (l_old) {
 913                bpf_lru_node_set_ref(&l_new->lru_node);
 914                hlist_nulls_del_rcu(&l_old->hash_node);
 915        }
 916        ret = 0;
 917
 918err:
 919        raw_spin_unlock_irqrestore(&b->lock, flags);
 920
 921        if (ret)
 922                bpf_lru_push_free(&htab->lru, &l_new->lru_node);
 923        else if (l_old)
 924                bpf_lru_push_free(&htab->lru, &l_old->lru_node);
 925
 926        return ret;
 927}
 928
 929static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 930                                         void *value, u64 map_flags,
 931                                         bool onallcpus)
 932{
 933        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 934        struct htab_elem *l_new = NULL, *l_old;
 935        struct hlist_nulls_head *head;
 936        unsigned long flags;
 937        struct bucket *b;
 938        u32 key_size, hash;
 939        int ret;
 940
 941        if (unlikely(map_flags > BPF_EXIST))
 942                /* unknown flags */
 943                return -EINVAL;
 944
 945        WARN_ON_ONCE(!rcu_read_lock_held());
 946
 947        key_size = map->key_size;
 948
 949        hash = htab_map_hash(key, key_size, htab->hashrnd);
 950
 951        b = __select_bucket(htab, hash);
 952        head = &b->head;
 953
 954        /* bpf_map_update_elem() can be called in_irq() */
 955        raw_spin_lock_irqsave(&b->lock, flags);
 956
 957        l_old = lookup_elem_raw(head, hash, key, key_size);
 958
 959        ret = check_flags(htab, l_old, map_flags);
 960        if (ret)
 961                goto err;
 962
 963        if (l_old) {
 964                /* per-cpu hash map can update value in-place */
 965                pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
 966                                value, onallcpus);
 967        } else {
 968                l_new = alloc_htab_elem(htab, key, value, key_size,
 969                                        hash, true, onallcpus, NULL);
 970                if (IS_ERR(l_new)) {
 971                        ret = PTR_ERR(l_new);
 972                        goto err;
 973                }
 974                hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 975        }
 976        ret = 0;
 977err:
 978        raw_spin_unlock_irqrestore(&b->lock, flags);
 979        return ret;
 980}
 981
 982static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
 983                                             void *value, u64 map_flags,
 984                                             bool onallcpus)
 985{
 986        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 987        struct htab_elem *l_new = NULL, *l_old;
 988        struct hlist_nulls_head *head;
 989        unsigned long flags;
 990        struct bucket *b;
 991        u32 key_size, hash;
 992        int ret;
 993
 994        if (unlikely(map_flags > BPF_EXIST))
 995                /* unknown flags */
 996                return -EINVAL;
 997
 998        WARN_ON_ONCE(!rcu_read_lock_held());
 999
1000        key_size = map->key_size;
1001
1002        hash = htab_map_hash(key, key_size, htab->hashrnd);
1003
1004        b = __select_bucket(htab, hash);
1005        head = &b->head;
1006
1007        /* For LRU, we need to alloc before taking bucket's
1008         * spinlock because LRU's elem alloc may need
1009         * to remove older elem from htab and this removal
1010         * operation will need a bucket lock.
1011         */
1012        if (map_flags != BPF_EXIST) {
1013                l_new = prealloc_lru_pop(htab, key, hash);
1014                if (!l_new)
1015                        return -ENOMEM;
1016        }
1017
1018        /* bpf_map_update_elem() can be called in_irq() */
1019        raw_spin_lock_irqsave(&b->lock, flags);
1020
1021        l_old = lookup_elem_raw(head, hash, key, key_size);
1022
1023        ret = check_flags(htab, l_old, map_flags);
1024        if (ret)
1025                goto err;
1026
1027        if (l_old) {
1028                bpf_lru_node_set_ref(&l_old->lru_node);
1029
1030                /* per-cpu hash map can update value in-place */
1031                pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1032                                value, onallcpus);
1033        } else {
1034                pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
1035                                value, onallcpus);
1036                hlist_nulls_add_head_rcu(&l_new->hash_node, head);
1037                l_new = NULL;
1038        }
1039        ret = 0;
1040err:
1041        raw_spin_unlock_irqrestore(&b->lock, flags);
1042        if (l_new)
1043                bpf_lru_push_free(&htab->lru, &l_new->lru_node);
1044        return ret;
1045}
1046
1047static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
1048                                       void *value, u64 map_flags)
1049{
1050        return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
1051}
1052
1053static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
1054                                           void *value, u64 map_flags)
1055{
1056        return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
1057                                                 false);
1058}
1059
1060/* Called from syscall or from eBPF program */
1061static int htab_map_delete_elem(struct bpf_map *map, void *key)
1062{
1063        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1064        struct hlist_nulls_head *head;
1065        struct bucket *b;
1066        struct htab_elem *l;
1067        unsigned long flags;
1068        u32 hash, key_size;
1069        int ret = -ENOENT;
1070
1071        WARN_ON_ONCE(!rcu_read_lock_held());
1072
1073        key_size = map->key_size;
1074
1075        hash = htab_map_hash(key, key_size, htab->hashrnd);
1076        b = __select_bucket(htab, hash);
1077        head = &b->head;
1078
1079        raw_spin_lock_irqsave(&b->lock, flags);
1080
1081        l = lookup_elem_raw(head, hash, key, key_size);
1082
1083        if (l) {
1084                hlist_nulls_del_rcu(&l->hash_node);
1085                free_htab_elem(htab, l);
1086                ret = 0;
1087        }
1088
1089        raw_spin_unlock_irqrestore(&b->lock, flags);
1090        return ret;
1091}
1092
1093static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
1094{
1095        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1096        struct hlist_nulls_head *head;
1097        struct bucket *b;
1098        struct htab_elem *l;
1099        unsigned long flags;
1100        u32 hash, key_size;
1101        int ret = -ENOENT;
1102
1103        WARN_ON_ONCE(!rcu_read_lock_held());
1104
1105        key_size = map->key_size;
1106
1107        hash = htab_map_hash(key, key_size, htab->hashrnd);
1108        b = __select_bucket(htab, hash);
1109        head = &b->head;
1110
1111        raw_spin_lock_irqsave(&b->lock, flags);
1112
1113        l = lookup_elem_raw(head, hash, key, key_size);
1114
1115        if (l) {
1116                hlist_nulls_del_rcu(&l->hash_node);
1117                ret = 0;
1118        }
1119
1120        raw_spin_unlock_irqrestore(&b->lock, flags);
1121        if (l)
1122                bpf_lru_push_free(&htab->lru, &l->lru_node);
1123        return ret;
1124}
1125
1126static void delete_all_elements(struct bpf_htab *htab)
1127{
1128        int i;
1129
1130        for (i = 0; i < htab->n_buckets; i++) {
1131                struct hlist_nulls_head *head = select_bucket(htab, i);
1132                struct hlist_nulls_node *n;
1133                struct htab_elem *l;
1134
1135                hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1136                        hlist_nulls_del_rcu(&l->hash_node);
1137                        htab_elem_free(htab, l);
1138                }
1139        }
1140}
1141
1142/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
1143static void htab_map_free(struct bpf_map *map)
1144{
1145        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1146
1147        /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
1148         * so the programs (can be more than one that used this map) were
1149         * disconnected from events. Wait for outstanding critical sections in
1150         * these programs to complete
1151         */
1152        synchronize_rcu();
1153
1154        /* some of free_htab_elem() callbacks for elements of this map may
1155         * not have executed. Wait for them.
1156         */
1157        rcu_barrier();
1158        if (!htab_is_prealloc(htab))
1159                delete_all_elements(htab);
1160        else
1161                prealloc_destroy(htab);
1162
1163        free_percpu(htab->extra_elems);
1164        bpf_map_area_free(htab->buckets);
1165        kfree(htab);
1166}
1167
1168const struct bpf_map_ops htab_map_ops = {
1169        .map_alloc_check = htab_map_alloc_check,
1170        .map_alloc = htab_map_alloc,
1171        .map_free = htab_map_free,
1172        .map_get_next_key = htab_map_get_next_key,
1173        .map_lookup_elem = htab_map_lookup_elem,
1174        .map_update_elem = htab_map_update_elem,
1175        .map_delete_elem = htab_map_delete_elem,
1176        .map_gen_lookup = htab_map_gen_lookup,
1177};
1178
1179const struct bpf_map_ops htab_lru_map_ops = {
1180        .map_alloc_check = htab_map_alloc_check,
1181        .map_alloc = htab_map_alloc,
1182        .map_free = htab_map_free,
1183        .map_get_next_key = htab_map_get_next_key,
1184        .map_lookup_elem = htab_lru_map_lookup_elem,
1185        .map_update_elem = htab_lru_map_update_elem,
1186        .map_delete_elem = htab_lru_map_delete_elem,
1187        .map_gen_lookup = htab_lru_map_gen_lookup,
1188};
1189
1190/* Called from eBPF program */
1191static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1192{
1193        struct htab_elem *l = __htab_map_lookup_elem(map, key);
1194
1195        if (l)
1196                return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1197        else
1198                return NULL;
1199}
1200
1201static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
1202{
1203        struct htab_elem *l = __htab_map_lookup_elem(map, key);
1204
1205        if (l) {
1206                bpf_lru_node_set_ref(&l->lru_node);
1207                return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
1208        }
1209
1210        return NULL;
1211}
1212
1213int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
1214{
1215        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1216        struct htab_elem *l;
1217        void __percpu *pptr;
1218        int ret = -ENOENT;
1219        int cpu, off = 0;
1220        u32 size;
1221
1222        /* per_cpu areas are zero-filled and bpf programs can only
1223         * access 'value_size' of them, so copying rounded areas
1224         * will not leak any kernel data
1225         */
1226        size = round_up(map->value_size, 8);
1227        rcu_read_lock();
1228        l = __htab_map_lookup_elem(map, key);
1229        if (!l)
1230                goto out;
1231        if (htab_is_lru(htab))
1232                bpf_lru_node_set_ref(&l->lru_node);
1233        pptr = htab_elem_get_ptr(l, map->key_size);
1234        for_each_possible_cpu(cpu) {
1235                bpf_long_memcpy(value + off,
1236                                per_cpu_ptr(pptr, cpu), size);
1237                off += size;
1238        }
1239        ret = 0;
1240out:
1241        rcu_read_unlock();
1242        return ret;
1243}
1244
1245int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1246                           u64 map_flags)
1247{
1248        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1249        int ret;
1250
1251        rcu_read_lock();
1252        if (htab_is_lru(htab))
1253                ret = __htab_lru_percpu_map_update_elem(map, key, value,
1254                                                        map_flags, true);
1255        else
1256                ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
1257                                                    true);
1258        rcu_read_unlock();
1259
1260        return ret;
1261}
1262
1263const struct bpf_map_ops htab_percpu_map_ops = {
1264        .map_alloc_check = htab_map_alloc_check,
1265        .map_alloc = htab_map_alloc,
1266        .map_free = htab_map_free,
1267        .map_get_next_key = htab_map_get_next_key,
1268        .map_lookup_elem = htab_percpu_map_lookup_elem,
1269        .map_update_elem = htab_percpu_map_update_elem,
1270        .map_delete_elem = htab_map_delete_elem,
1271};
1272
1273const struct bpf_map_ops htab_lru_percpu_map_ops = {
1274        .map_alloc_check = htab_map_alloc_check,
1275        .map_alloc = htab_map_alloc,
1276        .map_free = htab_map_free,
1277        .map_get_next_key = htab_map_get_next_key,
1278        .map_lookup_elem = htab_lru_percpu_map_lookup_elem,
1279        .map_update_elem = htab_lru_percpu_map_update_elem,
1280        .map_delete_elem = htab_lru_map_delete_elem,
1281};
1282
1283static int fd_htab_map_alloc_check(union bpf_attr *attr)
1284{
1285        if (attr->value_size != sizeof(u32))
1286                return -EINVAL;
1287        return htab_map_alloc_check(attr);
1288}
1289
1290static void fd_htab_map_free(struct bpf_map *map)
1291{
1292        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1293        struct hlist_nulls_node *n;
1294        struct hlist_nulls_head *head;
1295        struct htab_elem *l;
1296        int i;
1297
1298        for (i = 0; i < htab->n_buckets; i++) {
1299                head = select_bucket(htab, i);
1300
1301                hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
1302                        void *ptr = fd_htab_map_get_ptr(map, l);
1303
1304                        map->ops->map_fd_put_ptr(ptr);
1305                }
1306        }
1307
1308        htab_map_free(map);
1309}
1310
1311/* only called from syscall */
1312int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
1313{
1314        void **ptr;
1315        int ret = 0;
1316
1317        if (!map->ops->map_fd_sys_lookup_elem)
1318                return -ENOTSUPP;
1319
1320        rcu_read_lock();
1321        ptr = htab_map_lookup_elem(map, key);
1322        if (ptr)
1323                *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
1324        else
1325                ret = -ENOENT;
1326        rcu_read_unlock();
1327
1328        return ret;
1329}
1330
1331/* only called from syscall */
1332int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
1333                                void *key, void *value, u64 map_flags)
1334{
1335        void *ptr;
1336        int ret;
1337        u32 ufd = *(u32 *)value;
1338
1339        ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
1340        if (IS_ERR(ptr))
1341                return PTR_ERR(ptr);
1342
1343        ret = htab_map_update_elem(map, key, &ptr, map_flags);
1344        if (ret)
1345                map->ops->map_fd_put_ptr(ptr);
1346
1347        return ret;
1348}
1349
1350static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1351{
1352        struct bpf_map *map, *inner_map_meta;
1353
1354        inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
1355        if (IS_ERR(inner_map_meta))
1356                return inner_map_meta;
1357
1358        map = htab_map_alloc(attr);
1359        if (IS_ERR(map)) {
1360                bpf_map_meta_free(inner_map_meta);
1361                return map;
1362        }
1363
1364        map->inner_map_meta = inner_map_meta;
1365
1366        return map;
1367}
1368
1369static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
1370{
1371        struct bpf_map **inner_map  = htab_map_lookup_elem(map, key);
1372
1373        if (!inner_map)
1374                return NULL;
1375
1376        return READ_ONCE(*inner_map);
1377}
1378
1379static u32 htab_of_map_gen_lookup(struct bpf_map *map,
1380                                  struct bpf_insn *insn_buf)
1381{
1382        struct bpf_insn *insn = insn_buf;
1383        const int ret = BPF_REG_0;
1384
1385        BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
1386                     (void *(*)(struct bpf_map *map, void *key))NULL));
1387        *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
1388        *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
1389        *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
1390                                offsetof(struct htab_elem, key) +
1391                                round_up(map->key_size, 8));
1392        *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0);
1393
1394        return insn - insn_buf;
1395}
1396
1397static void htab_of_map_free(struct bpf_map *map)
1398{
1399        bpf_map_meta_free(map->inner_map_meta);
1400        fd_htab_map_free(map);
1401}
1402
1403const struct bpf_map_ops htab_of_maps_map_ops = {
1404        .map_alloc_check = fd_htab_map_alloc_check,
1405        .map_alloc = htab_of_map_alloc,
1406        .map_free = htab_of_map_free,
1407        .map_get_next_key = htab_map_get_next_key,
1408        .map_lookup_elem = htab_of_map_lookup_elem,
1409        .map_delete_elem = htab_map_delete_elem,
1410        .map_fd_get_ptr = bpf_map_fd_get_ptr,
1411        .map_fd_put_ptr = bpf_map_fd_put_ptr,
1412        .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
1413        .map_gen_lookup = htab_of_map_gen_lookup,
1414};
1415