linux/kernel/bpf/hashtab.c
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>

struct bpf_htab {
        struct bpf_map map;
        struct hlist_head *buckets;
        spinlock_t lock;
        u32 count;      /* number of elements in this hashtable */
        u32 n_buckets;  /* number of hash buckets */
        u32 elem_size;  /* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
        struct hlist_node hash_node;
        struct rcu_head rcu;
        u32 hash;
        char key[0] __aligned(8);
};
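
/* Worked example of the element layout (illustrative, not part of the
 * original source): for a map created with key_size = 6 and value_size = 16,
 *
 *      elem_size = sizeof(struct htab_elem)    // hash_node, rcu, hash
 *                + round_up(6, 8)              // key, padded to 8 bytes
 *                + 16                          // value
 *
 * so the value area of an element 'l' starts at l->key + round_up(key_size, 8),
 * which is exactly the pointer htab_map_lookup_elem() returns below.
 */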

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
        struct bpf_htab *htab;
        int err, i;

        htab = kzalloc(sizeof(*htab), GFP_USER);
        if (!htab)
                return ERR_PTR(-ENOMEM);

        /* mandatory map attributes */
        htab->map.key_size = attr->key_size;
        htab->map.value_size = attr->value_size;
        htab->map.max_entries = attr->max_entries;

        /* check sanity of attributes.
         * value_size == 0 may be allowed in the future to use map as a set
         */
        err = -EINVAL;
        if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
            htab->map.value_size == 0)
                goto free_htab;

        /* hash table size must be power of 2 */
        htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

        err = -E2BIG;
        if (htab->map.key_size > MAX_BPF_STACK)
                /* eBPF programs initialize keys on stack, so they cannot be
                 * larger than max stack size
                 */
                goto free_htab;

        err = -ENOMEM;
        /* prevent zero size kmalloc and check for u32 overflow */
        if (htab->n_buckets == 0 ||
            htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
                goto free_htab;

        htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
                                      GFP_USER | __GFP_NOWARN);

        if (!htab->buckets) {
                htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
                if (!htab->buckets)
                        goto free_htab;
        }

        for (i = 0; i < htab->n_buckets; i++)
                INIT_HLIST_HEAD(&htab->buckets[i]);

        spin_lock_init(&htab->lock);
        htab->count = 0;

        htab->elem_size = sizeof(struct htab_elem) +
                          round_up(htab->map.key_size, 8) +
                          htab->map.value_size;
        return &htab->map;

free_htab:
        kfree(htab);
        return ERR_PTR(err);
}
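
/* Minimal userspace sketch (illustrative; the concrete sizes are
 * assumptions) of how the attributes validated above arrive through the
 * bpf(2) syscall:
 *
 *      union bpf_attr attr = {
 *              .map_type    = BPF_MAP_TYPE_HASH,
 *              .key_size    = 4,       // e.g. an IPv4 address
 *              .value_size  = 8,       // e.g. a u64 packet counter
 *              .max_entries = 1000,    // rounded up to n_buckets = 1024
 *      };
 *      int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * On failure map_fd is -1 and errno carries the error chosen above
 * (EINVAL, E2BIG or ENOMEM).
 */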

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
        return jhash(key, key_len, 0);
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
        return &htab->buckets[hash & (htab->n_buckets - 1)];
}
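
/* Since n_buckets is a power of two, the mask in select_bucket() is
 * equivalent to hash % n_buckets but avoids a division.  Illustrative
 * example: with n_buckets = 1024 the mask is 0x3ff, so a jhash() result of
 * 0x9e3779b9 selects bucket 0x1b9 (441).
 */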

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
                                         void *key, u32 key_size)
{
        struct htab_elem *l;

        hlist_for_each_entry_rcu(l, head, hash_node)
                if (l->hash == hash && !memcmp(&l->key, key, key_size))
                        return l;

        return NULL;
}

/* Called from syscall or from eBPF program */
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct htab_elem *l;
        u32 hash, key_size;

        /* Must be called with rcu_read_lock. */
        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        head = select_bucket(htab, hash);

        l = lookup_elem_raw(head, hash, key, key_size);

        if (l)
                return l->key + round_up(map->key_size, 8);

        return NULL;
}
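
/* Hedged sketch of how an eBPF program reaches the lookup path above via
 * the bpf_map_lookup_elem() helper ('my_map' and the u64 value type are
 * assumptions for illustration):
 *
 *      u64 *value = bpf_map_lookup_elem(&my_map, &key);
 *      if (value)
 *              __sync_fetch_and_add(value, 1);
 *
 * The pointer handed to the program is the value area computed above,
 * l->key + round_up(key_size, 8), and is only valid under rcu_read_lock().
 */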

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct htab_elem *l, *next_l;
        u32 hash, key_size;
        int i;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        head = select_bucket(htab, hash);

        /* lookup the key */
        l = lookup_elem_raw(head, hash, key, key_size);

        if (!l) {
                i = 0;
                goto find_first_elem;
        }

        /* key was found, get next key in the same bucket */
        next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
                                  struct htab_elem, hash_node);

        if (next_l) {
                /* if next elem in this hash list is non-NULL, just return it */
                memcpy(next_key, next_l->key, key_size);
                return 0;
        }

        /* no more elements in this hash list, go to the next bucket */
        i = hash & (htab->n_buckets - 1);
        i++;

find_first_elem:
        /* iterate over buckets */
        for (; i < htab->n_buckets; i++) {
                head = select_bucket(htab, i);

                /* pick first element in the bucket */
                next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
                                          struct htab_elem, hash_node);
                if (next_l) {
                        /* if it's not empty, just return it */
                        memcpy(next_key, next_l->key, key_size);
                        return 0;
                }
        }

        /* iterated over all buckets and all elements */
        return -ENOENT;
}
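
/* Illustrative userspace iteration over the whole table built on the
 * function above (bpf_get_next_key() stands for a wrapper around the
 * BPF_MAP_GET_NEXT_KEY syscall command, as in samples/bpf):
 *
 *      __u32 key = 0, next_key;
 *      while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
 *              // ... lookup or delete next_key here ...
 *              key = next_key;
 *      }
 *
 * The starting key need not exist: the !l path above then begins the walk
 * at bucket 0.  The loop ends with -ENOENT once all buckets are exhausted.
 */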

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
                                u64 map_flags)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct htab_elem *l_new, *l_old;
        struct hlist_head *head;
        unsigned long flags;
        u32 key_size;
        int ret;

        if (map_flags > BPF_EXIST)
                /* unknown flags */
                return -EINVAL;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* allocate new element outside of lock */
        l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
        if (!l_new)
                return -ENOMEM;

        key_size = map->key_size;

        memcpy(l_new->key, key, key_size);
        memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

        l_new->hash = htab_map_hash(l_new->key, key_size);

        /* bpf_map_update_elem() can be called in_irq() */
        spin_lock_irqsave(&htab->lock, flags);

        head = select_bucket(htab, l_new->hash);

        l_old = lookup_elem_raw(head, l_new->hash, key, key_size);

        if (!l_old && unlikely(htab->count >= map->max_entries)) {
                /* if elem with this 'key' doesn't exist and we've reached
                 * max_entries limit, fail insertion of new elem
                 */
                ret = -E2BIG;
                goto err;
        }

        if (l_old && map_flags == BPF_NOEXIST) {
                /* elem already exists */
                ret = -EEXIST;
                goto err;
        }

        if (!l_old && map_flags == BPF_EXIST) {
                /* elem doesn't exist, cannot update it */
                ret = -ENOENT;
                goto err;
        }

        /* add new element to the head of the list, so that concurrent
         * search will find it before old elem
         */
        hlist_add_head_rcu(&l_new->hash_node, head);
        if (l_old) {
                hlist_del_rcu(&l_old->hash_node);
                kfree_rcu(l_old, rcu);
        } else {
                htab->count++;
        }
        spin_unlock_irqrestore(&htab->lock, flags);

        return 0;
err:
        spin_unlock_irqrestore(&htab->lock, flags);
        kfree(l_new);
        return ret;
}
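
/* Summary of the map_flags cases handled above:
 *
 *      BPF_ANY     - create a new element or update an existing one
 *      BPF_NOEXIST - create only; fails with -EEXIST if the key is present
 *      BPF_EXIST   - update only; fails with -ENOENT if the key is absent
 *
 * For example, an update issued with BPF_NOEXIST implements an atomic
 * "insert if absent" under htab->lock.
 */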

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
        struct hlist_head *head;
        struct htab_elem *l;
        unsigned long flags;
        u32 hash, key_size;
        int ret = -ENOENT;

        WARN_ON_ONCE(!rcu_read_lock_held());

        key_size = map->key_size;

        hash = htab_map_hash(key, key_size);

        spin_lock_irqsave(&htab->lock, flags);

        head = select_bucket(htab, hash);

        l = lookup_elem_raw(head, hash, key, key_size);

        if (l) {
                hlist_del_rcu(&l->hash_node);
                htab->count--;
                kfree_rcu(l, rcu);
                ret = 0;
        }

        spin_unlock_irqrestore(&htab->lock, flags);
        return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
        int i;

        for (i = 0; i < htab->n_buckets; i++) {
                struct hlist_head *head = select_bucket(htab, i);
                struct hlist_node *n;
                struct htab_elem *l;

                hlist_for_each_entry_safe(l, n, head, hash_node) {
                        hlist_del_rcu(&l->hash_node);
                        htab->count--;
                        kfree(l);
                }
        }
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
        struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

        /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (there can be more than one that used this map)
         * were disconnected from events. Wait for outstanding critical
         * sections in these programs to complete
         */
        synchronize_rcu();

        /* some of the kfree_rcu() callbacks for elements of this map may not
         * have executed yet. That's ok. Proceed to free residual elements
         * and the map itself
         */
        delete_all_elements(htab);
        kvfree(htab->buckets);
        kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
        .map_alloc = htab_map_alloc,
        .map_free = htab_map_free,
        .map_get_next_key = htab_map_get_next_key,
        .map_lookup_elem = htab_map_lookup_elem,
        .map_update_elem = htab_map_update_elem,
        .map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
        .ops = &htab_ops,
        .type = BPF_MAP_TYPE_HASH,
};

static int __init register_htab_map(void)
{
        bpf_register_map_type(&htab_type);
        return 0;
}
late_initcall(register_htab_map);