linux/kernel/bpf/devmap.c
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* The devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths into the devmap control plane: bpf syscalls,
 * bpf programs, and driver-side xmit/flush operations. A bpf syscall will
 * invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then, because the datapath does a lookup into the
 * netdev_map array (read-only) from an RCU critical section, we use
 * call_rcu() to wait for an rcu grace period before freeing the old data
 * structures. This ensures the datapath always has a valid copy. However,
 * the datapath does a "flush" operation that pushes any pending packets in
 * the driver outside the RCU critical section. Each bpf_dtab_netdev tracks
 * these pending operations using an atomic per-cpu bitmap. The
 * bpf_dtab_netdev object will not be destroyed until all bits are cleared,
 * indicating that all outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that contain
 * a reference to the net device being removed and remove them. This is a
 * two-step process: (a) dereference the bpf_dtab_netdev object in netdev_map
 * and (b) check whether the ifindex is the same as the net_device being
 * removed. When removing the dev, a cmpxchg() is used to ensure the correct
 * dev is removed; in the case of a concurrent update or delete operation it
 * is possible that the initially referenced dev is no longer in the map. As
 * the notifier hook walks the map, we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 */
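
/* For context, a minimal sketch of the BPF program side of a devmap (not
 * part of this file; the map name "tx_devices", the fixed slot and the
 * legacy map-definition style are purely illustrative). The key and value
 * sizes must both be 4 bytes to pass the checks in dev_map_alloc() below,
 * and the datapath entry point is the bpf_redirect_map() helper mentioned
 * above:
 *
 *	struct bpf_map_def SEC("maps") tx_devices = {
 *		.type = BPF_MAP_TYPE_DEVMAP,
 *		.key_size = sizeof(__u32),
 *		.value_size = sizeof(__u32),	// value is an ifindex
 *		.max_entries = 64,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_redirect_slot0(struct xdp_md *ctx)
 *	{
 *		// Returns XDP_REDIRECT; the driver then resolves slot 0 via
 *		// __dev_map_lookup_elem(), enqueues the frame, and flushes
 *		// it from xdp_do_flush_map() at the end of its NAPI poll.
 *		return bpf_redirect_map(&tx_devices, 0, 0);
 *	}
 */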
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>

#define DEV_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

#define DEV_MAP_BULK_SIZE 16
struct xdp_bulk_queue {
        struct xdp_frame *q[DEV_MAP_BULK_SIZE];
        struct net_device *dev_rx;
        unsigned int count;
};

struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
        struct bpf_dtab *dtab;
        unsigned int bit;
        struct xdp_bulk_queue __percpu *bulkq;
        struct rcu_head rcu;
};

struct bpf_dtab {
        struct bpf_map map;
        struct bpf_dtab_netdev **netdev_map;
        unsigned long __percpu *flush_needed;
        struct list_head list;
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
        return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}
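
/* A quick worked example of the sizing above, assuming a 64-bit kernel
 * (64 bits per long): max_entries = 256 gives BITS_TO_LONGS(256) = 4 longs,
 * i.e. 32 bytes of flush bitmap per possible CPU.
 */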

static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
        struct bpf_dtab *dtab;
        int err = -EINVAL;
        u64 cost;

        if (!capable(CAP_NET_ADMIN))
                return ERR_PTR(-EPERM);

        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
            attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);

        dtab = kzalloc(sizeof(*dtab), GFP_USER);
        if (!dtab)
                return ERR_PTR(-ENOMEM);

        bpf_map_init_from_attr(&dtab->map, attr);

        /* make sure page count doesn't overflow */
        cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
        cost += dev_map_bitmap_size(attr) * num_possible_cpus();
        if (cost >= U32_MAX - PAGE_SIZE)
                goto free_dtab;

        dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

        /* if map size is larger than memlock limit, reject it early */
        err = bpf_map_precharge_memlock(dtab->map.pages);
        if (err)
                goto free_dtab;

        err = -ENOMEM;

        /* A per cpu bitfield with a bit per possible net device */
        dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
                                                __alignof__(unsigned long),
                                                GFP_KERNEL | __GFP_NOWARN);
        if (!dtab->flush_needed)
                goto free_dtab;

        dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
                                              sizeof(struct bpf_dtab_netdev *),
                                              dtab->map.numa_node);
        if (!dtab->netdev_map)
                goto free_dtab;

        spin_lock(&dev_map_lock);
        list_add_tail_rcu(&dtab->list, &dev_map_list);
        spin_unlock(&dev_map_lock);

        return &dtab->map;
free_dtab:
        free_percpu(dtab->flush_needed);
        kfree(dtab);
        return ERR_PTR(err);
}
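
/* A worked example of the cost accounting in dev_map_alloc() above, assuming
 * a 64-bit kernel, 4 KiB pages and 4 possible CPUs (all illustrative values):
 * with max_entries = 1024 the pointer array costs 1024 * 8 = 8192 bytes and
 * each per-cpu flush bitmap costs BITS_TO_LONGS(1024) * 8 = 128 bytes, i.e.
 * 512 bytes total, so cost = 8704 and map.pages becomes
 * round_up(8704, 4096) >> PAGE_SHIFT = 3 pages charged against the memlock
 * limit.
 */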

static void dev_map_free(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        int i, cpu;

        /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (can be more than one that used this map) were
         * disconnected from events. Wait for outstanding critical sections in
         * these programs to complete. The rcu critical section only guarantees
         * no further reads against netdev_map. It does __not__ ensure pending
         * flush operations (if any) are complete.
         */

        spin_lock(&dev_map_lock);
        list_del_rcu(&dtab->list);
        spin_unlock(&dev_map_lock);

        bpf_clear_redirect_map(map);
        synchronize_rcu();

        /* To ensure all pending flush operations have completed, wait for the
         * flush bitmap to show all flush_needed bits as zero on _all_ cpus.
         * Because the above synchronize_rcu() ensures the map is disconnected
         * from the program we can assume no new bits will be set.
         */
        for_each_online_cpu(cpu) {
                unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);

                while (!bitmap_empty(bitmap, dtab->map.max_entries))
                        cond_resched();
        }

        for (i = 0; i < dtab->map.max_entries; i++) {
                struct bpf_dtab_netdev *dev;

                dev = dtab->netdev_map[i];
                if (!dev)
                        continue;

                dev_put(dev->dev);
                kfree(dev);
        }

        free_percpu(dtab->flush_needed);
        bpf_map_area_free(dtab->netdev_map);
        kfree(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 index = key ? *(u32 *)key : U32_MAX;
        u32 *next = next_key;

        if (index >= dtab->map.max_entries) {
                *next = 0;
                return 0;
        }

        if (index == dtab->map.max_entries - 1)
                return -ENOENT;
        *next = index + 1;
        return 0;
}
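
/* The get_next_key semantics above mirror other array-like maps: any
 * out-of-range (or NULL) key restarts the walk at index 0 and the final
 * index returns -ENOENT. A minimal user-space walk with the libbpf syscall
 * wrappers might look like the sketch below ("fd" being a devmap file
 * descriptor; illustrative only, error handling trimmed):
 *
 *	__u32 key = (__u32)-1, next_key, ifindex;
 *
 *	while (!bpf_map_get_next_key(fd, &key, &next_key)) {
 *		if (!bpf_map_lookup_elem(fd, &next_key, &ifindex))
 *			printf("slot %u -> ifindex %u\n", next_key, ifindex);
 *		key = next_key;
 *	}
 */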

void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);

        __set_bit(bit, bitmap);
}

static int bq_xmit_all(struct bpf_dtab_netdev *obj,
                       struct xdp_bulk_queue *bq, u32 flags,
                       bool in_napi_ctx)
{
        struct net_device *dev = obj->dev;
        int sent = 0, drops = 0, err = 0;
        int i;

        if (unlikely(!bq->count))
                return 0;

        for (i = 0; i < bq->count; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                prefetch(xdpf);
        }

        sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
        if (sent < 0) {
                err = sent;
                sent = 0;
                goto error;
        }
        drops = bq->count - sent;
out:
        bq->count = 0;

        trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
                              sent, drops, bq->dev_rx, dev, err);
        bq->dev_rx = NULL;
        return 0;
error:
        /* If ndo_xdp_xmit fails with an errno, no frames have been
         * xmit'ed and it's our responsibility to free them all.
         */
        for (i = 0; i < bq->count; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                /* RX path under NAPI protection, can return frames faster */
                if (likely(in_napi_ctx))
                        xdp_return_frame_rx_napi(xdpf);
                else
                        xdp_return_frame(xdpf);
                drops++;
        }
        goto out;
}

/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
 * from the driver before returning from its napi->poll() routine. The poll()
 * routine is called either from busy_poll context or net_rx_action signaled
 * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
 * net device can be torn down. On devmap tear down we ensure the ctx bitmap
 * is zeroed before completing to ensure all flush operations have completed.
 */
void __dev_map_flush(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
        u32 bit;

        for_each_set_bit(bit, bitmap, map->max_entries) {
                struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
                struct xdp_bulk_queue *bq;

                /* This is possible if the dev entry is removed by user space
                 * between xdp redirect and flush op.
                 */
                if (unlikely(!dev))
                        continue;

                __clear_bit(bit, bitmap);

                bq = this_cpu_ptr(dev->bulkq);
                bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
        }
}
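
/* For context, the driver-side pattern referred to above is roughly the
 * following (a hand-waved sketch, not a real driver; everything except
 * bpf_prog_run_xdp(), xdp_do_redirect() and xdp_do_flush_map() is
 * illustrative):
 *
 *	static int some_driver_napi_poll(struct napi_struct *napi, int budget)
 *	{
 *		// ... for each received frame ...
 *		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 *		if (act == XDP_REDIRECT)
 *			xdp_do_redirect(netdev, &xdp, xdp_prog);
 *		// ...
 *		// Kick all pending bulk queues before leaving poll(); for
 *		// devmap targets this ends up in __dev_map_flush() above.
 *		xdp_do_flush_map();
 *		// ...
 *	}
 */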

/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete
 * and/or update happens in parallel here, a dev_put won't happen until after
 * reading the ifindex.
 */
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *obj;

        if (key >= map->max_entries)
                return NULL;

        obj = READ_ONCE(dtab->netdev_map[key]);
        return obj;
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
                      struct net_device *dev_rx)
{
        struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);

        if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
                bq_xmit_all(obj, bq, 0, true);

        /* Ingress dev_rx will be the same for all xdp_frame's in the
         * bulk_queue, because the bq is stored per-CPU and must be flushed
         * at the end of the net_device driver's NAPI func.
         */
        if (!bq->dev_rx)
                bq->dev_rx = dev_rx;

        bq->q[bq->count++] = xdpf;
        return 0;
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
                    struct net_device *dev_rx)
{
        struct net_device *dev = dst->dev;
        struct xdp_frame *xdpf;
        int err;

        if (!dev->netdev_ops->ndo_xdp_xmit)
                return -EOPNOTSUPP;

        err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
        if (unlikely(err))
                return err;

        xdpf = convert_to_xdp_frame(xdp);
        if (unlikely(!xdpf))
                return -EOVERFLOW;

        return bq_enqueue(dst, xdpf, dev_rx);
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             struct bpf_prog *xdp_prog)
{
        int err;

        err = xdp_ok_fwd_dev(dst->dev, skb->len);
        if (unlikely(err))
                return err;
        skb->dev = dst->dev;
        generic_xdp_tx(skb, xdp_prog);

        return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
        struct net_device *dev = obj ? obj->dev : NULL;

        return dev ? &dev->ifindex : NULL;
}

static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
        if (dev->dev->netdev_ops->ndo_xdp_xmit) {
                struct xdp_bulk_queue *bq;
                unsigned long *bitmap;
                int cpu;

                for_each_online_cpu(cpu) {
                        bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
                        __clear_bit(dev->bit, bitmap);

                        bq = per_cpu_ptr(dev->bulkq, cpu);
                        bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
                }
        }
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
        struct bpf_dtab_netdev *dev;

        dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
        dev_map_flush_old(dev);
        free_percpu(dev->bulkq);
        dev_put(dev->dev);
        kfree(dev);
}

static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *old_dev;
        int k = *(u32 *)key;

        if (k >= map->max_entries)
                return -EINVAL;

        /* Use call_rcu() here to ensure any rcu critical sections have
         * completed, but this does not guarantee a flush has happened
         * yet, because the driver-side rcu_read_lock/unlock only protects
         * the running XDP program. However, for pending flush operations
         * the dev and ctx are stored in another per-cpu map. Additionally,
         * the driver tear down ensures all softirqs are complete before
         * the net device is removed once dev_put() drops its refcount to
         * zero.
         */
        old_dev = xchg(&dtab->netdev_map[k], NULL);
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);
        return 0;
}

static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                                u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct net *net = current->nsproxy->net_ns;
        gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
        struct bpf_dtab_netdev *dev, *old_dev;
        u32 i = *(u32 *)key;
        u32 ifindex = *(u32 *)value;

        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
        if (unlikely(i >= dtab->map.max_entries))
                return -E2BIG;
        if (unlikely(map_flags == BPF_NOEXIST))
                return -EEXIST;

        if (!ifindex) {
                dev = NULL;
        } else {
                dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
                if (!dev)
                        return -ENOMEM;

                dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
                                                sizeof(void *), gfp);
                if (!dev->bulkq) {
                        kfree(dev);
                        return -ENOMEM;
                }

                dev->dev = dev_get_by_index(net, ifindex);
                if (!dev->dev) {
                        free_percpu(dev->bulkq);
                        kfree(dev);
                        return -EINVAL;
                }

                dev->bit = i;
                dev->dtab = dtab;
        }

        /* Use call_rcu() here to ensure rcu critical sections have completed,
         * remembering that the driver-side flush operation will happen before
         * the net device is removed.
         */
        old_dev = xchg(&dtab->netdev_map[i], dev);
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        return 0;
}

const struct bpf_map_ops dev_map_ops = {
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_get_next_key,
        .map_lookup_elem = dev_map_lookup_elem,
        .map_update_elem = dev_map_update_elem,
        .map_delete_elem = dev_map_delete_elem,
        .map_check_btf = map_check_no_btf,
};
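
/* A minimal user-space sketch of driving the ops above via the bpf syscall
 * wrappers in libbpf (illustrative only; error handling trimmed). The value
 * written is the target ifindex, and writing ifindex 0 clears the slot, as
 * dev_map_update_elem() above shows:
 *
 *	int fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(__u32),
 *				sizeof(__u32), 64, 0);
 *	__u32 key = 0, ifindex = if_nametoindex("eth1");
 *
 *	bpf_map_update_elem(fd, &key, &ifindex, BPF_ANY);	// bind slot 0
 *	bpf_map_delete_elem(fd, &key);				// drop it again
 */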

static int dev_map_notification(struct notifier_block *notifier,
                                ulong event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct bpf_dtab *dtab;
        int i;

        switch (event) {
        case NETDEV_UNREGISTER:
                /* This rcu_read_lock/unlock pair is needed because
                 * dev_map_list is an RCU list AND to ensure a delete
                 * operation does not free a netdev_map entry while we
                 * are comparing it against the netdev being unregistered.
                 */
                rcu_read_lock();
                list_for_each_entry_rcu(dtab, &dev_map_list, list) {
                        for (i = 0; i < dtab->map.max_entries; i++) {
                                struct bpf_dtab_netdev *dev, *odev;

                                dev = READ_ONCE(dtab->netdev_map[i]);
                                if (!dev || netdev != dev->dev)
                                        continue;
                                odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
                                if (dev == odev)
                                        call_rcu(&dev->rcu,
                                                 __dev_map_entry_free);
                        }
                }
                rcu_read_unlock();
                break;
        default:
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
        .notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                     offsetof(struct _bpf_dtab_netdev, dev));
        register_netdevice_notifier(&dev_map_notifier);
        return 0;
}

subsys_initcall(dev_map_init);