// SPDX-License-Identifier: GPL-2.0-only
/* net/core/xdp.c
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
#include <linux/bug.h>
#include <net/page_pool.h>

#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
#include <net/xdp_sock_drv.h>
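
/* Lifecycle of xdp_rxq_info::reg_state as implemented below: a zeroed
 * xdp_rxq_info starts as REG_STATE_NEW; xdp_rxq_info_reg() moves it to
 * REG_STATE_REGISTERED and xdp_rxq_info_unreg() to REG_STATE_UNREGISTERED.
 * A driver can mark a queue it will never register as REG_STATE_UNUSED via
 * xdp_rxq_info_unused().
 */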
#define REG_STATE_NEW           0x0
#define REG_STATE_REGISTERED    0x1
#define REG_STATE_UNREGISTERED  0x2
#define REG_STATE_UNUSED        0x3

static DEFINE_IDA(mem_id_pool);
static DEFINE_MUTEX(mem_id_lock);
#define MEM_ID_MAX 0xFFFE
#define MEM_ID_MIN 1
static int mem_id_next = MEM_ID_MIN;

static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;

static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
        const u32 *k = data;
        const u32 key = *k;

        BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id)
                     != sizeof(u32));

        /* Use cyclic increasing ID as direct hash key */
        return key;
}

static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
                          const void *ptr)
{
        const struct xdp_mem_allocator *xa = ptr;
        u32 mem_id = *(u32 *)arg->key;

        return xa->mem.id != mem_id;
}

static const struct rhashtable_params mem_id_rht_params = {
        .nelem_hint = 64,
        .head_offset = offsetof(struct xdp_mem_allocator, node),
        .key_offset  = offsetof(struct xdp_mem_allocator, mem.id),
        .key_len = sizeof_field(struct xdp_mem_allocator, mem.id),
        .max_size = MEM_ID_MAX,
        .min_size = 8,
        .automatic_shrinking = true,
        .hashfn    = xdp_mem_id_hashfn,
        .obj_cmpfn = xdp_mem_id_cmp,
};

static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
{
        struct xdp_mem_allocator *xa;

        xa = container_of(rcu, struct xdp_mem_allocator, rcu);

        /* Allow this ID to be reused */
        ida_simple_remove(&mem_id_pool, xa->mem.id);

        kfree(xa);
}

static void mem_xa_remove(struct xdp_mem_allocator *xa)
{
        trace_mem_disconnect(xa);

        if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
                call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
}

static void mem_allocator_disconnect(void *allocator)
{
        struct xdp_mem_allocator *xa;
        struct rhashtable_iter iter;

        mutex_lock(&mem_id_lock);

        rhashtable_walk_enter(mem_id_ht, &iter);
        do {
                rhashtable_walk_start(&iter);

                while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
                        if (xa->allocator == allocator)
                                mem_xa_remove(xa);
                }

                rhashtable_walk_stop(&iter);

        } while (xa == ERR_PTR(-EAGAIN));
        rhashtable_walk_exit(&iter);

        mutex_unlock(&mem_id_lock);
}

void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
        struct xdp_mem_allocator *xa;
        int type = xdp_rxq->mem.type;
        int id = xdp_rxq->mem.id;

        /* Reset mem info to defaults */
        xdp_rxq->mem.id = 0;
        xdp_rxq->mem.type = 0;

        if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
                WARN(1, "Missing register, driver bug");
                return;
        }

        if (id == 0)
                return;

        if (type == MEM_TYPE_PAGE_POOL) {
                rcu_read_lock();
                xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
                page_pool_destroy(xa->page_pool);
                rcu_read_unlock();
        }
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);

void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
        /* Simplify driver cleanup code paths, allow unreg "unused" */
        if (xdp_rxq->reg_state == REG_STATE_UNUSED)
                return;

        WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");

        xdp_rxq_info_unreg_mem_model(xdp_rxq);

        xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
        xdp_rxq->dev = NULL;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);

static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
{
        memset(xdp_rxq, 0, sizeof(*xdp_rxq));
}

/* Returns 0 on success, negative on failure */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                     struct net_device *dev, u32 queue_index, unsigned int napi_id)
{
        if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
                WARN(1, "Driver promised not to register this");
                return -EINVAL;
        }

        if (xdp_rxq->reg_state == REG_STATE_REGISTERED) {
                WARN(1, "Missing unregister, handled but fix driver");
                xdp_rxq_info_unreg(xdp_rxq);
        }

        if (!dev) {
                WARN(1, "Missing net_device from driver");
                return -ENODEV;
        }

        /* State either UNREGISTERED or NEW */
        xdp_rxq_info_init(xdp_rxq);
        xdp_rxq->dev = dev;
        xdp_rxq->queue_index = queue_index;
        xdp_rxq->napi_id = napi_id;

        xdp_rxq->reg_state = REG_STATE_REGISTERED;
        return 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);

void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
        xdp_rxq->reg_state = REG_STATE_UNUSED;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unused);

bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
{
        return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);

static int __mem_id_init_hash_table(void)
{
        struct rhashtable *rht;
        int ret;

        if (unlikely(mem_id_init))
                return 0;

        rht = kzalloc(sizeof(*rht), GFP_KERNEL);
        if (!rht)
                return -ENOMEM;

        ret = rhashtable_init(rht, &mem_id_rht_params);
        if (ret < 0) {
                kfree(rht);
                return ret;
        }
        mem_id_ht = rht;
        smp_mb(); /* mutex lock should provide enough pairing */
        mem_id_init = true;

        return 0;
}

/* Allocate a cyclic ID that maps to allocator pointer.
 * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
 *
 * Caller must lock mem_id_lock.
 */
static int __mem_id_cyclic_get(gfp_t gfp)
{
        int retries = 1;
        int id;

again:
        id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
        if (id < 0) {
                if (id == -ENOSPC) {
                        /* Cyclic allocator, reset next id */
                        if (retries--) {
                                mem_id_next = MEM_ID_MIN;
                                goto again;
                        }
                }
                return id; /* errno */
        }
        mem_id_next = id + 1;

        return id;
}

static bool __is_supported_mem_type(enum xdp_mem_type type)
{
        if (type == MEM_TYPE_PAGE_POOL)
                return is_page_pool_compiled_in();

        if (type >= MEM_TYPE_MAX)
                return false;

        return true;
}

int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
                               enum xdp_mem_type type, void *allocator)
{
        struct xdp_mem_allocator *xdp_alloc;
        gfp_t gfp = GFP_KERNEL;
        int id, errno, ret;
        void *ptr;

        if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
                WARN(1, "Missing register, driver bug");
                return -EFAULT;
        }

        if (!__is_supported_mem_type(type))
                return -EOPNOTSUPP;

        xdp_rxq->mem.type = type;

        if (!allocator) {
                if (type == MEM_TYPE_PAGE_POOL)
                        return -EINVAL; /* Setup time check page_pool req */
                return 0;
        }

        /* Delay init of rhashtable to save memory if feature isn't used */
        if (!mem_id_init) {
                mutex_lock(&mem_id_lock);
                ret = __mem_id_init_hash_table();
                mutex_unlock(&mem_id_lock);
                if (ret < 0) {
                        WARN_ON(1);
                        return ret;
                }
        }

        xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
        if (!xdp_alloc)
                return -ENOMEM;

        mutex_lock(&mem_id_lock);
        id = __mem_id_cyclic_get(gfp);
        if (id < 0) {
                errno = id;
                goto err;
        }
        xdp_rxq->mem.id = id;
        xdp_alloc->mem  = xdp_rxq->mem;
        xdp_alloc->allocator = allocator;

        /* Insert allocator into ID lookup table */
        ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
        if (IS_ERR(ptr)) {
                ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
                xdp_rxq->mem.id = 0;
                errno = PTR_ERR(ptr);
                goto err;
        }

        if (type == MEM_TYPE_PAGE_POOL)
                page_pool_use_xdp_mem(allocator, mem_allocator_disconnect);

        mutex_unlock(&mem_id_lock);

        trace_mem_connect(xdp_alloc, xdp_rxq);
        return 0;
err:
        mutex_unlock(&mem_id_lock);
        kfree(xdp_alloc);
        return errno;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
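
/* Illustrative driver-side usage of the registration API above (a sketch,
 * not part of the original file; "rxq", "netdev" and "napi_id" are
 * hypothetical driver variables). A driver registers the rxq when the RX
 * ring is set up, attaches a memory model, and unregisters on teardown:
 *
 *    err = xdp_rxq_info_reg(&rxq->xdp_rxq, netdev, rxq->index, napi_id);
 *    if (err)
 *        goto err_free_ring;
 *
 *    err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
 *                                     rxq->page_pool);
 *    if (err) {
 *        xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *        goto err_free_ring;
 *    }
 *
 *    ...
 *    xdp_rxq_info_unreg(&rxq->xdp_rxq);    // on teardown; this also
 *                                          // unregisters the mem model
 */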

/* XDP RX runs under NAPI protection, and in different delivery error
 * scenarios (e.g. queue full), it is possible to return the xdp_frame
 * while still leveraging this protection.  The @napi_direct boolean
 * is used for those call sites, allowing faster recycling of
 * xdp_frames/pages in those cases.
 */
static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
                         struct xdp_buff *xdp)
{
        struct xdp_mem_allocator *xa;
        struct page *page;

        switch (mem->type) {
        case MEM_TYPE_PAGE_POOL:
                rcu_read_lock();
                /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
                xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
                page = virt_to_head_page(data);
                if (napi_direct && xdp_return_frame_no_direct())
                        napi_direct = false;
                page_pool_put_full_page(xa->page_pool, page, napi_direct);
                rcu_read_unlock();
                break;
        case MEM_TYPE_PAGE_SHARED:
                page_frag_free(data);
                break;
        case MEM_TYPE_PAGE_ORDER0:
                page = virt_to_page(data); /* Assumes order-0 page */
                put_page(page);
                break;
        case MEM_TYPE_XSK_BUFF_POOL:
                /* NB! Only valid from an xdp_buff! */
                xsk_buff_free(xdp);
                break;
        default:
                /* Not possible, checked in xdp_rxq_info_reg_mem_model() */
                WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
                break;
        }
}

void xdp_return_frame(struct xdp_frame *xdpf)
{
        __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);

void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
        __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
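
/* Hedged usage note (not in the original file): callers that hand back an
 * xdp_frame while still running under NAPI/softirq protection, e.g. on an
 * XDP_REDIRECT target queue overflow, can use xdp_return_frame_rx_napi()
 * so the page can be recycled directly; from any other context use the
 * plain xdp_return_frame() above.
 */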

/* XDP bulk APIs introduce a defer/flush mechanism to return
 * pages belonging to the same xdp_mem_allocator object
 * (identified via the mem.id field) in bulk to optimize
 * I-cache and D-cache.
 * The bulk queue size is set to 16 to be aligned to how
 * XDP_REDIRECT bulking works. The bulk is flushed when
 * it is full or when mem.id changes.
 * xdp_frame_bulk is usually stored/allocated on the function
 * call-stack to avoid locking penalties.
 */
void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
{
        struct xdp_mem_allocator *xa = bq->xa;

        if (unlikely(!xa || !bq->count))
                return;

        page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
        /* bq->xa is not cleared to save a lookup if mem.id is same in next bulk */
        bq->count = 0;
}
EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);

/* Must be called with rcu_read_lock held */
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
                           struct xdp_frame_bulk *bq)
{
        struct xdp_mem_info *mem = &xdpf->mem;
        struct xdp_mem_allocator *xa;

        if (mem->type != MEM_TYPE_PAGE_POOL) {
                __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
                return;
        }

        xa = bq->xa;
        if (unlikely(!xa)) {
                xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
                bq->count = 0;
                bq->xa = xa;
        }

        if (bq->count == XDP_BULK_QUEUE_SIZE)
                xdp_flush_frame_bulk(bq);

        if (unlikely(mem->id != xa->mem.id)) {
                xdp_flush_frame_bulk(bq);
                bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
        }

        bq->q[bq->count++] = xdpf->data;
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
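
/* A minimal sketch of how a caller might use the bulk API above
 * (illustrative only; "frames" and "n" are hypothetical). The
 * xdp_frame_bulk lives on the stack and the loop runs under
 * rcu_read_lock(), as xdp_return_frame_bulk() requires:
 *
 *    struct xdp_frame_bulk bq;
 *    int i;
 *
 *    xdp_frame_bulk_init(&bq);
 *
 *    rcu_read_lock();
 *    for (i = 0; i < n; i++)
 *        xdp_return_frame_bulk(frames[i], &bq);
 *    xdp_flush_frame_bulk(&bq);
 *    rcu_read_unlock();
 */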

void xdp_return_buff(struct xdp_buff *xdp)
{
        __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}

/* Only called for MEM_TYPE_PAGE_POOL, see xdp.h */
void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
{
        struct xdp_mem_allocator *xa;
        struct page *page;

        rcu_read_lock();
        xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
        page = virt_to_head_page(data);
        if (xa)
                page_pool_release_page(xa->page_pool, page);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(__xdp_release_frame);

void xdp_attachment_setup(struct xdp_attachment_info *info,
                          struct netdev_bpf *bpf)
{
        if (info->prog)
                bpf_prog_put(info->prog);
        info->prog = bpf->prog;
        info->flags = bpf->flags;
}
EXPORT_SYMBOL_GPL(xdp_attachment_setup);

struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
{
        unsigned int metasize, totsize;
        void *addr, *data_to_copy;
        struct xdp_frame *xdpf;
        struct page *page;

        /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
        metasize = xdp_data_meta_unsupported(xdp) ? 0 :
                   xdp->data - xdp->data_meta;
        totsize = xdp->data_end - xdp->data + metasize;

        if (sizeof(*xdpf) + totsize > PAGE_SIZE)
                return NULL;

        page = dev_alloc_page();
        if (!page)
                return NULL;

        addr = page_to_virt(page);
        xdpf = addr;
        memset(xdpf, 0, sizeof(*xdpf));

        addr += sizeof(*xdpf);
        data_to_copy = metasize ? xdp->data_meta : xdp->data;
        memcpy(addr, data_to_copy, totsize);

        xdpf->data = addr + metasize;
        xdpf->len = totsize - metasize;
        xdpf->headroom = 0;
        xdpf->metasize = metasize;
        xdpf->frame_sz = PAGE_SIZE;
        xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;

        xsk_buff_free(xdp);
        return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);

/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
void xdp_warn(const char *msg, const char *func, const int line)
{
        WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
}
EXPORT_SYMBOL_GPL(xdp_warn);

int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
{
        n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp,
                                      n_skb, skbs);
        if (unlikely(!n_skb))
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);

struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                           struct sk_buff *skb,
                                           struct net_device *dev)
{
        unsigned int headroom, frame_size;
        void *hard_start;

        /* Part of headroom was reserved to xdpf */
        headroom = sizeof(*xdpf) + xdpf->headroom;

        /* Memory size backing xdp_frame data already has reserved
         * room for build_skb to place skb_shared_info in tailroom.
         */
        frame_size = xdpf->frame_sz;

        hard_start = xdpf->data - headroom;
        skb = build_skb_around(skb, hard_start, frame_size);
        if (unlikely(!skb))
                return NULL;

        skb_reserve(skb, headroom);
        __skb_put(skb, xdpf->len);
        if (xdpf->metasize)
                skb_metadata_set(skb, xdpf->metasize);

        /* Essential SKB info: protocol and skb->dev */
        skb->protocol = eth_type_trans(skb, dev);

        /* Optional SKB info, currently missing:
         * - HW checksum info           (skb->ip_summed)
         * - HW RX hash                 (skb_set_hash)
         * - RX ring dev queue index    (skb_record_rx_queue)
         */

        /* Until page_pool gets an SKB return path, release DMA here */
        xdp_release_frame(xdpf);

        /* Allow SKB to reuse area used by xdp_frame */
        xdp_scrub_frame(xdpf);

        return skb;
}
EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);

struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                         struct net_device *dev)
{
        struct sk_buff *skb;

        skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
        if (unlikely(!skb))
                return NULL;

        memset(skb, 0, offsetof(struct sk_buff, tail));

        return __xdp_build_skb_from_frame(xdpf, skb, dev);
}
EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
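
/* Illustrative sketch of converting a received xdp_frame into an sk_buff
 * and passing it to the stack (hypothetical caller, not part of this file):
 *
 *    struct sk_buff *skb;
 *
 *    skb = xdp_build_skb_from_frame(xdpf, dev);
 *    if (!skb) {
 *        xdp_return_frame(xdpf);
 *        return;
 *    }
 *    netif_receive_skb(skb);
 *
 * On success the memory backing the xdp_frame is reused as skb data and the
 * frame is scrubbed, so it must not be returned again; on failure the caller
 * still owns the frame.
 */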

struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
{
        unsigned int headroom, totalsize;
        struct xdp_frame *nxdpf;
        struct page *page;
        void *addr;

        headroom = xdpf->headroom + sizeof(*xdpf);
        totalsize = headroom + xdpf->len;

        if (unlikely(totalsize > PAGE_SIZE))
                return NULL;
        page = dev_alloc_page();
        if (!page)
                return NULL;
        addr = page_to_virt(page);

        memcpy(addr, xdpf, totalsize);

        nxdpf = addr;
        nxdpf->data = addr + headroom;
        nxdpf->frame_sz = PAGE_SIZE;
        nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
        nxdpf->mem.id = 0;

        return nxdpf;
}