dpdk/drivers/net/af_xdp/rte_eth_af_xdp.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019-2020 Intel Corporation.
 */
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <poll.h>
#include <netinet/in.h>
#include <net/if.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/if_ether.h>
#include <linux/if_xdp.h>
#include <linux/if_link.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include "af_xdp_deps.h"
#include <bpf/xsk.h>

#include <rte_ethdev.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_vdev.h>
#include <rte_kvargs.h>
#include <rte_bus_vdev.h>
#include <rte_string_fns.h>
#include <rte_branch_prediction.h>
#include <rte_common.h>
#include <rte_dev.h>
#include <rte_eal.h>
#include <rte_ether.h>
#include <rte_lcore.h>
#include <rte_log.h>
#include <rte_memory.h>
#include <rte_memzone.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_malloc.h>
#include <rte_ring.h>
#include <rte_spinlock.h>

#include "compat.h"

#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#ifndef AF_XDP
#define AF_XDP 44
#endif

#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

RTE_LOG_REGISTER(af_xdp_logtype, pmd.net.af_xdp, NOTICE);

#define AF_XDP_LOG(level, fmt, args...)			\
	rte_log(RTE_LOG_ ## level, af_xdp_logtype,	\
		"%s(): " fmt, __func__, ##args)

#define ETH_AF_XDP_FRAME_SIZE		2048
#define ETH_AF_XDP_NUM_BUFFERS		4096
#define ETH_AF_XDP_DFLT_NUM_DESCS	XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX	0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT	1

#define ETH_AF_XDP_RX_BATCH_SIZE	32
#define ETH_AF_XDP_TX_BATCH_SIZE	32

struct xsk_umem_info {
	struct xsk_umem *umem;
	struct rte_ring *buf_ring;
	const struct rte_memzone *mz;
	struct rte_mempool *mb_pool;
	void *buffer;
	uint8_t refcnt;
	uint32_t max_xsks;
};

struct rx_stats {
	uint64_t rx_pkts;
	uint64_t rx_bytes;
	uint64_t rx_dropped;
};

struct pkt_rx_queue {
	struct xsk_ring_cons rx;
	struct xsk_umem_info *umem;
	struct xsk_socket *xsk;
	struct rte_mempool *mb_pool;

	struct rx_stats stats;

	struct xsk_ring_prod fq;
	struct xsk_ring_cons cq;

	struct pkt_tx_queue *pair;
	struct pollfd fds[1];
	int xsk_queue_idx;
};

struct tx_stats {
	uint64_t tx_pkts;
	uint64_t tx_bytes;
	uint64_t tx_dropped;
};

struct pkt_tx_queue {
	struct xsk_ring_prod tx;
	struct xsk_umem_info *umem;

	struct tx_stats stats;

	struct pkt_rx_queue *pair;
	int xsk_queue_idx;
};

struct pmd_internals {
	int if_index;
	char if_name[IFNAMSIZ];
	int start_queue_idx;
	int queue_cnt;
	int max_queue_cnt;
	int combined_queue_cnt;
	bool shared_umem;
	char prog_path[PATH_MAX];
	bool custom_prog_configured;

	struct rte_ether_addr eth_addr;

	struct pkt_rx_queue *rx_queues;
	struct pkt_tx_queue *tx_queues;
};

#define ETH_AF_XDP_IFACE_ARG			"iface"
#define ETH_AF_XDP_START_QUEUE_ARG		"start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG		"queue_count"
#define ETH_AF_XDP_SHARED_UMEM_ARG		"shared_umem"
#define ETH_AF_XDP_PROG_ARG			"xdp_prog"

static const char * const valid_arguments[] = {
	ETH_AF_XDP_IFACE_ARG,
	ETH_AF_XDP_START_QUEUE_ARG,
	ETH_AF_XDP_QUEUE_COUNT_ARG,
	ETH_AF_XDP_SHARED_UMEM_ARG,
	ETH_AF_XDP_PROG_ARG,
	NULL
};
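
/*
 * Editor's usage sketch (not part of the driver): the arguments above are
 * supplied as devargs when the vdev is created, e.g. with testpmd:
 *
 *	dpdk-testpmd --vdev=net_af_xdp,iface=ens786f1,queue_count=1 -- -i
 *
 * "iface" names the kernel netdev to attach to (ens786f1 is a placeholder);
 * the remaining keys fall back to the ETH_AF_XDP_DFLT_* values above.
 */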

static const struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	.link_autoneg = ETH_LINK_AUTONEG
};

/* List which tracks PMDs to facilitate sharing UMEMs across them. */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline int
reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
		      struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
	uint32_t idx;
	uint16_t i;

	if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
		for (i = 0; i < reserve_size; i++)
			rte_pktmbuf_free(bufs[i]);
		AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
		return -1;
	}

	for (i = 0; i < reserve_size; i++) {
		__u64 *fq_addr;
		uint64_t addr;

		fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
		addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
				umem->mb_pool->header_size;
		*fq_addr = addr;
	}

	xsk_ring_prod__submit(fq, reserve_size);

	return 0;
}
#else
static inline int
reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
		      struct rte_mbuf **bufs __rte_unused,
		      struct xsk_ring_prod *fq)
{
	void *addrs[reserve_size];
	uint32_t idx;
	uint16_t i;

	if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
		    != reserve_size) {
		AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
		return -1;
	}

	if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
		AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
		rte_ring_enqueue_bulk(umem->buf_ring, addrs,
				reserve_size, NULL);
		return -1;
	}

	for (i = 0; i < reserve_size; i++) {
		__u64 *fq_addr;

		fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
		*fq_addr = (uint64_t)addrs[i];
	}

	xsk_ring_prod__submit(fq, reserve_size);

	return 0;
}
#endif

static inline int
reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
		   struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
#else
	return reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
#endif
}
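
/*
 * Editor's illustration of the zero-copy fill-queue address math used in
 * reserve_fill_queue_zc() above, with made-up numbers. The umem is
 * registered at the page-aligned base of the mempool, so a fill address is
 * just the byte offset of the mbuf from that base, minus the mempool object
 * header that precedes each mbuf:
 *
 *	umem->buffer (aligned base)   = 0x7f0000000000
 *	bufs[i] (struct rte_mbuf *)   = 0x7f0000042080
 *	umem->mb_pool->header_size    = 0x80
 *	fill addr = 0x42080 - 0x80    = 0x42000
 *
 * The kernel hands such offsets back in the rx and completion rings, and
 * the rx path below reverses the math to recover the mbuf pointer.
 */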

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_rx_queue *rxq = queue;
	struct xsk_ring_cons *rx = &rxq->rx;
	struct xsk_ring_prod *fq = &rxq->fq;
	struct xsk_umem_info *umem = rxq->umem;
	uint32_t idx_rx = 0;
	unsigned long rx_bytes = 0;
	int rcvd, i;
	struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];

	/* allocate bufs for fill queue replenishment after rx */
	if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
		AF_XDP_LOG(DEBUG,
			"Failed to get enough buffers for fq.\n");
		return 0;
	}

	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);

	if (rcvd == 0) {
#if defined(XDP_USE_NEED_WAKEUP)
		if (xsk_ring_prod__needs_wakeup(fq))
			(void)poll(rxq->fds, 1, 1000);
#endif

		goto out;
	}

	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc;
		uint64_t addr;
		uint32_t len;
		uint64_t offset;

		desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
		addr = desc->addr;
		len = desc->len;

		offset = xsk_umem__extract_offset(addr);
		addr = xsk_umem__extract_addr(addr);

		bufs[i] = (struct rte_mbuf *)
				xsk_umem__get_data(umem->buffer, addr +
					umem->mb_pool->header_size);
		bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
			rte_pktmbuf_priv_size(umem->mb_pool) -
			umem->mb_pool->header_size;

		rte_pktmbuf_pkt_len(bufs[i]) = len;
		rte_pktmbuf_data_len(bufs[i]) = len;
		rx_bytes += len;
	}

	xsk_ring_cons__release(rx, rcvd);

	(void)reserve_fill_queue(umem, rcvd, fq_bufs, fq);

	/* statistics */
	rxq->stats.rx_pkts += rcvd;
	rxq->stats.rx_bytes += rx_bytes;

out:
	if (rcvd != nb_pkts)
		rte_mempool_put_bulk(umem->mb_pool, (void **)&fq_bufs[rcvd],
				     nb_pkts - rcvd);

	return rcvd;
}
#else
static uint16_t
af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_rx_queue *rxq = queue;
	struct xsk_ring_cons *rx = &rxq->rx;
	struct xsk_umem_info *umem = rxq->umem;
	struct xsk_ring_prod *fq = &rxq->fq;
	uint32_t idx_rx = 0;
	unsigned long rx_bytes = 0;
	int rcvd, i;
	uint32_t free_thresh = fq->size >> 1;
	struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];

	if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
		(void)reserve_fill_queue(umem, ETH_AF_XDP_RX_BATCH_SIZE,
					 NULL, fq);

	if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts) != 0))
		return 0;

	rcvd = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
	if (rcvd == 0) {
#if defined(XDP_USE_NEED_WAKEUP)
		if (xsk_ring_prod__needs_wakeup(fq))
			(void)poll(rxq->fds, 1, 1000);
#endif

		goto out;
	}

	for (i = 0; i < rcvd; i++) {
		const struct xdp_desc *desc;
		uint64_t addr;
		uint32_t len;
		void *pkt;

		desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
		addr = desc->addr;
		len = desc->len;
		pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);

		rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
		rte_ring_enqueue(umem->buf_ring, (void *)addr);
		rte_pktmbuf_pkt_len(mbufs[i]) = len;
		rte_pktmbuf_data_len(mbufs[i]) = len;
		rx_bytes += len;
		bufs[i] = mbufs[i];
	}

	xsk_ring_cons__release(rx, rcvd);

	/* statistics */
	rxq->stats.rx_pkts += rcvd;
	rxq->stats.rx_bytes += rx_bytes;

out:
	if (rcvd != nb_pkts)
		rte_mempool_put_bulk(rxq->mb_pool, (void **)&mbufs[rcvd],
				     nb_pkts - rcvd);

	return rcvd;
}
#endif

static uint16_t
eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return af_xdp_rx_zc(queue, bufs, nb_pkts);
#else
	return af_xdp_rx_cp(queue, bufs, nb_pkts);
#endif
}
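
/*
 * Editor's sketch of how an application drives this rx path; the port id,
 * queue id and the freeing policy are assumptions, not driver requirements:
 *
 *	struct rte_mbuf *pkts[ETH_AF_XDP_RX_BATCH_SIZE];
 *	uint16_t nb = rte_eth_rx_burst(port_id, 0, pkts,
 *				       ETH_AF_XDP_RX_BATCH_SIZE);
 *	for (uint16_t j = 0; j < nb; j++)
 *		rte_pktmbuf_free(pkts[j]);
 *
 * Note that the RTE_MIN() above silently clips requests larger than
 * ETH_AF_XDP_RX_BATCH_SIZE (32), so a burst can return fewer packets than
 * asked for even when more are pending.
 */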

static void
pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
{
	size_t i, n;
	uint32_t idx_cq = 0;

	n = xsk_ring_cons__peek(cq, size, &idx_cq);

	for (i = 0; i < n; i++) {
		uint64_t addr;
		addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
		addr = xsk_umem__extract_addr(addr);
		rte_pktmbuf_free((struct rte_mbuf *)
					xsk_umem__get_data(umem->buffer,
					addr + umem->mb_pool->header_size));
#else
		rte_ring_enqueue(umem->buf_ring, (void *)addr);
#endif
	}

	xsk_ring_cons__release(cq, n);
}

static void
kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
{
	struct xsk_umem_info *umem = txq->umem;

	pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);

#if defined(XDP_USE_NEED_WAKEUP)
	if (xsk_ring_prod__needs_wakeup(&txq->tx))
#endif
		while (send(xsk_socket__fd(txq->pair->xsk), NULL,
			    0, MSG_DONTWAIT) < 0) {
			/* something unexpected */
			if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
				break;

			/* pull from completion queue to leave more space */
			if (errno == EAGAIN)
				pull_umem_cq(umem,
					     XSK_RING_CONS__DEFAULT_NUM_DESCS,
					     cq);
		}
}
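
/*
 * Editor's note on the need_wakeup path in kick_tx(): when the socket is
 * bound with XDP_USE_NEED_WAKEUP, the kernel sets a flag in the ring when
 * it actually needs the send() kick, so the common case costs no syscall.
 * The equivalent minimal user-space pattern (a sketch, not driver code) is:
 *
 *	if (xsk_ring_prod__needs_wakeup(&txq->tx))
 *		send(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT);
 */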

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static uint16_t
af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_tx_queue *txq = queue;
	struct xsk_umem_info *umem = txq->umem;
	struct rte_mbuf *mbuf;
	unsigned long tx_bytes = 0;
	int i;
	uint32_t idx_tx;
	uint16_t count = 0;
	struct xdp_desc *desc;
	uint64_t addr, offset;
	struct xsk_ring_cons *cq = &txq->pair->cq;
	uint32_t free_thresh = cq->size >> 1;

	if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
		pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);

	for (i = 0; i < nb_pkts; i++) {
		mbuf = bufs[i];

		if (mbuf->pool == umem->mb_pool) {
			if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
				kick_tx(txq, cq);
				if (!xsk_ring_prod__reserve(&txq->tx, 1,
							    &idx_tx))
					goto out;
			}
			desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
			desc->len = mbuf->pkt_len;
			addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
					umem->mb_pool->header_size;
			offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
					(uint64_t)mbuf +
					umem->mb_pool->header_size;
			offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
			desc->addr = addr | offset;
			count++;
		} else {
			struct rte_mbuf *local_mbuf =
					rte_pktmbuf_alloc(umem->mb_pool);
			void *pkt;

			if (local_mbuf == NULL)
				goto out;

			if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
				rte_pktmbuf_free(local_mbuf);
				kick_tx(txq, cq);
				goto out;
			}

			desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
			desc->len = mbuf->pkt_len;

			addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
					umem->mb_pool->header_size;
			offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
					(uint64_t)local_mbuf +
					umem->mb_pool->header_size;
			pkt = xsk_umem__get_data(umem->buffer, addr + offset);
			offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
			desc->addr = addr | offset;
			rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
					desc->len);
			rte_pktmbuf_free(mbuf);
			count++;
		}

		/* use desc->len: the copy path has already freed the mbuf */
		tx_bytes += desc->len;
	}

	kick_tx(txq, cq);

out:
	xsk_ring_prod__submit(&txq->tx, count);

	txq->stats.tx_pkts += count;
	txq->stats.tx_bytes += tx_bytes;
	txq->stats.tx_dropped += nb_pkts - count;

	return count;
}
#else
static uint16_t
af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	struct pkt_tx_queue *txq = queue;
	struct xsk_umem_info *umem = txq->umem;
	struct rte_mbuf *mbuf;
	void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
	unsigned long tx_bytes = 0;
	int i;
	uint32_t idx_tx;
	struct xsk_ring_cons *cq = &txq->pair->cq;

	nb_pkts = RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);

	pull_umem_cq(umem, nb_pkts, cq);

	nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
					nb_pkts, NULL);
	if (nb_pkts == 0)
		return 0;

	if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
		kick_tx(txq, cq);
		rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
		return 0;
	}

	for (i = 0; i < nb_pkts; i++) {
		struct xdp_desc *desc;
		void *pkt;

		desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
		mbuf = bufs[i];
		desc->len = mbuf->pkt_len;

		desc->addr = (uint64_t)addrs[i];
		pkt = xsk_umem__get_data(umem->mz->addr,
					 desc->addr);
		rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
		tx_bytes += mbuf->pkt_len;
		rte_pktmbuf_free(mbuf);
	}

	xsk_ring_prod__submit(&txq->tx, nb_pkts);

	kick_tx(txq, cq);

	txq->stats.tx_pkts += nb_pkts;
	txq->stats.tx_bytes += tx_bytes;

	return nb_pkts;
}
#endif

static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	return af_xdp_tx_zc(queue, bufs, nb_pkts);
#else
	return af_xdp_tx_cp(queue, bufs, nb_pkts);
#endif
}
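
/*
 * Editor's sketch of the matching application tx side (port id assumed):
 *
 *	uint16_t sent = rte_eth_tx_burst(port_id, 0, pkts, nb);
 *
 * In copy mode at most ETH_AF_XDP_TX_BATCH_SIZE (32) packets are accepted
 * per call; the zero-copy path has no such clamp but may stop early when
 * tx ring slots run out, so callers should retry on partial sends.
 */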

static int
eth_dev_start(struct rte_eth_dev *dev)
{
	dev->data->dev_link.link_status = ETH_LINK_UP;

	return 0;
}

/* This function gets called when the current port gets stopped. */
static int
eth_dev_stop(struct rte_eth_dev *dev)
{
	dev->data->dev_link.link_status = ETH_LINK_DOWN;
	return 0;
}

/* Find ethdev in list */
static inline struct internal_list *
find_internal_resource(struct pmd_internals *port_int)
{
	int found = 0;
	struct internal_list *list = NULL;

	if (port_int == NULL)
		return NULL;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		struct pmd_internals *list_int =
				list->eth_dev->data->dev_private;
		if (list_int == port_int) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

/* Check if the netdev,qid context already exists */
static inline bool
ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
		struct pkt_rx_queue *list_rxq, const char *list_ifname)
{
	bool exists = false;

	if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
			!strncmp(ifname, list_ifname, IFNAMSIZ)) {
		AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
					ifname, rxq->xsk_queue_idx);
		exists = true;
	}

	return exists;
}

/* Get a pointer to an existing UMEM which overlays the rxq's mb_pool */
static inline int
get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
			struct xsk_umem_info **umem)
{
	struct internal_list *list;
	struct pmd_internals *internals;
	int i = 0, ret = 0;
	struct rte_mempool *mb_pool = rxq->mb_pool;

	if (mb_pool == NULL)
		return ret;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		internals = list->eth_dev->data->dev_private;
		for (i = 0; i < internals->queue_cnt; i++) {
			struct pkt_rx_queue *list_rxq =
						&internals->rx_queues[i];
			if (rxq == list_rxq)
				continue;
			if (mb_pool == internals->rx_queues[i].mb_pool) {
				if (ctx_exists(rxq, ifname, list_rxq,
						internals->if_name)) {
					ret = -1;
					goto out;
				}
				if (__atomic_load_n(
					&internals->rx_queues[i].umem->refcnt,
							__ATOMIC_ACQUIRE)) {
					*umem = internals->rx_queues[i].umem;
					goto out;
				}
			}
		}
	}

out:
	pthread_mutex_unlock(&internal_list_lock);

	return ret;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
	struct pmd_internals *internal = dev->data->dev_private;

	/* rx/tx must be paired */
	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
		return -EINVAL;

	if (internal->shared_umem) {
		struct internal_list *list = NULL;
		const char *name = dev->device->name;

		/* Ensure PMD is not already inserted into the list */
		list = find_internal_resource(internal);
		if (list)
			return 0;

		list = rte_zmalloc_socket(name, sizeof(*list), 0,
					dev->device->numa_node);
		if (list == NULL)
			return -1;

		list->eth_dev = dev;
		pthread_mutex_lock(&internal_list_lock);
		TAILQ_INSERT_TAIL(&internal_list, list, next);
		pthread_mutex_unlock(&internal_list_lock);
	}

	return 0;
}

static int
eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
{
	struct pmd_internals *internals = dev->data->dev_private;

	dev_info->if_index = internals->if_index;
	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = ETH_FRAME_LEN;
	dev_info->max_rx_queues = internals->queue_cnt;
	dev_info->max_tx_queues = internals->queue_cnt;

	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	dev_info->max_mtu = getpagesize() -
				sizeof(struct rte_mempool_objhdr) -
				sizeof(struct rte_mbuf) -
				RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
#else
	dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
#endif

	dev_info->default_rxportconf.nb_queues = 1;
	dev_info->default_txportconf.nb_queues = 1;
	dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
	dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;

	return 0;
}
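
/*
 * Editor's worked example of the zero-copy max_mtu bound above, assuming
 * 4 KiB pages and typical 64-bit layouts (exact struct sizes vary by
 * build): 4096 - sizeof(struct rte_mempool_objhdr) - sizeof(struct rte_mbuf)
 * - RTE_PKTMBUF_HEADROOM (128) - XDP_PACKET_HEADROOM (256) leaves roughly
 * 3.5 KiB of payload, because each umem chunk must hold the mbuf metadata
 * alongside the packet and may not cross a page boundary.
 */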

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct xdp_statistics xdp_stats;
	struct pkt_rx_queue *rxq;
	struct pkt_tx_queue *txq;
	socklen_t optlen;
	int i, ret;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		optlen = sizeof(struct xdp_statistics);
		rxq = &internals->rx_queues[i];
		txq = rxq->pair;
		stats->q_ipackets[i] = rxq->stats.rx_pkts;
		stats->q_ibytes[i] = rxq->stats.rx_bytes;

		stats->q_opackets[i] = txq->stats.tx_pkts;
		stats->q_obytes[i] = txq->stats.tx_bytes;

		stats->ipackets += stats->q_ipackets[i];
		stats->ibytes += stats->q_ibytes[i];
		stats->imissed += rxq->stats.rx_dropped;
		stats->oerrors += txq->stats.tx_dropped;
		ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
				XDP_STATISTICS, &xdp_stats, &optlen);
		if (ret != 0) {
			AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
			return -1;
		}
		stats->imissed += xdp_stats.rx_dropped;

		stats->opackets += stats->q_opackets[i];
		stats->obytes += stats->q_obytes[i];
	}

	return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	int i;

	for (i = 0; i < internals->queue_cnt; i++) {
		memset(&internals->rx_queues[i].stats, 0,
					sizeof(struct rx_stats));
		memset(&internals->tx_queues[i].stats, 0,
					sizeof(struct tx_stats));
	}

	return 0;
}

static void
remove_xdp_program(struct pmd_internals *internals)
{
	uint32_t curr_prog_id = 0;

	if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
				XDP_FLAGS_UPDATE_IF_NOEXIST)) {
		AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
		return;
	}
	bpf_set_link_xdp_fd(internals->if_index, -1,
			XDP_FLAGS_UPDATE_IF_NOEXIST);
}

static void
xdp_umem_destroy(struct xsk_umem_info *umem)
{
#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	umem->mb_pool = NULL;
#else
	rte_memzone_free(umem->mz);
	umem->mz = NULL;

	rte_ring_free(umem->buf_ring);
	umem->buf_ring = NULL;
#endif

	rte_free(umem);
	umem = NULL;
}

static int
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *rxq;
	int i;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
		rte_socket_id());

	for (i = 0; i < internals->queue_cnt; i++) {
		rxq = &internals->rx_queues[i];
		if (rxq->umem == NULL)
			break;
		xsk_socket__delete(rxq->xsk);

		if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
				== 0) {
			(void)xsk_umem__delete(rxq->umem->umem);
			xdp_umem_destroy(rxq->umem);
		}

		/* free pkt_tx_queue */
		rte_free(rxq->pair);
		rte_free(rxq);
	}

	/*
	 * The MAC address is not allocated dynamically; setting the pointer
	 * to NULL prevents rte_eth_dev_release_port() from trying to free it.
	 */
	dev->data->mac_addrs = NULL;

	remove_xdp_program(internals);

	if (internals->shared_umem) {
		struct internal_list *list;

		/* Remove ethdev from list used to track and share UMEMs */
		list = find_internal_resource(internals);
		if (list) {
			pthread_mutex_lock(&internal_list_lock);
			TAILQ_REMOVE(&internal_list, list, next);
			pthread_mutex_unlock(&internal_list_lock);
			rte_free(list);
		}
	}

	return 0;
}

static void
eth_queue_release(void *q __rte_unused)
{
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
		int wait_to_complete __rte_unused)
{
	return 0;
}

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
{
	struct rte_mempool_memhdr *memhdr;
	uintptr_t memhdr_addr, aligned_addr;

	memhdr = STAILQ_FIRST(&mp->mem_list);
	memhdr_addr = (uintptr_t)memhdr->addr;
	aligned_addr = memhdr_addr & ~(getpagesize() - 1);
	*align = memhdr_addr - aligned_addr;

	return aligned_addr;
}
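
/*
 * Editor's example of the alignment math above, with made-up numbers and
 * 4 KiB pages: a mempool chunk starting at 0x7f00deadb100 yields
 * aligned_addr = 0x7f00deadb000 and *align = 0x100. The umem is registered
 * at the aligned address, and the slack is added to the umem size in
 * xdp_umem_configure() below.
 */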

static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
				  struct pkt_rx_queue *rxq)
{
	struct xsk_umem_info *umem = NULL;
	int ret;
	struct xsk_umem_config usr_config = {
		.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
		.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
	void *base_addr = NULL;
	struct rte_mempool *mb_pool = rxq->mb_pool;
	uint64_t umem_size, align = 0;

	if (internals->shared_umem) {
		if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
			return NULL;

		if (umem != NULL &&
			__atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
					umem->max_xsks) {
			AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
					internals->if_name, rxq->xsk_queue_idx);
			__atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
		}
	}

	if (umem == NULL) {
		usr_config.frame_size =
			rte_mempool_calc_obj_size(mb_pool->elt_size,
						  mb_pool->flags, NULL);
		usr_config.frame_headroom = mb_pool->header_size +
						sizeof(struct rte_mbuf) +
						rte_pktmbuf_priv_size(mb_pool) +
						RTE_PKTMBUF_HEADROOM;

		umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
					  rte_socket_id());
		if (umem == NULL) {
			AF_XDP_LOG(ERR, "Failed to allocate umem info");
			return NULL;
		}

		umem->mb_pool = mb_pool;
		base_addr = (void *)get_base_addr(mb_pool, &align);
		umem_size = (uint64_t)mb_pool->populated_size *
				(uint64_t)usr_config.frame_size +
				align;

		ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
				&rxq->fq, &rxq->cq, &usr_config);
		if (ret) {
			AF_XDP_LOG(ERR, "Failed to create umem");
			goto err;
		}
		umem->buffer = base_addr;

		if (internals->shared_umem) {
			umem->max_xsks = mb_pool->populated_size /
						ETH_AF_XDP_NUM_BUFFERS;
			AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
						mb_pool->name, umem->max_xsks);
		}

		__atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
	}

#else
static struct
xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
				  struct pkt_rx_queue *rxq)
{
	struct xsk_umem_info *umem;
	const struct rte_memzone *mz;
	struct xsk_umem_config usr_config = {
		.fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
		.frame_size = ETH_AF_XDP_FRAME_SIZE,
		.frame_headroom = 0 };
	char ring_name[RTE_RING_NAMESIZE];
	char mz_name[RTE_MEMZONE_NAMESIZE];
	int ret;
	uint64_t i;

	umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
	if (umem == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate umem info");
		return NULL;
	}

	snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
		       internals->if_name, rxq->xsk_queue_idx);
	umem->buf_ring = rte_ring_create(ring_name,
					 ETH_AF_XDP_NUM_BUFFERS,
					 rte_socket_id(),
					 0x0);
	if (umem->buf_ring == NULL) {
		AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
		goto err;
	}

	for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
		rte_ring_enqueue(umem->buf_ring,
				 (void *)(i * ETH_AF_XDP_FRAME_SIZE));

	snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
		       internals->if_name, rxq->xsk_queue_idx);
	mz = rte_memzone_reserve_aligned(mz_name,
			ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
			rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
			getpagesize());
	if (mz == NULL) {
		AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
		goto err;
	}

	ret = xsk_umem__create(&umem->umem, mz->addr,
			       ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
			       &rxq->fq, &rxq->cq,
			       &usr_config);

	if (ret) {
		AF_XDP_LOG(ERR, "Failed to create umem");
		goto err;
	}
	umem->mz = mz;

#endif
	return umem;

err:
	xdp_umem_destroy(umem);
	return NULL;
}

static int
load_custom_xdp_prog(const char *prog_path, int if_index)
{
	int ret, prog_fd = -1;
	struct bpf_object *obj;
	struct bpf_map *map;

	ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
	if (ret) {
		AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
		return ret;
	}

	/*
	 * The loaded program must provision for a map of xsks, such that some
	 * traffic can be redirected to userspace. When the xsk is created,
	 * libbpf inserts it into the map.
	 */
	map = bpf_object__find_map_by_name(obj, "xsks_map");
	if (!map) {
		AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
		return -1;
	}

	/* Link the program with the given network device */
	ret = bpf_set_link_xdp_fd(if_index, prog_fd,
					XDP_FLAGS_UPDATE_IF_NOEXIST);
	if (ret) {
		AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
				prog_fd);
		return -1;
	}

	AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
				prog_path, prog_fd);

	return 0;
}
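
/*
 * Editor's minimal sketch (not shipped with the driver) of an XDP program
 * that load_custom_xdp_prog() can load. The only hard requirement above is
 * a map named "xsks_map" for libbpf to insert the socket into; everything
 * else, including the section and function names, is an assumption:
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_XSKMAP);
 *		__uint(max_entries, 64);
 *		__uint(key_size, sizeof(int));
 *		__uint(value_size, sizeof(int));
 *	} xsks_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_sock_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&xsks_map, ctx->rx_queue_index,
 *					XDP_PASS);
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 *
 * Compile with clang -O2 -target bpf -c prog.c -o prog.o and pass the
 * object path via the "xdp_prog" devarg.
 */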

static int
xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
	      int ring_size)
{
	struct xsk_socket_config cfg;
	struct pkt_tx_queue *txq = rxq->pair;
	int ret = 0;
	int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
	struct rte_mbuf *fq_bufs[reserve_size];

	rxq->umem = xdp_umem_configure(internals, rxq);
	if (rxq->umem == NULL)
		return -ENOMEM;
	txq->umem = rxq->umem;

	cfg.rx_size = ring_size;
	cfg.tx_size = ring_size;
	cfg.libbpf_flags = 0;
	cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
	cfg.bind_flags = 0;

#if defined(XDP_USE_NEED_WAKEUP)
	cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
#endif

	if (strnlen(internals->prog_path, PATH_MAX) &&
				!internals->custom_prog_configured) {
		ret = load_custom_xdp_prog(internals->prog_path,
					   internals->if_index);
		if (ret) {
			AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
					internals->prog_path);
			goto err;
		}
		internals->custom_prog_configured = 1;
	}

	if (internals->shared_umem)
		ret = create_shared_socket(&rxq->xsk, internals->if_name,
				rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
				&txq->tx, &rxq->fq, &rxq->cq, &cfg);
	else
		ret = xsk_socket__create(&rxq->xsk, internals->if_name,
				rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
				&txq->tx, &cfg);

	if (ret) {
		AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
		goto err;
	}

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	if (rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size)) {
		AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
		goto err;
	}
#endif
	ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
	if (ret) {
		xsk_socket__delete(rxq->xsk);
		AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
		goto err;
	}

	return 0;

err:
	if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
		xdp_umem_destroy(rxq->umem);

	return ret;
}

static int
eth_rx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t rx_queue_id,
		   uint16_t nb_rx_desc,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mb_pool)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_rx_queue *rxq;
	int ret;

	rxq = &internals->rx_queues[rx_queue_id];

	AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
		   rx_queue_id, rxq->xsk_queue_idx);

#ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
	uint32_t buf_size, data_size;

	/* Now get the space available for data in the mbuf */
	buf_size = rte_pktmbuf_data_room_size(mb_pool) -
		RTE_PKTMBUF_HEADROOM;
	data_size = ETH_AF_XDP_FRAME_SIZE;

	if (data_size > buf_size) {
		AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
			dev->device->name, data_size, buf_size);
		ret = -ENOMEM;
		goto err;
	}
#endif

	rxq->mb_pool = mb_pool;

	if (xsk_configure(internals, rxq, nb_rx_desc)) {
		AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
		ret = -EINVAL;
		goto err;
	}

	rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
	rxq->fds[0].events = POLLIN;

	dev->data->rx_queues[rx_queue_id] = rxq;
	return 0;

err:
	return ret;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev,
		   uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id __rte_unused,
		   const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct pkt_tx_queue *txq;

	txq = &internals->tx_queues[tx_queue_id];

	dev->data->tx_queues[tx_queue_id] = txq;
	return 0;
}

static int
eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct pmd_internals *internals = dev->data->dev_private;
	struct ifreq ifr = { .ifr_mtu = mtu };
	int ret;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -EINVAL;

	strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
	ret = ioctl(s, SIOCSIFMTU, &ifr);
	close(s);

	return (ret < 0) ? -errno : 0;
}

static int
eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
{
	struct ifreq ifr;
	int ret = 0;
	int s;

	s = socket(PF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return -errno;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
	ifr.ifr_flags &= mask;
	ifr.ifr_flags |= flags;
	if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
		ret = -errno;
		goto out;
	}
out:
	close(s);
	return ret;
}

static int
eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
}

static int
eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct pmd_internals *internals = dev->data->dev_private;

	return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
}

static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.mtu_set = eth_dev_mtu_set,
	.promiscuous_enable = eth_dev_promiscuous_enable,
	.promiscuous_disable = eth_dev_promiscuous_disable,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.rx_queue_release = eth_queue_release,
	.tx_queue_release = eth_queue_release,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
};

/** parse integer from integer argument */
static int
parse_integer_arg(const char *key __rte_unused,
		  const char *value, void *extra_args)
{
	int *i = (int *)extra_args;
	char *end;

	*i = strtol(value, &end, 10);
	if (*i < 0) {
		AF_XDP_LOG(ERR, "Argument has to be non-negative.\n");
		return -EINVAL;
	}

	return 0;
}

/** parse name argument */
static int
parse_name_arg(const char *key __rte_unused,
	       const char *value, void *extra_args)
{
	char *name = extra_args;

	if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
		AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
			   value, IFNAMSIZ);
		return -EINVAL;
	}

	strlcpy(name, value, IFNAMSIZ);

	return 0;
}

/** parse xdp prog argument */
static int
parse_prog_arg(const char *key __rte_unused,
	       const char *value, void *extra_args)
{
	char *path = extra_args;

	if (strnlen(value, PATH_MAX) == PATH_MAX) {
		AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
			   value, PATH_MAX);
		return -EINVAL;
	}

	if (access(value, F_OK) != 0) {
		AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
			   value, strerror(errno));
		return -EINVAL;
	}

	strlcpy(path, value, PATH_MAX);

	return 0;
}

static int
xdp_get_channels_info(const char *if_name, int *max_queues,
				int *combined_queues)
{
	struct ethtool_channels channels;
	struct ifreq ifr;
	int fd, ret;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return -1;

	channels.cmd = ETHTOOL_GCHANNELS;
	ifr.ifr_data = (void *)&channels;
	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	ret = ioctl(fd, SIOCETHTOOL, &ifr);
	if (ret) {
		if (errno == EOPNOTSUPP) {
			ret = 0;
		} else {
			ret = -errno;
			goto out;
		}
	}

	if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
		/* If the device says it has no channels, then all traffic
		 * is sent to a single stream, so max queues = 1.
		 */
		*max_queues = 1;
		*combined_queues = 1;
	} else {
		*max_queues = channels.max_combined;
		*combined_queues = channels.combined_count;
	}

out:
	close(fd);
	return ret;
}

static int
parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
			int *queue_cnt, int *shared_umem, char *prog_path)
{
	int ret;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
				 &parse_name_arg, if_name);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
				 &parse_integer_arg, start_queue);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
				 &parse_integer_arg, queue_cnt);
	if (ret < 0 || *queue_cnt <= 0) {
		ret = -EINVAL;
		goto free_kvlist;
	}

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
				&parse_integer_arg, shared_umem);
	if (ret < 0)
		goto free_kvlist;

	ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
				 &parse_prog_arg, prog_path);
	if (ret < 0)
		goto free_kvlist;

free_kvlist:
	rte_kvargs_free(kvlist);
	return ret;
}

static int
get_iface_info(const char *if_name,
	       struct rte_ether_addr *eth_addr,
	       int *if_index)
{
	struct ifreq ifr;
	int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);

	if (sock < 0)
		return -1;

	strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
	if (ioctl(sock, SIOCGIFINDEX, &ifr))
		goto error;

	*if_index = ifr.ifr_ifindex;

	if (ioctl(sock, SIOCGIFHWADDR, &ifr))
		goto error;

	rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);

	close(sock);
	return 0;

error:
	close(sock);
	return -1;
}

static struct rte_eth_dev *
init_internals(struct rte_vdev_device *dev, const char *if_name,
		int start_queue_idx, int queue_cnt, int shared_umem,
		const char *prog_path)
{
	const char *name = rte_vdev_device_name(dev);
	const unsigned int numa_node = dev->device.numa_node;
	struct pmd_internals *internals;
	struct rte_eth_dev *eth_dev;
	int ret;
	int i;

	internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
	if (internals == NULL)
		return NULL;

	internals->start_queue_idx = start_queue_idx;
	internals->queue_cnt = queue_cnt;
	strlcpy(internals->if_name, if_name, IFNAMSIZ);
	strlcpy(internals->prog_path, prog_path, PATH_MAX);
	internals->custom_prog_configured = 0;

#ifndef ETH_AF_XDP_SHARED_UMEM
	if (shared_umem) {
		AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
				"Check kernel and libbpf version\n");
		goto err_free_internals;
	}
#endif
	internals->shared_umem = shared_umem;

	if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
				  &internals->combined_queue_cnt)) {
		AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
				if_name);
		goto err_free_internals;
	}

	if (queue_cnt > internals->combined_queue_cnt) {
		AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
				queue_cnt, internals->combined_queue_cnt);
		goto err_free_internals;
	}

	internals->rx_queues = rte_zmalloc_socket(NULL,
					sizeof(struct pkt_rx_queue) * queue_cnt,
					0, numa_node);
	if (internals->rx_queues == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
		goto err_free_internals;
	}

	internals->tx_queues = rte_zmalloc_socket(NULL,
					sizeof(struct pkt_tx_queue) * queue_cnt,
					0, numa_node);
	if (internals->tx_queues == NULL) {
		AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
		goto err_free_rx;
	}
	for (i = 0; i < queue_cnt; i++) {
		internals->tx_queues[i].pair = &internals->rx_queues[i];
		internals->rx_queues[i].pair = &internals->tx_queues[i];
		internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
		internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
	}

	ret = get_iface_info(if_name, &internals->eth_addr,
			     &internals->if_index);
	if (ret)
		goto err_free_tx;

	eth_dev = rte_eth_vdev_allocate(dev, 0);
	if (eth_dev == NULL)
		goto err_free_tx;

	eth_dev->data->dev_private = internals;
	eth_dev->data->dev_link = pmd_link;
	eth_dev->data->mac_addrs = &internals->eth_addr;
	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
	eth_dev->dev_ops = &ops;
	eth_dev->rx_pkt_burst = eth_af_xdp_rx;
	eth_dev->tx_pkt_burst = eth_af_xdp_tx;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
#endif

	return eth_dev;

err_free_tx:
	rte_free(internals->tx_queues);
err_free_rx:
	rte_free(internals->rx_queues);
err_free_internals:
	rte_free(internals);
	return NULL;
}

static int
rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
{
	struct rte_kvargs *kvlist;
	char if_name[IFNAMSIZ] = {'\0'};
	int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
	int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
	int shared_umem = 0;
	char prog_path[PATH_MAX] = {'\0'};
	struct rte_eth_dev *eth_dev = NULL;
	const char *name;

	AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
		rte_vdev_device_name(dev));

	name = rte_vdev_device_name(dev);
	if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
		strlen(rte_vdev_device_args(dev)) == 0) {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (eth_dev == NULL) {
			AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
			return -EINVAL;
		}
		eth_dev->dev_ops = &ops;
		rte_eth_dev_probing_finish(eth_dev);
		return 0;
	}

	kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
	if (kvlist == NULL) {
		AF_XDP_LOG(ERR, "Invalid kvargs key\n");
		return -EINVAL;
	}

	if (dev->device.numa_node == SOCKET_ID_ANY)
		dev->device.numa_node = rte_socket_id();

	if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
			     &xsk_queue_cnt, &shared_umem, prog_path) < 0) {
		AF_XDP_LOG(ERR, "Invalid kvargs value\n");
		return -EINVAL;
	}

	if (strlen(if_name) == 0) {
		AF_XDP_LOG(ERR, "Network interface must be specified\n");
		return -EINVAL;
	}

	eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
					xsk_queue_cnt, shared_umem, prog_path);
	if (eth_dev == NULL) {
		AF_XDP_LOG(ERR, "Failed to init internals\n");
		return -1;
	}

	rte_eth_dev_probing_finish(eth_dev);

	return 0;
}

static int
rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
{
	struct rte_eth_dev *eth_dev = NULL;

	AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
		rte_socket_id());

	if (dev == NULL)
		return -1;

	/* find the ethdev entry */
	eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
	if (eth_dev == NULL)
		return 0;

	eth_dev_close(eth_dev);
	rte_eth_dev_release_port(eth_dev);

	return 0;
}

static struct rte_vdev_driver pmd_af_xdp_drv = {
	.probe = rte_pmd_af_xdp_probe,
	.remove = rte_pmd_af_xdp_remove,
};

RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
			      "iface=<string> "
			      "start_queue=<int> "
			      "queue_count=<int> "
			      "shared_umem=<int> "
			      "xdp_prog=<string> ");