1
2
3
4#include <unistd.h>
5#include <errno.h>
6#include <stdlib.h>
7#include <string.h>
8#include <netinet/in.h>
9#include <net/if.h>
10#include <sys/socket.h>
11#include <sys/ioctl.h>
12#include <linux/if_ether.h>
13#include <linux/if_xdp.h>
14#include <linux/if_link.h>
15#include <linux/ethtool.h>
16#include <linux/sockios.h>
17#include "af_xdp_deps.h"
18#include <bpf/xsk.h>
19
20#include <rte_ethdev.h>
21#include <ethdev_driver.h>
22#include <ethdev_vdev.h>
23#include <rte_kvargs.h>
24#include <rte_bus_vdev.h>
25#include <rte_string_fns.h>
26#include <rte_branch_prediction.h>
27#include <rte_common.h>
28#include <rte_dev.h>
29#include <rte_eal.h>
30#include <rte_ether.h>
31#include <rte_lcore.h>
32#include <rte_log.h>
33#include <rte_memory.h>
34#include <rte_memzone.h>
35#include <rte_mempool.h>
36#include <rte_mbuf.h>
37#include <rte_malloc.h>
38#include <rte_ring.h>
39#include <rte_spinlock.h>
40#include <rte_power_intrinsics.h>
41
42#include "compat.h"
43
/*
 * Fallback values for the busy-polling socket options, used only when the
 * system headers do not already define them.
 */
#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL 69
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET 70
#endif


/* Fallback socket level for XDP socket options (see getsockopt() users). */
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

/* Fallback address family number for AF_XDP sockets. */
#ifndef AF_XDP
#define AF_XDP 44
#endif

/* Protocol family alias of AF_XDP. */
#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif
63
/* Log type of this driver, registered with NOTICE as its default level. */
RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype, NOTICE);

/*
 * Driver log helper: emits the message through rte_log() at the given level,
 * prefixed with the name of the calling function. The format string is passed
 * through unchanged, so callers supply their own trailing newline.
 */
#define AF_XDP_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, af_xdp_logtype, \
		"%s(): " fmt, __func__, ##args)
69
/* Copy mode: size in bytes of one umem frame. */
#define ETH_AF_XDP_FRAME_SIZE 2048
/* Copy mode: number of frames in the umem (and entries in the buffer ring). */
#define ETH_AF_XDP_NUM_BUFFERS 4096
/* Default descriptor count used for the fill/completion rings. */
#define ETH_AF_XDP_DFLT_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_DFLT_START_QUEUE_IDX 0
#define ETH_AF_XDP_DFLT_QUEUE_COUNT 1
/* Default busy-poll budget; also reported as the preferred burst size. */
#define ETH_AF_XDP_DFLT_BUSY_BUDGET 64
/* Value passed to the SO_BUSY_POLL socket option. */
#define ETH_AF_XDP_DFLT_BUSY_TIMEOUT 20

/* Largest burst handled by a single internal Rx/Tx call; larger bursts are split. */
#define ETH_AF_XDP_RX_BATCH_SIZE XSK_RING_CONS__DEFAULT_NUM_DESCS
#define ETH_AF_XDP_TX_BATCH_SIZE XSK_RING_CONS__DEFAULT_NUM_DESCS
80
81
/* State of one umem; a umem may be used by several queues (see refcnt). */
struct xsk_umem_info {
	struct xsk_umem *umem;		/* handle filled in by xsk_umem__create() */
	struct rte_ring *buf_ring;	/* copy mode: ring of free frame offsets */
	const struct rte_memzone *mz;	/* copy mode: memzone backing the umem */
	struct rte_mempool *mb_pool;	/* zero-copy mode: mempool backing the umem */
	void *buffer;			/* zero-copy mode: base address of the umem area */
	uint8_t refcnt;			/* number of users; accessed with __atomic builtins */
	uint32_t max_xsks;		/* shared umem: refcnt limit checked before sharing */
};
91
/* Software Rx counters kept per queue. */
struct rx_stats {
	uint64_t rx_pkts;	/* packets returned to the application */
	uint64_t rx_bytes;	/* sum of the lengths of those packets */
	uint64_t rx_dropped;	/* reported as part of imissed by eth_stats_get() */
};
97
/* Per-queue Rx state. */
struct pkt_rx_queue {
	struct xsk_ring_cons rx;	/* Rx descriptor ring of the socket */
	struct xsk_umem_info *umem;	/* umem used by this queue */
	struct xsk_socket *xsk;		/* AF_XDP socket handle */
	struct rte_mempool *mb_pool;	/* mempool associated with this queue */

	struct rx_stats stats;

	struct xsk_ring_prod fq;	/* fill ring: buffers handed to the kernel */
	struct xsk_ring_cons cq;	/* completion ring: transmitted buffers returned */

	struct pkt_tx_queue *pair;	/* Tx queue paired with this Rx queue */
	struct pollfd fds[1];		/* descriptor polled when the Rx ring is empty */
	int xsk_queue_idx;		/* queue id used when creating the socket */
	int busy_budget;		/* non-zero enables busy polling; 0 disables it */
};
114
/* Software Tx counters kept per queue. */
struct tx_stats {
	uint64_t tx_pkts;	/* packets placed on the Tx ring */
	uint64_t tx_bytes;	/* sum of the lengths of those packets */
	uint64_t tx_dropped;	/* reported as oerrors by eth_stats_get() */
};
120
/* Per-queue Tx state. */
struct pkt_tx_queue {
	struct xsk_ring_prod tx;	/* Tx descriptor ring of the socket */
	struct xsk_umem_info *umem;	/* umem used by this queue (same as the paired Rx queue) */

	struct tx_stats stats;

	struct pkt_rx_queue *pair;	/* Rx queue paired with this Tx queue; owns socket and cq */
	int xsk_queue_idx;
};
130
/* Per-port private data (stored in dev->data->dev_private). */
struct pmd_internals {
	int if_index;			/* kernel interface index */
	char if_name[IFNAMSIZ];		/* kernel interface name */
	int start_queue_idx;
	int queue_cnt;			/* number of entries in rx_queues / tx_queues */
	int max_queue_cnt;
	int combined_queue_cnt;
	bool shared_umem;		/* allow queues to share a umem */
	char prog_path[PATH_MAX];	/* path of a custom XDP program; empty if none */
	bool custom_prog_configured;	/* set once the custom program has been loaded */

	struct rte_ether_addr eth_addr;

	struct pkt_rx_queue *rx_queues;	/* array indexed by queue */
	struct pkt_tx_queue *tx_queues;	/* array indexed by queue */
};
147
/* Names of the device arguments understood by this driver. */
#define ETH_AF_XDP_IFACE_ARG "iface"
#define ETH_AF_XDP_START_QUEUE_ARG "start_queue"
#define ETH_AF_XDP_QUEUE_COUNT_ARG "queue_count"
#define ETH_AF_XDP_SHARED_UMEM_ARG "shared_umem"
#define ETH_AF_XDP_PROG_ARG "xdp_prog"
#define ETH_AF_XDP_BUDGET_ARG "busy_budget"

/* NULL-terminated list of accepted argument names. */
static const char * const valid_arguments[] = {
	ETH_AF_XDP_IFACE_ARG,
	ETH_AF_XDP_START_QUEUE_ARG,
	ETH_AF_XDP_QUEUE_COUNT_ARG,
	ETH_AF_XDP_SHARED_UMEM_ARG,
	ETH_AF_XDP_PROG_ARG,
	ETH_AF_XDP_BUDGET_ARG,
	NULL
};
164
/*
 * Link template: 10G, full duplex, autonegotiated, initially down.
 * NOTE(review): presumably copied into each port's dev_link at creation time;
 * the user is not visible in this part of the file.
 */
static const struct rte_eth_link pmd_link = {
	.link_speed = ETH_SPEED_NUM_10G,
	.link_duplex = ETH_LINK_FULL_DUPLEX,
	.link_status = ETH_LINK_DOWN,
	.link_autoneg = ETH_LINK_AUTONEG
};
171
172
/* Entry of the list of ports registered for umem sharing. */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
/* List of registered ports; walked when looking for a umem to share. */
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

/* Taken around every traversal and modification of internal_list. */
static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
183
184#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
185static inline int
186reserve_fill_queue_zc(struct xsk_umem_info *umem, uint16_t reserve_size,
187 struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
188{
189 uint32_t idx;
190 uint16_t i;
191
192 if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
193 for (i = 0; i < reserve_size; i++)
194 rte_pktmbuf_free(bufs[i]);
195 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
196 return -1;
197 }
198
199 for (i = 0; i < reserve_size; i++) {
200 __u64 *fq_addr;
201 uint64_t addr;
202
203 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
204 addr = (uint64_t)bufs[i] - (uint64_t)umem->buffer -
205 umem->mb_pool->header_size;
206 *fq_addr = addr;
207 }
208
209 xsk_ring_prod__submit(fq, reserve_size);
210
211 return 0;
212}
213#else
214static inline int
215reserve_fill_queue_cp(struct xsk_umem_info *umem, uint16_t reserve_size,
216 struct rte_mbuf **bufs __rte_unused,
217 struct xsk_ring_prod *fq)
218{
219 void *addrs[reserve_size];
220 uint32_t idx;
221 uint16_t i;
222
223 if (rte_ring_dequeue_bulk(umem->buf_ring, addrs, reserve_size, NULL)
224 != reserve_size) {
225 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
226 return -1;
227 }
228
229 if (unlikely(!xsk_ring_prod__reserve(fq, reserve_size, &idx))) {
230 AF_XDP_LOG(DEBUG, "Failed to reserve enough fq descs.\n");
231 rte_ring_enqueue_bulk(umem->buf_ring, addrs,
232 reserve_size, NULL);
233 return -1;
234 }
235
236 for (i = 0; i < reserve_size; i++) {
237 __u64 *fq_addr;
238
239 fq_addr = xsk_ring_prod__fill_addr(fq, idx++);
240 *fq_addr = (uint64_t)addrs[i];
241 }
242
243 xsk_ring_prod__submit(fq, reserve_size);
244
245 return 0;
246}
247#endif
248
/*
 * Refill the fill ring with reserve_size buffers, using the zero-copy or the
 * copy implementation depending on what the build supports.
 * Returns 0 on success, -1 on failure.
 */
static inline int
reserve_fill_queue(struct xsk_umem_info *umem, uint16_t reserve_size,
		   struct rte_mbuf **bufs, struct xsk_ring_prod *fq)
{
	int rc;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	rc = reserve_fill_queue_zc(umem, reserve_size, bufs, fq);
#else
	rc = reserve_fill_queue_cp(umem, reserve_size, bufs, fq);
#endif

	return rc;
}
259
260#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
261static uint16_t
262af_xdp_rx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
263{
264 struct pkt_rx_queue *rxq = queue;
265 struct xsk_ring_cons *rx = &rxq->rx;
266 struct xsk_ring_prod *fq = &rxq->fq;
267 struct xsk_umem_info *umem = rxq->umem;
268 uint32_t idx_rx = 0;
269 unsigned long rx_bytes = 0;
270 int i;
271 struct rte_mbuf *fq_bufs[ETH_AF_XDP_RX_BATCH_SIZE];
272
273 nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
274
275 if (nb_pkts == 0) {
276
277
278
279
280
281 if (rxq->busy_budget) {
282 (void)recvfrom(xsk_socket__fd(rxq->xsk), NULL, 0,
283 MSG_DONTWAIT, NULL, NULL);
284 } else if (xsk_ring_prod__needs_wakeup(fq)) {
285 (void)poll(&rxq->fds[0], 1, 1000);
286 }
287
288 return 0;
289 }
290
291
292 if (rte_pktmbuf_alloc_bulk(umem->mb_pool, fq_bufs, nb_pkts)) {
293 AF_XDP_LOG(DEBUG,
294 "Failed to get enough buffers for fq.\n");
295
296
297
298 rx->cached_cons -= nb_pkts;
299 return 0;
300 }
301
302 for (i = 0; i < nb_pkts; i++) {
303 const struct xdp_desc *desc;
304 uint64_t addr;
305 uint32_t len;
306 uint64_t offset;
307
308 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
309 addr = desc->addr;
310 len = desc->len;
311
312 offset = xsk_umem__extract_offset(addr);
313 addr = xsk_umem__extract_addr(addr);
314
315 bufs[i] = (struct rte_mbuf *)
316 xsk_umem__get_data(umem->buffer, addr +
317 umem->mb_pool->header_size);
318 bufs[i]->data_off = offset - sizeof(struct rte_mbuf) -
319 rte_pktmbuf_priv_size(umem->mb_pool) -
320 umem->mb_pool->header_size;
321
322 rte_pktmbuf_pkt_len(bufs[i]) = len;
323 rte_pktmbuf_data_len(bufs[i]) = len;
324 rx_bytes += len;
325 }
326
327 xsk_ring_cons__release(rx, nb_pkts);
328 (void)reserve_fill_queue(umem, nb_pkts, fq_bufs, fq);
329
330
331 rxq->stats.rx_pkts += nb_pkts;
332 rxq->stats.rx_bytes += rx_bytes;
333
334 return nb_pkts;
335}
336#else
337static uint16_t
338af_xdp_rx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
339{
340 struct pkt_rx_queue *rxq = queue;
341 struct xsk_ring_cons *rx = &rxq->rx;
342 struct xsk_umem_info *umem = rxq->umem;
343 struct xsk_ring_prod *fq = &rxq->fq;
344 uint32_t idx_rx = 0;
345 unsigned long rx_bytes = 0;
346 int i;
347 uint32_t free_thresh = fq->size >> 1;
348 struct rte_mbuf *mbufs[ETH_AF_XDP_RX_BATCH_SIZE];
349
350 if (xsk_prod_nb_free(fq, free_thresh) >= free_thresh)
351 (void)reserve_fill_queue(umem, nb_pkts, NULL, fq);
352
353 nb_pkts = xsk_ring_cons__peek(rx, nb_pkts, &idx_rx);
354 if (nb_pkts == 0) {
355#if defined(XDP_USE_NEED_WAKEUP)
356 if (xsk_ring_prod__needs_wakeup(fq))
357 (void)poll(rxq->fds, 1, 1000);
358#endif
359 return 0;
360 }
361
362 if (unlikely(rte_pktmbuf_alloc_bulk(rxq->mb_pool, mbufs, nb_pkts))) {
363
364
365
366 rx->cached_cons -= nb_pkts;
367 return 0;
368 }
369
370 for (i = 0; i < nb_pkts; i++) {
371 const struct xdp_desc *desc;
372 uint64_t addr;
373 uint32_t len;
374 void *pkt;
375
376 desc = xsk_ring_cons__rx_desc(rx, idx_rx++);
377 addr = desc->addr;
378 len = desc->len;
379 pkt = xsk_umem__get_data(rxq->umem->mz->addr, addr);
380
381 rte_memcpy(rte_pktmbuf_mtod(mbufs[i], void *), pkt, len);
382 rte_ring_enqueue(umem->buf_ring, (void *)addr);
383 rte_pktmbuf_pkt_len(mbufs[i]) = len;
384 rte_pktmbuf_data_len(mbufs[i]) = len;
385 rx_bytes += len;
386 bufs[i] = mbufs[i];
387 }
388
389 xsk_ring_cons__release(rx, nb_pkts);
390
391
392 rxq->stats.rx_pkts += nb_pkts;
393 rxq->stats.rx_bytes += rx_bytes;
394
395 return nb_pkts;
396}
397#endif
398
/*
 * Receive up to nb_pkts packets with the zero-copy or the copy implementation,
 * depending on what the build supports. Returns the number of packets received.
 */
static uint16_t
af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	uint16_t nb_rx;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	nb_rx = af_xdp_rx_zc(queue, bufs, nb_pkts);
#else
	nb_rx = af_xdp_rx_cp(queue, bufs, nb_pkts);
#endif

	return nb_rx;
}
408
409static uint16_t
410eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
411{
412 uint16_t nb_rx;
413
414 if (likely(nb_pkts <= ETH_AF_XDP_RX_BATCH_SIZE))
415 return af_xdp_rx(queue, bufs, nb_pkts);
416
417
418
419
420 nb_rx = 0;
421 while (nb_pkts) {
422 uint16_t ret, n;
423
424 n = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_RX_BATCH_SIZE);
425 ret = af_xdp_rx(queue, &bufs[nb_rx], n);
426 nb_rx = (uint16_t)(nb_rx + ret);
427 nb_pkts = (uint16_t)(nb_pkts - ret);
428 if (ret < n)
429 break;
430 }
431
432 return nb_rx;
433}
434
435static void
436pull_umem_cq(struct xsk_umem_info *umem, int size, struct xsk_ring_cons *cq)
437{
438 size_t i, n;
439 uint32_t idx_cq = 0;
440
441 n = xsk_ring_cons__peek(cq, size, &idx_cq);
442
443 for (i = 0; i < n; i++) {
444 uint64_t addr;
445 addr = *xsk_ring_cons__comp_addr(cq, idx_cq++);
446#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
447 addr = xsk_umem__extract_addr(addr);
448 rte_pktmbuf_free((struct rte_mbuf *)
449 xsk_umem__get_data(umem->buffer,
450 addr + umem->mb_pool->header_size));
451#else
452 rte_ring_enqueue(umem->buf_ring, (void *)addr);
453#endif
454 }
455
456 xsk_ring_cons__release(cq, n);
457}
458
459static void
460kick_tx(struct pkt_tx_queue *txq, struct xsk_ring_cons *cq)
461{
462 struct xsk_umem_info *umem = txq->umem;
463
464 pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
465
466 if (tx_syscall_needed(&txq->tx))
467 while (send(xsk_socket__fd(txq->pair->xsk), NULL,
468 0, MSG_DONTWAIT) < 0) {
469
470 if (errno != EBUSY && errno != EAGAIN && errno != EINTR)
471 break;
472
473
474 if (errno == EAGAIN)
475 pull_umem_cq(umem,
476 XSK_RING_CONS__DEFAULT_NUM_DESCS,
477 cq);
478 }
479}
480
481#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
482static uint16_t
483af_xdp_tx_zc(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
484{
485 struct pkt_tx_queue *txq = queue;
486 struct xsk_umem_info *umem = txq->umem;
487 struct rte_mbuf *mbuf;
488 unsigned long tx_bytes = 0;
489 int i;
490 uint32_t idx_tx;
491 uint16_t count = 0;
492 struct xdp_desc *desc;
493 uint64_t addr, offset;
494 struct xsk_ring_cons *cq = &txq->pair->cq;
495 uint32_t free_thresh = cq->size >> 1;
496
497 if (xsk_cons_nb_avail(cq, free_thresh) >= free_thresh)
498 pull_umem_cq(umem, XSK_RING_CONS__DEFAULT_NUM_DESCS, cq);
499
500 for (i = 0; i < nb_pkts; i++) {
501 mbuf = bufs[i];
502
503 if (mbuf->pool == umem->mb_pool) {
504 if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
505 kick_tx(txq, cq);
506 if (!xsk_ring_prod__reserve(&txq->tx, 1,
507 &idx_tx))
508 goto out;
509 }
510 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
511 desc->len = mbuf->pkt_len;
512 addr = (uint64_t)mbuf - (uint64_t)umem->buffer -
513 umem->mb_pool->header_size;
514 offset = rte_pktmbuf_mtod(mbuf, uint64_t) -
515 (uint64_t)mbuf +
516 umem->mb_pool->header_size;
517 offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
518 desc->addr = addr | offset;
519 count++;
520 } else {
521 struct rte_mbuf *local_mbuf =
522 rte_pktmbuf_alloc(umem->mb_pool);
523 void *pkt;
524
525 if (local_mbuf == NULL)
526 goto out;
527
528 if (!xsk_ring_prod__reserve(&txq->tx, 1, &idx_tx)) {
529 rte_pktmbuf_free(local_mbuf);
530 kick_tx(txq, cq);
531 goto out;
532 }
533
534 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx);
535 desc->len = mbuf->pkt_len;
536
537 addr = (uint64_t)local_mbuf - (uint64_t)umem->buffer -
538 umem->mb_pool->header_size;
539 offset = rte_pktmbuf_mtod(local_mbuf, uint64_t) -
540 (uint64_t)local_mbuf +
541 umem->mb_pool->header_size;
542 pkt = xsk_umem__get_data(umem->buffer, addr + offset);
543 offset = offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT;
544 desc->addr = addr | offset;
545 rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
546 desc->len);
547 rte_pktmbuf_free(mbuf);
548 count++;
549 }
550
551 tx_bytes += mbuf->pkt_len;
552 }
553
554 kick_tx(txq, cq);
555
556out:
557 xsk_ring_prod__submit(&txq->tx, count);
558
559 txq->stats.tx_pkts += count;
560 txq->stats.tx_bytes += tx_bytes;
561 txq->stats.tx_dropped += nb_pkts - count;
562
563 return count;
564}
565#else
566static uint16_t
567af_xdp_tx_cp(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
568{
569 struct pkt_tx_queue *txq = queue;
570 struct xsk_umem_info *umem = txq->umem;
571 struct rte_mbuf *mbuf;
572 void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
573 unsigned long tx_bytes = 0;
574 int i;
575 uint32_t idx_tx;
576 struct xsk_ring_cons *cq = &txq->pair->cq;
577
578 pull_umem_cq(umem, nb_pkts, cq);
579
580 nb_pkts = rte_ring_dequeue_bulk(umem->buf_ring, addrs,
581 nb_pkts, NULL);
582 if (nb_pkts == 0)
583 return 0;
584
585 if (xsk_ring_prod__reserve(&txq->tx, nb_pkts, &idx_tx) != nb_pkts) {
586 kick_tx(txq, cq);
587 rte_ring_enqueue_bulk(umem->buf_ring, addrs, nb_pkts, NULL);
588 return 0;
589 }
590
591 for (i = 0; i < nb_pkts; i++) {
592 struct xdp_desc *desc;
593 void *pkt;
594
595 desc = xsk_ring_prod__tx_desc(&txq->tx, idx_tx + i);
596 mbuf = bufs[i];
597 desc->len = mbuf->pkt_len;
598
599 desc->addr = (uint64_t)addrs[i];
600 pkt = xsk_umem__get_data(umem->mz->addr,
601 desc->addr);
602 rte_memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *), desc->len);
603 tx_bytes += mbuf->pkt_len;
604 rte_pktmbuf_free(mbuf);
605 }
606
607 xsk_ring_prod__submit(&txq->tx, nb_pkts);
608
609 kick_tx(txq, cq);
610
611 txq->stats.tx_pkts += nb_pkts;
612 txq->stats.tx_bytes += tx_bytes;
613
614 return nb_pkts;
615}
616
617static uint16_t
618af_xdp_tx_cp_batch(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
619{
620 uint16_t nb_tx;
621
622 if (likely(nb_pkts <= ETH_AF_XDP_TX_BATCH_SIZE))
623 return af_xdp_tx_cp(queue, bufs, nb_pkts);
624
625 nb_tx = 0;
626 while (nb_pkts) {
627 uint16_t ret, n;
628
629
630
631
632 n = (uint16_t)RTE_MIN(nb_pkts, ETH_AF_XDP_TX_BATCH_SIZE);
633 ret = af_xdp_tx_cp(queue, &bufs[nb_tx], n);
634 nb_tx = (uint16_t)(nb_tx + ret);
635 nb_pkts = (uint16_t)(nb_pkts - ret);
636 if (ret < n)
637 break;
638 }
639
640 return nb_tx;
641}
642#endif
643
/*
 * Tx burst entry point: uses the zero-copy or the copy implementation,
 * depending on what the build supports. Returns the number of packets sent.
 */
static uint16_t
eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
{
	uint16_t nb_tx;

#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
	nb_tx = af_xdp_tx_zc(queue, bufs, nb_pkts);
#else
	nb_tx = af_xdp_tx_cp_batch(queue, bufs, nb_pkts);
#endif

	return nb_tx;
}
653
654static int
655eth_dev_start(struct rte_eth_dev *dev)
656{
657 dev->data->dev_link.link_status = ETH_LINK_UP;
658
659 return 0;
660}
661
662
663static int
664eth_dev_stop(struct rte_eth_dev *dev)
665{
666 dev->data->dev_link.link_status = ETH_LINK_DOWN;
667 return 0;
668}
669
670
671static inline struct internal_list *
672find_internal_resource(struct pmd_internals *port_int)
673{
674 int found = 0;
675 struct internal_list *list = NULL;
676
677 if (port_int == NULL)
678 return NULL;
679
680 pthread_mutex_lock(&internal_list_lock);
681
682 TAILQ_FOREACH(list, &internal_list, next) {
683 struct pmd_internals *list_int =
684 list->eth_dev->data->dev_private;
685 if (list_int == port_int) {
686 found = 1;
687 break;
688 }
689 }
690
691 pthread_mutex_unlock(&internal_list_lock);
692
693 if (!found)
694 return NULL;
695
696 return list;
697}
698
699
700static inline bool
701ctx_exists(struct pkt_rx_queue *rxq, const char *ifname,
702 struct pkt_rx_queue *list_rxq, const char *list_ifname)
703{
704 bool exists = false;
705
706 if (rxq->xsk_queue_idx == list_rxq->xsk_queue_idx &&
707 !strncmp(ifname, list_ifname, IFNAMSIZ)) {
708 AF_XDP_LOG(ERR, "ctx %s,%i already exists, cannot share umem\n",
709 ifname, rxq->xsk_queue_idx);
710 exists = true;
711 }
712
713 return exists;
714}
715
716
717static inline int
718get_shared_umem(struct pkt_rx_queue *rxq, const char *ifname,
719 struct xsk_umem_info **umem)
720{
721 struct internal_list *list;
722 struct pmd_internals *internals;
723 int i = 0, ret = 0;
724 struct rte_mempool *mb_pool = rxq->mb_pool;
725
726 if (mb_pool == NULL)
727 return ret;
728
729 pthread_mutex_lock(&internal_list_lock);
730
731 TAILQ_FOREACH(list, &internal_list, next) {
732 internals = list->eth_dev->data->dev_private;
733 for (i = 0; i < internals->queue_cnt; i++) {
734 struct pkt_rx_queue *list_rxq =
735 &internals->rx_queues[i];
736 if (rxq == list_rxq)
737 continue;
738 if (mb_pool == internals->rx_queues[i].mb_pool) {
739 if (ctx_exists(rxq, ifname, list_rxq,
740 internals->if_name)) {
741 ret = -1;
742 goto out;
743 }
744 if (__atomic_load_n(
745 &internals->rx_queues[i].umem->refcnt,
746 __ATOMIC_ACQUIRE)) {
747 *umem = internals->rx_queues[i].umem;
748 goto out;
749 }
750 }
751 }
752 }
753
754out:
755 pthread_mutex_unlock(&internal_list_lock);
756
757 return ret;
758}
759
760static int
761eth_dev_configure(struct rte_eth_dev *dev)
762{
763 struct pmd_internals *internal = dev->data->dev_private;
764
765
766 if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
767 return -EINVAL;
768
769 if (internal->shared_umem) {
770 struct internal_list *list = NULL;
771 const char *name = dev->device->name;
772
773
774 list = find_internal_resource(internal);
775 if (list)
776 return 0;
777
778 list = rte_zmalloc_socket(name, sizeof(*list), 0,
779 dev->device->numa_node);
780 if (list == NULL)
781 return -1;
782
783 list->eth_dev = dev;
784 pthread_mutex_lock(&internal_list_lock);
785 TAILQ_INSERT_TAIL(&internal_list, list, next);
786 pthread_mutex_unlock(&internal_list_lock);
787 }
788
789 return 0;
790}
791
792#define CLB_VAL_IDX 0
793static int
794eth_monitor_callback(const uint64_t value,
795 const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
796{
797 const uint64_t v = opaque[CLB_VAL_IDX];
798 const uint64_t m = (uint32_t)~0;
799
800
801 return (value & m) == v ? 0 : -1;
802}
803
804static int
805eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
806{
807 struct pkt_rx_queue *rxq = rx_queue;
808 unsigned int *prod = rxq->rx.producer;
809 const uint32_t cur_val = rxq->rx.cached_prod;
810
811
812 pmc->addr = (void *)prod;
813
814
815 pmc->opaque[CLB_VAL_IDX] = cur_val;
816 pmc->fn = eth_monitor_callback;
817
818
819 pmc->size = sizeof(uint32_t);
820
821 return 0;
822}
823
824static int
825eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
826{
827 struct pmd_internals *internals = dev->data->dev_private;
828
829 dev_info->if_index = internals->if_index;
830 dev_info->max_mac_addrs = 1;
831 dev_info->max_rx_pktlen = ETH_FRAME_LEN;
832 dev_info->max_rx_queues = internals->queue_cnt;
833 dev_info->max_tx_queues = internals->queue_cnt;
834
835 dev_info->min_mtu = RTE_ETHER_MIN_MTU;
836#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
837 dev_info->max_mtu = getpagesize() -
838 sizeof(struct rte_mempool_objhdr) -
839 sizeof(struct rte_mbuf) -
840 RTE_PKTMBUF_HEADROOM - XDP_PACKET_HEADROOM;
841#else
842 dev_info->max_mtu = ETH_AF_XDP_FRAME_SIZE - XDP_PACKET_HEADROOM;
843#endif
844
845 dev_info->default_rxportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
846 dev_info->default_txportconf.burst_size = ETH_AF_XDP_DFLT_BUSY_BUDGET;
847 dev_info->default_rxportconf.nb_queues = 1;
848 dev_info->default_txportconf.nb_queues = 1;
849 dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
850 dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
851
852 return 0;
853}
854
855static int
856eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
857{
858 struct pmd_internals *internals = dev->data->dev_private;
859 struct xdp_statistics xdp_stats;
860 struct pkt_rx_queue *rxq;
861 struct pkt_tx_queue *txq;
862 socklen_t optlen;
863 int i, ret;
864
865 for (i = 0; i < dev->data->nb_rx_queues; i++) {
866 optlen = sizeof(struct xdp_statistics);
867 rxq = &internals->rx_queues[i];
868 txq = rxq->pair;
869 stats->q_ipackets[i] = rxq->stats.rx_pkts;
870 stats->q_ibytes[i] = rxq->stats.rx_bytes;
871
872 stats->q_opackets[i] = txq->stats.tx_pkts;
873 stats->q_obytes[i] = txq->stats.tx_bytes;
874
875 stats->ipackets += stats->q_ipackets[i];
876 stats->ibytes += stats->q_ibytes[i];
877 stats->imissed += rxq->stats.rx_dropped;
878 stats->oerrors += txq->stats.tx_dropped;
879 ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
880 XDP_STATISTICS, &xdp_stats, &optlen);
881 if (ret != 0) {
882 AF_XDP_LOG(ERR, "getsockopt() failed for XDP_STATISTICS.\n");
883 return -1;
884 }
885 stats->imissed += xdp_stats.rx_dropped;
886
887 stats->opackets += stats->q_opackets[i];
888 stats->obytes += stats->q_obytes[i];
889 }
890
891 return 0;
892}
893
894static int
895eth_stats_reset(struct rte_eth_dev *dev)
896{
897 struct pmd_internals *internals = dev->data->dev_private;
898 int i;
899
900 for (i = 0; i < internals->queue_cnt; i++) {
901 memset(&internals->rx_queues[i].stats, 0,
902 sizeof(struct rx_stats));
903 memset(&internals->tx_queues[i].stats, 0,
904 sizeof(struct tx_stats));
905 }
906
907 return 0;
908}
909
910static void
911remove_xdp_program(struct pmd_internals *internals)
912{
913 uint32_t curr_prog_id = 0;
914
915 if (bpf_get_link_xdp_id(internals->if_index, &curr_prog_id,
916 XDP_FLAGS_UPDATE_IF_NOEXIST)) {
917 AF_XDP_LOG(ERR, "bpf_get_link_xdp_id failed\n");
918 return;
919 }
920 bpf_set_link_xdp_fd(internals->if_index, -1,
921 XDP_FLAGS_UPDATE_IF_NOEXIST);
922}
923
924static void
925xdp_umem_destroy(struct xsk_umem_info *umem)
926{
927#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
928 umem->mb_pool = NULL;
929#else
930 rte_memzone_free(umem->mz);
931 umem->mz = NULL;
932
933 rte_ring_free(umem->buf_ring);
934 umem->buf_ring = NULL;
935#endif
936
937 rte_free(umem);
938}
939
940static int
941eth_dev_close(struct rte_eth_dev *dev)
942{
943 struct pmd_internals *internals = dev->data->dev_private;
944 struct pkt_rx_queue *rxq;
945 int i;
946
947 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
948 return 0;
949
950 AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
951 rte_socket_id());
952
953 for (i = 0; i < internals->queue_cnt; i++) {
954 rxq = &internals->rx_queues[i];
955 if (rxq->umem == NULL)
956 break;
957 xsk_socket__delete(rxq->xsk);
958
959 if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE)
960 == 0) {
961 (void)xsk_umem__delete(rxq->umem->umem);
962 xdp_umem_destroy(rxq->umem);
963 }
964
965
966 rte_free(rxq->pair);
967 rte_free(rxq);
968 }
969
970
971
972
973
974 dev->data->mac_addrs = NULL;
975
976 remove_xdp_program(internals);
977
978 if (internals->shared_umem) {
979 struct internal_list *list;
980
981
982 list = find_internal_resource(internals);
983 if (list) {
984 pthread_mutex_lock(&internal_list_lock);
985 TAILQ_REMOVE(&internal_list, list, next);
986 pthread_mutex_unlock(&internal_list_lock);
987 rte_free(list);
988 }
989 }
990
991 return 0;
992}
993
/*
 * Queue release callback. Intentionally does nothing: queue resources are
 * released when the device is closed.
 */
static void
eth_queue_release(void *q)
{
	RTE_SET_USED(q);
}
998
/*
 * Link update callback. The link state is only changed by start/stop, so
 * there is nothing to refresh here. Always returns 0.
 */
static int
eth_link_update(struct rte_eth_dev *dev, int wait_to_complete)
{
	RTE_SET_USED(dev);
	RTE_SET_USED(wait_to_complete);

	return 0;
}
1005
1006#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1007static inline uintptr_t get_base_addr(struct rte_mempool *mp, uint64_t *align)
1008{
1009 struct rte_mempool_memhdr *memhdr;
1010 uintptr_t memhdr_addr, aligned_addr;
1011
1012 memhdr = STAILQ_FIRST(&mp->mem_list);
1013 memhdr_addr = (uintptr_t)memhdr->addr;
1014 aligned_addr = memhdr_addr & ~(getpagesize() - 1);
1015 *align = memhdr_addr - aligned_addr;
1016
1017 return aligned_addr;
1018}
1019
1020static struct
1021xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
1022 struct pkt_rx_queue *rxq)
1023{
1024 struct xsk_umem_info *umem = NULL;
1025 int ret;
1026 struct xsk_umem_config usr_config = {
1027 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS * 2,
1028 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1029 .flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG};
1030 void *base_addr = NULL;
1031 struct rte_mempool *mb_pool = rxq->mb_pool;
1032 uint64_t umem_size, align = 0;
1033
1034 if (internals->shared_umem) {
1035 if (get_shared_umem(rxq, internals->if_name, &umem) < 0)
1036 return NULL;
1037
1038 if (umem != NULL &&
1039 __atomic_load_n(&umem->refcnt, __ATOMIC_ACQUIRE) <
1040 umem->max_xsks) {
1041 AF_XDP_LOG(INFO, "%s,qid%i sharing UMEM\n",
1042 internals->if_name, rxq->xsk_queue_idx);
1043 __atomic_fetch_add(&umem->refcnt, 1, __ATOMIC_ACQUIRE);
1044 }
1045 }
1046
1047 if (umem == NULL) {
1048 usr_config.frame_size =
1049 rte_mempool_calc_obj_size(mb_pool->elt_size,
1050 mb_pool->flags, NULL);
1051 usr_config.frame_headroom = mb_pool->header_size +
1052 sizeof(struct rte_mbuf) +
1053 rte_pktmbuf_priv_size(mb_pool) +
1054 RTE_PKTMBUF_HEADROOM;
1055
1056 umem = rte_zmalloc_socket("umem", sizeof(*umem), 0,
1057 rte_socket_id());
1058 if (umem == NULL) {
1059 AF_XDP_LOG(ERR, "Failed to allocate umem info");
1060 return NULL;
1061 }
1062
1063 umem->mb_pool = mb_pool;
1064 base_addr = (void *)get_base_addr(mb_pool, &align);
1065 umem_size = (uint64_t)mb_pool->populated_size *
1066 (uint64_t)usr_config.frame_size +
1067 align;
1068
1069 ret = xsk_umem__create(&umem->umem, base_addr, umem_size,
1070 &rxq->fq, &rxq->cq, &usr_config);
1071 if (ret) {
1072 AF_XDP_LOG(ERR, "Failed to create umem");
1073 goto err;
1074 }
1075 umem->buffer = base_addr;
1076
1077 if (internals->shared_umem) {
1078 umem->max_xsks = mb_pool->populated_size /
1079 ETH_AF_XDP_NUM_BUFFERS;
1080 AF_XDP_LOG(INFO, "Max xsks for UMEM %s: %u\n",
1081 mb_pool->name, umem->max_xsks);
1082 }
1083
1084 __atomic_store_n(&umem->refcnt, 1, __ATOMIC_RELEASE);
1085 }
1086
1087#else
1088static struct
1089xsk_umem_info *xdp_umem_configure(struct pmd_internals *internals,
1090 struct pkt_rx_queue *rxq)
1091{
1092 struct xsk_umem_info *umem;
1093 const struct rte_memzone *mz;
1094 struct xsk_umem_config usr_config = {
1095 .fill_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1096 .comp_size = ETH_AF_XDP_DFLT_NUM_DESCS,
1097 .frame_size = ETH_AF_XDP_FRAME_SIZE,
1098 .frame_headroom = 0 };
1099 char ring_name[RTE_RING_NAMESIZE];
1100 char mz_name[RTE_MEMZONE_NAMESIZE];
1101 int ret;
1102 uint64_t i;
1103
1104 umem = rte_zmalloc_socket("umem", sizeof(*umem), 0, rte_socket_id());
1105 if (umem == NULL) {
1106 AF_XDP_LOG(ERR, "Failed to allocate umem info");
1107 return NULL;
1108 }
1109
1110 snprintf(ring_name, sizeof(ring_name), "af_xdp_ring_%s_%u",
1111 internals->if_name, rxq->xsk_queue_idx);
1112 umem->buf_ring = rte_ring_create(ring_name,
1113 ETH_AF_XDP_NUM_BUFFERS,
1114 rte_socket_id(),
1115 0x0);
1116 if (umem->buf_ring == NULL) {
1117 AF_XDP_LOG(ERR, "Failed to create rte_ring\n");
1118 goto err;
1119 }
1120
1121 for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
1122 rte_ring_enqueue(umem->buf_ring,
1123 (void *)(i * ETH_AF_XDP_FRAME_SIZE));
1124
1125 snprintf(mz_name, sizeof(mz_name), "af_xdp_umem_%s_%u",
1126 internals->if_name, rxq->xsk_queue_idx);
1127 mz = rte_memzone_reserve_aligned(mz_name,
1128 ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1129 rte_socket_id(), RTE_MEMZONE_IOVA_CONTIG,
1130 getpagesize());
1131 if (mz == NULL) {
1132 AF_XDP_LOG(ERR, "Failed to reserve memzone for af_xdp umem.\n");
1133 goto err;
1134 }
1135
1136 ret = xsk_umem__create(&umem->umem, mz->addr,
1137 ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE,
1138 &rxq->fq, &rxq->cq,
1139 &usr_config);
1140
1141 if (ret) {
1142 AF_XDP_LOG(ERR, "Failed to create umem");
1143 goto err;
1144 }
1145 umem->mz = mz;
1146
1147#endif
1148 return umem;
1149
1150err:
1151 xdp_umem_destroy(umem);
1152 return NULL;
1153}
1154
1155static int
1156load_custom_xdp_prog(const char *prog_path, int if_index)
1157{
1158 int ret, prog_fd = -1;
1159 struct bpf_object *obj;
1160 struct bpf_map *map;
1161
1162 ret = bpf_prog_load(prog_path, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
1163 if (ret) {
1164 AF_XDP_LOG(ERR, "Failed to load program %s\n", prog_path);
1165 return ret;
1166 }
1167
1168
1169
1170
1171
1172
1173 map = bpf_object__find_map_by_name(obj, "xsks_map");
1174 if (!map) {
1175 AF_XDP_LOG(ERR, "Failed to find xsks_map in %s\n", prog_path);
1176 return -1;
1177 }
1178
1179
1180 ret = bpf_set_link_xdp_fd(if_index, prog_fd,
1181 XDP_FLAGS_UPDATE_IF_NOEXIST);
1182 if (ret) {
1183 AF_XDP_LOG(ERR, "Failed to set prog fd %d on interface\n",
1184 prog_fd);
1185 return -1;
1186 }
1187
1188 AF_XDP_LOG(INFO, "Successfully loaded XDP program %s with fd %d\n",
1189 prog_path, prog_fd);
1190
1191 return 0;
1192}
1193
1194
1195static int
1196configure_preferred_busy_poll(struct pkt_rx_queue *rxq)
1197{
1198 int sock_opt = 1;
1199 int fd = xsk_socket__fd(rxq->xsk);
1200 int ret = 0;
1201
1202 ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
1203 (void *)&sock_opt, sizeof(sock_opt));
1204 if (ret < 0) {
1205 AF_XDP_LOG(DEBUG, "Failed to set SO_PREFER_BUSY_POLL\n");
1206 goto err_prefer;
1207 }
1208
1209 sock_opt = ETH_AF_XDP_DFLT_BUSY_TIMEOUT;
1210 ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
1211 sizeof(sock_opt));
1212 if (ret < 0) {
1213 AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL\n");
1214 goto err_timeout;
1215 }
1216
1217 sock_opt = rxq->busy_budget;
1218 ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET,
1219 (void *)&sock_opt, sizeof(sock_opt));
1220 if (ret < 0) {
1221 AF_XDP_LOG(DEBUG, "Failed to set SO_BUSY_POLL_BUDGET\n");
1222 } else {
1223 AF_XDP_LOG(INFO, "Busy polling budget set to: %u\n",
1224 rxq->busy_budget);
1225 return 0;
1226 }
1227
1228
1229
1230
1231 sock_opt = 0;
1232 ret = setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, (void *)&sock_opt,
1233 sizeof(sock_opt));
1234 if (ret < 0) {
1235 AF_XDP_LOG(ERR, "Failed to unset SO_BUSY_POLL\n");
1236 return -1;
1237 }
1238
1239err_timeout:
1240 sock_opt = 0;
1241 ret = setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL,
1242 (void *)&sock_opt, sizeof(sock_opt));
1243 if (ret < 0) {
1244 AF_XDP_LOG(ERR, "Failed to unset SO_PREFER_BUSY_POLL\n");
1245 return -1;
1246 }
1247
1248err_prefer:
1249 rxq->busy_budget = 0;
1250 return 0;
1251}
1252
1253static int
1254xsk_configure(struct pmd_internals *internals, struct pkt_rx_queue *rxq,
1255 int ring_size)
1256{
1257 struct xsk_socket_config cfg;
1258 struct pkt_tx_queue *txq = rxq->pair;
1259 int ret = 0;
1260 int reserve_size = ETH_AF_XDP_DFLT_NUM_DESCS;
1261 struct rte_mbuf *fq_bufs[reserve_size];
1262
1263 rxq->umem = xdp_umem_configure(internals, rxq);
1264 if (rxq->umem == NULL)
1265 return -ENOMEM;
1266 txq->umem = rxq->umem;
1267
1268 cfg.rx_size = ring_size;
1269 cfg.tx_size = ring_size;
1270 cfg.libbpf_flags = 0;
1271 cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
1272 cfg.bind_flags = 0;
1273
1274#if defined(XDP_USE_NEED_WAKEUP)
1275 cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
1276#endif
1277
1278 if (strnlen(internals->prog_path, PATH_MAX) &&
1279 !internals->custom_prog_configured) {
1280 ret = load_custom_xdp_prog(internals->prog_path,
1281 internals->if_index);
1282 if (ret) {
1283 AF_XDP_LOG(ERR, "Failed to load custom XDP program %s\n",
1284 internals->prog_path);
1285 goto err;
1286 }
1287 internals->custom_prog_configured = 1;
1288 }
1289
1290 if (internals->shared_umem)
1291 ret = create_shared_socket(&rxq->xsk, internals->if_name,
1292 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1293 &txq->tx, &rxq->fq, &rxq->cq, &cfg);
1294 else
1295 ret = xsk_socket__create(&rxq->xsk, internals->if_name,
1296 rxq->xsk_queue_idx, rxq->umem->umem, &rxq->rx,
1297 &txq->tx, &cfg);
1298
1299 if (ret) {
1300 AF_XDP_LOG(ERR, "Failed to create xsk socket.\n");
1301 goto err;
1302 }
1303
1304#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1305 ret = rte_pktmbuf_alloc_bulk(rxq->umem->mb_pool, fq_bufs, reserve_size);
1306 if (ret) {
1307 AF_XDP_LOG(DEBUG, "Failed to get enough buffers for fq.\n");
1308 goto err;
1309 }
1310#endif
1311
1312 if (rxq->busy_budget) {
1313 ret = configure_preferred_busy_poll(rxq);
1314 if (ret) {
1315 AF_XDP_LOG(ERR, "Failed configure busy polling.\n");
1316 goto err;
1317 }
1318 }
1319
1320 ret = reserve_fill_queue(rxq->umem, reserve_size, fq_bufs, &rxq->fq);
1321 if (ret) {
1322 xsk_socket__delete(rxq->xsk);
1323 AF_XDP_LOG(ERR, "Failed to reserve fill queue.\n");
1324 goto err;
1325 }
1326
1327 return 0;
1328
1329err:
1330 if (__atomic_sub_fetch(&rxq->umem->refcnt, 1, __ATOMIC_ACQUIRE) == 0)
1331 xdp_umem_destroy(rxq->umem);
1332
1333 return ret;
1334}
1335
1336static int
1337eth_rx_queue_setup(struct rte_eth_dev *dev,
1338 uint16_t rx_queue_id,
1339 uint16_t nb_rx_desc,
1340 unsigned int socket_id __rte_unused,
1341 const struct rte_eth_rxconf *rx_conf __rte_unused,
1342 struct rte_mempool *mb_pool)
1343{
1344 struct pmd_internals *internals = dev->data->dev_private;
1345 struct pkt_rx_queue *rxq;
1346 int ret;
1347
1348 rxq = &internals->rx_queues[rx_queue_id];
1349
1350 AF_XDP_LOG(INFO, "Set up rx queue, rx queue id: %d, xsk queue id: %d\n",
1351 rx_queue_id, rxq->xsk_queue_idx);
1352
1353#ifndef XDP_UMEM_UNALIGNED_CHUNK_FLAG
1354 uint32_t buf_size, data_size;
1355
1356
1357 buf_size = rte_pktmbuf_data_room_size(mb_pool) -
1358 RTE_PKTMBUF_HEADROOM;
1359 data_size = ETH_AF_XDP_FRAME_SIZE;
1360
1361 if (data_size > buf_size) {
1362 AF_XDP_LOG(ERR, "%s: %d bytes will not fit in mbuf (%d bytes)\n",
1363 dev->device->name, data_size, buf_size);
1364 ret = -ENOMEM;
1365 goto err;
1366 }
1367#endif
1368
1369 rxq->mb_pool = mb_pool;
1370
1371 if (xsk_configure(internals, rxq, nb_rx_desc)) {
1372 AF_XDP_LOG(ERR, "Failed to configure xdp socket\n");
1373 ret = -EINVAL;
1374 goto err;
1375 }
1376
1377 if (!rxq->busy_budget)
1378 AF_XDP_LOG(DEBUG, "Preferred busy polling not enabled\n");
1379
1380 rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
1381 rxq->fds[0].events = POLLIN;
1382
1383 dev->data->rx_queues[rx_queue_id] = rxq;
1384 return 0;
1385
1386err:
1387 return ret;
1388}
1389
1390static int
1391eth_tx_queue_setup(struct rte_eth_dev *dev,
1392 uint16_t tx_queue_id,
1393 uint16_t nb_tx_desc __rte_unused,
1394 unsigned int socket_id __rte_unused,
1395 const struct rte_eth_txconf *tx_conf __rte_unused)
1396{
1397 struct pmd_internals *internals = dev->data->dev_private;
1398 struct pkt_tx_queue *txq;
1399
1400 txq = &internals->tx_queues[tx_queue_id];
1401
1402 dev->data->tx_queues[tx_queue_id] = txq;
1403 return 0;
1404}
1405
1406static int
1407eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
1408{
1409 struct pmd_internals *internals = dev->data->dev_private;
1410 struct ifreq ifr = { .ifr_mtu = mtu };
1411 int ret;
1412 int s;
1413
1414 s = socket(PF_INET, SOCK_DGRAM, 0);
1415 if (s < 0)
1416 return -EINVAL;
1417
1418 strlcpy(ifr.ifr_name, internals->if_name, IFNAMSIZ);
1419 ret = ioctl(s, SIOCSIFMTU, &ifr);
1420 close(s);
1421
1422 return (ret < 0) ? -errno : 0;
1423}
1424
1425static int
1426eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
1427{
1428 struct ifreq ifr;
1429 int ret = 0;
1430 int s;
1431
1432 s = socket(PF_INET, SOCK_DGRAM, 0);
1433 if (s < 0)
1434 return -errno;
1435
1436 strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1437 if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) {
1438 ret = -errno;
1439 goto out;
1440 }
1441 ifr.ifr_flags &= mask;
1442 ifr.ifr_flags |= flags;
1443 if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) {
1444 ret = -errno;
1445 goto out;
1446 }
1447out:
1448 close(s);
1449 return ret;
1450}
1451
1452static int
1453eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
1454{
1455 struct pmd_internals *internals = dev->data->dev_private;
1456
1457 return eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
1458}
1459
1460static int
1461eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
1462{
1463 struct pmd_internals *internals = dev->data->dev_private;
1464
1465 return eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
1466}
1467
1468static const struct eth_dev_ops ops = {
1469 .dev_start = eth_dev_start,
1470 .dev_stop = eth_dev_stop,
1471 .dev_close = eth_dev_close,
1472 .dev_configure = eth_dev_configure,
1473 .dev_infos_get = eth_dev_info,
1474 .mtu_set = eth_dev_mtu_set,
1475 .promiscuous_enable = eth_dev_promiscuous_enable,
1476 .promiscuous_disable = eth_dev_promiscuous_disable,
1477 .rx_queue_setup = eth_rx_queue_setup,
1478 .tx_queue_setup = eth_tx_queue_setup,
1479 .rx_queue_release = eth_queue_release,
1480 .tx_queue_release = eth_queue_release,
1481 .link_update = eth_link_update,
1482 .stats_get = eth_stats_get,
1483 .stats_reset = eth_stats_reset,
1484 .get_monitor_addr = eth_get_monitor_addr,
1485};
1486
1487
1488static int
1489parse_budget_arg(const char *key __rte_unused,
1490 const char *value, void *extra_args)
1491{
1492 int *i = (int *)extra_args;
1493 char *end;
1494
1495 *i = strtol(value, &end, 10);
1496 if (*i < 0 || *i > UINT16_MAX) {
1497 AF_XDP_LOG(ERR, "Invalid busy_budget %i, must be >= 0 and <= %u\n",
1498 *i, UINT16_MAX);
1499 return -EINVAL;
1500 }
1501
1502 return 0;
1503}
1504
1505
1506static int
1507parse_integer_arg(const char *key __rte_unused,
1508 const char *value, void *extra_args)
1509{
1510 int *i = (int *)extra_args;
1511 char *end;
1512
1513 *i = strtol(value, &end, 10);
1514 if (*i < 0) {
1515 AF_XDP_LOG(ERR, "Argument has to be positive.\n");
1516 return -EINVAL;
1517 }
1518
1519 return 0;
1520}
1521
1522
1523static int
1524parse_name_arg(const char *key __rte_unused,
1525 const char *value, void *extra_args)
1526{
1527 char *name = extra_args;
1528
1529 if (strnlen(value, IFNAMSIZ) > IFNAMSIZ - 1) {
1530 AF_XDP_LOG(ERR, "Invalid name %s, should be less than %u bytes.\n",
1531 value, IFNAMSIZ);
1532 return -EINVAL;
1533 }
1534
1535 strlcpy(name, value, IFNAMSIZ);
1536
1537 return 0;
1538}
1539
1540
1541static int
1542parse_prog_arg(const char *key __rte_unused,
1543 const char *value, void *extra_args)
1544{
1545 char *path = extra_args;
1546
1547 if (strnlen(value, PATH_MAX) == PATH_MAX) {
1548 AF_XDP_LOG(ERR, "Invalid path %s, should be less than %u bytes.\n",
1549 value, PATH_MAX);
1550 return -EINVAL;
1551 }
1552
1553 if (access(value, F_OK) != 0) {
1554 AF_XDP_LOG(ERR, "Error accessing %s: %s\n",
1555 value, strerror(errno));
1556 return -EINVAL;
1557 }
1558
1559 strlcpy(path, value, PATH_MAX);
1560
1561 return 0;
1562}
1563
1564static int
1565xdp_get_channels_info(const char *if_name, int *max_queues,
1566 int *combined_queues)
1567{
1568 struct ethtool_channels channels;
1569 struct ifreq ifr;
1570 int fd, ret;
1571
1572 fd = socket(AF_INET, SOCK_DGRAM, 0);
1573 if (fd < 0)
1574 return -1;
1575
1576 channels.cmd = ETHTOOL_GCHANNELS;
1577 ifr.ifr_data = (void *)&channels;
1578 strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1579 ret = ioctl(fd, SIOCETHTOOL, &ifr);
1580 if (ret) {
1581 if (errno == EOPNOTSUPP) {
1582 ret = 0;
1583 } else {
1584 ret = -errno;
1585 goto out;
1586 }
1587 }
1588
1589 if (channels.max_combined == 0 || errno == EOPNOTSUPP) {
1590
1591
1592
1593 *max_queues = 1;
1594 *combined_queues = 1;
1595 } else {
1596 *max_queues = channels.max_combined;
1597 *combined_queues = channels.combined_count;
1598 }
1599
1600 out:
1601 close(fd);
1602 return ret;
1603}
1604
1605static int
1606parse_parameters(struct rte_kvargs *kvlist, char *if_name, int *start_queue,
1607 int *queue_cnt, int *shared_umem, char *prog_path,
1608 int *busy_budget)
1609{
1610 int ret;
1611
1612 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_IFACE_ARG,
1613 &parse_name_arg, if_name);
1614 if (ret < 0)
1615 goto free_kvlist;
1616
1617 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_START_QUEUE_ARG,
1618 &parse_integer_arg, start_queue);
1619 if (ret < 0)
1620 goto free_kvlist;
1621
1622 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_QUEUE_COUNT_ARG,
1623 &parse_integer_arg, queue_cnt);
1624 if (ret < 0 || *queue_cnt <= 0) {
1625 ret = -EINVAL;
1626 goto free_kvlist;
1627 }
1628
1629 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_SHARED_UMEM_ARG,
1630 &parse_integer_arg, shared_umem);
1631 if (ret < 0)
1632 goto free_kvlist;
1633
1634 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_PROG_ARG,
1635 &parse_prog_arg, prog_path);
1636 if (ret < 0)
1637 goto free_kvlist;
1638
1639 ret = rte_kvargs_process(kvlist, ETH_AF_XDP_BUDGET_ARG,
1640 &parse_budget_arg, busy_budget);
1641 if (ret < 0)
1642 goto free_kvlist;
1643
1644free_kvlist:
1645 rte_kvargs_free(kvlist);
1646 return ret;
1647}
1648
1649static int
1650get_iface_info(const char *if_name,
1651 struct rte_ether_addr *eth_addr,
1652 int *if_index)
1653{
1654 struct ifreq ifr;
1655 int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
1656
1657 if (sock < 0)
1658 return -1;
1659
1660 strlcpy(ifr.ifr_name, if_name, IFNAMSIZ);
1661 if (ioctl(sock, SIOCGIFINDEX, &ifr))
1662 goto error;
1663
1664 *if_index = ifr.ifr_ifindex;
1665
1666 if (ioctl(sock, SIOCGIFHWADDR, &ifr))
1667 goto error;
1668
1669 rte_memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, RTE_ETHER_ADDR_LEN);
1670
1671 close(sock);
1672 return 0;
1673
1674error:
1675 close(sock);
1676 return -1;
1677}
1678
1679static struct rte_eth_dev *
1680init_internals(struct rte_vdev_device *dev, const char *if_name,
1681 int start_queue_idx, int queue_cnt, int shared_umem,
1682 const char *prog_path, int busy_budget)
1683{
1684 const char *name = rte_vdev_device_name(dev);
1685 const unsigned int numa_node = dev->device.numa_node;
1686 struct pmd_internals *internals;
1687 struct rte_eth_dev *eth_dev;
1688 int ret;
1689 int i;
1690
1691 internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
1692 if (internals == NULL)
1693 return NULL;
1694
1695 internals->start_queue_idx = start_queue_idx;
1696 internals->queue_cnt = queue_cnt;
1697 strlcpy(internals->if_name, if_name, IFNAMSIZ);
1698 strlcpy(internals->prog_path, prog_path, PATH_MAX);
1699 internals->custom_prog_configured = 0;
1700
1701#ifndef ETH_AF_XDP_SHARED_UMEM
1702 if (shared_umem) {
1703 AF_XDP_LOG(ERR, "Shared UMEM feature not available. "
1704 "Check kernel and libbpf version\n");
1705 goto err_free_internals;
1706 }
1707#endif
1708 internals->shared_umem = shared_umem;
1709
1710 if (xdp_get_channels_info(if_name, &internals->max_queue_cnt,
1711 &internals->combined_queue_cnt)) {
1712 AF_XDP_LOG(ERR, "Failed to get channel info of interface: %s\n",
1713 if_name);
1714 goto err_free_internals;
1715 }
1716
1717 if (queue_cnt > internals->combined_queue_cnt) {
1718 AF_XDP_LOG(ERR, "Specified queue count %d is larger than combined queue count %d.\n",
1719 queue_cnt, internals->combined_queue_cnt);
1720 goto err_free_internals;
1721 }
1722
1723 internals->rx_queues = rte_zmalloc_socket(NULL,
1724 sizeof(struct pkt_rx_queue) * queue_cnt,
1725 0, numa_node);
1726 if (internals->rx_queues == NULL) {
1727 AF_XDP_LOG(ERR, "Failed to allocate memory for rx queues.\n");
1728 goto err_free_internals;
1729 }
1730
1731 internals->tx_queues = rte_zmalloc_socket(NULL,
1732 sizeof(struct pkt_tx_queue) * queue_cnt,
1733 0, numa_node);
1734 if (internals->tx_queues == NULL) {
1735 AF_XDP_LOG(ERR, "Failed to allocate memory for tx queues.\n");
1736 goto err_free_rx;
1737 }
1738 for (i = 0; i < queue_cnt; i++) {
1739 internals->tx_queues[i].pair = &internals->rx_queues[i];
1740 internals->rx_queues[i].pair = &internals->tx_queues[i];
1741 internals->rx_queues[i].xsk_queue_idx = start_queue_idx + i;
1742 internals->tx_queues[i].xsk_queue_idx = start_queue_idx + i;
1743 internals->rx_queues[i].busy_budget = busy_budget;
1744 }
1745
1746 ret = get_iface_info(if_name, &internals->eth_addr,
1747 &internals->if_index);
1748 if (ret)
1749 goto err_free_tx;
1750
1751 eth_dev = rte_eth_vdev_allocate(dev, 0);
1752 if (eth_dev == NULL)
1753 goto err_free_tx;
1754
1755 eth_dev->data->dev_private = internals;
1756 eth_dev->data->dev_link = pmd_link;
1757 eth_dev->data->mac_addrs = &internals->eth_addr;
1758 eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
1759 eth_dev->dev_ops = &ops;
1760 eth_dev->rx_pkt_burst = eth_af_xdp_rx;
1761 eth_dev->tx_pkt_burst = eth_af_xdp_tx;
1762
1763#if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
1764 AF_XDP_LOG(INFO, "Zero copy between umem and mbuf enabled.\n");
1765#endif
1766
1767 return eth_dev;
1768
1769err_free_tx:
1770 rte_free(internals->tx_queues);
1771err_free_rx:
1772 rte_free(internals->rx_queues);
1773err_free_internals:
1774 rte_free(internals);
1775 return NULL;
1776}
1777
1778static int
1779rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
1780{
1781 struct rte_kvargs *kvlist;
1782 char if_name[IFNAMSIZ] = {'\0'};
1783 int xsk_start_queue_idx = ETH_AF_XDP_DFLT_START_QUEUE_IDX;
1784 int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
1785 int shared_umem = 0;
1786 char prog_path[PATH_MAX] = {'\0'};
1787 int busy_budget = -1;
1788 struct rte_eth_dev *eth_dev = NULL;
1789 const char *name;
1790
1791 AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
1792 rte_vdev_device_name(dev));
1793
1794 name = rte_vdev_device_name(dev);
1795 if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1796 strlen(rte_vdev_device_args(dev)) == 0) {
1797 eth_dev = rte_eth_dev_attach_secondary(name);
1798 if (eth_dev == NULL) {
1799 AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
1800 return -EINVAL;
1801 }
1802 eth_dev->dev_ops = &ops;
1803 rte_eth_dev_probing_finish(eth_dev);
1804 return 0;
1805 }
1806
1807 kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1808 if (kvlist == NULL) {
1809 AF_XDP_LOG(ERR, "Invalid kvargs key\n");
1810 return -EINVAL;
1811 }
1812
1813 if (dev->device.numa_node == SOCKET_ID_ANY)
1814 dev->device.numa_node = rte_socket_id();
1815
1816 if (parse_parameters(kvlist, if_name, &xsk_start_queue_idx,
1817 &xsk_queue_cnt, &shared_umem, prog_path,
1818 &busy_budget) < 0) {
1819 AF_XDP_LOG(ERR, "Invalid kvargs value\n");
1820 return -EINVAL;
1821 }
1822
1823 if (strlen(if_name) == 0) {
1824 AF_XDP_LOG(ERR, "Network interface must be specified\n");
1825 return -EINVAL;
1826 }
1827
1828 busy_budget = busy_budget == -1 ? ETH_AF_XDP_DFLT_BUSY_BUDGET :
1829 busy_budget;
1830
1831 eth_dev = init_internals(dev, if_name, xsk_start_queue_idx,
1832 xsk_queue_cnt, shared_umem, prog_path,
1833 busy_budget);
1834 if (eth_dev == NULL) {
1835 AF_XDP_LOG(ERR, "Failed to init internals\n");
1836 return -1;
1837 }
1838
1839 rte_eth_dev_probing_finish(eth_dev);
1840
1841 return 0;
1842}
1843
1844static int
1845rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
1846{
1847 struct rte_eth_dev *eth_dev = NULL;
1848
1849 AF_XDP_LOG(INFO, "Removing AF_XDP ethdev on numa socket %u\n",
1850 rte_socket_id());
1851
1852 if (dev == NULL)
1853 return -1;
1854
1855
1856 eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
1857 if (eth_dev == NULL)
1858 return 0;
1859
1860 eth_dev_close(eth_dev);
1861 rte_eth_dev_release_port(eth_dev);
1862
1863
1864 return 0;
1865}
1866
1867static struct rte_vdev_driver pmd_af_xdp_drv = {
1868 .probe = rte_pmd_af_xdp_probe,
1869 .remove = rte_pmd_af_xdp_remove,
1870};
1871
1872RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
1873RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
1874 "iface=<string> "
1875 "start_queue=<int> "
1876 "queue_count=<int> "
1877 "shared_umem=<int> "
1878 "xdp_prog=<string> "
1879 "busy_budget=<int>");
1880