linux/samples/bpf/xdpsock_user.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/* Copyright(c) 2017 - 2018 Intel Corporation. */
   3
   4#include <asm/barrier.h>
   5#include <errno.h>
   6#include <getopt.h>
   7#include <libgen.h>
   8#include <linux/bpf.h>
   9#include <linux/compiler.h>
  10#include <linux/if_link.h>
  11#include <linux/if_xdp.h>
  12#include <linux/if_ether.h>
  13#include <linux/ip.h>
  14#include <linux/limits.h>
  15#include <linux/udp.h>
  16#include <arpa/inet.h>
  17#include <locale.h>
  18#include <net/ethernet.h>
  19#include <net/if.h>
  20#include <poll.h>
  21#include <pthread.h>
  22#include <signal.h>
  23#include <stdbool.h>
  24#include <stdio.h>
  25#include <stdlib.h>
  26#include <string.h>
  27#include <sys/mman.h>
  28#include <sys/resource.h>
  29#include <sys/socket.h>
  30#include <sys/types.h>
  31#include <time.h>
  32#include <unistd.h>
  33
  34#include <bpf/libbpf.h>
  35#include <bpf/xsk.h>
  36#include <bpf/bpf.h>
  37#include "xdpsock.h"
  38
  39#ifndef SOL_XDP
  40#define SOL_XDP 283
  41#endif
  42
  43#ifndef AF_XDP
  44#define AF_XDP 44
  45#endif
  46
  47#ifndef PF_XDP
  48#define PF_XDP AF_XDP
  49#endif
  50
/* Frame count constant. NOTE(review): presumably sizes the UMEM buffer
 * allocated outside this chunk -- confirm against the caller.
 */
#define NUM_FRAMES (4 * 1024)
/* Smallest packet size; also the default value of opt_pkt_size. */
#define MIN_PKT_SIZE 64

/* When 0, hex_dump() returns immediately without printing anything. */
#define DEBUG_HEXDUMP 0

/* Short aliases for the fixed-width UAPI integer types. */
typedef __u64 u64;
typedef __u32 u32;
typedef __u16 u16;
typedef __u8  u8;

/* get_nsecs() timestamp taken by the previous dump_stats() call. */
static unsigned long prev_time;
  62
/* Benchmark selected for this run; print_benchmark() prints these as
 * "rxdrop", "txonly" and "l2fwd" respectively.
 */
enum benchmark_type {
        BENCH_RXDROP = 0,
        BENCH_TXONLY = 1,
        BENCH_L2FWD = 2,
};
  68
/*
 * Run-time options and global state.
 * NOTE(review): the code that assigns most of these from the command line
 * is outside this chunk; the comments below only describe how the visible
 * code uses them.
 */

/* Selected benchmark; defaults to rxdrop. */
static enum benchmark_type opt_bench = BENCH_RXDROP;
/* Flags passed to bpf_get_link_xdp_id()/bpf_set_link_xdp_fd() and to the
 * socket config; the SKB/DRV mode bits are reported by print_benchmark().
 */
static u32 opt_xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
/* Interface name and queue id handed to xsk_socket__create(). */
static const char *opt_if = "";
/* Interface index used when querying/removing the attached XDP program. */
static int opt_ifindex;
static int opt_queue;
/* Benchmark duration compared against a get_nsecs() delta; 0 disables the
 * time limit (see is_benchmark_done()).
 */
static unsigned long opt_duration;
static unsigned long start_time;
/* Set by int_exit() or when opt_duration has elapsed. */
static bool benchmark_done;
static u32 opt_batch_size = 64;
static int opt_pkt_count;
/* Size used by the PKT_SIZE family of macros when building the TX frame. */
static u16 opt_pkt_size = MIN_PKT_SIZE;
/* 32-bit pattern written into the UDP payload by gen_eth_hdr_data(). */
static u32 opt_pkt_fill_pattern = 0x12345678;
/* Enables the XDP_STATISTICS rows in dump_stats(). */
static bool opt_extra_stats;
static bool opt_quiet;
/* Enables dump_app_stats() output from dump_stats(). */
static bool opt_app_stats;
/* Substring searched for in /proc/interrupts by get_interrupt_number(). */
static const char *opt_irq_str = "";
/* IRQ number parsed from the matching /proc/interrupts line; non-zero
 * enables dump_driver_stats().
 */
static u32 irq_no;
/* Baseline subtracted from the get_irqs() total in dump_driver_stats(). */
static int irqs_at_init = -1;
/* Non-zero makes print_benchmark() report "poll()". */
static int opt_poll;
/* Argument to sleep() between statistics dumps in poller(). */
static int opt_interval = 1;
/* Bind flags copied into the socket configuration. */
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
/* Flags copied into the UMEM configuration. */
static u32 opt_umem_flags;
static int opt_unaligned_chunks;
static int opt_mmap_flags;
/* UMEM frame size; also the stride used when populating the fill ring. */
static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
static int opt_timeout = 1000;
static bool opt_need_wakeup = true;
/* When greater than 1, sockets are created with
 * XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD.
 */
static u32 opt_num_xsks = 1;
/* Id of the XDP program found on the interface after socket creation;
 * remove_xdp_program() only detaches if the current id still matches.
 */
static u32 prog_id;
  98
/*
 * Per-socket packet counters.  Each prev_* member holds the value of the
 * matching counter at the previous dump_stats() call so a rate can be
 * derived from the difference.
 */
struct xsk_ring_stats {
        /* rx/tx packet totals; NOTE(review): updated outside this chunk. */
        unsigned long rx_npkts;
        unsigned long tx_npkts;
        /* The six counters below are copied from the kernel's
         * struct xdp_statistics by xsk_get_xdp_stats().
         */
        unsigned long rx_dropped_npkts;
        unsigned long rx_invalid_npkts;
        unsigned long tx_invalid_npkts;
        unsigned long rx_full_npkts;
        unsigned long rx_fill_empty_npkts;
        unsigned long tx_empty_npkts;
        /* Snapshots taken at the previous dump. */
        unsigned long prev_rx_npkts;
        unsigned long prev_tx_npkts;
        unsigned long prev_rx_dropped_npkts;
        unsigned long prev_rx_invalid_npkts;
        unsigned long prev_tx_invalid_npkts;
        unsigned long prev_rx_full_npkts;
        unsigned long prev_rx_fill_empty_npkts;
        unsigned long prev_tx_empty_npkts;
};
 117
/* Interrupt statistics maintained by dump_driver_stats(). */
struct xsk_driver_stats {
        /* get_irqs() total minus irqs_at_init. */
        unsigned long intrs;
        /* Value of intrs at the previous dump. */
        unsigned long prev_intrs;
};
 122
/*
 * Application-level call counters printed by dump_app_stats().  The prev_*
 * members are the snapshots from the previous dump.
 * NOTE(review): the counters are incremented outside this chunk.
 */
struct xsk_app_stats {
        unsigned long rx_empty_polls;
        unsigned long fill_fail_polls;
        unsigned long copy_tx_sendtos;
        unsigned long tx_wakeup_sendtos;
        unsigned long opt_polls;
        unsigned long prev_rx_empty_polls;
        unsigned long prev_fill_fail_polls;
        unsigned long prev_copy_tx_sendtos;
        unsigned long prev_tx_wakeup_sendtos;
        unsigned long prev_opt_polls;
};
 135
/* A UMEM handle together with its rings, as set up by xsk_configure_umem(). */
struct xsk_umem_info {
        /* Fill ring passed to xsk_umem__create(). */
        struct xsk_ring_prod fq;
        /* Completion ring passed to xsk_umem__create(). */
        struct xsk_ring_cons cq;
        struct xsk_umem *umem;
        /* Start of the packet buffer area the UMEM was created over. */
        void *buffer;
};
 142
/* One AF_XDP socket plus its rings and statistics. */
struct xsk_socket_info {
        /* RX/TX rings handed to xsk_socket__create() when requested. */
        struct xsk_ring_cons rx;
        struct xsk_ring_prod tx;
        /* UMEM this socket was created on. */
        struct xsk_umem_info *umem;
        struct xsk_socket *xsk;
        struct xsk_ring_stats ring_stats;
        struct xsk_app_stats app_stats;
        struct xsk_driver_stats drv_stats;
        /* NOTE(review): presumably TX descriptors awaiting completion;
         * maintained outside this chunk -- confirm.
         */
        u32 outstanding_tx;
};
 153
/* Number of entries of xsks[] walked by the stats and cleanup loops. */
static int num_socks;
/* Socket table; the stats loops also stop at the first NULL entry. */
struct xsk_socket_info *xsks[MAX_SOCKS];
 156
 157static unsigned long get_nsecs(void)
 158{
 159        struct timespec ts;
 160
 161        clock_gettime(CLOCK_MONOTONIC, &ts);
 162        return ts.tv_sec * 1000000000UL + ts.tv_nsec;
 163}
 164
 165static void print_benchmark(bool running)
 166{
 167        const char *bench_str = "INVALID";
 168
 169        if (opt_bench == BENCH_RXDROP)
 170                bench_str = "rxdrop";
 171        else if (opt_bench == BENCH_TXONLY)
 172                bench_str = "txonly";
 173        else if (opt_bench == BENCH_L2FWD)
 174                bench_str = "l2fwd";
 175
 176        printf("%s:%d %s ", opt_if, opt_queue, bench_str);
 177        if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
 178                printf("xdp-skb ");
 179        else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
 180                printf("xdp-drv ");
 181        else
 182                printf("        ");
 183
 184        if (opt_poll)
 185                printf("poll() ");
 186
 187        if (running) {
 188                printf("running...");
 189                fflush(stdout);
 190        }
 191}
 192
 193static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk)
 194{
 195        struct xdp_statistics stats;
 196        socklen_t optlen;
 197        int err;
 198
 199        optlen = sizeof(stats);
 200        err = getsockopt(fd, SOL_XDP, XDP_STATISTICS, &stats, &optlen);
 201        if (err)
 202                return err;
 203
 204        if (optlen == sizeof(struct xdp_statistics)) {
 205                xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped;
 206                xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs;
 207                xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs;
 208                xsk->ring_stats.rx_full_npkts = stats.rx_ring_full;
 209                xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
 210                xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs;
 211                return 0;
 212        }
 213
 214        return -EINVAL;
 215}
 216
 217static void dump_app_stats(long dt)
 218{
 219        int i;
 220
 221        for (i = 0; i < num_socks && xsks[i]; i++) {
 222                char *fmt = "%-18s %'-14.0f %'-14lu\n";
 223                double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps,
 224                                tx_wakeup_sendtos_ps, opt_polls_ps;
 225
 226                rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls -
 227                                        xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt;
 228                fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls -
 229                                        xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt;
 230                copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos -
 231                                        xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt;
 232                tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos -
 233                                        xsks[i]->app_stats.prev_tx_wakeup_sendtos)
 234                                                                                * 1000000000. / dt;
 235                opt_polls_ps = (xsks[i]->app_stats.opt_polls -
 236                                        xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt;
 237
 238                printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count");
 239                printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls);
 240                printf(fmt, "fill fail polls", fill_fail_polls_ps,
 241                                                        xsks[i]->app_stats.fill_fail_polls);
 242                printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps,
 243                                                        xsks[i]->app_stats.copy_tx_sendtos);
 244                printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps,
 245                                                        xsks[i]->app_stats.tx_wakeup_sendtos);
 246                printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls);
 247
 248                xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls;
 249                xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls;
 250                xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos;
 251                xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos;
 252                xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls;
 253        }
 254}
 255
 256static bool get_interrupt_number(void)
 257{
 258        FILE *f_int_proc;
 259        char line[4096];
 260        bool found = false;
 261
 262        f_int_proc = fopen("/proc/interrupts", "r");
 263        if (f_int_proc == NULL) {
 264                printf("Failed to open /proc/interrupts.\n");
 265                return found;
 266        }
 267
 268        while (!feof(f_int_proc) && !found) {
 269                /* Make sure to read a full line at a time */
 270                if (fgets(line, sizeof(line), f_int_proc) == NULL ||
 271                                line[strlen(line) - 1] != '\n') {
 272                        printf("Error reading from interrupts file\n");
 273                        break;
 274                }
 275
 276                /* Extract interrupt number from line */
 277                if (strstr(line, opt_irq_str) != NULL) {
 278                        irq_no = atoi(line);
 279                        found = true;
 280                        break;
 281                }
 282        }
 283
 284        fclose(f_int_proc);
 285
 286        return found;
 287}
 288
 289static int get_irqs(void)
 290{
 291        char count_path[PATH_MAX];
 292        int total_intrs = -1;
 293        FILE *f_count_proc;
 294        char line[4096];
 295
 296        snprintf(count_path, sizeof(count_path),
 297                "/sys/kernel/irq/%i/per_cpu_count", irq_no);
 298        f_count_proc = fopen(count_path, "r");
 299        if (f_count_proc == NULL) {
 300                printf("Failed to open %s\n", count_path);
 301                return total_intrs;
 302        }
 303
 304        if (fgets(line, sizeof(line), f_count_proc) == NULL ||
 305                        line[strlen(line) - 1] != '\n') {
 306                printf("Error reading from %s\n", count_path);
 307        } else {
 308                static const char com[2] = ",";
 309                char *token;
 310
 311                total_intrs = 0;
 312                token = strtok(line, com);
 313                while (token != NULL) {
 314                        /* sum up interrupts across all cores */
 315                        total_intrs += atoi(token);
 316                        token = strtok(NULL, com);
 317                }
 318        }
 319
 320        fclose(f_count_proc);
 321
 322        return total_intrs;
 323}
 324
 325static void dump_driver_stats(long dt)
 326{
 327        int i;
 328
 329        for (i = 0; i < num_socks && xsks[i]; i++) {
 330                char *fmt = "%-18s %'-14.0f %'-14lu\n";
 331                double intrs_ps;
 332                int n_ints = get_irqs();
 333
 334                if (n_ints < 0) {
 335                        printf("error getting intr info for intr %i\n", irq_no);
 336                        return;
 337                }
 338                xsks[i]->drv_stats.intrs = n_ints - irqs_at_init;
 339
 340                intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) *
 341                         1000000000. / dt;
 342
 343                printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count");
 344                printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs);
 345
 346                xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs;
 347        }
 348}
 349
 350static void dump_stats(void)
 351{
 352        unsigned long now = get_nsecs();
 353        long dt = now - prev_time;
 354        int i;
 355
 356        prev_time = now;
 357
 358        for (i = 0; i < num_socks && xsks[i]; i++) {
 359                char *fmt = "%-18s %'-14.0f %'-14lu\n";
 360                double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps,
 361                        tx_invalid_pps, tx_empty_pps;
 362
 363                rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) *
 364                         1000000000. / dt;
 365                tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) *
 366                         1000000000. / dt;
 367
 368                printf("\n sock%d@", i);
 369                print_benchmark(false);
 370                printf("\n");
 371
 372                printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts",
 373                       dt / 1000000000.);
 374                printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts);
 375                printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts);
 376
 377                xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts;
 378                xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts;
 379
 380                if (opt_extra_stats) {
 381                        if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) {
 382                                dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts -
 383                                                xsks[i]->ring_stats.prev_rx_dropped_npkts) *
 384                                                        1000000000. / dt;
 385                                rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts -
 386                                                xsks[i]->ring_stats.prev_rx_invalid_npkts) *
 387                                                        1000000000. / dt;
 388                                tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts -
 389                                                xsks[i]->ring_stats.prev_tx_invalid_npkts) *
 390                                                        1000000000. / dt;
 391                                full_pps = (xsks[i]->ring_stats.rx_full_npkts -
 392                                                xsks[i]->ring_stats.prev_rx_full_npkts) *
 393                                                        1000000000. / dt;
 394                                fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts -
 395                                                xsks[i]->ring_stats.prev_rx_fill_empty_npkts) *
 396                                                        1000000000. / dt;
 397                                tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts -
 398                                                xsks[i]->ring_stats.prev_tx_empty_npkts) *
 399                                                        1000000000. / dt;
 400
 401                                printf(fmt, "rx dropped", dropped_pps,
 402                                       xsks[i]->ring_stats.rx_dropped_npkts);
 403                                printf(fmt, "rx invalid", rx_invalid_pps,
 404                                       xsks[i]->ring_stats.rx_invalid_npkts);
 405                                printf(fmt, "tx invalid", tx_invalid_pps,
 406                                       xsks[i]->ring_stats.tx_invalid_npkts);
 407                                printf(fmt, "rx queue full", full_pps,
 408                                       xsks[i]->ring_stats.rx_full_npkts);
 409                                printf(fmt, "fill ring empty", fill_empty_pps,
 410                                       xsks[i]->ring_stats.rx_fill_empty_npkts);
 411                                printf(fmt, "tx ring empty", tx_empty_pps,
 412                                       xsks[i]->ring_stats.tx_empty_npkts);
 413
 414                                xsks[i]->ring_stats.prev_rx_dropped_npkts =
 415                                        xsks[i]->ring_stats.rx_dropped_npkts;
 416                                xsks[i]->ring_stats.prev_rx_invalid_npkts =
 417                                        xsks[i]->ring_stats.rx_invalid_npkts;
 418                                xsks[i]->ring_stats.prev_tx_invalid_npkts =
 419                                        xsks[i]->ring_stats.tx_invalid_npkts;
 420                                xsks[i]->ring_stats.prev_rx_full_npkts =
 421                                        xsks[i]->ring_stats.rx_full_npkts;
 422                                xsks[i]->ring_stats.prev_rx_fill_empty_npkts =
 423                                        xsks[i]->ring_stats.rx_fill_empty_npkts;
 424                                xsks[i]->ring_stats.prev_tx_empty_npkts =
 425                                        xsks[i]->ring_stats.tx_empty_npkts;
 426                        } else {
 427                                printf("%-15s\n", "Error retrieving extra stats");
 428                        }
 429                }
 430        }
 431
 432        if (opt_app_stats)
 433                dump_app_stats(dt);
 434        if (irq_no)
 435                dump_driver_stats(dt);
 436}
 437
 438static bool is_benchmark_done(void)
 439{
 440        if (opt_duration > 0) {
 441                unsigned long dt = (get_nsecs() - start_time);
 442
 443                if (dt >= opt_duration)
 444                        benchmark_done = true;
 445        }
 446        return benchmark_done;
 447}
 448
 449static void *poller(void *arg)
 450{
 451        (void)arg;
 452        while (!is_benchmark_done()) {
 453                sleep(opt_interval);
 454                dump_stats();
 455        }
 456
 457        return NULL;
 458}
 459
 460static void remove_xdp_program(void)
 461{
 462        u32 curr_prog_id = 0;
 463
 464        if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
 465                printf("bpf_get_link_xdp_id failed\n");
 466                exit(EXIT_FAILURE);
 467        }
 468        if (prog_id == curr_prog_id)
 469                bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
 470        else if (!curr_prog_id)
 471                printf("couldn't find a prog id on a given interface\n");
 472        else
 473                printf("program on interface changed, not removing\n");
 474}
 475
 476static void int_exit(int sig)
 477{
 478        benchmark_done = true;
 479}
 480
 481static void xdpsock_cleanup(void)
 482{
 483        struct xsk_umem *umem = xsks[0]->umem->umem;
 484        int i;
 485
 486        dump_stats();
 487        for (i = 0; i < num_socks; i++)
 488                xsk_socket__delete(xsks[i]->xsk);
 489        (void)xsk_umem__delete(umem);
 490        remove_xdp_program();
 491}
 492
 493static void __exit_with_error(int error, const char *file, const char *func,
 494                              int line)
 495{
 496        fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
 497                line, error, strerror(error));
 498        dump_stats();
 499        remove_xdp_program();
 500        exit(EXIT_FAILURE);
 501}
 502
 503#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
 504                                                 __LINE__)
 505static void swap_mac_addresses(void *data)
 506{
 507        struct ether_header *eth = (struct ether_header *)data;
 508        struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
 509        struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
 510        struct ether_addr tmp;
 511
 512        tmp = *src_addr;
 513        *src_addr = *dst_addr;
 514        *dst_addr = tmp;
 515}
 516
/*
 * Debug helper: print @length bytes starting at @pkt as hex, 32 bytes per
 * row, each row prefixed with "addr=<addr>" and followed by a printable
 * rendering of the same bytes.  Does nothing unless DEBUG_HEXDUMP is
 * non-zero.
 *
 * @pkt:    start of the data to dump.
 * @length: number of bytes to dump.
 * @addr:   value shown in the row prefix (not dereferenced).
 */
static void hex_dump(void *pkt, size_t length, u64 addr)
{
        const unsigned char *address = (unsigned char *)pkt;
        const unsigned char *line = address;
        size_t line_size = 32;
        unsigned char c;
        char buf[32];
        int i = 0;

        if (!DEBUG_HEXDUMP)
                return;

        /* "addr=" plus at most 20 decimal digits fits in buf. */
        sprintf(buf, "addr=%llu", addr);
        printf("length = %zu\n", length);
        printf("%s | ", buf);
        while (length-- > 0) {
                printf("%02X ", *address++);
                /* Close the row when it is full, or when the data ends
                 * part-way through a row.
                 */
                if (!(++i % line_size) || (length == 0 && i % line_size)) {
                        if (length == 0) {
                                /* Pad a short final row to full width. */
                                while (i++ % line_size)
                                        printf("__ ");
                        }
                        printf(" | ");  /* right close */
                        /* Text column: bytes below 33 and 255 print as '.' */
                        while (line < address) {
                                c = *line++;
                                printf("%c", (c < 33 || c == 255) ? 0x2E : c);
                        }
                        printf("\n");
                        if (length > 0)
                                printf("%s | ", buf);
                }
        }
        printf("\n");
}
 551
 552static void *memset32_htonl(void *dest, u32 val, u32 size)
 553{
 554        u32 *ptr = (u32 *)dest;
 555        int i;
 556
 557        val = htonl(val);
 558
 559        for (i = 0; i < (size & (~0x3)); i += 4)
 560                ptr[i >> 2] = val;
 561
 562        for (; i < size; i++)
 563                ((char *)dest)[i] = ((char *)&val)[i & 3];
 564
 565        return dest;
 566}
 567
 568/*
 569 * This function code has been taken from
 570 * Linux kernel lib/checksum.c
 571 */
 572static inline unsigned short from32to16(unsigned int x)
 573{
 574        /* add up 16-bit and 16-bit for 16+c bit */
 575        x = (x & 0xffff) + (x >> 16);
 576        /* add up carry.. */
 577        x = (x & 0xffff) + (x >> 16);
 578        return x;
 579}
 580
/*
 * Compute a 16-bit ones'-complement style sum over @len bytes at @buff.
 * Returns 0 for len <= 0.  The result is not inverted; callers do that.
 *
 * This function code has been taken from
 * Linux kernel lib/checksum.c
 *
 * NOTE(review): the byte-order branches test __LITTLE_ENDIAN with #ifdef,
 * while csum_tcpudp_nofold() in this file tests __BIG_ENDIAN__; confirm
 * which macros the build environment actually defines.
 */
static unsigned int do_csum(const unsigned char *buff, int len)
{
        unsigned int result = 0;
        int odd;

        if (len <= 0)
                goto out;
        /* Buffer starts at an odd address: take one byte first so the
         * following 16-bit loads fall on even addresses.
         */
        odd = 1 & (unsigned long)buff;
        if (odd) {
#ifdef __LITTLE_ENDIAN
                result += (*buff << 8);
#else
                result = *buff;
#endif
                len--;
                buff++;
        }
        if (len >= 2) {
                /* Take one 16-bit word if needed to reach a 4-byte boundary. */
                if (2 & (unsigned long)buff) {
                        result += *(unsigned short *)buff;
                        len -= 2;
                        buff += 2;
                }
                if (len >= 4) {
                        const unsigned char *end = buff +
                                                   ((unsigned int)len & ~3);
                        unsigned int carry = 0;

                        /* Sum 32-bit words, carrying overflow into the
                         * next addition.
                         */
                        do {
                                unsigned int w = *(unsigned int *)buff;

                                buff += 4;
                                result += carry;
                                result += w;
                                carry = (w > result);
                        } while (buff < end);
                        result += carry;
                        result = (result & 0xffff) + (result >> 16);
                }
                /* Remaining 16-bit word, if any. */
                if (len & 2) {
                        result += *(unsigned short *)buff;
                        buff += 2;
                }
        }
        /* Remaining single byte, if any. */
        if (len & 1)
#ifdef __LITTLE_ENDIAN
                result += *buff;
#else
                result += (*buff << 8);
#endif
        result = from32to16(result);
        /* Undo the byte shift introduced by the odd start address. */
        if (odd)
                result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
out:
        return result;
}
 641
 642__sum16 ip_fast_csum(const void *iph, unsigned int ihl);
 643
 644/*
 645 *      This is a version of ip_compute_csum() optimized for IP headers,
 646 *      which always checksum on 4 octet boundaries.
 647 *      This function code has been taken from
 648 *      Linux kernel lib/checksum.c
 649 */
 650__sum16 ip_fast_csum(const void *iph, unsigned int ihl)
 651{
 652        return (__force __sum16)~do_csum(iph, ihl * 4);
 653}
 654
 655/*
 656 * Fold a partial checksum
 657 * This function code has been taken from
 658 * Linux kernel include/asm-generic/checksum.h
 659 */
 660static inline __sum16 csum_fold(__wsum csum)
 661{
 662        u32 sum = (__force u32)csum;
 663
 664        sum = (sum & 0xffff) + (sum >> 16);
 665        sum = (sum & 0xffff) + (sum >> 16);
 666        return (__force __sum16)~sum;
 667}
 668
 669/*
 670 * This function code has been taken from
 671 * Linux kernel lib/checksum.c
 672 */
 673static inline u32 from64to32(u64 x)
 674{
 675        /* add up 32-bit and 32-bit for 32+c bit */
 676        x = (x & 0xffffffff) + (x >> 32);
 677        /* add up carry.. */
 678        x = (x & 0xffffffff) + (x >> 32);
 679        return (u32)x;
 680}
 681
 682__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 683                          __u32 len, __u8 proto, __wsum sum);
 684
 685/*
 686 * This function code has been taken from
 687 * Linux kernel lib/checksum.c
 688 */
 689__wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
 690                          __u32 len, __u8 proto, __wsum sum)
 691{
 692        unsigned long long s = (__force u32)sum;
 693
 694        s += (__force u32)saddr;
 695        s += (__force u32)daddr;
 696#ifdef __BIG_ENDIAN__
 697        s += proto + len;
 698#else
 699        s += (proto + len) << 8;
 700#endif
 701        return (__force __wsum)from64to32(s);
 702}
 703
 704/*
 705 * This function has been taken from
 706 * Linux kernel include/asm-generic/checksum.h
 707 */
 708static inline __sum16
 709csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
 710                  __u8 proto, __wsum sum)
 711{
 712        return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
 713}
 714
 715static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len,
 716                           u8 proto, u16 *udp_pkt)
 717{
 718        u32 csum = 0;
 719        u32 cnt = 0;
 720
 721        /* udp hdr and data */
 722        for (; cnt < len; cnt += 2)
 723                csum += udp_pkt[cnt >> 1];
 724
 725        return csum_tcpudp_magic(saddr, daddr, len, proto, csum);
 726}
 727
/* Bytes subtracted from opt_pkt_size to obtain the generated frame length. */
#define ETH_FCS_SIZE 4

/* Combined size of the Ethernet, IPv4 and UDP headers of the TX packet. */
#define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
                      sizeof(struct udphdr))

/* Lengths of the generated frame and of each nested layer inside it, all
 * derived from opt_pkt_size at the time the macro is evaluated.
 */
#define PKT_SIZE                (opt_pkt_size - ETH_FCS_SIZE)
#define IP_PKT_SIZE             (PKT_SIZE - sizeof(struct ethhdr))
#define UDP_PKT_SIZE            (IP_PKT_SIZE - sizeof(struct iphdr))
#define UDP_PKT_DATA_SIZE       (UDP_PKT_SIZE - sizeof(struct udphdr))

/* Template frame built by gen_eth_hdr_data() and copied by gen_eth_frame(). */
static u8 pkt_data[XSK_UMEM__DEFAULT_FRAME_SIZE];
 739
 740static void gen_eth_hdr_data(void)
 741{
 742        struct udphdr *udp_hdr = (struct udphdr *)(pkt_data +
 743                                                   sizeof(struct ethhdr) +
 744                                                   sizeof(struct iphdr));
 745        struct iphdr *ip_hdr = (struct iphdr *)(pkt_data +
 746                                                sizeof(struct ethhdr));
 747        struct ethhdr *eth_hdr = (struct ethhdr *)pkt_data;
 748
 749        /* ethernet header */
 750        memcpy(eth_hdr->h_dest, "\x3c\xfd\xfe\x9e\x7f\x71", ETH_ALEN);
 751        memcpy(eth_hdr->h_source, "\xec\xb1\xd7\x98\x3a\xc0", ETH_ALEN);
 752        eth_hdr->h_proto = htons(ETH_P_IP);
 753
 754        /* IP header */
 755        ip_hdr->version = IPVERSION;
 756        ip_hdr->ihl = 0x5; /* 20 byte header */
 757        ip_hdr->tos = 0x0;
 758        ip_hdr->tot_len = htons(IP_PKT_SIZE);
 759        ip_hdr->id = 0;
 760        ip_hdr->frag_off = 0;
 761        ip_hdr->ttl = IPDEFTTL;
 762        ip_hdr->protocol = IPPROTO_UDP;
 763        ip_hdr->saddr = htonl(0x0a0a0a10);
 764        ip_hdr->daddr = htonl(0x0a0a0a20);
 765
 766        /* IP header checksum */
 767        ip_hdr->check = 0;
 768        ip_hdr->check = ip_fast_csum((const void *)ip_hdr, ip_hdr->ihl);
 769
 770        /* UDP header */
 771        udp_hdr->source = htons(0x1000);
 772        udp_hdr->dest = htons(0x1000);
 773        udp_hdr->len = htons(UDP_PKT_SIZE);
 774
 775        /* UDP data */
 776        memset32_htonl(pkt_data + PKT_HDR_SIZE, opt_pkt_fill_pattern,
 777                       UDP_PKT_DATA_SIZE);
 778
 779        /* UDP header checksum */
 780        udp_hdr->check = 0;
 781        udp_hdr->check = udp_csum(ip_hdr->saddr, ip_hdr->daddr, UDP_PKT_SIZE,
 782                                  IPPROTO_UDP, (u16 *)udp_hdr);
 783}
 784
 785static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
 786{
 787        memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data,
 788               PKT_SIZE);
 789}
 790
 791static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
 792{
 793        struct xsk_umem_info *umem;
 794        struct xsk_umem_config cfg = {
 795                /* We recommend that you set the fill ring size >= HW RX ring size +
 796                 * AF_XDP RX ring size. Make sure you fill up the fill ring
 797                 * with buffers at regular intervals, and you will with this setting
 798                 * avoid allocation failures in the driver. These are usually quite
 799                 * expensive since drivers have not been written to assume that
 800                 * allocation failures are common. For regular sockets, kernel
 801                 * allocated memory is used that only runs out in OOM situations
 802                 * that should be rare.
 803                 */
 804                .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2,
 805                .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
 806                .frame_size = opt_xsk_frame_size,
 807                .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
 808                .flags = opt_umem_flags
 809        };
 810        int ret;
 811
 812        umem = calloc(1, sizeof(*umem));
 813        if (!umem)
 814                exit_with_error(errno);
 815
 816        ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
 817                               &cfg);
 818        if (ret)
 819                exit_with_error(-ret);
 820
 821        umem->buffer = buffer;
 822        return umem;
 823}
 824
 825static void xsk_populate_fill_ring(struct xsk_umem_info *umem)
 826{
 827        int ret, i;
 828        u32 idx;
 829
 830        ret = xsk_ring_prod__reserve(&umem->fq,
 831                                     XSK_RING_PROD__DEFAULT_NUM_DESCS * 2, &idx);
 832        if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS * 2)
 833                exit_with_error(-ret);
 834        for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS * 2; i++)
 835                *xsk_ring_prod__fill_addr(&umem->fq, idx++) =
 836                        i * opt_xsk_frame_size;
 837        xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS * 2);
 838}
 839
 840static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
 841                                                    bool rx, bool tx)
 842{
 843        struct xsk_socket_config cfg;
 844        struct xsk_socket_info *xsk;
 845        struct xsk_ring_cons *rxr;
 846        struct xsk_ring_prod *txr;
 847        int ret;
 848
 849        xsk = calloc(1, sizeof(*xsk));
 850        if (!xsk)
 851                exit_with_error(errno);
 852
 853        xsk->umem = umem;
 854        cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
 855        cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
 856        if (opt_num_xsks > 1)
 857                cfg.libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD;
 858        else
 859                cfg.libbpf_flags = 0;
 860        cfg.xdp_flags = opt_xdp_flags;
 861        cfg.bind_flags = opt_xdp_bind_flags;
 862
 863        rxr = rx ? &xsk->rx : NULL;
 864        txr = tx ? &xsk->tx : NULL;
 865        ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem,
 866                                 rxr, txr, &cfg);
 867        if (ret)
 868                exit_with_error(-ret);
 869
 870        ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
 871        if (ret)
 872                exit_with_error(-ret);
 873
 874        xsk->app_stats.rx_empty_polls = 0;
 875        xsk->app_stats.fill_fail_polls = 0;
 876        xsk->app_stats.copy_tx_sendtos = 0;
 877        xsk->app_stats.tx_wakeup_sendtos = 0;
 878        xsk->app_stats.opt_polls = 0;
 879        xsk->app_stats.prev_rx_empty_polls = 0;
 880        xsk->app_stats.prev_fill_fail_polls = 0;
 881        xsk->app_stats.prev_copy_tx_sendtos = 0;
 882        xsk->app_stats.prev_tx_wakeup_sendtos = 0;
 883        xsk->app_stats.prev_opt_polls = 0;
 884
 885        return xsk;
 886}
 887
 888static struct option long_options[] = {
 889        {"rxdrop", no_argument, 0, 'r'},
 890        {"txonly", no_argument, 0, 't'},
 891        {"l2fwd", no_argument, 0, 'l'},
 892        {"interface", required_argument, 0, 'i'},
 893        {"queue", required_argument, 0, 'q'},
 894        {"poll", no_argument, 0, 'p'},
 895        {"xdp-skb", no_argument, 0, 'S'},
 896        {"xdp-native", no_argument, 0, 'N'},
 897        {"interval", required_argument, 0, 'n'},
 898        {"zero-copy", no_argument, 0, 'z'},
 899        {"copy", no_argument, 0, 'c'},
 900        {"frame-size", required_argument, 0, 'f'},
 901        {"no-need-wakeup", no_argument, 0, 'm'},
 902        {"unaligned", no_argument, 0, 'u'},
 903        {"shared-umem", no_argument, 0, 'M'},
 904        {"force", no_argument, 0, 'F'},
 905        {"duration", required_argument, 0, 'd'},
 906        {"batch-size", required_argument, 0, 'b'},
 907        {"tx-pkt-count", required_argument, 0, 'C'},
 908        {"tx-pkt-size", required_argument, 0, 's'},
 909        {"tx-pkt-pattern", required_argument, 0, 'P'},
 910        {"extra-stats", no_argument, 0, 'x'},
 911        {"quiet", no_argument, 0, 'Q'},
 912        {"app-stats", no_argument, 0, 'a'},
 913        {"irq-string", no_argument, 0, 'I'},
 914        {0, 0, 0, 0}
 915};
 916
 917static void usage(const char *prog)
 918{
 919        const char *str =
 920                "  Usage: %s [OPTIONS]\n"
 921                "  Options:\n"
 922                "  -r, --rxdrop         Discard all incoming packets (default)\n"
 923                "  -t, --txonly         Only send packets\n"
 924                "  -l, --l2fwd          MAC swap L2 forwarding\n"
 925                "  -i, --interface=n    Run on interface n\n"
 926                "  -q, --queue=n        Use queue n (default 0)\n"
 927                "  -p, --poll           Use poll syscall\n"
 928                "  -S, --xdp-skb=n      Use XDP skb-mod\n"
 929                "  -N, --xdp-native=n   Enforce XDP native mode\n"
 930                "  -n, --interval=n     Specify statistics update interval (default 1 sec).\n"
 931                "  -z, --zero-copy      Force zero-copy mode.\n"
 932                "  -c, --copy           Force copy mode.\n"
 933                "  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
 934                "  -f, --frame-size=n   Set the frame size (must be a power of two in aligned mode, default is %d).\n"
 935                "  -u, --unaligned      Enable unaligned chunk placement\n"
 936                "  -M, --shared-umem    Enable XDP_SHARED_UMEM\n"
 937                "  -F, --force          Force loading the XDP prog\n"
 938                "  -d, --duration=n     Duration in secs to run command.\n"
 939                "                       Default: forever.\n"
 940                "  -b, --batch-size=n   Batch size for sending or receiving\n"
 941                "                       packets. Default: %d\n"
 942                "  -C, --tx-pkt-count=n Number of packets to send.\n"
 943                "                       Default: Continuous packets.\n"
 944                "  -s, --tx-pkt-size=n  Transmit packet size.\n"
 945                "                       (Default: %d bytes)\n"
 946                "                       Min size: %d, Max size %d.\n"
 947                "  -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n"
 948                "  -x, --extra-stats    Display extra statistics.\n"
 949                "  -Q, --quiet          Do not display any stats.\n"
 950                "  -a, --app-stats      Display application (syscall) statistics.\n"
 951                "  -I, --irq-string     Display driver interrupt statistics for interface associated with irq-string.\n"
 952                "\n";
 953        fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
 954                opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE,
 955                XSK_UMEM__DEFAULT_FRAME_SIZE, opt_pkt_fill_pattern);
 956
 957        exit(EXIT_FAILURE);
 958}
 959
 960static void parse_command_line(int argc, char **argv)
 961{
 962        int option_index, c;
 963
 964        opterr = 0;
 965
 966        for (;;) {
 967                c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:",
 968                                long_options, &option_index);
 969                if (c == -1)
 970                        break;
 971
 972                switch (c) {
 973                case 'r':
 974                        opt_bench = BENCH_RXDROP;
 975                        break;
 976                case 't':
 977                        opt_bench = BENCH_TXONLY;
 978                        break;
 979                case 'l':
 980                        opt_bench = BENCH_L2FWD;
 981                        break;
 982                case 'i':
 983                        opt_if = optarg;
 984                        break;
 985                case 'q':
 986                        opt_queue = atoi(optarg);
 987                        break;
 988                case 'p':
 989                        opt_poll = 1;
 990                        break;
 991                case 'S':
 992                        opt_xdp_flags |= XDP_FLAGS_SKB_MODE;
 993                        opt_xdp_bind_flags |= XDP_COPY;
 994                        break;
 995                case 'N':
 996                        /* default, set below */
 997                        break;
 998                case 'n':
 999                        opt_interval = atoi(optarg);
1000                        break;
1001                case 'z':
1002                        opt_xdp_bind_flags |= XDP_ZEROCOPY;
1003                        break;
1004                case 'c':
1005                        opt_xdp_bind_flags |= XDP_COPY;
1006                        break;
1007                case 'u':
1008                        opt_umem_flags |= XDP_UMEM_UNALIGNED_CHUNK_FLAG;
1009                        opt_unaligned_chunks = 1;
1010                        opt_mmap_flags = MAP_HUGETLB;
1011                        break;
1012                case 'F':
1013                        opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST;
1014                        break;
1015                case 'f':
1016                        opt_xsk_frame_size = atoi(optarg);
1017                        break;
1018                case 'm':
1019                        opt_need_wakeup = false;
1020                        opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
1021                        break;
1022                case 'M':
1023                        opt_num_xsks = MAX_SOCKS;
1024                        break;
1025                case 'd':
1026                        opt_duration = atoi(optarg);
1027                        opt_duration *= 1000000000;
1028                        break;
1029                case 'b':
1030                        opt_batch_size = atoi(optarg);
1031                        break;
1032                case 'C':
1033                        opt_pkt_count = atoi(optarg);
1034                        break;
1035                case 's':
1036                        opt_pkt_size = atoi(optarg);
1037                        if (opt_pkt_size > (XSK_UMEM__DEFAULT_FRAME_SIZE) ||
1038                            opt_pkt_size < MIN_PKT_SIZE) {
1039                                fprintf(stderr,
1040                                        "ERROR: Invalid frame size %d\n",
1041                                        opt_pkt_size);
1042                                usage(basename(argv[0]));
1043                        }
1044                        break;
1045                case 'P':
1046                        opt_pkt_fill_pattern = strtol(optarg, NULL, 16);
1047                        break;
1048                case 'x':
1049                        opt_extra_stats = 1;
1050                        break;
1051                case 'Q':
1052                        opt_quiet = 1;
1053                        break;
1054                case 'a':
1055                        opt_app_stats = 1;
1056                        break;
1057                case 'I':
1058                        opt_irq_str = optarg;
1059                        if (get_interrupt_number())
1060                                irqs_at_init = get_irqs();
1061                        if (irqs_at_init < 0) {
1062                                fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str);
1063                                usage(basename(argv[0]));
1064                        }
1065
1066                        break;
1067                default:
1068                        usage(basename(argv[0]));
1069                }
1070        }
1071
1072        if (!(opt_xdp_flags & XDP_FLAGS_SKB_MODE))
1073                opt_xdp_flags |= XDP_FLAGS_DRV_MODE;
1074
1075        opt_ifindex = if_nametoindex(opt_if);
1076        if (!opt_ifindex) {
1077                fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
1078                        opt_if);
1079                usage(basename(argv[0]));
1080        }
1081
1082        if ((opt_xsk_frame_size & (opt_xsk_frame_size - 1)) &&
1083            !opt_unaligned_chunks) {
1084                fprintf(stderr, "--frame-size=%d is not a power of two\n",
1085                        opt_xsk_frame_size);
1086                usage(basename(argv[0]));
1087        }
1088}
1089
1090static void kick_tx(struct xsk_socket_info *xsk)
1091{
1092        int ret;
1093
1094        ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
1095        if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN ||
1096            errno == EBUSY || errno == ENETDOWN)
1097                return;
1098        exit_with_error(errno);
1099}
1100
1101static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
1102                                     struct pollfd *fds)
1103{
1104        struct xsk_umem_info *umem = xsk->umem;
1105        u32 idx_cq = 0, idx_fq = 0;
1106        unsigned int rcvd;
1107        size_t ndescs;
1108
1109        if (!xsk->outstanding_tx)
1110                return;
1111
1112        /* In copy mode, Tx is driven by a syscall so we need to use e.g. sendto() to
1113         * really send the packets. In zero-copy mode we do not have to do this, since Tx
1114         * is driven by the NAPI loop. So as an optimization, we do not have to call
1115         * sendto() all the time in zero-copy mode for l2fwd.
1116         */
1117        if (opt_xdp_bind_flags & XDP_COPY) {
1118                xsk->app_stats.copy_tx_sendtos++;
1119                kick_tx(xsk);
1120        }
1121
1122        ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
1123                xsk->outstanding_tx;
1124
1125        /* re-add completed Tx buffers */
1126        rcvd = xsk_ring_cons__peek(&umem->cq, ndescs, &idx_cq);
1127        if (rcvd > 0) {
1128                unsigned int i;
1129                int ret;
1130
1131                ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
1132                while (ret != rcvd) {
1133                        if (ret < 0)
1134                                exit_with_error(-ret);
1135                        if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
1136                                xsk->app_stats.fill_fail_polls++;
1137                                ret = poll(fds, num_socks, opt_timeout);
1138                        }
1139                        ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
1140                }
1141
1142                for (i = 0; i < rcvd; i++)
1143                        *xsk_ring_prod__fill_addr(&umem->fq, idx_fq++) =
1144                                *xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
1145
1146                xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
1147                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
1148                xsk->outstanding_tx -= rcvd;
1149                xsk->ring_stats.tx_npkts += rcvd;
1150        }
1151}
1152
1153static inline void complete_tx_only(struct xsk_socket_info *xsk,
1154                                    int batch_size)
1155{
1156        unsigned int rcvd;
1157        u32 idx;
1158
1159        if (!xsk->outstanding_tx)
1160                return;
1161
1162        if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) {
1163                xsk->app_stats.tx_wakeup_sendtos++;
1164                kick_tx(xsk);
1165        }
1166
1167        rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
1168        if (rcvd > 0) {
1169                xsk_ring_cons__release(&xsk->umem->cq, rcvd);
1170                xsk->outstanding_tx -= rcvd;
1171                xsk->ring_stats.tx_npkts += rcvd;
1172        }
1173}
1174
1175static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
1176{
1177        unsigned int rcvd, i;
1178        u32 idx_rx = 0, idx_fq = 0;
1179        int ret;
1180
1181        rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
1182        if (!rcvd) {
1183                if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1184                        xsk->app_stats.rx_empty_polls++;
1185                        ret = poll(fds, num_socks, opt_timeout);
1186                }
1187                return;
1188        }
1189
1190        ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
1191        while (ret != rcvd) {
1192                if (ret < 0)
1193                        exit_with_error(-ret);
1194                if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1195                        xsk->app_stats.fill_fail_polls++;
1196                        ret = poll(fds, num_socks, opt_timeout);
1197                }
1198                ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
1199        }
1200
1201        for (i = 0; i < rcvd; i++) {
1202                u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
1203                u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
1204                u64 orig = xsk_umem__extract_addr(addr);
1205
1206                addr = xsk_umem__add_offset_to_addr(addr);
1207                char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
1208
1209                hex_dump(pkt, len, addr);
1210                *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = orig;
1211        }
1212
1213        xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
1214        xsk_ring_cons__release(&xsk->rx, rcvd);
1215        xsk->ring_stats.rx_npkts += rcvd;
1216}
1217
1218static void rx_drop_all(void)
1219{
1220        struct pollfd fds[MAX_SOCKS] = {};
1221        int i, ret;
1222
1223        for (i = 0; i < num_socks; i++) {
1224                fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
1225                fds[i].events = POLLIN;
1226        }
1227
1228        for (;;) {
1229                if (opt_poll) {
1230                        for (i = 0; i < num_socks; i++)
1231                                xsks[i]->app_stats.opt_polls++;
1232                        ret = poll(fds, num_socks, opt_timeout);
1233                        if (ret <= 0)
1234                                continue;
1235                }
1236
1237                for (i = 0; i < num_socks; i++)
1238                        rx_drop(xsks[i], fds);
1239
1240                if (benchmark_done)
1241                        break;
1242        }
1243}
1244
1245static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
1246{
1247        u32 idx;
1248        unsigned int i;
1249
1250        while (xsk_ring_prod__reserve(&xsk->tx, batch_size, &idx) <
1251                                      batch_size) {
1252                complete_tx_only(xsk, batch_size);
1253        }
1254
1255        for (i = 0; i < batch_size; i++) {
1256                struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
1257                                                                  idx + i);
1258                tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
1259                tx_desc->len = PKT_SIZE;
1260        }
1261
1262        xsk_ring_prod__submit(&xsk->tx, batch_size);
1263        xsk->outstanding_tx += batch_size;
1264        *frame_nb += batch_size;
1265        *frame_nb %= NUM_FRAMES;
1266        complete_tx_only(xsk, batch_size);
1267}
1268
1269static inline int get_batch_size(int pkt_cnt)
1270{
1271        if (!opt_pkt_count)
1272                return opt_batch_size;
1273
1274        if (pkt_cnt + opt_batch_size <= opt_pkt_count)
1275                return opt_batch_size;
1276
1277        return opt_pkt_count - pkt_cnt;
1278}
1279
1280static void complete_tx_only_all(void)
1281{
1282        bool pending;
1283        int i;
1284
1285        do {
1286                pending = false;
1287                for (i = 0; i < num_socks; i++) {
1288                        if (xsks[i]->outstanding_tx) {
1289                                complete_tx_only(xsks[i], opt_batch_size);
1290                                pending = !!xsks[i]->outstanding_tx;
1291                        }
1292                }
1293        } while (pending);
1294}
1295
1296static void tx_only_all(void)
1297{
1298        struct pollfd fds[MAX_SOCKS] = {};
1299        u32 frame_nb[MAX_SOCKS] = {};
1300        int pkt_cnt = 0;
1301        int i, ret;
1302
1303        for (i = 0; i < num_socks; i++) {
1304                fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
1305                fds[0].events = POLLOUT;
1306        }
1307
1308        while ((opt_pkt_count && pkt_cnt < opt_pkt_count) || !opt_pkt_count) {
1309                int batch_size = get_batch_size(pkt_cnt);
1310
1311                if (opt_poll) {
1312                        for (i = 0; i < num_socks; i++)
1313                                xsks[i]->app_stats.opt_polls++;
1314                        ret = poll(fds, num_socks, opt_timeout);
1315                        if (ret <= 0)
1316                                continue;
1317
1318                        if (!(fds[0].revents & POLLOUT))
1319                                continue;
1320                }
1321
1322                for (i = 0; i < num_socks; i++)
1323                        tx_only(xsks[i], &frame_nb[i], batch_size);
1324
1325                pkt_cnt += batch_size;
1326
1327                if (benchmark_done)
1328                        break;
1329        }
1330
1331        if (opt_pkt_count)
1332                complete_tx_only_all();
1333}
1334
1335static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
1336{
1337        unsigned int rcvd, i;
1338        u32 idx_rx = 0, idx_tx = 0;
1339        int ret;
1340
1341        complete_tx_l2fwd(xsk, fds);
1342
1343        rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
1344        if (!rcvd) {
1345                if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
1346                        xsk->app_stats.rx_empty_polls++;
1347                        ret = poll(fds, num_socks, opt_timeout);
1348                }
1349                return;
1350        }
1351
1352        ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
1353        while (ret != rcvd) {
1354                if (ret < 0)
1355                        exit_with_error(-ret);
1356                complete_tx_l2fwd(xsk, fds);
1357                if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
1358                        xsk->app_stats.tx_wakeup_sendtos++;
1359                        kick_tx(xsk);
1360                }
1361                ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
1362        }
1363
1364        for (i = 0; i < rcvd; i++) {
1365                u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
1366                u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
1367                u64 orig = addr;
1368
1369                addr = xsk_umem__add_offset_to_addr(addr);
1370                char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
1371
1372                swap_mac_addresses(pkt);
1373
1374                hex_dump(pkt, len, addr);
1375                xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = orig;
1376                xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
1377        }
1378
1379        xsk_ring_prod__submit(&xsk->tx, rcvd);
1380        xsk_ring_cons__release(&xsk->rx, rcvd);
1381
1382        xsk->ring_stats.rx_npkts += rcvd;
1383        xsk->outstanding_tx += rcvd;
1384}
1385
1386static void l2fwd_all(void)
1387{
1388        struct pollfd fds[MAX_SOCKS] = {};
1389        int i, ret;
1390
1391        for (i = 0; i < num_socks; i++) {
1392                fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
1393                fds[i].events = POLLOUT | POLLIN;
1394        }
1395
1396        for (;;) {
1397                if (opt_poll) {
1398                        for (i = 0; i < num_socks; i++)
1399                                xsks[i]->app_stats.opt_polls++;
1400                        ret = poll(fds, num_socks, opt_timeout);
1401                        if (ret <= 0)
1402                                continue;
1403                }
1404
1405                for (i = 0; i < num_socks; i++)
1406                        l2fwd(xsks[i], fds);
1407
1408                if (benchmark_done)
1409                        break;
1410        }
1411}
1412
1413static void load_xdp_program(char **argv, struct bpf_object **obj)
1414{
1415        struct bpf_prog_load_attr prog_load_attr = {
1416                .prog_type      = BPF_PROG_TYPE_XDP,
1417        };
1418        char xdp_filename[256];
1419        int prog_fd;
1420
1421        snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
1422        prog_load_attr.file = xdp_filename;
1423
1424        if (bpf_prog_load_xattr(&prog_load_attr, obj, &prog_fd))
1425                exit(EXIT_FAILURE);
1426        if (prog_fd < 0) {
1427                fprintf(stderr, "ERROR: no program found: %s\n",
1428                        strerror(prog_fd));
1429                exit(EXIT_FAILURE);
1430        }
1431
1432        if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
1433                fprintf(stderr, "ERROR: link set xdp fd failed\n");
1434                exit(EXIT_FAILURE);
1435        }
1436}
1437
1438static void enter_xsks_into_map(struct bpf_object *obj)
1439{
1440        struct bpf_map *map;
1441        int i, xsks_map;
1442
1443        map = bpf_object__find_map_by_name(obj, "xsks_map");
1444        xsks_map = bpf_map__fd(map);
1445        if (xsks_map < 0) {
1446                fprintf(stderr, "ERROR: no xsks map found: %s\n",
1447                        strerror(xsks_map));
1448                        exit(EXIT_FAILURE);
1449        }
1450
1451        for (i = 0; i < num_socks; i++) {
1452                int fd = xsk_socket__fd(xsks[i]->xsk);
1453                int key, ret;
1454
1455                key = i;
1456                ret = bpf_map_update_elem(xsks_map, &key, &fd, 0);
1457                if (ret) {
1458                        fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
1459                        exit(EXIT_FAILURE);
1460                }
1461        }
1462}
1463
1464int main(int argc, char **argv)
1465{
1466        struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
1467        bool rx = false, tx = false;
1468        struct xsk_umem_info *umem;
1469        struct bpf_object *obj;
1470        pthread_t pt;
1471        int i, ret;
1472        void *bufs;
1473
1474        parse_command_line(argc, argv);
1475
1476        if (setrlimit(RLIMIT_MEMLOCK, &r)) {
1477                fprintf(stderr, "ERROR: setrlimit(RLIMIT_MEMLOCK) \"%s\"\n",
1478                        strerror(errno));
1479                exit(EXIT_FAILURE);
1480        }
1481
1482        if (opt_num_xsks > 1)
1483                load_xdp_program(argv, &obj);
1484
1485        /* Reserve memory for the umem. Use hugepages if unaligned chunk mode */
1486        bufs = mmap(NULL, NUM_FRAMES * opt_xsk_frame_size,
1487                    PROT_READ | PROT_WRITE,
1488                    MAP_PRIVATE | MAP_ANONYMOUS | opt_mmap_flags, -1, 0);
1489        if (bufs == MAP_FAILED) {
1490                printf("ERROR: mmap failed\n");
1491                exit(EXIT_FAILURE);
1492        }
1493
1494        /* Create sockets... */
1495        umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size);
1496        if (opt_bench == BENCH_RXDROP || opt_bench == BENCH_L2FWD) {
1497                rx = true;
1498                xsk_populate_fill_ring(umem);
1499        }
1500        if (opt_bench == BENCH_L2FWD || opt_bench == BENCH_TXONLY)
1501                tx = true;
1502        for (i = 0; i < opt_num_xsks; i++)
1503                xsks[num_socks++] = xsk_configure_socket(umem, rx, tx);
1504
1505        if (opt_bench == BENCH_TXONLY) {
1506                gen_eth_hdr_data();
1507
1508                for (i = 0; i < NUM_FRAMES; i++)
1509                        gen_eth_frame(umem, i * opt_xsk_frame_size);
1510        }
1511
1512        if (opt_num_xsks > 1 && opt_bench != BENCH_TXONLY)
1513                enter_xsks_into_map(obj);
1514
1515        signal(SIGINT, int_exit);
1516        signal(SIGTERM, int_exit);
1517        signal(SIGABRT, int_exit);
1518
1519        setlocale(LC_ALL, "");
1520
1521        if (!opt_quiet) {
1522                ret = pthread_create(&pt, NULL, poller, NULL);
1523                if (ret)
1524                        exit_with_error(ret);
1525        }
1526
1527        prev_time = get_nsecs();
1528        start_time = prev_time;
1529
1530        if (opt_bench == BENCH_RXDROP)
1531                rx_drop_all();
1532        else if (opt_bench == BENCH_TXONLY)
1533                tx_only_all();
1534        else
1535                l2fwd_all();
1536
1537        benchmark_done = true;
1538
1539        if (!opt_quiet)
1540                pthread_join(pt, NULL);
1541
1542        xdpsock_cleanup();
1543
1544        return 0;
1545}
1546