linux/samples/bpf/xdp_redirect_cpu_kern.c
/*  XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
 *
 *  GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
 */
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/if_vlan.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/udp.h>

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

#define MAX_CPUS 12 /* WARNING - sync with _user.c */

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
        .type           = BPF_MAP_TYPE_CPUMAP,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(u32),
        .max_entries    = MAX_CPUS,
};
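/* The cpumap must be populated from userspace before redirects can
 * succeed.  A rough sketch of what the matching _user.c side does
 * (assumptions of this example: a libbpf-style map fd in cpu_map_fd,
 * and the u32 value configuring the per-CPU kthread queue size; the
 * concrete numbers are illustrative only):
 *
 *	__u32 cpu = 2, qsize = 192;
 *	bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0);
 */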

/* Common stats data record to keep userspace simpler */
struct datarec {
        __u64 processed;
        __u64 dropped;
        __u64 issue;
};

/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
 * feedback.  Redirect TX errors can be caught via a tracepoint.
 */
struct bpf_map_def SEC("maps") rx_cnt = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(struct datarec),
        .max_entries    = 1,
};
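/* Note: for a PERCPU_ARRAY, a single userspace lookup returns one
 * struct datarec per possible CPU, which the reader has to sum up
 * itself.  A minimal sketch (nr_cpus, e.g. from libbpf's possible-CPU
 * count, is an assumption of this example):
 *
 *	struct datarec values[nr_cpus];
 *	__u32 key = 0;
 *	bpf_map_lookup_elem(rx_cnt_fd, &key, values);
 */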

/* Used by trace point */
struct bpf_map_def SEC("maps") redirect_err_cnt = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(struct datarec),
        .max_entries    = 2,
        /* TODO: have entries for all possible errno's */
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(struct datarec),
        .max_entries    = MAX_CPUS,
};

/* Used by trace point */
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(struct datarec),
        .max_entries    = 1,
};

/* Set of maps controlling available CPUs, and for iterating through
 * selectable redirect CPUs.
 */
struct bpf_map_def SEC("maps") cpus_available = {
        .type           = BPF_MAP_TYPE_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(u32),
        .max_entries    = MAX_CPUS,
};
struct bpf_map_def SEC("maps") cpus_count = {
        .type           = BPF_MAP_TYPE_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(u32),
        .max_entries    = 1,
};
struct bpf_map_def SEC("maps") cpus_iterator = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(u32),
        .max_entries    = 1,
};
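/* Expected contract with the matching _user.c: cpus_available[0..
 * *cpus_count-1] holds the destination CPU ids (each of which should
 * also have an entry in cpu_map above), cpus_count says how many
 * slots are valid, and cpus_iterator is per-CPU scratch state for the
 * round-robin program below.
 */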

/* Used by trace point */
struct bpf_map_def SEC("maps") exception_cnt = {
        .type           = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size       = sizeof(u32),
        .value_size     = sizeof(struct datarec),
        .max_entries    = 1,
};

/* Helper parse functions */

/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
 *
 * Returns false on error or non-supported ether-type
 */
struct vlan_hdr {
        __be16 h_vlan_TCI;
        __be16 h_vlan_encapsulated_proto;
};

static __always_inline
bool parse_eth(struct ethhdr *eth, void *data_end,
               u16 *eth_proto, u64 *l3_offset)
{
        u16 eth_type;
        u64 offset;

        offset = sizeof(*eth);
        if ((void *)eth + offset > data_end)
                return false;

        eth_type = eth->h_proto;

        /* Skip non-802.3 Ethertypes */
        if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
                return false;

        /* Handle VLAN tagged packet */
        if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
                struct vlan_hdr *vlan_hdr;

                vlan_hdr = (void *)eth + offset;
                offset += sizeof(*vlan_hdr);
                if ((void *)eth + offset > data_end)
                        return false;
                eth_type = vlan_hdr->h_vlan_encapsulated_proto;
        }
        /* TODO: Handle double VLAN tagged packet */

        *eth_proto = ntohs(eth_type);
        *l3_offset = offset;
        return true;
}

static __always_inline
u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct iphdr *iph = data + nh_off;
        struct udphdr *udph;
        u16 dport;

        if (iph + 1 > data_end)
                return 0;
        if (!(iph->protocol == IPPROTO_UDP))
                return 0;

        udph = (void *)(iph + 1);
        if (udph + 1 > data_end)
                return 0;

        dport = ntohs(udph->dest);
        return dport;
}
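
/* Note: get_dest_port_ipv4_udp() locates the UDP header with (iph + 1),
 * i.e. it assumes an IPv4 header without options (ihl == 5); packets
 * carrying IP options will have dport read from the wrong offset.
 */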

static __always_inline
int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct iphdr *iph = data + nh_off;

        if (iph + 1 > data_end)
                return 0;
        return iph->protocol;
}

static __always_inline
int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct ipv6hdr *ip6h = data + nh_off;

        if (ip6h + 1 > data_end)
                return 0;
        return ip6h->nexthdr;
}

SEC("xdp_cpu_map0")
int  xdp_prognum0_no_touch(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct datarec *rec;
        u32 *cpu_selected;
        u32 cpu_dest;
        u32 key = 0;

        /* Only use first entry in cpus_available */
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
        if (!cpu_selected)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;

        /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
                return XDP_ABORTED;
        rec->processed++;

        if (cpu_dest >= MAX_CPUS) {
                rec->issue++;
                return XDP_ABORTED;
        }

        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

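/* Unlike xdp_prognum0_no_touch above, this program reads packet data
 * on the RX CPU before redirecting (pulling in the frame's first
 * cache line early), so the two can be compared to see the cost of
 * touching data before the remote CPU takes over.
 */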
SEC("xdp_cpu_map1_touch_data")
int  xdp_prognum1_touch_data(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
        struct datarec *rec;
        u32 *cpu_selected;
        u32 cpu_dest;
        u16 eth_type;
        u32 key = 0;

        /* Only use first entry in cpus_available */
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
        if (!cpu_selected)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;

        /* Validate that packet length is at least the minimum Eth header size */
        if (eth + 1 > data_end)
                return XDP_ABORTED;

        /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
                return XDP_ABORTED;
        rec->processed++;

        /* Read packet data, and use it (drop non-802.3 Ethertypes) */
        eth_type = eth->h_proto;
        if (ntohs(eth_type) < ETH_P_802_3_MIN) {
                rec->dropped++;
                return XDP_DROP;
        }

        if (cpu_dest >= MAX_CPUS) {
                rec->issue++;
                return XDP_ABORTED;
        }

        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map2_round_robin")
int  xdp_prognum2_round_robin(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
        struct datarec *rec;
        u32 cpu_dest;
        u32 *cpu_lookup;
        u32 key0 = 0;

        u32 *cpu_selected;
        u32 *cpu_iterator;
        u32 *cpu_max;
        u32 cpu_idx;

        cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
        if (!cpu_max)
                return XDP_ABORTED;

        cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
        if (!cpu_iterator)
                return XDP_ABORTED;
        cpu_idx = *cpu_iterator;

        *cpu_iterator += 1;
        if (*cpu_iterator == *cpu_max)
                *cpu_iterator = 0;
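
        /* Note: cpus_iterator is a PERCPU_ARRAY, so each RX CPU keeps
         * its own position and the read-modify-write above needs no
         * atomics; traffic arriving on one RX CPU is spread
         * round-robin over the configured destination CPUs.
         */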

        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
        if (!cpu_selected)
                return XDP_ABORTED;
        cpu_dest = *cpu_selected;

        /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key0);
        if (!rec)
                return XDP_ABORTED;
        rec->processed++;

        if (cpu_dest >= MAX_CPUS) {
                rec->issue++;
                return XDP_ABORTED;
        }

        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map3_proto_separate")
int  xdp_prognum3_proto_separate(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
        u8 ip_proto = IPPROTO_UDP;
        struct datarec *rec;
        u16 eth_proto = 0;
        u64 l3_offset = 0;
        u32 cpu_dest = 0;
        u32 cpu_idx = 0;
        u32 *cpu_lookup;
        u32 key = 0;

        /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
                return XDP_ABORTED;
        rec->processed++;

        if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
                return XDP_PASS; /* Just skip */

        /* Extract L4 protocol */
        switch (eth_proto) {
        case ETH_P_IP:
                ip_proto = get_proto_ipv4(ctx, l3_offset);
                break;
        case ETH_P_IPV6:
                ip_proto = get_proto_ipv6(ctx, l3_offset);
                break;
        case ETH_P_ARP:
                cpu_idx = 0; /* ARP packet handled on separate CPU */
                break;
        default:
                cpu_idx = 0;
        }

        /* Choose CPU based on L4 protocol */
        switch (ip_proto) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                cpu_idx = 2;
                break;
        case IPPROTO_TCP:
                cpu_idx = 0;
                break;
        case IPPROTO_UDP:
                cpu_idx = 1;
                break;
        default:
                cpu_idx = 0;
        }

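        /* cpu_idx is an index into cpus_available, not a CPU id; the
         * actual destination CPU is whatever userspace stored in that
         * slot.
         */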
        cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
        if (!cpu_lookup)
                return XDP_ABORTED;
        cpu_dest = *cpu_lookup;

        if (cpu_dest >= MAX_CPUS) {
                rec->issue++;
                return XDP_ABORTED;
        }

        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}

SEC("xdp_cpu_map4_ddos_filter_pktgen")
int  xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
{
        void *data_end = (void *)(long)ctx->data_end;
        void *data     = (void *)(long)ctx->data;
        struct ethhdr *eth = data;
        u8 ip_proto = IPPROTO_UDP;
        struct datarec *rec;
        u16 eth_proto = 0;
        u64 l3_offset = 0;
        u32 cpu_dest = 0;
        u32 cpu_idx = 0;
        u16 dest_port;
        u32 *cpu_lookup;
        u32 key = 0;

        /* Count RX packet in map */
        rec = bpf_map_lookup_elem(&rx_cnt, &key);
        if (!rec)
                return XDP_ABORTED;
        rec->processed++;

        if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
                return XDP_PASS; /* Just skip */

        /* Extract L4 protocol */
        switch (eth_proto) {
        case ETH_P_IP:
                ip_proto = get_proto_ipv4(ctx, l3_offset);
                break;
        case ETH_P_IPV6:
                ip_proto = get_proto_ipv6(ctx, l3_offset);
                break;
        case ETH_P_ARP:
                cpu_idx = 0; /* ARP packet handled on separate CPU */
                break;
        default:
                cpu_idx = 0;
        }

        /* Choose CPU based on L4 protocol */
        switch (ip_proto) {
        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                cpu_idx = 2;
                break;
        case IPPROTO_TCP:
                cpu_idx = 0;
                break;
        case IPPROTO_UDP:
                cpu_idx = 1;
                /* DDoS filter UDP port 9 (pktgen) */
                dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
                if (dest_port == 9) {
                        if (rec)
                                rec->dropped++;
                        return XDP_DROP;
                }
                break;
        default:
                cpu_idx = 0;
        }

        cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
        if (!cpu_lookup)
                return XDP_ABORTED;
        cpu_dest = *cpu_lookup;

        if (cpu_dest >= MAX_CPUS) {
                rec->issue++;
                return XDP_ABORTED;
        }

        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
}


char _license[] SEC("license") = "GPL";

/*** Trace point code ***/

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_redirect_ctx {
        u64 __pad;      // First 8 bytes are not accessible by bpf code
        int prog_id;    //      offset:8;  size:4; signed:1;
        u32 act;        //      offset:12  size:4; signed:0;
        int ifindex;    //      offset:16  size:4; signed:1;
        int err;        //      offset:20  size:4; signed:1;
        int to_ifindex; //      offset:24  size:4; signed:1;
        u32 map_id;     //      offset:28  size:4; signed:0;
        int map_index;  //      offset:32  size:4; signed:1;
};                      //      offset:36

enum {
        XDP_REDIRECT_SUCCESS = 0,
        XDP_REDIRECT_ERROR = 1
};

static __always_inline
int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
{
        u32 key = XDP_REDIRECT_ERROR;
        struct datarec *rec;
        int err = ctx->err;

        if (!err)
                key = XDP_REDIRECT_SUCCESS;

        rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
        if (!rec)
                return 0;
        rec->dropped += 1;

        return 0; /* Indicate event was filtered (no further processing) */
        /*
         * Returning 1 here would allow e.g. a perf-record tracepoint
         * to see and record these events, but it doesn't work well in
         * practice, as stopping perf-record also unloads this
         * bpf_prog.  Plus, there is the additional overhead of doing so.
         */
}

SEC("tracepoint/xdp/xdp_redirect_err")
int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
{
        return xdp_redirect_collect_stat(ctx);
}

SEC("tracepoint/xdp/xdp_redirect_map_err")
int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
{
        return xdp_redirect_collect_stat(ctx);
}

/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
 * Code in:                kernel/include/trace/events/xdp.h
 */
struct xdp_exception_ctx {
        u64 __pad;      // First 8 bytes are not accessible by bpf code
        int prog_id;    //      offset:8;  size:4; signed:1;
        u32 act;        //      offset:12; size:4; signed:0;
        int ifindex;    //      offset:16; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_exception")
int trace_xdp_exception(struct xdp_exception_ctx *ctx)
{
        struct datarec *rec;
        u32 key = 0;

        rec = bpf_map_lookup_elem(&exception_cnt, &key);
        if (!rec)
                return 1;
        rec->dropped += 1;

        return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_enqueue_ctx {
        u64 __pad;              // First 8 bytes are not accessible by bpf code
        int map_id;             //      offset:8;  size:4; signed:1;
        u32 act;                //      offset:12; size:4; signed:0;
        int cpu;                //      offset:16; size:4; signed:1;
        unsigned int drops;     //      offset:20; size:4; signed:0;
        unsigned int processed; //      offset:24; size:4; signed:0;
        int to_cpu;             //      offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_enqueue")
int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
{
        u32 to_cpu = ctx->to_cpu;
        struct datarec *rec;

        if (to_cpu >= MAX_CPUS)
                return 1;

        rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
        if (!rec)
                return 0;
        rec->processed += ctx->processed;
        rec->dropped   += ctx->drops;

        /* Record bulk events, so userspace can calculate the average bulk size */
        if (ctx->processed > 0)
                rec->issue += 1;

        /* Inception: it's possible to detect overload situations via
         * this tracepoint.  This can be used for creating a feedback
         * loop to XDP, which can take appropriate actions to mitigate
         * the overload situation.
         */
        return 0;
}

/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
 * Code in:         kernel/include/trace/events/xdp.h
 */
struct cpumap_kthread_ctx {
        u64 __pad;              // First 8 bytes are not accessible by bpf code
        int map_id;             //      offset:8;  size:4; signed:1;
        u32 act;                //      offset:12; size:4; signed:0;
        int cpu;                //      offset:16; size:4; signed:1;
        unsigned int drops;     //      offset:20; size:4; signed:0;
        unsigned int processed; //      offset:24; size:4; signed:0;
        int sched;              //      offset:28; size:4; signed:1;
};

SEC("tracepoint/xdp/xdp_cpumap_kthread")
int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
{
        struct datarec *rec;
        u32 key = 0;

        rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
        if (!rec)
                return 0;
        rec->processed += ctx->processed;
        rec->dropped   += ctx->drops;

        /* Count times kthread yielded CPU via schedule call */
        if (ctx->sched)
                rec->issue++;

        return 0;
}