qemu/tools/ebpf/rss.bpf.c
<<
>>
Prefs
   1/*
   2 * eBPF RSS program
   3 *
   4 * Developed by Daynix Computing LTD (http://www.daynix.com)
   5 *
   6 * Authors:
   7 *  Andrew Melnychenko <andrew@daynix.com>
   8 *  Yuri Benditovich <yuri.benditovich@daynix.com>
   9 *
  10 * This work is licensed under the terms of the GNU GPL, version 2.  See
  11 * the COPYING file in the top-level directory.
  12 *
  13 * Prepare:
  14 * Requires llvm, clang, bpftool, linux kernel tree
  15 *
  16 * Build rss.bpf.skeleton.h:
  17 * make -f Makefile.ebpf clean all
  18 */
  19
  20#include <stddef.h>
  21#include <stdbool.h>
  22#include <linux/bpf.h>
  23
  24#include <linux/in.h>
  25#include <linux/if_ether.h>
  26#include <linux/ip.h>
  27#include <linux/ipv6.h>
  28
  29#include <linux/udp.h>
  30#include <linux/tcp.h>
  31
  32#include <bpf/bpf_helpers.h>
  33#include <bpf/bpf_endian.h>
  34#include <linux/virtio_net.h>
  35
  36#define INDIRECTION_TABLE_SIZE 128
  37#define HASH_CALCULATION_BUFFER_SIZE 36
  38
  39struct rss_config_t {
  40    __u8 redirect;
  41    __u8 populate_hash;
  42    __u32 hash_types;
  43    __u16 indirections_len;
  44    __u16 default_queue;
  45} __attribute__((packed));
  46
  47struct toeplitz_key_data_t {
  48    __u32 leftmost_32_bits;
  49    __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
  50};
  51
  52struct packet_hash_info_t {
  53    __u8 is_ipv4;
  54    __u8 is_ipv6;
  55    __u8 is_udp;
  56    __u8 is_tcp;
  57    __u8 is_ipv6_ext_src;
  58    __u8 is_ipv6_ext_dst;
  59    __u8 is_fragmented;
  60
  61    __u16 src_port;
  62    __u16 dst_port;
  63
  64    union {
  65        struct {
  66            __be32 in_src;
  67            __be32 in_dst;
  68        };
  69
  70        struct {
  71            struct in6_addr in6_src;
  72            struct in6_addr in6_dst;
  73            struct in6_addr in6_ext_src;
  74            struct in6_addr in6_ext_dst;
  75        };
  76    };
  77};
  78
  79struct bpf_map_def SEC("maps")
  80tap_rss_map_configurations = {
  81        .type        = BPF_MAP_TYPE_ARRAY,
  82        .key_size    = sizeof(__u32),
  83        .value_size  = sizeof(struct rss_config_t),
  84        .max_entries = 1,
  85};
  86
  87struct bpf_map_def SEC("maps")
  88tap_rss_map_toeplitz_key = {
  89        .type        = BPF_MAP_TYPE_ARRAY,
  90        .key_size    = sizeof(__u32),
  91        .value_size  = sizeof(struct toeplitz_key_data_t),
  92        .max_entries = 1,
  93};
  94
  95struct bpf_map_def SEC("maps")
  96tap_rss_map_indirection_table = {
  97        .type        = BPF_MAP_TYPE_ARRAY,
  98        .key_size    = sizeof(__u32),
  99        .value_size  = sizeof(__u16),
 100        .max_entries = INDIRECTION_TABLE_SIZE,
 101};
 102
 103static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
 104                                        const void *ptr, size_t size) {
 105    __builtin_memcpy(&rss_input[*bytes_written], ptr, size);
 106    *bytes_written += size;
 107}
 108
 109static inline
 110void net_toeplitz_add(__u32 *result,
 111                      __u8 *input,
 112                      __u32 len
 113        , struct toeplitz_key_data_t *key) {
 114
 115    __u32 accumulator = *result;
 116    __u32 leftmost_32_bits = key->leftmost_32_bits;
 117    __u32 byte;
 118
 119    for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) {
 120        __u8 input_byte = input[byte];
 121        __u8 key_byte = key->next_byte[byte];
 122        __u8 bit;
 123
 124        for (bit = 0; bit < 8; bit++) {
 125            if (input_byte & (1 << 7)) {
 126                accumulator ^= leftmost_32_bits;
 127            }
 128
 129            leftmost_32_bits =
 130                    (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7);
 131
 132            input_byte <<= 1;
 133            key_byte <<= 1;
 134        }
 135    }
 136
 137    *result = accumulator;
 138}
 139
 140
 141static inline int ip6_extension_header_type(__u8 hdr_type)
 142{
 143    switch (hdr_type) {
 144    case IPPROTO_HOPOPTS:
 145    case IPPROTO_ROUTING:
 146    case IPPROTO_FRAGMENT:
 147    case IPPROTO_ICMPV6:
 148    case IPPROTO_NONE:
 149    case IPPROTO_DSTOPTS:
 150    case IPPROTO_MH:
 151        return 1;
 152    default:
 153        return 0;
 154    }
 155}
 156/*
 157 * According to
 158 * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
 159 * we expect that there are would be no more than 11 extensions in IPv6 header,
 160 * also there is 27 TLV options for Destination and Hop-by-hop extensions.
 161 * Need to choose reasonable amount of maximum extensions/options we may
 162 * check to find ext src/dst.
 163 */
 164#define IP6_EXTENSIONS_COUNT 11
 165#define IP6_OPTIONS_COUNT 30
 166
 167static inline int parse_ipv6_ext(struct __sk_buff *skb,
 168        struct packet_hash_info_t *info,
 169        __u8 *l4_protocol, size_t *l4_offset)
 170{
 171    int err = 0;
 172
 173    if (!ip6_extension_header_type(*l4_protocol)) {
 174        return 0;
 175    }
 176
 177    struct ipv6_opt_hdr ext_hdr = {};
 178
 179    for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
 180
 181        err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
 182                                    sizeof(ext_hdr), BPF_HDR_START_NET);
 183        if (err) {
 184            goto error;
 185        }
 186
 187        if (*l4_protocol == IPPROTO_ROUTING) {
 188            struct ipv6_rt_hdr ext_rt = {};
 189
 190            err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
 191                                        sizeof(ext_rt), BPF_HDR_START_NET);
 192            if (err) {
 193                goto error;
 194            }
 195
 196            if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
 197                    (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
 198                    (ext_rt.segments_left == 1)) {
 199
 200                err = bpf_skb_load_bytes_relative(skb,
 201                    *l4_offset + offsetof(struct rt2_hdr, addr),
 202                    &info->in6_ext_dst, sizeof(info->in6_ext_dst),
 203                    BPF_HDR_START_NET);
 204                if (err) {
 205                    goto error;
 206                }
 207
 208                info->is_ipv6_ext_dst = 1;
 209            }
 210
 211        } else if (*l4_protocol == IPPROTO_DSTOPTS) {
 212            struct ipv6_opt_t {
 213                __u8 type;
 214                __u8 length;
 215            } __attribute__((packed)) opt = {};
 216
 217            size_t opt_offset = sizeof(ext_hdr);
 218
 219            for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
 220                err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
 221                                        &opt, sizeof(opt), BPF_HDR_START_NET);
 222                if (err) {
 223                    goto error;
 224                }
 225
 226                if (opt.type == IPV6_TLV_HAO) {
 227                    err = bpf_skb_load_bytes_relative(skb,
 228                        *l4_offset + opt_offset
 229                        + offsetof(struct ipv6_destopt_hao, addr),
 230                        &info->in6_ext_src, sizeof(info->in6_ext_src),
 231                        BPF_HDR_START_NET);
 232                    if (err) {
 233                        goto error;
 234                    }
 235
 236                    info->is_ipv6_ext_src = 1;
 237                    break;
 238                }
 239
 240                opt_offset += (opt.type == IPV6_TLV_PAD1) ?
 241                              1 : opt.length + sizeof(opt);
 242
 243                if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
 244                    break;
 245                }
 246            }
 247        } else if (*l4_protocol == IPPROTO_FRAGMENT) {
 248            info->is_fragmented = true;
 249        }
 250
 251        *l4_protocol = ext_hdr.nexthdr;
 252        *l4_offset += (ext_hdr.hdrlen + 1) * 8;
 253
 254        if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
 255            return 0;
 256        }
 257    }
 258
 259    return 0;
 260error:
 261    return err;
 262}
 263
 264static __be16 parse_eth_type(struct __sk_buff *skb)
 265{
 266    unsigned int offset = 12;
 267    __be16 ret = 0;
 268    int err = 0;
 269
 270    err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
 271                                BPF_HDR_START_MAC);
 272    if (err) {
 273        return 0;
 274    }
 275
 276    switch (bpf_ntohs(ret)) {
 277    case ETH_P_8021AD:
 278        offset += 4;
 279    case ETH_P_8021Q:
 280        offset += 4;
 281        err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret),
 282                                    BPF_HDR_START_MAC);
 283    default:
 284        break;
 285    }
 286
 287    if (err) {
 288        return 0;
 289    }
 290
 291    return ret;
 292}
 293
 294static inline int parse_packet(struct __sk_buff *skb,
 295        struct packet_hash_info_t *info)
 296{
 297    int err = 0;
 298
 299    if (!info || !skb) {
 300        return -1;
 301    }
 302
 303    size_t l4_offset = 0;
 304    __u8 l4_protocol = 0;
 305    __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb));
 306    if (l3_protocol == 0) {
 307        err = -1;
 308        goto error;
 309    }
 310
 311    if (l3_protocol == ETH_P_IP) {
 312        info->is_ipv4 = 1;
 313
 314        struct iphdr ip = {};
 315        err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
 316                                    BPF_HDR_START_NET);
 317        if (err) {
 318            goto error;
 319        }
 320
 321        info->in_src = ip.saddr;
 322        info->in_dst = ip.daddr;
 323        info->is_fragmented = !!ip.frag_off;
 324
 325        l4_protocol = ip.protocol;
 326        l4_offset = ip.ihl * 4;
 327    } else if (l3_protocol == ETH_P_IPV6) {
 328        info->is_ipv6 = 1;
 329
 330        struct ipv6hdr ip6 = {};
 331        err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
 332                                    BPF_HDR_START_NET);
 333        if (err) {
 334            goto error;
 335        }
 336
 337        info->in6_src = ip6.saddr;
 338        info->in6_dst = ip6.daddr;
 339
 340        l4_protocol = ip6.nexthdr;
 341        l4_offset = sizeof(ip6);
 342
 343        err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
 344        if (err) {
 345            goto error;
 346        }
 347    }
 348
 349    if (l4_protocol != 0 && !info->is_fragmented) {
 350        if (l4_protocol == IPPROTO_TCP) {
 351            info->is_tcp = 1;
 352
 353            struct tcphdr tcp = {};
 354            err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp),
 355                                        BPF_HDR_START_NET);
 356            if (err) {
 357                goto error;
 358            }
 359
 360            info->src_port = tcp.source;
 361            info->dst_port = tcp.dest;
 362        } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
 363            info->is_udp = 1;
 364
 365            struct udphdr udp = {};
 366            err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp),
 367                                        BPF_HDR_START_NET);
 368            if (err) {
 369                goto error;
 370            }
 371
 372            info->src_port = udp.source;
 373            info->dst_port = udp.dest;
 374        }
 375    }
 376
 377    return 0;
 378
 379error:
 380    return err;
 381}
 382
 383static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
 384        struct rss_config_t *config, struct toeplitz_key_data_t *toe)
 385{
 386    __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
 387    size_t bytes_written = 0;
 388    __u32 result = 0;
 389    int err = 0;
 390    struct packet_hash_info_t packet_info = {};
 391
 392    err = parse_packet(skb, &packet_info);
 393    if (err) {
 394        return 0;
 395    }
 396
 397    if (packet_info.is_ipv4) {
 398        if (packet_info.is_tcp &&
 399            config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
 400
 401            net_rx_rss_add_chunk(rss_input, &bytes_written,
 402                                 &packet_info.in_src,
 403                                 sizeof(packet_info.in_src));
 404            net_rx_rss_add_chunk(rss_input, &bytes_written,
 405                                 &packet_info.in_dst,
 406                                 sizeof(packet_info.in_dst));
 407            net_rx_rss_add_chunk(rss_input, &bytes_written,
 408                                 &packet_info.src_port,
 409                                 sizeof(packet_info.src_port));
 410            net_rx_rss_add_chunk(rss_input, &bytes_written,
 411                                 &packet_info.dst_port,
 412                                 sizeof(packet_info.dst_port));
 413        } else if (packet_info.is_udp &&
 414                   config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
 415
 416            net_rx_rss_add_chunk(rss_input, &bytes_written,
 417                                 &packet_info.in_src,
 418                                 sizeof(packet_info.in_src));
 419            net_rx_rss_add_chunk(rss_input, &bytes_written,
 420                                 &packet_info.in_dst,
 421                                 sizeof(packet_info.in_dst));
 422            net_rx_rss_add_chunk(rss_input, &bytes_written,
 423                                 &packet_info.src_port,
 424                                 sizeof(packet_info.src_port));
 425            net_rx_rss_add_chunk(rss_input, &bytes_written,
 426                                 &packet_info.dst_port,
 427                                 sizeof(packet_info.dst_port));
 428        } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
 429            net_rx_rss_add_chunk(rss_input, &bytes_written,
 430                                 &packet_info.in_src,
 431                                 sizeof(packet_info.in_src));
 432            net_rx_rss_add_chunk(rss_input, &bytes_written,
 433                                 &packet_info.in_dst,
 434                                 sizeof(packet_info.in_dst));
 435        }
 436    } else if (packet_info.is_ipv6) {
 437        if (packet_info.is_tcp &&
 438            config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
 439
 440            if (packet_info.is_ipv6_ext_src &&
 441                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
 442
 443                net_rx_rss_add_chunk(rss_input, &bytes_written,
 444                                     &packet_info.in6_ext_src,
 445                                     sizeof(packet_info.in6_ext_src));
 446            } else {
 447                net_rx_rss_add_chunk(rss_input, &bytes_written,
 448                                     &packet_info.in6_src,
 449                                     sizeof(packet_info.in6_src));
 450            }
 451            if (packet_info.is_ipv6_ext_dst &&
 452                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
 453
 454                net_rx_rss_add_chunk(rss_input, &bytes_written,
 455                                     &packet_info.in6_ext_dst,
 456                                     sizeof(packet_info.in6_ext_dst));
 457            } else {
 458                net_rx_rss_add_chunk(rss_input, &bytes_written,
 459                                     &packet_info.in6_dst,
 460                                     sizeof(packet_info.in6_dst));
 461            }
 462            net_rx_rss_add_chunk(rss_input, &bytes_written,
 463                                 &packet_info.src_port,
 464                                 sizeof(packet_info.src_port));
 465            net_rx_rss_add_chunk(rss_input, &bytes_written,
 466                                 &packet_info.dst_port,
 467                                 sizeof(packet_info.dst_port));
 468        } else if (packet_info.is_udp &&
 469                   config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
 470
 471            if (packet_info.is_ipv6_ext_src &&
 472               config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
 473
 474                net_rx_rss_add_chunk(rss_input, &bytes_written,
 475                                     &packet_info.in6_ext_src,
 476                                     sizeof(packet_info.in6_ext_src));
 477            } else {
 478                net_rx_rss_add_chunk(rss_input, &bytes_written,
 479                                     &packet_info.in6_src,
 480                                     sizeof(packet_info.in6_src));
 481            }
 482            if (packet_info.is_ipv6_ext_dst &&
 483               config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
 484
 485                net_rx_rss_add_chunk(rss_input, &bytes_written,
 486                                     &packet_info.in6_ext_dst,
 487                                     sizeof(packet_info.in6_ext_dst));
 488            } else {
 489                net_rx_rss_add_chunk(rss_input, &bytes_written,
 490                                     &packet_info.in6_dst,
 491                                     sizeof(packet_info.in6_dst));
 492            }
 493
 494            net_rx_rss_add_chunk(rss_input, &bytes_written,
 495                                 &packet_info.src_port,
 496                                 sizeof(packet_info.src_port));
 497            net_rx_rss_add_chunk(rss_input, &bytes_written,
 498                                 &packet_info.dst_port,
 499                                 sizeof(packet_info.dst_port));
 500
 501        } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
 502            if (packet_info.is_ipv6_ext_src &&
 503               config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
 504
 505                net_rx_rss_add_chunk(rss_input, &bytes_written,
 506                                     &packet_info.in6_ext_src,
 507                                     sizeof(packet_info.in6_ext_src));
 508            } else {
 509                net_rx_rss_add_chunk(rss_input, &bytes_written,
 510                                     &packet_info.in6_src,
 511                                     sizeof(packet_info.in6_src));
 512            }
 513            if (packet_info.is_ipv6_ext_dst &&
 514                config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
 515
 516                net_rx_rss_add_chunk(rss_input, &bytes_written,
 517                                     &packet_info.in6_ext_dst,
 518                                     sizeof(packet_info.in6_ext_dst));
 519            } else {
 520                net_rx_rss_add_chunk(rss_input, &bytes_written,
 521                                     &packet_info.in6_dst,
 522                                     sizeof(packet_info.in6_dst));
 523            }
 524        }
 525    }
 526
 527    if (bytes_written) {
 528        net_toeplitz_add(&result, rss_input, bytes_written, toe);
 529    }
 530
 531    return result;
 532}
 533
 534SEC("tun_rss_steering")
 535int tun_rss_steering_prog(struct __sk_buff *skb)
 536{
 537
 538    struct rss_config_t *config;
 539    struct toeplitz_key_data_t *toe;
 540
 541    __u32 key = 0;
 542    __u32 hash = 0;
 543
 544    config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
 545    toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
 546
 547    if (config && toe) {
 548        if (!config->redirect) {
 549            return config->default_queue;
 550        }
 551
 552        hash = calculate_rss_hash(skb, config, toe);
 553        if (hash) {
 554            __u32 table_idx = hash % config->indirections_len;
 555            __u16 *queue = 0;
 556
 557            queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
 558                                        &table_idx);
 559
 560            if (queue) {
 561                return *queue;
 562            }
 563        }
 564
 565        return config->default_queue;
 566    }
 567
 568    return -1;
 569}
 570
 571char _license[] SEC("license") = "GPL v2";
 572