linux/tools/testing/selftests/bpf/progs/test_cls_redirect.c
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
        (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
        uint64_t processed_packets_total;
        uint64_t l3_protocol_packets_total_ipv4;
        uint64_t l3_protocol_packets_total_ipv6;
        uint64_t l4_protocol_packets_total_tcp;
        uint64_t l4_protocol_packets_total_udp;
        uint64_t accepted_packets_total_syn;
        uint64_t accepted_packets_total_syn_cookies;
        uint64_t accepted_packets_total_last_hop;
        uint64_t accepted_packets_total_icmp_echo_request;
        uint64_t accepted_packets_total_established;
        uint64_t forwarded_packets_total_gue;
        uint64_t forwarded_packets_total_gre;

        uint64_t errors_total_unknown_l3_proto;
        uint64_t errors_total_unknown_l4_proto;
        uint64_t errors_total_malformed_ip;
        uint64_t errors_total_fragmented_ip;
        uint64_t errors_total_malformed_icmp;
        uint64_t errors_total_unwanted_icmp;
        uint64_t errors_total_malformed_icmp_pkt_too_big;
        uint64_t errors_total_malformed_tcp;
        uint64_t errors_total_malformed_udp;
        uint64_t errors_total_icmp_echo_replies;
        uint64_t errors_total_malformed_encapsulation;
        uint64_t errors_total_encap_adjust_failed;
        uint64_t errors_total_encap_buffer_too_small;
        uint64_t errors_total_redirect_loop;
        uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
        INVALID = 0,
        UNKNOWN,
        ECHO_REQUEST,
        SYN,
        SYN_COOKIE,
        ESTABLISHED,
} verdict_t;

typedef struct {
        uint16_t src, dst;
} flow_ports_t;

_Static_assert(
        sizeof(flow_ports_t) ==
                offsetofend(struct bpf_sock_tuple, ipv4.dport) -
                        offsetof(struct bpf_sock_tuple, ipv4.sport),
        "flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
        sizeof(flow_ports_t) ==
                offsetofend(struct bpf_sock_tuple, ipv6.dport) -
                        offsetof(struct bpf_sock_tuple, ipv6.sport),
        "flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while still allowing functions to return TC_ACT_OK, TC_ACT_SHOT, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t.
 */
#define MAYBE_RETURN(x)                           \
        do {                                      \
                ret_t __ret = x;                  \
                if (__ret != CONTINUE_PROCESSING) \
                        return __ret;             \
        } while (0)
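
/* Illustrative usage, mirroring get_next_hop() below: a helper returns
 * CONTINUE_PROCESSING to mean "no verdict yet", and any other value is
 * propagated as the program's return code:
 *
 *     MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *     // reached only if the helper returned CONTINUE_PROCESSING
 */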

/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
        struct __sk_buff *skb;
        net_ptr head;
        /* NB: tail mustn't have alignment other than 1, otherwise
         * LLVM will go and eliminate code, e.g. when checking packet lengths.
         */
        uint8_t *const tail;
} buf_t;
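
/* A buf_t is initialized from the skb's linear area, as done in
 * cls_redirect() below:
 *
 *     buf_t pkt = {
 *             .skb = skb,
 *             .head = (uint8_t *)(long)skb->data,
 *             .tail = (uint8_t *)(long)skb->data_end,
 *     };
 */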

static __always_inline size_t buf_off(const buf_t *buf)
{
        /* Clang seems to optimize constructs like
         *    a - b + c
         * if c is known:
         *    r? = c
         *    r? -= b
         *    r? += a
         *
         * This is a problem if a and b are packet pointers,
         * since the verifier allows subtracting two pointers to
         * get a scalar, but not a scalar and a pointer.
         *
         * Use inline asm to break this optimization.
         */
        size_t off = (size_t)buf->head;
        asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
        return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
        if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
                return false;
        }

        buf->head += len;
        return true;
}
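
/* NB: buf_copy() advances head even when it reads via bpf_skb_load_bytes,
 * so head may end up past tail. Subsequent buf_assign() calls then take
 * the scratch path, and buf_off() still yields the correct logical
 * parse offset.
 */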

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
        /* Check whether off + len is valid in the non-linear part. */
        if (buf_off(buf) + len > buf->skb->len) {
                return false;
        }

        buf->head += len;
        return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
        if (buf->head + len > buf->tail) {
                if (scratch == NULL) {
                        return NULL;
                }

                return buf_copy(buf, scratch, len) ? scratch : NULL;
        }

        void *ptr = buf->head;
        buf->head += len;
        return ptr;
}
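
/* Typical usage, as in process_udp() below: parse a header from the
 * linear area when possible, falling back to a copy onto the stack.
 *
 *     struct udphdr _udp;
 *     struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
 *     if (udph == NULL)
 *             return INVALID;
 */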

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
        if (ipv4->ihl <= 5) {
                return true;
        }

        return buf_skip(buf, (ipv4->ihl - 5) * 4);
}
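
/* ihl counts 32-bit words, so e.g. ihl == 7 means (7 - 5) * 4 == 8
 * bytes of options follow the fixed 20-byte header.
 */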

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
        uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
        return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
        struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
        if (ipv4 == NULL) {
                return NULL;
        }

        if (ipv4->ihl < 5) {
                return NULL;
        }

        if (!pkt_skip_ipv4_options(pkt, ipv4)) {
                return NULL;
        }

        return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
        if (!buf_copy(pkt, ports, sizeof(*ports))) {
                return false;
        }

        /* Ports in the L4 headers are reversed, since we are parsing an ICMP
         * payload which is going towards the eyeball.
         */
        uint16_t dst = ports->src;
        ports->src = ports->dst;
        ports->dst = dst;
        return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
        /* The highest reasonable value for an IPv4 header
         * checksum requires two folds, so we just do that always.
         */
        csum = (csum & 0xffff) + (csum >> 16);
        csum = (csum & 0xffff) + (csum >> 16);
        return (uint16_t)~csum;
}
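
/* Worked example, using the maximum accumulator value noted in
 * pkt_ipv4_checksum() below:
 *
 *     0x8fff7 -> (0xfff7 + 0x8) == 0xffff
 *     0xffff  -> (0xffff + 0x0) == 0xffff, so ~csum == 0x0000
 *
 * The second fold matters for sums like 0x1ffff, where the first fold
 * produces 0x10000 and thus still carries.
 */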

static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
        iph->check = 0;

        /* An IP header without options is 20 bytes. Two of those
         * are the checksum, which we always set to zero. Hence,
         * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
         * which fits in 32 bit.
         */
        _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
        uint32_t acc = 0;
        uint16_t *ipw = (uint16_t *)iph;

#pragma clang loop unroll(full)
        for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
                acc += ipw[i];
        }

        iph->check = pkt_checksum_fold(acc);
}

static INLINING
bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
                                     const struct ipv6hdr *ipv6,
                                     uint8_t *upper_proto,
                                     bool *is_fragment)
{
        /* We understand five extension headers.
         * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
         * headers should occur once, except Destination Options, which may
         * occur twice. Hence we give up after 6 headers.
         */
        struct {
                uint8_t next;
                uint8_t len;
        } exthdr = {
                .next = ipv6->nexthdr,
        };
        *is_fragment = false;

#pragma clang loop unroll(full)
        for (int i = 0; i < 6; i++) {
                switch (exthdr.next) {
                case IPPROTO_FRAGMENT:
                        *is_fragment = true;
                        /* NB: We don't check that hdrlen == 0 as per spec. */
                        /* fallthrough; */

                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING:
                case IPPROTO_DSTOPTS:
                case IPPROTO_MH:
                        if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
                                return false;
                        }

                        /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
                        if (!buf_skip(pkt,
                                      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
                                return false;
                        }

                        /* Decode next header */
                        break;

                default:
                        /* The next header is not one of the known extension
                         * headers, treat it as the upper layer header.
                         *
                         * This handles IPPROTO_NONE.
                         *
                         * Encapsulating Security Payload (50) and Authentication
                         * Header (51) also end up here (and will trigger an
                         * unknown proto error later). They have a custom header
                         * format and seem too esoteric to care about.
                         */
                        *upper_proto = exthdr.next;
                        return true;
                }
        }

        /* We never found an upper layer header. */
        return false;
}
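
/* Worked example: a Routing header with exthdr.len == 1 spans
 * (1 + 1) * 8 == 16 octets. buf_copy() above consumes the first two
 * octets (next header and length), so buf_skip() covers the remaining
 * 16 - sizeof(exthdr) == 14 octets.
 */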

/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
               bool *is_fragment)
{
        struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
        if (ipv6 == NULL) {
                return NULL;
        }

        if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
                return NULL;
        }

        return ipv6;
}

/* Global metrics, per CPU
 */
struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __type(key, unsigned int);
        __type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
        unsigned int key = 0;
        return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
        const int payload_off =
                sizeof(*encap) +
                sizeof(struct in_addr) * encap->unigue.hop_count;
        int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

        /* Change the ethertype if the encapsulated packet is IPv6. */
        if (encap->gue.proto_ctype == IPPROTO_IPV6) {
                encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
        }

        if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
                                BPF_F_ADJ_ROOM_FIXED_GSO |
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
            bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
                return TC_ACT_SHOT;

        return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}

static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
                                       struct in_addr *next_hop, metrics_t *metrics)
{
        metrics->forwarded_packets_total_gre++;

        const int payload_off =
                sizeof(*encap) +
                sizeof(struct in_addr) * encap->unigue.hop_count;
        int32_t encap_overhead =
                payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
        int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
        uint16_t proto = ETH_P_IP;
        uint32_t mtu_len = 0;

        /* Loop protection: the inner packet's TTL is decremented as a safeguard
         * against any forwarding loop. As the only interesting field is the TTL
         * (hop limit for IPv6), it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
         * as they handle the split packets if needed (no need for the data to be
         * in the linear section).
         */
        if (encap->gue.proto_ctype == IPPROTO_IPV6) {
                proto = ETH_P_IPV6;
                uint8_t ttl;
                int rc;

                rc = bpf_skb_load_bytes(
                        skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
                        &ttl, 1);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                if (ttl == 0) {
                        metrics->errors_total_redirect_loop++;
                        return TC_ACT_SHOT;
                }

                ttl--;
                rc = bpf_skb_store_bytes(
                        skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
                        &ttl, 1, 0);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }
        } else {
                uint8_t ttl;
                int rc;

                rc = bpf_skb_load_bytes(
                        skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
                        1);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                if (ttl == 0) {
                        metrics->errors_total_redirect_loop++;
                        return TC_ACT_SHOT;
                }

                /* IPv4 also has a checksum to patch. While the TTL is only one byte,
                 * this helper only works for 2- and 4-byte arguments (the result is
                 * the same).
                 */
                rc = bpf_l3_csum_replace(
                        skb, payload_off + offsetof(struct iphdr, check), ttl,
                        ttl - 1, 2);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }

                ttl--;
                rc = bpf_skb_store_bytes(
                        skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
                        0);
                if (rc != 0) {
                        metrics->errors_total_malformed_encapsulation++;
                        return TC_ACT_SHOT;
                }
        }

        if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
                metrics->errors_total_encap_mtu_violate++;
                return TC_ACT_SHOT;
        }

        if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
                                BPF_F_ADJ_ROOM_FIXED_GSO |
                                BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
            bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
                metrics->errors_total_encap_adjust_failed++;
                return TC_ACT_SHOT;
        }

        if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
                metrics->errors_total_encap_buffer_too_small++;
                return TC_ACT_SHOT;
        }

        buf_t pkt = {
                .skb = skb,
                .head = (uint8_t *)(long)skb->data,
                .tail = (uint8_t *)(long)skb->data_end,
        };

        encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
        if (encap_gre == NULL) {
                metrics->errors_total_encap_buffer_too_small++;
                return TC_ACT_SHOT;
        }

        encap_gre->ip.protocol = IPPROTO_GRE;
        encap_gre->ip.daddr = next_hop->s_addr;
        encap_gre->ip.saddr = ENCAPSULATION_IP;
        encap_gre->ip.tot_len =
                bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
        encap_gre->gre.flags = 0;
        encap_gre->gre.protocol = bpf_htons(proto);
        pkt_ipv4_checksum((void *)&encap_gre->ip);

        return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
                                          struct in_addr *next_hop, metrics_t *metrics)
{
        /* Swap the L2 addresses. This assumes that packets are received
         * from a router, so swapping the MAC addresses here sends the packet
         * back to the router, which will forward it to the appropriate machine.
         */
        unsigned char temp[ETH_ALEN];
        memcpy(temp, encap->eth.h_dest, sizeof(temp));
        memcpy(encap->eth.h_dest, encap->eth.h_source,
               sizeof(encap->eth.h_dest));
        memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

        if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
            encap->unigue.last_hop_gre) {
                return forward_with_gre(skb, encap, next_hop, metrics);
        }

        metrics->forwarded_packets_total_gue++;
        uint32_t old_saddr = encap->ip.saddr;
        encap->ip.saddr = encap->ip.daddr;
        encap->ip.daddr = next_hop->s_addr;
        if (encap->unigue.next_hop < encap->unigue.hop_count) {
                encap->unigue.next_hop++;
        }

        /* Remove ip->saddr, add next_hop->s_addr */
        const uint64_t off = offsetof(typeof(*encap), ip.check);
        int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
        if (ret < 0) {
                return TC_ACT_SHOT;
        }

        return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
        switch (n) {
        case 1:
                if (!buf_skip(pkt, sizeof(struct in_addr)))
                        return TC_ACT_SHOT;
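                /* fall through: the single next hop has been consumed */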
        case 0:
                return CONTINUE_PROCESSING;

        default:
                return TC_ACT_SHOT;
        }
}

/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
                                   struct in_addr *next_hop)
{
        if (encap->unigue.next_hop > encap->unigue.hop_count) {
                return TC_ACT_SHOT;
        }

        /* Skip "used" next hops. */
        MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

        if (encap->unigue.next_hop == encap->unigue.hop_count) {
                /* No more next hops, we are at the end of the GLB header. */
                next_hop->s_addr = 0;
                return CONTINUE_PROCESSING;
        }

        if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
                return TC_ACT_SHOT;
        }

        /* Skip the remaining next hops (may be zero). */
        return skip_next_hops(pkt, encap->unigue.hop_count -
                                           encap->unigue.next_hop - 1);
}
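
/* Worked example: hop_count == 3, next_hop == 1. skip_next_hops()
 * consumes hop 0, buf_copy() reads hop 1 into next_hop, and the final
 * skip_next_hops() consumes the one remaining hop, leaving pkt
 * positioned just after the GLB header.
 */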

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
                                    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
        switch (iphlen) {
        case sizeof(struct iphdr): {
                struct iphdr *ipv4 = (struct iphdr *)iph;
                tuple->ipv4.daddr = ipv4->daddr;
                tuple->ipv4.saddr = ipv4->saddr;
                tuple->ipv4.sport = sport;
                tuple->ipv4.dport = dport;
                return sizeof(tuple->ipv4);
        }

        case sizeof(struct ipv6hdr): {
                struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
                memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
                       sizeof(tuple->ipv6.daddr));
                memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
                       sizeof(tuple->ipv6.saddr));
                tuple->ipv6.sport = sport;
                tuple->ipv6.dport = dport;
                return sizeof(tuple->ipv6);
        }

        default:
                return 0;
        }
}
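
/* Illustrative call, as in process_tcp() below:
 *
 *     struct bpf_sock_tuple tuple;
 *     uint64_t tuplen = fill_tuple(&tuple, ipv4, sizeof(*ipv4),
 *                                  tcp->source, tcp->dest);
 *
 * Because sizeof(*ipv4) is a compile-time constant, the verifier can
 * track tuplen and prove that the later socket lookup stays in bounds.
 */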

static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
                                       struct bpf_sock_tuple *tuple, uint64_t tuplen,
                                       void *iph, struct tcphdr *tcp)
{
        struct bpf_sock *sk =
                bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        if (sk == NULL) {
                return UNKNOWN;
        }

        if (sk->state != BPF_TCP_LISTEN) {
                bpf_sk_release(sk);
                return ESTABLISHED;
        }

        if (iph != NULL && tcp != NULL) {
                /* Kludge: we've run out of arguments, but need the length of the ip header. */
                uint64_t iphlen = sizeof(struct iphdr);
                if (tuplen == sizeof(tuple->ipv6)) {
                        iphlen = sizeof(struct ipv6hdr);
                }

                if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
                                            sizeof(*tcp)) == 0) {
                        bpf_sk_release(sk);
                        return SYN_COOKIE;
                }
        }

        bpf_sk_release(sk);
        return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
                                       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
        struct bpf_sock *sk =
                bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
        if (sk == NULL) {
                return UNKNOWN;
        }

        if (sk->state == BPF_TCP_ESTABLISHED) {
                bpf_sk_release(sk);
                return ESTABLISHED;
        }

        bpf_sk_release(sk);
        return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
                                        struct bpf_sock_tuple *tuple, uint64_t tuplen,
                                        metrics_t *metrics)
{
        switch (proto) {
        case IPPROTO_TCP:
                return classify_tcp(skb, tuple, tuplen, NULL, NULL);

        case IPPROTO_UDP:
                return classify_udp(skb, tuple, tuplen);

        default:
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }
}

static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
        struct icmphdr icmp;
        if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }

        /* We should never receive encapsulated echo replies. */
        if (icmp.type == ICMP_ECHOREPLY) {
                metrics->errors_total_icmp_echo_replies++;
                return INVALID;
        }

        if (icmp.type == ICMP_ECHO) {
                return ECHO_REQUEST;
        }

        if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
                metrics->errors_total_unwanted_icmp++;
                return INVALID;
        }

        struct iphdr _ip4;
        const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
        if (ipv4 == NULL) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        /* The source address in the outer IP header is from the entity that
         * originated the ICMP message. Use the original IP header to restore
         * the correct flow tuple.
         */
        struct bpf_sock_tuple tuple;
        tuple.ipv4.saddr = ipv4->daddr;
        tuple.ipv4.daddr = ipv4->saddr;

        if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
                             sizeof(tuple.ipv4), metrics);
}

static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
        struct icmp6hdr icmp6;
        if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
                metrics->errors_total_malformed_icmp++;
                return INVALID;
        }

        /* We should never receive encapsulated echo replies. */
        if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
                metrics->errors_total_icmp_echo_replies++;
                return INVALID;
        }

        if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
                return ECHO_REQUEST;
        }

        if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
                metrics->errors_total_unwanted_icmp++;
                return INVALID;
        }

        bool is_fragment;
        uint8_t l4_proto;
        struct ipv6hdr _ipv6;
        const struct ipv6hdr *ipv6 =
                pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
        if (ipv6 == NULL) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        if (is_fragment) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        /* Swap source and dest addresses. */
        struct bpf_sock_tuple tuple;
        memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
        memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

        if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
                metrics->errors_total_malformed_icmp_pkt_too_big++;
                return INVALID;
        }

        return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
                             metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
                                      metrics_t *metrics)
{
        metrics->l4_protocol_packets_total_tcp++;

        struct tcphdr _tcp;
        struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
        if (tcp == NULL) {
                metrics->errors_total_malformed_tcp++;
                return INVALID;
        }

        if (tcp->syn) {
                return SYN;
        }

        struct bpf_sock_tuple tuple;
        uint64_t tuplen =
                fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
        return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
                                      metrics_t *metrics)
{
        metrics->l4_protocol_packets_total_udp++;

        struct udphdr _udp;
        struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
        if (udph == NULL) {
                metrics->errors_total_malformed_udp++;
                return INVALID;
        }

        struct bpf_sock_tuple tuple;
        uint64_t tuplen =
                fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
        return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
        metrics->l3_protocol_packets_total_ipv4++;

        struct iphdr _ip4;
        struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
        if (ipv4 == NULL) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv4->version != 4) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv4_is_fragment(ipv4)) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        switch (ipv4->protocol) {
        case IPPROTO_ICMP:
                return process_icmpv4(pkt, metrics);

        case IPPROTO_TCP:
                return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

        case IPPROTO_UDP:
                return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

        default:
                metrics->errors_total_unknown_l4_proto++;
                return INVALID;
        }
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
        metrics->l3_protocol_packets_total_ipv6++;

        uint8_t l4_proto;
        bool is_fragment;
        struct ipv6hdr _ipv6;
        struct ipv6hdr *ipv6 =
                pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
        if (ipv6 == NULL) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (ipv6->version != 6) {
                metrics->errors_total_malformed_ip++;
                return INVALID;
        }

        if (is_fragment) {
                metrics->errors_total_fragmented_ip++;
                return INVALID;
        }

        switch (l4_proto) {
        case IPPROTO_ICMPV6:
                return process_icmpv6(pkt, metrics);

        case IPPROTO_TCP:
                return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

        case IPPROTO_UDP:
                return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

        default:
                metrics->errors_total_unknown_l4_proto++;
                return INVALID;
        }
}

SEC("classifier/cls_redirect")
int cls_redirect(struct __sk_buff *skb)
{
        metrics_t *metrics = get_global_metrics();
        if (metrics == NULL) {
                return TC_ACT_SHOT;
        }

        metrics->processed_packets_total++;

        /* Pass bogus packets as long as we're not sure they're
         * destined for us.
         */
        if (skb->protocol != bpf_htons(ETH_P_IP)) {
                return TC_ACT_OK;
        }

        encap_headers_t *encap;

        /* Make sure that all encapsulation headers are available in
         * the linear portion of the skb. This makes it easy to manipulate them.
         */
        if (bpf_skb_pull_data(skb, sizeof(*encap))) {
                return TC_ACT_OK;
        }

        buf_t pkt = {
                .skb = skb,
                .head = (uint8_t *)(long)skb->data,
                .tail = (uint8_t *)(long)skb->data_end,
        };

        encap = buf_assign(&pkt, sizeof(*encap), NULL);
        if (encap == NULL) {
                return TC_ACT_OK;
        }

        if (encap->ip.ihl != 5) {
                /* We never have any options. */
                return TC_ACT_OK;
        }

        if (encap->ip.daddr != ENCAPSULATION_IP ||
            encap->ip.protocol != IPPROTO_UDP) {
                return TC_ACT_OK;
        }

        /* TODO Check UDP length? */
        if (encap->udp.dest != ENCAPSULATION_PORT) {
                return TC_ACT_OK;
        }

        /* Now that we know the packet is destined to us, we can
         * drop bogus ones.
         */
        if (ipv4_is_fragment((void *)&encap->ip)) {
                metrics->errors_total_fragmented_ip++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.variant != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.control != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.flags != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->gue.hlen !=
            sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }
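
        /* Worked example for the hlen check above (assuming
         * sizeof(encap->unigue) == 8, see test_cls_redirect.h): with
         * hop_count == 2, gue.hlen must be 8 / 4 + 2 == 4, i.e. four
         * 32-bit words of GUE payload before the encapsulated packet.
         */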

        if (encap->unigue.version != 0) {
                metrics->errors_total_malformed_encapsulation++;
                return TC_ACT_SHOT;
        }

        if (encap->unigue.reserved != 0) {
                return TC_ACT_SHOT;
        }

        struct in_addr next_hop;
        MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));

        if (next_hop.s_addr == 0) {
                metrics->accepted_packets_total_last_hop++;
                return accept_locally(skb, encap);
        }

        verdict_t verdict;
        switch (encap->gue.proto_ctype) {
        case IPPROTO_IPIP:
                verdict = process_ipv4(&pkt, metrics);
                break;

        case IPPROTO_IPV6:
                verdict = process_ipv6(&pkt, metrics);
                break;

        default:
                metrics->errors_total_unknown_l3_proto++;
                return TC_ACT_SHOT;
        }

        switch (verdict) {
        case INVALID:
                /* metrics have already been bumped */
                return TC_ACT_SHOT;

        case UNKNOWN:
                return forward_to_next_hop(skb, encap, &next_hop, metrics);

        case ECHO_REQUEST:
                metrics->accepted_packets_total_icmp_echo_request++;
                break;

        case SYN:
                if (encap->unigue.forward_syn) {
                        return forward_to_next_hop(skb, encap, &next_hop,
                                                   metrics);
                }

                metrics->accepted_packets_total_syn++;
                break;

        case SYN_COOKIE:
                metrics->accepted_packets_total_syn_cookies++;
                break;

        case ESTABLISHED:
                metrics->accepted_packets_total_established++;
                break;
        }

        return accept_locally(skb, encap);
}