linux/net/core/filter.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Linux Socket Filter - Kernel level socket filtering
   4 *
   5 * Based on the design of the Berkeley Packet Filter. The new
   6 * internal format has been designed by PLUMgrid:
   7 *
   8 *      Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
   9 *
  10 * Authors:
  11 *
  12 *      Jay Schulist <jschlst@samba.org>
  13 *      Alexei Starovoitov <ast@plumgrid.com>
  14 *      Daniel Borkmann <dborkman@redhat.com>
  15 *
  16 * Andi Kleen - Fix a few bad bugs and races.
  17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  18 */
  19
  20#include <linux/module.h>
  21#include <linux/types.h>
  22#include <linux/mm.h>
  23#include <linux/fcntl.h>
  24#include <linux/socket.h>
  25#include <linux/sock_diag.h>
  26#include <linux/in.h>
  27#include <linux/inet.h>
  28#include <linux/netdevice.h>
  29#include <linux/if_packet.h>
  30#include <linux/if_arp.h>
  31#include <linux/gfp.h>
  32#include <net/inet_common.h>
  33#include <net/ip.h>
  34#include <net/protocol.h>
  35#include <net/netlink.h>
  36#include <linux/skbuff.h>
  37#include <linux/skmsg.h>
  38#include <net/sock.h>
  39#include <net/flow_dissector.h>
  40#include <linux/errno.h>
  41#include <linux/timer.h>
  42#include <linux/uaccess.h>
  43#include <asm/unaligned.h>
  44#include <asm/cmpxchg.h>
  45#include <linux/filter.h>
  46#include <linux/ratelimit.h>
  47#include <linux/seccomp.h>
  48#include <linux/if_vlan.h>
  49#include <linux/bpf.h>
  50#include <linux/btf.h>
  51#include <net/sch_generic.h>
  52#include <net/cls_cgroup.h>
  53#include <net/dst_metadata.h>
  54#include <net/dst.h>
  55#include <net/sock_reuseport.h>
  56#include <net/busy_poll.h>
  57#include <net/tcp.h>
  58#include <net/xfrm.h>
  59#include <net/udp.h>
  60#include <linux/bpf_trace.h>
  61#include <net/xdp_sock.h>
  62#include <linux/inetdevice.h>
  63#include <net/inet_hashtables.h>
  64#include <net/inet6_hashtables.h>
  65#include <net/ip_fib.h>
  66#include <net/nexthop.h>
  67#include <net/flow.h>
  68#include <net/arp.h>
  69#include <net/ipv6.h>
  70#include <net/net_namespace.h>
  71#include <linux/seg6_local.h>
  72#include <net/seg6.h>
  73#include <net/seg6_local.h>
  74#include <net/lwtunnel.h>
  75#include <net/ipv6_stubs.h>
  76#include <net/bpf_sk_storage.h>
  77#include <net/transp_v6.h>
  78#include <linux/btf_ids.h>
  79
  80int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
  81{
  82        if (in_compat_syscall()) {
  83                struct compat_sock_fprog f32;
  84
  85                if (len != sizeof(f32))
  86                        return -EINVAL;
  87                if (copy_from_sockptr(&f32, src, sizeof(f32)))
  88                        return -EFAULT;
  89                memset(dst, 0, sizeof(*dst));
  90                dst->len = f32.len;
  91                dst->filter = compat_ptr(f32.filter);
  92        } else {
  93                if (len != sizeof(*dst))
  94                        return -EINVAL;
  95                if (copy_from_sockptr(dst, src, sizeof(*dst)))
  96                        return -EFAULT;
  97        }
  98
  99        return 0;
 100}
 101EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
 102
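/* Illustrative sketch (not part of the original file): how a setsockopt()
 * handler is expected to use the helper above. It hands the raw
 * optval/optlen pair straight through and lets the helper sort out the
 * native vs. compat sock_fprog layout. The caller below is hypothetical.
 */
static int __maybe_unused example_sol_attach_filter(struct sock *sk,
						    sockptr_t optval, int optlen)
{
	struct sock_fprog fprog;
	int err;

	err = copy_bpf_fprog_from_user(&fprog, optval, optlen);
	if (err)
		return err;

	return sk_attach_filter(&fprog, sk);
}
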
 103/**
 104 *      sk_filter_trim_cap - run a packet through a socket filter
 105 *      @sk: sock associated with &sk_buff
 106 *      @skb: buffer to filter
 107 *      @cap: limit on how short the eBPF program may trim the packet
 108 *
  109 * Run the eBPF program and then cut skb->data to the correct size returned
  110 * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
  111 * than pkt_len we keep the whole skb->data. This is the socket-level
  112 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
  113 * be accepted or -EPERM if the packet should be tossed.
 114 *
 115 */
 116int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 117{
 118        int err;
 119        struct sk_filter *filter;
 120
 121        /*
 122         * If the skb was allocated from pfmemalloc reserves, only
 123         * allow SOCK_MEMALLOC sockets to use it as this socket is
 124         * helping free memory
 125         */
 126        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
 127                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
 128                return -ENOMEM;
 129        }
 130        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
 131        if (err)
 132                return err;
 133
 134        err = security_sock_rcv_skb(sk, skb);
 135        if (err)
 136                return err;
 137
 138        rcu_read_lock();
 139        filter = rcu_dereference(sk->sk_filter);
 140        if (filter) {
 141                struct sock *save_sk = skb->sk;
 142                unsigned int pkt_len;
 143
 144                skb->sk = sk;
 145                pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
 146                skb->sk = save_sk;
 147                err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
 148        }
 149        rcu_read_unlock();
 150
 151        return err;
 152}
 153EXPORT_SYMBOL(sk_filter_trim_cap);
 154
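/* Minimal usage sketch (the caller below is hypothetical): run the socket
 * filter with a cap of one byte, so the program may trim the payload but
 * never below 1, and drop the skb if the filter rejects it. This mirrors
 * what the sk_filter() wrapper in <linux/filter.h> does with cap == 1.
 */
static int __maybe_unused example_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err;

	/* cap == 1: the program may shorten the packet, never to zero bytes */
	err = sk_filter_trim_cap(sk, skb, 1);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return sock_queue_rcv_skb(sk, skb);
}
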
 155BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
 156{
 157        return skb_get_poff(skb);
 158}
 159
 160BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
 161{
 162        struct nlattr *nla;
 163
 164        if (skb_is_nonlinear(skb))
 165                return 0;
 166
 167        if (skb->len < sizeof(struct nlattr))
 168                return 0;
 169
 170        if (a > skb->len - sizeof(struct nlattr))
 171                return 0;
 172
 173        nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
 174        if (nla)
 175                return (void *) nla - (void *) skb->data;
 176
 177        return 0;
 178}
 179
 180BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
 181{
 182        struct nlattr *nla;
 183
 184        if (skb_is_nonlinear(skb))
 185                return 0;
 186
 187        if (skb->len < sizeof(struct nlattr))
 188                return 0;
 189
 190        if (a > skb->len - sizeof(struct nlattr))
 191                return 0;
 192
 193        nla = (struct nlattr *) &skb->data[a];
 194        if (nla->nla_len > skb->len - a)
 195                return 0;
 196
 197        nla = nla_find_nested(nla, x);
 198        if (nla)
 199                return (void *) nla - (void *) skb->data;
 200
 201        return 0;
 202}
 203
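/* Illustrative classic BPF usage of the two helpers above (the attribute
 * type and offset below are hypothetical): look for a top-level netlink
 * attribute of type 3 starting at byte 16 of the packet, and accept the
 * packet only if such an attribute exists.
 *
 *	struct sock_filter prog[] = {
 *		BPF_STMT(BPF_LDX | BPF_IMM, 3),			// X = wanted nla_type
 *		BPF_STMT(BPF_LD  | BPF_IMM, 16),		// A = offset of the attribute block
 *		BPF_STMT(BPF_LD  | BPF_W | BPF_ABS,
 *			 SKF_AD_OFF + SKF_AD_NLATTR),		// A = offset of match, 0 if absent
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, 1),	// A == 0 -> reject
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *	};
 *
 * SKF_AD_NLATTR_NEST works the same way, but searches one nesting level
 * below the attribute found at offset A.
 */
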
 204BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
 205           data, int, headlen, int, offset)
 206{
 207        u8 tmp, *ptr;
 208        const int len = sizeof(tmp);
 209
 210        if (offset >= 0) {
 211                if (headlen - offset >= len)
 212                        return *(u8 *)(data + offset);
 213                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 214                        return tmp;
 215        } else {
 216                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 217                if (likely(ptr))
 218                        return *(u8 *)ptr;
 219        }
 220
 221        return -EFAULT;
 222}
 223
 224BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
 225           int, offset)
 226{
 227        return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
 228                                         offset);
 229}
 230
 231BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
 232           data, int, headlen, int, offset)
 233{
 234        u16 tmp, *ptr;
 235        const int len = sizeof(tmp);
 236
 237        if (offset >= 0) {
 238                if (headlen - offset >= len)
 239                        return get_unaligned_be16(data + offset);
 240                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 241                        return be16_to_cpu(tmp);
 242        } else {
 243                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 244                if (likely(ptr))
 245                        return get_unaligned_be16(ptr);
 246        }
 247
 248        return -EFAULT;
 249}
 250
 251BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
 252           int, offset)
 253{
 254        return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
 255                                          offset);
 256}
 257
 258BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
 259           data, int, headlen, int, offset)
 260{
 261        u32 tmp, *ptr;
 262        const int len = sizeof(tmp);
 263
 264        if (likely(offset >= 0)) {
 265                if (headlen - offset >= len)
 266                        return get_unaligned_be32(data + offset);
 267                if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
 268                        return be32_to_cpu(tmp);
 269        } else {
 270                ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
 271                if (likely(ptr))
 272                        return get_unaligned_be32(ptr);
 273        }
 274
 275        return -EFAULT;
 276}
 277
 278BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
 279           int, offset)
 280{
 281        return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
 282                                          offset);
 283}
 284
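/* Worked example for the load helpers above (values are illustrative): for
 * a linear IPv4-over-Ethernet skb, a classic "ldh [12]" ends up in the
 * 16-bit helper with offset 12 and returns the EtherType in host byte order
 * (0x0800). A negative offset such as SKF_NET_OFF + 9 bypasses the direct
 * path and is resolved by bpf_internal_load_pointer_neg_helper(), which
 * reads relative to the network header (here: the IPv4 protocol byte),
 * independent of the link-layer framing. On failure the helpers return
 * -EFAULT, which the converted program turns into a return value of 0,
 * i.e. "drop".
 */
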
 285static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 286                              struct bpf_insn *insn_buf)
 287{
 288        struct bpf_insn *insn = insn_buf;
 289
 290        switch (skb_field) {
 291        case SKF_AD_MARK:
 292                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
 293
 294                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
 295                                      offsetof(struct sk_buff, mark));
 296                break;
 297
 298        case SKF_AD_PKTTYPE:
 299                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
 300                *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
 301#ifdef __BIG_ENDIAN_BITFIELD
 302                *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
 303#endif
 304                break;
 305
 306        case SKF_AD_QUEUE:
 307                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
 308
 309                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 310                                      offsetof(struct sk_buff, queue_mapping));
 311                break;
 312
 313        case SKF_AD_VLAN_TAG:
 314                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
 315
 316                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
 317                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 318                                      offsetof(struct sk_buff, vlan_tci));
 319                break;
 320        case SKF_AD_VLAN_TAG_PRESENT:
 321                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET());
 322                if (PKT_VLAN_PRESENT_BIT)
 323                        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT);
 324                if (PKT_VLAN_PRESENT_BIT < 7)
 325                        *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
 326                break;
 327        }
 328
 329        return insn - insn_buf;
 330}
 331
 332static bool convert_bpf_extensions(struct sock_filter *fp,
 333                                   struct bpf_insn **insnp)
 334{
 335        struct bpf_insn *insn = *insnp;
 336        u32 cnt;
 337
 338        switch (fp->k) {
 339        case SKF_AD_OFF + SKF_AD_PROTOCOL:
 340                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
 341
 342                /* A = *(u16 *) (CTX + offsetof(protocol)) */
 343                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 344                                      offsetof(struct sk_buff, protocol));
 345                /* A = ntohs(A) [emitting a nop or swap16] */
 346                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 347                break;
 348
 349        case SKF_AD_OFF + SKF_AD_PKTTYPE:
 350                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
 351                insn += cnt - 1;
 352                break;
 353
 354        case SKF_AD_OFF + SKF_AD_IFINDEX:
 355        case SKF_AD_OFF + SKF_AD_HATYPE:
 356                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
 357                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
 358
 359                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
 360                                      BPF_REG_TMP, BPF_REG_CTX,
 361                                      offsetof(struct sk_buff, dev));
 362                /* if (tmp != 0) goto pc + 1 */
 363                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
 364                *insn++ = BPF_EXIT_INSN();
 365                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
 366                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
 367                                            offsetof(struct net_device, ifindex));
 368                else
 369                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
 370                                            offsetof(struct net_device, type));
 371                break;
 372
 373        case SKF_AD_OFF + SKF_AD_MARK:
 374                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
 375                insn += cnt - 1;
 376                break;
 377
 378        case SKF_AD_OFF + SKF_AD_RXHASH:
 379                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
 380
 381                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 382                                    offsetof(struct sk_buff, hash));
 383                break;
 384
 385        case SKF_AD_OFF + SKF_AD_QUEUE:
 386                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
 387                insn += cnt - 1;
 388                break;
 389
 390        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
 391                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
 392                                         BPF_REG_A, BPF_REG_CTX, insn);
 393                insn += cnt - 1;
 394                break;
 395
 396        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
 397                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
 398                                         BPF_REG_A, BPF_REG_CTX, insn);
 399                insn += cnt - 1;
 400                break;
 401
 402        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
 403                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
 404
 405                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
 406                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 407                                      offsetof(struct sk_buff, vlan_proto));
 408                /* A = ntohs(A) [emitting a nop or swap16] */
 409                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 410                break;
 411
 412        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 413        case SKF_AD_OFF + SKF_AD_NLATTR:
 414        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 415        case SKF_AD_OFF + SKF_AD_CPU:
 416        case SKF_AD_OFF + SKF_AD_RANDOM:
 417                /* arg1 = CTX */
 418                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 419                /* arg2 = A */
 420                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
 421                /* arg3 = X */
 422                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
 423                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
 424                switch (fp->k) {
 425                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 426                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
 427                        break;
 428                case SKF_AD_OFF + SKF_AD_NLATTR:
 429                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
 430                        break;
 431                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 432                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
 433                        break;
 434                case SKF_AD_OFF + SKF_AD_CPU:
 435                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
 436                        break;
 437                case SKF_AD_OFF + SKF_AD_RANDOM:
 438                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
 439                        bpf_user_rnd_init_once();
 440                        break;
 441                }
 442                break;
 443
 444        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
 445                /* A ^= X */
 446                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
 447                break;
 448
 449        default:
 450                /* This is just a dummy call to avoid letting the compiler
 451                 * evict __bpf_call_base() as an optimization. Placed here
 452                 * where no-one bothers.
 453                 */
 454                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
 455                return false;
 456        }
 457
 458        *insnp = insn;
 459        return true;
 460}
 461
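/* Illustrative sketch of the classic BPF idiom handled above: a cBPF
 * program reads an ancillary field by loading from the magic SKF_AD_OFF
 * offsets, e.g. "accept only packets whose skb->mark equals 1":
 *
 *	struct sock_filter prog[] = {
 *		BPF_STMT(BPF_LD  | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_MARK),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 1, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *
 * convert_bpf_extensions() rewrites the first load into a direct
 * BPF_LDX_MEM of skb->mark (via convert_skb_access()) instead of a call,
 * while extensions such as SKF_AD_NLATTR or SKF_AD_CPU are turned into
 * calls to the BPF_CALL_*() helpers defined earlier in this file.
 */
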
 462static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
 463{
 464        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
 465        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
 466        bool endian = BPF_SIZE(fp->code) == BPF_H ||
 467                      BPF_SIZE(fp->code) == BPF_W;
 468        bool indirect = BPF_MODE(fp->code) == BPF_IND;
 469        const int ip_align = NET_IP_ALIGN;
 470        struct bpf_insn *insn = *insnp;
 471        int offset = fp->k;
 472
 473        if (!indirect &&
 474            ((unaligned_ok && offset >= 0) ||
 475             (!unaligned_ok && offset >= 0 &&
 476              offset + ip_align >= 0 &&
 477              offset + ip_align % size == 0))) {
 478                bool ldx_off_ok = offset <= S16_MAX;
 479
 480                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
 481                if (offset)
 482                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
 483                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
 484                                      size, 2 + endian + (!ldx_off_ok * 2));
 485                if (ldx_off_ok) {
 486                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
 487                                              BPF_REG_D, offset);
 488                } else {
 489                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
 490                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
 491                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
 492                                              BPF_REG_TMP, 0);
 493                }
 494                if (endian)
 495                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
 496                *insn++ = BPF_JMP_A(8);
 497        }
 498
 499        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 500        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
 501        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
 502        if (!indirect) {
 503                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
 504        } else {
 505                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
 506                if (fp->k)
 507                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
 508        }
 509
 510        switch (BPF_SIZE(fp->code)) {
 511        case BPF_B:
 512                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
 513                break;
 514        case BPF_H:
 515                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
 516                break;
 517        case BPF_W:
 518                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
 519                break;
 520        default:
 521                return false;
 522        }
 523
 524        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
 525        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 526        *insn   = BPF_EXIT_INSN();
 527
 528        *insnp = insn;
 529        return true;
 530}
 531
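/* Rough shape of what convert_bpf_ld_abs() emits for a classic "ldh [12]"
 * (BPF_LD | BPF_H | BPF_ABS, k = 12), written as simplified pseudo-C; D and
 * H are the callee-saved registers caching skb->data and the headlen
 * (skb->len - skb->data_len) set up in the bpf_convert_filter() prologue:
 *
 *	if (H - 12 < sizeof(u16))		// not enough linear data
 *		goto slow_path;
 *	A = be16_to_cpu(*(u16 *)(D + 12));	// fast path: direct load
 *	goto done;
 *   slow_path:
 *	A = bpf_skb_load_helper_16(CTX, D, H, 12);
 *	if (A < 0)				// helper returned -EFAULT
 *		return 0;			// cBPF semantics: drop
 *   done:
 *	...
 */
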
 532/**
 533 *      bpf_convert_filter - convert filter program
 534 *      @prog: the user passed filter program
 535 *      @len: the length of the user passed filter program
 536 *      @new_prog: allocated 'struct bpf_prog' or NULL
 537 *      @new_len: pointer to store length of converted program
 538 *      @seen_ld_abs: bool whether we've seen ld_abs/ind
 539 *
 540 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 541 * style extended BPF (eBPF).
 542 * Conversion workflow:
 543 *
 544 * 1) First pass for calculating the new program length:
 545 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 546 *
  547 * 2) 2nd call to do the actual remapping, internally in two passes: the
  548 *    1st pass finds the new jump offsets, the 2nd pass does the remapping:
 549 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 550 */
 551static int bpf_convert_filter(struct sock_filter *prog, int len,
 552                              struct bpf_prog *new_prog, int *new_len,
 553                              bool *seen_ld_abs)
 554{
 555        int new_flen = 0, pass = 0, target, i, stack_off;
 556        struct bpf_insn *new_insn, *first_insn = NULL;
 557        struct sock_filter *fp;
 558        int *addrs = NULL;
 559        u8 bpf_src;
 560
 561        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
 562        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
 563
 564        if (len <= 0 || len > BPF_MAXINSNS)
 565                return -EINVAL;
 566
 567        if (new_prog) {
 568                first_insn = new_prog->insnsi;
 569                addrs = kcalloc(len, sizeof(*addrs),
 570                                GFP_KERNEL | __GFP_NOWARN);
 571                if (!addrs)
 572                        return -ENOMEM;
 573        }
 574
 575do_pass:
 576        new_insn = first_insn;
 577        fp = prog;
 578
 579        /* Classic BPF related prologue emission. */
 580        if (new_prog) {
 581                /* Classic BPF expects A and X to be reset first. These need
 582                 * to be guaranteed to be the first two instructions.
 583                 */
 584                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 585                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 586
 587                /* All programs must keep CTX in callee saved BPF_REG_CTX.
  588                 * In the eBPF case it's done by the compiler; here we need to
  589                 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
 590                 */
 591                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
 592                if (*seen_ld_abs) {
 593                        /* For packet access in classic BPF, cache skb->data
 594                         * in callee-saved BPF R8 and skb->len - skb->data_len
 595                         * (headlen) in BPF R9. Since classic BPF is read-only
 596                         * on CTX, we only need to cache it once.
 597                         */
 598                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
 599                                                  BPF_REG_D, BPF_REG_CTX,
 600                                                  offsetof(struct sk_buff, data));
 601                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
 602                                                  offsetof(struct sk_buff, len));
 603                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
 604                                                  offsetof(struct sk_buff, data_len));
 605                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
 606                }
 607        } else {
 608                new_insn += 3;
 609        }
 610
 611        for (i = 0; i < len; fp++, i++) {
 612                struct bpf_insn tmp_insns[32] = { };
 613                struct bpf_insn *insn = tmp_insns;
 614
 615                if (addrs)
 616                        addrs[i] = new_insn - first_insn;
 617
 618                switch (fp->code) {
 619                /* All arithmetic insns and skb loads map as-is. */
 620                case BPF_ALU | BPF_ADD | BPF_X:
 621                case BPF_ALU | BPF_ADD | BPF_K:
 622                case BPF_ALU | BPF_SUB | BPF_X:
 623                case BPF_ALU | BPF_SUB | BPF_K:
 624                case BPF_ALU | BPF_AND | BPF_X:
 625                case BPF_ALU | BPF_AND | BPF_K:
 626                case BPF_ALU | BPF_OR | BPF_X:
 627                case BPF_ALU | BPF_OR | BPF_K:
 628                case BPF_ALU | BPF_LSH | BPF_X:
 629                case BPF_ALU | BPF_LSH | BPF_K:
 630                case BPF_ALU | BPF_RSH | BPF_X:
 631                case BPF_ALU | BPF_RSH | BPF_K:
 632                case BPF_ALU | BPF_XOR | BPF_X:
 633                case BPF_ALU | BPF_XOR | BPF_K:
 634                case BPF_ALU | BPF_MUL | BPF_X:
 635                case BPF_ALU | BPF_MUL | BPF_K:
 636                case BPF_ALU | BPF_DIV | BPF_X:
 637                case BPF_ALU | BPF_DIV | BPF_K:
 638                case BPF_ALU | BPF_MOD | BPF_X:
 639                case BPF_ALU | BPF_MOD | BPF_K:
 640                case BPF_ALU | BPF_NEG:
 641                case BPF_LD | BPF_ABS | BPF_W:
 642                case BPF_LD | BPF_ABS | BPF_H:
 643                case BPF_LD | BPF_ABS | BPF_B:
 644                case BPF_LD | BPF_IND | BPF_W:
 645                case BPF_LD | BPF_IND | BPF_H:
 646                case BPF_LD | BPF_IND | BPF_B:
 647                        /* Check for overloaded BPF extension and
 648                         * directly convert it if found, otherwise
 649                         * just move on with mapping.
 650                         */
 651                        if (BPF_CLASS(fp->code) == BPF_LD &&
 652                            BPF_MODE(fp->code) == BPF_ABS &&
 653                            convert_bpf_extensions(fp, &insn))
 654                                break;
 655                        if (BPF_CLASS(fp->code) == BPF_LD &&
 656                            convert_bpf_ld_abs(fp, &insn)) {
 657                                *seen_ld_abs = true;
 658                                break;
 659                        }
 660
 661                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
 662                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
 663                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
 664                                /* Error with exception code on div/mod by 0.
 665                                 * For cBPF programs, this was always return 0.
 666                                 */
 667                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
 668                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 669                                *insn++ = BPF_EXIT_INSN();
 670                        }
 671
 672                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 673                        break;
 674
 675                /* Jump transformation cannot use BPF block macros
 676                 * everywhere as offset calculation and target updates
 677                 * require a bit more work than the rest, i.e. jump
 678                 * opcodes map as-is, but offsets need adjustment.
 679                 */
 680
 681#define BPF_EMIT_JMP                                                    \
 682        do {                                                            \
 683                const s32 off_min = S16_MIN, off_max = S16_MAX;         \
 684                s32 off;                                                \
 685                                                                        \
 686                if (target >= len || target < 0)                        \
 687                        goto err;                                       \
 688                off = addrs ? addrs[target] - addrs[i] - 1 : 0;         \
 689                /* Adjust pc relative offset for 2nd or 3rd insn. */    \
 690                off -= insn - tmp_insns;                                \
 691                /* Reject anything not fitting into insn->off. */       \
 692                if (off < off_min || off > off_max)                     \
 693                        goto err;                                       \
 694                insn->off = off;                                        \
 695        } while (0)
 696
 697                case BPF_JMP | BPF_JA:
 698                        target = i + fp->k + 1;
 699                        insn->code = fp->code;
 700                        BPF_EMIT_JMP;
 701                        break;
 702
 703                case BPF_JMP | BPF_JEQ | BPF_K:
 704                case BPF_JMP | BPF_JEQ | BPF_X:
 705                case BPF_JMP | BPF_JSET | BPF_K:
 706                case BPF_JMP | BPF_JSET | BPF_X:
 707                case BPF_JMP | BPF_JGT | BPF_K:
 708                case BPF_JMP | BPF_JGT | BPF_X:
 709                case BPF_JMP | BPF_JGE | BPF_K:
 710                case BPF_JMP | BPF_JGE | BPF_X:
 711                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
 712                                /* BPF immediates are signed, zero extend
 713                                 * immediate into tmp register and use it
 714                                 * in compare insn.
 715                                 */
 716                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
 717
 718                                insn->dst_reg = BPF_REG_A;
 719                                insn->src_reg = BPF_REG_TMP;
 720                                bpf_src = BPF_X;
 721                        } else {
 722                                insn->dst_reg = BPF_REG_A;
 723                                insn->imm = fp->k;
 724                                bpf_src = BPF_SRC(fp->code);
 725                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
 726                        }
 727
 728                        /* Common case where 'jump_false' is next insn. */
 729                        if (fp->jf == 0) {
 730                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 731                                target = i + fp->jt + 1;
 732                                BPF_EMIT_JMP;
 733                                break;
 734                        }
 735
 736                        /* Convert some jumps when 'jump_true' is next insn. */
 737                        if (fp->jt == 0) {
 738                                switch (BPF_OP(fp->code)) {
 739                                case BPF_JEQ:
 740                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
 741                                        break;
 742                                case BPF_JGT:
 743                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
 744                                        break;
 745                                case BPF_JGE:
 746                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
 747                                        break;
 748                                default:
 749                                        goto jmp_rest;
 750                                }
 751
 752                                target = i + fp->jf + 1;
 753                                BPF_EMIT_JMP;
 754                                break;
 755                        }
 756jmp_rest:
 757                        /* Other jumps are mapped into two insns: Jxx and JA. */
 758                        target = i + fp->jt + 1;
 759                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 760                        BPF_EMIT_JMP;
 761                        insn++;
 762
 763                        insn->code = BPF_JMP | BPF_JA;
 764                        target = i + fp->jf + 1;
 765                        BPF_EMIT_JMP;
 766                        break;
 767
  768                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
 769                case BPF_LDX | BPF_MSH | BPF_B: {
 770                        struct sock_filter tmp = {
 771                                .code   = BPF_LD | BPF_ABS | BPF_B,
 772                                .k      = fp->k,
 773                        };
 774
 775                        *seen_ld_abs = true;
 776
 777                        /* X = A */
 778                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 779                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
 780                        convert_bpf_ld_abs(&tmp, &insn);
 781                        insn++;
 782                        /* A &= 0xf */
 783                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
 784                        /* A <<= 2 */
 785                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
 786                        /* tmp = X */
 787                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
 788                        /* X = A */
 789                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 790                        /* A = tmp */
 791                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 792                        break;
 793                }
  794                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
 795                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
 796                 */
 797                case BPF_RET | BPF_A:
 798                case BPF_RET | BPF_K:
 799                        if (BPF_RVAL(fp->code) == BPF_K)
 800                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
 801                                                        0, fp->k);
 802                        *insn = BPF_EXIT_INSN();
 803                        break;
 804
 805                /* Store to stack. */
 806                case BPF_ST:
 807                case BPF_STX:
 808                        stack_off = fp->k * 4  + 4;
 809                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
 810                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
 811                                            -stack_off);
 812                        /* check_load_and_stores() verifies that classic BPF can
 813                         * load from stack only after write, so tracking
 814                         * stack_depth for ST|STX insns is enough
 815                         */
 816                        if (new_prog && new_prog->aux->stack_depth < stack_off)
 817                                new_prog->aux->stack_depth = stack_off;
 818                        break;
 819
 820                /* Load from stack. */
 821                case BPF_LD | BPF_MEM:
 822                case BPF_LDX | BPF_MEM:
 823                        stack_off = fp->k * 4  + 4;
 824                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
 825                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
 826                                            -stack_off);
 827                        break;
 828
 829                /* A = K or X = K */
 830                case BPF_LD | BPF_IMM:
 831                case BPF_LDX | BPF_IMM:
 832                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
 833                                              BPF_REG_A : BPF_REG_X, fp->k);
 834                        break;
 835
 836                /* X = A */
 837                case BPF_MISC | BPF_TAX:
 838                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 839                        break;
 840
 841                /* A = X */
 842                case BPF_MISC | BPF_TXA:
 843                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
 844                        break;
 845
 846                /* A = skb->len or X = skb->len */
 847                case BPF_LD | BPF_W | BPF_LEN:
 848                case BPF_LDX | BPF_W | BPF_LEN:
 849                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
 850                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
 851                                            offsetof(struct sk_buff, len));
 852                        break;
 853
 854                /* Access seccomp_data fields. */
 855                case BPF_LDX | BPF_ABS | BPF_W:
 856                        /* A = *(u32 *) (ctx + K) */
 857                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
 858                        break;
 859
 860                /* Unknown instruction. */
 861                default:
 862                        goto err;
 863                }
 864
 865                insn++;
 866                if (new_prog)
 867                        memcpy(new_insn, tmp_insns,
 868                               sizeof(*insn) * (insn - tmp_insns));
 869                new_insn += insn - tmp_insns;
 870        }
 871
 872        if (!new_prog) {
 873                /* Only calculating new length. */
 874                *new_len = new_insn - first_insn;
 875                if (*seen_ld_abs)
 876                        *new_len += 4; /* Prologue bits. */
 877                return 0;
 878        }
 879
 880        pass++;
 881        if (new_flen != new_insn - first_insn) {
 882                new_flen = new_insn - first_insn;
 883                if (pass > 2)
 884                        goto err;
 885                goto do_pass;
 886        }
 887
 888        kfree(addrs);
 889        BUG_ON(*new_len != new_flen);
 890        return 0;
 891err:
 892        kfree(addrs);
 893        return -EINVAL;
 894}
 895
 896/* Security:
 897 *
  898 * As we don't want to clear the mem[] array for each packet going through
  899 * __bpf_prog_run(), we check that a filter loaded by the user never tries to
  900 * read a cell that was not previously written, and we check all branches to
  901 * be sure a malicious user doesn't try to abuse us.
 902 */
 903static int check_load_and_stores(const struct sock_filter *filter, int flen)
 904{
 905        u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
 906        int pc, ret = 0;
 907
 908        BUILD_BUG_ON(BPF_MEMWORDS > 16);
 909
 910        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
 911        if (!masks)
 912                return -ENOMEM;
 913
 914        memset(masks, 0xff, flen * sizeof(*masks));
 915
 916        for (pc = 0; pc < flen; pc++) {
 917                memvalid &= masks[pc];
 918
 919                switch (filter[pc].code) {
 920                case BPF_ST:
 921                case BPF_STX:
 922                        memvalid |= (1 << filter[pc].k);
 923                        break;
 924                case BPF_LD | BPF_MEM:
 925                case BPF_LDX | BPF_MEM:
 926                        if (!(memvalid & (1 << filter[pc].k))) {
 927                                ret = -EINVAL;
 928                                goto error;
 929                        }
 930                        break;
 931                case BPF_JMP | BPF_JA:
 932                        /* A jump must set masks on target */
 933                        masks[pc + 1 + filter[pc].k] &= memvalid;
 934                        memvalid = ~0;
 935                        break;
 936                case BPF_JMP | BPF_JEQ | BPF_K:
 937                case BPF_JMP | BPF_JEQ | BPF_X:
 938                case BPF_JMP | BPF_JGE | BPF_K:
 939                case BPF_JMP | BPF_JGE | BPF_X:
 940                case BPF_JMP | BPF_JGT | BPF_K:
 941                case BPF_JMP | BPF_JGT | BPF_X:
 942                case BPF_JMP | BPF_JSET | BPF_K:
 943                case BPF_JMP | BPF_JSET | BPF_X:
 944                        /* A jump must set masks on targets */
 945                        masks[pc + 1 + filter[pc].jt] &= memvalid;
 946                        masks[pc + 1 + filter[pc].jf] &= memvalid;
 947                        memvalid = ~0;
 948                        break;
 949                }
 950        }
 951error:
 952        kfree(masks);
 953        return ret;
 954}
 955
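/* Example of a program the check above rejects (illustrative): cell M[1] is
 * read before anything was ever stored to it, so the whole filter is
 * refused with -EINVAL:
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_ST, 0),			// mem[0] = A
 *		BPF_STMT(BPF_LD | BPF_MEM, 1),		// A = mem[1], never written
 *		BPF_STMT(BPF_RET | BPF_K, 0),
 *	};
 *
 * Changing the load to BPF_STMT(BPF_LD | BPF_MEM, 0) makes it acceptable,
 * since cell 0 has been written on every path reaching the load.
 */
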
 956static bool chk_code_allowed(u16 code_to_probe)
 957{
 958        static const bool codes[] = {
 959                /* 32 bit ALU operations */
 960                [BPF_ALU | BPF_ADD | BPF_K] = true,
 961                [BPF_ALU | BPF_ADD | BPF_X] = true,
 962                [BPF_ALU | BPF_SUB | BPF_K] = true,
 963                [BPF_ALU | BPF_SUB | BPF_X] = true,
 964                [BPF_ALU | BPF_MUL | BPF_K] = true,
 965                [BPF_ALU | BPF_MUL | BPF_X] = true,
 966                [BPF_ALU | BPF_DIV | BPF_K] = true,
 967                [BPF_ALU | BPF_DIV | BPF_X] = true,
 968                [BPF_ALU | BPF_MOD | BPF_K] = true,
 969                [BPF_ALU | BPF_MOD | BPF_X] = true,
 970                [BPF_ALU | BPF_AND | BPF_K] = true,
 971                [BPF_ALU | BPF_AND | BPF_X] = true,
 972                [BPF_ALU | BPF_OR | BPF_K] = true,
 973                [BPF_ALU | BPF_OR | BPF_X] = true,
 974                [BPF_ALU | BPF_XOR | BPF_K] = true,
 975                [BPF_ALU | BPF_XOR | BPF_X] = true,
 976                [BPF_ALU | BPF_LSH | BPF_K] = true,
 977                [BPF_ALU | BPF_LSH | BPF_X] = true,
 978                [BPF_ALU | BPF_RSH | BPF_K] = true,
 979                [BPF_ALU | BPF_RSH | BPF_X] = true,
 980                [BPF_ALU | BPF_NEG] = true,
 981                /* Load instructions */
 982                [BPF_LD | BPF_W | BPF_ABS] = true,
 983                [BPF_LD | BPF_H | BPF_ABS] = true,
 984                [BPF_LD | BPF_B | BPF_ABS] = true,
 985                [BPF_LD | BPF_W | BPF_LEN] = true,
 986                [BPF_LD | BPF_W | BPF_IND] = true,
 987                [BPF_LD | BPF_H | BPF_IND] = true,
 988                [BPF_LD | BPF_B | BPF_IND] = true,
 989                [BPF_LD | BPF_IMM] = true,
 990                [BPF_LD | BPF_MEM] = true,
 991                [BPF_LDX | BPF_W | BPF_LEN] = true,
 992                [BPF_LDX | BPF_B | BPF_MSH] = true,
 993                [BPF_LDX | BPF_IMM] = true,
 994                [BPF_LDX | BPF_MEM] = true,
 995                /* Store instructions */
 996                [BPF_ST] = true,
 997                [BPF_STX] = true,
 998                /* Misc instructions */
 999                [BPF_MISC | BPF_TAX] = true,
1000                [BPF_MISC | BPF_TXA] = true,
1001                /* Return instructions */
1002                [BPF_RET | BPF_K] = true,
1003                [BPF_RET | BPF_A] = true,
1004                /* Jump instructions */
1005                [BPF_JMP | BPF_JA] = true,
1006                [BPF_JMP | BPF_JEQ | BPF_K] = true,
1007                [BPF_JMP | BPF_JEQ | BPF_X] = true,
1008                [BPF_JMP | BPF_JGE | BPF_K] = true,
1009                [BPF_JMP | BPF_JGE | BPF_X] = true,
1010                [BPF_JMP | BPF_JGT | BPF_K] = true,
1011                [BPF_JMP | BPF_JGT | BPF_X] = true,
1012                [BPF_JMP | BPF_JSET | BPF_K] = true,
1013                [BPF_JMP | BPF_JSET | BPF_X] = true,
1014        };
1015
1016        if (code_to_probe >= ARRAY_SIZE(codes))
1017                return false;
1018
1019        return codes[code_to_probe];
1020}
1021
1022static bool bpf_check_basics_ok(const struct sock_filter *filter,
1023                                unsigned int flen)
1024{
1025        if (filter == NULL)
1026                return false;
1027        if (flen == 0 || flen > BPF_MAXINSNS)
1028                return false;
1029
1030        return true;
1031}
1032
1033/**
1034 *      bpf_check_classic - verify socket filter code
1035 *      @filter: filter to verify
1036 *      @flen: length of filter
1037 *
1038 * Check the user's filter code. If we let some ugly
 1039 * filter code slip through, kaboom! The filter must contain
1040 * no references or jumps that are out of range, no illegal
1041 * instructions, and must end with a RET instruction.
1042 *
1043 * All jumps are forward as they are not signed.
1044 *
1045 * Returns 0 if the rule set is legal or -EINVAL if not.
1046 */
1047static int bpf_check_classic(const struct sock_filter *filter,
1048                             unsigned int flen)
1049{
1050        bool anc_found;
1051        int pc;
1052
1053        /* Check the filter code now */
1054        for (pc = 0; pc < flen; pc++) {
1055                const struct sock_filter *ftest = &filter[pc];
1056
1057                /* May we actually operate on this code? */
1058                if (!chk_code_allowed(ftest->code))
1059                        return -EINVAL;
1060
1061                /* Some instructions need special checks */
1062                switch (ftest->code) {
1063                case BPF_ALU | BPF_DIV | BPF_K:
1064                case BPF_ALU | BPF_MOD | BPF_K:
1065                        /* Check for division by zero */
1066                        if (ftest->k == 0)
1067                                return -EINVAL;
1068                        break;
1069                case BPF_ALU | BPF_LSH | BPF_K:
1070                case BPF_ALU | BPF_RSH | BPF_K:
1071                        if (ftest->k >= 32)
1072                                return -EINVAL;
1073                        break;
1074                case BPF_LD | BPF_MEM:
1075                case BPF_LDX | BPF_MEM:
1076                case BPF_ST:
1077                case BPF_STX:
1078                        /* Check for invalid memory addresses */
1079                        if (ftest->k >= BPF_MEMWORDS)
1080                                return -EINVAL;
1081                        break;
1082                case BPF_JMP | BPF_JA:
1083                        /* Note, the large ftest->k might cause loops.
1084                         * Compare this with conditional jumps below,
1085                         * where offsets are limited. --ANK (981016)
1086                         */
1087                        if (ftest->k >= (unsigned int)(flen - pc - 1))
1088                                return -EINVAL;
1089                        break;
1090                case BPF_JMP | BPF_JEQ | BPF_K:
1091                case BPF_JMP | BPF_JEQ | BPF_X:
1092                case BPF_JMP | BPF_JGE | BPF_K:
1093                case BPF_JMP | BPF_JGE | BPF_X:
1094                case BPF_JMP | BPF_JGT | BPF_K:
1095                case BPF_JMP | BPF_JGT | BPF_X:
1096                case BPF_JMP | BPF_JSET | BPF_K:
1097                case BPF_JMP | BPF_JSET | BPF_X:
1098                        /* Both conditionals must be safe */
1099                        if (pc + ftest->jt + 1 >= flen ||
1100                            pc + ftest->jf + 1 >= flen)
1101                                return -EINVAL;
1102                        break;
1103                case BPF_LD | BPF_W | BPF_ABS:
1104                case BPF_LD | BPF_H | BPF_ABS:
1105                case BPF_LD | BPF_B | BPF_ABS:
1106                        anc_found = false;
1107                        if (bpf_anc_helper(ftest) & BPF_ANC)
1108                                anc_found = true;
1109                        /* Ancillary operation unknown or unsupported */
1110                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
1111                                return -EINVAL;
1112                }
1113        }
1114
1115        /* Last instruction must be a RET code */
1116        switch (filter[flen - 1].code) {
1117        case BPF_RET | BPF_K:
1118        case BPF_RET | BPF_A:
1119                return check_load_and_stores(filter, flen);
1120        }
1121
1122        return -EINVAL;
1123}
1124
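/* Illustrative example of a filter bpf_check_classic() rejects even though
 * every opcode in it is allowed: a constant division by zero fails the
 * per-instruction check above with -EINVAL:
 *
 *	struct sock_filter bad_div[] = {
 *		BPF_STMT(BPF_LD  | BPF_W | BPF_LEN, 0),		// A = skb->len
 *		BPF_STMT(BPF_ALU | BPF_DIV | BPF_K, 0),		// k == 0 -> -EINVAL
 *		BPF_STMT(BPF_RET | BPF_A, 0),
 *	};
 */
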
1125static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
1126                                      const struct sock_fprog *fprog)
1127{
1128        unsigned int fsize = bpf_classic_proglen(fprog);
1129        struct sock_fprog_kern *fkprog;
1130
1131        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1132        if (!fp->orig_prog)
1133                return -ENOMEM;
1134
1135        fkprog = fp->orig_prog;
1136        fkprog->len = fprog->len;
1137
1138        fkprog->filter = kmemdup(fp->insns, fsize,
1139                                 GFP_KERNEL | __GFP_NOWARN);
1140        if (!fkprog->filter) {
1141                kfree(fp->orig_prog);
1142                return -ENOMEM;
1143        }
1144
1145        return 0;
1146}
1147
1148static void bpf_release_orig_filter(struct bpf_prog *fp)
1149{
1150        struct sock_fprog_kern *fprog = fp->orig_prog;
1151
1152        if (fprog) {
1153                kfree(fprog->filter);
1154                kfree(fprog);
1155        }
1156}
1157
1158static void __bpf_prog_release(struct bpf_prog *prog)
1159{
1160        if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
1161                bpf_prog_put(prog);
1162        } else {
1163                bpf_release_orig_filter(prog);
1164                bpf_prog_free(prog);
1165        }
1166}
1167
1168static void __sk_filter_release(struct sk_filter *fp)
1169{
1170        __bpf_prog_release(fp->prog);
1171        kfree(fp);
1172}
1173
1174/**
1175 *      sk_filter_release_rcu - Release a socket filter by rcu_head
1176 *      @rcu: rcu_head that contains the sk_filter to free
1177 */
1178static void sk_filter_release_rcu(struct rcu_head *rcu)
1179{
1180        struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1181
1182        __sk_filter_release(fp);
1183}
1184
1185/**
1186 *      sk_filter_release - release a socket filter
1187 *      @fp: filter to remove
1188 *
1189 *      Remove a filter from a socket and release its resources.
1190 */
1191static void sk_filter_release(struct sk_filter *fp)
1192{
1193        if (refcount_dec_and_test(&fp->refcnt))
1194                call_rcu(&fp->rcu, sk_filter_release_rcu);
1195}
1196
1197void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1198{
1199        u32 filter_size = bpf_prog_size(fp->prog->len);
1200
1201        atomic_sub(filter_size, &sk->sk_omem_alloc);
1202        sk_filter_release(fp);
1203}
1204
 1205/* Try to charge the socket memory if there is space available;
 1206 * return true on success.
1207 */
1208static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1209{
1210        u32 filter_size = bpf_prog_size(fp->prog->len);
1211
1212        /* same check as in sock_kmalloc() */
1213        if (filter_size <= sysctl_optmem_max &&
1214            atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
1215                atomic_add(filter_size, &sk->sk_omem_alloc);
1216                return true;
1217        }
1218        return false;
1219}
1220
1221bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1222{
1223        if (!refcount_inc_not_zero(&fp->refcnt))
1224                return false;
1225
1226        if (!__sk_filter_charge(sk, fp)) {
1227                sk_filter_release(fp);
1228                return false;
1229        }
1230        return true;
1231}
1232
1233static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1234{
1235        struct sock_filter *old_prog;
1236        struct bpf_prog *old_fp;
1237        int err, new_len, old_len = fp->len;
1238        bool seen_ld_abs = false;
1239
 1240        /* We are free to overwrite insns et al. right here as they
 1241         * won't be used internally anymore at this point in time,
 1242         * after the migration to the internal BPF instruction
 1243         * representation.
1244         */
1245        BUILD_BUG_ON(sizeof(struct sock_filter) !=
1246                     sizeof(struct bpf_insn));
1247
1248        /* Conversion cannot happen on overlapping memory areas,
1249         * so we need to keep the user BPF around until the 2nd
1250         * pass. At this time, the user BPF is stored in fp->insns.
1251         */
1252        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1253                           GFP_KERNEL | __GFP_NOWARN);
1254        if (!old_prog) {
1255                err = -ENOMEM;
1256                goto out_err;
1257        }
1258
1259        /* 1st pass: calculate the new program length. */
1260        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1261                                 &seen_ld_abs);
1262        if (err)
1263                goto out_err_free;
1264
1265        /* Expand fp for appending the new filter representation. */
1266        old_fp = fp;
1267        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
1268        if (!fp) {
1269                /* The old_fp is still around in case we couldn't
1270                 * allocate new memory, so uncharge on that one.
1271                 */
1272                fp = old_fp;
1273                err = -ENOMEM;
1274                goto out_err_free;
1275        }
1276
1277        fp->len = new_len;
1278
1279        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1280        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1281                                 &seen_ld_abs);
1282        if (err)
1283                /* 2nd bpf_convert_filter() can fail only if it fails
 1284                 * to allocate memory; remapping must succeed. Note
1285                 * that at this time old_fp has already been released
1286                 * by krealloc().
1287                 */
1288                goto out_err_free;
1289
1290        fp = bpf_prog_select_runtime(fp, &err);
1291        if (err)
1292                goto out_err_free;
1293
1294        kfree(old_prog);
1295        return fp;
1296
1297out_err_free:
1298        kfree(old_prog);
1299out_err:
1300        __bpf_prog_release(fp);
1301        return ERR_PTR(err);
1302}
1303
1304static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1305                                           bpf_aux_classic_check_t trans)
1306{
1307        int err;
1308
1309        fp->bpf_func = NULL;
1310        fp->jited = 0;
1311
1312        err = bpf_check_classic(fp->insns, fp->len);
1313        if (err) {
1314                __bpf_prog_release(fp);
1315                return ERR_PTR(err);
1316        }
1317
1318        /* There might be additional checks and transformations
 1319         * needed on classic filters, e.g. in the case of seccomp.
1320         */
1321        if (trans) {
1322                err = trans(fp->insns, fp->len);
1323                if (err) {
1324                        __bpf_prog_release(fp);
1325                        return ERR_PTR(err);
1326                }
1327        }
1328
1329        /* Probe if we can JIT compile the filter and if so, do
1330         * the compilation of the filter.
1331         */
1332        bpf_jit_compile(fp);
1333
1334        /* JIT compiler couldn't process this filter, so do the
1335         * internal BPF translation for the optimized interpreter.
1336         */
1337        if (!fp->jited)
1338                fp = bpf_migrate_filter(fp);
1339
1340        return fp;
1341}
1342
1343/**
1344 *      bpf_prog_create - create an unattached filter
1345 *      @pfp: the unattached filter that is created
1346 *      @fprog: the filter program
1347 *
1348 * Create a filter independent of any socket. We first run some
1349 * sanity checks on it to make sure it does not explode on us later.
 1350 * If an error occurs or there is insufficient memory for the filter,
1351 * a negative errno code is returned. On success the return is zero.
1352 */
1353int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1354{
1355        unsigned int fsize = bpf_classic_proglen(fprog);
1356        struct bpf_prog *fp;
1357
 1358        /* Make sure the new filter is there and of a valid length. */
1359        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1360                return -EINVAL;
1361
1362        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1363        if (!fp)
1364                return -ENOMEM;
1365
1366        memcpy(fp->insns, fprog->filter, fsize);
1367
1368        fp->len = fprog->len;
1369        /* Since unattached filters are not copied back to user
1370         * space through sk_get_filter(), we do not need to hold
1371         * a copy here, and can spare us the work.
1372         */
1373        fp->orig_prog = NULL;
1374
1375        /* bpf_prepare_filter() already takes care of freeing
1376         * memory in case something goes wrong.
1377         */
1378        fp = bpf_prepare_filter(fp, NULL);
1379        if (IS_ERR(fp))
1380                return PTR_ERR(fp);
1381
1382        *pfp = fp;
1383        return 0;
1384}
1385EXPORT_SYMBOL_GPL(bpf_prog_create);
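
/* Usage sketch (illustrative only): an in-kernel user could build a classic
 * "accept all" filter and create an unattached program roughly like this:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *	int err = bpf_prog_create(&prog, &fprog);
 *
 * On success the caller owns the program and eventually frees it again with
 * bpf_prog_destroy().
 */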
1386
1387/**
1388 *      bpf_prog_create_from_user - create an unattached filter from user buffer
1389 *      @pfp: the unattached filter that is created
1390 *      @fprog: the filter program
1391 *      @trans: post-classic verifier transformation handler
1392 *      @save_orig: save classic BPF program
1393 *
1394 * This function effectively does the same as bpf_prog_create(), only
1395 * that it builds up its insns buffer from a user space provided buffer.
1396 * It also allows for passing a bpf_aux_classic_check_t handler.
1397 */
1398int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1399                              bpf_aux_classic_check_t trans, bool save_orig)
1400{
1401        unsigned int fsize = bpf_classic_proglen(fprog);
1402        struct bpf_prog *fp;
1403        int err;
1404
1405        /* Make sure the new filter is present and of a valid size. */
1406        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1407                return -EINVAL;
1408
1409        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1410        if (!fp)
1411                return -ENOMEM;
1412
1413        if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1414                __bpf_prog_free(fp);
1415                return -EFAULT;
1416        }
1417
1418        fp->len = fprog->len;
1419        fp->orig_prog = NULL;
1420
1421        if (save_orig) {
1422                err = bpf_prog_store_orig_filter(fp, fprog);
1423                if (err) {
1424                        __bpf_prog_free(fp);
1425                        return -ENOMEM;
1426                }
1427        }
1428
1429        /* bpf_prepare_filter() already takes care of freeing
1430         * memory in case something goes wrong.
1431         */
1432        fp = bpf_prepare_filter(fp, trans);
1433        if (IS_ERR(fp))
1434                return PTR_ERR(fp);
1435
1436        *pfp = fp;
1437        return 0;
1438}
1439EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
1440
1441void bpf_prog_destroy(struct bpf_prog *fp)
1442{
1443        __bpf_prog_release(fp);
1444}
1445EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1446
1447static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1448{
1449        struct sk_filter *fp, *old_fp;
1450
1451        fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1452        if (!fp)
1453                return -ENOMEM;
1454
1455        fp->prog = prog;
1456
1457        if (!__sk_filter_charge(sk, fp)) {
1458                kfree(fp);
1459                return -ENOMEM;
1460        }
1461        refcount_set(&fp->refcnt, 1);
1462
1463        old_fp = rcu_dereference_protected(sk->sk_filter,
1464                                           lockdep_sock_is_held(sk));
1465        rcu_assign_pointer(sk->sk_filter, fp);
1466
1467        if (old_fp)
1468                sk_filter_uncharge(sk, old_fp);
1469
1470        return 0;
1471}
1472
1473static
1474struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1475{
1476        unsigned int fsize = bpf_classic_proglen(fprog);
1477        struct bpf_prog *prog;
1478        int err;
1479
1480        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1481                return ERR_PTR(-EPERM);
1482
1483        /* Make sure the new filter is present and of a valid size. */
1484        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1485                return ERR_PTR(-EINVAL);
1486
1487        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1488        if (!prog)
1489                return ERR_PTR(-ENOMEM);
1490
1491        if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1492                __bpf_prog_free(prog);
1493                return ERR_PTR(-EFAULT);
1494        }
1495
1496        prog->len = fprog->len;
1497
1498        err = bpf_prog_store_orig_filter(prog, fprog);
1499        if (err) {
1500                __bpf_prog_free(prog);
1501                return ERR_PTR(-ENOMEM);
1502        }
1503
1504        /* bpf_prepare_filter() already takes care of freeing
1505         * memory in case something goes wrong.
1506         */
1507        return bpf_prepare_filter(prog, NULL);
1508}
1509
1510/**
1511 *      sk_attach_filter - attach a socket filter
1512 *      @fprog: the filter program
1513 *      @sk: the socket to use
1514 *
1515 * Attach the user's filter code. We first run some sanity checks on
1516 * it to make sure it does not explode on us later. If an error
1517 * occurs or there is insufficient memory for the filter, a negative
1518 * errno code is returned. On success the return is zero.
1519 */
1520int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1521{
1522        struct bpf_prog *prog = __get_filter(fprog, sk);
1523        int err;
1524
1525        if (IS_ERR(prog))
1526                return PTR_ERR(prog);
1527
1528        err = __sk_attach_prog(prog, sk);
1529        if (err < 0) {
1530                __bpf_prog_release(prog);
1531                return err;
1532        }
1533
1534        return 0;
1535}
1536EXPORT_SYMBOL_GPL(sk_attach_filter);
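
/* User space counterpart (sketch): classic BPF filters reach
 * sk_attach_filter() through the SO_ATTACH_FILTER socket option, e.g.:
 *
 *	struct sock_filter insns[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(insns) / sizeof(insns[0]),
 *		.filter	= insns,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * SO_DETACH_FILTER removes the filter again, and SO_LOCK_FILTER sets
 * SOCK_FILTER_LOCKED so that further attach/detach attempts fail with
 * -EPERM.
 */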
1537
1538int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1539{
1540        struct bpf_prog *prog = __get_filter(fprog, sk);
1541        int err;
1542
1543        if (IS_ERR(prog))
1544                return PTR_ERR(prog);
1545
1546        if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1547                err = -ENOMEM;
1548        else
1549                err = reuseport_attach_prog(sk, prog);
1550
1551        if (err)
1552                __bpf_prog_release(prog);
1553
1554        return err;
1555}
1556
1557static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1558{
1559        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1560                return ERR_PTR(-EPERM);
1561
1562        return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1563}
1564
1565int sk_attach_bpf(u32 ufd, struct sock *sk)
1566{
1567        struct bpf_prog *prog = __get_bpf(ufd, sk);
1568        int err;
1569
1570        if (IS_ERR(prog))
1571                return PTR_ERR(prog);
1572
1573        err = __sk_attach_prog(prog, sk);
1574        if (err < 0) {
1575                bpf_prog_put(prog);
1576                return err;
1577        }
1578
1579        return 0;
1580}
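
/* Sketch of the user space path into sk_attach_bpf(): the program is loaded
 * through the bpf(2) syscall (e.g. via libbpf) as BPF_PROG_TYPE_SOCKET_FILTER
 * and the returned fd is attached with:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd));
 *
 * sk_reuseport_attach_bpf() below is reached the same way via
 * SO_ATTACH_REUSEPORT_EBPF on a socket with SO_REUSEPORT enabled.
 */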
1581
1582int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1583{
1584        struct bpf_prog *prog;
1585        int err;
1586
1587        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1588                return -EPERM;
1589
1590        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1591        if (PTR_ERR(prog) == -EINVAL)
1592                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
1593        if (IS_ERR(prog))
1594                return PTR_ERR(prog);
1595
1596        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
1597                /* Like other non-BPF_PROG_TYPE_SOCKET_FILTER
1598                 * bpf progs (e.g. sockmap), it relies on the
1599                 * limits imposed by bpf_prog_load().
1600                 * Hence, sysctl_optmem_max is not checked.
1601                 */
1602                if ((sk->sk_type != SOCK_STREAM &&
1603                     sk->sk_type != SOCK_DGRAM) ||
1604                    (sk->sk_protocol != IPPROTO_UDP &&
1605                     sk->sk_protocol != IPPROTO_TCP) ||
1606                    (sk->sk_family != AF_INET &&
1607                     sk->sk_family != AF_INET6)) {
1608                        err = -ENOTSUPP;
1609                        goto err_prog_put;
1610                }
1611        } else {
1612                /* BPF_PROG_TYPE_SOCKET_FILTER */
1613                if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
1614                        err = -ENOMEM;
1615                        goto err_prog_put;
1616                }
1617        }
1618
1619        err = reuseport_attach_prog(sk, prog);
1620err_prog_put:
1621        if (err)
1622                bpf_prog_put(prog);
1623
1624        return err;
1625}
1626
1627void sk_reuseport_prog_free(struct bpf_prog *prog)
1628{
1629        if (!prog)
1630                return;
1631
1632        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
1633                bpf_prog_put(prog);
1634        else
1635                bpf_prog_destroy(prog);
1636}
1637
1638struct bpf_scratchpad {
1639        union {
1640                __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1641                u8     buff[MAX_BPF_STACK];
1642        };
1643};
1644
1645static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1646
1647static inline int __bpf_try_make_writable(struct sk_buff *skb,
1648                                          unsigned int write_len)
1649{
1650        return skb_ensure_writable(skb, write_len);
1651}
1652
1653static inline int bpf_try_make_writable(struct sk_buff *skb,
1654                                        unsigned int write_len)
1655{
1656        int err = __bpf_try_make_writable(skb, write_len);
1657
1658        bpf_compute_data_pointers(skb);
1659        return err;
1660}
1661
1662static int bpf_try_make_head_writable(struct sk_buff *skb)
1663{
1664        return bpf_try_make_writable(skb, skb_headlen(skb));
1665}
1666
1667static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1668{
1669        if (skb_at_tc_ingress(skb))
1670                skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1671}
1672
1673static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1674{
1675        if (skb_at_tc_ingress(skb))
1676                skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1677}
1678
1679BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1680           const void *, from, u32, len, u64, flags)
1681{
1682        void *ptr;
1683
1684        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1685                return -EINVAL;
1686        if (unlikely(offset > 0xffff))
1687                return -EFAULT;
1688        if (unlikely(bpf_try_make_writable(skb, offset + len)))
1689                return -EFAULT;
1690
1691        ptr = skb->data + offset;
1692        if (flags & BPF_F_RECOMPUTE_CSUM)
1693                __skb_postpull_rcsum(skb, ptr, len, offset);
1694
1695        memcpy(ptr, from, len);
1696
1697        if (flags & BPF_F_RECOMPUTE_CSUM)
1698                __skb_postpush_rcsum(skb, ptr, len, offset);
1699        if (flags & BPF_F_INVALIDATE_HASH)
1700                skb_clear_hash(skb);
1701
1702        return 0;
1703}
1704
1705static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1706        .func           = bpf_skb_store_bytes,
1707        .gpl_only       = false,
1708        .ret_type       = RET_INTEGER,
1709        .arg1_type      = ARG_PTR_TO_CTX,
1710        .arg2_type      = ARG_ANYTHING,
1711        .arg3_type      = ARG_PTR_TO_MEM,
1712        .arg4_type      = ARG_CONST_SIZE,
1713        .arg5_type      = ARG_ANYTHING,
1714};
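
/* Example from the BPF program side (sketch; assumes a plain IPv4 packet
 * behind an Ethernet header, offsets are the program's responsibility):
 *
 *	__u8 new_tos = 0x10;
 *
 *	bpf_skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, tos),
 *			    &new_tos, sizeof(new_tos), BPF_F_RECOMPUTE_CSUM);
 *
 * BPF_F_RECOMPUTE_CSUM keeps skb->csum valid for CHECKSUM_COMPLETE skbs;
 * the IPv4 header checksum itself still has to be fixed up, e.g. with
 * bpf_l3_csum_replace().
 */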
1715
1716BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1717           void *, to, u32, len)
1718{
1719        void *ptr;
1720
1721        if (unlikely(offset > 0xffff))
1722                goto err_clear;
1723
1724        ptr = skb_header_pointer(skb, offset, len, to);
1725        if (unlikely(!ptr))
1726                goto err_clear;
1727        if (ptr != to)
1728                memcpy(to, ptr, len);
1729
1730        return 0;
1731err_clear:
1732        memset(to, 0, len);
1733        return -EFAULT;
1734}
1735
1736static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1737        .func           = bpf_skb_load_bytes,
1738        .gpl_only       = false,
1739        .ret_type       = RET_INTEGER,
1740        .arg1_type      = ARG_PTR_TO_CTX,
1741        .arg2_type      = ARG_ANYTHING,
1742        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1743        .arg4_type      = ARG_CONST_SIZE,
1744};
1745
1746BPF_CALL_4(bpf_flow_dissector_load_bytes,
1747           const struct bpf_flow_dissector *, ctx, u32, offset,
1748           void *, to, u32, len)
1749{
1750        void *ptr;
1751
1752        if (unlikely(offset > 0xffff))
1753                goto err_clear;
1754
1755        if (unlikely(!ctx->skb))
1756                goto err_clear;
1757
1758        ptr = skb_header_pointer(ctx->skb, offset, len, to);
1759        if (unlikely(!ptr))
1760                goto err_clear;
1761        if (ptr != to)
1762                memcpy(to, ptr, len);
1763
1764        return 0;
1765err_clear:
1766        memset(to, 0, len);
1767        return -EFAULT;
1768}
1769
1770static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
1771        .func           = bpf_flow_dissector_load_bytes,
1772        .gpl_only       = false,
1773        .ret_type       = RET_INTEGER,
1774        .arg1_type      = ARG_PTR_TO_CTX,
1775        .arg2_type      = ARG_ANYTHING,
1776        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1777        .arg4_type      = ARG_CONST_SIZE,
1778};
1779
1780BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1781           u32, offset, void *, to, u32, len, u32, start_header)
1782{
1783        u8 *end = skb_tail_pointer(skb);
1784        u8 *start, *ptr;
1785
1786        if (unlikely(offset > 0xffff))
1787                goto err_clear;
1788
1789        switch (start_header) {
1790        case BPF_HDR_START_MAC:
1791                if (unlikely(!skb_mac_header_was_set(skb)))
1792                        goto err_clear;
1793                start = skb_mac_header(skb);
1794                break;
1795        case BPF_HDR_START_NET:
1796                start = skb_network_header(skb);
1797                break;
1798        default:
1799                goto err_clear;
1800        }
1801
1802        ptr = start + offset;
1803
1804        if (likely(ptr + len <= end)) {
1805                memcpy(to, ptr, len);
1806                return 0;
1807        }
1808
1809err_clear:
1810        memset(to, 0, len);
1811        return -EFAULT;
1812}
1813
1814static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1815        .func           = bpf_skb_load_bytes_relative,
1816        .gpl_only       = false,
1817        .ret_type       = RET_INTEGER,
1818        .arg1_type      = ARG_PTR_TO_CTX,
1819        .arg2_type      = ARG_ANYTHING,
1820        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
1821        .arg4_type      = ARG_CONST_SIZE,
1822        .arg5_type      = ARG_ANYTHING,
1823};
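
/* Example (sketch): loading the IPv4 header relative to the network header,
 * which also works on hooks where no MAC header is available:
 *
 *	struct iphdr iph;
 *
 *	if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
 *					BPF_HDR_START_NET))
 *		return 0;
 */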
1824
1825BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1826{
1827        /* The idea is the following: should the needed direct read/write
1828         * test fail at runtime, we can pull in more data and redo the
1829         * test, since implicitly we invalidate the previous checks here.
1830         *
1831         * Or, since we know how much we need to make readable/writable,
1832         * this can be done once at the start of the program for the
1833         * direct access case. By this we overcome the limitation that only
1834         * the current headroom is accessible.
1835         */
1836        return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1837}
1838
1839static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1840        .func           = bpf_skb_pull_data,
1841        .gpl_only       = false,
1842        .ret_type       = RET_INTEGER,
1843        .arg1_type      = ARG_PTR_TO_CTX,
1844        .arg2_type      = ARG_ANYTHING,
1845};
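
/* Typical usage pattern for the above (sketch, tc program): if a bounds
 * check against data_end fails, pull in the needed bytes once and re-derive
 * the packet pointers, which the helper just invalidated:
 *
 *	void *data = (void *)(long)skb->data;
 *	void *data_end = (void *)(long)skb->data_end;
 *
 *	if (data + ETH_HLEN + sizeof(struct iphdr) > data_end) {
 *		if (bpf_skb_pull_data(skb, ETH_HLEN + sizeof(struct iphdr)))
 *			return TC_ACT_SHOT;
 *		data = (void *)(long)skb->data;
 *		data_end = (void *)(long)skb->data_end;
 *	}
 */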
1846
1847BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
1848{
1849        return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
1850}
1851
1852static const struct bpf_func_proto bpf_sk_fullsock_proto = {
1853        .func           = bpf_sk_fullsock,
1854        .gpl_only       = false,
1855        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
1856        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
1857};
1858
1859static inline int sk_skb_try_make_writable(struct sk_buff *skb,
1860                                           unsigned int write_len)
1861{
1862        int err = __bpf_try_make_writable(skb, write_len);
1863
1864        bpf_compute_data_end_sk_skb(skb);
1865        return err;
1866}
1867
1868BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
1869{
1870        /* The idea is the following: should the needed direct read/write
1871         * test fail at runtime, we can pull in more data and redo the
1872         * test, since implicitly we invalidate the previous checks here.
1873         *
1874         * Or, since we know how much we need to make readable/writable,
1875         * this can be done once at the start of the program for the
1876         * direct access case. By this we overcome the limitation that only
1877         * the current headroom is accessible.
1878         */
1879        return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
1880}
1881
1882static const struct bpf_func_proto sk_skb_pull_data_proto = {
1883        .func           = sk_skb_pull_data,
1884        .gpl_only       = false,
1885        .ret_type       = RET_INTEGER,
1886        .arg1_type      = ARG_PTR_TO_CTX,
1887        .arg2_type      = ARG_ANYTHING,
1888};
1889
1890BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1891           u64, from, u64, to, u64, flags)
1892{
1893        __sum16 *ptr;
1894
1895        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1896                return -EINVAL;
1897        if (unlikely(offset > 0xffff || offset & 1))
1898                return -EFAULT;
1899        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1900                return -EFAULT;
1901
1902        ptr = (__sum16 *)(skb->data + offset);
1903        switch (flags & BPF_F_HDR_FIELD_MASK) {
1904        case 0:
1905                if (unlikely(from != 0))
1906                        return -EINVAL;
1907
1908                csum_replace_by_diff(ptr, to);
1909                break;
1910        case 2:
1911                csum_replace2(ptr, from, to);
1912                break;
1913        case 4:
1914                csum_replace4(ptr, from, to);
1915                break;
1916        default:
1917                return -EINVAL;
1918        }
1919
1920        return 0;
1921}
1922
1923static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1924        .func           = bpf_l3_csum_replace,
1925        .gpl_only       = false,
1926        .ret_type       = RET_INTEGER,
1927        .arg1_type      = ARG_PTR_TO_CTX,
1928        .arg2_type      = ARG_ANYTHING,
1929        .arg3_type      = ARG_ANYTHING,
1930        .arg4_type      = ARG_ANYTHING,
1931        .arg5_type      = ARG_ANYTHING,
1932};
1933
1934BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1935           u64, from, u64, to, u64, flags)
1936{
1937        bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1938        bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1939        bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1940        __sum16 *ptr;
1941
1942        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1943                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1944                return -EINVAL;
1945        if (unlikely(offset > 0xffff || offset & 1))
1946                return -EFAULT;
1947        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1948                return -EFAULT;
1949
1950        ptr = (__sum16 *)(skb->data + offset);
1951        if (is_mmzero && !do_mforce && !*ptr)
1952                return 0;
1953
1954        switch (flags & BPF_F_HDR_FIELD_MASK) {
1955        case 0:
1956                if (unlikely(from != 0))
1957                        return -EINVAL;
1958
1959                inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1960                break;
1961        case 2:
1962                inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1963                break;
1964        case 4:
1965                inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1966                break;
1967        default:
1968                return -EINVAL;
1969        }
1970
1971        if (is_mmzero && !*ptr)
1972                *ptr = CSUM_MANGLED_0;
1973        return 0;
1974}
1975
1976static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1977        .func           = bpf_l4_csum_replace,
1978        .gpl_only       = false,
1979        .ret_type       = RET_INTEGER,
1980        .arg1_type      = ARG_PTR_TO_CTX,
1981        .arg2_type      = ARG_ANYTHING,
1982        .arg3_type      = ARG_ANYTHING,
1983        .arg4_type      = ARG_ANYTHING,
1984        .arg5_type      = ARG_ANYTHING,
1985};
1986
1987BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
1988           __be32 *, to, u32, to_size, __wsum, seed)
1989{
1990        struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1991        u32 diff_size = from_size + to_size;
1992        int i, j = 0;
1993
1994        /* This is quite flexible, some examples:
1995         *
1996         * from_size == 0, to_size > 0,  seed := csum --> pushing data
1997         * from_size > 0,  to_size == 0, seed := csum --> pulling data
1998         * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
1999         *
2000         * Even for diffing, from_size and to_size don't need to be equal.
2001         */
2002        if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
2003                     diff_size > sizeof(sp->diff)))
2004                return -EINVAL;
2005
2006        for (i = 0; i < from_size / sizeof(__be32); i++, j++)
2007                sp->diff[j] = ~from[i];
2008        for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
2009                sp->diff[j] = to[i];
2010
2011        return csum_partial(sp->diff, diff_size, seed);
2012}
2013
2014static const struct bpf_func_proto bpf_csum_diff_proto = {
2015        .func           = bpf_csum_diff,
2016        .gpl_only       = false,
2017        .pkt_access     = true,
2018        .ret_type       = RET_INTEGER,
2019        .arg1_type      = ARG_PTR_TO_MEM_OR_NULL,
2020        .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
2021        .arg3_type      = ARG_PTR_TO_MEM_OR_NULL,
2022        .arg4_type      = ARG_CONST_SIZE_OR_ZERO,
2023        .arg5_type      = ARG_ANYTHING,
2024};
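
/* Example (sketch): rewriting an IPv4 source address and fixing up both
 * checksums with a single diff, using the size-0 "csum diff" mode of the
 * bpf_l{3,4}_csum_replace() helpers above; old_ip/new_ip, ip_off and
 * l4_csum_off stand for values the program computed itself:
 *
 *	__s64 diff = bpf_csum_diff(&old_ip, sizeof(old_ip),
 *				   &new_ip, sizeof(new_ip), 0);
 *
 *	bpf_skb_store_bytes(skb, ip_off + offsetof(struct iphdr, saddr),
 *			    &new_ip, sizeof(new_ip), 0);
 *	bpf_l3_csum_replace(skb, ip_off + offsetof(struct iphdr, check),
 *			    0, diff, 0);
 *	bpf_l4_csum_replace(skb, l4_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 */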
2025
2026BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
2027{
2028        /* The interface is to be used in combination with bpf_csum_diff()
2029         * for direct packet writes. csum rotation for alignment as well
2030         * as emulating csum_sub() can be done from the eBPF program.
2031         */
2032        if (skb->ip_summed == CHECKSUM_COMPLETE)
2033                return (skb->csum = csum_add(skb->csum, csum));
2034
2035        return -ENOTSUPP;
2036}
2037
2038static const struct bpf_func_proto bpf_csum_update_proto = {
2039        .func           = bpf_csum_update,
2040        .gpl_only       = false,
2041        .ret_type       = RET_INTEGER,
2042        .arg1_type      = ARG_PTR_TO_CTX,
2043        .arg2_type      = ARG_ANYTHING,
2044};
2045
2046BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
2047{
2048        /* The interface is to be used in combination with bpf_skb_adjust_room()
2049         * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
2050         * is passed as flags, for example.
2051         */
2052        switch (level) {
2053        case BPF_CSUM_LEVEL_INC:
2054                __skb_incr_checksum_unnecessary(skb);
2055                break;
2056        case BPF_CSUM_LEVEL_DEC:
2057                __skb_decr_checksum_unnecessary(skb);
2058                break;
2059        case BPF_CSUM_LEVEL_RESET:
2060                __skb_reset_checksum_unnecessary(skb);
2061                break;
2062        case BPF_CSUM_LEVEL_QUERY:
2063                return skb->ip_summed == CHECKSUM_UNNECESSARY ?
2064                       skb->csum_level : -EACCES;
2065        default:
2066                return -EINVAL;
2067        }
2068
2069        return 0;
2070}
2071
2072static const struct bpf_func_proto bpf_csum_level_proto = {
2073        .func           = bpf_csum_level,
2074        .gpl_only       = false,
2075        .ret_type       = RET_INTEGER,
2076        .arg1_type      = ARG_PTR_TO_CTX,
2077        .arg2_type      = ARG_ANYTHING,
2078};
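
/* Example (sketch): after decapsulating an outer header with
 * bpf_skb_adjust_room() and BPF_F_ADJ_ROOM_NO_CSUM_RESET, the program keeps
 * the checksum state consistent itself, e.g.:
 *
 *	bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC);
 */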
2079
2080static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
2081{
2082        return dev_forward_skb(dev, skb);
2083}
2084
2085static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
2086                                      struct sk_buff *skb)
2087{
2088        int ret = ____dev_forward_skb(dev, skb);
2089
2090        if (likely(!ret)) {
2091                skb->dev = dev;
2092                ret = netif_rx(skb);
2093        }
2094
2095        return ret;
2096}
2097
2098static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
2099{
2100        int ret;
2101
2102        if (dev_xmit_recursion()) {
2103                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2104                kfree_skb(skb);
2105                return -ENETDOWN;
2106        }
2107
2108        skb->dev = dev;
2109        skb->tstamp = 0;
2110
2111        dev_xmit_recursion_inc();
2112        ret = dev_queue_xmit(skb);
2113        dev_xmit_recursion_dec();
2114
2115        return ret;
2116}
2117
2118static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
2119                                 u32 flags)
2120{
2121        unsigned int mlen = skb_network_offset(skb);
2122
2123        if (mlen) {
2124                __skb_pull(skb, mlen);
2125
2126                /* At ingress, the mac header has already been pulled once.
2127                 * At egress, skb_postpull_rcsum() has to be done in case
2128                 * the skb originated from ingress (i.e. a forwarded skb)
2129                 * to ensure that the rcsum starts at the net header.
2130                 */
2131                if (!skb_at_tc_ingress(skb))
2132                        skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
2133        }
2134        skb_pop_mac_header(skb);
2135        skb_reset_mac_len(skb);
2136        return flags & BPF_F_INGRESS ?
2137               __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
2138}
2139
2140static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
2141                                 u32 flags)
2142{
2143        /* Verify that a link layer header is carried */
2144        if (unlikely(skb->mac_header >= skb->network_header)) {
2145                kfree_skb(skb);
2146                return -ERANGE;
2147        }
2148
2149        bpf_push_mac_rcsum(skb);
2150        return flags & BPF_F_INGRESS ?
2151               __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
2152}
2153
2154static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
2155                          u32 flags)
2156{
2157        if (dev_is_mac_header_xmit(dev))
2158                return __bpf_redirect_common(skb, dev, flags);
2159        else
2160                return __bpf_redirect_no_mac(skb, dev, flags);
2161}
2162
2163BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
2164{
2165        struct net_device *dev;
2166        struct sk_buff *clone;
2167        int ret;
2168
2169        if (unlikely(flags & ~(BPF_F_INGRESS)))
2170                return -EINVAL;
2171
2172        dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
2173        if (unlikely(!dev))
2174                return -EINVAL;
2175
2176        clone = skb_clone(skb, GFP_ATOMIC);
2177        if (unlikely(!clone))
2178                return -ENOMEM;
2179
2180        /* For direct writes, we need to keep the invariant that the skbs
2181         * we're dealing with are uncloned. Should uncloning of the
2182         * original skb fail here, we need to free the just generated
2183         * clone before bailing out.
2184         */
2185        ret = bpf_try_make_head_writable(skb);
2186        if (unlikely(ret)) {
2187                kfree_skb(clone);
2188                return -ENOMEM;
2189        }
2190
2191        return __bpf_redirect(clone, dev, flags);
2192}
2193
2194static const struct bpf_func_proto bpf_clone_redirect_proto = {
2195        .func           = bpf_clone_redirect,
2196        .gpl_only       = false,
2197        .ret_type       = RET_INTEGER,
2198        .arg1_type      = ARG_PTR_TO_CTX,
2199        .arg2_type      = ARG_ANYTHING,
2200        .arg3_type      = ARG_ANYTHING,
2201};
2202
2203DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
2204EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
2205
2206BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2207{
2208        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2209
2210        if (unlikely(flags & ~(BPF_F_INGRESS)))
2211                return TC_ACT_SHOT;
2212
2213        ri->flags = flags;
2214        ri->tgt_index = ifindex;
2215
2216        return TC_ACT_REDIRECT;
2217}
2218
2219int skb_do_redirect(struct sk_buff *skb)
2220{
2221        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2222        struct net_device *dev;
2223
2224        dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
2225        ri->tgt_index = 0;
2226        if (unlikely(!dev)) {
2227                kfree_skb(skb);
2228                return -EINVAL;
2229        }
2230
2231        return __bpf_redirect(skb, dev, ri->flags);
2232}
2233
2234static const struct bpf_func_proto bpf_redirect_proto = {
2235        .func           = bpf_redirect,
2236        .gpl_only       = false,
2237        .ret_type       = RET_INTEGER,
2238        .arg1_type      = ARG_ANYTHING,
2239        .arg2_type      = ARG_ANYTHING,
2240};
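
/* Example (sketch): a tc classifier hands the packet to another device by
 * returning the helper's result; TC_ACT_REDIRECT is what makes the core
 * invoke skb_do_redirect() above (target_ifindex being the program's chosen
 * destination):
 *
 *	return bpf_redirect(target_ifindex, 0);
 *
 * Passing BPF_F_INGRESS instead of 0 injects the packet into the target
 * device's ingress path rather than its egress path.
 */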
2241
2242BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
2243{
2244        msg->apply_bytes = bytes;
2245        return 0;
2246}
2247
2248static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
2249        .func           = bpf_msg_apply_bytes,
2250        .gpl_only       = false,
2251        .ret_type       = RET_INTEGER,
2252        .arg1_type      = ARG_PTR_TO_CTX,
2253        .arg2_type      = ARG_ANYTHING,
2254};
2255
2256BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
2257{
2258        msg->cork_bytes = bytes;
2259        return 0;
2260}
2261
2262static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
2263        .func           = bpf_msg_cork_bytes,
2264        .gpl_only       = false,
2265        .ret_type       = RET_INTEGER,
2266        .arg1_type      = ARG_PTR_TO_CTX,
2267        .arg2_type      = ARG_ANYTHING,
2268};
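
/* Example (sketch) from a BPF_PROG_TYPE_SK_MSG program: apply the verdict
 * only to the first 1000 bytes, and ask the infrastructure to accumulate at
 * least 512 bytes before invoking the program again:
 *
 *	bpf_msg_apply_bytes(msg, 1000);
 *	bpf_msg_cork_bytes(msg, 512);
 *	return SK_PASS;
 */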
2269
2270BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
2271           u32, end, u64, flags)
2272{
2273        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
2274        u32 first_sge, last_sge, i, shift, bytes_sg_total;
2275        struct scatterlist *sge;
2276        u8 *raw, *to, *from;
2277        struct page *page;
2278
2279        if (unlikely(flags || end <= start))
2280                return -EINVAL;
2281
2282        /* First find the starting scatterlist element */
2283        i = msg->sg.start;
2284        do {
2285                offset += len;
2286                len = sk_msg_elem(msg, i)->length;
2287                if (start < offset + len)
2288                        break;
2289                sk_msg_iter_var_next(i);
2290        } while (i != msg->sg.end);
2291
2292        if (unlikely(start >= offset + len))
2293                return -EINVAL;
2294
2295        first_sge = i;
2296        /* The start may point into the sg element so we need to also
2297         * account for the headroom.
2298         */
2299        bytes_sg_total = start - offset + bytes;
2300        if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len)
2301                goto out;
2302
2303        /* At this point we need to linearize multiple scatterlist
2304         * elements or a single shared page. Either way we need to
2305         * copy into a linear buffer exclusively owned by BPF. Then
2306         * place the buffer in the scatterlist and fixup the original
2307         * entries by removing the entries now in the linear buffer
2308         * and shifting the remaining entries. For now we do not try
2309         * to copy partial entries to avoid complexity of running out
2310         * of sg_entry slots. The downside is reading a single byte
2311         * will copy the entire sg entry.
2312         */
2313        do {
2314                copy += sk_msg_elem(msg, i)->length;
2315                sk_msg_iter_var_next(i);
2316                if (bytes_sg_total <= copy)
2317                        break;
2318        } while (i != msg->sg.end);
2319        last_sge = i;
2320
2321        if (unlikely(bytes_sg_total > copy))
2322                return -EINVAL;
2323
2324        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2325                           get_order(copy));
2326        if (unlikely(!page))
2327                return -ENOMEM;
2328
2329        raw = page_address(page);
2330        i = first_sge;
2331        do {
2332                sge = sk_msg_elem(msg, i);
2333                from = sg_virt(sge);
2334                len = sge->length;
2335                to = raw + poffset;
2336
2337                memcpy(to, from, len);
2338                poffset += len;
2339                sge->length = 0;
2340                put_page(sg_page(sge));
2341
2342                sk_msg_iter_var_next(i);
2343        } while (i != last_sge);
2344
2345        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
2346
2347        /* To repair sg ring we need to shift entries. If we only
2348         * had a single entry though we can just replace it and
2349         * be done. Otherwise walk the ring and shift the entries.
2350         */
2351        WARN_ON_ONCE(last_sge == first_sge);
2352        shift = last_sge > first_sge ?
2353                last_sge - first_sge - 1 :
2354                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
2355        if (!shift)
2356                goto out;
2357
2358        i = first_sge;
2359        sk_msg_iter_var_next(i);
2360        do {
2361                u32 move_from;
2362
2363                if (i + shift >= NR_MSG_FRAG_IDS)
2364                        move_from = i + shift - NR_MSG_FRAG_IDS;
2365                else
2366                        move_from = i + shift;
2367                if (move_from == msg->sg.end)
2368                        break;
2369
2370                msg->sg.data[i] = msg->sg.data[move_from];
2371                msg->sg.data[move_from].length = 0;
2372                msg->sg.data[move_from].page_link = 0;
2373                msg->sg.data[move_from].offset = 0;
2374                sk_msg_iter_var_next(i);
2375        } while (1);
2376
2377        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
2378                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
2379                      msg->sg.end - shift;
2380out:
2381        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
2382        msg->data_end = msg->data + bytes;
2383        return 0;
2384}
2385
2386static const struct bpf_func_proto bpf_msg_pull_data_proto = {
2387        .func           = bpf_msg_pull_data,
2388        .gpl_only       = false,
2389        .ret_type       = RET_INTEGER,
2390        .arg1_type      = ARG_PTR_TO_CTX,
2391        .arg2_type      = ARG_ANYTHING,
2392        .arg3_type      = ARG_ANYTHING,
2393        .arg4_type      = ARG_ANYTHING,
2394};
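
/* Example (sketch): the msg->data/data_end window of an sk_msg program may
 * initially cover only the first scatterlist element, so a program parsing
 * e.g. a 12 byte application header linearizes it first:
 *
 *	if (msg->data + 12 > msg->data_end) {
 *		if (bpf_msg_pull_data(msg, 0, 12, 0))
 *			return SK_DROP;
 *	}
 *
 * The helper recomputes msg->data/data_end, so previously derived pointers
 * must not be reused.
 */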
2395
2396BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
2397           u32, len, u64, flags)
2398{
2399        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
2400        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
2401        u8 *raw, *to, *from;
2402        struct page *page;
2403
2404        if (unlikely(flags))
2405                return -EINVAL;
2406
2407        /* First find the starting scatterlist element */
2408        i = msg->sg.start;
2409        do {
2410                offset += l;
2411                l = sk_msg_elem(msg, i)->length;
2412
2413                if (start < offset + l)
2414                        break;
2415                sk_msg_iter_var_next(i);
2416        } while (i != msg->sg.end);
2417
2418        if (start >= offset + l)
2419                return -EINVAL;
2420
2421        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2422
2423        /* If no space is available we fall back to a copy: we need at
2424         * least one scatterlist elem available to push data into
2425         * when start aligns with the beginning of an element, or two
2426         * when it falls inside an element. We handle the start equals
2427         * offset case specially because it's the common case for
2428         * inserting a header.
2429         */
2430        if (!space || (space == 1 && start != offset))
2431                copy = msg->sg.data[i].length;
2432
2433        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2434                           get_order(copy + len));
2435        if (unlikely(!page))
2436                return -ENOMEM;
2437
2438        if (copy) {
2439                int front, back;
2440
2441                raw = page_address(page);
2442
2443                psge = sk_msg_elem(msg, i);
2444                front = start - offset;
2445                back = psge->length - front;
2446                from = sg_virt(psge);
2447
2448                if (front)
2449                        memcpy(raw, from, front);
2450
2451                if (back) {
2452                        from += front;
2453                        to = raw + front + len;
2454
2455                        memcpy(to, from, back);
2456                }
2457
2458                put_page(sg_page(psge));
2459        } else if (start - offset) {
2460                psge = sk_msg_elem(msg, i);
2461                rsge = sk_msg_elem_cpy(msg, i);
2462
2463                psge->length = start - offset;
2464                rsge.length -= psge->length;
2465                rsge.offset += start;
2466
2467                sk_msg_iter_var_next(i);
2468                sg_unmark_end(psge);
2469                sg_unmark_end(&rsge);
2470                sk_msg_iter_next(msg, end);
2471        }
2472
2473        /* Slot(s) to place newly allocated data */
2474        new = i;
2475
2476        /* Shift one or two slots as needed */
2477        if (!copy) {
2478                sge = sk_msg_elem_cpy(msg, i);
2479
2480                sk_msg_iter_var_next(i);
2481                sg_unmark_end(&sge);
2482                sk_msg_iter_next(msg, end);
2483
2484                nsge = sk_msg_elem_cpy(msg, i);
2485                if (rsge.length) {
2486                        sk_msg_iter_var_next(i);
2487                        nnsge = sk_msg_elem_cpy(msg, i);
2488                }
2489
2490                while (i != msg->sg.end) {
2491                        msg->sg.data[i] = sge;
2492                        sge = nsge;
2493                        sk_msg_iter_var_next(i);
2494                        if (rsge.length) {
2495                                nsge = nnsge;
2496                                nnsge = sk_msg_elem_cpy(msg, i);
2497                        } else {
2498                                nsge = sk_msg_elem_cpy(msg, i);
2499                        }
2500                }
2501        }
2502
2503        /* Place newly allocated data buffer */
2504        sk_mem_charge(msg->sk, len);
2505        msg->sg.size += len;
2506        __clear_bit(new, &msg->sg.copy);
2507        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
2508        if (rsge.length) {
2509                get_page(sg_page(&rsge));
2510                sk_msg_iter_var_next(new);
2511                msg->sg.data[new] = rsge;
2512        }
2513
2514        sk_msg_compute_data_pointers(msg);
2515        return 0;
2516}
2517
2518static const struct bpf_func_proto bpf_msg_push_data_proto = {
2519        .func           = bpf_msg_push_data,
2520        .gpl_only       = false,
2521        .ret_type       = RET_INTEGER,
2522        .arg1_type      = ARG_PTR_TO_CTX,
2523        .arg2_type      = ARG_ANYTHING,
2524        .arg3_type      = ARG_ANYTHING,
2525        .arg4_type      = ARG_ANYTHING,
2526};
2527
2528static void sk_msg_shift_left(struct sk_msg *msg, int i)
2529{
2530        int prev;
2531
2532        do {
2533                prev = i;
2534                sk_msg_iter_var_next(i);
2535                msg->sg.data[prev] = msg->sg.data[i];
2536        } while (i != msg->sg.end);
2537
2538        sk_msg_iter_prev(msg, end);
2539}
2540
2541static void sk_msg_shift_right(struct sk_msg *msg, int i)
2542{
2543        struct scatterlist tmp, sge;
2544
2545        sk_msg_iter_next(msg, end);
2546        sge = sk_msg_elem_cpy(msg, i);
2547        sk_msg_iter_var_next(i);
2548        tmp = sk_msg_elem_cpy(msg, i);
2549
2550        while (i != msg->sg.end) {
2551                msg->sg.data[i] = sge;
2552                sk_msg_iter_var_next(i);
2553                sge = tmp;
2554                tmp = sk_msg_elem_cpy(msg, i);
2555        }
2556}
2557
2558BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
2559           u32, len, u64, flags)
2560{
2561        u32 i = 0, l = 0, space, offset = 0;
2562        u64 last = start + len;
2563        int pop;
2564
2565        if (unlikely(flags))
2566                return -EINVAL;
2567
2568        /* First find the starting scatterlist element */
2569        i = msg->sg.start;
2570        do {
2571                offset += l;
2572                l = sk_msg_elem(msg, i)->length;
2573
2574                if (start < offset + l)
2575                        break;
2576                sk_msg_iter_var_next(i);
2577        } while (i != msg->sg.end);
2578
2579        /* Bounds checks: start and pop must be inside message */
2580        if (start >= offset + l || last >= msg->sg.size)
2581                return -EINVAL;
2582
2583        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2584
2585        pop = len;
2586        /* --------------| offset
2587         * -| start      |-------- len -------|
2588         *
2589         *  |----- a ----|-------- pop -------|----- b ----|
2590         *  |______________________________________________| length
2591         *
2592         *
2593         * a:   region at front of scatter element to save
2594         * b:   region at back of scatter element to save when length > a + pop
2595         * pop: region to pop from element, same as the input 'pop'; it will
2596         *      be decremented below per iteration.
2597         *
2598         * Two top-level cases to handle when start != offset: first, b is
2599         * non-zero, and second, b is zero, corresponding to a pop that spans
2600         * more than one element.
2601         *
2602         * Then, if b is non-zero AND there is no space, allocate space and
2603         * compact the a and b regions into a page. If there is space, shift
2604         * the ring to the right, freeing the next element in the ring to
2605         * place b, leaving a untouched except to reduce its length.
2606         */
2607        if (start != offset) {
2608                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
2609                int a = start;
2610                int b = sge->length - pop - a;
2611
2612                sk_msg_iter_var_next(i);
2613
2614                if (pop < sge->length - a) {
2615                        if (space) {
2616                                sge->length = a;
2617                                sk_msg_shift_right(msg, i);
2618                                nsge = sk_msg_elem(msg, i);
2619                                get_page(sg_page(sge));
2620                                sg_set_page(nsge,
2621                                            sg_page(sge),
2622                                            b, sge->offset + pop + a);
2623                        } else {
2624                                struct page *page, *orig;
2625                                u8 *to, *from;
2626
2627                                page = alloc_pages(__GFP_NOWARN |
2628                                                   __GFP_COMP   | GFP_ATOMIC,
2629                                                   get_order(a + b));
2630                                if (unlikely(!page))
2631                                        return -ENOMEM;
2632
2633                                sge->length = a;
2634                                orig = sg_page(sge);
2635                                from = sg_virt(sge);
2636                                to = page_address(page);
2637                                memcpy(to, from, a);
2638                                memcpy(to + a, from + a + pop, b);
2639                                sg_set_page(sge, page, a + b, 0);
2640                                put_page(orig);
2641                        }
2642                        pop = 0;
2643                } else if (pop >= sge->length - a) {
2644                        pop -= (sge->length - a);
2645                        sge->length = a;
2646                }
2647        }
2648
2649        /* From above the current layout _must_ be as follows,
2650         *
2651         * -| offset
2652         * -| start
2653         *
2654         *  |---- pop ---|---------------- b ------------|
2655         *  |____________________________________________| length
2656         *
2657         * Offset and start of the current msg elem are equal because in the
2658         * previous case we handled offset != start and either consumed the
2659         * entire element and advanced to the next element OR pop == 0.
2660         *
2661         * Two cases to handle here: first, pop is less than the length,
2662         * leaving some remainder b above; simply adjust the element's layout
2663         * in this case. Or pop >= the length of the element so that b = 0; in
2664         * this case advance to the next element, decrementing pop.
2665         */
2666        while (pop) {
2667                struct scatterlist *sge = sk_msg_elem(msg, i);
2668
2669                if (pop < sge->length) {
2670                        sge->length -= pop;
2671                        sge->offset += pop;
2672                        pop = 0;
2673                } else {
2674                        pop -= sge->length;
2675                        sk_msg_shift_left(msg, i);
2676                }
2677                sk_msg_iter_var_next(i);
2678        }
2679
2680        sk_mem_uncharge(msg->sk, len - pop);
2681        msg->sg.size -= (len - pop);
2682        sk_msg_compute_data_pointers(msg);
2683        return 0;
2684}
2685
2686static const struct bpf_func_proto bpf_msg_pop_data_proto = {
2687        .func           = bpf_msg_pop_data,
2688        .gpl_only       = false,
2689        .ret_type       = RET_INTEGER,
2690        .arg1_type      = ARG_PTR_TO_CTX,
2691        .arg2_type      = ARG_ANYTHING,
2692        .arg3_type      = ARG_ANYTHING,
2693        .arg4_type      = ARG_ANYTHING,
2694};
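
/* Example (sketch): bpf_msg_push_data() and bpf_msg_pop_data() are
 * symmetric; a program that reserves room for a hypothetical 8 byte framing
 * header at the start of the message and later strips it again would do:
 *
 *	bpf_msg_push_data(msg, 0, 8, 0);
 *	...
 *	bpf_msg_pop_data(msg, 0, 8, 0);
 *
 * Both helpers recompute msg->data/data_end via
 * sk_msg_compute_data_pointers().
 */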
2695
2696#ifdef CONFIG_CGROUP_NET_CLASSID
2697BPF_CALL_0(bpf_get_cgroup_classid_curr)
2698{
2699        return __task_get_classid(current);
2700}
2701
2702static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
2703        .func           = bpf_get_cgroup_classid_curr,
2704        .gpl_only       = false,
2705        .ret_type       = RET_INTEGER,
2706};
2707#endif
2708
2709BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
2710{
2711        return task_get_classid(skb);
2712}
2713
2714static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
2715        .func           = bpf_get_cgroup_classid,
2716        .gpl_only       = false,
2717        .ret_type       = RET_INTEGER,
2718        .arg1_type      = ARG_PTR_TO_CTX,
2719};
2720
2721BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
2722{
2723        return dst_tclassid(skb);
2724}
2725
2726static const struct bpf_func_proto bpf_get_route_realm_proto = {
2727        .func           = bpf_get_route_realm,
2728        .gpl_only       = false,
2729        .ret_type       = RET_INTEGER,
2730        .arg1_type      = ARG_PTR_TO_CTX,
2731};
2732
2733BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
2734{
2735        /* If skb_clear_hash() was called due to mangling, we can
2736         * trigger SW recalculation here. Later access to hash
2737         * can then use the inline skb->hash via context directly
2738         * instead of calling this helper again.
2739         */
2740        return skb_get_hash(skb);
2741}
2742
2743static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
2744        .func           = bpf_get_hash_recalc,
2745        .gpl_only       = false,
2746        .ret_type       = RET_INTEGER,
2747        .arg1_type      = ARG_PTR_TO_CTX,
2748};
2749
2750BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
2751{
2752        /* After all direct packet writes, this can be used once to
2753         * trigger a lazy recalc on the next skb_get_hash() invocation.
2754         */
2755        skb_clear_hash(skb);
2756        return 0;
2757}
2758
2759static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
2760        .func           = bpf_set_hash_invalid,
2761        .gpl_only       = false,
2762        .ret_type       = RET_INTEGER,
2763        .arg1_type      = ARG_PTR_TO_CTX,
2764};
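
/* Example (sketch): after mangling flow-relevant fields, drop the stale hash
 * and optionally force an immediate software recalculation:
 *
 *	bpf_set_hash_invalid(skb);
 *	__u32 hash = bpf_get_hash_recalc(skb);
 *
 * Later reads can then use the hash field from the program context directly,
 * as noted in the bpf_get_hash_recalc() comment above.
 */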
2765
2766BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
2767{
2768        /* Set user specified hash as L4(+), so that it gets returned
2769         * on skb_get_hash() call unless BPF prog later on triggers a
2770         * skb_clear_hash().
2771         */
2772        __skb_set_sw_hash(skb, hash, true);
2773        return 0;
2774}
2775
2776static const struct bpf_func_proto bpf_set_hash_proto = {
2777        .func           = bpf_set_hash,
2778        .gpl_only       = false,
2779        .ret_type       = RET_INTEGER,
2780        .arg1_type      = ARG_PTR_TO_CTX,
2781        .arg2_type      = ARG_ANYTHING,
2782};
2783
2784BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
2785           u16, vlan_tci)
2786{
2787        int ret;
2788
2789        if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
2790                     vlan_proto != htons(ETH_P_8021AD)))
2791                vlan_proto = htons(ETH_P_8021Q);
2792
2793        bpf_push_mac_rcsum(skb);
2794        ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
2795        bpf_pull_mac_rcsum(skb);
2796
2797        bpf_compute_data_pointers(skb);
2798        return ret;
2799}
2800
2801static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
2802        .func           = bpf_skb_vlan_push,
2803        .gpl_only       = false,
2804        .ret_type       = RET_INTEGER,
2805        .arg1_type      = ARG_PTR_TO_CTX,
2806        .arg2_type      = ARG_ANYTHING,
2807        .arg3_type      = ARG_ANYTHING,
2808};
2809
2810BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
2811{
2812        int ret;
2813
2814        bpf_push_mac_rcsum(skb);
2815        ret = skb_vlan_pop(skb);
2816        bpf_pull_mac_rcsum(skb);
2817
2818        bpf_compute_data_pointers(skb);
2819        return ret;
2820}
2821
2822static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
2823        .func           = bpf_skb_vlan_pop,
2824        .gpl_only       = false,
2825        .ret_type       = RET_INTEGER,
2826        .arg1_type      = ARG_PTR_TO_CTX,
2827};
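
/* Example (sketch): tagging traffic with VLAN 100 on one hook and untagging
 * it on another:
 *
 *	bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 100);
 *	...
 *	bpf_skb_vlan_pop(skb);
 *
 * Both helpers call bpf_compute_data_pointers() above, so any packet
 * pointers derived earlier in the program must be re-derived afterwards.
 */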
2828
2829static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
2830{
2831        /* Caller already did skb_cow() with len as headroom,
2832         * so no need to do it here.
2833         */
2834        skb_push(skb, len);
2835        memmove(skb->data, skb->data + len, off);
2836        memset(skb->data + off, 0, len);
2837
2838        /* No skb_postpush_rcsum(skb, skb->data + off, len)
2839         * needed here as it does not change the skb->csum
2840         * result for checksum complete when summing over
2841         * zeroed blocks.
2842         */
2843        return 0;
2844}
2845
2846static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
2847{
2848        /* skb_ensure_writable() is not needed here, as we're
2849         * already working on an uncloned skb.
2850         */
2851        if (unlikely(!pskb_may_pull(skb, off + len)))
2852                return -ENOMEM;
2853
2854        skb_postpull_rcsum(skb, skb->data + off, len);
2855        memmove(skb->data + len, skb->data, off);
2856        __skb_pull(skb, len);
2857
2858        return 0;
2859}
2860
2861static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
2862{
2863        bool trans_same = skb->transport_header == skb->network_header;
2864        int ret;
2865
2866        /* There's no need for __skb_push()/__skb_pull() pair to
2867         * get to the start of the mac header as we're guaranteed
2868         * to always start from here under eBPF.
2869         */
2870        ret = bpf_skb_generic_push(skb, off, len);
2871        if (likely(!ret)) {
2872                skb->mac_header -= len;
2873                skb->network_header -= len;
2874                if (trans_same)
2875                        skb->transport_header = skb->network_header;
2876        }
2877
2878        return ret;
2879}
2880
2881static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
2882{
2883        bool trans_same = skb->transport_header == skb->network_header;
2884        int ret;
2885
2886        /* Same here, __skb_push()/__skb_pull() pair not needed. */
2887        ret = bpf_skb_generic_pop(skb, off, len);
2888        if (likely(!ret)) {
2889                skb->mac_header += len;
2890                skb->network_header += len;
2891                if (trans_same)
2892                        skb->transport_header = skb->network_header;
2893        }
2894
2895        return ret;
2896}
2897
2898static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
2899{
2900        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
2901        u32 off = skb_mac_header_len(skb);
2902        int ret;
2903
2904        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
2905                return -ENOTSUPP;
2906
2907        ret = skb_cow(skb, len_diff);
2908        if (unlikely(ret < 0))
2909                return ret;
2910
2911        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
2912        if (unlikely(ret < 0))
2913                return ret;
2914
2915        if (skb_is_gso(skb)) {
2916                struct skb_shared_info *shinfo = skb_shinfo(skb);
2917
2918                /* SKB_GSO_TCPV4 needs to be changed into
2919                 * SKB_GSO_TCPV6.
2920                 */
2921                if (shinfo->gso_type & SKB_GSO_TCPV4) {
2922                        shinfo->gso_type &= ~SKB_GSO_TCPV4;
2923                        shinfo->gso_type |=  SKB_GSO_TCPV6;
2924                }
2925
2926                /* Due to IPv6 header, MSS needs to be downgraded. */
2927                skb_decrease_gso_size(shinfo, len_diff);
2928                /* Header must be checked, and gso_segs recomputed. */
2929                shinfo->gso_type |= SKB_GSO_DODGY;
2930                shinfo->gso_segs = 0;
2931        }
2932
2933        skb->protocol = htons(ETH_P_IPV6);
2934        skb_clear_hash(skb);
2935
2936        return 0;
2937}
2938
2939static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
2940{
2941        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
2942        u32 off = skb_mac_header_len(skb);
2943        int ret;
2944
2945        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
2946                return -ENOTSUPP;
2947
2948        ret = skb_unclone(skb, GFP_ATOMIC);
2949        if (unlikely(ret < 0))
2950                return ret;
2951
2952        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
2953        if (unlikely(ret < 0))
2954                return ret;
2955
2956        if (skb_is_gso(skb)) {
2957                struct skb_shared_info *shinfo = skb_shinfo(skb);
2958
2959                /* SKB_GSO_TCPV6 needs to be changed into
2960                 * SKB_GSO_TCPV4.
2961                 */
2962                if (shinfo->gso_type & SKB_GSO_TCPV6) {
2963                        shinfo->gso_type &= ~SKB_GSO_TCPV6;
2964                        shinfo->gso_type |=  SKB_GSO_TCPV4;
2965                }
2966
2967                /* Due to IPv4 header, MSS can be upgraded. */
2968                skb_increase_gso_size(shinfo, len_diff);
2969                /* Header must be checked, and gso_segs recomputed. */
2970                shinfo->gso_type |= SKB_GSO_DODGY;
2971                shinfo->gso_segs = 0;
2972        }
2973
2974        skb->protocol = htons(ETH_P_IP);
2975        skb_clear_hash(skb);
2976
2977        return 0;
2978}
2979
2980static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
2981{
2982        __be16 from_proto = skb->protocol;
2983
2984        if (from_proto == htons(ETH_P_IP) &&
2985              to_proto == htons(ETH_P_IPV6))
2986                return bpf_skb_proto_4_to_6(skb);
2987
2988        if (from_proto == htons(ETH_P_IPV6) &&
2989              to_proto == htons(ETH_P_IP))
2990                return bpf_skb_proto_6_to_4(skb);
2991
2992        return -ENOTSUPP;
2993}
2994
2995BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
2996           u64, flags)
2997{
2998        int ret;
2999
3000        if (unlikely(flags))
3001                return -EINVAL;
3002
3003        /* The general idea is that this helper does the basic groundwork
3004         * needed for changing the protocol, and the eBPF program fills in
3005         * the rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
3006         * and other helpers, rather than passing a raw buffer here.
3007         *
3008         * The rationale is to keep this minimal and avoid the need to
3009         * deal with raw packet data. E.g. even if we passed buffers here,
3010         * the program would still need to call the bpf_lX_csum_replace()
3011         * helpers anyway. Plus, this way we also keep a separation of
3012         * concerns, since e.g. bpf_skb_store_bytes() should only take
3013         * care of stores.
3014         *
3015         * Currently, additional options and extension header space are
3016         * not supported, but the flags argument is reserved so we can
3017         * adapt that later. For offloads, we mark the packet as dodgy,
3018         * so that the headers need to be verified first.
3019         */
3020        ret = bpf_skb_proto_xlat(skb, proto);
3021        bpf_compute_data_pointers(skb);
3022        return ret;
3023}
3024
3025static const struct bpf_func_proto bpf_skb_change_proto_proto = {
3026        .func           = bpf_skb_change_proto,
3027        .gpl_only       = false,
3028        .ret_type       = RET_INTEGER,
3029        .arg1_type      = ARG_PTR_TO_CTX,
3030        .arg2_type      = ARG_ANYTHING,
3031        .arg3_type      = ARG_ANYTHING,
3032};
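
/* A minimal illustrative sketch (not part of this file or the kernel build)
 * of the usage pattern described above: a tc BPF program, assuming the
 * usual uapi and libbpf headers (<bpf/bpf_helpers.h>, <bpf/bpf_endian.h>),
 * lets bpf_skb_change_proto() make room for the larger header and then
 * rewrites the Ethernet type and the L3 header itself. The IPv6 header
 * below is a placeholder, not a complete 4-to-6 translation.
 *
 *      SEC("tc")
 *      int xlat_4_to_6(struct __sk_buff *skb)
 *      {
 *              const __be16 p6 = bpf_htons(ETH_P_IPV6);
 *              struct ipv6hdr ip6 = {
 *                      .version   = 6,
 *                      .nexthdr   = IPPROTO_TCP,
 *                      .hop_limit = 64,
 *              };
 *
 *              if (bpf_skb_change_proto(skb, p6, 0))
 *                      return TC_ACT_SHOT;
 *              if (bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_proto),
 *                                      &p6, sizeof(p6), 0))
 *                      return TC_ACT_SHOT;
 *              if (bpf_skb_store_bytes(skb, ETH_HLEN, &ip6, sizeof(ip6), 0))
 *                      return TC_ACT_SHOT;
 *              return TC_ACT_OK;
 *      }
 */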
3033
3034BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
3035{
3036        /* We only allow a restricted subset to be changed for now. */
3037        if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
3038                     !skb_pkt_type_ok(pkt_type)))
3039                return -EINVAL;
3040
3041        skb->pkt_type = pkt_type;
3042        return 0;
3043}
3044
3045static const struct bpf_func_proto bpf_skb_change_type_proto = {
3046        .func           = bpf_skb_change_type,
3047        .gpl_only       = false,
3048        .ret_type       = RET_INTEGER,
3049        .arg1_type      = ARG_PTR_TO_CTX,
3050        .arg2_type      = ARG_ANYTHING,
3051};
3052
3053static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
3054{
3055        switch (skb->protocol) {
3056        case htons(ETH_P_IP):
3057                return sizeof(struct iphdr);
3058        case htons(ETH_P_IPV6):
3059                return sizeof(struct ipv6hdr);
3060        default:
3061                return ~0U;
3062        }
3063}
3064
3065#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK    (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
3066                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3067
3068#define BPF_F_ADJ_ROOM_MASK             (BPF_F_ADJ_ROOM_FIXED_GSO | \
3069                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
3070                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
3071                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
3072                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
3073                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))
3074
3075static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
3076                            u64 flags)
3077{
3078        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
3079        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
3080        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
3081        unsigned int gso_type = SKB_GSO_DODGY;
3082        int ret;
3083
3084        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3085                /* udp gso_size delineates datagrams, only allow if fixed */
3086                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3087                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3088                        return -ENOTSUPP;
3089        }
3090
3091        ret = skb_cow_head(skb, len_diff);
3092        if (unlikely(ret < 0))
3093                return ret;
3094
3095        if (encap) {
3096                if (skb->protocol != htons(ETH_P_IP) &&
3097                    skb->protocol != htons(ETH_P_IPV6))
3098                        return -ENOTSUPP;
3099
3100                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
3101                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3102                        return -EINVAL;
3103
3104                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
3105                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3106                        return -EINVAL;
3107
3108                if (skb->encapsulation)
3109                        return -EALREADY;
3110
3111                mac_len = skb->network_header - skb->mac_header;
3112                inner_net = skb->network_header;
3113                if (inner_mac_len > len_diff)
3114                        return -EINVAL;
3115                inner_trans = skb->transport_header;
3116        }
3117
3118        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3119        if (unlikely(ret < 0))
3120                return ret;
3121
3122        if (encap) {
3123                skb->inner_mac_header = inner_net - inner_mac_len;
3124                skb->inner_network_header = inner_net;
3125                skb->inner_transport_header = inner_trans;
3126                skb_set_inner_protocol(skb, skb->protocol);
3127
3128                skb->encapsulation = 1;
3129                skb_set_network_header(skb, mac_len);
3130
3131                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3132                        gso_type |= SKB_GSO_UDP_TUNNEL;
3133                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
3134                        gso_type |= SKB_GSO_GRE;
3135                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3136                        gso_type |= SKB_GSO_IPXIP6;
3137                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3138                        gso_type |= SKB_GSO_IPXIP4;
3139
3140                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
3141                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
3142                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
3143                                        sizeof(struct ipv6hdr) :
3144                                        sizeof(struct iphdr);
3145
3146                        skb_set_transport_header(skb, mac_len + nh_len);
3147                }
3148
3149                /* Match skb->protocol to new outer l3 protocol */
3150                if (skb->protocol == htons(ETH_P_IP) &&
3151                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3152                        skb->protocol = htons(ETH_P_IPV6);
3153                else if (skb->protocol == htons(ETH_P_IPV6) &&
3154                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3155                        skb->protocol = htons(ETH_P_IP);
3156        }
3157
3158        if (skb_is_gso(skb)) {
3159                struct skb_shared_info *shinfo = skb_shinfo(skb);
3160
3161                /* Due to header grow, MSS needs to be downgraded. */
3162                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3163                        skb_decrease_gso_size(shinfo, len_diff);
3164
3165                /* Header must be checked, and gso_segs recomputed. */
3166                shinfo->gso_type |= gso_type;
3167                shinfo->gso_segs = 0;
3168        }
3169
3170        return 0;
3171}
3172
3173static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
3174                              u64 flags)
3175{
3176        int ret;
3177
3178        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
3179                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3180                return -EINVAL;
3181
3182        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3183                /* udp gso_size delineates datagrams, only allow if fixed */
3184                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3185                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3186                        return -ENOTSUPP;
3187        }
3188
3189        ret = skb_unclone(skb, GFP_ATOMIC);
3190        if (unlikely(ret < 0))
3191                return ret;
3192
3193        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3194        if (unlikely(ret < 0))
3195                return ret;
3196
3197        if (skb_is_gso(skb)) {
3198                struct skb_shared_info *shinfo = skb_shinfo(skb);
3199
3200                /* Due to header shrink, MSS can be upgraded. */
3201                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3202                        skb_increase_gso_size(shinfo, len_diff);
3203
3204                /* Header must be checked, and gso_segs recomputed. */
3205                shinfo->gso_type |= SKB_GSO_DODGY;
3206                shinfo->gso_segs = 0;
3207        }
3208
3209        return 0;
3210}
3211
3212static u32 __bpf_skb_max_len(const struct sk_buff *skb)
3213{
3214        return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
3215                          SKB_MAX_ALLOC;
3216}
3217
3218BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3219           u32, mode, u64, flags)
3220{
3221        u32 len_cur, len_diff_abs = abs(len_diff);
3222        u32 len_min = bpf_skb_net_base_len(skb);
3223        u32 len_max = __bpf_skb_max_len(skb);
3224        __be16 proto = skb->protocol;
3225        bool shrink = len_diff < 0;
3226        u32 off;
3227        int ret;
3228
3229        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
3230                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3231                return -EINVAL;
3232        if (unlikely(len_diff_abs > 0xfffU))
3233                return -EFAULT;
3234        if (unlikely(proto != htons(ETH_P_IP) &&
3235                     proto != htons(ETH_P_IPV6)))
3236                return -ENOTSUPP;
3237
3238        off = skb_mac_header_len(skb);
3239        switch (mode) {
3240        case BPF_ADJ_ROOM_NET:
3241                off += bpf_skb_net_base_len(skb);
3242                break;
3243        case BPF_ADJ_ROOM_MAC:
3244                break;
3245        default:
3246                return -ENOTSUPP;
3247        }
3248
3249        len_cur = skb->len - skb_network_offset(skb);
3250        if ((shrink && (len_diff_abs >= len_cur ||
3251                        len_cur - len_diff_abs < len_min)) ||
3252            (!shrink && (skb->len + len_diff_abs > len_max &&
3253                         !skb_is_gso(skb))))
3254                return -ENOTSUPP;
3255
3256        ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
3257                       bpf_skb_net_grow(skb, off, len_diff_abs, flags);
3258        if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
3259                __skb_reset_checksum_unnecessary(skb);
3260
3261        bpf_compute_data_pointers(skb);
3262        return ret;
3263}
3264
3265static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
3266        .func           = bpf_skb_adjust_room,
3267        .gpl_only       = false,
3268        .ret_type       = RET_INTEGER,
3269        .arg1_type      = ARG_PTR_TO_CTX,
3270        .arg2_type      = ARG_ANYTHING,
3271        .arg3_type      = ARG_ANYTHING,
3272        .arg4_type      = ARG_ANYTHING,
3273};
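
/* An illustrative sketch (not part of the kernel build) of how a tc BPF
 * program might use this helper for the encap case handled above, assuming
 * the usual uapi and libbpf headers: room for an outer IPv4 plus base GRE
 * header is reserved at the mac layer, and the program then writes the
 * outer IPv4 header itself (the GRE header would follow analogously). The
 * header contents and GRE_BASE_HLEN are placeholders.
 *
 *      #define GRE_BASE_HLEN   4
 *
 *      SEC("tc")
 *      int encap_gre4(struct __sk_buff *skb)
 *      {
 *              __s32 room = sizeof(struct iphdr) + GRE_BASE_HLEN;
 *              __u64 flags = BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 |
 *                            BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
 *              struct iphdr outer = {
 *                      .version  = 4,
 *                      .ihl      = 5,
 *                      .ttl      = 64,
 *                      .protocol = IPPROTO_GRE,
 *              };
 *
 *              if (bpf_skb_adjust_room(skb, room, BPF_ADJ_ROOM_MAC, flags))
 *                      return TC_ACT_SHOT;
 *              if (bpf_skb_store_bytes(skb, ETH_HLEN, &outer, sizeof(outer), 0))
 *                      return TC_ACT_SHOT;
 *              return TC_ACT_OK;
 *      }
 */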
3274
3275static u32 __bpf_skb_min_len(const struct sk_buff *skb)
3276{
3277        u32 min_len = skb_network_offset(skb);
3278
3279        if (skb_transport_header_was_set(skb))
3280                min_len = skb_transport_offset(skb);
3281        if (skb->ip_summed == CHECKSUM_PARTIAL)
3282                min_len = skb_checksum_start_offset(skb) +
3283                          skb->csum_offset + sizeof(__sum16);
3284        return min_len;
3285}
3286
3287static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
3288{
3289        unsigned int old_len = skb->len;
3290        int ret;
3291
3292        ret = __skb_grow_rcsum(skb, new_len);
3293        if (!ret)
3294                memset(skb->data + old_len, 0, new_len - old_len);
3295        return ret;
3296}
3297
3298static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
3299{
3300        return __skb_trim_rcsum(skb, new_len);
3301}
3302
3303static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
3304                                        u64 flags)
3305{
3306        u32 max_len = __bpf_skb_max_len(skb);
3307        u32 min_len = __bpf_skb_min_len(skb);
3308        int ret;
3309
3310        if (unlikely(flags || new_len > max_len || new_len < min_len))
3311                return -EINVAL;
3312        if (skb->encapsulation)
3313                return -ENOTSUPP;
3314
3315        /* The basic idea of this helper is that it performs the work
3316         * needed to either grow or trim an skb, and the eBPF program
3317         * rewrites the rest via helpers like bpf_skb_store_bytes(),
3318         * bpf_lX_csum_replace() and others rather than passing a raw
3319         * buffer here. This is a slow path helper, intended for
3320         * replies with control messages.
3321         *
3322         * Like in bpf_skb_change_proto(), we want to keep this rather
3323         * minimal and free of protocol specifics so that we can keep
3324         * concerns separated: e.g. bpf_skb_store_bytes() should be the
3325         * only helper responsible for writing buffers.
3326         *
3327         * This is really expected to be a slow path operation for
3328         * control message replies, so we're implicitly linearizing,
3329         * uncloning and dropping offloads from the skb here.
3330         */
3331        ret = __bpf_try_make_writable(skb, skb->len);
3332        if (!ret) {
3333                if (new_len > skb->len)
3334                        ret = bpf_skb_grow_rcsum(skb, new_len);
3335                else if (new_len < skb->len)
3336                        ret = bpf_skb_trim_rcsum(skb, new_len);
3337                if (!ret && skb_is_gso(skb))
3338                        skb_gso_reset(skb);
3339        }
3340        return ret;
3341}
3342
3343BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3344           u64, flags)
3345{
3346        int ret = __bpf_skb_change_tail(skb, new_len, flags);
3347
3348        bpf_compute_data_pointers(skb);
3349        return ret;
3350}
3351
3352static const struct bpf_func_proto bpf_skb_change_tail_proto = {
3353        .func           = bpf_skb_change_tail,
3354        .gpl_only       = false,
3355        .ret_type       = RET_INTEGER,
3356        .arg1_type      = ARG_PTR_TO_CTX,
3357        .arg2_type      = ARG_ANYTHING,
3358        .arg3_type      = ARG_ANYTHING,
3359};
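
/* An illustrative sketch (not part of the kernel build) of the slow-path
 * use case described above, assuming a tc BPF program with the usual
 * libbpf headers: a reply is trimmed down to a small, fixed length before
 * being sent back out. The 64-byte target length is an arbitrary example
 * value.
 *
 *      SEC("tc")
 *      int trim_reply(struct __sk_buff *skb)
 *      {
 *              const __u32 target_len = 64;
 *
 *              if (skb->len > target_len &&
 *                  bpf_skb_change_tail(skb, target_len, 0))
 *                      return TC_ACT_SHOT;
 *              return TC_ACT_OK;
 *      }
 */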
3360
3361BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3362           u64, flags)
3363{
3364        int ret = __bpf_skb_change_tail(skb, new_len, flags);
3365
3366        bpf_compute_data_end_sk_skb(skb);
3367        return ret;
3368}
3369
3370static const struct bpf_func_proto sk_skb_change_tail_proto = {
3371        .func           = sk_skb_change_tail,
3372        .gpl_only       = false,
3373        .ret_type       = RET_INTEGER,
3374        .arg1_type      = ARG_PTR_TO_CTX,
3375        .arg2_type      = ARG_ANYTHING,
3376        .arg3_type      = ARG_ANYTHING,
3377};
3378
3379static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
3380                                        u64 flags)
3381{
3382        u32 max_len = __bpf_skb_max_len(skb);
3383        u32 new_len = skb->len + head_room;
3384        int ret;
3385
3386        if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
3387                     new_len < skb->len))
3388                return -EINVAL;
3389
3390        ret = skb_cow(skb, head_room);
3391        if (likely(!ret)) {
3392                /* The idea for this helper is that we currently only
3393                 * allow expanding room at the mac header. This means
3394                 * that skb->protocol, the network header, etc. stay as
3395                 * is. Compared to bpf_skb_change_tail(), we're more
3396                 * flexible here since we neither need to linearize nor
3397                 * reset GSO. The intention is for this helper to be
3398                 * used on an L3 skb that needs a mac header pushed
3399                 * for redirection into an L2 device.
3400                 */
3401                __skb_push(skb, head_room);
3402                memset(skb->data, 0, head_room);
3403                skb_reset_mac_header(skb);
3404        }
3405
3406        return ret;
3407}
3408
3409BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
3410           u64, flags)
3411{
3412        int ret = __bpf_skb_change_head(skb, head_room, flags);
3413
3414        bpf_compute_data_pointers(skb);
3415        return ret;
3416}
3417
3418static const struct bpf_func_proto bpf_skb_change_head_proto = {
3419        .func           = bpf_skb_change_head,
3420        .gpl_only       = false,
3421        .ret_type       = RET_INTEGER,
3422        .arg1_type      = ARG_PTR_TO_CTX,
3423        .arg2_type      = ARG_ANYTHING,
3424        .arg3_type      = ARG_ANYTHING,
3425};
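
/* An illustrative sketch (not part of the kernel build) of the intended use
 * described in __bpf_skb_change_head(): a BPF lwt xmit program on an L3
 * route pushes room for an Ethernet header and fills it in before
 * redirecting to an L2 device. Assumes the usual uapi and libbpf headers;
 * the MAC addresses and the target ifindex below are placeholders.
 *
 *      SEC("lwt_xmit")
 *      int l3_to_l2(struct __sk_buff *skb)
 *      {
 *              const int target_ifindex = 4;
 *              struct ethhdr eth = {
 *                      .h_dest   = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x02 },
 *                      .h_source = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
 *                      .h_proto  = bpf_htons(ETH_P_IP),
 *              };
 *
 *              if (bpf_skb_change_head(skb, sizeof(eth), 0))
 *                      return BPF_DROP;
 *              if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0))
 *                      return BPF_DROP;
 *              return bpf_redirect(target_ifindex, 0);
 *      }
 */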
3426
3427BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
3428           u64, flags)
3429{
3430        int ret = __bpf_skb_change_head(skb, head_room, flags);
3431
3432        bpf_compute_data_end_sk_skb(skb);
3433        return ret;
3434}
3435
3436static const struct bpf_func_proto sk_skb_change_head_proto = {
3437        .func           = sk_skb_change_head,
3438        .gpl_only       = false,
3439        .ret_type       = RET_INTEGER,
3440        .arg1_type      = ARG_PTR_TO_CTX,
3441        .arg2_type      = ARG_ANYTHING,
3442        .arg3_type      = ARG_ANYTHING,
3443};
3444static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
3445{
3446        return xdp_data_meta_unsupported(xdp) ? 0 :
3447               xdp->data - xdp->data_meta;
3448}
3449
3450BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
3451{
3452        void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3453        unsigned long metalen = xdp_get_metalen(xdp);
3454        void *data_start = xdp_frame_end + metalen;
3455        void *data = xdp->data + offset;
3456
3457        if (unlikely(data < data_start ||
3458                     data > xdp->data_end - ETH_HLEN))
3459                return -EINVAL;
3460
3461        if (metalen)
3462                memmove(xdp->data_meta + offset,
3463                        xdp->data_meta, metalen);
3464        xdp->data_meta += offset;
3465        xdp->data = data;
3466
3467        return 0;
3468}
3469
3470static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
3471        .func           = bpf_xdp_adjust_head,
3472        .gpl_only       = false,
3473        .ret_type       = RET_INTEGER,
3474        .arg1_type      = ARG_PTR_TO_CTX,
3475        .arg2_type      = ARG_ANYTHING,
3476};
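
/* An illustrative sketch (not part of the kernel build) of an XDP program,
 * assuming the usual libbpf headers, that uses bpf_xdp_adjust_head() to
 * reserve extra headroom in front of the frame and then moves the Ethernet
 * header to the new start; the 8 bytes freed up behind it would hold a new
 * encapsulation header, which this sketch leaves unwritten.
 *
 *      SEC("xdp")
 *      int xdp_reserve_room(struct xdp_md *ctx)
 *      {
 *              const int room = 8;
 *              struct ethhdr *new_eth, *old_eth;
 *              void *data, *data_end;
 *
 *              if (bpf_xdp_adjust_head(ctx, -room))
 *                      return XDP_DROP;
 *              data = (void *)(long)ctx->data;
 *              data_end = (void *)(long)ctx->data_end;
 *              new_eth = data;
 *              old_eth = data + room;
 *              if ((void *)(new_eth + 1) > data_end ||
 *                  (void *)(old_eth + 1) > data_end)
 *                      return XDP_DROP;
 *              __builtin_memcpy(new_eth, old_eth, sizeof(*new_eth));
 *              return XDP_PASS;
 *      }
 */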
3477
3478BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
3479{
3480        void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
3481        void *data_end = xdp->data_end + offset;
3482
3483        /* Notice that xdp_data_hard_end has reserved some tailroom */
3484        if (unlikely(data_end > data_hard_end))
3485                return -EINVAL;
3486
3487        /* ALL drivers MUST init xdp->frame_sz, chicken check below */
3488        if (unlikely(xdp->frame_sz > PAGE_SIZE)) {
3489                WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz);
3490                return -EINVAL;
3491        }
3492
3493        if (unlikely(data_end < xdp->data + ETH_HLEN))
3494                return -EINVAL;
3495
3496        /* Clear memory area on grow, as it can contain uninit kernel memory */
3497        if (offset > 0)
3498                memset(xdp->data_end, 0, offset);
3499
3500        xdp->data_end = data_end;
3501
3502        return 0;
3503}
3504
3505static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
3506        .func           = bpf_xdp_adjust_tail,
3507        .gpl_only       = false,
3508        .ret_type       = RET_INTEGER,
3509        .arg1_type      = ARG_PTR_TO_CTX,
3510        .arg2_type      = ARG_ANYTHING,
3511};
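
/* An illustrative sketch (not part of the kernel build) of an XDP program,
 * assuming the usual libbpf headers, that uses bpf_xdp_adjust_tail() with a
 * negative offset to cap the frame at a fixed snapshot length, e.g. before
 * mirroring samples elsewhere. The 256-byte cap is an arbitrary example
 * value.
 *
 *      SEC("xdp")
 *      int xdp_cap_len(struct xdp_md *ctx)
 *      {
 *              void *data = (void *)(long)ctx->data;
 *              void *data_end = (void *)(long)ctx->data_end;
 *              long len = data_end - data;
 *
 *              if (len > 256 && bpf_xdp_adjust_tail(ctx, -(len - 256)))
 *                      return XDP_DROP;
 *              return XDP_PASS;
 *      }
 */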
3512
3513BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
3514{
3515        void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3516        void *meta = xdp->data_meta + offset;
3517        unsigned long metalen = xdp->data - meta;
3518
3519        if (xdp_data_meta_unsupported(xdp))
3520                return -ENOTSUPP;
3521        if (unlikely(meta < xdp_frame_end ||
3522                     meta > xdp->data))
3523                return -EINVAL;
3524        if (unlikely((metalen & (sizeof(__u32) - 1)) ||
3525                     (metalen > 32)))
3526                return -EACCES;
3527
3528        xdp->data_meta = meta;
3529
3530        return 0;
3531}
3532
3533static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
3534        .func           = bpf_xdp_adjust_meta,
3535        .gpl_only       = false,
3536        .ret_type       = RET_INTEGER,
3537        .arg1_type      = ARG_PTR_TO_CTX,
3538        .arg2_type      = ARG_ANYTHING,
3539};
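
/* An illustrative sketch (not part of the kernel build) of the data_meta
 * contract enforced above: the metadata area sits right in front of
 * xdp->data, must be 4-byte aligned and no larger than 32 bytes. Assumes
 * an XDP program built with the usual libbpf headers; the stored value is
 * an arbitrary mark that a later consumer (e.g. a tc program reading
 * __sk_buff data_meta) could pick up.
 *
 *      SEC("xdp")
 *      int xdp_set_mark(struct xdp_md *ctx)
 *      {
 *              __u32 *mark;
 *
 *              if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*mark)))
 *                      return XDP_PASS;
 *              mark = (void *)(long)ctx->data_meta;
 *              if ((void *)(mark + 1) > (void *)(long)ctx->data)
 *                      return XDP_PASS;
 *              *mark = 42;
 *              return XDP_PASS;
 *      }
 */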
3540
3541static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
3542                            struct bpf_map *map, struct xdp_buff *xdp)
3543{
3544        switch (map->map_type) {
3545        case BPF_MAP_TYPE_DEVMAP:
3546        case BPF_MAP_TYPE_DEVMAP_HASH:
3547                return dev_map_enqueue(fwd, xdp, dev_rx);
3548        case BPF_MAP_TYPE_CPUMAP:
3549                return cpu_map_enqueue(fwd, xdp, dev_rx);
3550        case BPF_MAP_TYPE_XSKMAP:
3551                return __xsk_map_redirect(fwd, xdp);
3552        default:
3553                return -EBADRQC;
3554        }
3555        return 0;
3556}
3557
3558void xdp_do_flush(void)
3559{
3560        __dev_flush();
3561        __cpu_map_flush();
3562        __xsk_map_flush();
3563}
3564EXPORT_SYMBOL_GPL(xdp_do_flush);
3565
3566static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
3567{
3568        switch (map->map_type) {
3569        case BPF_MAP_TYPE_DEVMAP:
3570                return __dev_map_lookup_elem(map, index);
3571        case BPF_MAP_TYPE_DEVMAP_HASH:
3572                return __dev_map_hash_lookup_elem(map, index);
3573        case BPF_MAP_TYPE_CPUMAP:
3574                return __cpu_map_lookup_elem(map, index);
3575        case BPF_MAP_TYPE_XSKMAP:
3576                return __xsk_map_lookup_elem(map, index);
3577        default:
3578                return NULL;
3579        }
3580}
3581
3582void bpf_clear_redirect_map(struct bpf_map *map)
3583{
3584        struct bpf_redirect_info *ri;
3585        int cpu;
3586
3587        for_each_possible_cpu(cpu) {
3588                ri = per_cpu_ptr(&bpf_redirect_info, cpu);
3589                /* Avoid polluting the remote cacheline due to writes
3590                 * if not needed. Once we pass this test, we need the
3591                 * cmpxchg() to make sure it hasn't been changed in the
3592                 * meantime by a remote CPU.
3593                 */
3594                if (unlikely(READ_ONCE(ri->map) == map))
3595                        cmpxchg(&ri->map, map, NULL);
3596        }
3597}
3598
3599int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
3600                    struct bpf_prog *xdp_prog)
3601{
3602        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3603        struct bpf_map *map = READ_ONCE(ri->map);
3604        u32 index = ri->tgt_index;
3605        void *fwd = ri->tgt_value;
3606        int err;
3607
3608        ri->tgt_index = 0;
3609        ri->tgt_value = NULL;
3610        WRITE_ONCE(ri->map, NULL);
3611
3612        if (unlikely(!map)) {
3613                fwd = dev_get_by_index_rcu(dev_net(dev), index);
3614                if (unlikely(!fwd)) {
3615                        err = -EINVAL;
3616                        goto err;
3617                }
3618
3619                err = dev_xdp_enqueue(fwd, xdp, dev);
3620        } else {
3621                err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
3622        }
3623
3624        if (unlikely(err))
3625                goto err;
3626
3627        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
3628        return 0;
3629err:
3630        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
3631        return err;
3632}
3633EXPORT_SYMBOL_GPL(xdp_do_redirect);
3634
3635static int xdp_do_generic_redirect_map(struct net_device *dev,
3636                                       struct sk_buff *skb,
3637                                       struct xdp_buff *xdp,
3638                                       struct bpf_prog *xdp_prog,
3639                                       struct bpf_map *map)
3640{
3641        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3642        u32 index = ri->tgt_index;
3643        void *fwd = ri->tgt_value;
3644        int err = 0;
3645
3646        ri->tgt_index = 0;
3647        ri->tgt_value = NULL;
3648        WRITE_ONCE(ri->map, NULL);
3649
3650        if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
3651            map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
3652                struct bpf_dtab_netdev *dst = fwd;
3653
3654                err = dev_map_generic_redirect(dst, skb, xdp_prog);
3655                if (unlikely(err))
3656                        goto err;
3657        } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
3658                struct xdp_sock *xs = fwd;
3659
3660                err = xsk_generic_rcv(xs, xdp);
3661                if (err)
3662                        goto err;
3663                consume_skb(skb);
3664        } else {
3665                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
3666                err = -EBADRQC;
3667                goto err;
3668        }
3669
3670        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
3671        return 0;
3672err:
3673        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
3674        return err;
3675}
3676
3677int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
3678                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
3679{
3680        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3681        struct bpf_map *map = READ_ONCE(ri->map);
3682        u32 index = ri->tgt_index;
3683        struct net_device *fwd;
3684        int err = 0;
3685
3686        if (map)
3687                return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
3688                                                   map);
3689        ri->tgt_index = 0;
3690        fwd = dev_get_by_index_rcu(dev_net(dev), index);
3691        if (unlikely(!fwd)) {
3692                err = -EINVAL;
3693                goto err;
3694        }
3695
3696        err = xdp_ok_fwd_dev(fwd, skb->len);
3697        if (unlikely(err))
3698                goto err;
3699
3700        skb->dev = fwd;
3701        _trace_xdp_redirect(dev, xdp_prog, index);
3702        generic_xdp_tx(skb, xdp_prog);
3703        return 0;
3704err:
3705        _trace_xdp_redirect_err(dev, xdp_prog, index, err);
3706        return err;
3707}
3708
3709BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
3710{
3711        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3712
3713        if (unlikely(flags))
3714                return XDP_ABORTED;
3715
3716        ri->flags = flags;
3717        ri->tgt_index = ifindex;
3718        ri->tgt_value = NULL;
3719        WRITE_ONCE(ri->map, NULL);
3720
3721        return XDP_REDIRECT;
3722}
3723
3724static const struct bpf_func_proto bpf_xdp_redirect_proto = {
3725        .func           = bpf_xdp_redirect,
3726        .gpl_only       = false,
3727        .ret_type       = RET_INTEGER,
3728        .arg1_type      = ARG_ANYTHING,
3729        .arg2_type      = ARG_ANYTHING,
3730};
3731
3732BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
3733           u64, flags)
3734{
3735        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
3736
3737        /* Lower bits of the flags are used as the return code on lookup failure */
3738        if (unlikely(flags > XDP_TX))
3739                return XDP_ABORTED;
3740
3741        ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
3742        if (unlikely(!ri->tgt_value)) {
3743                /* If the lookup fails we want to clear out the state in the
3744                 * redirect_info struct completely, so that if an eBPF program
3745                 * performs multiple lookups, the last one always takes
3746                 * precedence.
3747                 */
3748                WRITE_ONCE(ri->map, NULL);
3749                return flags;
3750        }
3751
3752        ri->flags = flags;
3753        ri->tgt_index = ifindex;
3754        WRITE_ONCE(ri->map, map);
3755
3756        return XDP_REDIRECT;
3757}
3758
3759static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
3760        .func           = bpf_xdp_redirect_map,
3761        .gpl_only       = false,
3762        .ret_type       = RET_INTEGER,
3763        .arg1_type      = ARG_CONST_MAP_PTR,
3764        .arg2_type      = ARG_ANYTHING,
3765        .arg3_type      = ARG_ANYTHING,
3766};
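
/* An illustrative sketch (not part of the kernel build) of the flag
 * semantics implemented above: the lower bits of @flags are returned as-is
 * when the map lookup fails, so passing XDP_PASS makes unmatched packets
 * fall back to the regular stack. Assumes an XDP program with the usual
 * libbpf headers; the devmap name and sizing are placeholders.
 *
 *      struct {
 *              __uint(type, BPF_MAP_TYPE_DEVMAP);
 *              __uint(max_entries, 64);
 *              __type(key, __u32);
 *              __type(value, __u32);
 *      } tx_ports SEC(".maps");
 *
 *      SEC("xdp")
 *      int xdp_redirect_ports(struct xdp_md *ctx)
 *      {
 *              __u32 key = ctx->rx_queue_index;
 *
 *              return bpf_redirect_map(&tx_ports, key, XDP_PASS);
 *      }
 */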
3767
3768static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
3769                                  unsigned long off, unsigned long len)
3770{
3771        void *ptr = skb_header_pointer(skb, off, len, dst_buff);
3772
3773        if (unlikely(!ptr))
3774                return len;
3775        if (ptr != dst_buff)
3776                memcpy(dst_buff, ptr, len);
3777
3778        return 0;
3779}
3780
3781BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
3782           u64, flags, void *, meta, u64, meta_size)
3783{
3784        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
3785
3786        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
3787                return -EINVAL;
3788        if (unlikely(!skb || skb_size > skb->len))
3789                return -EFAULT;
3790
3791        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
3792                                bpf_skb_copy);
3793}
3794
3795static const struct bpf_func_proto bpf_skb_event_output_proto = {
3796        .func           = bpf_skb_event_output,
3797        .gpl_only       = true,
3798        .ret_type       = RET_INTEGER,
3799        .arg1_type      = ARG_PTR_TO_CTX,
3800        .arg2_type      = ARG_CONST_MAP_PTR,
3801        .arg3_type      = ARG_ANYTHING,
3802        .arg4_type      = ARG_PTR_TO_MEM,
3803        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
3804};
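
/* An illustrative sketch (not part of the kernel build) of the
 * BPF_F_CTXLEN_MASK mechanism handled above: a tc BPF program appends up
 * to 64 bytes of packet payload to each perf sample by encoding that
 * length in the upper 32 bits of the flags. Assumes the usual libbpf
 * headers; the map name, metadata struct and snapshot length are
 * placeholders.
 *
 *      struct {
 *              __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 *              __uint(key_size, sizeof(int));
 *              __uint(value_size, sizeof(__u32));
 *      } events SEC(".maps");
 *
 *      struct event_meta {
 *              __u32 ifindex;
 *              __u32 pkt_len;
 *      };
 *
 *      SEC("tc")
 *      int sample_skb(struct __sk_buff *skb)
 *      {
 *              struct event_meta meta = {
 *                      .ifindex = skb->ifindex,
 *                      .pkt_len = skb->len,
 *              };
 *              __u64 cap = skb->len < 64 ? skb->len : 64;
 *
 *              bpf_perf_event_output(skb, &events,
 *                                    BPF_F_CURRENT_CPU | (cap << 32),
 *                                    &meta, sizeof(meta));
 *              return TC_ACT_OK;
 *      }
 */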
3805
3806BTF_ID_LIST(bpf_skb_output_btf_ids)
3807BTF_ID(struct, sk_buff)
3808
3809const struct bpf_func_proto bpf_skb_output_proto = {
3810        .func           = bpf_skb_event_output,
3811        .gpl_only       = true,
3812        .ret_type       = RET_INTEGER,
3813        .arg1_type      = ARG_PTR_TO_BTF_ID,
3814        .arg2_type      = ARG_CONST_MAP_PTR,
3815        .arg3_type      = ARG_ANYTHING,
3816        .arg4_type      = ARG_PTR_TO_MEM,
3817        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
3818        .btf_id         = bpf_skb_output_btf_ids,
3819};
3820
3821static unsigned short bpf_tunnel_key_af(u64 flags)
3822{
3823        return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
3824}
3825
3826BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
3827           u32, size, u64, flags)
3828{
3829        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
3830        u8 compat[sizeof(struct bpf_tunnel_key)];
3831        void *to_orig = to;
3832        int err;
3833
3834        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
3835                err = -EINVAL;
3836                goto err_clear;
3837        }
3838        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
3839                err = -EPROTO;
3840                goto err_clear;
3841        }
3842        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
3843                err = -EINVAL;
3844                switch (size) {
3845                case offsetof(struct bpf_tunnel_key, tunnel_label):
3846                case offsetof(struct bpf_tunnel_key, tunnel_ext):
3847                        goto set_compat;
3848                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
3849                        /* Fixup deprecated structure layouts here, so we have
3850                         * a common path later on.
3851                         */
3852                        if (ip_tunnel_info_af(info) != AF_INET)
3853                                goto err_clear;
3854set_compat:
3855                        to = (struct bpf_tunnel_key *)compat;
3856                        break;
3857                default:
3858                        goto err_clear;
3859                }
3860        }
3861
3862        to->tunnel_id = be64_to_cpu(info->key.tun_id);
3863        to->tunnel_tos = info->key.tos;
3864        to->tunnel_ttl = info->key.ttl;
3865        to->tunnel_ext = 0;
3866
3867        if (flags & BPF_F_TUNINFO_IPV6) {
3868                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
3869                       sizeof(to->remote_ipv6));
3870                to->tunnel_label = be32_to_cpu(info->key.label);
3871        } else {
3872                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
3873                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
3874                to->tunnel_label = 0;
3875        }
3876
3877        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
3878                memcpy(to_orig, to, size);
3879
3880        return 0;
3881err_clear:
3882        memset(to_orig, 0, size);
3883        return err;
3884}
3885
3886static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
3887        .func           = bpf_skb_get_tunnel_key,
3888        .gpl_only       = false,
3889        .ret_type       = RET_INTEGER,
3890        .arg1_type      = ARG_PTR_TO_CTX,
3891        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
3892        .arg3_type      = ARG_CONST_SIZE,
3893        .arg4_type      = ARG_ANYTHING,
3894};
3895
3896BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
3897{
3898        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
3899        int err;
3900
3901        if (unlikely(!info ||
3902                     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
3903                err = -ENOENT;
3904                goto err_clear;
3905        }
3906        if (unlikely(size < info->options_len)) {
3907                err = -ENOMEM;
3908                goto err_clear;
3909        }
3910
3911        ip_tunnel_info_opts_get(to, info);
3912        if (size > info->options_len)
3913                memset(to + info->options_len, 0, size - info->options_len);
3914
3915        return info->options_len;
3916err_clear:
3917        memset(to, 0, size);
3918        return err;
3919}
3920
3921static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
3922        .func           = bpf_skb_get_tunnel_opt,
3923        .gpl_only       = false,
3924        .ret_type       = RET_INTEGER,
3925        .arg1_type      = ARG_PTR_TO_CTX,
3926        .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
3927        .arg3_type      = ARG_CONST_SIZE,
3928};
3929
3930static struct metadata_dst __percpu *md_dst;
3931
3932BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
3933           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
3934{
3935        struct metadata_dst *md = this_cpu_ptr(md_dst);
3936        u8 compat[sizeof(struct bpf_tunnel_key)];
3937        struct ip_tunnel_info *info;
3938
3939        if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
3940                               BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
3941                return -EINVAL;
3942        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
3943                switch (size) {
3944                case offsetof(struct bpf_tunnel_key, tunnel_label):
3945                case offsetof(struct bpf_tunnel_key, tunnel_ext):
3946                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
3947                        /* Fixup deprecated structure layouts here, so we have
3948                         * a common path later on.
3949                         */
3950                        memcpy(compat, from, size);
3951                        memset(compat + size, 0, sizeof(compat) - size);
3952                        from = (const struct bpf_tunnel_key *) compat;
3953                        break;
3954                default:
3955                        return -EINVAL;
3956                }
3957        }
3958        if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
3959                     from->tunnel_ext))
3960                return -EINVAL;
3961
3962        skb_dst_drop(skb);
3963        dst_hold((struct dst_entry *) md);
3964        skb_dst_set(skb, (struct dst_entry *) md);
3965
3966        info = &md->u.tun_info;
3967        memset(info, 0, sizeof(*info));
3968        info->mode = IP_TUNNEL_INFO_TX;
3969
3970        info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
3971        if (flags & BPF_F_DONT_FRAGMENT)
3972                info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
3973        if (flags & BPF_F_ZERO_CSUM_TX)
3974                info->key.tun_flags &= ~TUNNEL_CSUM;
3975        if (flags & BPF_F_SEQ_NUMBER)
3976                info->key.tun_flags |= TUNNEL_SEQ;
3977
3978        info->key.tun_id = cpu_to_be64(from->tunnel_id);
3979        info->key.tos = from->tunnel_tos;
3980        info->key.ttl = from->tunnel_ttl;
3981
3982        if (flags & BPF_F_TUNINFO_IPV6) {
3983                info->mode |= IP_TUNNEL_INFO_IPV6;
3984                memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
3985                       sizeof(from->remote_ipv6));
3986                info->key.label = cpu_to_be32(from->tunnel_label) &
3987                                  IPV6_FLOWLABEL_MASK;
3988        } else {
3989                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
3990        }
3991
3992        return 0;
3993}
3994
3995static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
3996        .func           = bpf_skb_set_tunnel_key,
3997        .gpl_only       = false,
3998        .ret_type       = RET_INTEGER,
3999        .arg1_type      = ARG_PTR_TO_CTX,
4000        .arg2_type      = ARG_PTR_TO_MEM,
4001        .arg3_type      = ARG_CONST_SIZE,
4002        .arg4_type      = ARG_ANYTHING,
4003};
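
/* An illustrative sketch (not part of the kernel build) of the intended use
 * of bpf_skb_set_tunnel_key(): a tc egress program on a collect_md tunnel
 * device (e.g. a vxlan device created with "external") fills in the
 * per-packet metadata dst set up above before the tunnel driver builds the
 * outer headers. Assumes the usual libbpf headers; the tunnel id and
 * remote address (given in host byte order, as the helper converts it) are
 * placeholders.
 *
 *      SEC("tc")
 *      int set_tunnel(struct __sk_buff *skb)
 *      {
 *              struct bpf_tunnel_key key = {
 *                      .tunnel_id   = 42,
 *                      .remote_ipv4 = 0xc0a80001,
 *                      .tunnel_ttl  = 64,
 *              };
 *
 *              if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *                                         BPF_F_ZERO_CSUM_TX))
 *                      return TC_ACT_SHOT;
 *              return TC_ACT_OK;
 *      }
 */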
4004
4005BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
4006           const u8 *, from, u32, size)
4007{
4008        struct ip_tunnel_info *info = skb_tunnel_info(skb);
4009        const struct metadata_dst *md = this_cpu_ptr(md_dst);
4010
4011        if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
4012                return -EINVAL;
4013        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
4014                return -ENOMEM;
4015
4016        ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
4017
4018        return 0;
4019}
4020
4021static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
4022        .func           = bpf_skb_set_tunnel_opt,
4023        .gpl_only       = false,
4024        .ret_type       = RET_INTEGER,
4025        .arg1_type      = ARG_PTR_TO_CTX,
4026        .arg2_type      = ARG_PTR_TO_MEM,
4027        .arg3_type      = ARG_CONST_SIZE,
4028};
4029
4030static const struct bpf_func_proto *
4031bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
4032{
4033        if (!md_dst) {
4034                struct metadata_dst __percpu *tmp;
4035
4036                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
4037                                                METADATA_IP_TUNNEL,
4038                                                GFP_KERNEL);
4039                if (!tmp)
4040                        return NULL;
4041                if (cmpxchg(&md_dst, NULL, tmp))
4042                        metadata_dst_free_percpu(tmp);
4043        }
4044
4045        switch (which) {
4046        case BPF_FUNC_skb_set_tunnel_key:
4047                return &bpf_skb_set_tunnel_key_proto;
4048        case BPF_FUNC_skb_set_tunnel_opt:
4049                return &bpf_skb_set_tunnel_opt_proto;
4050        default:
4051                return NULL;
4052        }
4053}
4054
4055BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
4056           u32, idx)
4057{
4058        struct bpf_array *array = container_of(map, struct bpf_array, map);
4059        struct cgroup *cgrp;
4060        struct sock *sk;
4061
4062        sk = skb_to_full_sk(skb);
4063        if (!sk || !sk_fullsock(sk))
4064                return -ENOENT;
4065        if (unlikely(idx >= array->map.max_entries))
4066                return -E2BIG;
4067
4068        cgrp = READ_ONCE(array->ptrs[idx]);
4069        if (unlikely(!cgrp))
4070                return -EAGAIN;
4071
4072        return sk_under_cgroup_hierarchy(sk, cgrp);
4073}
4074
4075static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
4076        .func           = bpf_skb_under_cgroup,
4077        .gpl_only       = false,
4078        .ret_type       = RET_INTEGER,
4079        .arg1_type      = ARG_PTR_TO_CTX,
4080        .arg2_type      = ARG_CONST_MAP_PTR,
4081        .arg3_type      = ARG_ANYTHING,
4082};
4083
4084#ifdef CONFIG_SOCK_CGROUP_DATA
4085static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
4086{
4087        struct cgroup *cgrp;
4088
4089        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
4090        return cgroup_id(cgrp);
4091}
4092
4093BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
4094{
4095        struct sock *sk = skb_to_full_sk(skb);
4096
4097        if (!sk || !sk_fullsock(sk))
4098                return 0;
4099
4100        return __bpf_sk_cgroup_id(sk);
4101}
4102
4103static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
4104        .func           = bpf_skb_cgroup_id,
4105        .gpl_only       = false,
4106        .ret_type       = RET_INTEGER,
4107        .arg1_type      = ARG_PTR_TO_CTX,
4108};
4109
4110static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
4111                                              int ancestor_level)
4112{
4113        struct cgroup *ancestor;
4114        struct cgroup *cgrp;
4115
4116        cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
4117        ancestor = cgroup_ancestor(cgrp, ancestor_level);
4118        if (!ancestor)
4119                return 0;
4120
4121        return cgroup_id(ancestor);
4122}
4123
4124BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
4125           ancestor_level)
4126{
4127        struct sock *sk = skb_to_full_sk(skb);
4128
4129        if (!sk || !sk_fullsock(sk))
4130                return 0;
4131
4132        return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
4133}
4134
4135static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
4136        .func           = bpf_skb_ancestor_cgroup_id,
4137        .gpl_only       = false,
4138        .ret_type       = RET_INTEGER,
4139        .arg1_type      = ARG_PTR_TO_CTX,
4140        .arg2_type      = ARG_ANYTHING,
4141};
4142
4143BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
4144{
4145        return __bpf_sk_cgroup_id(sk);
4146}
4147
4148static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
4149        .func           = bpf_sk_cgroup_id,
4150        .gpl_only       = false,
4151        .ret_type       = RET_INTEGER,
4152        .arg1_type      = ARG_PTR_TO_SOCKET,
4153};
4154
4155BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
4156{
4157        return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
4158}
4159
4160static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
4161        .func           = bpf_sk_ancestor_cgroup_id,
4162        .gpl_only       = false,
4163        .ret_type       = RET_INTEGER,
4164        .arg1_type      = ARG_PTR_TO_SOCKET,
4165        .arg2_type      = ARG_ANYTHING,
4166};
4167#endif
4168
4169static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
4170                                  unsigned long off, unsigned long len)
4171{
4172        memcpy(dst_buff, src_buff + off, len);
4173        return 0;
4174}
4175
4176BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
4177           u64, flags, void *, meta, u64, meta_size)
4178{
4179        u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
4180
4181        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
4182                return -EINVAL;
4183        if (unlikely(!xdp ||
4184                     xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
4185                return -EFAULT;
4186
4187        return bpf_event_output(map, flags, meta, meta_size, xdp->data,
4188                                xdp_size, bpf_xdp_copy);
4189}
4190
4191static const struct bpf_func_proto bpf_xdp_event_output_proto = {
4192        .func           = bpf_xdp_event_output,
4193        .gpl_only       = true,
4194        .ret_type       = RET_INTEGER,
4195        .arg1_type      = ARG_PTR_TO_CTX,
4196        .arg2_type      = ARG_CONST_MAP_PTR,
4197        .arg3_type      = ARG_ANYTHING,
4198        .arg4_type      = ARG_PTR_TO_MEM,
4199        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
4200};
4201
4202BTF_ID_LIST(bpf_xdp_output_btf_ids)
4203BTF_ID(struct, xdp_buff)
4204
4205const struct bpf_func_proto bpf_xdp_output_proto = {
4206        .func           = bpf_xdp_event_output,
4207        .gpl_only       = true,
4208        .ret_type       = RET_INTEGER,
4209        .arg1_type      = ARG_PTR_TO_BTF_ID,
4210        .arg2_type      = ARG_CONST_MAP_PTR,
4211        .arg3_type      = ARG_ANYTHING,
4212        .arg4_type      = ARG_PTR_TO_MEM,
4213        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
4214        .btf_id         = bpf_xdp_output_btf_ids,
4215};
4216
4217BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
4218{
4219        return skb->sk ? sock_gen_cookie(skb->sk) : 0;
4220}
4221
4222static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
4223        .func           = bpf_get_socket_cookie,
4224        .gpl_only       = false,
4225        .ret_type       = RET_INTEGER,
4226        .arg1_type      = ARG_PTR_TO_CTX,
4227};
4228
4229BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
4230{
4231        return sock_gen_cookie(ctx->sk);
4232}
4233
4234static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
4235        .func           = bpf_get_socket_cookie_sock_addr,
4236        .gpl_only       = false,
4237        .ret_type       = RET_INTEGER,
4238        .arg1_type      = ARG_PTR_TO_CTX,
4239};
4240
4241BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
4242{
4243        return sock_gen_cookie(ctx);
4244}
4245
4246static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
4247        .func           = bpf_get_socket_cookie_sock,
4248        .gpl_only       = false,
4249        .ret_type       = RET_INTEGER,
4250        .arg1_type      = ARG_PTR_TO_CTX,
4251};
4252
4253BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
4254{
4255        return sock_gen_cookie(ctx->sk);
4256}
4257
4258static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
4259        .func           = bpf_get_socket_cookie_sock_ops,
4260        .gpl_only       = false,
4261        .ret_type       = RET_INTEGER,
4262        .arg1_type      = ARG_PTR_TO_CTX,
4263};
4264
4265static u64 __bpf_get_netns_cookie(struct sock *sk)
4266{
4267#ifdef CONFIG_NET_NS
4268        return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
4269#else
4270        return 0;
4271#endif
4272}
4273
4274BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
4275{
4276        return __bpf_get_netns_cookie(ctx);
4277}
4278
4279static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
4280        .func           = bpf_get_netns_cookie_sock,
4281        .gpl_only       = false,
4282        .ret_type       = RET_INTEGER,
4283        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
4284};
4285
4286BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
4287{
4288        return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
4289}
4290
4291static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
4292        .func           = bpf_get_netns_cookie_sock_addr,
4293        .gpl_only       = false,
4294        .ret_type       = RET_INTEGER,
4295        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,
4296};
4297
4298BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
4299{
4300        struct sock *sk = sk_to_full_sk(skb->sk);
4301        kuid_t kuid;
4302
4303        if (!sk || !sk_fullsock(sk))
4304                return overflowuid;
4305        kuid = sock_net_uid(sock_net(sk), sk);
4306        return from_kuid_munged(sock_net(sk)->user_ns, kuid);
4307}
4308
4309static const struct bpf_func_proto bpf_get_socket_uid_proto = {
4310        .func           = bpf_get_socket_uid,
4311        .gpl_only       = false,
4312        .ret_type       = RET_INTEGER,
4313        .arg1_type      = ARG_PTR_TO_CTX,
4314};
4315
4316#define SOCKOPT_CC_REINIT (1 << 0)
4317
4318static int _bpf_setsockopt(struct sock *sk, int level, int optname,
4319                           char *optval, int optlen, u32 flags)
4320{
4321        char devname[IFNAMSIZ];
4322        int val, valbool;
4323        struct net *net;
4324        int ifindex;
4325        int ret = 0;
4326
4327        if (!sk_fullsock(sk))
4328                return -EINVAL;
4329
4330        sock_owned_by_me(sk);
4331
4332        if (level == SOL_SOCKET) {
4333                if (optlen != sizeof(int) && optname != SO_BINDTODEVICE)
4334                        return -EINVAL;
4335                val = *((int *)optval);
4336                valbool = val ? 1 : 0;
4337
4338                /* Only some socketops are supported */
4339                switch (optname) {
4340                case SO_RCVBUF:
4341                        val = min_t(u32, val, sysctl_rmem_max);
4342                        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
4343                        WRITE_ONCE(sk->sk_rcvbuf,
4344                                   max_t(int, val * 2, SOCK_MIN_RCVBUF));
4345                        break;
4346                case SO_SNDBUF:
4347                        val = min_t(u32, val, sysctl_wmem_max);
4348                        sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
4349                        WRITE_ONCE(sk->sk_sndbuf,
4350                                   max_t(int, val * 2, SOCK_MIN_SNDBUF));
4351                        break;
4352                case SO_MAX_PACING_RATE: /* 32bit version */
4353                        if (val != ~0U)
4354                                cmpxchg(&sk->sk_pacing_status,
4355                                        SK_PACING_NONE,
4356                                        SK_PACING_NEEDED);
4357                        sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
4358                        sk->sk_pacing_rate = min(sk->sk_pacing_rate,
4359                                                 sk->sk_max_pacing_rate);
4360                        break;
4361                case SO_PRIORITY:
4362                        sk->sk_priority = val;
4363                        break;
4364                case SO_RCVLOWAT:
4365                        if (val < 0)
4366                                val = INT_MAX;
4367                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
4368                        break;
4369                case SO_MARK:
4370                        if (sk->sk_mark != val) {
4371                                sk->sk_mark = val;
4372                                sk_dst_reset(sk);
4373                        }
4374                        break;
4375                case SO_BINDTODEVICE:
4376                        optlen = min_t(long, optlen, IFNAMSIZ - 1);
4377                        strncpy(devname, optval, optlen);
4378                        devname[optlen] = 0;
4379
4380                        ifindex = 0;
4381                        if (devname[0] != '\0') {
4382                                struct net_device *dev;
4383
4384                                ret = -ENODEV;
4385
4386                                net = sock_net(sk);
4387                                dev = dev_get_by_name(net, devname);
4388                                if (!dev)
4389                                        break;
4390                                ifindex = dev->ifindex;
4391                                dev_put(dev);
4392                        }
4393                        ret = sock_bindtoindex(sk, ifindex, false);
4394                        break;
4395                case SO_KEEPALIVE:
4396                        if (sk->sk_prot->keepalive)
4397                                sk->sk_prot->keepalive(sk, valbool);
4398                        sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
4399                        break;
4400                default:
4401                        ret = -EINVAL;
4402                }
4403#ifdef CONFIG_INET
4404        } else if (level == SOL_IP) {
4405                if (optlen != sizeof(int) || sk->sk_family != AF_INET)
4406                        return -EINVAL;
4407
4408                val = *((int *)optval);
4409                /* Only some options are supported */
4410                switch (optname) {
4411                case IP_TOS:
4412                        if (val < -1 || val > 0xff) {
4413                                ret = -EINVAL;
4414                        } else {
4415                                struct inet_sock *inet = inet_sk(sk);
4416
4417                                if (val == -1)
4418                                        val = 0;
4419                                inet->tos = val;
4420                        }
4421                        break;
4422                default:
4423                        ret = -EINVAL;
4424                }
4425#if IS_ENABLED(CONFIG_IPV6)
4426        } else if (level == SOL_IPV6) {
4427                if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
4428                        return -EINVAL;
4429
4430                val = *((int *)optval);
4431                /* Only some options are supported */
4432                switch (optname) {
4433                case IPV6_TCLASS:
4434                        if (val < -1 || val > 0xff) {
4435                                ret = -EINVAL;
4436                        } else {
4437                                struct ipv6_pinfo *np = inet6_sk(sk);
4438
4439                                if (val == -1)
4440                                        val = 0;
4441                                np->tclass = val;
4442                        }
4443                        break;
4444                default:
4445                        ret = -EINVAL;
4446                }
4447#endif
4448        } else if (level == SOL_TCP &&
4449                   sk->sk_prot->setsockopt == tcp_setsockopt) {
4450                if (optname == TCP_CONGESTION) {
4451                        char name[TCP_CA_NAME_MAX];
4452                        bool reinit = flags & SOCKOPT_CC_REINIT;
4453
4454                        strncpy(name, optval, min_t(long, optlen,
4455                                                    TCP_CA_NAME_MAX-1));
4456                        name[TCP_CA_NAME_MAX-1] = 0;
4457                        ret = tcp_set_congestion_control(sk, name, false,
4458                                                         reinit, true);
4459                } else {
4460                        struct inet_connection_sock *icsk = inet_csk(sk);
4461                        struct tcp_sock *tp = tcp_sk(sk);
4462
4463                        if (optlen != sizeof(int))
4464                                return -EINVAL;
4465
4466                        val = *((int *)optval);
4467                        /* Only some options are supported */
4468                        switch (optname) {
4469                        case TCP_BPF_IW:
4470                                if (val <= 0 || tp->data_segs_out > tp->syn_data)
4471                                        ret = -EINVAL;
4472                                else
4473                                        tp->snd_cwnd = val;
4474                                break;
4475                        case TCP_BPF_SNDCWND_CLAMP:
4476                                if (val <= 0) {
4477                                        ret = -EINVAL;
4478                                } else {
4479                                        tp->snd_cwnd_clamp = val;
4480                                        tp->snd_ssthresh = val;
4481                                }
4482                                break;
4483                        case TCP_SAVE_SYN:
4484                                if (val < 0 || val > 1)
4485                                        ret = -EINVAL;
4486                                else
4487                                        tp->save_syn = val;
4488                                break;
4489                        case TCP_KEEPIDLE:
4490                                ret = tcp_sock_set_keepidle_locked(sk, val);
4491                                break;
4492                        case TCP_KEEPINTVL:
4493                                if (val < 1 || val > MAX_TCP_KEEPINTVL)
4494                                        ret = -EINVAL;
4495                                else
4496                                        tp->keepalive_intvl = val * HZ;
4497                                break;
4498                        case TCP_KEEPCNT:
4499                                if (val < 1 || val > MAX_TCP_KEEPCNT)
4500                                        ret = -EINVAL;
4501                                else
4502                                        tp->keepalive_probes = val;
4503                                break;
4504                        case TCP_SYNCNT:
4505                                if (val < 1 || val > MAX_TCP_SYNCNT)
4506                                        ret = -EINVAL;
4507                                else
4508                                        icsk->icsk_syn_retries = val;
4509                                break;
4510                        case TCP_USER_TIMEOUT:
4511                                if (val < 0)
4512                                        ret = -EINVAL;
4513                                else
4514                                        icsk->icsk_user_timeout = val;
4515                                break;
4516                        default:
4517                                ret = -EINVAL;
4518                        }
4519                }
4520#endif
4521        } else {
4522                ret = -EINVAL;
4523        }
4524        return ret;
4525}
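
/*
 * Illustrative sketch (not part of this file): how a BPF sock_ops program
 * reaches the SOL_SOCKET and SOL_TCP branches above through the
 * bpf_setsockopt() helper.  Program name and option values are hypothetical.
 *
 *	SEC("sockops")
 *	int tune_conn(struct bpf_sock_ops *skops)
 *	{
 *		int one = 1;
 *		char cc[] = "bbr";
 *
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) {
 *			bpf_setsockopt(skops, SOL_SOCKET, SO_KEEPALIVE,
 *				       &one, sizeof(one));
 *			bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *				       cc, sizeof(cc));
 *		}
 *		return 1;
 *	}
 */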
4526
4527static int _bpf_getsockopt(struct sock *sk, int level, int optname,
4528                           char *optval, int optlen)
4529{
4530        if (!sk_fullsock(sk))
4531                goto err_clear;
4532
4533        sock_owned_by_me(sk);
4534
4535#ifdef CONFIG_INET
4536        if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
4537                struct inet_connection_sock *icsk;
4538                struct tcp_sock *tp;
4539
4540                switch (optname) {
4541                case TCP_CONGESTION:
4542                        icsk = inet_csk(sk);
4543
4544                        if (!icsk->icsk_ca_ops || optlen <= 1)
4545                                goto err_clear;
4546                        strncpy(optval, icsk->icsk_ca_ops->name, optlen);
4547                        optval[optlen - 1] = 0;
4548                        break;
4549                case TCP_SAVED_SYN:
4550                        tp = tcp_sk(sk);
4551
4552                        if (optlen <= 0 || !tp->saved_syn ||
4553                            optlen > tp->saved_syn[0])
4554                                goto err_clear;
4555                        memcpy(optval, tp->saved_syn + 1, optlen);
4556                        break;
4557                default:
4558                        goto err_clear;
4559                }
4560        } else if (level == SOL_IP) {
4561                struct inet_sock *inet = inet_sk(sk);
4562
4563                if (optlen != sizeof(int) || sk->sk_family != AF_INET)
4564                        goto err_clear;
4565
4566                /* Only some options are supported */
4567                switch (optname) {
4568                case IP_TOS:
4569                        *((int *)optval) = (int)inet->tos;
4570                        break;
4571                default:
4572                        goto err_clear;
4573                }
4574#if IS_ENABLED(CONFIG_IPV6)
4575        } else if (level == SOL_IPV6) {
4576                struct ipv6_pinfo *np = inet6_sk(sk);
4577
4578                if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
4579                        goto err_clear;
4580
4581                /* Only some options are supported */
4582                switch (optname) {
4583                case IPV6_TCLASS:
4584                        *((int *)optval) = (int)np->tclass;
4585                        break;
4586                default:
4587                        goto err_clear;
4588                }
4589#endif
4590        } else {
4591                goto err_clear;
4592        }
4593        return 0;
4594#endif
4595err_clear:
4596        memset(optval, 0, optlen);
4597        return -EINVAL;
4598}
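
/*
 * Illustrative sketch (not part of this file): reading the congestion control
 * name back from a sock_ops program with the bpf_getsockopt() helper, which
 * lands in the SOL_TCP branch of _bpf_getsockopt() above.  Names are
 * hypothetical.
 *
 *	SEC("sockops")
 *	int check_cc(struct bpf_sock_ops *skops)
 *	{
 *		char cc[16];	// TCP_CA_NAME_MAX
 *
 *		if (!bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *				    cc, sizeof(cc)))
 *			bpf_printk("cc in use: %s", cc);
 *		return 1;
 *	}
 */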
4599
4600BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
4601           int, level, int, optname, char *, optval, int, optlen)
4602{
4603        u32 flags = 0;
4604        return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen,
4605                               flags);
4606}
4607
4608static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
4609        .func           = bpf_sock_addr_setsockopt,
4610        .gpl_only       = false,
4611        .ret_type       = RET_INTEGER,
4612        .arg1_type      = ARG_PTR_TO_CTX,
4613        .arg2_type      = ARG_ANYTHING,
4614        .arg3_type      = ARG_ANYTHING,
4615        .arg4_type      = ARG_PTR_TO_MEM,
4616        .arg5_type      = ARG_CONST_SIZE,
4617};
4618
4619BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
4620           int, level, int, optname, char *, optval, int, optlen)
4621{
4622        return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
4623}
4624
4625static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
4626        .func           = bpf_sock_addr_getsockopt,
4627        .gpl_only       = false,
4628        .ret_type       = RET_INTEGER,
4629        .arg1_type      = ARG_PTR_TO_CTX,
4630        .arg2_type      = ARG_ANYTHING,
4631        .arg3_type      = ARG_ANYTHING,
4632        .arg4_type      = ARG_PTR_TO_UNINIT_MEM,
4633        .arg5_type      = ARG_CONST_SIZE,
4634};
4635
4636BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
4637           int, level, int, optname, char *, optval, int, optlen)
4638{
4639        u32 flags = 0;
4640        if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
4641                flags |= SOCKOPT_CC_REINIT;
4642        return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen,
4643                               flags);
4644}
4645
4646static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
4647        .func           = bpf_sock_ops_setsockopt,
4648        .gpl_only       = false,
4649        .ret_type       = RET_INTEGER,
4650        .arg1_type      = ARG_PTR_TO_CTX,
4651        .arg2_type      = ARG_ANYTHING,
4652        .arg3_type      = ARG_ANYTHING,
4653        .arg4_type      = ARG_PTR_TO_MEM,
4654        .arg5_type      = ARG_CONST_SIZE,
4655};
4656
4657BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
4658           int, level, int, optname, char *, optval, int, optlen)
4659{
4660        return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
4661}
4662
4663static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
4664        .func           = bpf_sock_ops_getsockopt,
4665        .gpl_only       = false,
4666        .ret_type       = RET_INTEGER,
4667        .arg1_type      = ARG_PTR_TO_CTX,
4668        .arg2_type      = ARG_ANYTHING,
4669        .arg3_type      = ARG_ANYTHING,
4670        .arg4_type      = ARG_PTR_TO_UNINIT_MEM,
4671        .arg5_type      = ARG_CONST_SIZE,
4672};
4673
4674BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
4675           int, argval)
4676{
4677        struct sock *sk = bpf_sock->sk;
4678        int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
4679
4680        if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
4681                return -EINVAL;
4682
4683        tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
4684
4685        return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
4686}
4687
4688static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
4689        .func           = bpf_sock_ops_cb_flags_set,
4690        .gpl_only       = false,
4691        .ret_type       = RET_INTEGER,
4692        .arg1_type      = ARG_PTR_TO_CTX,
4693        .arg2_type      = ARG_ANYTHING,
4694};
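
/*
 * Illustrative sketch (not part of this file): a sock_ops program opting in
 * to RTO and retransmission callbacks via bpf_sock_ops_cb_flags_set().  Since
 * the helper returns the flag bits it did not recognise, a return value of 0
 * means every requested callback is supported.
 *
 *	SEC("sockops")
 *	int enable_cbs(struct bpf_sock_ops *skops)
 *	{
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
 *			bpf_sock_ops_cb_flags_set(skops,
 *						  BPF_SOCK_OPS_RTO_CB_FLAG |
 *						  BPF_SOCK_OPS_RETRANS_CB_FLAG);
 *		return 1;
 *	}
 */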
4695
4696const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
4697EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
4698
4699BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
4700           int, addr_len)
4701{
4702#ifdef CONFIG_INET
4703        struct sock *sk = ctx->sk;
4704        u32 flags = BIND_FROM_BPF;
4705        int err;
4706
4707        err = -EINVAL;
4708        if (addr_len < offsetofend(struct sockaddr, sa_family))
4709                return err;
4710        if (addr->sa_family == AF_INET) {
4711                if (addr_len < sizeof(struct sockaddr_in))
4712                        return err;
4713                if (((struct sockaddr_in *)addr)->sin_port == htons(0))
4714                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
4715                return __inet_bind(sk, addr, addr_len, flags);
4716#if IS_ENABLED(CONFIG_IPV6)
4717        } else if (addr->sa_family == AF_INET6) {
4718                if (addr_len < SIN6_LEN_RFC2133)
4719                        return err;
4720                if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
4721                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
4722                /* ipv6_bpf_stub cannot be NULL, since it's called from
4723                 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
4724                 */
4725                return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
4726#endif /* CONFIG_IPV6 */
4727        }
4728#endif /* CONFIG_INET */
4729
4730        return -EAFNOSUPPORT;
4731}
4732
4733static const struct bpf_func_proto bpf_bind_proto = {
4734        .func           = bpf_bind,
4735        .gpl_only       = false,
4736        .ret_type       = RET_INTEGER,
4737        .arg1_type      = ARG_PTR_TO_CTX,
4738        .arg2_type      = ARG_PTR_TO_MEM,
4739        .arg3_type      = ARG_CONST_SIZE,
4740};
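
/*
 * Illustrative sketch (not part of this file): a cgroup/connect4 program
 * pinning the source address of outgoing TCP connections with bpf_bind().
 * The address is hypothetical; leaving sin_port at 0 selects
 * BIND_FORCE_ADDRESS_NO_PORT above, so the local port is only chosen at
 * connect() time.
 *
 *	SEC("cgroup/connect4")
 *	int bind_src(struct bpf_sock_addr *ctx)
 *	{
 *		struct sockaddr_in sa = {
 *			.sin_family = AF_INET,
 *			.sin_addr.s_addr = bpf_htonl(0x0a000001),	// 10.0.0.1
 *		};
 *
 *		if (ctx->type == SOCK_STREAM)
 *			bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
 *		return 1;
 *	}
 */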
4741
4742#ifdef CONFIG_XFRM
4743BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
4744           struct bpf_xfrm_state *, to, u32, size, u64, flags)
4745{
4746        const struct sec_path *sp = skb_sec_path(skb);
4747        const struct xfrm_state *x;
4748
4749        if (!sp || unlikely(index >= sp->len || flags))
4750                goto err_clear;
4751
4752        x = sp->xvec[index];
4753
4754        if (unlikely(size != sizeof(struct bpf_xfrm_state)))
4755                goto err_clear;
4756
4757        to->reqid = x->props.reqid;
4758        to->spi = x->id.spi;
4759        to->family = x->props.family;
4760        to->ext = 0;
4761
4762        if (to->family == AF_INET6) {
4763                memcpy(to->remote_ipv6, x->props.saddr.a6,
4764                       sizeof(to->remote_ipv6));
4765        } else {
4766                to->remote_ipv4 = x->props.saddr.a4;
4767                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
4768        }
4769
4770        return 0;
4771err_clear:
4772        memset(to, 0, size);
4773        return -EINVAL;
4774}
4775
4776static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
4777        .func           = bpf_skb_get_xfrm_state,
4778        .gpl_only       = false,
4779        .ret_type       = RET_INTEGER,
4780        .arg1_type      = ARG_PTR_TO_CTX,
4781        .arg2_type      = ARG_ANYTHING,
4782        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
4783        .arg4_type      = ARG_CONST_SIZE,
4784        .arg5_type      = ARG_ANYTHING,
4785};
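
/*
 * Illustrative sketch (not part of this file): a tc classifier inspecting the
 * IPsec state attached to a received skb.  Program name is hypothetical;
 * bpf_printk() output goes to the trace pipe.
 *
 *	SEC("classifier")
 *	int dump_xfrm(struct __sk_buff *skb)
 *	{
 *		struct bpf_xfrm_state x;
 *
 *		if (!bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0))
 *			bpf_printk("reqid %u spi 0x%x family %u",
 *				   x.reqid, bpf_ntohl(x.spi), x.family);
 *		return TC_ACT_OK;
 *	}
 */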
4786#endif
4787
4788#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
4789static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
4790                                  const struct neighbour *neigh,
4791                                  const struct net_device *dev)
4792{
4793        memcpy(params->dmac, neigh->ha, ETH_ALEN);
4794        memcpy(params->smac, dev->dev_addr, ETH_ALEN);
4795        params->h_vlan_TCI = 0;
4796        params->h_vlan_proto = 0;
4797        params->ifindex = dev->ifindex;
4798
4799        return 0;
4800}
4801#endif
4802
4803#if IS_ENABLED(CONFIG_INET)
4804static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4805                               u32 flags, bool check_mtu)
4806{
4807        struct fib_nh_common *nhc;
4808        struct in_device *in_dev;
4809        struct neighbour *neigh;
4810        struct net_device *dev;
4811        struct fib_result res;
4812        struct flowi4 fl4;
4813        int err;
4814        u32 mtu;
4815
4816        dev = dev_get_by_index_rcu(net, params->ifindex);
4817        if (unlikely(!dev))
4818                return -ENODEV;
4819
4820        /* verify forwarding is enabled on this interface */
4821        in_dev = __in_dev_get_rcu(dev);
4822        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
4823                return BPF_FIB_LKUP_RET_FWD_DISABLED;
4824
4825        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4826                fl4.flowi4_iif = 1;
4827                fl4.flowi4_oif = params->ifindex;
4828        } else {
4829                fl4.flowi4_iif = params->ifindex;
4830                fl4.flowi4_oif = 0;
4831        }
4832        fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
4833        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
4834        fl4.flowi4_flags = 0;
4835
4836        fl4.flowi4_proto = params->l4_protocol;
4837        fl4.daddr = params->ipv4_dst;
4838        fl4.saddr = params->ipv4_src;
4839        fl4.fl4_sport = params->sport;
4840        fl4.fl4_dport = params->dport;
4841        fl4.flowi4_multipath_hash = 0;
4842
4843        if (flags & BPF_FIB_LOOKUP_DIRECT) {
4844                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4845                struct fib_table *tb;
4846
4847                tb = fib_get_table(net, tbid);
4848                if (unlikely(!tb))
4849                        return BPF_FIB_LKUP_RET_NOT_FWDED;
4850
4851                err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
4852        } else {
4853                fl4.flowi4_mark = 0;
4854                fl4.flowi4_secid = 0;
4855                fl4.flowi4_tun_key.tun_id = 0;
4856                fl4.flowi4_uid = sock_net_uid(net, NULL);
4857
4858                err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
4859        }
4860
4861        if (err) {
4862                /* map fib lookup errors to the matching BPF_FIB_LKUP_RET_* code */
4863                if (err == -EINVAL)
4864                        return BPF_FIB_LKUP_RET_BLACKHOLE;
4865                if (err == -EHOSTUNREACH)
4866                        return BPF_FIB_LKUP_RET_UNREACHABLE;
4867                if (err == -EACCES)
4868                        return BPF_FIB_LKUP_RET_PROHIBIT;
4869
4870                return BPF_FIB_LKUP_RET_NOT_FWDED;
4871        }
4872
4873        if (res.type != RTN_UNICAST)
4874                return BPF_FIB_LKUP_RET_NOT_FWDED;
4875
4876        if (fib_info_num_path(res.fi) > 1)
4877                fib_select_path(net, &res, &fl4, NULL);
4878
4879        if (check_mtu) {
4880                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
4881                if (params->tot_len > mtu)
4882                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
4883        }
4884
4885        nhc = res.nhc;
4886
4887        /* do not handle lwt encaps right now */
4888        if (nhc->nhc_lwtstate)
4889                return BPF_FIB_LKUP_RET_UNSUPP_LWT;
4890
4891        dev = nhc->nhc_dev;
4892
4893        params->rt_metric = res.fi->fib_priority;
4894
4895        /* xdp and cls_bpf programs are run in RCU-bh so
4896         * rcu_read_lock_bh is not needed here
4897         */
4898        if (likely(nhc->nhc_gw_family != AF_INET6)) {
4899                if (nhc->nhc_gw_family)
4900                        params->ipv4_dst = nhc->nhc_gw.ipv4;
4901
4902                neigh = __ipv4_neigh_lookup_noref(dev,
4903                                                 (__force u32)params->ipv4_dst);
4904        } else {
4905                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
4906
4907                params->family = AF_INET6;
4908                *dst = nhc->nhc_gw.ipv6;
4909                neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
4910        }
4911
4912        if (!neigh)
4913                return BPF_FIB_LKUP_RET_NO_NEIGH;
4914
4915        return bpf_fib_set_fwd_params(params, neigh, dev);
4916}
4917#endif
4918
4919#if IS_ENABLED(CONFIG_IPV6)
4920static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
4921                               u32 flags, bool check_mtu)
4922{
4923        struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
4924        struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
4925        struct fib6_result res = {};
4926        struct neighbour *neigh;
4927        struct net_device *dev;
4928        struct inet6_dev *idev;
4929        struct flowi6 fl6;
4930        int strict = 0;
4931        int oif, err;
4932        u32 mtu;
4933
4934        /* link local addresses are never forwarded */
4935        if (rt6_need_strict(dst) || rt6_need_strict(src))
4936                return BPF_FIB_LKUP_RET_NOT_FWDED;
4937
4938        dev = dev_get_by_index_rcu(net, params->ifindex);
4939        if (unlikely(!dev))
4940                return -ENODEV;
4941
4942        idev = __in6_dev_get_safely(dev);
4943        if (unlikely(!idev || !idev->cnf.forwarding))
4944                return BPF_FIB_LKUP_RET_FWD_DISABLED;
4945
4946        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
4947                fl6.flowi6_iif = 1;
4948                oif = fl6.flowi6_oif = params->ifindex;
4949        } else {
4950                oif = fl6.flowi6_iif = params->ifindex;
4951                fl6.flowi6_oif = 0;
4952                strict = RT6_LOOKUP_F_HAS_SADDR;
4953        }
4954        fl6.flowlabel = params->flowinfo;
4955        fl6.flowi6_scope = 0;
4956        fl6.flowi6_flags = 0;
4957        fl6.mp_hash = 0;
4958
4959        fl6.flowi6_proto = params->l4_protocol;
4960        fl6.daddr = *dst;
4961        fl6.saddr = *src;
4962        fl6.fl6_sport = params->sport;
4963        fl6.fl6_dport = params->dport;
4964
4965        if (flags & BPF_FIB_LOOKUP_DIRECT) {
4966                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
4967                struct fib6_table *tb;
4968
4969                tb = ipv6_stub->fib6_get_table(net, tbid);
4970                if (unlikely(!tb))
4971                        return BPF_FIB_LKUP_RET_NOT_FWDED;
4972
4973                err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
4974                                                   strict);
4975        } else {
4976                fl6.flowi6_mark = 0;
4977                fl6.flowi6_secid = 0;
4978                fl6.flowi6_tun_key.tun_id = 0;
4979                fl6.flowi6_uid = sock_net_uid(net, NULL);
4980
4981                err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
4982        }
4983
4984        if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
4985                     res.f6i == net->ipv6.fib6_null_entry))
4986                return BPF_FIB_LKUP_RET_NOT_FWDED;
4987
4988        switch (res.fib6_type) {
4989        /* only unicast is forwarded */
4990        case RTN_UNICAST:
4991                break;
4992        case RTN_BLACKHOLE:
4993                return BPF_FIB_LKUP_RET_BLACKHOLE;
4994        case RTN_UNREACHABLE:
4995                return BPF_FIB_LKUP_RET_UNREACHABLE;
4996        case RTN_PROHIBIT:
4997                return BPF_FIB_LKUP_RET_PROHIBIT;
4998        default:
4999                return BPF_FIB_LKUP_RET_NOT_FWDED;
5000        }
5001
5002        ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
5003                                    fl6.flowi6_oif != 0, NULL, strict);
5004
5005        if (check_mtu) {
5006                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
5007                if (params->tot_len > mtu)
5008                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
5009        }
5010
5011        if (res.nh->fib_nh_lws)
5012                return BPF_FIB_LKUP_RET_UNSUPP_LWT;
5013
5014        if (res.nh->fib_nh_gw_family)
5015                *dst = res.nh->fib_nh_gw6;
5016
5017        dev = res.nh->fib_nh_dev;
5018        params->rt_metric = res.f6i->fib6_metric;
5019
5020        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
5021         * not needed here.
5022         */
5023        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
5024        if (!neigh)
5025                return BPF_FIB_LKUP_RET_NO_NEIGH;
5026
5027        return bpf_fib_set_fwd_params(params, neigh, dev);
5028}
5029#endif
5030
5031BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
5032           struct bpf_fib_lookup *, params, int, plen, u32, flags)
5033{
5034        if (plen < sizeof(*params))
5035                return -EINVAL;
5036
5037        if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
5038                return -EINVAL;
5039
5040        switch (params->family) {
5041#if IS_ENABLED(CONFIG_INET)
5042        case AF_INET:
5043                return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
5044                                           flags, true);
5045#endif
5046#if IS_ENABLED(CONFIG_IPV6)
5047        case AF_INET6:
5048                return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
5049                                           flags, true);
5050#endif
5051        }
5052        return -EAFNOSUPPORT;
5053}
5054
5055static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
5056        .func           = bpf_xdp_fib_lookup,
5057        .gpl_only       = true,
5058        .ret_type       = RET_INTEGER,
5059        .arg1_type      = ARG_PTR_TO_CTX,
5060        .arg2_type      = ARG_PTR_TO_MEM,
5061        .arg3_type      = ARG_CONST_SIZE,
5062        .arg4_type      = ARG_ANYTHING,
5063};
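
/*
 * Illustrative sketch (not part of this file): a minimal IPv4 XDP forwarder
 * built on bpf_fib_lookup(), in the spirit of samples/bpf/xdp_fwd_kern.c.
 * It assumes the usual libbpf headers (linux/bpf.h, linux/if_ether.h,
 * linux/ip.h, bpf/bpf_helpers.h, bpf/bpf_endian.h); names are hypothetical.
 *
 *	SEC("xdp")
 *	int xdp_fwd(struct xdp_md *ctx)
 *	{
 *		void *data_end = (void *)(long)ctx->data_end;
 *		void *data = (void *)(long)ctx->data;
 *		struct ethhdr *eth = data;
 *		struct iphdr *iph = data + sizeof(*eth);
 *		struct bpf_fib_lookup fib = {};
 *
 *		if ((void *)(iph + 1) > data_end ||
 *		    eth->h_proto != bpf_htons(ETH_P_IP))
 *			return XDP_PASS;
 *
 *		fib.family	= AF_INET;
 *		fib.tos		= iph->tos;
 *		fib.l4_protocol	= iph->protocol;
 *		fib.tot_len	= bpf_ntohs(iph->tot_len);
 *		fib.ipv4_src	= iph->saddr;
 *		fib.ipv4_dst	= iph->daddr;
 *		fib.ifindex	= ctx->ingress_ifindex;
 *
 *		if (bpf_fib_lookup(ctx, &fib, sizeof(fib), 0) !=
 *		    BPF_FIB_LKUP_RET_SUCCESS)
 *			return XDP_PASS;
 *
 *		__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
 *		__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
 *		return bpf_redirect(fib.ifindex, 0);
 *	}
 */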
5064
5065BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
5066           struct bpf_fib_lookup *, params, int, plen, u32, flags)
5067{
5068        struct net *net = dev_net(skb->dev);
5069        int rc = -EAFNOSUPPORT;
5070
5071        if (plen < sizeof(*params))
5072                return -EINVAL;
5073
5074        if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
5075                return -EINVAL;
5076
5077        switch (params->family) {
5078#if IS_ENABLED(CONFIG_INET)
5079        case AF_INET:
5080                rc = bpf_ipv4_fib_lookup(net, params, flags, false);
5081                break;
5082#endif
5083#if IS_ENABLED(CONFIG_IPV6)
5084        case AF_INET6:
5085                rc = bpf_ipv6_fib_lookup(net, params, flags, false);
5086                break;
5087#endif
5088        }
5089
5090        if (!rc) {
5091                struct net_device *dev;
5092
5093                dev = dev_get_by_index_rcu(net, params->ifindex);
5094                if (!is_skb_forwardable(dev, skb))
5095                        rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
5096        }
5097
5098        return rc;
5099}
5100
5101static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
5102        .func           = bpf_skb_fib_lookup,
5103        .gpl_only       = true,
5104        .ret_type       = RET_INTEGER,
5105        .arg1_type      = ARG_PTR_TO_CTX,
5106        .arg2_type      = ARG_PTR_TO_MEM,
5107        .arg3_type      = ARG_CONST_SIZE,
5108        .arg4_type      = ARG_ANYTHING,
5109};
5110
5111#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
5112static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
5113{
5114        int err;
5115        struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
5116
5117        if (!seg6_validate_srh(srh, len, false))
5118                return -EINVAL;
5119
5120        switch (type) {
5121        case BPF_LWT_ENCAP_SEG6_INLINE:
5122                if (skb->protocol != htons(ETH_P_IPV6))
5123                        return -EBADMSG;
5124
5125                err = seg6_do_srh_inline(skb, srh);
5126                break;
5127        case BPF_LWT_ENCAP_SEG6:
5128                skb_reset_inner_headers(skb);
5129                skb->encapsulation = 1;
5130                err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
5131                break;
5132        default:
5133                return -EINVAL;
5134        }
5135
5136        bpf_compute_data_pointers(skb);
5137        if (err)
5138                return err;
5139
5140        ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
5141        skb_set_transport_header(skb, sizeof(struct ipv6hdr));
5142
5143        return seg6_lookup_nexthop(skb, NULL, 0);
5144}
5145#endif /* CONFIG_IPV6_SEG6_BPF */
5146
5147#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
5148static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
5149                             bool ingress)
5150{
5151        return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
5152}
5153#endif
5154
5155BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
5156           u32, len)
5157{
5158        switch (type) {
5159#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
5160        case BPF_LWT_ENCAP_SEG6:
5161        case BPF_LWT_ENCAP_SEG6_INLINE:
5162                return bpf_push_seg6_encap(skb, type, hdr, len);
5163#endif
5164#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
5165        case BPF_LWT_ENCAP_IP:
5166                return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
5167#endif
5168        default:
5169                return -EINVAL;
5170        }
5171}
5172
5173BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
5174           void *, hdr, u32, len)
5175{
5176        switch (type) {
5177#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
5178        case BPF_LWT_ENCAP_IP:
5179                return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
5180#endif
5181        default:
5182                return -EINVAL;
5183        }
5184}
5185
5186static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
5187        .func           = bpf_lwt_in_push_encap,
5188        .gpl_only       = false,
5189        .ret_type       = RET_INTEGER,
5190        .arg1_type      = ARG_PTR_TO_CTX,
5191        .arg2_type      = ARG_ANYTHING,
5192        .arg3_type      = ARG_PTR_TO_MEM,
5193        .arg4_type      = ARG_CONST_SIZE
5194};
5195
5196static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
5197        .func           = bpf_lwt_xmit_push_encap,
5198        .gpl_only       = false,
5199        .ret_type       = RET_INTEGER,
5200        .arg1_type      = ARG_PTR_TO_CTX,
5201        .arg2_type      = ARG_ANYTHING,
5202        .arg3_type      = ARG_PTR_TO_MEM,
5203        .arg4_type      = ARG_CONST_SIZE
5204};
5205
5206#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
5207BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
5208           const void *, from, u32, len)
5209{
5210        struct seg6_bpf_srh_state *srh_state =
5211                this_cpu_ptr(&seg6_bpf_srh_states);
5212        struct ipv6_sr_hdr *srh = srh_state->srh;
5213        void *srh_tlvs, *srh_end, *ptr;
5214        int srhoff = 0;
5215
5216        if (srh == NULL)
5217                return -EINVAL;
5218
5219        srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
5220        srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
5221
5222        ptr = skb->data + offset;
5223        if (ptr >= srh_tlvs && ptr + len <= srh_end)
5224                srh_state->valid = false;
5225        else if (ptr < (void *)&srh->flags ||
5226                 ptr + len > (void *)&srh->segments)
5227                return -EFAULT;
5228
5229        if (unlikely(bpf_try_make_writable(skb, offset + len)))
5230                return -EFAULT;
5231        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
5232                return -EINVAL;
5233        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
5234
5235        memcpy(skb->data + offset, from, len);
5236        return 0;
5237}
5238
5239static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
5240        .func           = bpf_lwt_seg6_store_bytes,
5241        .gpl_only       = false,
5242        .ret_type       = RET_INTEGER,
5243        .arg1_type      = ARG_PTR_TO_CTX,
5244        .arg2_type      = ARG_ANYTHING,
5245        .arg3_type      = ARG_PTR_TO_MEM,
5246        .arg4_type      = ARG_CONST_SIZE
5247};
5248
5249static void bpf_update_srh_state(struct sk_buff *skb)
5250{
5251        struct seg6_bpf_srh_state *srh_state =
5252                this_cpu_ptr(&seg6_bpf_srh_states);
5253        int srhoff = 0;
5254
5255        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
5256                srh_state->srh = NULL;
5257        } else {
5258                srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
5259                srh_state->hdrlen = srh_state->srh->hdrlen << 3;
5260                srh_state->valid = true;
5261        }
5262}
5263
5264BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
5265           u32, action, void *, param, u32, param_len)
5266{
5267        struct seg6_bpf_srh_state *srh_state =
5268                this_cpu_ptr(&seg6_bpf_srh_states);
5269        int hdroff = 0;
5270        int err;
5271
5272        switch (action) {
5273        case SEG6_LOCAL_ACTION_END_X:
5274                if (!seg6_bpf_has_valid_srh(skb))
5275                        return -EBADMSG;
5276                if (param_len != sizeof(struct in6_addr))
5277                        return -EINVAL;
5278                return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
5279        case SEG6_LOCAL_ACTION_END_T:
5280                if (!seg6_bpf_has_valid_srh(skb))
5281                        return -EBADMSG;
5282                if (param_len != sizeof(int))
5283                        return -EINVAL;
5284                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
5285        case SEG6_LOCAL_ACTION_END_DT6:
5286                if (!seg6_bpf_has_valid_srh(skb))
5287                        return -EBADMSG;
5288                if (param_len != sizeof(int))
5289                        return -EINVAL;
5290
5291                if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
5292                        return -EBADMSG;
5293                if (!pskb_pull(skb, hdroff))
5294                        return -EBADMSG;
5295
5296                skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
5297                skb_reset_network_header(skb);
5298                skb_reset_transport_header(skb);
5299                skb->encapsulation = 0;
5300
5301                bpf_compute_data_pointers(skb);
5302                bpf_update_srh_state(skb);
5303                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
5304        case SEG6_LOCAL_ACTION_END_B6:
5305                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
5306                        return -EBADMSG;
5307                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
5308                                          param, param_len);
5309                if (!err)
5310                        bpf_update_srh_state(skb);
5311
5312                return err;
5313        case SEG6_LOCAL_ACTION_END_B6_ENCAP:
5314                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
5315                        return -EBADMSG;
5316                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
5317                                          param, param_len);
5318                if (!err)
5319                        bpf_update_srh_state(skb);
5320
5321                return err;
5322        default:
5323                return -EINVAL;
5324        }
5325}
5326
5327static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
5328        .func           = bpf_lwt_seg6_action,
5329        .gpl_only       = false,
5330        .ret_type       = RET_INTEGER,
5331        .arg1_type      = ARG_PTR_TO_CTX,
5332        .arg2_type      = ARG_ANYTHING,
5333        .arg3_type      = ARG_PTR_TO_MEM,
5334        .arg4_type      = ARG_CONST_SIZE
5335};
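
/*
 * Illustrative sketch (not part of this file): a seg6local End.X behaviour in
 * BPF, forwarding to a next hop chosen by the program.  The next-hop address
 * is hypothetical; on success the packet is handed back to the stack via
 * BPF_REDIRECT.
 *
 *	SEC("lwt_seg6local")
 *	int end_x(struct __sk_buff *skb)
 *	{
 *		struct in6_addr nh = {};
 *
 *		nh.s6_addr[0] = 0xfc;	// fc00::1
 *		nh.s6_addr[15] = 0x01;
 *
 *		if (bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
 *					&nh, sizeof(nh)))
 *			return BPF_DROP;
 *		return BPF_REDIRECT;
 *	}
 */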
5336
5337BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
5338           s32, len)
5339{
5340        struct seg6_bpf_srh_state *srh_state =
5341                this_cpu_ptr(&seg6_bpf_srh_states);
5342        struct ipv6_sr_hdr *srh = srh_state->srh;
5343        void *srh_end, *srh_tlvs, *ptr;
5344        struct ipv6hdr *hdr;
5345        int srhoff = 0;
5346        int ret;
5347
5348        if (unlikely(srh == NULL))
5349                return -EINVAL;
5350
5351        srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
5352                        ((srh->first_segment + 1) << 4));
5353        srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
5354                        srh_state->hdrlen);
5355        ptr = skb->data + offset;
5356
5357        if (unlikely(ptr < srh_tlvs || ptr > srh_end))
5358                return -EFAULT;
5359        if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
5360                return -EFAULT;
5361
5362        if (len > 0) {
5363                ret = skb_cow_head(skb, len);
5364                if (unlikely(ret < 0))
5365                        return ret;
5366
5367                ret = bpf_skb_net_hdr_push(skb, offset, len);
5368        } else {
5369                ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
5370        }
5371
5372        bpf_compute_data_pointers(skb);
5373        if (unlikely(ret < 0))
5374                return ret;
5375
5376        hdr = (struct ipv6hdr *)skb->data;
5377        hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
5378
5379        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
5380                return -EINVAL;
5381        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
5382        srh_state->hdrlen += len;
5383        srh_state->valid = false;
5384        return 0;
5385}
5386
5387static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
5388        .func           = bpf_lwt_seg6_adjust_srh,
5389        .gpl_only       = false,
5390        .ret_type       = RET_INTEGER,
5391        .arg1_type      = ARG_PTR_TO_CTX,
5392        .arg2_type      = ARG_ANYTHING,
5393        .arg3_type      = ARG_ANYTHING,
5394};
5395#endif /* CONFIG_IPV6_SEG6_BPF */
5396
5397#ifdef CONFIG_INET
5398static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
5399                              int dif, int sdif, u8 family, u8 proto)
5400{
5401        bool refcounted = false;
5402        struct sock *sk = NULL;
5403
5404        if (family == AF_INET) {
5405                __be32 src4 = tuple->ipv4.saddr;
5406                __be32 dst4 = tuple->ipv4.daddr;
5407
5408                if (proto == IPPROTO_TCP)
5409                        sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0,
5410                                           src4, tuple->ipv4.sport,
5411                                           dst4, tuple->ipv4.dport,
5412                                           dif, sdif, &refcounted);
5413                else
5414                        sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
5415                                               dst4, tuple->ipv4.dport,
5416                                               dif, sdif, &udp_table, NULL);
5417#if IS_ENABLED(CONFIG_IPV6)
5418        } else {
5419                struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
5420                struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
5421
5422                if (proto == IPPROTO_TCP)
5423                        sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0,
5424                                            src6, tuple->ipv6.sport,
5425                                            dst6, ntohs(tuple->ipv6.dport),
5426                                            dif, sdif, &refcounted);
5427                else if (likely(ipv6_bpf_stub))
5428                        sk = ipv6_bpf_stub->udp6_lib_lookup(net,
5429                                                            src6, tuple->ipv6.sport,
5430                                                            dst6, tuple->ipv6.dport,
5431                                                            dif, sdif,
5432                                                            &udp_table, NULL);
5433#endif
5434        }
5435
5436        if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
5437                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
5438                sk = NULL;
5439        }
5440        return sk;
5441}
5442
5443/* bpf_skc_lookup performs the core lookup for different types of sockets,
5444 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
5445 * The BPF_CALL_*() wrappers below cast the returned socket pointer to an
5446 * 'unsigned long' to satisfy their BPF_CALL declarations.
5447 */
5448static struct sock *
5449__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
5450                 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
5451                 u64 flags)
5452{
5453        struct sock *sk = NULL;
5454        u8 family = AF_UNSPEC;
5455        struct net *net;
5456        int sdif;
5457
5458        if (len == sizeof(tuple->ipv4))
5459                family = AF_INET;
5460        else if (len == sizeof(tuple->ipv6))
5461                family = AF_INET6;
5462        else
5463                return NULL;
5464
5465        if (unlikely(family == AF_UNSPEC || flags ||
5466                     !((s32)netns_id < 0 || netns_id <= S32_MAX)))
5467                goto out;
5468
5469        if (family == AF_INET)
5470                sdif = inet_sdif(skb);
5471        else
5472                sdif = inet6_sdif(skb);
5473
5474        if ((s32)netns_id < 0) {
5475                net = caller_net;
5476                sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
5477        } else {
5478                net = get_net_ns_by_id(caller_net, netns_id);
5479                if (unlikely(!net))
5480                        goto out;
5481                sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
5482                put_net(net);
5483        }
5484
5485out:
5486        return sk;
5487}
5488
5489static struct sock *
5490__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
5491                struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
5492                u64 flags)
5493{
5494        struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
5495                                           ifindex, proto, netns_id, flags);
5496
5497        if (sk) {
5498                sk = sk_to_full_sk(sk);
5499                if (!sk_fullsock(sk)) {
5500                        sock_gen_put(sk);
5501                        return NULL;
5502                }
5503        }
5504
5505        return sk;
5506}
5507
5508static struct sock *
5509bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
5510               u8 proto, u64 netns_id, u64 flags)
5511{
5512        struct net *caller_net;
5513        int ifindex;
5514
5515        if (skb->dev) {
5516                caller_net = dev_net(skb->dev);
5517                ifindex = skb->dev->ifindex;
5518        } else {
5519                caller_net = sock_net(skb->sk);
5520                ifindex = 0;
5521        }
5522
5523        return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
5524                                netns_id, flags);
5525}
5526
5527static struct sock *
5528bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
5529              u8 proto, u64 netns_id, u64 flags)
5530{
5531        struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
5532                                         flags);
5533
5534        if (sk) {
5535                sk = sk_to_full_sk(sk);
5536                if (!sk_fullsock(sk)) {
5537                        sock_gen_put(sk);
5538                        return NULL;
5539                }
5540        }
5541
5542        return sk;
5543}
5544
5545BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
5546           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5547{
5548        return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
5549                                             netns_id, flags);
5550}
5551
5552static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
5553        .func           = bpf_skc_lookup_tcp,
5554        .gpl_only       = false,
5555        .pkt_access     = true,
5556        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
5557        .arg1_type      = ARG_PTR_TO_CTX,
5558        .arg2_type      = ARG_PTR_TO_MEM,
5559        .arg3_type      = ARG_CONST_SIZE,
5560        .arg4_type      = ARG_ANYTHING,
5561        .arg5_type      = ARG_ANYTHING,
5562};
5563
5564BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
5565           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5566{
5567        return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
5568                                            netns_id, flags);
5569}
5570
5571static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
5572        .func           = bpf_sk_lookup_tcp,
5573        .gpl_only       = false,
5574        .pkt_access     = true,
5575        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5576        .arg1_type      = ARG_PTR_TO_CTX,
5577        .arg2_type      = ARG_PTR_TO_MEM,
5578        .arg3_type      = ARG_CONST_SIZE,
5579        .arg4_type      = ARG_ANYTHING,
5580        .arg5_type      = ARG_ANYTHING,
5581};
5582
5583BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
5584           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5585{
5586        return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
5587                                            netns_id, flags);
5588}
5589
5590static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
5591        .func           = bpf_sk_lookup_udp,
5592        .gpl_only       = false,
5593        .pkt_access     = true,
5594        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5595        .arg1_type      = ARG_PTR_TO_CTX,
5596        .arg2_type      = ARG_PTR_TO_MEM,
5597        .arg3_type      = ARG_CONST_SIZE,
5598        .arg4_type      = ARG_ANYTHING,
5599        .arg5_type      = ARG_ANYTHING,
5600};
5601
5602BPF_CALL_1(bpf_sk_release, struct sock *, sk)
5603{
5604        if (sk_is_refcounted(sk))
5605                sock_gen_put(sk);
5606        return 0;
5607}
5608
5609static const struct bpf_func_proto bpf_sk_release_proto = {
5610        .func           = bpf_sk_release,
5611        .gpl_only       = false,
5612        .ret_type       = RET_INTEGER,
5613        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
5614};
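
/*
 * Illustrative sketch (not part of this file): pairing bpf_sk_lookup_tcp()
 * with bpf_sk_release() in a tc classifier.  The tuple values are
 * hypothetical; a real program would parse them out of the packet headers.
 *
 *	SEC("classifier")
 *	int find_sk(struct __sk_buff *skb)
 *	{
 *		struct bpf_sock_tuple tuple = {
 *			.ipv4.daddr = bpf_htonl(0x7f000001),	// 127.0.0.1
 *			.ipv4.dport = bpf_htons(80),
 *		};
 *		struct bpf_sock *sk;
 *
 *		sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *				       BPF_F_CURRENT_NETNS, 0);
 *		if (sk) {
 *			bpf_printk("state %u", sk->state);
 *			bpf_sk_release(sk);
 *		}
 *		return TC_ACT_OK;
 *	}
 */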
5615
5616BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
5617           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
5618{
5619        struct net *caller_net = dev_net(ctx->rxq->dev);
5620        int ifindex = ctx->rxq->dev->ifindex;
5621
5622        return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
5623                                              ifindex, IPPROTO_UDP, netns_id,
5624                                              flags);
5625}
5626
5627static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
5628        .func           = bpf_xdp_sk_lookup_udp,
5629        .gpl_only       = false,
5630        .pkt_access     = true,
5631        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5632        .arg1_type      = ARG_PTR_TO_CTX,
5633        .arg2_type      = ARG_PTR_TO_MEM,
5634        .arg3_type      = ARG_CONST_SIZE,
5635        .arg4_type      = ARG_ANYTHING,
5636        .arg5_type      = ARG_ANYTHING,
5637};
5638
5639BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
5640           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
5641{
5642        struct net *caller_net = dev_net(ctx->rxq->dev);
5643        int ifindex = ctx->rxq->dev->ifindex;
5644
5645        return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
5646                                               ifindex, IPPROTO_TCP, netns_id,
5647                                               flags);
5648}
5649
5650static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
5651        .func           = bpf_xdp_skc_lookup_tcp,
5652        .gpl_only       = false,
5653        .pkt_access     = true,
5654        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
5655        .arg1_type      = ARG_PTR_TO_CTX,
5656        .arg2_type      = ARG_PTR_TO_MEM,
5657        .arg3_type      = ARG_CONST_SIZE,
5658        .arg4_type      = ARG_ANYTHING,
5659        .arg5_type      = ARG_ANYTHING,
5660};
5661
5662BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
5663           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
5664{
5665        struct net *caller_net = dev_net(ctx->rxq->dev);
5666        int ifindex = ctx->rxq->dev->ifindex;
5667
5668        return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
5669                                              ifindex, IPPROTO_TCP, netns_id,
5670                                              flags);
5671}
5672
5673static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
5674        .func           = bpf_xdp_sk_lookup_tcp,
5675        .gpl_only       = false,
5676        .pkt_access     = true,
5677        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5678        .arg1_type      = ARG_PTR_TO_CTX,
5679        .arg2_type      = ARG_PTR_TO_MEM,
5680        .arg3_type      = ARG_CONST_SIZE,
5681        .arg4_type      = ARG_ANYTHING,
5682        .arg5_type      = ARG_ANYTHING,
5683};
5684
5685BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
5686           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5687{
5688        return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
5689                                               sock_net(ctx->sk), 0,
5690                                               IPPROTO_TCP, netns_id, flags);
5691}
5692
5693static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
5694        .func           = bpf_sock_addr_skc_lookup_tcp,
5695        .gpl_only       = false,
5696        .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
5697        .arg1_type      = ARG_PTR_TO_CTX,
5698        .arg2_type      = ARG_PTR_TO_MEM,
5699        .arg3_type      = ARG_CONST_SIZE,
5700        .arg4_type      = ARG_ANYTHING,
5701        .arg5_type      = ARG_ANYTHING,
5702};
5703
5704BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
5705           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5706{
5707        return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
5708                                              sock_net(ctx->sk), 0, IPPROTO_TCP,
5709                                              netns_id, flags);
5710}
5711
5712static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
5713        .func           = bpf_sock_addr_sk_lookup_tcp,
5714        .gpl_only       = false,
5715        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5716        .arg1_type      = ARG_PTR_TO_CTX,
5717        .arg2_type      = ARG_PTR_TO_MEM,
5718        .arg3_type      = ARG_CONST_SIZE,
5719        .arg4_type      = ARG_ANYTHING,
5720        .arg5_type      = ARG_ANYTHING,
5721};
5722
5723BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
5724           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
5725{
5726        return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
5727                                              sock_net(ctx->sk), 0, IPPROTO_UDP,
5728                                              netns_id, flags);
5729}
5730
5731static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
5732        .func           = bpf_sock_addr_sk_lookup_udp,
5733        .gpl_only       = false,
5734        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5735        .arg1_type      = ARG_PTR_TO_CTX,
5736        .arg2_type      = ARG_PTR_TO_MEM,
5737        .arg3_type      = ARG_CONST_SIZE,
5738        .arg4_type      = ARG_ANYTHING,
5739        .arg5_type      = ARG_ANYTHING,
5740};
5741
5742bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
5743                                  struct bpf_insn_access_aux *info)
5744{
5745        if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
5746                                          icsk_retransmits))
5747                return false;
5748
5749        if (off % size != 0)
5750                return false;
5751
5752        switch (off) {
5753        case offsetof(struct bpf_tcp_sock, bytes_received):
5754        case offsetof(struct bpf_tcp_sock, bytes_acked):
5755                return size == sizeof(__u64);
5756        default:
5757                return size == sizeof(__u32);
5758        }
5759}
5760
5761u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
5762                                    const struct bpf_insn *si,
5763                                    struct bpf_insn *insn_buf,
5764                                    struct bpf_prog *prog, u32 *target_size)
5765{
5766        struct bpf_insn *insn = insn_buf;
5767
5768#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                  \
5769        do {                                                            \
5770                BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >     \
5771                             sizeof_field(struct bpf_tcp_sock, FIELD)); \
5772                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
5773                                      si->dst_reg, si->src_reg,         \
5774                                      offsetof(struct tcp_sock, FIELD)); \
5775        } while (0)
5776
5777#define BPF_INET_SOCK_GET_COMMON(FIELD)                                 \
5778        do {                                                            \
5779                BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,  \
5780                                          FIELD) >                      \
5781                             sizeof_field(struct bpf_tcp_sock, FIELD)); \
5782                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                 \
5783                                        struct inet_connection_sock,    \
5784                                        FIELD),                         \
5785                                      si->dst_reg, si->src_reg,         \
5786                                      offsetof(                         \
5787                                        struct inet_connection_sock,    \
5788                                        FIELD));                        \
5789        } while (0)
5790
5791        if (insn > insn_buf)
5792                return insn - insn_buf;
5793
5794        switch (si->off) {
5795        case offsetof(struct bpf_tcp_sock, rtt_min):
5796                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
5797                             sizeof(struct minmax));
5798                BUILD_BUG_ON(sizeof(struct minmax) <
5799                             sizeof(struct minmax_sample));
5800
5801                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
5802                                      offsetof(struct tcp_sock, rtt_min) +
5803                                      offsetof(struct minmax_sample, v));
5804                break;
5805        case offsetof(struct bpf_tcp_sock, snd_cwnd):
5806                BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
5807                break;
5808        case offsetof(struct bpf_tcp_sock, srtt_us):
5809                BPF_TCP_SOCK_GET_COMMON(srtt_us);
5810                break;
5811        case offsetof(struct bpf_tcp_sock, snd_ssthresh):
5812                BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
5813                break;
5814        case offsetof(struct bpf_tcp_sock, rcv_nxt):
5815                BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
5816                break;
5817        case offsetof(struct bpf_tcp_sock, snd_nxt):
5818                BPF_TCP_SOCK_GET_COMMON(snd_nxt);
5819                break;
5820        case offsetof(struct bpf_tcp_sock, snd_una):
5821                BPF_TCP_SOCK_GET_COMMON(snd_una);
5822                break;
5823        case offsetof(struct bpf_tcp_sock, mss_cache):
5824                BPF_TCP_SOCK_GET_COMMON(mss_cache);
5825                break;
5826        case offsetof(struct bpf_tcp_sock, ecn_flags):
5827                BPF_TCP_SOCK_GET_COMMON(ecn_flags);
5828                break;
5829        case offsetof(struct bpf_tcp_sock, rate_delivered):
5830                BPF_TCP_SOCK_GET_COMMON(rate_delivered);
5831                break;
5832        case offsetof(struct bpf_tcp_sock, rate_interval_us):
5833                BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
5834                break;
5835        case offsetof(struct bpf_tcp_sock, packets_out):
5836                BPF_TCP_SOCK_GET_COMMON(packets_out);
5837                break;
5838        case offsetof(struct bpf_tcp_sock, retrans_out):
5839                BPF_TCP_SOCK_GET_COMMON(retrans_out);
5840                break;
5841        case offsetof(struct bpf_tcp_sock, total_retrans):
5842                BPF_TCP_SOCK_GET_COMMON(total_retrans);
5843                break;
5844        case offsetof(struct bpf_tcp_sock, segs_in):
5845                BPF_TCP_SOCK_GET_COMMON(segs_in);
5846                break;
5847        case offsetof(struct bpf_tcp_sock, data_segs_in):
5848                BPF_TCP_SOCK_GET_COMMON(data_segs_in);
5849                break;
5850        case offsetof(struct bpf_tcp_sock, segs_out):
5851                BPF_TCP_SOCK_GET_COMMON(segs_out);
5852                break;
5853        case offsetof(struct bpf_tcp_sock, data_segs_out):
5854                BPF_TCP_SOCK_GET_COMMON(data_segs_out);
5855                break;
5856        case offsetof(struct bpf_tcp_sock, lost_out):
5857                BPF_TCP_SOCK_GET_COMMON(lost_out);
5858                break;
5859        case offsetof(struct bpf_tcp_sock, sacked_out):
5860                BPF_TCP_SOCK_GET_COMMON(sacked_out);
5861                break;
5862        case offsetof(struct bpf_tcp_sock, bytes_received):
5863                BPF_TCP_SOCK_GET_COMMON(bytes_received);
5864                break;
5865        case offsetof(struct bpf_tcp_sock, bytes_acked):
5866                BPF_TCP_SOCK_GET_COMMON(bytes_acked);
5867                break;
5868        case offsetof(struct bpf_tcp_sock, dsack_dups):
5869                BPF_TCP_SOCK_GET_COMMON(dsack_dups);
5870                break;
5871        case offsetof(struct bpf_tcp_sock, delivered):
5872                BPF_TCP_SOCK_GET_COMMON(delivered);
5873                break;
5874        case offsetof(struct bpf_tcp_sock, delivered_ce):
5875                BPF_TCP_SOCK_GET_COMMON(delivered_ce);
5876                break;
5877        case offsetof(struct bpf_tcp_sock, icsk_retransmits):
5878                BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
5879                break;
5880        }
5881
5882        return insn - insn_buf;
5883}
5884
5885BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
5886{
5887        if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
5888                return (unsigned long)sk;
5889
5890        return (unsigned long)NULL;
5891}
5892
5893const struct bpf_func_proto bpf_tcp_sock_proto = {
5894        .func           = bpf_tcp_sock,
5895        .gpl_only       = false,
5896        .ret_type       = RET_PTR_TO_TCP_SOCK_OR_NULL,
5897        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
5898};
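
/* Illustrative sketch, not part of this file's build: a cgroup_skb (or tc)
 * program would typically consume the helper above like this; the section
 * and program names are made up, and declarations come from the program's
 * own headers (<bpf/bpf_helpers.h> plus the uapi bpf.h):
 *
 *	SEC("cgroup_skb/egress")
 *	int tcp_state_probe(struct __sk_buff *skb)
 *	{
 *		struct bpf_sock *sk = skb->sk;
 *		struct bpf_tcp_sock *tp;
 *
 *		if (!sk)
 *			return 1;
 *		sk = bpf_sk_fullsock(sk);
 *		if (!sk || sk->protocol != IPPROTO_TCP)
 *			return 1;
 *		tp = bpf_tcp_sock(sk);
 *		if (!tp)
 *			return 1;
 *		read tp->snd_cwnd, tp->srtt_us, ... here (read-only)
 *		return 1;
 *	}
 */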
5899
5900BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
5901{
5902        sk = sk_to_full_sk(sk);
5903
5904        if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
5905                return (unsigned long)sk;
5906
5907        return (unsigned long)NULL;
5908}
5909
5910static const struct bpf_func_proto bpf_get_listener_sock_proto = {
5911        .func           = bpf_get_listener_sock,
5912        .gpl_only       = false,
5913        .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
5914        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
5915};
5916
5917BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
5918{
5919        unsigned int iphdr_len;
5920
5921        switch (skb_protocol(skb, true)) {
5922        case cpu_to_be16(ETH_P_IP):
5923                iphdr_len = sizeof(struct iphdr);
5924                break;
5925        case cpu_to_be16(ETH_P_IPV6):
5926                iphdr_len = sizeof(struct ipv6hdr);
5927                break;
5928        default:
5929                return 0;
5930        }
5931
5932        if (skb_headlen(skb) < iphdr_len)
5933                return 0;
5934
5935        if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
5936                return 0;
5937
5938        return INET_ECN_set_ce(skb);
5939}
5940
5941bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
5942                                  struct bpf_insn_access_aux *info)
5943{
5944        if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
5945                return false;
5946
5947        if (off % size != 0)
5948                return false;
5949
5950        switch (off) {
5951        default:
5952                return size == sizeof(__u32);
5953        }
5954}
5955
5956u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
5957                                    const struct bpf_insn *si,
5958                                    struct bpf_insn *insn_buf,
5959                                    struct bpf_prog *prog, u32 *target_size)
5960{
5961        struct bpf_insn *insn = insn_buf;
5962
5963#define BPF_XDP_SOCK_GET(FIELD)                                         \
5964        do {                                                            \
5965                BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) >     \
5966                             sizeof_field(struct bpf_xdp_sock, FIELD)); \
5967                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
5968                                      si->dst_reg, si->src_reg,         \
5969                                      offsetof(struct xdp_sock, FIELD)); \
5970        } while (0)
5971
5972        switch (si->off) {
5973        case offsetof(struct bpf_xdp_sock, queue_id):
5974                BPF_XDP_SOCK_GET(queue_id);
5975                break;
5976        }
5977
5978        return insn - insn_buf;
5979}
5980
5981static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
5982        .func           = bpf_skb_ecn_set_ce,
5983        .gpl_only       = false,
5984        .ret_type       = RET_INTEGER,
5985        .arg1_type      = ARG_PTR_TO_CTX,
5986};
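
/* Illustrative sketch, not part of this file's build: bpf_skb_ecn_set_ce() is
 * aimed at cgroup_skb egress programs implementing a simple AQM-style policy;
 * the congestion test below is a hypothetical placeholder:
 *
 *	SEC("cgroup_skb/egress")
 *	int maybe_mark_ce(struct __sk_buff *skb)
 *	{
 *		if (queue_looks_congested(skb))
 *			bpf_skb_ecn_set_ce(skb);
 *		return 1;
 *	}
 *
 * As the helper body above shows, non-IP packets, short headers and
 * unwritable clones are left untouched and the helper simply returns 0.
 */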
5987
5988BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
5989           struct tcphdr *, th, u32, th_len)
5990{
5991#ifdef CONFIG_SYN_COOKIES
5992        u32 cookie;
5993        int ret;
5994
5995        if (unlikely(th_len < sizeof(*th)))
5996                return -EINVAL;
5997
5998        /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
5999        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
6000                return -EINVAL;
6001
6002        if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
6003                return -EINVAL;
6004
6005        if (!th->ack || th->rst || th->syn)
6006                return -ENOENT;
6007
6008        if (tcp_synq_no_recent_overflow(sk))
6009                return -ENOENT;
6010
6011        cookie = ntohl(th->ack_seq) - 1;
6012
6013        switch (sk->sk_family) {
6014        case AF_INET:
6015                if (unlikely(iph_len < sizeof(struct iphdr)))
6016                        return -EINVAL;
6017
6018                ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
6019                break;
6020
6021#if IS_BUILTIN(CONFIG_IPV6)
6022        case AF_INET6:
6023                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
6024                        return -EINVAL;
6025
6026                ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
6027                break;
6028#endif /* CONFIG_IPV6 */
6029
6030        default:
6031                return -EPROTONOSUPPORT;
6032        }
6033
6034        if (ret > 0)
6035                return 0;
6036
6037        return -ENOENT;
6038#else
6039        return -ENOTSUPP;
6040#endif
6041}
6042
6043static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
6044        .func           = bpf_tcp_check_syncookie,
6045        .gpl_only       = true,
6046        .pkt_access     = true,
6047        .ret_type       = RET_INTEGER,
6048        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
6049        .arg2_type      = ARG_PTR_TO_MEM,
6050        .arg3_type      = ARG_CONST_SIZE,
6051        .arg4_type      = ARG_PTR_TO_MEM,
6052        .arg5_type      = ARG_CONST_SIZE,
6053};
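
/* Illustrative sketch, not part of this file's build: a tc or XDP program
 * validating the cookie echoed in an ACK would normally look up the listener
 * first and always release it; tuple setup and header parsing are elided:
 *
 *	sk = bpf_skc_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv4),
 *				BPF_F_CURRENT_NETNS, 0);
 *	if (sk) {
 *		if (sk->state == BPF_TCP_LISTEN &&
 *		    bpf_tcp_check_syncookie(sk, iph, sizeof(*iph),
 *					    th, sizeof(*th)) == 0)
 *			cookie is valid, e.g. accept the flow
 *		bpf_sk_release(sk);
 *	}
 */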
6054
6055BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
6056           struct tcphdr *, th, u32, th_len)
6057{
6058#ifdef CONFIG_SYN_COOKIES
6059        u32 cookie;
6060        u16 mss;
6061
6062        if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
6063                return -EINVAL;
6064
6065        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
6066                return -EINVAL;
6067
6068        if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies)
6069                return -ENOENT;
6070
6071        if (!th->syn || th->ack || th->fin || th->rst)
6072                return -EINVAL;
6073
6074        if (unlikely(iph_len < sizeof(struct iphdr)))
6075                return -EINVAL;
6076
6077        /* Both struct iphdr and struct ipv6hdr have the version field at the
6078         * same offset so we can cast to the shorter header (struct iphdr).
6079         */
6080        switch (((struct iphdr *)iph)->version) {
6081        case 4:
6082                if (sk->sk_family == AF_INET6 && sk->sk_ipv6only)
6083                        return -EINVAL;
6084
6085                mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
6086                break;
6087
6088#if IS_BUILTIN(CONFIG_IPV6)
6089        case 6:
6090                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
6091                        return -EINVAL;
6092
6093                if (sk->sk_family != AF_INET6)
6094                        return -EINVAL;
6095
6096                mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
6097                break;
6098#endif /* CONFIG_IPV6 */
6099
6100        default:
6101                return -EPROTONOSUPPORT;
6102        }
6103        if (mss == 0)
6104                return -ENOENT;
6105
6106        return cookie | ((u64)mss << 32);
6107#else
6108        return -EOPNOTSUPP;
6109#endif /* CONFIG_SYN_COOKIES */
6110}
6111
6112static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
6113        .func           = bpf_tcp_gen_syncookie,
6114        .gpl_only       = true, /* __cookie_v*_init_sequence() is GPL */
6115        .pkt_access     = true,
6116        .ret_type       = RET_INTEGER,
6117        .arg1_type      = ARG_PTR_TO_SOCK_COMMON,
6118        .arg2_type      = ARG_PTR_TO_MEM,
6119        .arg3_type      = ARG_CONST_SIZE,
6120        .arg4_type      = ARG_PTR_TO_MEM,
6121        .arg5_type      = ARG_CONST_SIZE,
6122};
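
/* Illustrative sketch, not part of this file's build: the value returned
 * above packs the MSS into the upper 32 bits and the cookie (the sequence
 * number to put in the SYNACK) into the lower 32 bits, so callers split it
 * like this:
 *
 *	__s64 val = bpf_tcp_gen_syncookie(sk, iph, iph_len, th, th_len);
 *
 *	if (val >= 0) {
 *		__u32 cookie = (__u32)val;
 *		__u16 mss = (__u16)(val >> 32);
 *		build the SYNACK with seq = cookie and the given MSS
 *	}
 */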
6123
6124BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
6125{
6126        if (flags != 0)
6127                return -EINVAL;
6128        if (!skb_at_tc_ingress(skb))
6129                return -EOPNOTSUPP;
6130        if (unlikely(dev_net(skb->dev) != sock_net(sk)))
6131                return -ENETUNREACH;
6132        if (unlikely(sk_fullsock(sk) && sk->sk_reuseport))
6133                return -ESOCKTNOSUPPORT;
6134        if (sk_is_refcounted(sk) &&
6135            unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
6136                return -ENOENT;
6137
6138        skb_orphan(skb);
6139        skb->sk = sk;
6140        skb->destructor = sock_pfree;
6141
6142        return 0;
6143}
6144
6145static const struct bpf_func_proto bpf_sk_assign_proto = {
6146        .func           = bpf_sk_assign,
6147        .gpl_only       = false,
6148        .ret_type       = RET_INTEGER,
6149        .arg1_type      = ARG_PTR_TO_CTX,
6150        .arg2_type      = ARG_PTR_TO_SOCK_COMMON,
6151        .arg3_type      = ARG_ANYTHING,
6152};
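
/* Illustrative sketch, not part of this file's build: bpf_sk_assign() is meant
 * for tc ingress programs that steer packets to a local socket without
 * netfilter TPROXY rules; socket lookup details are elided:
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (!sk)
 *		return TC_ACT_OK;
 *	ret = bpf_sk_assign(skb, sk, 0);
 *	bpf_sk_release(sk);
 *	return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
 */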
6153
6154#endif /* CONFIG_INET */
6155
6156bool bpf_helper_changes_pkt_data(void *func)
6157{
6158        if (func == bpf_skb_vlan_push ||
6159            func == bpf_skb_vlan_pop ||
6160            func == bpf_skb_store_bytes ||
6161            func == bpf_skb_change_proto ||
6162            func == bpf_skb_change_head ||
6163            func == sk_skb_change_head ||
6164            func == bpf_skb_change_tail ||
6165            func == sk_skb_change_tail ||
6166            func == bpf_skb_adjust_room ||
6167            func == bpf_skb_pull_data ||
6168            func == sk_skb_pull_data ||
6169            func == bpf_clone_redirect ||
6170            func == bpf_l3_csum_replace ||
6171            func == bpf_l4_csum_replace ||
6172            func == bpf_xdp_adjust_head ||
6173            func == bpf_xdp_adjust_meta ||
6174            func == bpf_msg_pull_data ||
6175            func == bpf_msg_push_data ||
6176            func == bpf_msg_pop_data ||
6177            func == bpf_xdp_adjust_tail ||
6178#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6179            func == bpf_lwt_seg6_store_bytes ||
6180            func == bpf_lwt_seg6_adjust_srh ||
6181            func == bpf_lwt_seg6_action ||
6182#endif
6183            func == bpf_lwt_in_push_encap ||
6184            func == bpf_lwt_xmit_push_encap)
6185                return true;
6186
6187        return false;
6188}
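
/* Note for program authors (illustrative, not used by this file): calling any
 * of the helpers above invalidates packet pointers previously derived by the
 * verifier, so data/data_end must be reloaded from the context afterwards:
 *
 *	bpf_skb_pull_data(skb, len);
 *	data	 = (void *)(long)skb->data;
 *	data_end = (void *)(long)skb->data_end;
 *	if (data + sizeof(struct ethhdr) > data_end)
 *		return TC_ACT_SHOT;
 */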
6189
6190const struct bpf_func_proto bpf_event_output_data_proto __weak;
6191const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
6192
6193static const struct bpf_func_proto *
6194sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6195{
6196        switch (func_id) {
6197        /* inet and inet6 sockets are created in a process
6198         * context so there is always a valid uid/gid
6199         */
6200        case BPF_FUNC_get_current_uid_gid:
6201                return &bpf_get_current_uid_gid_proto;
6202        case BPF_FUNC_get_local_storage:
6203                return &bpf_get_local_storage_proto;
6204        case BPF_FUNC_get_socket_cookie:
6205                return &bpf_get_socket_cookie_sock_proto;
6206        case BPF_FUNC_get_netns_cookie:
6207                return &bpf_get_netns_cookie_sock_proto;
6208        case BPF_FUNC_perf_event_output:
6209                return &bpf_event_output_data_proto;
6210        case BPF_FUNC_get_current_pid_tgid:
6211                return &bpf_get_current_pid_tgid_proto;
6212        case BPF_FUNC_get_current_comm:
6213                return &bpf_get_current_comm_proto;
6214#ifdef CONFIG_CGROUPS
6215        case BPF_FUNC_get_current_cgroup_id:
6216                return &bpf_get_current_cgroup_id_proto;
6217        case BPF_FUNC_get_current_ancestor_cgroup_id:
6218                return &bpf_get_current_ancestor_cgroup_id_proto;
6219#endif
6220#ifdef CONFIG_CGROUP_NET_CLASSID
6221        case BPF_FUNC_get_cgroup_classid:
6222                return &bpf_get_cgroup_classid_curr_proto;
6223#endif
6224        case BPF_FUNC_sk_storage_get:
6225                return &bpf_sk_storage_get_cg_sock_proto;
6226        default:
6227                return bpf_base_func_proto(func_id);
6228        }
6229}
6230
6231static const struct bpf_func_proto *
6232sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6233{
6234        switch (func_id) {
6235        /* inet and inet6 sockets are created in a process
6236         * context so there is always a valid uid/gid
6237         */
6238        case BPF_FUNC_get_current_uid_gid:
6239                return &bpf_get_current_uid_gid_proto;
6240        case BPF_FUNC_bind:
6241                switch (prog->expected_attach_type) {
6242                case BPF_CGROUP_INET4_CONNECT:
6243                case BPF_CGROUP_INET6_CONNECT:
6244                        return &bpf_bind_proto;
6245                default:
6246                        return NULL;
6247                }
6248        case BPF_FUNC_get_socket_cookie:
6249                return &bpf_get_socket_cookie_sock_addr_proto;
6250        case BPF_FUNC_get_netns_cookie:
6251                return &bpf_get_netns_cookie_sock_addr_proto;
6252        case BPF_FUNC_get_local_storage:
6253                return &bpf_get_local_storage_proto;
6254        case BPF_FUNC_perf_event_output:
6255                return &bpf_event_output_data_proto;
6256        case BPF_FUNC_get_current_pid_tgid:
6257                return &bpf_get_current_pid_tgid_proto;
6258        case BPF_FUNC_get_current_comm:
6259                return &bpf_get_current_comm_proto;
6260#ifdef CONFIG_CGROUPS
6261        case BPF_FUNC_get_current_cgroup_id:
6262                return &bpf_get_current_cgroup_id_proto;
6263        case BPF_FUNC_get_current_ancestor_cgroup_id:
6264                return &bpf_get_current_ancestor_cgroup_id_proto;
6265#endif
6266#ifdef CONFIG_CGROUP_NET_CLASSID
6267        case BPF_FUNC_get_cgroup_classid:
6268                return &bpf_get_cgroup_classid_curr_proto;
6269#endif
6270#ifdef CONFIG_INET
6271        case BPF_FUNC_sk_lookup_tcp:
6272                return &bpf_sock_addr_sk_lookup_tcp_proto;
6273        case BPF_FUNC_sk_lookup_udp:
6274                return &bpf_sock_addr_sk_lookup_udp_proto;
6275        case BPF_FUNC_sk_release:
6276                return &bpf_sk_release_proto;
6277        case BPF_FUNC_skc_lookup_tcp:
6278                return &bpf_sock_addr_skc_lookup_tcp_proto;
6279#endif /* CONFIG_INET */
6280        case BPF_FUNC_sk_storage_get:
6281                return &bpf_sk_storage_get_proto;
6282        case BPF_FUNC_sk_storage_delete:
6283                return &bpf_sk_storage_delete_proto;
6284        case BPF_FUNC_setsockopt:
6285                switch (prog->expected_attach_type) {
6286                case BPF_CGROUP_INET4_CONNECT:
6287                case BPF_CGROUP_INET6_CONNECT:
6288                        return &bpf_sock_addr_setsockopt_proto;
6289                default:
6290                        return NULL;
6291                }
6292        case BPF_FUNC_getsockopt:
6293                switch (prog->expected_attach_type) {
6294                case BPF_CGROUP_INET4_CONNECT:
6295                case BPF_CGROUP_INET6_CONNECT:
6296                        return &bpf_sock_addr_getsockopt_proto;
6297                default:
6298                        return NULL;
6299                }
6300        default:
6301                return bpf_base_func_proto(func_id);
6302        }
6303}
6304
6305static const struct bpf_func_proto *
6306sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6307{
6308        switch (func_id) {
6309        case BPF_FUNC_skb_load_bytes:
6310                return &bpf_skb_load_bytes_proto;
6311        case BPF_FUNC_skb_load_bytes_relative:
6312                return &bpf_skb_load_bytes_relative_proto;
6313        case BPF_FUNC_get_socket_cookie:
6314                return &bpf_get_socket_cookie_proto;
6315        case BPF_FUNC_get_socket_uid:
6316                return &bpf_get_socket_uid_proto;
6317        case BPF_FUNC_perf_event_output:
6318                return &bpf_skb_event_output_proto;
6319        default:
6320                return bpf_base_func_proto(func_id);
6321        }
6322}
6323
6324const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
6325const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
6326
6327static const struct bpf_func_proto *
6328cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6329{
6330        switch (func_id) {
6331        case BPF_FUNC_get_local_storage:
6332                return &bpf_get_local_storage_proto;
6333        case BPF_FUNC_sk_fullsock:
6334                return &bpf_sk_fullsock_proto;
6335        case BPF_FUNC_sk_storage_get:
6336                return &bpf_sk_storage_get_proto;
6337        case BPF_FUNC_sk_storage_delete:
6338                return &bpf_sk_storage_delete_proto;
6339        case BPF_FUNC_perf_event_output:
6340                return &bpf_skb_event_output_proto;
6341#ifdef CONFIG_SOCK_CGROUP_DATA
6342        case BPF_FUNC_skb_cgroup_id:
6343                return &bpf_skb_cgroup_id_proto;
6344        case BPF_FUNC_skb_ancestor_cgroup_id:
6345                return &bpf_skb_ancestor_cgroup_id_proto;
6346        case BPF_FUNC_sk_cgroup_id:
6347                return &bpf_sk_cgroup_id_proto;
6348        case BPF_FUNC_sk_ancestor_cgroup_id:
6349                return &bpf_sk_ancestor_cgroup_id_proto;
6350#endif
6351#ifdef CONFIG_INET
6352        case BPF_FUNC_sk_lookup_tcp:
6353                return &bpf_sk_lookup_tcp_proto;
6354        case BPF_FUNC_sk_lookup_udp:
6355                return &bpf_sk_lookup_udp_proto;
6356        case BPF_FUNC_sk_release:
6357                return &bpf_sk_release_proto;
6358        case BPF_FUNC_skc_lookup_tcp:
6359                return &bpf_skc_lookup_tcp_proto;
6360        case BPF_FUNC_tcp_sock:
6361                return &bpf_tcp_sock_proto;
6362        case BPF_FUNC_get_listener_sock:
6363                return &bpf_get_listener_sock_proto;
6364        case BPF_FUNC_skb_ecn_set_ce:
6365                return &bpf_skb_ecn_set_ce_proto;
6366#endif
6367        default:
6368                return sk_filter_func_proto(func_id, prog);
6369        }
6370}
6371
6372static const struct bpf_func_proto *
6373tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6374{
6375        switch (func_id) {
6376        case BPF_FUNC_skb_store_bytes:
6377                return &bpf_skb_store_bytes_proto;
6378        case BPF_FUNC_skb_load_bytes:
6379                return &bpf_skb_load_bytes_proto;
6380        case BPF_FUNC_skb_load_bytes_relative:
6381                return &bpf_skb_load_bytes_relative_proto;
6382        case BPF_FUNC_skb_pull_data:
6383                return &bpf_skb_pull_data_proto;
6384        case BPF_FUNC_csum_diff:
6385                return &bpf_csum_diff_proto;
6386        case BPF_FUNC_csum_update:
6387                return &bpf_csum_update_proto;
6388        case BPF_FUNC_csum_level:
6389                return &bpf_csum_level_proto;
6390        case BPF_FUNC_l3_csum_replace:
6391                return &bpf_l3_csum_replace_proto;
6392        case BPF_FUNC_l4_csum_replace:
6393                return &bpf_l4_csum_replace_proto;
6394        case BPF_FUNC_clone_redirect:
6395                return &bpf_clone_redirect_proto;
6396        case BPF_FUNC_get_cgroup_classid:
6397                return &bpf_get_cgroup_classid_proto;
6398        case BPF_FUNC_skb_vlan_push:
6399                return &bpf_skb_vlan_push_proto;
6400        case BPF_FUNC_skb_vlan_pop:
6401                return &bpf_skb_vlan_pop_proto;
6402        case BPF_FUNC_skb_change_proto:
6403                return &bpf_skb_change_proto_proto;
6404        case BPF_FUNC_skb_change_type:
6405                return &bpf_skb_change_type_proto;
6406        case BPF_FUNC_skb_adjust_room:
6407                return &bpf_skb_adjust_room_proto;
6408        case BPF_FUNC_skb_change_tail:
6409                return &bpf_skb_change_tail_proto;
6410        case BPF_FUNC_skb_change_head:
6411                return &bpf_skb_change_head_proto;
6412        case BPF_FUNC_skb_get_tunnel_key:
6413                return &bpf_skb_get_tunnel_key_proto;
6414        case BPF_FUNC_skb_set_tunnel_key:
6415                return bpf_get_skb_set_tunnel_proto(func_id);
6416        case BPF_FUNC_skb_get_tunnel_opt:
6417                return &bpf_skb_get_tunnel_opt_proto;
6418        case BPF_FUNC_skb_set_tunnel_opt:
6419                return bpf_get_skb_set_tunnel_proto(func_id);
6420        case BPF_FUNC_redirect:
6421                return &bpf_redirect_proto;
6422        case BPF_FUNC_get_route_realm:
6423                return &bpf_get_route_realm_proto;
6424        case BPF_FUNC_get_hash_recalc:
6425                return &bpf_get_hash_recalc_proto;
6426        case BPF_FUNC_set_hash_invalid:
6427                return &bpf_set_hash_invalid_proto;
6428        case BPF_FUNC_set_hash:
6429                return &bpf_set_hash_proto;
6430        case BPF_FUNC_perf_event_output:
6431                return &bpf_skb_event_output_proto;
6432        case BPF_FUNC_get_smp_processor_id:
6433                return &bpf_get_smp_processor_id_proto;
6434        case BPF_FUNC_skb_under_cgroup:
6435                return &bpf_skb_under_cgroup_proto;
6436        case BPF_FUNC_get_socket_cookie:
6437                return &bpf_get_socket_cookie_proto;
6438        case BPF_FUNC_get_socket_uid:
6439                return &bpf_get_socket_uid_proto;
6440        case BPF_FUNC_fib_lookup:
6441                return &bpf_skb_fib_lookup_proto;
6442        case BPF_FUNC_sk_fullsock:
6443                return &bpf_sk_fullsock_proto;
6444        case BPF_FUNC_sk_storage_get:
6445                return &bpf_sk_storage_get_proto;
6446        case BPF_FUNC_sk_storage_delete:
6447                return &bpf_sk_storage_delete_proto;
6448#ifdef CONFIG_XFRM
6449        case BPF_FUNC_skb_get_xfrm_state:
6450                return &bpf_skb_get_xfrm_state_proto;
6451#endif
6452#ifdef CONFIG_SOCK_CGROUP_DATA
6453        case BPF_FUNC_skb_cgroup_id:
6454                return &bpf_skb_cgroup_id_proto;
6455        case BPF_FUNC_skb_ancestor_cgroup_id:
6456                return &bpf_skb_ancestor_cgroup_id_proto;
6457#endif
6458#ifdef CONFIG_INET
6459        case BPF_FUNC_sk_lookup_tcp:
6460                return &bpf_sk_lookup_tcp_proto;
6461        case BPF_FUNC_sk_lookup_udp:
6462                return &bpf_sk_lookup_udp_proto;
6463        case BPF_FUNC_sk_release:
6464                return &bpf_sk_release_proto;
6465        case BPF_FUNC_tcp_sock:
6466                return &bpf_tcp_sock_proto;
6467        case BPF_FUNC_get_listener_sock:
6468                return &bpf_get_listener_sock_proto;
6469        case BPF_FUNC_skc_lookup_tcp:
6470                return &bpf_skc_lookup_tcp_proto;
6471        case BPF_FUNC_tcp_check_syncookie:
6472                return &bpf_tcp_check_syncookie_proto;
6473        case BPF_FUNC_skb_ecn_set_ce:
6474                return &bpf_skb_ecn_set_ce_proto;
6475        case BPF_FUNC_tcp_gen_syncookie:
6476                return &bpf_tcp_gen_syncookie_proto;
6477        case BPF_FUNC_sk_assign:
6478                return &bpf_sk_assign_proto;
6479#endif
6480        default:
6481                return bpf_base_func_proto(func_id);
6482        }
6483}
6484
6485static const struct bpf_func_proto *
6486xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6487{
6488        switch (func_id) {
6489        case BPF_FUNC_perf_event_output:
6490                return &bpf_xdp_event_output_proto;
6491        case BPF_FUNC_get_smp_processor_id:
6492                return &bpf_get_smp_processor_id_proto;
6493        case BPF_FUNC_csum_diff:
6494                return &bpf_csum_diff_proto;
6495        case BPF_FUNC_xdp_adjust_head:
6496                return &bpf_xdp_adjust_head_proto;
6497        case BPF_FUNC_xdp_adjust_meta:
6498                return &bpf_xdp_adjust_meta_proto;
6499        case BPF_FUNC_redirect:
6500                return &bpf_xdp_redirect_proto;
6501        case BPF_FUNC_redirect_map:
6502                return &bpf_xdp_redirect_map_proto;
6503        case BPF_FUNC_xdp_adjust_tail:
6504                return &bpf_xdp_adjust_tail_proto;
6505        case BPF_FUNC_fib_lookup:
6506                return &bpf_xdp_fib_lookup_proto;
6507#ifdef CONFIG_INET
6508        case BPF_FUNC_sk_lookup_udp:
6509                return &bpf_xdp_sk_lookup_udp_proto;
6510        case BPF_FUNC_sk_lookup_tcp:
6511                return &bpf_xdp_sk_lookup_tcp_proto;
6512        case BPF_FUNC_sk_release:
6513                return &bpf_sk_release_proto;
6514        case BPF_FUNC_skc_lookup_tcp:
6515                return &bpf_xdp_skc_lookup_tcp_proto;
6516        case BPF_FUNC_tcp_check_syncookie:
6517                return &bpf_tcp_check_syncookie_proto;
6518        case BPF_FUNC_tcp_gen_syncookie:
6519                return &bpf_tcp_gen_syncookie_proto;
6520#endif
6521        default:
6522                return bpf_base_func_proto(func_id);
6523        }
6524}
6525
6526const struct bpf_func_proto bpf_sock_map_update_proto __weak;
6527const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
6528
6529static const struct bpf_func_proto *
6530sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6531{
6532        switch (func_id) {
6533        case BPF_FUNC_setsockopt:
6534                return &bpf_sock_ops_setsockopt_proto;
6535        case BPF_FUNC_getsockopt:
6536                return &bpf_sock_ops_getsockopt_proto;
6537        case BPF_FUNC_sock_ops_cb_flags_set:
6538                return &bpf_sock_ops_cb_flags_set_proto;
6539        case BPF_FUNC_sock_map_update:
6540                return &bpf_sock_map_update_proto;
6541        case BPF_FUNC_sock_hash_update:
6542                return &bpf_sock_hash_update_proto;
6543        case BPF_FUNC_get_socket_cookie:
6544                return &bpf_get_socket_cookie_sock_ops_proto;
6545        case BPF_FUNC_get_local_storage:
6546                return &bpf_get_local_storage_proto;
6547        case BPF_FUNC_perf_event_output:
6548                return &bpf_event_output_data_proto;
6549        case BPF_FUNC_sk_storage_get:
6550                return &bpf_sk_storage_get_proto;
6551        case BPF_FUNC_sk_storage_delete:
6552                return &bpf_sk_storage_delete_proto;
6553#ifdef CONFIG_INET
6554        case BPF_FUNC_tcp_sock:
6555                return &bpf_tcp_sock_proto;
6556#endif /* CONFIG_INET */
6557        default:
6558                return bpf_base_func_proto(func_id);
6559        }
6560}
6561
6562const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
6563const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
6564
6565static const struct bpf_func_proto *
6566sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6567{
6568        switch (func_id) {
6569        case BPF_FUNC_msg_redirect_map:
6570                return &bpf_msg_redirect_map_proto;
6571        case BPF_FUNC_msg_redirect_hash:
6572                return &bpf_msg_redirect_hash_proto;
6573        case BPF_FUNC_msg_apply_bytes:
6574                return &bpf_msg_apply_bytes_proto;
6575        case BPF_FUNC_msg_cork_bytes:
6576                return &bpf_msg_cork_bytes_proto;
6577        case BPF_FUNC_msg_pull_data:
6578                return &bpf_msg_pull_data_proto;
6579        case BPF_FUNC_msg_push_data:
6580                return &bpf_msg_push_data_proto;
6581        case BPF_FUNC_msg_pop_data:
6582                return &bpf_msg_pop_data_proto;
6583        case BPF_FUNC_perf_event_output:
6584                return &bpf_event_output_data_proto;
6585        case BPF_FUNC_get_current_uid_gid:
6586                return &bpf_get_current_uid_gid_proto;
6587        case BPF_FUNC_get_current_pid_tgid:
6588                return &bpf_get_current_pid_tgid_proto;
6589        case BPF_FUNC_sk_storage_get:
6590                return &bpf_sk_storage_get_proto;
6591        case BPF_FUNC_sk_storage_delete:
6592                return &bpf_sk_storage_delete_proto;
6593#ifdef CONFIG_CGROUPS
6594        case BPF_FUNC_get_current_cgroup_id:
6595                return &bpf_get_current_cgroup_id_proto;
6596        case BPF_FUNC_get_current_ancestor_cgroup_id:
6597                return &bpf_get_current_ancestor_cgroup_id_proto;
6598#endif
6599#ifdef CONFIG_CGROUP_NET_CLASSID
6600        case BPF_FUNC_get_cgroup_classid:
6601                return &bpf_get_cgroup_classid_curr_proto;
6602#endif
6603        default:
6604                return bpf_base_func_proto(func_id);
6605        }
6606}
6607
6608const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
6609const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
6610
6611static const struct bpf_func_proto *
6612sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6613{
6614        switch (func_id) {
6615        case BPF_FUNC_skb_store_bytes:
6616                return &bpf_skb_store_bytes_proto;
6617        case BPF_FUNC_skb_load_bytes:
6618                return &bpf_skb_load_bytes_proto;
6619        case BPF_FUNC_skb_pull_data:
6620                return &sk_skb_pull_data_proto;
6621        case BPF_FUNC_skb_change_tail:
6622                return &sk_skb_change_tail_proto;
6623        case BPF_FUNC_skb_change_head:
6624                return &sk_skb_change_head_proto;
6625        case BPF_FUNC_get_socket_cookie:
6626                return &bpf_get_socket_cookie_proto;
6627        case BPF_FUNC_get_socket_uid:
6628                return &bpf_get_socket_uid_proto;
6629        case BPF_FUNC_sk_redirect_map:
6630                return &bpf_sk_redirect_map_proto;
6631        case BPF_FUNC_sk_redirect_hash:
6632                return &bpf_sk_redirect_hash_proto;
6633        case BPF_FUNC_perf_event_output:
6634                return &bpf_skb_event_output_proto;
6635#ifdef CONFIG_INET
6636        case BPF_FUNC_sk_lookup_tcp:
6637                return &bpf_sk_lookup_tcp_proto;
6638        case BPF_FUNC_sk_lookup_udp:
6639                return &bpf_sk_lookup_udp_proto;
6640        case BPF_FUNC_sk_release:
6641                return &bpf_sk_release_proto;
6642        case BPF_FUNC_skc_lookup_tcp:
6643                return &bpf_skc_lookup_tcp_proto;
6644#endif
6645        default:
6646                return bpf_base_func_proto(func_id);
6647        }
6648}
6649
6650static const struct bpf_func_proto *
6651flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6652{
6653        switch (func_id) {
6654        case BPF_FUNC_skb_load_bytes:
6655                return &bpf_flow_dissector_load_bytes_proto;
6656        default:
6657                return bpf_base_func_proto(func_id);
6658        }
6659}
6660
6661static const struct bpf_func_proto *
6662lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6663{
6664        switch (func_id) {
6665        case BPF_FUNC_skb_load_bytes:
6666                return &bpf_skb_load_bytes_proto;
6667        case BPF_FUNC_skb_pull_data:
6668                return &bpf_skb_pull_data_proto;
6669        case BPF_FUNC_csum_diff:
6670                return &bpf_csum_diff_proto;
6671        case BPF_FUNC_get_cgroup_classid:
6672                return &bpf_get_cgroup_classid_proto;
6673        case BPF_FUNC_get_route_realm:
6674                return &bpf_get_route_realm_proto;
6675        case BPF_FUNC_get_hash_recalc:
6676                return &bpf_get_hash_recalc_proto;
6677        case BPF_FUNC_perf_event_output:
6678                return &bpf_skb_event_output_proto;
6679        case BPF_FUNC_get_smp_processor_id:
6680                return &bpf_get_smp_processor_id_proto;
6681        case BPF_FUNC_skb_under_cgroup:
6682                return &bpf_skb_under_cgroup_proto;
6683        default:
6684                return bpf_base_func_proto(func_id);
6685        }
6686}
6687
6688static const struct bpf_func_proto *
6689lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6690{
6691        switch (func_id) {
6692        case BPF_FUNC_lwt_push_encap:
6693                return &bpf_lwt_in_push_encap_proto;
6694        default:
6695                return lwt_out_func_proto(func_id, prog);
6696        }
6697}
6698
6699static const struct bpf_func_proto *
6700lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6701{
6702        switch (func_id) {
6703        case BPF_FUNC_skb_get_tunnel_key:
6704                return &bpf_skb_get_tunnel_key_proto;
6705        case BPF_FUNC_skb_set_tunnel_key:
6706                return bpf_get_skb_set_tunnel_proto(func_id);
6707        case BPF_FUNC_skb_get_tunnel_opt:
6708                return &bpf_skb_get_tunnel_opt_proto;
6709        case BPF_FUNC_skb_set_tunnel_opt:
6710                return bpf_get_skb_set_tunnel_proto(func_id);
6711        case BPF_FUNC_redirect:
6712                return &bpf_redirect_proto;
6713        case BPF_FUNC_clone_redirect:
6714                return &bpf_clone_redirect_proto;
6715        case BPF_FUNC_skb_change_tail:
6716                return &bpf_skb_change_tail_proto;
6717        case BPF_FUNC_skb_change_head:
6718                return &bpf_skb_change_head_proto;
6719        case BPF_FUNC_skb_store_bytes:
6720                return &bpf_skb_store_bytes_proto;
6721        case BPF_FUNC_csum_update:
6722                return &bpf_csum_update_proto;
6723        case BPF_FUNC_csum_level:
6724                return &bpf_csum_level_proto;
6725        case BPF_FUNC_l3_csum_replace:
6726                return &bpf_l3_csum_replace_proto;
6727        case BPF_FUNC_l4_csum_replace:
6728                return &bpf_l4_csum_replace_proto;
6729        case BPF_FUNC_set_hash_invalid:
6730                return &bpf_set_hash_invalid_proto;
6731        case BPF_FUNC_lwt_push_encap:
6732                return &bpf_lwt_xmit_push_encap_proto;
6733        default:
6734                return lwt_out_func_proto(func_id, prog);
6735        }
6736}
6737
6738static const struct bpf_func_proto *
6739lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
6740{
6741        switch (func_id) {
6742#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6743        case BPF_FUNC_lwt_seg6_store_bytes:
6744                return &bpf_lwt_seg6_store_bytes_proto;
6745        case BPF_FUNC_lwt_seg6_action:
6746                return &bpf_lwt_seg6_action_proto;
6747        case BPF_FUNC_lwt_seg6_adjust_srh:
6748                return &bpf_lwt_seg6_adjust_srh_proto;
6749#endif
6750        default:
6751                return lwt_out_func_proto(func_id, prog);
6752        }
6753}
6754
6755static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
6756                                    const struct bpf_prog *prog,
6757                                    struct bpf_insn_access_aux *info)
6758{
6759        const int size_default = sizeof(__u32);
6760
6761        if (off < 0 || off >= sizeof(struct __sk_buff))
6762                return false;
6763
6764        /* The verifier guarantees that size > 0. */
6765        if (off % size != 0)
6766                return false;
6767
6768        switch (off) {
6769        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
6770                if (off + size > offsetofend(struct __sk_buff, cb[4]))
6771                        return false;
6772                break;
6773        case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
6774        case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
6775        case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
6776        case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
6777        case bpf_ctx_range(struct __sk_buff, data):
6778        case bpf_ctx_range(struct __sk_buff, data_meta):
6779        case bpf_ctx_range(struct __sk_buff, data_end):
6780                if (size != size_default)
6781                        return false;
6782                break;
6783        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
6784                return false;
6785        case bpf_ctx_range(struct __sk_buff, tstamp):
6786                if (size != sizeof(__u64))
6787                        return false;
6788                break;
6789        case offsetof(struct __sk_buff, sk):
6790                if (type == BPF_WRITE || size != sizeof(__u64))
6791                        return false;
6792                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
6793                break;
6794        default:
6795                /* Only narrow read access allowed for now. */
6796                if (type == BPF_WRITE) {
6797                        if (size != size_default)
6798                                return false;
6799                } else {
6800                        bpf_ctx_record_field_size(info, size_default);
6801                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
6802                                return false;
6803                }
6804        }
6805
6806        return true;
6807}
6808
6809static bool sk_filter_is_valid_access(int off, int size,
6810                                      enum bpf_access_type type,
6811                                      const struct bpf_prog *prog,
6812                                      struct bpf_insn_access_aux *info)
6813{
6814        switch (off) {
6815        case bpf_ctx_range(struct __sk_buff, tc_classid):
6816        case bpf_ctx_range(struct __sk_buff, data):
6817        case bpf_ctx_range(struct __sk_buff, data_meta):
6818        case bpf_ctx_range(struct __sk_buff, data_end):
6819        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
6820        case bpf_ctx_range(struct __sk_buff, tstamp):
6821        case bpf_ctx_range(struct __sk_buff, wire_len):
6822                return false;
6823        }
6824
6825        if (type == BPF_WRITE) {
6826                switch (off) {
6827                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
6828                        break;
6829                default:
6830                        return false;
6831                }
6832        }
6833
6834        return bpf_skb_is_valid_access(off, size, type, prog, info);
6835}
6836
6837static bool cg_skb_is_valid_access(int off, int size,
6838                                   enum bpf_access_type type,
6839                                   const struct bpf_prog *prog,
6840                                   struct bpf_insn_access_aux *info)
6841{
6842        switch (off) {
6843        case bpf_ctx_range(struct __sk_buff, tc_classid):
6844        case bpf_ctx_range(struct __sk_buff, data_meta):
6845        case bpf_ctx_range(struct __sk_buff, wire_len):
6846                return false;
6847        case bpf_ctx_range(struct __sk_buff, data):
6848        case bpf_ctx_range(struct __sk_buff, data_end):
6849                if (!bpf_capable())
6850                        return false;
6851                break;
6852        }
6853
6854        if (type == BPF_WRITE) {
6855                switch (off) {
6856                case bpf_ctx_range(struct __sk_buff, mark):
6857                case bpf_ctx_range(struct __sk_buff, priority):
6858                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
6859                        break;
6860                case bpf_ctx_range(struct __sk_buff, tstamp):
6861                        if (!bpf_capable())
6862                                return false;
6863                        break;
6864                default:
6865                        return false;
6866                }
6867        }
6868
6869        switch (off) {
6870        case bpf_ctx_range(struct __sk_buff, data):
6871                info->reg_type = PTR_TO_PACKET;
6872                break;
6873        case bpf_ctx_range(struct __sk_buff, data_end):
6874                info->reg_type = PTR_TO_PACKET_END;
6875                break;
6876        }
6877
6878        return bpf_skb_is_valid_access(off, size, type, prog, info);
6879}
6880
6881static bool lwt_is_valid_access(int off, int size,
6882                                enum bpf_access_type type,
6883                                const struct bpf_prog *prog,
6884                                struct bpf_insn_access_aux *info)
6885{
6886        switch (off) {
6887        case bpf_ctx_range(struct __sk_buff, tc_classid):
6888        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
6889        case bpf_ctx_range(struct __sk_buff, data_meta):
6890        case bpf_ctx_range(struct __sk_buff, tstamp):
6891        case bpf_ctx_range(struct __sk_buff, wire_len):
6892                return false;
6893        }
6894
6895        if (type == BPF_WRITE) {
6896                switch (off) {
6897                case bpf_ctx_range(struct __sk_buff, mark):
6898                case bpf_ctx_range(struct __sk_buff, priority):
6899                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
6900                        break;
6901                default:
6902                        return false;
6903                }
6904        }
6905
6906        switch (off) {
6907        case bpf_ctx_range(struct __sk_buff, data):
6908                info->reg_type = PTR_TO_PACKET;
6909                break;
6910        case bpf_ctx_range(struct __sk_buff, data_end):
6911                info->reg_type = PTR_TO_PACKET_END;
6912                break;
6913        }
6914
6915        return bpf_skb_is_valid_access(off, size, type, prog, info);
6916}
6917
6918/* Attach type specific accesses */
6919static bool __sock_filter_check_attach_type(int off,
6920                                            enum bpf_access_type access_type,
6921                                            enum bpf_attach_type attach_type)
6922{
6923        switch (off) {
6924        case offsetof(struct bpf_sock, bound_dev_if):
6925        case offsetof(struct bpf_sock, mark):
6926        case offsetof(struct bpf_sock, priority):
6927                switch (attach_type) {
6928                case BPF_CGROUP_INET_SOCK_CREATE:
6929                case BPF_CGROUP_INET_SOCK_RELEASE:
6930                        goto full_access;
6931                default:
6932                        return false;
6933                }
6934        case bpf_ctx_range(struct bpf_sock, src_ip4):
6935                switch (attach_type) {
6936                case BPF_CGROUP_INET4_POST_BIND:
6937                        goto read_only;
6938                default:
6939                        return false;
6940                }
6941        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
6942                switch (attach_type) {
6943                case BPF_CGROUP_INET6_POST_BIND:
6944                        goto read_only;
6945                default:
6946                        return false;
6947                }
6948        case bpf_ctx_range(struct bpf_sock, src_port):
6949                switch (attach_type) {
6950                case BPF_CGROUP_INET4_POST_BIND:
6951                case BPF_CGROUP_INET6_POST_BIND:
6952                        goto read_only;
6953                default:
6954                        return false;
6955                }
6956        }
6957read_only:
6958        return access_type == BPF_READ;
6959full_access:
6960        return true;
6961}
6962
6963bool bpf_sock_common_is_valid_access(int off, int size,
6964                                     enum bpf_access_type type,
6965                                     struct bpf_insn_access_aux *info)
6966{
6967        switch (off) {
6968        case bpf_ctx_range_till(struct bpf_sock, type, priority):
6969                return false;
6970        default:
6971                return bpf_sock_is_valid_access(off, size, type, info);
6972        }
6973}
6974
6975bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
6976                              struct bpf_insn_access_aux *info)
6977{
6978        const int size_default = sizeof(__u32);
6979
6980        if (off < 0 || off >= sizeof(struct bpf_sock))
6981                return false;
6982        if (off % size != 0)
6983                return false;
6984
6985        switch (off) {
6986        case offsetof(struct bpf_sock, state):
6987        case offsetof(struct bpf_sock, family):
6988        case offsetof(struct bpf_sock, type):
6989        case offsetof(struct bpf_sock, protocol):
6990        case offsetof(struct bpf_sock, dst_port):
6991        case offsetof(struct bpf_sock, src_port):
6992        case offsetof(struct bpf_sock, rx_queue_mapping):
6993        case bpf_ctx_range(struct bpf_sock, src_ip4):
6994        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
6995        case bpf_ctx_range(struct bpf_sock, dst_ip4):
6996        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
6997                bpf_ctx_record_field_size(info, size_default);
6998                return bpf_ctx_narrow_access_ok(off, size, size_default);
6999        }
7000
7001        return size == size_default;
7002}
7003
7004static bool sock_filter_is_valid_access(int off, int size,
7005                                        enum bpf_access_type type,
7006                                        const struct bpf_prog *prog,
7007                                        struct bpf_insn_access_aux *info)
7008{
7009        if (!bpf_sock_is_valid_access(off, size, type, info))
7010                return false;
7011        return __sock_filter_check_attach_type(off, type,
7012                                               prog->expected_attach_type);
7013}
7014
7015static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
7016                             const struct bpf_prog *prog)
7017{
7018        /* Neither direct read nor direct write requires any preliminary
7019         * action.
7020         */
7021        return 0;
7022}
7023
7024static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
7025                                const struct bpf_prog *prog, int drop_verdict)
7026{
7027        struct bpf_insn *insn = insn_buf;
7028
7029        if (!direct_write)
7030                return 0;
7031
7032        /* if (!skb->cloned)
7033         *       goto start;
7034         *
7035         * (Fast path; otherwise we conservatively assume we might be
7036         *  a clone and do the rest in the helper.)
7037         */
7038        *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
7039        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
7040        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
7041
7042        /* ret = bpf_skb_pull_data(skb, 0); */
7043        *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
7044        *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
7045        *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
7046                               BPF_FUNC_skb_pull_data);
7047        /* if (!ret)
7048         *      goto restore;
7049         * return TC_ACT_SHOT;
7050         */
7051        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
7052        *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
7053        *insn++ = BPF_EXIT_INSN();
7054
7055        /* restore: */
7056        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
7057        /* start: */
7058        *insn++ = prog->insnsi[0];
7059
7060        return insn - insn_buf;
7061}
7062
7063static int bpf_gen_ld_abs(const struct bpf_insn *orig,
7064                          struct bpf_insn *insn_buf)
7065{
7066        bool indirect = BPF_MODE(orig->code) == BPF_IND;
7067        struct bpf_insn *insn = insn_buf;
7068
7069        if (!indirect) {
7070                *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
7071        } else {
7072                *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
7073                if (orig->imm)
7074                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
7075        }
7076        /* We're guaranteed here that CTX is in R6. */
7077        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
7078
7079        switch (BPF_SIZE(orig->code)) {
7080        case BPF_B:
7081                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
7082                break;
7083        case BPF_H:
7084                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
7085                break;
7086        case BPF_W:
7087                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
7088                break;
7089        }
7090
7091        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
7092        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
7093        *insn++ = BPF_EXIT_INSN();
7094
7095        return insn - insn_buf;
7096}
7097
7098static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
7099                               const struct bpf_prog *prog)
7100{
7101        return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
7102}
7103
7104static bool tc_cls_act_is_valid_access(int off, int size,
7105                                       enum bpf_access_type type,
7106                                       const struct bpf_prog *prog,
7107                                       struct bpf_insn_access_aux *info)
7108{
7109        if (type == BPF_WRITE) {
7110                switch (off) {
7111                case bpf_ctx_range(struct __sk_buff, mark):
7112                case bpf_ctx_range(struct __sk_buff, tc_index):
7113                case bpf_ctx_range(struct __sk_buff, priority):
7114                case bpf_ctx_range(struct __sk_buff, tc_classid):
7115                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
7116                case bpf_ctx_range(struct __sk_buff, tstamp):
7117                case bpf_ctx_range(struct __sk_buff, queue_mapping):
7118                        break;
7119                default:
7120                        return false;
7121                }
7122        }
7123
7124        switch (off) {
7125        case bpf_ctx_range(struct __sk_buff, data):
7126                info->reg_type = PTR_TO_PACKET;
7127                break;
7128        case bpf_ctx_range(struct __sk_buff, data_meta):
7129                info->reg_type = PTR_TO_PACKET_META;
7130                break;
7131        case bpf_ctx_range(struct __sk_buff, data_end):
7132                info->reg_type = PTR_TO_PACKET_END;
7133                break;
7134        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
7135                return false;
7136        }
7137
7138        return bpf_skb_is_valid_access(off, size, type, prog, info);
7139}
7140
7141static bool __is_valid_xdp_access(int off, int size)
7142{
7143        if (off < 0 || off >= sizeof(struct xdp_md))
7144                return false;
7145        if (off % size != 0)
7146                return false;
7147        if (size != sizeof(__u32))
7148                return false;
7149
7150        return true;
7151}
7152
7153static bool xdp_is_valid_access(int off, int size,
7154                                enum bpf_access_type type,
7155                                const struct bpf_prog *prog,
7156                                struct bpf_insn_access_aux *info)
7157{
7158        if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
7159                switch (off) {
7160                case offsetof(struct xdp_md, egress_ifindex):
7161                        return false;
7162                }
7163        }
7164
7165        if (type == BPF_WRITE) {
7166                if (bpf_prog_is_dev_bound(prog->aux)) {
7167                        switch (off) {
7168                        case offsetof(struct xdp_md, rx_queue_index):
7169                                return __is_valid_xdp_access(off, size);
7170                        }
7171                }
7172                return false;
7173        }
7174
7175        switch (off) {
7176        case offsetof(struct xdp_md, data):
7177                info->reg_type = PTR_TO_PACKET;
7178                break;
7179        case offsetof(struct xdp_md, data_meta):
7180                info->reg_type = PTR_TO_PACKET_META;
7181                break;
7182        case offsetof(struct xdp_md, data_end):
7183                info->reg_type = PTR_TO_PACKET_END;
7184                break;
7185        }
7186
7187        return __is_valid_xdp_access(off, size);
7188}
7189
7190void bpf_warn_invalid_xdp_action(u32 act)
7191{
7192        const u32 act_max = XDP_REDIRECT;
7193
7194        WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
7195                  act > act_max ? "Illegal" : "Driver unsupported",
7196                  act);
7197}
7198EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
7199
7200static bool sock_addr_is_valid_access(int off, int size,
7201                                      enum bpf_access_type type,
7202                                      const struct bpf_prog *prog,
7203                                      struct bpf_insn_access_aux *info)
7204{
7205        const int size_default = sizeof(__u32);
7206
7207        if (off < 0 || off >= sizeof(struct bpf_sock_addr))
7208                return false;
7209        if (off % size != 0)
7210                return false;
7211
7212        /* Disallow access to IPv6 fields from IPv4 context and vice
7213         * versa.
7214         */
7215        switch (off) {
7216        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
7217                switch (prog->expected_attach_type) {
7218                case BPF_CGROUP_INET4_BIND:
7219                case BPF_CGROUP_INET4_CONNECT:
7220                case BPF_CGROUP_INET4_GETPEERNAME:
7221                case BPF_CGROUP_INET4_GETSOCKNAME:
7222                case BPF_CGROUP_UDP4_SENDMSG:
7223                case BPF_CGROUP_UDP4_RECVMSG:
7224                        break;
7225                default:
7226                        return false;
7227                }
7228                break;
7229        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
7230                switch (prog->expected_attach_type) {
7231                case BPF_CGROUP_INET6_BIND:
7232                case BPF_CGROUP_INET6_CONNECT:
7233                case BPF_CGROUP_INET6_GETPEERNAME:
7234                case BPF_CGROUP_INET6_GETSOCKNAME:
7235                case BPF_CGROUP_UDP6_SENDMSG:
7236                case BPF_CGROUP_UDP6_RECVMSG:
7237                        break;
7238                default:
7239                        return false;
7240                }
7241                break;
7242        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
7243                switch (prog->expected_attach_type) {
7244                case BPF_CGROUP_UDP4_SENDMSG:
7245                        break;
7246                default:
7247                        return false;
7248                }
7249                break;
7250        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
7251                                msg_src_ip6[3]):
7252                switch (prog->expected_attach_type) {
7253                case BPF_CGROUP_UDP6_SENDMSG:
7254                        break;
7255                default:
7256                        return false;
7257                }
7258                break;
7259        }
7260
7261        switch (off) {
7262        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
7263        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
7264        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
7265        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
7266                                msg_src_ip6[3]):
7267        case bpf_ctx_range(struct bpf_sock_addr, user_port):
7268                if (type == BPF_READ) {
7269                        bpf_ctx_record_field_size(info, size_default);
7270
7271                        if (bpf_ctx_wide_access_ok(off, size,
7272                                                   struct bpf_sock_addr,
7273                                                   user_ip6))
7274                                return true;
7275
7276                        if (bpf_ctx_wide_access_ok(off, size,
7277                                                   struct bpf_sock_addr,
7278                                                   msg_src_ip6))
7279                                return true;
7280
7281                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
7282                                return false;
7283                } else {
7284                        if (bpf_ctx_wide_access_ok(off, size,
7285                                                   struct bpf_sock_addr,
7286                                                   user_ip6))
7287                                return true;
7288
7289                        if (bpf_ctx_wide_access_ok(off, size,
7290                                                   struct bpf_sock_addr,
7291                                                   msg_src_ip6))
7292                                return true;
7293
7294                        if (size != size_default)
7295                                return false;
7296                }
7297                break;
7298        case offsetof(struct bpf_sock_addr, sk):
7299                if (type != BPF_READ)
7300                        return false;
7301                if (size != sizeof(__u64))
7302                        return false;
7303                info->reg_type = PTR_TO_SOCKET;
7304                break;
7305        default:
7306                if (type == BPF_READ) {
7307                        if (size != size_default)
7308                                return false;
7309                } else {
7310                        return false;
7311                }
7312        }
7313
7314        return true;
7315}
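/*
 * Illustrative consequences of the checks above (a hedged sketch, assuming
 * the usual bpf_ctx_narrow_access_ok()/bpf_ctx_wide_access_ok() semantics
 * from <linux/filter.h>), for a program attached as BPF_CGROUP_INET4_CONNECT:
 *
 *	__u32 ip = ctx->user_ip4;                  - 4-byte read: accepted
 *	__u8 b = *((__u8 *)&ctx->user_ip4 + 3);    - narrow 1-byte read: accepted
 *	ctx->user_ip4 = ip;                        - 4-byte write: accepted
 *	*(__u16 *)&ctx->user_ip4 = 0;              - narrow write: rejected
 *	__u32 a = ctx->user_ip6[0];                - IPv6 field on an INET4 hook:
 *	                                             rejected
 *
 * 8-byte loads/stores are only accepted when they cover user_ip6[0..1],
 * user_ip6[2..3], msg_src_ip6[0..1] or msg_src_ip6[2..3] exactly.
 */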
7316
7317static bool sock_ops_is_valid_access(int off, int size,
7318                                     enum bpf_access_type type,
7319                                     const struct bpf_prog *prog,
7320                                     struct bpf_insn_access_aux *info)
7321{
7322        const int size_default = sizeof(__u32);
7323
7324        if (off < 0 || off >= sizeof(struct bpf_sock_ops))
7325                return false;
7326
7327        /* The verifier guarantees that size > 0. */
7328        if (off % size != 0)
7329                return false;
7330
7331        if (type == BPF_WRITE) {
7332                switch (off) {
7333                case offsetof(struct bpf_sock_ops, reply):
7334                case offsetof(struct bpf_sock_ops, sk_txhash):
7335                        if (size != size_default)
7336                                return false;
7337                        break;
7338                default:
7339                        return false;
7340                }
7341        } else {
7342                switch (off) {
7343                case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
7344                                        bytes_acked):
7345                        if (size != sizeof(__u64))
7346                                return false;
7347                        break;
7348                case offsetof(struct bpf_sock_ops, sk):
7349                        if (size != sizeof(__u64))
7350                                return false;
7351                        info->reg_type = PTR_TO_SOCKET_OR_NULL;
7352                        break;
7353                default:
7354                        if (size != size_default)
7355                                return false;
7356                        break;
7357                }
7358        }
7359
7360        return true;
7361}
7362
7363static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
7364                           const struct bpf_prog *prog)
7365{
7366        return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
7367}
7368
7369static bool sk_skb_is_valid_access(int off, int size,
7370                                   enum bpf_access_type type,
7371                                   const struct bpf_prog *prog,
7372                                   struct bpf_insn_access_aux *info)
7373{
7374        switch (off) {
7375        case bpf_ctx_range(struct __sk_buff, tc_classid):
7376        case bpf_ctx_range(struct __sk_buff, data_meta):
7377        case bpf_ctx_range(struct __sk_buff, tstamp):
7378        case bpf_ctx_range(struct __sk_buff, wire_len):
7379                return false;
7380        }
7381
7382        if (type == BPF_WRITE) {
7383                switch (off) {
7384                case bpf_ctx_range(struct __sk_buff, tc_index):
7385                case bpf_ctx_range(struct __sk_buff, priority):
7386                        break;
7387                default:
7388                        return false;
7389                }
7390        }
7391
7392        switch (off) {
7393        case bpf_ctx_range(struct __sk_buff, mark):
7394                return false;
7395        case bpf_ctx_range(struct __sk_buff, data):
7396                info->reg_type = PTR_TO_PACKET;
7397                break;
7398        case bpf_ctx_range(struct __sk_buff, data_end):
7399                info->reg_type = PTR_TO_PACKET_END;
7400                break;
7401        }
7402
7403        return bpf_skb_is_valid_access(off, size, type, prog, info);
7404}
7405
7406static bool sk_msg_is_valid_access(int off, int size,
7407                                   enum bpf_access_type type,
7408                                   const struct bpf_prog *prog,
7409                                   struct bpf_insn_access_aux *info)
7410{
7411        if (type == BPF_WRITE)
7412                return false;
7413
7414        if (off % size != 0)
7415                return false;
7416
7417        switch (off) {
7418        case offsetof(struct sk_msg_md, data):
7419                info->reg_type = PTR_TO_PACKET;
7420                if (size != sizeof(__u64))
7421                        return false;
7422                break;
7423        case offsetof(struct sk_msg_md, data_end):
7424                info->reg_type = PTR_TO_PACKET_END;
7425                if (size != sizeof(__u64))
7426                        return false;
7427                break;
7428        case offsetof(struct sk_msg_md, sk):
7429                if (size != sizeof(__u64))
7430                        return false;
7431                info->reg_type = PTR_TO_SOCKET;
7432                break;
7433        case bpf_ctx_range(struct sk_msg_md, family):
7434        case bpf_ctx_range(struct sk_msg_md, remote_ip4):
7435        case bpf_ctx_range(struct sk_msg_md, local_ip4):
7436        case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
7437        case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
7438        case bpf_ctx_range(struct sk_msg_md, remote_port):
7439        case bpf_ctx_range(struct sk_msg_md, local_port):
7440        case bpf_ctx_range(struct sk_msg_md, size):
7441                if (size != sizeof(__u32))
7442                        return false;
7443                break;
7444        default:
7445                return false;
7446        }
7447        return true;
7448}
7449
7450static bool flow_dissector_is_valid_access(int off, int size,
7451                                           enum bpf_access_type type,
7452                                           const struct bpf_prog *prog,
7453                                           struct bpf_insn_access_aux *info)
7454{
7455        const int size_default = sizeof(__u32);
7456
7457        if (off < 0 || off >= sizeof(struct __sk_buff))
7458                return false;
7459
7460        if (type == BPF_WRITE)
7461                return false;
7462
7463        switch (off) {
7464        case bpf_ctx_range(struct __sk_buff, data):
7465                if (size != size_default)
7466                        return false;
7467                info->reg_type = PTR_TO_PACKET;
7468                return true;
7469        case bpf_ctx_range(struct __sk_buff, data_end):
7470                if (size != size_default)
7471                        return false;
7472                info->reg_type = PTR_TO_PACKET_END;
7473                return true;
7474        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
7475                if (size != sizeof(__u64))
7476                        return false;
7477                info->reg_type = PTR_TO_FLOW_KEYS;
7478                return true;
7479        default:
7480                return false;
7481        }
7482}
7483
7484static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
7485                                             const struct bpf_insn *si,
7486                                             struct bpf_insn *insn_buf,
7487                                             struct bpf_prog *prog,
7488                                             u32 *target_size)
7489
7490{
7491        struct bpf_insn *insn = insn_buf;
7492
7493        switch (si->off) {
7494        case offsetof(struct __sk_buff, data):
7495                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
7496                                      si->dst_reg, si->src_reg,
7497                                      offsetof(struct bpf_flow_dissector, data));
7498                break;
7499
7500        case offsetof(struct __sk_buff, data_end):
7501                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
7502                                      si->dst_reg, si->src_reg,
7503                                      offsetof(struct bpf_flow_dissector, data_end));
7504                break;
7505
7506        case offsetof(struct __sk_buff, flow_keys):
7507                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
7508                                      si->dst_reg, si->src_reg,
7509                                      offsetof(struct bpf_flow_dissector, flow_keys));
7510                break;
7511        }
7512
7513        return insn - insn_buf;
7514}
7515
7516static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si,
7517                                                  struct bpf_insn *insn)
7518{
7519        /* si->dst_reg = skb_shinfo(SKB); */
7520#ifdef NET_SKBUFF_DATA_USES_OFFSET
7521        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
7522                              BPF_REG_AX, si->src_reg,
7523                              offsetof(struct sk_buff, end));
7524        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
7525                              si->dst_reg, si->src_reg,
7526                              offsetof(struct sk_buff, head));
7527        *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
7528#else
7529        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
7530                              si->dst_reg, si->src_reg,
7531                              offsetof(struct sk_buff, end));
7532#endif
7533
7534        return insn;
7535}
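/*
 * A hedged C equivalent of the instructions emitted above (this is the usual
 * skb_shinfo(skb) computation, assuming the standard sk_buff layout):
 *
 *	#ifdef NET_SKBUFF_DATA_USES_OFFSET
 *		shinfo = (struct skb_shared_info *)(skb->head + skb->end);
 *	#else
 *		shinfo = (struct skb_shared_info *)skb->end;
 *	#endif
 */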
7536
7537static u32 bpf_convert_ctx_access(enum bpf_access_type type,
7538                                  const struct bpf_insn *si,
7539                                  struct bpf_insn *insn_buf,
7540                                  struct bpf_prog *prog, u32 *target_size)
7541{
7542        struct bpf_insn *insn = insn_buf;
7543        int off;
7544
7545        switch (si->off) {
7546        case offsetof(struct __sk_buff, len):
7547                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7548                                      bpf_target_off(struct sk_buff, len, 4,
7549                                                     target_size));
7550                break;
7551
7552        case offsetof(struct __sk_buff, protocol):
7553                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
7554                                      bpf_target_off(struct sk_buff, protocol, 2,
7555                                                     target_size));
7556                break;
7557
7558        case offsetof(struct __sk_buff, vlan_proto):
7559                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
7560                                      bpf_target_off(struct sk_buff, vlan_proto, 2,
7561                                                     target_size));
7562                break;
7563
7564        case offsetof(struct __sk_buff, priority):
7565                if (type == BPF_WRITE)
7566                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
7567                                              bpf_target_off(struct sk_buff, priority, 4,
7568                                                             target_size));
7569                else
7570                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7571                                              bpf_target_off(struct sk_buff, priority, 4,
7572                                                             target_size));
7573                break;
7574
7575        case offsetof(struct __sk_buff, ingress_ifindex):
7576                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7577                                      bpf_target_off(struct sk_buff, skb_iif, 4,
7578                                                     target_size));
7579                break;
7580
7581        case offsetof(struct __sk_buff, ifindex):
7582                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
7583                                      si->dst_reg, si->src_reg,
7584                                      offsetof(struct sk_buff, dev));
7585                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
7586                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
7587                                      bpf_target_off(struct net_device, ifindex, 4,
7588                                                     target_size));
7589                break;
7590
7591        case offsetof(struct __sk_buff, hash):
7592                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7593                                      bpf_target_off(struct sk_buff, hash, 4,
7594                                                     target_size));
7595                break;
7596
7597        case offsetof(struct __sk_buff, mark):
7598                if (type == BPF_WRITE)
7599                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
7600                                              bpf_target_off(struct sk_buff, mark, 4,
7601                                                             target_size));
7602                else
7603                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7604                                              bpf_target_off(struct sk_buff, mark, 4,
7605                                                             target_size));
7606                break;
7607
7608        case offsetof(struct __sk_buff, pkt_type):
7609                *target_size = 1;
7610                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
7611                                      PKT_TYPE_OFFSET());
7612                *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
7613#ifdef __BIG_ENDIAN_BITFIELD
7614                *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
7615#endif
7616                break;
7617
7618        case offsetof(struct __sk_buff, queue_mapping):
7619                if (type == BPF_WRITE) {
7620                        *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
7621                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
7622                                              bpf_target_off(struct sk_buff,
7623                                                             queue_mapping,
7624                                                             2, target_size));
7625                } else {
7626                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
7627                                              bpf_target_off(struct sk_buff,
7628                                                             queue_mapping,
7629                                                             2, target_size));
7630                }
7631                break;
7632
7633        case offsetof(struct __sk_buff, vlan_present):
7634                *target_size = 1;
7635                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
7636                                      PKT_VLAN_PRESENT_OFFSET());
7637                if (PKT_VLAN_PRESENT_BIT)
7638                        *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT);
7639                if (PKT_VLAN_PRESENT_BIT < 7)
7640                        *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
7641                break;
7642
7643        case offsetof(struct __sk_buff, vlan_tci):
7644                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
7645                                      bpf_target_off(struct sk_buff, vlan_tci, 2,
7646                                                     target_size));
7647                break;
7648
7649        case offsetof(struct __sk_buff, cb[0]) ...
7650             offsetofend(struct __sk_buff, cb[4]) - 1:
7651                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
7652                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
7653                              offsetof(struct qdisc_skb_cb, data)) %
7654                             sizeof(__u64));
7655
7656                prog->cb_access = 1;
7657                off  = si->off;
7658                off -= offsetof(struct __sk_buff, cb[0]);
7659                off += offsetof(struct sk_buff, cb);
7660                off += offsetof(struct qdisc_skb_cb, data);
7661                if (type == BPF_WRITE)
7662                        *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
7663                                              si->src_reg, off);
7664                else
7665                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
7666                                              si->src_reg, off);
7667                break;
7668
7669        case offsetof(struct __sk_buff, tc_classid):
7670                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
7671
7672                off  = si->off;
7673                off -= offsetof(struct __sk_buff, tc_classid);
7674                off += offsetof(struct sk_buff, cb);
7675                off += offsetof(struct qdisc_skb_cb, tc_classid);
7676                *target_size = 2;
7677                if (type == BPF_WRITE)
7678                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
7679                                              si->src_reg, off);
7680                else
7681                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
7682                                              si->src_reg, off);
7683                break;
7684
7685        case offsetof(struct __sk_buff, data):
7686                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
7687                                      si->dst_reg, si->src_reg,
7688                                      offsetof(struct sk_buff, data));
7689                break;
7690
7691        case offsetof(struct __sk_buff, data_meta):
7692                off  = si->off;
7693                off -= offsetof(struct __sk_buff, data_meta);
7694                off += offsetof(struct sk_buff, cb);
7695                off += offsetof(struct bpf_skb_data_end, data_meta);
7696                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
7697                                      si->src_reg, off);
7698                break;
7699
7700        case offsetof(struct __sk_buff, data_end):
7701                off  = si->off;
7702                off -= offsetof(struct __sk_buff, data_end);
7703                off += offsetof(struct sk_buff, cb);
7704                off += offsetof(struct bpf_skb_data_end, data_end);
7705                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
7706                                      si->src_reg, off);
7707                break;
7708
7709        case offsetof(struct __sk_buff, tc_index):
7710#ifdef CONFIG_NET_SCHED
7711                if (type == BPF_WRITE)
7712                        *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
7713                                              bpf_target_off(struct sk_buff, tc_index, 2,
7714                                                             target_size));
7715                else
7716                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
7717                                              bpf_target_off(struct sk_buff, tc_index, 2,
7718                                                             target_size));
7719#else
7720                *target_size = 2;
7721                if (type == BPF_WRITE)
7722                        *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
7723                else
7724                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
7725#endif
7726                break;
7727
7728        case offsetof(struct __sk_buff, napi_id):
7729#if defined(CONFIG_NET_RX_BUSY_POLL)
7730                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7731                                      bpf_target_off(struct sk_buff, napi_id, 4,
7732                                                     target_size));
7733                *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
7734                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
7735#else
7736                *target_size = 4;
7737                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
7738#endif
7739                break;
7740        case offsetof(struct __sk_buff, family):
7741                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
7742
7743                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7744                                      si->dst_reg, si->src_reg,
7745                                      offsetof(struct sk_buff, sk));
7746                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
7747                                      bpf_target_off(struct sock_common,
7748                                                     skc_family,
7749                                                     2, target_size));
7750                break;
7751        case offsetof(struct __sk_buff, remote_ip4):
7752                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
7753
7754                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7755                                      si->dst_reg, si->src_reg,
7756                                      offsetof(struct sk_buff, sk));
7757                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
7758                                      bpf_target_off(struct sock_common,
7759                                                     skc_daddr,
7760                                                     4, target_size));
7761                break;
7762        case offsetof(struct __sk_buff, local_ip4):
7763                BUILD_BUG_ON(sizeof_field(struct sock_common,
7764                                          skc_rcv_saddr) != 4);
7765
7766                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7767                                      si->dst_reg, si->src_reg,
7768                                      offsetof(struct sk_buff, sk));
7769                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
7770                                      bpf_target_off(struct sock_common,
7771                                                     skc_rcv_saddr,
7772                                                     4, target_size));
7773                break;
7774        case offsetof(struct __sk_buff, remote_ip6[0]) ...
7775             offsetof(struct __sk_buff, remote_ip6[3]):
7776#if IS_ENABLED(CONFIG_IPV6)
7777                BUILD_BUG_ON(sizeof_field(struct sock_common,
7778                                          skc_v6_daddr.s6_addr32[0]) != 4);
7779
7780                off = si->off;
7781                off -= offsetof(struct __sk_buff, remote_ip6[0]);
7782
7783                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7784                                      si->dst_reg, si->src_reg,
7785                                      offsetof(struct sk_buff, sk));
7786                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
7787                                      offsetof(struct sock_common,
7788                                               skc_v6_daddr.s6_addr32[0]) +
7789                                      off);
7790#else
7791                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
7792#endif
7793                break;
7794        case offsetof(struct __sk_buff, local_ip6[0]) ...
7795             offsetof(struct __sk_buff, local_ip6[3]):
7796#if IS_ENABLED(CONFIG_IPV6)
7797                BUILD_BUG_ON(sizeof_field(struct sock_common,
7798                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);
7799
7800                off = si->off;
7801                off -= offsetof(struct __sk_buff, local_ip6[0]);
7802
7803                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7804                                      si->dst_reg, si->src_reg,
7805                                      offsetof(struct sk_buff, sk));
7806                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
7807                                      offsetof(struct sock_common,
7808                                               skc_v6_rcv_saddr.s6_addr32[0]) +
7809                                      off);
7810#else
7811                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
7812#endif
7813                break;
7814
7815        case offsetof(struct __sk_buff, remote_port):
7816                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
7817
7818                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7819                                      si->dst_reg, si->src_reg,
7820                                      offsetof(struct sk_buff, sk));
7821                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
7822                                      bpf_target_off(struct sock_common,
7823                                                     skc_dport,
7824                                                     2, target_size));
7825#ifndef __BIG_ENDIAN_BITFIELD
7826                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
7827#endif
7828                break;
7829
7830        case offsetof(struct __sk_buff, local_port):
7831                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
7832
7833                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7834                                      si->dst_reg, si->src_reg,
7835                                      offsetof(struct sk_buff, sk));
7836                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
7837                                      bpf_target_off(struct sock_common,
7838                                                     skc_num, 2, target_size));
7839                break;
7840
7841        case offsetof(struct __sk_buff, tstamp):
7842                BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
7843
7844                if (type == BPF_WRITE)
7845                        *insn++ = BPF_STX_MEM(BPF_DW,
7846                                              si->dst_reg, si->src_reg,
7847                                              bpf_target_off(struct sk_buff,
7848                                                             tstamp, 8,
7849                                                             target_size));
7850                else
7851                        *insn++ = BPF_LDX_MEM(BPF_DW,
7852                                              si->dst_reg, si->src_reg,
7853                                              bpf_target_off(struct sk_buff,
7854                                                             tstamp, 8,
7855                                                             target_size));
7856                break;
7857
7858        case offsetof(struct __sk_buff, gso_segs):
7859                insn = bpf_convert_shinfo_access(si, insn);
7860                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
7861                                      si->dst_reg, si->dst_reg,
7862                                      bpf_target_off(struct skb_shared_info,
7863                                                     gso_segs, 2,
7864                                                     target_size));
7865                break;
7866        case offsetof(struct __sk_buff, gso_size):
7867                insn = bpf_convert_shinfo_access(si, insn);
7868                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
7869                                      si->dst_reg, si->dst_reg,
7870                                      bpf_target_off(struct skb_shared_info,
7871                                                     gso_size, 2,
7872                                                     target_size));
7873                break;
7874        case offsetof(struct __sk_buff, wire_len):
7875                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
7876
7877                off = si->off;
7878                off -= offsetof(struct __sk_buff, wire_len);
7879                off += offsetof(struct sk_buff, cb);
7880                off += offsetof(struct qdisc_skb_cb, pkt_len);
7881                *target_size = 4;
7882                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
7883                break;
7884
7885        case offsetof(struct __sk_buff, sk):
7886                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
7887                                      si->dst_reg, si->src_reg,
7888                                      offsetof(struct sk_buff, sk));
7889                break;
7890        }
7891
7892        return insn - insn_buf;
7893}
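/*
 * Worked example of the rewrite above (a sketch, not generated output): a
 * program instruction reading __sk_buff->len,
 *
 *	r2 = *(u32 *)(r1 + offsetof(struct __sk_buff, len));
 *
 * is replaced by the "len" case at the top of the function with
 *
 *	r2 = *(u32 *)(r1 + offsetof(struct sk_buff, len));
 *
 * since at run time r1 holds the real struct sk_buff pointer, and
 * bpf_target_off() resolves to that offset while recording the 4-byte
 * target size.
 */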
7894
7895u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
7896                                const struct bpf_insn *si,
7897                                struct bpf_insn *insn_buf,
7898                                struct bpf_prog *prog, u32 *target_size)
7899{
7900        struct bpf_insn *insn = insn_buf;
7901        int off;
7902
7903        switch (si->off) {
7904        case offsetof(struct bpf_sock, bound_dev_if):
7905                BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
7906
7907                if (type == BPF_WRITE)
7908                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
7909                                        offsetof(struct sock, sk_bound_dev_if));
7910                else
7911                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7912                                      offsetof(struct sock, sk_bound_dev_if));
7913                break;
7914
7915        case offsetof(struct bpf_sock, mark):
7916                BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
7917
7918                if (type == BPF_WRITE)
7919                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
7920                                        offsetof(struct sock, sk_mark));
7921                else
7922                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7923                                      offsetof(struct sock, sk_mark));
7924                break;
7925
7926        case offsetof(struct bpf_sock, priority):
7927                BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
7928
7929                if (type == BPF_WRITE)
7930                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
7931                                        offsetof(struct sock, sk_priority));
7932                else
7933                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7934                                      offsetof(struct sock, sk_priority));
7935                break;
7936
7937        case offsetof(struct bpf_sock, family):
7938                *insn++ = BPF_LDX_MEM(
7939                        BPF_FIELD_SIZEOF(struct sock_common, skc_family),
7940                        si->dst_reg, si->src_reg,
7941                        bpf_target_off(struct sock_common,
7942                                       skc_family,
7943                                       sizeof_field(struct sock_common,
7944                                                    skc_family),
7945                                       target_size));
7946                break;
7947
7948        case offsetof(struct bpf_sock, type):
7949                *insn++ = BPF_LDX_MEM(
7950                        BPF_FIELD_SIZEOF(struct sock, sk_type),
7951                        si->dst_reg, si->src_reg,
7952                        bpf_target_off(struct sock, sk_type,
7953                                       sizeof_field(struct sock, sk_type),
7954                                       target_size));
7955                break;
7956
7957        case offsetof(struct bpf_sock, protocol):
7958                *insn++ = BPF_LDX_MEM(
7959                        BPF_FIELD_SIZEOF(struct sock, sk_protocol),
7960                        si->dst_reg, si->src_reg,
7961                        bpf_target_off(struct sock, sk_protocol,
7962                                       sizeof_field(struct sock, sk_protocol),
7963                                       target_size));
7964                break;
7965
7966        case offsetof(struct bpf_sock, src_ip4):
7967                *insn++ = BPF_LDX_MEM(
7968                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
7969                        bpf_target_off(struct sock_common, skc_rcv_saddr,
7970                                       sizeof_field(struct sock_common,
7971                                                    skc_rcv_saddr),
7972                                       target_size));
7973                break;
7974
7975        case offsetof(struct bpf_sock, dst_ip4):
7976                *insn++ = BPF_LDX_MEM(
7977                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
7978                        bpf_target_off(struct sock_common, skc_daddr,
7979                                       sizeof_field(struct sock_common,
7980                                                    skc_daddr),
7981                                       target_size));
7982                break;
7983
7984        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
7985#if IS_ENABLED(CONFIG_IPV6)
7986                off = si->off;
7987                off -= offsetof(struct bpf_sock, src_ip6[0]);
7988                *insn++ = BPF_LDX_MEM(
7989                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
7990                        bpf_target_off(
7991                                struct sock_common,
7992                                skc_v6_rcv_saddr.s6_addr32[0],
7993                                sizeof_field(struct sock_common,
7994                                             skc_v6_rcv_saddr.s6_addr32[0]),
7995                                target_size) + off);
7996#else
7997                (void)off;
7998                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
7999#endif
8000                break;
8001
8002        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
8003#if IS_ENABLED(CONFIG_IPV6)
8004                off = si->off;
8005                off -= offsetof(struct bpf_sock, dst_ip6[0]);
8006                *insn++ = BPF_LDX_MEM(
8007                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
8008                        bpf_target_off(struct sock_common,
8009                                       skc_v6_daddr.s6_addr32[0],
8010                                       sizeof_field(struct sock_common,
8011                                                    skc_v6_daddr.s6_addr32[0]),
8012                                       target_size) + off);
8013#else
8014                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
8015                *target_size = 4;
8016#endif
8017                break;
8018
8019        case offsetof(struct bpf_sock, src_port):
8020                *insn++ = BPF_LDX_MEM(
8021                        BPF_FIELD_SIZEOF(struct sock_common, skc_num),
8022                        si->dst_reg, si->src_reg,
8023                        bpf_target_off(struct sock_common, skc_num,
8024                                       sizeof_field(struct sock_common,
8025                                                    skc_num),
8026                                       target_size));
8027                break;
8028
8029        case offsetof(struct bpf_sock, dst_port):
8030                *insn++ = BPF_LDX_MEM(
8031                        BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
8032                        si->dst_reg, si->src_reg,
8033                        bpf_target_off(struct sock_common, skc_dport,
8034                                       sizeof_field(struct sock_common,
8035                                                    skc_dport),
8036                                       target_size));
8037                break;
8038
8039        case offsetof(struct bpf_sock, state):
8040                *insn++ = BPF_LDX_MEM(
8041                        BPF_FIELD_SIZEOF(struct sock_common, skc_state),
8042                        si->dst_reg, si->src_reg,
8043                        bpf_target_off(struct sock_common, skc_state,
8044                                       sizeof_field(struct sock_common,
8045                                                    skc_state),
8046                                       target_size));
8047                break;
8048        case offsetof(struct bpf_sock, rx_queue_mapping):
8049#ifdef CONFIG_XPS
8050                *insn++ = BPF_LDX_MEM(
8051                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
8052                        si->dst_reg, si->src_reg,
8053                        bpf_target_off(struct sock, sk_rx_queue_mapping,
8054                                       sizeof_field(struct sock,
8055                                                    sk_rx_queue_mapping),
8056                                       target_size));
8057                *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
8058                                      1);
8059                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
8060#else
8061                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
8062                *target_size = 2;
8063#endif
8064                break;
8065        }
8066
8067        return insn - insn_buf;
8068}
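/*
 * Offset arithmetic for the src_ip6/dst_ip6 ranges above, as a hedged worked
 * example (assuming CONFIG_IPV6): a 4-byte read of src_ip6[2] has
 *
 *	off = si->off - offsetof(struct bpf_sock, src_ip6[0]) = 8
 *
 * so the emitted load targets skc_v6_rcv_saddr.s6_addr32[0] plus 8 bytes,
 * i.e. s6_addr32[2] of the socket's IPv6 source address.
 */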
8069
8070static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
8071                                         const struct bpf_insn *si,
8072                                         struct bpf_insn *insn_buf,
8073                                         struct bpf_prog *prog, u32 *target_size)
8074{
8075        struct bpf_insn *insn = insn_buf;
8076
8077        switch (si->off) {
8078        case offsetof(struct __sk_buff, ifindex):
8079                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
8080                                      si->dst_reg, si->src_reg,
8081                                      offsetof(struct sk_buff, dev));
8082                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8083                                      bpf_target_off(struct net_device, ifindex, 4,
8084                                                     target_size));
8085                break;
8086        default:
8087                return bpf_convert_ctx_access(type, si, insn_buf, prog,
8088                                              target_size);
8089        }
8090
8091        return insn - insn_buf;
8092}
8093
8094static u32 xdp_convert_ctx_access(enum bpf_access_type type,
8095                                  const struct bpf_insn *si,
8096                                  struct bpf_insn *insn_buf,
8097                                  struct bpf_prog *prog, u32 *target_size)
8098{
8099        struct bpf_insn *insn = insn_buf;
8100
8101        switch (si->off) {
8102        case offsetof(struct xdp_md, data):
8103                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
8104                                      si->dst_reg, si->src_reg,
8105                                      offsetof(struct xdp_buff, data));
8106                break;
8107        case offsetof(struct xdp_md, data_meta):
8108                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
8109                                      si->dst_reg, si->src_reg,
8110                                      offsetof(struct xdp_buff, data_meta));
8111                break;
8112        case offsetof(struct xdp_md, data_end):
8113                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
8114                                      si->dst_reg, si->src_reg,
8115                                      offsetof(struct xdp_buff, data_end));
8116                break;
8117        case offsetof(struct xdp_md, ingress_ifindex):
8118                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
8119                                      si->dst_reg, si->src_reg,
8120                                      offsetof(struct xdp_buff, rxq));
8121                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
8122                                      si->dst_reg, si->dst_reg,
8123                                      offsetof(struct xdp_rxq_info, dev));
8124                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8125                                      offsetof(struct net_device, ifindex));
8126                break;
8127        case offsetof(struct xdp_md, rx_queue_index):
8128                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
8129                                      si->dst_reg, si->src_reg,
8130                                      offsetof(struct xdp_buff, rxq));
8131                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8132                                      offsetof(struct xdp_rxq_info,
8133                                               queue_index));
8134                break;
8135        case offsetof(struct xdp_md, egress_ifindex):
8136                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
8137                                      si->dst_reg, si->src_reg,
8138                                      offsetof(struct xdp_buff, txq));
8139                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
8140                                      si->dst_reg, si->dst_reg,
8141                                      offsetof(struct xdp_txq_info, dev));
8142                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8143                                      offsetof(struct net_device, ifindex));
8144                break;
8145        }
8146
8147        return insn - insn_buf;
8148}
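/*
 * The ingress_ifindex/egress_ifindex cases above emit a three-load pointer
 * chase; a hedged C equivalent:
 *
 *	ifindex = xdp->rxq->dev->ifindex;	- xdp_md->ingress_ifindex
 *	ifindex = xdp->txq->dev->ifindex;	- xdp_md->egress_ifindex
 *
 * where xdp is the struct xdp_buff backing the program's xdp_md context.
 */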
8149
8150/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
8151 * context Structure, F is Field in context structure that contains a pointer
8152 * to Nested Structure of type NS that has the field NF.
8153 *
8154 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
8155 * make sure that SIZE is not greater than the actual size of S.F.NF.
8156 *
8157 * If offset OFF is provided, the load happens from that offset relative to
8158 * offset of NF.
8159 */
8160#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)          \
8161        do {                                                                   \
8162                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
8163                                      si->src_reg, offsetof(S, F));            \
8164                *insn++ = BPF_LDX_MEM(                                         \
8165                        SIZE, si->dst_reg, si->dst_reg,                        \
8166                        bpf_target_off(NS, NF, sizeof_field(NS, NF),           \
8167                                       target_size)                            \
8168                                + OFF);                                        \
8169        } while (0)
8170
8171#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)                              \
8172        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,                     \
8173                                             BPF_FIELD_SIZEOF(NS, NF), 0)
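/*
 * Sketch of one expansion (illustrative, not literal output): the use of
 *
 *	SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 *				    struct sock, sk, sk_family);
 *
 * further down in this file emits roughly
 *
 *	dst = *(struct sock **)(src + offsetof(struct bpf_sock_addr_kern, sk));
 *	dst = *(u16 *)(dst + offsetof(struct sock, sk_family));
 *
 * i.e. first dereference the context field, then load the nested member with
 * a load sized by BPF_FIELD_SIZEOF().
 */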
8174
8175/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
8176 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
8177 *
8178 * In addition it uses Temporary Field TF (member of struct S) as the 3rd
8179 * "register" since the two registers available in convert_ctx_access are not
8180 * enough: we can override neither SRC, since it contains the value to store,
8181 * nor DST, since it contains the pointer to context that may be used by later
8182 * instructions. But we need a temporary place to save the pointer to the
8183 * nested structure whose field we want to store to.
8184 */
8185#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)          \
8186        do {                                                                   \
8187                int tmp_reg = BPF_REG_9;                                       \
8188                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)          \
8189                        --tmp_reg;                                             \
8190                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)          \
8191                        --tmp_reg;                                             \
8192                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,            \
8193                                      offsetof(S, TF));                        \
8194                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,         \
8195                                      si->dst_reg, offsetof(S, F));            \
8196                *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg,              \
8197                        bpf_target_off(NS, NF, sizeof_field(NS, NF),           \
8198                                       target_size)                            \
8199                                + OFF);                                        \
8200                *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,            \
8201                                      offsetof(S, TF));                        \
8202        } while (0)
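/*
 * Sketch of what the store macro above emits (hedged summary, in order):
 *
 *	1. spill tmp_reg into the ctx scratch slot ctx->TF (BPF_STX_MEM);
 *	2. load the nested-structure pointer ctx->F into tmp_reg;
 *	3. store src_reg at tmp_reg + offsetof(NS, NF) + OFF;
 *	4. restore tmp_reg from ctx->TF.
 *
 * tmp_reg starts at BPF_REG_9 and is decremented until it collides with
 * neither si->src_reg nor si->dst_reg.
 */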
8203
8204#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
8205                                                      TF)                      \
8206        do {                                                                   \
8207                if (type == BPF_WRITE) {                                       \
8208                        SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
8209                                                         OFF, TF);             \
8210                } else {                                                       \
8211                        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(                  \
8212                                S, NS, F, NF, SIZE, OFF);  \
8213                }                                                              \
8214        } while (0)
8215
8216#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF)                 \
8217        SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(                         \
8218                S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
8219
8220static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
8221                                        const struct bpf_insn *si,
8222                                        struct bpf_insn *insn_buf,
8223                                        struct bpf_prog *prog, u32 *target_size)
8224{
8225        int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
8226        struct bpf_insn *insn = insn_buf;
8227
8228        switch (si->off) {
8229        case offsetof(struct bpf_sock_addr, user_family):
8230                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
8231                                            struct sockaddr, uaddr, sa_family);
8232                break;
8233
8234        case offsetof(struct bpf_sock_addr, user_ip4):
8235                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
8236                        struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
8237                        sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
8238                break;
8239
8240        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
8241                off = si->off;
8242                off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
8243                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
8244                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
8245                        sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
8246                        tmp_reg);
8247                break;
8248
8249        case offsetof(struct bpf_sock_addr, user_port):
8250                /* To get the port we need to know sa_family first and then
8251                 * treat sockaddr as either sockaddr_in or sockaddr_in6.
8252                 * We can simplify, though, since the port field has the same
8253                 * offset and size in both structures.
8254                 * The BUILD_BUG_ONs below check this invariant, so just one
8255                 * of the structures can be used.
8256                 */
8257                BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
8258                             offsetof(struct sockaddr_in6, sin6_port));
8259                BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
8260                             sizeof_field(struct sockaddr_in6, sin6_port));
8261                /* Account for sin6_port being smaller than user_port. */
8262                port_size = min(port_size, BPF_LDST_BYTES(si));
8263                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
8264                        struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
8265                        sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
8266                break;
8267
8268        case offsetof(struct bpf_sock_addr, family):
8269                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
8270                                            struct sock, sk, sk_family);
8271                break;
8272
8273        case offsetof(struct bpf_sock_addr, type):
8274                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
8275                                            struct sock, sk, sk_type);
8276                break;
8277
8278        case offsetof(struct bpf_sock_addr, protocol):
8279                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
8280                                            struct sock, sk, sk_protocol);
8281                break;
8282
8283        case offsetof(struct bpf_sock_addr, msg_src_ip4):
8284                /* Treat t_ctx as struct in_addr for msg_src_ip4. */
8285                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
8286                        struct bpf_sock_addr_kern, struct in_addr, t_ctx,
8287                        s_addr, BPF_SIZE(si->code), 0, tmp_reg);
8288                break;
8289
8290        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
8291                                msg_src_ip6[3]):
8292                off = si->off;
8293                off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
8294                /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
8295                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
8296                        struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
8297                        s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
8298                break;
8299        case offsetof(struct bpf_sock_addr, sk):
8300                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
8301                                      si->dst_reg, si->src_reg,
8302                                      offsetof(struct bpf_sock_addr_kern, sk));
8303                break;
8304        }
8305
8306        return insn - insn_buf;
8307}
8308
8309static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
8310                                       const struct bpf_insn *si,
8311                                       struct bpf_insn *insn_buf,
8312                                       struct bpf_prog *prog,
8313                                       u32 *target_size)
8314{
8315        struct bpf_insn *insn = insn_buf;
8316        int off;
8317
8318/* Helper macro for adding read access to tcp_sock or sock fields. */
8319#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                         \
8320        do {                                                                  \
8321                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
8322                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                   \
8323                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
8324                if (si->dst_reg == reg || si->src_reg == reg)                 \
8325                        reg--;                                                \
8326                if (si->dst_reg == reg || si->src_reg == reg)                 \
8327                        reg--;                                                \
8328                if (si->dst_reg == si->src_reg) {                             \
8329                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,       \
8330                                          offsetof(struct bpf_sock_ops_kern,  \
8331                                          temp));                             \
8332                        fullsock_reg = reg;                                   \
8333                        jmp += 2;                                             \
8334                }                                                             \
8335                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8336                                                struct bpf_sock_ops_kern,     \
8337                                                is_fullsock),                 \
8338                                      fullsock_reg, si->src_reg,              \
8339                                      offsetof(struct bpf_sock_ops_kern,      \
8340                                               is_fullsock));                 \
8341                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);         \
8342                if (si->dst_reg == si->src_reg)                               \
8343                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
8344                                      offsetof(struct bpf_sock_ops_kern,      \
8345                                      temp));                                 \
8346                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8347                                                struct bpf_sock_ops_kern, sk),\
8348                                      si->dst_reg, si->src_reg,               \
8349                                      offsetof(struct bpf_sock_ops_kern, sk));\
8350                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,                   \
8351                                                       OBJ_FIELD),            \
8352                                      si->dst_reg, si->dst_reg,               \
8353                                      offsetof(OBJ, OBJ_FIELD));              \
8354                if (si->dst_reg == si->src_reg) {                             \
8355                        *insn++ = BPF_JMP_A(1);                               \
8356                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
8357                                      offsetof(struct bpf_sock_ops_kern,      \
8358                                      temp));                                 \
8359                }                                                             \
8360        } while (0)
8361
8362#define SOCK_OPS_GET_SK()                                                             \
8363        do {                                                                  \
8364                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1;     \
8365                if (si->dst_reg == reg || si->src_reg == reg)                 \
8366                        reg--;                                                \
8367                if (si->dst_reg == reg || si->src_reg == reg)                 \
8368                        reg--;                                                \
8369                if (si->dst_reg == si->src_reg) {                             \
8370                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,       \
8371                                          offsetof(struct bpf_sock_ops_kern,  \
8372                                          temp));                             \
8373                        fullsock_reg = reg;                                   \
8374                        jmp += 2;                                             \
8375                }                                                             \
8376                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8377                                                struct bpf_sock_ops_kern,     \
8378                                                is_fullsock),                 \
8379                                      fullsock_reg, si->src_reg,              \
8380                                      offsetof(struct bpf_sock_ops_kern,      \
8381                                               is_fullsock));                 \
8382                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);         \
8383                if (si->dst_reg == si->src_reg)                               \
8384                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
8385                                      offsetof(struct bpf_sock_ops_kern,      \
8386                                      temp));                                 \
8387                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8388                                                struct bpf_sock_ops_kern, sk),\
8389                                      si->dst_reg, si->src_reg,               \
8390                                      offsetof(struct bpf_sock_ops_kern, sk));\
8391                if (si->dst_reg == si->src_reg) {                             \
8392                        *insn++ = BPF_JMP_A(1);                               \
8393                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,       \
8394                                      offsetof(struct bpf_sock_ops_kern,      \
8395                                      temp));                                 \
8396                }                                                             \
8397        } while (0)
8398
8399#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
8400                SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
8401
8402/* Helper macro for adding write access to tcp_sock or sock fields.
8403 * The macro is called with two registers: dst_reg, which contains a pointer
8404 * to ctx (context), and src_reg, which contains the value that should be
8405 * stored. However, we need an additional register since we cannot overwrite
8406 * dst_reg because it may be used later in the program.
8407 * Instead we "borrow" one of the other registers. We first save its value
8408 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
8409 * it at the end of the macro.
8410 */
8411#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                         \
8412        do {                                                                  \
8413                int reg = BPF_REG_9;                                          \
8414                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                   \
8415                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
8416                if (si->dst_reg == reg || si->src_reg == reg)                 \
8417                        reg--;                                                \
8418                if (si->dst_reg == reg || si->src_reg == reg)                 \
8419                        reg--;                                                \
8420                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,               \
8421                                      offsetof(struct bpf_sock_ops_kern,      \
8422                                               temp));                        \
8423                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8424                                                struct bpf_sock_ops_kern,     \
8425                                                is_fullsock),                 \
8426                                      reg, si->dst_reg,                       \
8427                                      offsetof(struct bpf_sock_ops_kern,      \
8428                                               is_fullsock));                 \
8429                *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);                    \
8430                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                       \
8431                                                struct bpf_sock_ops_kern, sk),\
8432                                      reg, si->dst_reg,                       \
8433                                      offsetof(struct bpf_sock_ops_kern, sk));\
8434                *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),       \
8435                                      reg, si->src_reg,                       \
8436                                      offsetof(OBJ, OBJ_FIELD));              \
8437                *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,               \
8438                                      offsetof(struct bpf_sock_ops_kern,      \
8439                                               temp));                        \
8440        } while (0)
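
/* For illustration only (this sequence is not spelled out anywhere in the
 * sources): given a program store such as "skops->sk_txhash = X",
 * SOCK_OPS_SET_FIELD() above effectively emits, with DST holding the ctx
 * pointer, SRC holding the value and R being the borrowed register:
 *
 *   *(u64 *)(DST + offsetof(struct bpf_sock_ops_kern, temp)) = R  // save R
 *   R = *(DST + offsetof(struct bpf_sock_ops_kern, is_fullsock))
 *   if (R == 0) goto restore        // request/timewait sock: skip the store
 *   R = *(DST + offsetof(struct bpf_sock_ops_kern, sk))
 *   *(R + offsetof(struct sock, sk_txhash)) = SRC                 // the store
 * restore:
 *   R = *(u64 *)(DST + offsetof(struct bpf_sock_ops_kern, temp))  // restore R
 */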
8441
8442#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)            \
8443        do {                                                                  \
8444                if (TYPE == BPF_WRITE)                                        \
8445                        SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);        \
8446                else                                                          \
8447                        SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);        \
8448        } while (0)
8449
8450        if (insn > insn_buf)
8451                return insn - insn_buf;
8452
8453        switch (si->off) {
8454        case offsetof(struct bpf_sock_ops, op) ...
8455             offsetof(struct bpf_sock_ops, replylong[3]):
8456                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, op) !=
8457                             sizeof_field(struct bpf_sock_ops_kern, op));
8458                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
8459                             sizeof_field(struct bpf_sock_ops_kern, reply));
8460                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
8461                             sizeof_field(struct bpf_sock_ops_kern, replylong));
8462                off = si->off;
8463                off -= offsetof(struct bpf_sock_ops, op);
8464                off += offsetof(struct bpf_sock_ops_kern, op);
8465                if (type == BPF_WRITE)
8466                        *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
8467                                              off);
8468                else
8469                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
8470                                              off);
8471                break;
8472
8473        case offsetof(struct bpf_sock_ops, family):
8474                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
8475
8476                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8477                                              struct bpf_sock_ops_kern, sk),
8478                                      si->dst_reg, si->src_reg,
8479                                      offsetof(struct bpf_sock_ops_kern, sk));
8480                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8481                                      offsetof(struct sock_common, skc_family));
8482                break;
8483
8484        case offsetof(struct bpf_sock_ops, remote_ip4):
8485                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
8486
8487                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8488                                                struct bpf_sock_ops_kern, sk),
8489                                      si->dst_reg, si->src_reg,
8490                                      offsetof(struct bpf_sock_ops_kern, sk));
8491                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8492                                      offsetof(struct sock_common, skc_daddr));
8493                break;
8494
8495        case offsetof(struct bpf_sock_ops, local_ip4):
8496                BUILD_BUG_ON(sizeof_field(struct sock_common,
8497                                          skc_rcv_saddr) != 4);
8498
8499                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8500                                              struct bpf_sock_ops_kern, sk),
8501                                      si->dst_reg, si->src_reg,
8502                                      offsetof(struct bpf_sock_ops_kern, sk));
8503                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8504                                      offsetof(struct sock_common,
8505                                               skc_rcv_saddr));
8506                break;
8507
8508        case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
8509             offsetof(struct bpf_sock_ops, remote_ip6[3]):
8510#if IS_ENABLED(CONFIG_IPV6)
8511                BUILD_BUG_ON(sizeof_field(struct sock_common,
8512                                          skc_v6_daddr.s6_addr32[0]) != 4);
8513
8514                off = si->off;
8515                off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
8516                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8517                                                struct bpf_sock_ops_kern, sk),
8518                                      si->dst_reg, si->src_reg,
8519                                      offsetof(struct bpf_sock_ops_kern, sk));
8520                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8521                                      offsetof(struct sock_common,
8522                                               skc_v6_daddr.s6_addr32[0]) +
8523                                      off);
8524#else
8525                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
8526#endif
8527                break;
8528
8529        case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
8530             offsetof(struct bpf_sock_ops, local_ip6[3]):
8531#if IS_ENABLED(CONFIG_IPV6)
8532                BUILD_BUG_ON(sizeof_field(struct sock_common,
8533                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);
8534
8535                off = si->off;
8536                off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
8537                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8538                                                struct bpf_sock_ops_kern, sk),
8539                                      si->dst_reg, si->src_reg,
8540                                      offsetof(struct bpf_sock_ops_kern, sk));
8541                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8542                                      offsetof(struct sock_common,
8543                                               skc_v6_rcv_saddr.s6_addr32[0]) +
8544                                      off);
8545#else
8546                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
8547#endif
8548                break;
8549
8550        case offsetof(struct bpf_sock_ops, remote_port):
8551                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
8552
8553                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8554                                                struct bpf_sock_ops_kern, sk),
8555                                      si->dst_reg, si->src_reg,
8556                                      offsetof(struct bpf_sock_ops_kern, sk));
8557                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8558                                      offsetof(struct sock_common, skc_dport));
8559#ifndef __BIG_ENDIAN_BITFIELD
8560                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
8561#endif
8562                break;
8563
8564        case offsetof(struct bpf_sock_ops, local_port):
8565                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
8566
8567                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8568                                                struct bpf_sock_ops_kern, sk),
8569                                      si->dst_reg, si->src_reg,
8570                                      offsetof(struct bpf_sock_ops_kern, sk));
8571                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8572                                      offsetof(struct sock_common, skc_num));
8573                break;
8574
8575        case offsetof(struct bpf_sock_ops, is_fullsock):
8576                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8577                                                struct bpf_sock_ops_kern,
8578                                                is_fullsock),
8579                                      si->dst_reg, si->src_reg,
8580                                      offsetof(struct bpf_sock_ops_kern,
8581                                               is_fullsock));
8582                break;
8583
8584        case offsetof(struct bpf_sock_ops, state):
8585                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
8586
8587                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8588                                                struct bpf_sock_ops_kern, sk),
8589                                      si->dst_reg, si->src_reg,
8590                                      offsetof(struct bpf_sock_ops_kern, sk));
8591                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
8592                                      offsetof(struct sock_common, skc_state));
8593                break;
8594
8595        case offsetof(struct bpf_sock_ops, rtt_min):
8596                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
8597                             sizeof(struct minmax));
8598                BUILD_BUG_ON(sizeof(struct minmax) <
8599                             sizeof(struct minmax_sample));
8600
8601                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8602                                                struct bpf_sock_ops_kern, sk),
8603                                      si->dst_reg, si->src_reg,
8604                                      offsetof(struct bpf_sock_ops_kern, sk));
8605                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8606                                      offsetof(struct tcp_sock, rtt_min) +
8607                                      sizeof_field(struct minmax_sample, t));
8608                break;
8609
8610        case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
8611                SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
8612                                   struct tcp_sock);
8613                break;
8614
8615        case offsetof(struct bpf_sock_ops, sk_txhash):
8616                SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
8617                                          struct sock, type);
8618                break;
8619        case offsetof(struct bpf_sock_ops, snd_cwnd):
8620                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
8621                break;
8622        case offsetof(struct bpf_sock_ops, srtt_us):
8623                SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
8624                break;
8625        case offsetof(struct bpf_sock_ops, snd_ssthresh):
8626                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
8627                break;
8628        case offsetof(struct bpf_sock_ops, rcv_nxt):
8629                SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
8630                break;
8631        case offsetof(struct bpf_sock_ops, snd_nxt):
8632                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
8633                break;
8634        case offsetof(struct bpf_sock_ops, snd_una):
8635                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
8636                break;
8637        case offsetof(struct bpf_sock_ops, mss_cache):
8638                SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
8639                break;
8640        case offsetof(struct bpf_sock_ops, ecn_flags):
8641                SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
8642                break;
8643        case offsetof(struct bpf_sock_ops, rate_delivered):
8644                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
8645                break;
8646        case offsetof(struct bpf_sock_ops, rate_interval_us):
8647                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
8648                break;
8649        case offsetof(struct bpf_sock_ops, packets_out):
8650                SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
8651                break;
8652        case offsetof(struct bpf_sock_ops, retrans_out):
8653                SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
8654                break;
8655        case offsetof(struct bpf_sock_ops, total_retrans):
8656                SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
8657                break;
8658        case offsetof(struct bpf_sock_ops, segs_in):
8659                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
8660                break;
8661        case offsetof(struct bpf_sock_ops, data_segs_in):
8662                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
8663                break;
8664        case offsetof(struct bpf_sock_ops, segs_out):
8665                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
8666                break;
8667        case offsetof(struct bpf_sock_ops, data_segs_out):
8668                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
8669                break;
8670        case offsetof(struct bpf_sock_ops, lost_out):
8671                SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
8672                break;
8673        case offsetof(struct bpf_sock_ops, sacked_out):
8674                SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
8675                break;
8676        case offsetof(struct bpf_sock_ops, bytes_received):
8677                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
8678                break;
8679        case offsetof(struct bpf_sock_ops, bytes_acked):
8680                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
8681                break;
8682        case offsetof(struct bpf_sock_ops, sk):
8683                SOCK_OPS_GET_SK();
8684                break;
8685        }
8686        return insn - insn_buf;
8687}
8688
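/* A minimal sketch of a BPF_PROG_TYPE_SOCK_OPS program whose context reads
 * are rewritten by sock_ops_convert_ctx_access() above. It assumes a libbpf
 * toolchain and attachment to a cgroup with BPF_CGROUP_SOCK_OPS; program and
 * symbol names are purely illustrative.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("sockops")
 *	int log_estab(struct bpf_sock_ops *skops)
 *	{
 *		// op maps straight onto bpf_sock_ops_kern::op; srtt_us and
 *		// snd_cwnd go through SOCK_OPS_GET_TCP_SOCK_FIELD() and are
 *		// therefore guarded by is_fullsock.
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
 *			bpf_printk("srtt_us=%u cwnd=%u",
 *				   skops->srtt_us, skops->snd_cwnd);
 *		return 1;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
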
8689static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
8690                                     const struct bpf_insn *si,
8691                                     struct bpf_insn *insn_buf,
8692                                     struct bpf_prog *prog, u32 *target_size)
8693{
8694        struct bpf_insn *insn = insn_buf;
8695        int off;
8696
8697        switch (si->off) {
8698        case offsetof(struct __sk_buff, data_end):
8699                off  = si->off;
8700                off -= offsetof(struct __sk_buff, data_end);
8701                off += offsetof(struct sk_buff, cb);
8702                off += offsetof(struct tcp_skb_cb, bpf.data_end);
8703                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
8704                                      si->src_reg, off);
8705                break;
8706        default:
8707                return bpf_convert_ctx_access(type, si, insn_buf, prog,
8708                                              target_size);
8709        }
8710
8711        return insn - insn_buf;
8712}
8713
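/* A minimal sketch of an SK_SKB stream verdict program whose data_end read
 * is rewritten by sk_skb_convert_ctx_access() above into a load from
 * tcp_skb_cb::bpf.data_end. Assumes a libbpf toolchain and attachment to a
 * sockmap via BPF_SK_SKB_STREAM_VERDICT; names are illustrative.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("sk_skb/stream_verdict")
 *	int skb_verdict(struct __sk_buff *skb)
 *	{
 *		void *data = (void *)(long)skb->data;
 *		void *data_end = (void *)(long)skb->data_end;
 *
 *		// Drop anything with less than one byte of payload.
 *		if (data + 1 > data_end)
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
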
8714static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
8715                                     const struct bpf_insn *si,
8716                                     struct bpf_insn *insn_buf,
8717                                     struct bpf_prog *prog, u32 *target_size)
8718{
8719        struct bpf_insn *insn = insn_buf;
8720#if IS_ENABLED(CONFIG_IPV6)
8721        int off;
8722#endif
8723
8724        /* convert ctx uses the fact that the sg element is first in the struct */
8725        BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
8726
8727        switch (si->off) {
8728        case offsetof(struct sk_msg_md, data):
8729                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
8730                                      si->dst_reg, si->src_reg,
8731                                      offsetof(struct sk_msg, data));
8732                break;
8733        case offsetof(struct sk_msg_md, data_end):
8734                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
8735                                      si->dst_reg, si->src_reg,
8736                                      offsetof(struct sk_msg, data_end));
8737                break;
8738        case offsetof(struct sk_msg_md, family):
8739                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
8740
8741                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8742                                              struct sk_msg, sk),
8743                                      si->dst_reg, si->src_reg,
8744                                      offsetof(struct sk_msg, sk));
8745                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8746                                      offsetof(struct sock_common, skc_family));
8747                break;
8748
8749        case offsetof(struct sk_msg_md, remote_ip4):
8750                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
8751
8752                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8753                                                struct sk_msg, sk),
8754                                      si->dst_reg, si->src_reg,
8755                                      offsetof(struct sk_msg, sk));
8756                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8757                                      offsetof(struct sock_common, skc_daddr));
8758                break;
8759
8760        case offsetof(struct sk_msg_md, local_ip4):
8761                BUILD_BUG_ON(sizeof_field(struct sock_common,
8762                                          skc_rcv_saddr) != 4);
8763
8764                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8765                                              struct sk_msg, sk),
8766                                      si->dst_reg, si->src_reg,
8767                                      offsetof(struct sk_msg, sk));
8768                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8769                                      offsetof(struct sock_common,
8770                                               skc_rcv_saddr));
8771                break;
8772
8773        case offsetof(struct sk_msg_md, remote_ip6[0]) ...
8774             offsetof(struct sk_msg_md, remote_ip6[3]):
8775#if IS_ENABLED(CONFIG_IPV6)
8776                BUILD_BUG_ON(sizeof_field(struct sock_common,
8777                                          skc_v6_daddr.s6_addr32[0]) != 4);
8778
8779                off = si->off;
8780                off -= offsetof(struct sk_msg_md, remote_ip6[0]);
8781                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8782                                                struct sk_msg, sk),
8783                                      si->dst_reg, si->src_reg,
8784                                      offsetof(struct sk_msg, sk));
8785                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8786                                      offsetof(struct sock_common,
8787                                               skc_v6_daddr.s6_addr32[0]) +
8788                                      off);
8789#else
8790                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
8791#endif
8792                break;
8793
8794        case offsetof(struct sk_msg_md, local_ip6[0]) ...
8795             offsetof(struct sk_msg_md, local_ip6[3]):
8796#if IS_ENABLED(CONFIG_IPV6)
8797                BUILD_BUG_ON(sizeof_field(struct sock_common,
8798                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);
8799
8800                off = si->off;
8801                off -= offsetof(struct sk_msg_md, local_ip6[0]);
8802                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8803                                                struct sk_msg, sk),
8804                                      si->dst_reg, si->src_reg,
8805                                      offsetof(struct sk_msg, sk));
8806                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
8807                                      offsetof(struct sock_common,
8808                                               skc_v6_rcv_saddr.s6_addr32[0]) +
8809                                      off);
8810#else
8811                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
8812#endif
8813                break;
8814
8815        case offsetof(struct sk_msg_md, remote_port):
8816                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
8817
8818                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8819                                                struct sk_msg, sk),
8820                                      si->dst_reg, si->src_reg,
8821                                      offsetof(struct sk_msg, sk));
8822                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8823                                      offsetof(struct sock_common, skc_dport));
8824#ifndef __BIG_ENDIAN_BITFIELD
8825                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
8826#endif
8827                break;
8828
8829        case offsetof(struct sk_msg_md, local_port):
8830                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
8831
8832                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
8833                                                struct sk_msg, sk),
8834                                      si->dst_reg, si->src_reg,
8835                                      offsetof(struct sk_msg, sk));
8836                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
8837                                      offsetof(struct sock_common, skc_num));
8838                break;
8839
8840        case offsetof(struct sk_msg_md, size):
8841                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
8842                                      si->dst_reg, si->src_reg,
8843                                      offsetof(struct sk_msg_sg, size));
8844                break;
8845
8846        case offsetof(struct sk_msg_md, sk):
8847                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
8848                                      si->dst_reg, si->src_reg,
8849                                      offsetof(struct sk_msg, sk));
8850                break;
8851        }
8852
8853        return insn - insn_buf;
8854}
8855
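/* A minimal sketch of an SK_MSG verdict program relying on the conversions
 * in sk_msg_convert_ctx_access() above. Assumes a libbpf toolchain and
 * attachment to a sockmap via BPF_SK_MSG_VERDICT; the port number and names
 * are illustrative.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_endian.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("sk_msg")
 *	int msg_verdict(struct sk_msg_md *msg)
 *	{
 *		// remote_port is laid out so that bpf_ntohl() yields the
 *		// host-order port (see the __BIG_ENDIAN_BITFIELD shift above).
 *		if (bpf_ntohl(msg->remote_port) == 6379)
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
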
8856const struct bpf_verifier_ops sk_filter_verifier_ops = {
8857        .get_func_proto         = sk_filter_func_proto,
8858        .is_valid_access        = sk_filter_is_valid_access,
8859        .convert_ctx_access     = bpf_convert_ctx_access,
8860        .gen_ld_abs             = bpf_gen_ld_abs,
8861};
8862
8863const struct bpf_prog_ops sk_filter_prog_ops = {
8864        .test_run               = bpf_prog_test_run_skb,
8865};
8866
8867const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
8868        .get_func_proto         = tc_cls_act_func_proto,
8869        .is_valid_access        = tc_cls_act_is_valid_access,
8870        .convert_ctx_access     = tc_cls_act_convert_ctx_access,
8871        .gen_prologue           = tc_cls_act_prologue,
8872        .gen_ld_abs             = bpf_gen_ld_abs,
8873};
8874
8875const struct bpf_prog_ops tc_cls_act_prog_ops = {
8876        .test_run               = bpf_prog_test_run_skb,
8877};
8878
8879const struct bpf_verifier_ops xdp_verifier_ops = {
8880        .get_func_proto         = xdp_func_proto,
8881        .is_valid_access        = xdp_is_valid_access,
8882        .convert_ctx_access     = xdp_convert_ctx_access,
8883        .gen_prologue           = bpf_noop_prologue,
8884};
8885
8886const struct bpf_prog_ops xdp_prog_ops = {
8887        .test_run               = bpf_prog_test_run_xdp,
8888};
8889
8890const struct bpf_verifier_ops cg_skb_verifier_ops = {
8891        .get_func_proto         = cg_skb_func_proto,
8892        .is_valid_access        = cg_skb_is_valid_access,
8893        .convert_ctx_access     = bpf_convert_ctx_access,
8894};
8895
8896const struct bpf_prog_ops cg_skb_prog_ops = {
8897        .test_run               = bpf_prog_test_run_skb,
8898};
8899
8900const struct bpf_verifier_ops lwt_in_verifier_ops = {
8901        .get_func_proto         = lwt_in_func_proto,
8902        .is_valid_access        = lwt_is_valid_access,
8903        .convert_ctx_access     = bpf_convert_ctx_access,
8904};
8905
8906const struct bpf_prog_ops lwt_in_prog_ops = {
8907        .test_run               = bpf_prog_test_run_skb,
8908};
8909
8910const struct bpf_verifier_ops lwt_out_verifier_ops = {
8911        .get_func_proto         = lwt_out_func_proto,
8912        .is_valid_access        = lwt_is_valid_access,
8913        .convert_ctx_access     = bpf_convert_ctx_access,
8914};
8915
8916const struct bpf_prog_ops lwt_out_prog_ops = {
8917        .test_run               = bpf_prog_test_run_skb,
8918};
8919
8920const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
8921        .get_func_proto         = lwt_xmit_func_proto,
8922        .is_valid_access        = lwt_is_valid_access,
8923        .convert_ctx_access     = bpf_convert_ctx_access,
8924        .gen_prologue           = tc_cls_act_prologue,
8925};
8926
8927const struct bpf_prog_ops lwt_xmit_prog_ops = {
8928        .test_run               = bpf_prog_test_run_skb,
8929};
8930
8931const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
8932        .get_func_proto         = lwt_seg6local_func_proto,
8933        .is_valid_access        = lwt_is_valid_access,
8934        .convert_ctx_access     = bpf_convert_ctx_access,
8935};
8936
8937const struct bpf_prog_ops lwt_seg6local_prog_ops = {
8938        .test_run               = bpf_prog_test_run_skb,
8939};
8940
8941const struct bpf_verifier_ops cg_sock_verifier_ops = {
8942        .get_func_proto         = sock_filter_func_proto,
8943        .is_valid_access        = sock_filter_is_valid_access,
8944        .convert_ctx_access     = bpf_sock_convert_ctx_access,
8945};
8946
8947const struct bpf_prog_ops cg_sock_prog_ops = {
8948};
8949
8950const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
8951        .get_func_proto         = sock_addr_func_proto,
8952        .is_valid_access        = sock_addr_is_valid_access,
8953        .convert_ctx_access     = sock_addr_convert_ctx_access,
8954};
8955
8956const struct bpf_prog_ops cg_sock_addr_prog_ops = {
8957};
8958
8959const struct bpf_verifier_ops sock_ops_verifier_ops = {
8960        .get_func_proto         = sock_ops_func_proto,
8961        .is_valid_access        = sock_ops_is_valid_access,
8962        .convert_ctx_access     = sock_ops_convert_ctx_access,
8963};
8964
8965const struct bpf_prog_ops sock_ops_prog_ops = {
8966};
8967
8968const struct bpf_verifier_ops sk_skb_verifier_ops = {
8969        .get_func_proto         = sk_skb_func_proto,
8970        .is_valid_access        = sk_skb_is_valid_access,
8971        .convert_ctx_access     = sk_skb_convert_ctx_access,
8972        .gen_prologue           = sk_skb_prologue,
8973};
8974
8975const struct bpf_prog_ops sk_skb_prog_ops = {
8976};
8977
8978const struct bpf_verifier_ops sk_msg_verifier_ops = {
8979        .get_func_proto         = sk_msg_func_proto,
8980        .is_valid_access        = sk_msg_is_valid_access,
8981        .convert_ctx_access     = sk_msg_convert_ctx_access,
8982        .gen_prologue           = bpf_noop_prologue,
8983};
8984
8985const struct bpf_prog_ops sk_msg_prog_ops = {
8986};
8987
8988const struct bpf_verifier_ops flow_dissector_verifier_ops = {
8989        .get_func_proto         = flow_dissector_func_proto,
8990        .is_valid_access        = flow_dissector_is_valid_access,
8991        .convert_ctx_access     = flow_dissector_convert_ctx_access,
8992};
8993
8994const struct bpf_prog_ops flow_dissector_prog_ops = {
8995        .test_run               = bpf_prog_test_run_flow_dissector,
8996};
8997
8998int sk_detach_filter(struct sock *sk)
8999{
9000        int ret = -ENOENT;
9001        struct sk_filter *filter;
9002
9003        if (sock_flag(sk, SOCK_FILTER_LOCKED))
9004                return -EPERM;
9005
9006        filter = rcu_dereference_protected(sk->sk_filter,
9007                                           lockdep_sock_is_held(sk));
9008        if (filter) {
9009                RCU_INIT_POINTER(sk->sk_filter, NULL);
9010                sk_filter_uncharge(sk, filter);
9011                ret = 0;
9012        }
9013
9014        return ret;
9015}
9016EXPORT_SYMBOL_GPL(sk_detach_filter);
9017
9018int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
9019                  unsigned int len)
9020{
9021        struct sock_fprog_kern *fprog;
9022        struct sk_filter *filter;
9023        int ret = 0;
9024
9025        lock_sock(sk);
9026        filter = rcu_dereference_protected(sk->sk_filter,
9027                                           lockdep_sock_is_held(sk));
9028        if (!filter)
9029                goto out;
9030
9031        /* We're copying the filter that was originally attached, so no
9032         * conversion/decoding is needed anymore. eBPF programs that have
9033         * no original program cannot be dumped through this interface.
9034         */
9035        ret = -EACCES;
9036        fprog = filter->prog->orig_prog;
9037        if (!fprog)
9038                goto out;
9039
9040        ret = fprog->len;
9041        if (!len)
9042                /* User space is only asking for the number of filter blocks. */
9043                goto out;
9044
9045        ret = -EINVAL;
9046        if (len < fprog->len)
9047                goto out;
9048
9049        ret = -EFAULT;
9050        if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
9051                goto out;
9052
9053        /* Instead of bytes, the API expects the number of filter
9054         * blocks to be returned.
9055         */
9056        ret = fprog->len;
9057out:
9058        release_sock(sk);
9059        return ret;
9060}
9061
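/* A minimal user space sketch of the classic-filter round trip that ends up
 * in sk_attach_filter(), sk_get_filter() and sk_detach_filter() via the
 * SO_ATTACH_FILTER / SO_GET_FILTER / SO_DETACH_FILTER socket options handled
 * in net/core/sock.c. Error handling is omitted and the "accept all" program
 * is illustrative.
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <sys/socket.h>
 *	#include <linux/filter.h>
 *
 *	int main(void)
 *	{
 *		struct sock_filter code[] = {
 *			{ BPF_RET | BPF_K, 0, 0, 0xffff },	// accept all
 *		};
 *		struct sock_fprog fprog = { .len = 1, .filter = code };
 *		struct sock_filter *copy;
 *		socklen_t cnt = 0;
 *		int dummy = 0;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *			   &fprog, sizeof(fprog));
 *
 *		// With optlen == 0, sk_get_filter() only reports the number
 *		// of filter blocks; a second call then fetches them.
 *		getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt);
 *		copy = calloc(cnt, sizeof(*copy));
 *		getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, copy, &cnt);
 *		printf("dumped %u filter blocks\n", cnt);
 *
 *		// A dummy int keeps sock_setsockopt() happy, which insists
 *		// on optlen >= sizeof(int) before reaching sk_detach_filter().
 *		setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER,
 *			   &dummy, sizeof(dummy));
 *		free(copy);
 *		return 0;
 *	}
 */
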
9062#ifdef CONFIG_INET
9063static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
9064                                    struct sock_reuseport *reuse,
9065                                    struct sock *sk, struct sk_buff *skb,
9066                                    u32 hash)
9067{
9068        reuse_kern->skb = skb;
9069        reuse_kern->sk = sk;
9070        reuse_kern->selected_sk = NULL;
9071        reuse_kern->data_end = skb->data + skb_headlen(skb);
9072        reuse_kern->hash = hash;
9073        reuse_kern->reuseport_id = reuse->reuseport_id;
9074        reuse_kern->bind_inany = reuse->bind_inany;
9075}
9076
9077struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
9078                                  struct bpf_prog *prog, struct sk_buff *skb,
9079                                  u32 hash)
9080{
9081        struct sk_reuseport_kern reuse_kern;
9082        enum sk_action action;
9083
9084        bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
9085        action = BPF_PROG_RUN(prog, &reuse_kern);
9086
9087        if (action == SK_PASS)
9088                return reuse_kern.selected_sk;
9089        else
9090                return ERR_PTR(-ECONNREFUSED);
9091}
9092
9093BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
9094           struct bpf_map *, map, void *, key, u32, flags)
9095{
9096        bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
9097        struct sock_reuseport *reuse;
9098        struct sock *selected_sk;
9099
9100        selected_sk = map->ops->map_lookup_elem(map, key);
9101        if (!selected_sk)
9102                return -ENOENT;
9103
9104        reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
9105        if (!reuse) {
9106                /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
9107                if (sk_is_refcounted(selected_sk))
9108                        sock_put(selected_sk);
9109
9110                /* reuseport_array only holds sockets with a non-NULL sk_reuseport_cb.
9111                 * The only (!reuse) case here is that the sk has already been
9112                 * unhashed (e.g. by close()), so treat it as -ENOENT.
9113                 *
9114                 * Other maps (e.g. sock_map) do not provide this guarantee and
9115                 * the sk may never be in the reuseport group to begin with.
9116                 */
9117                return is_sockarray ? -ENOENT : -EINVAL;
9118        }
9119
9120        if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
9121                struct sock *sk = reuse_kern->sk;
9122
9123                if (sk->sk_protocol != selected_sk->sk_protocol)
9124                        return -EPROTOTYPE;
9125                else if (sk->sk_family != selected_sk->sk_family)
9126                        return -EAFNOSUPPORT;
9127
9128                /* Catch all. Likely bound to a different sockaddr. */
9129                return -EBADFD;
9130        }
9131
9132        reuse_kern->selected_sk = selected_sk;
9133
9134        return 0;
9135}
9136
9137static const struct bpf_func_proto sk_select_reuseport_proto = {
9138        .func           = sk_select_reuseport,
9139        .gpl_only       = false,
9140        .ret_type       = RET_INTEGER,
9141        .arg1_type      = ARG_PTR_TO_CTX,
9142        .arg2_type      = ARG_CONST_MAP_PTR,
9143        .arg3_type      = ARG_PTR_TO_MAP_KEY,
9144        .arg4_type      = ARG_ANYTHING,
9145};
9146
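/* A minimal sketch of a BPF_PROG_TYPE_SK_REUSEPORT program that defers the
 * final choice to sk_select_reuseport() above through the
 * bpf_sk_select_reuseport() helper. Assumes a libbpf toolchain, that user
 * space attaches it with SO_ATTACH_REUSEPORT_EBPF and fills the
 * REUSEPORT_SOCKARRAY with the group's sockets; map and program names are
 * illustrative.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
 *		__uint(max_entries, 2);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} reuseport_map SEC(".maps");
 *
 *	SEC("sk_reuseport")
 *	int select_by_hash(struct sk_reuseport_md *md)
 *	{
 *		__u32 key = md->hash % 2;
 *
 *		// On success the helper sets md->selected_sk and SK_PASS
 *		// delivers the packet to that socket.
 *		if (bpf_sk_select_reuseport(md, &reuseport_map, &key, 0))
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
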
9147BPF_CALL_4(sk_reuseport_load_bytes,
9148           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
9149           void *, to, u32, len)
9150{
9151        return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
9152}
9153
9154static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
9155        .func           = sk_reuseport_load_bytes,
9156        .gpl_only       = false,
9157        .ret_type       = RET_INTEGER,
9158        .arg1_type      = ARG_PTR_TO_CTX,
9159        .arg2_type      = ARG_ANYTHING,
9160        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
9161        .arg4_type      = ARG_CONST_SIZE,
9162};
9163
9164BPF_CALL_5(sk_reuseport_load_bytes_relative,
9165           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
9166           void *, to, u32, len, u32, start_header)
9167{
9168        return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
9169                                               len, start_header);
9170}
9171
9172static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
9173        .func           = sk_reuseport_load_bytes_relative,
9174        .gpl_only       = false,
9175        .ret_type       = RET_INTEGER,
9176        .arg1_type      = ARG_PTR_TO_CTX,
9177        .arg2_type      = ARG_ANYTHING,
9178        .arg3_type      = ARG_PTR_TO_UNINIT_MEM,
9179        .arg4_type      = ARG_CONST_SIZE,
9180        .arg5_type      = ARG_ANYTHING,
9181};
9182
9183static const struct bpf_func_proto *
9184sk_reuseport_func_proto(enum bpf_func_id func_id,
9185                        const struct bpf_prog *prog)
9186{
9187        switch (func_id) {
9188        case BPF_FUNC_sk_select_reuseport:
9189                return &sk_select_reuseport_proto;
9190        case BPF_FUNC_skb_load_bytes:
9191                return &sk_reuseport_load_bytes_proto;
9192        case BPF_FUNC_skb_load_bytes_relative:
9193                return &sk_reuseport_load_bytes_relative_proto;
9194        default:
9195                return bpf_base_func_proto(func_id);
9196        }
9197}
9198
9199static bool
9200sk_reuseport_is_valid_access(int off, int size,
9201                             enum bpf_access_type type,
9202                             const struct bpf_prog *prog,
9203                             struct bpf_insn_access_aux *info)
9204{
9205        const u32 size_default = sizeof(__u32);
9206
9207        if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
9208            off % size || type != BPF_READ)
9209                return false;
9210
9211        switch (off) {
9212        case offsetof(struct sk_reuseport_md, data):
9213                info->reg_type = PTR_TO_PACKET;
9214                return size == sizeof(__u64);
9215
9216        case offsetof(struct sk_reuseport_md, data_end):
9217                info->reg_type = PTR_TO_PACKET_END;
9218                return size == sizeof(__u64);
9219
9220        case offsetof(struct sk_reuseport_md, hash):
9221                return size == size_default;
9222
9223        /* Fields that allow narrowing */
9224        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
9225                if (size < sizeof_field(struct sk_buff, protocol))
9226                        return false;
9227                fallthrough;
9228        case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
9229        case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
9230        case bpf_ctx_range(struct sk_reuseport_md, len):
9231                bpf_ctx_record_field_size(info, size_default);
9232                return bpf_ctx_narrow_access_ok(off, size, size_default);
9233
9234        default:
9235                return false;
9236        }
9237}
9238
9239#define SK_REUSEPORT_LOAD_FIELD(F) ({                                   \
9240        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
9241                              si->dst_reg, si->src_reg,                 \
9242                              bpf_target_off(struct sk_reuseport_kern, F, \
9243                                             sizeof_field(struct sk_reuseport_kern, F), \
9244                                             target_size));             \
9245        })
9246
9247#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)                          \
9248        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,           \
9249                                    struct sk_buff,                     \
9250                                    skb,                                \
9251                                    SKB_FIELD)
9252
9253#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)                            \
9254        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,           \
9255                                    struct sock,                        \
9256                                    sk,                                 \
9257                                    SK_FIELD)
9258
9259static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
9260                                           const struct bpf_insn *si,
9261                                           struct bpf_insn *insn_buf,
9262                                           struct bpf_prog *prog,
9263                                           u32 *target_size)
9264{
9265        struct bpf_insn *insn = insn_buf;
9266
9267        switch (si->off) {
9268        case offsetof(struct sk_reuseport_md, data):
9269                SK_REUSEPORT_LOAD_SKB_FIELD(data);
9270                break;
9271
9272        case offsetof(struct sk_reuseport_md, len):
9273                SK_REUSEPORT_LOAD_SKB_FIELD(len);
9274                break;
9275
9276        case offsetof(struct sk_reuseport_md, eth_protocol):
9277                SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
9278                break;
9279
9280        case offsetof(struct sk_reuseport_md, ip_protocol):
9281                SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
9282                break;
9283
9284        case offsetof(struct sk_reuseport_md, data_end):
9285                SK_REUSEPORT_LOAD_FIELD(data_end);
9286                break;
9287
9288        case offsetof(struct sk_reuseport_md, hash):
9289                SK_REUSEPORT_LOAD_FIELD(hash);
9290                break;
9291
9292        case offsetof(struct sk_reuseport_md, bind_inany):
9293                SK_REUSEPORT_LOAD_FIELD(bind_inany);
9294                break;
9295        }
9296
9297        return insn - insn_buf;
9298}
9299
9300const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
9301        .get_func_proto         = sk_reuseport_func_proto,
9302        .is_valid_access        = sk_reuseport_is_valid_access,
9303        .convert_ctx_access     = sk_reuseport_convert_ctx_access,
9304};
9305
9306const struct bpf_prog_ops sk_reuseport_prog_ops = {
9307};
9308
9309DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
9310EXPORT_SYMBOL(bpf_sk_lookup_enabled);
9311
9312BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
9313           struct sock *, sk, u64, flags)
9314{
9315        if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
9316                               BPF_SK_LOOKUP_F_NO_REUSEPORT)))
9317                return -EINVAL;
9318        if (unlikely(sk && sk_is_refcounted(sk)))
9319                return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
9320        if (unlikely(sk && sk->sk_state == TCP_ESTABLISHED))
9321                return -ESOCKTNOSUPPORT; /* reject connected sockets */
9322
9323        /* Check if socket is suitable for packet L3/L4 protocol */
9324        if (sk && sk->sk_protocol != ctx->protocol)
9325                return -EPROTOTYPE;
9326        if (sk && sk->sk_family != ctx->family &&
9327            (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
9328                return -EAFNOSUPPORT;
9329
9330        if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
9331                return -EEXIST;
9332
9333        /* Select socket as lookup result */
9334        ctx->selected_sk = sk;
9335        ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
9336        return 0;
9337}
9338
9339static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
9340        .func           = bpf_sk_lookup_assign,
9341        .gpl_only       = false,
9342        .ret_type       = RET_INTEGER,
9343        .arg1_type      = ARG_PTR_TO_CTX,
9344        .arg2_type      = ARG_PTR_TO_SOCKET_OR_NULL,
9345        .arg3_type      = ARG_ANYTHING,
9346};
9347
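/* A minimal sketch of a BPF_PROG_TYPE_SK_LOOKUP program, echoing the pattern
 * used by the sk_lookup selftests: it steers a port range to one listening
 * socket by calling bpf_sk_assign(), which lands in bpf_sk_lookup_assign()
 * above. Assumes a libbpf toolchain and attachment to a network namespace
 * with the BPF_SK_LOOKUP attach type; the port range and names are
 * illustrative.
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_SOCKMAP);
 *		__uint(max_entries, 1);
 *		__type(key, __u32);
 *		__type(value, __u64);
 *	} dispatch_sk SEC(".maps");
 *
 *	SEC("sk_lookup")
 *	int steer_port_range(struct bpf_sk_lookup *ctx)
 *	{
 *		struct bpf_sock *sk;
 *		__u32 key = 0;
 *		long err;
 *
 *		if (ctx->local_port < 7000 || ctx->local_port > 7999)
 *			return SK_PASS;
 *
 *		sk = bpf_map_lookup_elem(&dispatch_sk, &key);
 *		if (!sk)
 *			return SK_DROP;
 *
 *		err = bpf_sk_assign(ctx, sk, 0);
 *		bpf_sk_release(sk);
 *		return err ? SK_DROP : SK_PASS;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
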
9348static const struct bpf_func_proto *
9349sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
9350{
9351        switch (func_id) {
9352        case BPF_FUNC_perf_event_output:
9353                return &bpf_event_output_data_proto;
9354        case BPF_FUNC_sk_assign:
9355                return &bpf_sk_lookup_assign_proto;
9356        case BPF_FUNC_sk_release:
9357                return &bpf_sk_release_proto;
9358        default:
9359                return bpf_base_func_proto(func_id);
9360        }
9361}
9362
9363static bool sk_lookup_is_valid_access(int off, int size,
9364                                      enum bpf_access_type type,
9365                                      const struct bpf_prog *prog,
9366                                      struct bpf_insn_access_aux *info)
9367{
9368        if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
9369                return false;
9370        if (off % size != 0)
9371                return false;
9372        if (type != BPF_READ)
9373                return false;
9374
9375        switch (off) {
9376        case offsetof(struct bpf_sk_lookup, sk):
9377                info->reg_type = PTR_TO_SOCKET_OR_NULL;
9378                return size == sizeof(__u64);
9379
9380        case bpf_ctx_range(struct bpf_sk_lookup, family):
9381        case bpf_ctx_range(struct bpf_sk_lookup, protocol):
9382        case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
9383        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
9384        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
9385        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
9386        case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
9387        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
9388                bpf_ctx_record_field_size(info, sizeof(__u32));
9389                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
9390
9391        default:
9392                return false;
9393        }
9394}
9395
9396static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
9397                                        const struct bpf_insn *si,
9398                                        struct bpf_insn *insn_buf,
9399                                        struct bpf_prog *prog,
9400                                        u32 *target_size)
9401{
9402        struct bpf_insn *insn = insn_buf;
9403
9404        switch (si->off) {
9405        case offsetof(struct bpf_sk_lookup, sk):
9406                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
9407                                      offsetof(struct bpf_sk_lookup_kern, selected_sk));
9408                break;
9409
9410        case offsetof(struct bpf_sk_lookup, family):
9411                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9412                                      bpf_target_off(struct bpf_sk_lookup_kern,
9413                                                     family, 2, target_size));
9414                break;
9415
9416        case offsetof(struct bpf_sk_lookup, protocol):
9417                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9418                                      bpf_target_off(struct bpf_sk_lookup_kern,
9419                                                     protocol, 2, target_size));
9420                break;
9421
9422        case offsetof(struct bpf_sk_lookup, remote_ip4):
9423                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9424                                      bpf_target_off(struct bpf_sk_lookup_kern,
9425                                                     v4.saddr, 4, target_size));
9426                break;
9427
9428        case offsetof(struct bpf_sk_lookup, local_ip4):
9429                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9430                                      bpf_target_off(struct bpf_sk_lookup_kern,
9431                                                     v4.daddr, 4, target_size));
9432                break;
9433
9434        case bpf_ctx_range_till(struct bpf_sk_lookup,
9435                                remote_ip6[0], remote_ip6[3]): {
9436#if IS_ENABLED(CONFIG_IPV6)
9437                int off = si->off;
9438
9439                off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
9440                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
9441                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
9442                                      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
9443                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9444                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
9445#else
9446                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9447#endif
9448                break;
9449        }
9450        case bpf_ctx_range_till(struct bpf_sk_lookup,
9451                                local_ip6[0], local_ip6[3]): {
9452#if IS_ENABLED(CONFIG_IPV6)
9453                int off = si->off;
9454
9455                off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
9456                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
9457                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
9458                                      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
9459                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9460                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
9461#else
9462                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9463#endif
9464                break;
9465        }
9466        case offsetof(struct bpf_sk_lookup, remote_port):
9467                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9468                                      bpf_target_off(struct bpf_sk_lookup_kern,
9469                                                     sport, 2, target_size));
9470                break;
9471
9472        case offsetof(struct bpf_sk_lookup, local_port):
9473                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9474                                      bpf_target_off(struct bpf_sk_lookup_kern,
9475                                                     dport, 2, target_size));
9476                break;
9477        }
9478
9479        return insn - insn_buf;
9480}
9481
9482const struct bpf_prog_ops sk_lookup_prog_ops = {
9483};
9484
9485const struct bpf_verifier_ops sk_lookup_verifier_ops = {
9486        .get_func_proto         = sk_lookup_func_proto,
9487        .is_valid_access        = sk_lookup_is_valid_access,
9488        .convert_ctx_access     = sk_lookup_convert_ctx_access,
9489};
9490
9491#endif /* CONFIG_INET */
9492
9493DEFINE_BPF_DISPATCHER(xdp)
9494
9495void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
9496{
9497        bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
9498}
9499
9500#ifdef CONFIG_DEBUG_INFO_BTF
9501BTF_ID_LIST_GLOBAL(btf_sock_ids)
9502#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
9503BTF_SOCK_TYPE_xxx
9504#undef BTF_SOCK_TYPE
9505#else
9506u32 btf_sock_ids[MAX_BTF_SOCK_TYPE];
9507#endif
9508
9509static bool check_arg_btf_id(u32 btf_id, u32 arg)
9510{
9511        int i;
9512
9513        /* These helpers take only one argument, so there is no need to check arg. */
9514        for (i = 0; i < MAX_BTF_SOCK_TYPE; i++)
9515                if (btf_sock_ids[i] == btf_id)
9516                        return true;
9517        return false;
9518}
9519
9520BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
9521{
9522        /* The tcp6_sock type is not generated in DWARF and hence not in BTF;
9523         * trigger an explicit type generation here.
9524         */
9525        BTF_TYPE_EMIT(struct tcp6_sock);
9526        if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
9527            sk->sk_family == AF_INET6)
9528                return (unsigned long)sk;
9529
9530        return (unsigned long)NULL;
9531}
9532
9533const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
9534        .func                   = bpf_skc_to_tcp6_sock,
9535        .gpl_only               = false,
9536        .ret_type               = RET_PTR_TO_BTF_ID_OR_NULL,
9537        .arg1_type              = ARG_PTR_TO_BTF_ID,
9538        .check_btf_id           = check_arg_btf_id,
9539        .ret_btf_id             = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
9540};
9541
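/* A minimal sketch of how a BTF-aware program (here a TCP socket iterator)
 * might use bpf_skc_to_tcp6_sock(). Assumes a libbpf toolchain with a
 * bpftool-generated vmlinux.h for the kernel types; names are illustrative.
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *
 *	__u64 tcp6_count;
 *
 *	SEC("iter/tcp")
 *	int count_tcp6(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *		struct tcp6_sock *tp6;
 *
 *		if (!skc)
 *			return 0;
 *
 *		// NULL unless this is a full AF_INET6 TCP socket; on success
 *		// the verifier tracks the tcp6_sock BTF type for tp6.
 *		tp6 = bpf_skc_to_tcp6_sock(skc);
 *		if (tp6)
 *			__sync_fetch_and_add(&tcp6_count, 1);
 *		return 0;
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */
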
9542BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
9543{
9544        if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
9545                return (unsigned long)sk;
9546
9547        return (unsigned long)NULL;
9548}
9549
9550const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
9551        .func                   = bpf_skc_to_tcp_sock,
9552        .gpl_only               = false,
9553        .ret_type               = RET_PTR_TO_BTF_ID_OR_NULL,
9554        .arg1_type              = ARG_PTR_TO_BTF_ID,
9555        .check_btf_id           = check_arg_btf_id,
9556        .ret_btf_id             = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
9557};
9558
9559BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
9560{
9561        /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
9562         * generated if CONFIG_INET=n. Trigger an explicit generation here.
9563         */
9564        BTF_TYPE_EMIT(struct inet_timewait_sock);
9565        BTF_TYPE_EMIT(struct tcp_timewait_sock);
9566
9567#ifdef CONFIG_INET
9568        if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
9569                return (unsigned long)sk;
9570#endif
9571
9572#if IS_BUILTIN(CONFIG_IPV6)
9573        if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
9574                return (unsigned long)sk;
9575#endif
9576
9577        return (unsigned long)NULL;
9578}
9579
9580const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
9581        .func                   = bpf_skc_to_tcp_timewait_sock,
9582        .gpl_only               = false,
9583        .ret_type               = RET_PTR_TO_BTF_ID_OR_NULL,
9584        .arg1_type              = ARG_PTR_TO_BTF_ID,
9585        .check_btf_id           = check_arg_btf_id,
9586        .ret_btf_id             = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
9587};
9588
9589BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
9590{
9591#ifdef CONFIG_INET
9592        if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
9593                return (unsigned long)sk;
9594#endif
9595
9596#if IS_BUILTIN(CONFIG_IPV6)
9597        if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
9598                return (unsigned long)sk;
9599#endif
9600
9601        return (unsigned long)NULL;
9602}
9603
9604const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
9605        .func                   = bpf_skc_to_tcp_request_sock,
9606        .gpl_only               = false,
9607        .ret_type               = RET_PTR_TO_BTF_ID_OR_NULL,
9608        .arg1_type              = ARG_PTR_TO_BTF_ID,
9609        .check_btf_id           = check_arg_btf_id,
9610        .ret_btf_id             = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
9611};
9612
9613BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
9614{
9615        /* The udp6_sock type is not generated in DWARF and hence not in BTF;
9616         * trigger an explicit type generation here.
9617         */
9618        BTF_TYPE_EMIT(struct udp6_sock);
9619        if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
9620            sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
9621                return (unsigned long)sk;
9622
9623        return (unsigned long)NULL;
9624}
9625
9626const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
9627        .func                   = bpf_skc_to_udp6_sock,
9628        .gpl_only               = false,
9629        .ret_type               = RET_PTR_TO_BTF_ID_OR_NULL,
9630        .arg1_type              = ARG_PTR_TO_BTF_ID,
9631        .check_btf_id           = check_arg_btf_id,
9632        .ret_btf_id             = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
9633};
9634