linux/net/core/filter.c
   1/*
   2 * Linux Socket Filter - Kernel level socket filtering
   3 *
   4 * Based on the design of the Berkeley Packet Filter. The new
   5 * internal format has been designed by PLUMgrid:
   6 *
   7 *      Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
   8 *
   9 * Authors:
  10 *
  11 *      Jay Schulist <jschlst@samba.org>
  12 *      Alexei Starovoitov <ast@plumgrid.com>
  13 *      Daniel Borkmann <dborkman@redhat.com>
  14 *
  15 * This program is free software; you can redistribute it and/or
  16 * modify it under the terms of the GNU General Public License
  17 * as published by the Free Software Foundation; either version
  18 * 2 of the License, or (at your option) any later version.
  19 *
  20 * Andi Kleen - Fix a few bad bugs and races.
  21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  22 */
  23
  24#include <linux/module.h>
  25#include <linux/types.h>
  26#include <linux/mm.h>
  27#include <linux/fcntl.h>
  28#include <linux/socket.h>
  29#include <linux/in.h>
  30#include <linux/inet.h>
  31#include <linux/netdevice.h>
  32#include <linux/if_packet.h>
  33#include <linux/gfp.h>
  34#include <net/ip.h>
  35#include <net/protocol.h>
  36#include <net/netlink.h>
  37#include <linux/skbuff.h>
  38#include <net/sock.h>
  39#include <net/flow_dissector.h>
  40#include <linux/errno.h>
  41#include <linux/timer.h>
  42#include <asm/uaccess.h>
  43#include <asm/unaligned.h>
  44#include <linux/filter.h>
  45#include <linux/ratelimit.h>
  46#include <linux/seccomp.h>
  47#include <linux/if_vlan.h>
  48#include <linux/bpf.h>
  49#include <net/sch_generic.h>
  50#include <net/cls_cgroup.h>
  51#include <net/dst_metadata.h>
  52#include <net/dst.h>
  53#include <net/sock_reuseport.h>
  54
  55/**
  56 *      sk_filter_trim_cap - run a packet through a socket filter
  57 *      @sk: sock associated with &sk_buff
  58 *      @skb: buffer to filter
  59 *      @cap: limit on how short the eBPF program may trim the packet
  60 *
   61 * Run the eBPF program and then cut skb->data to the correct size returned
   62 * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
   63 * than pkt_len we keep the whole skb->data. This is the socket level
  64 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
  65 * be accepted or -EPERM if the packet should be tossed.
  66 *
  67 */
  68int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
  69{
  70        int err;
  71        struct sk_filter *filter;
  72
  73        /*
  74         * If the skb was allocated from pfmemalloc reserves, only
  75         * allow SOCK_MEMALLOC sockets to use it as this socket is
  76         * helping free memory
  77         */
  78        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
  79                return -ENOMEM;
  80
  81        err = security_sock_rcv_skb(sk, skb);
  82        if (err)
  83                return err;
  84
  85        rcu_read_lock();
  86        filter = rcu_dereference(sk->sk_filter);
  87        if (filter) {
  88                unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
  89                err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
  90        }
  91        rcu_read_unlock();
  92
  93        return err;
  94}
  95EXPORT_SYMBOL(sk_filter_trim_cap);
  96
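/*
 * Editor's example (illustrative sketch, not part of the original file):
 * how a protocol receive path typically consumes this wrapper. sk_filter()
 * in <linux/filter.h> is believed to call sk_filter_trim_cap(sk, skb, 1);
 * the demo_proto_rcv() name below is hypothetical.
 */
static int demo_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	/* Run the attached socket filter; a non-zero return means drop. */
	if (sk_filter(sk, skb)) {
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* ... continue normal delivery, e.g. queue skb to the socket ... */
	return NET_RX_SUCCESS;
}
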
  97static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
  98{
  99        return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
 100}
 101
 102static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 103{
 104        struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 105        struct nlattr *nla;
 106
 107        if (skb_is_nonlinear(skb))
 108                return 0;
 109
 110        if (skb->len < sizeof(struct nlattr))
 111                return 0;
 112
 113        if (a > skb->len - sizeof(struct nlattr))
 114                return 0;
 115
 116        nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
 117        if (nla)
 118                return (void *) nla - (void *) skb->data;
 119
 120        return 0;
 121}
 122
 123static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 124{
 125        struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 126        struct nlattr *nla;
 127
 128        if (skb_is_nonlinear(skb))
 129                return 0;
 130
 131        if (skb->len < sizeof(struct nlattr))
 132                return 0;
 133
 134        if (a > skb->len - sizeof(struct nlattr))
 135                return 0;
 136
 137        nla = (struct nlattr *) &skb->data[a];
 138        if (nla->nla_len > skb->len - a)
 139                return 0;
 140
 141        nla = nla_find_nested(nla, x);
 142        if (nla)
 143                return (void *) nla - (void *) skb->data;
 144
 145        return 0;
 146}
 147
 148static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 149{
 150        return raw_smp_processor_id();
 151}
 152
 153static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
 154        .func           = __get_raw_cpu_id,
 155        .gpl_only       = false,
 156        .ret_type       = RET_INTEGER,
 157};
 158
 159static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
 160                              struct bpf_insn *insn_buf)
 161{
 162        struct bpf_insn *insn = insn_buf;
 163
 164        switch (skb_field) {
 165        case SKF_AD_MARK:
 166                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
 167
 168                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
 169                                      offsetof(struct sk_buff, mark));
 170                break;
 171
 172        case SKF_AD_PKTTYPE:
 173                *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
 174                *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
 175#ifdef __BIG_ENDIAN_BITFIELD
 176                *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
 177#endif
 178                break;
 179
 180        case SKF_AD_QUEUE:
 181                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
 182
 183                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 184                                      offsetof(struct sk_buff, queue_mapping));
 185                break;
 186
 187        case SKF_AD_VLAN_TAG:
 188        case SKF_AD_VLAN_TAG_PRESENT:
 189                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
 190                BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
 191
 192                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
 193                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
 194                                      offsetof(struct sk_buff, vlan_tci));
 195                if (skb_field == SKF_AD_VLAN_TAG) {
 196                        *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
 197                                                ~VLAN_TAG_PRESENT);
 198                } else {
 199                        /* dst_reg >>= 12 */
 200                        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
 201                        /* dst_reg &= 1 */
 202                        *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
 203                }
 204                break;
 205        }
 206
 207        return insn - insn_buf;
 208}
 209
 210static bool convert_bpf_extensions(struct sock_filter *fp,
 211                                   struct bpf_insn **insnp)
 212{
 213        struct bpf_insn *insn = *insnp;
 214        u32 cnt;
 215
 216        switch (fp->k) {
 217        case SKF_AD_OFF + SKF_AD_PROTOCOL:
 218                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
 219
 220                /* A = *(u16 *) (CTX + offsetof(protocol)) */
 221                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 222                                      offsetof(struct sk_buff, protocol));
 223                /* A = ntohs(A) [emitting a nop or swap16] */
 224                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 225                break;
 226
 227        case SKF_AD_OFF + SKF_AD_PKTTYPE:
 228                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
 229                insn += cnt - 1;
 230                break;
 231
 232        case SKF_AD_OFF + SKF_AD_IFINDEX:
 233        case SKF_AD_OFF + SKF_AD_HATYPE:
 234                BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 235                BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
 236                BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
 237
 238                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
 239                                      BPF_REG_TMP, BPF_REG_CTX,
 240                                      offsetof(struct sk_buff, dev));
 241                /* if (tmp != 0) goto pc + 1 */
 242                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
 243                *insn++ = BPF_EXIT_INSN();
 244                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
 245                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
 246                                            offsetof(struct net_device, ifindex));
 247                else
 248                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
 249                                            offsetof(struct net_device, type));
 250                break;
 251
 252        case SKF_AD_OFF + SKF_AD_MARK:
 253                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
 254                insn += cnt - 1;
 255                break;
 256
 257        case SKF_AD_OFF + SKF_AD_RXHASH:
 258                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
 259
 260                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 261                                    offsetof(struct sk_buff, hash));
 262                break;
 263
 264        case SKF_AD_OFF + SKF_AD_QUEUE:
 265                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
 266                insn += cnt - 1;
 267                break;
 268
 269        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
 270                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
 271                                         BPF_REG_A, BPF_REG_CTX, insn);
 272                insn += cnt - 1;
 273                break;
 274
 275        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
 276                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
 277                                         BPF_REG_A, BPF_REG_CTX, insn);
 278                insn += cnt - 1;
 279                break;
 280
 281        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
 282                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
 283
 284                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
 285                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 286                                      offsetof(struct sk_buff, vlan_proto));
 287                /* A = ntohs(A) [emitting a nop or swap16] */
 288                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 289                break;
 290
 291        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 292        case SKF_AD_OFF + SKF_AD_NLATTR:
 293        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 294        case SKF_AD_OFF + SKF_AD_CPU:
 295        case SKF_AD_OFF + SKF_AD_RANDOM:
 296                /* arg1 = CTX */
 297                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 298                /* arg2 = A */
 299                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
 300                /* arg3 = X */
 301                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
 302                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
 303                switch (fp->k) {
 304                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 305                        *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
 306                        break;
 307                case SKF_AD_OFF + SKF_AD_NLATTR:
 308                        *insn = BPF_EMIT_CALL(__skb_get_nlattr);
 309                        break;
 310                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 311                        *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
 312                        break;
 313                case SKF_AD_OFF + SKF_AD_CPU:
 314                        *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
 315                        break;
 316                case SKF_AD_OFF + SKF_AD_RANDOM:
 317                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
 318                        bpf_user_rnd_init_once();
 319                        break;
 320                }
 321                break;
 322
 323        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
 324                /* A ^= X */
 325                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
 326                break;
 327
 328        default:
 329                /* This is just a dummy call to avoid letting the compiler
 330                 * evict __bpf_call_base() as an optimization. Placed here
 331                 * where no-one bothers.
 332                 */
 333                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
 334                return false;
 335        }
 336
 337        *insnp = insn;
 338        return true;
 339}
 340
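/*
 * Editor's example (sketch, not part of the original file): a classic BPF
 * program that exercises the ancillary loads handled above. The load from
 * SKF_AD_OFF + SKF_AD_PROTOCOL is rewritten by convert_bpf_extensions()
 * into a direct skb->protocol load plus a BPF_ENDIAN fixup. The
 * demo_anc_filter name is hypothetical.
 */
static const struct sock_filter demo_anc_filter[] = {
	/* A = ntohs(skb->protocol) */
	BPF_STMT(BPF_LD | BPF_H | BPF_ABS, SKF_AD_OFF + SKF_AD_PROTOCOL),
	/* if (A == ETH_P_IP) accept the whole packet, else drop */
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0x0800 /* ETH_P_IP */, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
	BPF_STMT(BPF_RET | BPF_K, 0),
};
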
 341/**
 342 *      bpf_convert_filter - convert filter program
 343 *      @prog: the user passed filter program
 344 *      @len: the length of the user passed filter program
 345 *      @new_prog: buffer where converted program will be stored
 346 *      @new_len: pointer to store length of converted program
 347 *
  348 * Remap classic 'sock_filter' style BPF instruction set to eBPF 'bpf_insn' style.
 349 * Conversion workflow:
 350 *
 351 * 1) First pass for calculating the new program length:
 352 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 353 *
  354 * 2) Second call remaps the program in two passes: the 1st pass finds
  355 *    new jump offsets, the 2nd pass does the remapping:
 356 *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
 357 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 358 */
 359static int bpf_convert_filter(struct sock_filter *prog, int len,
 360                              struct bpf_insn *new_prog, int *new_len)
 361{
 362        int new_flen = 0, pass = 0, target, i;
 363        struct bpf_insn *new_insn;
 364        struct sock_filter *fp;
 365        int *addrs = NULL;
 366        u8 bpf_src;
 367
 368        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
 369        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
 370
 371        if (len <= 0 || len > BPF_MAXINSNS)
 372                return -EINVAL;
 373
 374        if (new_prog) {
 375                addrs = kcalloc(len, sizeof(*addrs),
 376                                GFP_KERNEL | __GFP_NOWARN);
 377                if (!addrs)
 378                        return -ENOMEM;
 379        }
 380
 381do_pass:
 382        new_insn = new_prog;
 383        fp = prog;
 384
 385        /* Classic BPF related prologue emission. */
 386        if (new_insn) {
 387                /* Classic BPF expects A and X to be reset first. These need
 388                 * to be guaranteed to be the first two instructions.
 389                 */
 390                *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
 391                *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
 392
 393                /* All programs must keep CTX in callee saved BPF_REG_CTX.
  394                 * In the eBPF case it's done by the compiler; here we need
  395                 * to do this ourselves. Initial CTX is present in BPF_REG_ARG1.
 396                 */
 397                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
 398        } else {
 399                new_insn += 3;
 400        }
 401
 402        for (i = 0; i < len; fp++, i++) {
 403                struct bpf_insn tmp_insns[6] = { };
 404                struct bpf_insn *insn = tmp_insns;
 405
 406                if (addrs)
 407                        addrs[i] = new_insn - new_prog;
 408
 409                switch (fp->code) {
 410                /* All arithmetic insns and skb loads map as-is. */
 411                case BPF_ALU | BPF_ADD | BPF_X:
 412                case BPF_ALU | BPF_ADD | BPF_K:
 413                case BPF_ALU | BPF_SUB | BPF_X:
 414                case BPF_ALU | BPF_SUB | BPF_K:
 415                case BPF_ALU | BPF_AND | BPF_X:
 416                case BPF_ALU | BPF_AND | BPF_K:
 417                case BPF_ALU | BPF_OR | BPF_X:
 418                case BPF_ALU | BPF_OR | BPF_K:
 419                case BPF_ALU | BPF_LSH | BPF_X:
 420                case BPF_ALU | BPF_LSH | BPF_K:
 421                case BPF_ALU | BPF_RSH | BPF_X:
 422                case BPF_ALU | BPF_RSH | BPF_K:
 423                case BPF_ALU | BPF_XOR | BPF_X:
 424                case BPF_ALU | BPF_XOR | BPF_K:
 425                case BPF_ALU | BPF_MUL | BPF_X:
 426                case BPF_ALU | BPF_MUL | BPF_K:
 427                case BPF_ALU | BPF_DIV | BPF_X:
 428                case BPF_ALU | BPF_DIV | BPF_K:
 429                case BPF_ALU | BPF_MOD | BPF_X:
 430                case BPF_ALU | BPF_MOD | BPF_K:
 431                case BPF_ALU | BPF_NEG:
 432                case BPF_LD | BPF_ABS | BPF_W:
 433                case BPF_LD | BPF_ABS | BPF_H:
 434                case BPF_LD | BPF_ABS | BPF_B:
 435                case BPF_LD | BPF_IND | BPF_W:
 436                case BPF_LD | BPF_IND | BPF_H:
 437                case BPF_LD | BPF_IND | BPF_B:
 438                        /* Check for overloaded BPF extension and
 439                         * directly convert it if found, otherwise
 440                         * just move on with mapping.
 441                         */
 442                        if (BPF_CLASS(fp->code) == BPF_LD &&
 443                            BPF_MODE(fp->code) == BPF_ABS &&
 444                            convert_bpf_extensions(fp, &insn))
 445                                break;
 446
 447                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 448                        break;
 449
 450                /* Jump transformation cannot use BPF block macros
 451                 * everywhere as offset calculation and target updates
 452                 * require a bit more work than the rest, i.e. jump
 453                 * opcodes map as-is, but offsets need adjustment.
 454                 */
 455
 456#define BPF_EMIT_JMP                                                    \
 457        do {                                                            \
 458                if (target >= len || target < 0)                        \
 459                        goto err;                                       \
 460                insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;   \
 461                /* Adjust pc relative offset for 2nd or 3rd insn. */    \
 462                insn->off -= insn - tmp_insns;                          \
 463        } while (0)
 464
 465                case BPF_JMP | BPF_JA:
 466                        target = i + fp->k + 1;
 467                        insn->code = fp->code;
 468                        BPF_EMIT_JMP;
 469                        break;
 470
 471                case BPF_JMP | BPF_JEQ | BPF_K:
 472                case BPF_JMP | BPF_JEQ | BPF_X:
 473                case BPF_JMP | BPF_JSET | BPF_K:
 474                case BPF_JMP | BPF_JSET | BPF_X:
 475                case BPF_JMP | BPF_JGT | BPF_K:
 476                case BPF_JMP | BPF_JGT | BPF_X:
 477                case BPF_JMP | BPF_JGE | BPF_K:
 478                case BPF_JMP | BPF_JGE | BPF_X:
 479                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
 480                                /* BPF immediates are signed, zero extend
 481                                 * immediate into tmp register and use it
 482                                 * in compare insn.
 483                                 */
 484                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
 485
 486                                insn->dst_reg = BPF_REG_A;
 487                                insn->src_reg = BPF_REG_TMP;
 488                                bpf_src = BPF_X;
 489                        } else {
 490                                insn->dst_reg = BPF_REG_A;
 491                                insn->imm = fp->k;
 492                                bpf_src = BPF_SRC(fp->code);
 493                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
 494                        }
 495
 496                        /* Common case where 'jump_false' is next insn. */
 497                        if (fp->jf == 0) {
 498                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 499                                target = i + fp->jt + 1;
 500                                BPF_EMIT_JMP;
 501                                break;
 502                        }
 503
 504                        /* Convert JEQ into JNE when 'jump_true' is next insn. */
 505                        if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
 506                                insn->code = BPF_JMP | BPF_JNE | bpf_src;
 507                                target = i + fp->jf + 1;
 508                                BPF_EMIT_JMP;
 509                                break;
 510                        }
 511
 512                        /* Other jumps are mapped into two insns: Jxx and JA. */
 513                        target = i + fp->jt + 1;
 514                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 515                        BPF_EMIT_JMP;
 516                        insn++;
 517
 518                        insn->code = BPF_JMP | BPF_JA;
 519                        target = i + fp->jf + 1;
 520                        BPF_EMIT_JMP;
 521                        break;
 522
  523                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
 524                case BPF_LDX | BPF_MSH | BPF_B:
 525                        /* tmp = A */
 526                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
 527                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
 528                        *insn++ = BPF_LD_ABS(BPF_B, fp->k);
 529                        /* A &= 0xf */
 530                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
 531                        /* A <<= 2 */
 532                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
 533                        /* X = A */
 534                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 535                        /* A = tmp */
 536                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 537                        break;
 538
  539                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
 540                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
 541                 */
 542                case BPF_RET | BPF_A:
 543                case BPF_RET | BPF_K:
 544                        if (BPF_RVAL(fp->code) == BPF_K)
 545                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
 546                                                        0, fp->k);
 547                        *insn = BPF_EXIT_INSN();
 548                        break;
 549
 550                /* Store to stack. */
 551                case BPF_ST:
 552                case BPF_STX:
 553                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
 554                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
 555                                            -(BPF_MEMWORDS - fp->k) * 4);
 556                        break;
 557
 558                /* Load from stack. */
 559                case BPF_LD | BPF_MEM:
 560                case BPF_LDX | BPF_MEM:
 561                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
 562                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
 563                                            -(BPF_MEMWORDS - fp->k) * 4);
 564                        break;
 565
 566                /* A = K or X = K */
 567                case BPF_LD | BPF_IMM:
 568                case BPF_LDX | BPF_IMM:
 569                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
 570                                              BPF_REG_A : BPF_REG_X, fp->k);
 571                        break;
 572
 573                /* X = A */
 574                case BPF_MISC | BPF_TAX:
 575                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 576                        break;
 577
 578                /* A = X */
 579                case BPF_MISC | BPF_TXA:
 580                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
 581                        break;
 582
 583                /* A = skb->len or X = skb->len */
 584                case BPF_LD | BPF_W | BPF_LEN:
 585                case BPF_LDX | BPF_W | BPF_LEN:
 586                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
 587                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
 588                                            offsetof(struct sk_buff, len));
 589                        break;
 590
 591                /* Access seccomp_data fields. */
 592                case BPF_LDX | BPF_ABS | BPF_W:
 593                        /* A = *(u32 *) (ctx + K) */
 594                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
 595                        break;
 596
 597                /* Unknown instruction. */
 598                default:
 599                        goto err;
 600                }
 601
 602                insn++;
 603                if (new_prog)
 604                        memcpy(new_insn, tmp_insns,
 605                               sizeof(*insn) * (insn - tmp_insns));
 606                new_insn += insn - tmp_insns;
 607        }
 608
 609        if (!new_prog) {
 610                /* Only calculating new length. */
 611                *new_len = new_insn - new_prog;
 612                return 0;
 613        }
 614
 615        pass++;
 616        if (new_flen != new_insn - new_prog) {
 617                new_flen = new_insn - new_prog;
 618                if (pass > 2)
 619                        goto err;
 620                goto do_pass;
 621        }
 622
 623        kfree(addrs);
 624        BUG_ON(*new_len != new_flen);
 625        return 0;
 626err:
 627        kfree(addrs);
 628        return -EINVAL;
 629}
 630
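/*
 * Editor's sketch (not part of the original file): the two-call convention
 * described in the kernel-doc above, with minimal error handling. This is
 * essentially what bpf_migrate_filter() further below does for real; the
 * demo_convert() name is hypothetical.
 */
static int demo_convert(struct sock_filter *old_prog, int old_len,
			struct bpf_insn **new_prog, int *new_len)
{
	int err;

	/* 1st call: only compute the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, new_len);
	if (err)
		return err;

	*new_prog = kcalloc(*new_len, sizeof(struct bpf_insn), GFP_KERNEL);
	if (!*new_prog)
		return -ENOMEM;

	/* 2nd call: emit the remapped instructions into the buffer. */
	err = bpf_convert_filter(old_prog, old_len, *new_prog, new_len);
	if (err)
		kfree(*new_prog);
	return err;
}
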
 631/* Security:
 632 *
  633 * As we don't want to clear the mem[] array for each packet going through
  634 * __bpf_prog_run(), we check that a filter loaded by the user never tries
  635 * to read a cell that was not previously written, and we check all branches
  636 * to be sure a malicious user doesn't try to abuse us.
 637 */
 638static int check_load_and_stores(const struct sock_filter *filter, int flen)
 639{
 640        u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
 641        int pc, ret = 0;
 642
 643        BUILD_BUG_ON(BPF_MEMWORDS > 16);
 644
 645        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
 646        if (!masks)
 647                return -ENOMEM;
 648
 649        memset(masks, 0xff, flen * sizeof(*masks));
 650
 651        for (pc = 0; pc < flen; pc++) {
 652                memvalid &= masks[pc];
 653
 654                switch (filter[pc].code) {
 655                case BPF_ST:
 656                case BPF_STX:
 657                        memvalid |= (1 << filter[pc].k);
 658                        break;
 659                case BPF_LD | BPF_MEM:
 660                case BPF_LDX | BPF_MEM:
 661                        if (!(memvalid & (1 << filter[pc].k))) {
 662                                ret = -EINVAL;
 663                                goto error;
 664                        }
 665                        break;
 666                case BPF_JMP | BPF_JA:
 667                        /* A jump must set masks on target */
 668                        masks[pc + 1 + filter[pc].k] &= memvalid;
 669                        memvalid = ~0;
 670                        break;
 671                case BPF_JMP | BPF_JEQ | BPF_K:
 672                case BPF_JMP | BPF_JEQ | BPF_X:
 673                case BPF_JMP | BPF_JGE | BPF_K:
 674                case BPF_JMP | BPF_JGE | BPF_X:
 675                case BPF_JMP | BPF_JGT | BPF_K:
 676                case BPF_JMP | BPF_JGT | BPF_X:
 677                case BPF_JMP | BPF_JSET | BPF_K:
 678                case BPF_JMP | BPF_JSET | BPF_X:
 679                        /* A jump must set masks on targets */
 680                        masks[pc + 1 + filter[pc].jt] &= memvalid;
 681                        masks[pc + 1 + filter[pc].jf] &= memvalid;
 682                        memvalid = ~0;
 683                        break;
 684                }
 685        }
 686error:
 687        kfree(masks);
 688        return ret;
 689}
 690
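/*
 * Editor's example (sketch, not part of the original file): a classic
 * program that check_load_and_stores() rejects with -EINVAL, because
 * mem[2] is read before any BPF_ST/BPF_STX has written it. The
 * demo_bad_mem name is hypothetical.
 */
static const struct sock_filter demo_bad_mem[] = {
	BPF_STMT(BPF_LD | BPF_MEM, 2),	/* A = mem[2], never written */
	BPF_STMT(BPF_RET | BPF_A, 0),
};
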
 691static bool chk_code_allowed(u16 code_to_probe)
 692{
 693        static const bool codes[] = {
 694                /* 32 bit ALU operations */
 695                [BPF_ALU | BPF_ADD | BPF_K] = true,
 696                [BPF_ALU | BPF_ADD | BPF_X] = true,
 697                [BPF_ALU | BPF_SUB | BPF_K] = true,
 698                [BPF_ALU | BPF_SUB | BPF_X] = true,
 699                [BPF_ALU | BPF_MUL | BPF_K] = true,
 700                [BPF_ALU | BPF_MUL | BPF_X] = true,
 701                [BPF_ALU | BPF_DIV | BPF_K] = true,
 702                [BPF_ALU | BPF_DIV | BPF_X] = true,
 703                [BPF_ALU | BPF_MOD | BPF_K] = true,
 704                [BPF_ALU | BPF_MOD | BPF_X] = true,
 705                [BPF_ALU | BPF_AND | BPF_K] = true,
 706                [BPF_ALU | BPF_AND | BPF_X] = true,
 707                [BPF_ALU | BPF_OR | BPF_K] = true,
 708                [BPF_ALU | BPF_OR | BPF_X] = true,
 709                [BPF_ALU | BPF_XOR | BPF_K] = true,
 710                [BPF_ALU | BPF_XOR | BPF_X] = true,
 711                [BPF_ALU | BPF_LSH | BPF_K] = true,
 712                [BPF_ALU | BPF_LSH | BPF_X] = true,
 713                [BPF_ALU | BPF_RSH | BPF_K] = true,
 714                [BPF_ALU | BPF_RSH | BPF_X] = true,
 715                [BPF_ALU | BPF_NEG] = true,
 716                /* Load instructions */
 717                [BPF_LD | BPF_W | BPF_ABS] = true,
 718                [BPF_LD | BPF_H | BPF_ABS] = true,
 719                [BPF_LD | BPF_B | BPF_ABS] = true,
 720                [BPF_LD | BPF_W | BPF_LEN] = true,
 721                [BPF_LD | BPF_W | BPF_IND] = true,
 722                [BPF_LD | BPF_H | BPF_IND] = true,
 723                [BPF_LD | BPF_B | BPF_IND] = true,
 724                [BPF_LD | BPF_IMM] = true,
 725                [BPF_LD | BPF_MEM] = true,
 726                [BPF_LDX | BPF_W | BPF_LEN] = true,
 727                [BPF_LDX | BPF_B | BPF_MSH] = true,
 728                [BPF_LDX | BPF_IMM] = true,
 729                [BPF_LDX | BPF_MEM] = true,
 730                /* Store instructions */
 731                [BPF_ST] = true,
 732                [BPF_STX] = true,
 733                /* Misc instructions */
 734                [BPF_MISC | BPF_TAX] = true,
 735                [BPF_MISC | BPF_TXA] = true,
 736                /* Return instructions */
 737                [BPF_RET | BPF_K] = true,
 738                [BPF_RET | BPF_A] = true,
 739                /* Jump instructions */
 740                [BPF_JMP | BPF_JA] = true,
 741                [BPF_JMP | BPF_JEQ | BPF_K] = true,
 742                [BPF_JMP | BPF_JEQ | BPF_X] = true,
 743                [BPF_JMP | BPF_JGE | BPF_K] = true,
 744                [BPF_JMP | BPF_JGE | BPF_X] = true,
 745                [BPF_JMP | BPF_JGT | BPF_K] = true,
 746                [BPF_JMP | BPF_JGT | BPF_X] = true,
 747                [BPF_JMP | BPF_JSET | BPF_K] = true,
 748                [BPF_JMP | BPF_JSET | BPF_X] = true,
 749        };
 750
 751        if (code_to_probe >= ARRAY_SIZE(codes))
 752                return false;
 753
 754        return codes[code_to_probe];
 755}
 756
 757static bool bpf_check_basics_ok(const struct sock_filter *filter,
 758                                unsigned int flen)
 759{
 760        if (filter == NULL)
 761                return false;
 762        if (flen == 0 || flen > BPF_MAXINSNS)
 763                return false;
 764
 765        return true;
 766}
 767
 768/**
 769 *      bpf_check_classic - verify socket filter code
 770 *      @filter: filter to verify
 771 *      @flen: length of filter
 772 *
 773 * Check the user's filter code. If we let some ugly
  774 * filter code slip through, kaboom! The filter must contain
 775 * no references or jumps that are out of range, no illegal
 776 * instructions, and must end with a RET instruction.
 777 *
 778 * All jumps are forward as they are not signed.
 779 *
 780 * Returns 0 if the rule set is legal or -EINVAL if not.
 781 */
 782static int bpf_check_classic(const struct sock_filter *filter,
 783                             unsigned int flen)
 784{
 785        bool anc_found;
 786        int pc;
 787
 788        /* Check the filter code now */
 789        for (pc = 0; pc < flen; pc++) {
 790                const struct sock_filter *ftest = &filter[pc];
 791
 792                /* May we actually operate on this code? */
 793                if (!chk_code_allowed(ftest->code))
 794                        return -EINVAL;
 795
 796                /* Some instructions need special checks */
 797                switch (ftest->code) {
 798                case BPF_ALU | BPF_DIV | BPF_K:
 799                case BPF_ALU | BPF_MOD | BPF_K:
 800                        /* Check for division by zero */
 801                        if (ftest->k == 0)
 802                                return -EINVAL;
 803                        break;
 804                case BPF_ALU | BPF_LSH | BPF_K:
 805                case BPF_ALU | BPF_RSH | BPF_K:
 806                        if (ftest->k >= 32)
 807                                return -EINVAL;
 808                        break;
 809                case BPF_LD | BPF_MEM:
 810                case BPF_LDX | BPF_MEM:
 811                case BPF_ST:
 812                case BPF_STX:
 813                        /* Check for invalid memory addresses */
 814                        if (ftest->k >= BPF_MEMWORDS)
 815                                return -EINVAL;
 816                        break;
 817                case BPF_JMP | BPF_JA:
 818                        /* Note, the large ftest->k might cause loops.
 819                         * Compare this with conditional jumps below,
 820                         * where offsets are limited. --ANK (981016)
 821                         */
 822                        if (ftest->k >= (unsigned int)(flen - pc - 1))
 823                                return -EINVAL;
 824                        break;
 825                case BPF_JMP | BPF_JEQ | BPF_K:
 826                case BPF_JMP | BPF_JEQ | BPF_X:
 827                case BPF_JMP | BPF_JGE | BPF_K:
 828                case BPF_JMP | BPF_JGE | BPF_X:
 829                case BPF_JMP | BPF_JGT | BPF_K:
 830                case BPF_JMP | BPF_JGT | BPF_X:
 831                case BPF_JMP | BPF_JSET | BPF_K:
 832                case BPF_JMP | BPF_JSET | BPF_X:
 833                        /* Both conditionals must be safe */
 834                        if (pc + ftest->jt + 1 >= flen ||
 835                            pc + ftest->jf + 1 >= flen)
 836                                return -EINVAL;
 837                        break;
 838                case BPF_LD | BPF_W | BPF_ABS:
 839                case BPF_LD | BPF_H | BPF_ABS:
 840                case BPF_LD | BPF_B | BPF_ABS:
 841                        anc_found = false;
 842                        if (bpf_anc_helper(ftest) & BPF_ANC)
 843                                anc_found = true;
 844                        /* Ancillary operation unknown or unsupported */
 845                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
 846                                return -EINVAL;
 847                }
 848        }
 849
 850        /* Last instruction must be a RET code */
 851        switch (filter[flen - 1].code) {
 852        case BPF_RET | BPF_K:
 853        case BPF_RET | BPF_A:
 854                return check_load_and_stores(filter, flen);
 855        }
 856
 857        return -EINVAL;
 858}
 859
 860static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
 861                                      const struct sock_fprog *fprog)
 862{
 863        unsigned int fsize = bpf_classic_proglen(fprog);
 864        struct sock_fprog_kern *fkprog;
 865
 866        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
 867        if (!fp->orig_prog)
 868                return -ENOMEM;
 869
 870        fkprog = fp->orig_prog;
 871        fkprog->len = fprog->len;
 872
 873        fkprog->filter = kmemdup(fp->insns, fsize,
 874                                 GFP_KERNEL | __GFP_NOWARN);
 875        if (!fkprog->filter) {
 876                kfree(fp->orig_prog);
 877                return -ENOMEM;
 878        }
 879
 880        return 0;
 881}
 882
 883static void bpf_release_orig_filter(struct bpf_prog *fp)
 884{
 885        struct sock_fprog_kern *fprog = fp->orig_prog;
 886
 887        if (fprog) {
 888                kfree(fprog->filter);
 889                kfree(fprog);
 890        }
 891}
 892
 893static void __bpf_prog_release(struct bpf_prog *prog)
 894{
 895        if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
 896                bpf_prog_put(prog);
 897        } else {
 898                bpf_release_orig_filter(prog);
 899                bpf_prog_free(prog);
 900        }
 901}
 902
 903static void __sk_filter_release(struct sk_filter *fp)
 904{
 905        __bpf_prog_release(fp->prog);
 906        kfree(fp);
 907}
 908
 909/**
 910 *      sk_filter_release_rcu - Release a socket filter by rcu_head
 911 *      @rcu: rcu_head that contains the sk_filter to free
 912 */
 913static void sk_filter_release_rcu(struct rcu_head *rcu)
 914{
 915        struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
 916
 917        __sk_filter_release(fp);
 918}
 919
 920/**
 921 *      sk_filter_release - release a socket filter
 922 *      @fp: filter to remove
 923 *
 924 *      Remove a filter from a socket and release its resources.
 925 */
 926static void sk_filter_release(struct sk_filter *fp)
 927{
 928        if (atomic_dec_and_test(&fp->refcnt))
 929                call_rcu(&fp->rcu, sk_filter_release_rcu);
 930}
 931
 932void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
 933{
 934        u32 filter_size = bpf_prog_size(fp->prog->len);
 935
 936        atomic_sub(filter_size, &sk->sk_omem_alloc);
 937        sk_filter_release(fp);
 938}
 939
  940/* Try to charge the socket memory if there is space available;
  941 * return true on success.
 942 */
 943bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
 944{
 945        u32 filter_size = bpf_prog_size(fp->prog->len);
 946
 947        /* same check as in sock_kmalloc() */
 948        if (filter_size <= sysctl_optmem_max &&
 949            atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
 950                atomic_inc(&fp->refcnt);
 951                atomic_add(filter_size, &sk->sk_omem_alloc);
 952                return true;
 953        }
 954        return false;
 955}
 956
 957static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 958{
 959        struct sock_filter *old_prog;
 960        struct bpf_prog *old_fp;
 961        int err, new_len, old_len = fp->len;
 962
  963        /* We are free to overwrite insns et al. right here as they
  964         * won't be used internally anymore at this point, after
  965         * the migration to the internal BPF instruction
  966         * representation.
 967         */
 968        BUILD_BUG_ON(sizeof(struct sock_filter) !=
 969                     sizeof(struct bpf_insn));
 970
 971        /* Conversion cannot happen on overlapping memory areas,
 972         * so we need to keep the user BPF around until the 2nd
 973         * pass. At this time, the user BPF is stored in fp->insns.
 974         */
 975        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
 976                           GFP_KERNEL | __GFP_NOWARN);
 977        if (!old_prog) {
 978                err = -ENOMEM;
 979                goto out_err;
 980        }
 981
 982        /* 1st pass: calculate the new program length. */
 983        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
 984        if (err)
 985                goto out_err_free;
 986
 987        /* Expand fp for appending the new filter representation. */
 988        old_fp = fp;
 989        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
 990        if (!fp) {
 991                /* The old_fp is still around in case we couldn't
 992                 * allocate new memory, so uncharge on that one.
 993                 */
 994                fp = old_fp;
 995                err = -ENOMEM;
 996                goto out_err_free;
 997        }
 998
 999        fp->len = new_len;
1000
1001        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1002        err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1003        if (err)
 1004                /* The 2nd bpf_convert_filter() can fail only if it
 1005                 * fails to allocate memory; the remapping itself must
 1006                 * succeed. Note that at this time old_fp has already
 1007                 * been released by krealloc().
1008                 */
1009                goto out_err_free;
1010
1011        /* We are guaranteed to never error here with cBPF to eBPF
1012         * transitions, since there's no issue with type compatibility
1013         * checks on program arrays.
1014         */
1015        fp = bpf_prog_select_runtime(fp, &err);
1016
1017        kfree(old_prog);
1018        return fp;
1019
1020out_err_free:
1021        kfree(old_prog);
1022out_err:
1023        __bpf_prog_release(fp);
1024        return ERR_PTR(err);
1025}
1026
1027static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1028                                           bpf_aux_classic_check_t trans)
1029{
1030        int err;
1031
1032        fp->bpf_func = NULL;
1033        fp->jited = 0;
1034
1035        err = bpf_check_classic(fp->insns, fp->len);
1036        if (err) {
1037                __bpf_prog_release(fp);
1038                return ERR_PTR(err);
1039        }
1040
1041        /* There might be additional checks and transformations
 1042         * needed on classic filters, e.g. in the case of seccomp.
1043         */
1044        if (trans) {
1045                err = trans(fp->insns, fp->len);
1046                if (err) {
1047                        __bpf_prog_release(fp);
1048                        return ERR_PTR(err);
1049                }
1050        }
1051
 1052        /* Probe whether we can JIT compile the filter and, if so,
 1053         * do the compilation.
1054         */
1055        bpf_jit_compile(fp);
1056
1057        /* JIT compiler couldn't process this filter, so do the
1058         * internal BPF translation for the optimized interpreter.
1059         */
1060        if (!fp->jited)
1061                fp = bpf_migrate_filter(fp);
1062
1063        return fp;
1064}
1065
1066/**
1067 *      bpf_prog_create - create an unattached filter
1068 *      @pfp: the unattached filter that is created
1069 *      @fprog: the filter program
1070 *
1071 * Create a filter independent of any socket. We first run some
1072 * sanity checks on it to make sure it does not explode on us later.
1073 * If an error occurs or there is insufficient memory for the filter
1074 * a negative errno code is returned. On success the return is zero.
1075 */
1076int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1077{
1078        unsigned int fsize = bpf_classic_proglen(fprog);
1079        struct bpf_prog *fp;
1080
 1081        /* Make sure the new filter is there and of a valid size. */
1082        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1083                return -EINVAL;
1084
1085        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1086        if (!fp)
1087                return -ENOMEM;
1088
1089        memcpy(fp->insns, fprog->filter, fsize);
1090
1091        fp->len = fprog->len;
1092        /* Since unattached filters are not copied back to user
1093         * space through sk_get_filter(), we do not need to hold
 1094         * a copy here, and can spare ourselves the work.
1095         */
1096        fp->orig_prog = NULL;
1097
1098        /* bpf_prepare_filter() already takes care of freeing
1099         * memory in case something goes wrong.
1100         */
1101        fp = bpf_prepare_filter(fp, NULL);
1102        if (IS_ERR(fp))
1103                return PTR_ERR(fp);
1104
1105        *pfp = fp;
1106        return 0;
1107}
1108EXPORT_SYMBOL_GPL(bpf_prog_create);
1109
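/*
 * Editor's example (sketch, not part of the original file): an in-kernel
 * user of the unattached filter API, modelled on how drivers consume it.
 * The "accept everything" program returns the maximum length so the whole
 * packet is kept; the demo_* names are hypothetical, and BPF_PROG_RUN()
 * is the wrapper referred to in the sk_filter_trim_cap() kernel-doc above.
 */
static struct sock_filter demo_accept_all[] = {
	BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
};

static int demo_run_unattached(struct sk_buff *skb)
{
	struct sock_fprog_kern fprog = {
		.len	= ARRAY_SIZE(demo_accept_all),
		.filter	= demo_accept_all,
	};
	struct bpf_prog *prog;
	u32 verdict;
	int err;

	err = bpf_prog_create(&prog, &fprog);
	if (err)
		return err;

	verdict = BPF_PROG_RUN(prog, skb);	/* 0 means drop */
	bpf_prog_destroy(prog);

	return verdict ? 0 : -EPERM;
}
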
1110/**
1111 *      bpf_prog_create_from_user - create an unattached filter from user buffer
1112 *      @pfp: the unattached filter that is created
1113 *      @fprog: the filter program
1114 *      @trans: post-classic verifier transformation handler
1115 *      @save_orig: save classic BPF program
1116 *
1117 * This function effectively does the same as bpf_prog_create(), only
 1118 * that it builds up its insns buffer from a buffer provided by user space.
1119 * It also allows for passing a bpf_aux_classic_check_t handler.
1120 */
1121int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1122                              bpf_aux_classic_check_t trans, bool save_orig)
1123{
1124        unsigned int fsize = bpf_classic_proglen(fprog);
1125        struct bpf_prog *fp;
1126        int err;
1127
 1128        /* Make sure the new filter is there and of a valid size. */
1129        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1130                return -EINVAL;
1131
1132        fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1133        if (!fp)
1134                return -ENOMEM;
1135
1136        if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1137                __bpf_prog_free(fp);
1138                return -EFAULT;
1139        }
1140
1141        fp->len = fprog->len;
1142        fp->orig_prog = NULL;
1143
1144        if (save_orig) {
1145                err = bpf_prog_store_orig_filter(fp, fprog);
1146                if (err) {
1147                        __bpf_prog_free(fp);
1148                        return -ENOMEM;
1149                }
1150        }
1151
1152        /* bpf_prepare_filter() already takes care of freeing
1153         * memory in case something goes wrong.
1154         */
1155        fp = bpf_prepare_filter(fp, trans);
1156        if (IS_ERR(fp))
1157                return PTR_ERR(fp);
1158
1159        *pfp = fp;
1160        return 0;
1161}
1162EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
1163
1164void bpf_prog_destroy(struct bpf_prog *fp)
1165{
1166        __bpf_prog_release(fp);
1167}
1168EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1169
1170static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1171{
1172        struct sk_filter *fp, *old_fp;
1173
1174        fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1175        if (!fp)
1176                return -ENOMEM;
1177
1178        fp->prog = prog;
1179        atomic_set(&fp->refcnt, 0);
1180
1181        if (!sk_filter_charge(sk, fp)) {
1182                kfree(fp);
1183                return -ENOMEM;
1184        }
1185
1186        old_fp = rcu_dereference_protected(sk->sk_filter,
1187                                           lockdep_sock_is_held(sk));
1188        rcu_assign_pointer(sk->sk_filter, fp);
1189
1190        if (old_fp)
1191                sk_filter_uncharge(sk, old_fp);
1192
1193        return 0;
1194}
1195
1196static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
1197{
1198        struct bpf_prog *old_prog;
1199        int err;
1200
1201        if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1202                return -ENOMEM;
1203
1204        if (sk_unhashed(sk) && sk->sk_reuseport) {
1205                err = reuseport_alloc(sk);
1206                if (err)
1207                        return err;
1208        } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
1209                /* The socket wasn't bound with SO_REUSEPORT */
1210                return -EINVAL;
1211        }
1212
1213        old_prog = reuseport_attach_prog(sk, prog);
1214        if (old_prog)
1215                bpf_prog_destroy(old_prog);
1216
1217        return 0;
1218}
1219
1220static
1221struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1222{
1223        unsigned int fsize = bpf_classic_proglen(fprog);
1224        struct bpf_prog *prog;
1225        int err;
1226
1227        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1228                return ERR_PTR(-EPERM);
1229
 1230        /* Make sure the new filter is there and of a valid size. */
1231        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1232                return ERR_PTR(-EINVAL);
1233
1234        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1235        if (!prog)
1236                return ERR_PTR(-ENOMEM);
1237
1238        if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1239                __bpf_prog_free(prog);
1240                return ERR_PTR(-EFAULT);
1241        }
1242
1243        prog->len = fprog->len;
1244
1245        err = bpf_prog_store_orig_filter(prog, fprog);
1246        if (err) {
1247                __bpf_prog_free(prog);
1248                return ERR_PTR(-ENOMEM);
1249        }
1250
1251        /* bpf_prepare_filter() already takes care of freeing
1252         * memory in case something goes wrong.
1253         */
1254        return bpf_prepare_filter(prog, NULL);
1255}
1256
1257/**
1258 *      sk_attach_filter - attach a socket filter
1259 *      @fprog: the filter program
1260 *      @sk: the socket to use
1261 *
1262 * Attach the user's filter code. We first run some sanity checks on
1263 * it to make sure it does not explode on us later. If an error
1264 * occurs or there is insufficient memory for the filter a negative
1265 * errno code is returned. On success the return is zero.
1266 */
1267int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1268{
1269        struct bpf_prog *prog = __get_filter(fprog, sk);
1270        int err;
1271
1272        if (IS_ERR(prog))
1273                return PTR_ERR(prog);
1274
1275        err = __sk_attach_prog(prog, sk);
1276        if (err < 0) {
1277                __bpf_prog_release(prog);
1278                return err;
1279        }
1280
1281        return 0;
1282}
1283EXPORT_SYMBOL_GPL(sk_attach_filter);
1284
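/*
 * Editor's example (userspace sketch, not part of this kernel file): the
 * user-space side of sk_attach_filter(). SO_ATTACH_FILTER passes a
 * struct sock_fprog that __get_filter() above copies and prepares; the
 * demo_attach_cbpf() name is hypothetical.
 */
#include <sys/socket.h>
#include <linux/filter.h>
#include <stdio.h>

int demo_attach_cbpf(int sock_fd)
{
	struct sock_filter code[] = {
		/* accept every packet in full */
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
	};
	struct sock_fprog prog = {
		.len	= sizeof(code) / sizeof(code[0]),
		.filter	= code,
	};

	if (setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_FILTER,
		       &prog, sizeof(prog)) < 0) {
		perror("setsockopt(SO_ATTACH_FILTER)");
		return -1;
	}
	return 0;
}
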
1285int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1286{
1287        struct bpf_prog *prog = __get_filter(fprog, sk);
1288        int err;
1289
1290        if (IS_ERR(prog))
1291                return PTR_ERR(prog);
1292
1293        err = __reuseport_attach_prog(prog, sk);
1294        if (err < 0) {
1295                __bpf_prog_release(prog);
1296                return err;
1297        }
1298
1299        return 0;
1300}
1301
1302static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1303{
1304        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1305                return ERR_PTR(-EPERM);
1306
1307        return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1308}
1309
1310int sk_attach_bpf(u32 ufd, struct sock *sk)
1311{
1312        struct bpf_prog *prog = __get_bpf(ufd, sk);
1313        int err;
1314
1315        if (IS_ERR(prog))
1316                return PTR_ERR(prog);
1317
1318        err = __sk_attach_prog(prog, sk);
1319        if (err < 0) {
1320                bpf_prog_put(prog);
1321                return err;
1322        }
1323
1324        return 0;
1325}
1326
1327int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1328{
1329        struct bpf_prog *prog = __get_bpf(ufd, sk);
1330        int err;
1331
1332        if (IS_ERR(prog))
1333                return PTR_ERR(prog);
1334
1335        err = __reuseport_attach_prog(prog, sk);
1336        if (err < 0) {
1337                bpf_prog_put(prog);
1338                return err;
1339        }
1340
1341        return 0;
1342}
1343
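/*
 * Editor's example (userspace sketch, not part of this kernel file):
 * sk_attach_bpf() and sk_reuseport_attach_bpf() are reached by passing an
 * eBPF program fd (from bpf(BPF_PROG_LOAD, ...)) through SO_ATTACH_BPF or
 * SO_ATTACH_REUSEPORT_EBPF. Per __reuseport_attach_prog() above, the
 * reuseport variant needs SO_REUSEPORT enabled on a not-yet-hashed socket
 * or an existing reuseport group. The demo_attach_ebpf() name is
 * hypothetical.
 */
#include <sys/socket.h>
#include <stdio.h>

int demo_attach_ebpf(int sock_fd, int bpf_prog_fd, int for_reuseport)
{
	int opt = for_reuseport ? SO_ATTACH_REUSEPORT_EBPF : SO_ATTACH_BPF;

	if (setsockopt(sock_fd, SOL_SOCKET, opt,
		       &bpf_prog_fd, sizeof(bpf_prog_fd)) < 0) {
		perror("setsockopt(SO_ATTACH_BPF)");
		return -1;
	}
	return 0;
}
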
1344struct bpf_scratchpad {
1345        union {
1346                __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1347                u8     buff[MAX_BPF_STACK];
1348        };
1349};
1350
1351static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1352
1353static inline int bpf_try_make_writable(struct sk_buff *skb,
1354                                        unsigned int write_len)
1355{
1356        int err;
1357
1358        err = skb_ensure_writable(skb, write_len);
1359        bpf_compute_data_end(skb);
1360
1361        return err;
1362}
1363
1364static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1365{
1366        if (skb_at_tc_ingress(skb))
1367                skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1368}
1369
1370static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1371{
1372        if (skb_at_tc_ingress(skb))
1373                skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1374}
1375
1376static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
1377{
1378        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1379        unsigned int offset = (unsigned int) r2;
1380        void *from = (void *) (long) r3;
1381        unsigned int len = (unsigned int) r4;
1382        void *ptr;
1383
1384        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1385                return -EINVAL;
1386        if (unlikely(offset > 0xffff))
1387                return -EFAULT;
1388        if (unlikely(bpf_try_make_writable(skb, offset + len)))
1389                return -EFAULT;
1390
1391        ptr = skb->data + offset;
1392        if (flags & BPF_F_RECOMPUTE_CSUM)
1393                __skb_postpull_rcsum(skb, ptr, len, offset);
1394
1395        memcpy(ptr, from, len);
1396
1397        if (flags & BPF_F_RECOMPUTE_CSUM)
1398                __skb_postpush_rcsum(skb, ptr, len, offset);
1399        if (flags & BPF_F_INVALIDATE_HASH)
1400                skb_clear_hash(skb);
1401
1402        return 0;
1403}
1404
1405static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1406        .func           = bpf_skb_store_bytes,
1407        .gpl_only       = false,
1408        .ret_type       = RET_INTEGER,
1409        .arg1_type      = ARG_PTR_TO_CTX,
1410        .arg2_type      = ARG_ANYTHING,
1411        .arg3_type      = ARG_PTR_TO_STACK,
1412        .arg4_type      = ARG_CONST_STACK_SIZE,
1413        .arg5_type      = ARG_ANYTHING,
1414};
1415
1416static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1417{
1418        const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
1419        unsigned int offset = (unsigned int) r2;
1420        void *to = (void *)(unsigned long) r3;
1421        unsigned int len = (unsigned int) r4;
1422        void *ptr;
1423
1424        if (unlikely(offset > 0xffff))
1425                goto err_clear;
1426
1427        ptr = skb_header_pointer(skb, offset, len, to);
1428        if (unlikely(!ptr))
1429                goto err_clear;
1430        if (ptr != to)
1431                memcpy(to, ptr, len);
1432
1433        return 0;
1434err_clear:
1435        memset(to, 0, len);
1436        return -EFAULT;
1437}
1438
1439static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1440        .func           = bpf_skb_load_bytes,
1441        .gpl_only       = false,
1442        .ret_type       = RET_INTEGER,
1443        .arg1_type      = ARG_PTR_TO_CTX,
1444        .arg2_type      = ARG_ANYTHING,
1445        .arg3_type      = ARG_PTR_TO_RAW_STACK,
1446        .arg4_type      = ARG_CONST_STACK_SIZE,
1447};
1448
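/*
 * Editor's example (restricted-C eBPF sketch, not part of this kernel
 * file): how a socket filter program calls the helper backed by
 * bpf_skb_load_bytes() above. The helper binding follows the samples/bpf
 * convention; the section name, the offsets (Ethernet + IPv4 assumed) and
 * the demo_keep_udp name are assumptions for illustration.
 */
#include <linux/bpf.h>

static int (*skb_load_bytes)(void *ctx, int off, void *to, int len) =
	(void *) BPF_FUNC_skb_load_bytes;

__attribute__((section("socket"), used))
int demo_keep_udp(struct __sk_buff *skb)
{
	unsigned char proto;

	/* IPv4 protocol field: 14 bytes of Ethernet + 9 into the IP header */
	if (skb_load_bytes(skb, 14 + 9, &proto, sizeof(proto)) < 0)
		return 0;			/* drop on error */

	return proto == 17 ? -1 : 0;		/* keep UDP in full, drop the rest */
}
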
1449static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
1450{
1451        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1452        unsigned int offset = (unsigned int) r2;
1453        __sum16 *ptr;
1454
1455        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1456                return -EINVAL;
1457        if (unlikely(offset > 0xffff || offset & 1))
1458                return -EFAULT;
1459        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1460                return -EFAULT;
1461
1462        ptr = (__sum16 *)(skb->data + offset);
1463        switch (flags & BPF_F_HDR_FIELD_MASK) {
1464        case 0:
1465                if (unlikely(from != 0))
1466                        return -EINVAL;
1467
1468                csum_replace_by_diff(ptr, to);
1469                break;
1470        case 2:
1471                csum_replace2(ptr, from, to);
1472                break;
1473        case 4:
1474                csum_replace4(ptr, from, to);
1475                break;
1476        default:
1477                return -EINVAL;
1478        }
1479
1480        return 0;
1481}
1482
1483static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1484        .func           = bpf_l3_csum_replace,
1485        .gpl_only       = false,
1486        .ret_type       = RET_INTEGER,
1487        .arg1_type      = ARG_PTR_TO_CTX,
1488        .arg2_type      = ARG_ANYTHING,
1489        .arg3_type      = ARG_ANYTHING,
1490        .arg4_type      = ARG_ANYTHING,
1491        .arg5_type      = ARG_ANYTHING,
1492};
1493
1494static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
1495{
1496        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1497        bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1498        bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1499        unsigned int offset = (unsigned int) r2;
1500        __sum16 *ptr;
1501
1502        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
1503                               BPF_F_HDR_FIELD_MASK)))
1504                return -EINVAL;
1505        if (unlikely(offset > 0xffff || offset & 1))
1506                return -EFAULT;
1507        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1508                return -EFAULT;
1509
1510        ptr = (__sum16 *)(skb->data + offset);
1511        if (is_mmzero && !*ptr)
1512                return 0;
1513
1514        switch (flags & BPF_F_HDR_FIELD_MASK) {
1515        case 0:
1516                if (unlikely(from != 0))
1517                        return -EINVAL;
1518
1519                inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1520                break;
1521        case 2:
1522                inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1523                break;
1524        case 4:
1525                inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1526                break;
1527        default:
1528                return -EINVAL;
1529        }
1530
1531        if (is_mmzero && !*ptr)
1532                *ptr = CSUM_MANGLED_0;
1533        return 0;
1534}
1535
1536static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1537        .func           = bpf_l4_csum_replace,
1538        .gpl_only       = false,
1539        .ret_type       = RET_INTEGER,
1540        .arg1_type      = ARG_PTR_TO_CTX,
1541        .arg2_type      = ARG_ANYTHING,
1542        .arg3_type      = ARG_ANYTHING,
1543        .arg4_type      = ARG_ANYTHING,
1544        .arg5_type      = ARG_ANYTHING,
1545};
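
/* Example usage (illustrative sketch; offsets and addresses are made up,
 * following the pattern of the tc/BPF samples): a NAT-style rewrite of
 * the IPv4 source address fixes up both the IP and the TCP checksum.
 * The width of the changed field is encoded in the lower bits of the
 * flags (BPF_F_HDR_FIELD_MASK), and BPF_F_PSEUDO_HDR signals that the
 * change also affects the TCP/UDP pseudo header:
 *
 *	__be32 old_ip, new_ip = __constant_htonl(0x0a000001);
 *	int ip_src_off = ETH_HLEN + offsetof(struct iphdr, saddr);
 *	int ip_csum_off = ETH_HLEN + offsetof(struct iphdr, check);
 *	int tcp_csum_off = ETH_HLEN + sizeof(struct iphdr) +
 *			   offsetof(struct tcphdr, check);
 *
 *	bpf_skb_load_bytes(skb, ip_src_off, &old_ip, sizeof(old_ip));
 *	bpf_l4_csum_replace(skb, tcp_csum_off, old_ip, new_ip,
 *			    BPF_F_PSEUDO_HDR | sizeof(new_ip));
 *	bpf_l3_csum_replace(skb, ip_csum_off, old_ip, new_ip,
 *			    sizeof(new_ip));
 *	bpf_skb_store_bytes(skb, ip_src_off, &new_ip, sizeof(new_ip), 0);
 */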
1546
1547static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
1548{
1549        struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1550        u64 diff_size = from_size + to_size;
1551        __be32 *from = (__be32 *) (long) r1;
1552        __be32 *to   = (__be32 *) (long) r3;
1553        int i, j = 0;
1554
1555        /* This is quite flexible; some examples:
1556         *
1557         * from_size == 0, to_size > 0,  seed := csum --> pushing data
1558         * from_size > 0,  to_size == 0, seed := csum --> pulling data
1559         * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
1560         *
1561         * Even for diffing, from_size and to_size don't need to be equal.
1562         */
1563        if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
1564                     diff_size > sizeof(sp->diff)))
1565                return -EINVAL;
1566
1567        for (i = 0; i < from_size / sizeof(__be32); i++, j++)
1568                sp->diff[j] = ~from[i];
1569        for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
1570                sp->diff[j] = to[i];
1571
1572        return csum_partial(sp->diff, diff_size, seed);
1573}
1574
1575static const struct bpf_func_proto bpf_csum_diff_proto = {
1576        .func           = bpf_csum_diff,
1577        .gpl_only       = false,
1578        .ret_type       = RET_INTEGER,
1579        .arg1_type      = ARG_PTR_TO_STACK,
1580        .arg2_type      = ARG_CONST_STACK_SIZE_OR_ZERO,
1581        .arg3_type      = ARG_PTR_TO_STACK,
1582        .arg4_type      = ARG_CONST_STACK_SIZE_OR_ZERO,
1583        .arg5_type      = ARG_ANYTHING,
1584};
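
/* Example usage (illustrative sketch; names are made up): when a larger
 * block is rewritten at once, e.g. a full IPv6 address, a program can
 * compute the checksum delta with bpf_csum_diff() and apply it through
 * the size-0 case of bpf_l4_csum_replace() above:
 *
 *	__wsum diff;
 *
 *	diff = bpf_csum_diff(old_addr, sizeof(old_addr),
 *			     new_addr, sizeof(new_addr), 0);
 *	bpf_l4_csum_replace(skb, l4_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 *
 * Here old_addr/new_addr are assumed to be __be32[4] buffers holding the
 * original and the rewritten address, and l4_csum_off the offset of the
 * L4 checksum field; passing 0 as the "from" value selects the
 * replace-by-diff path (case 0 above).
 */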
1585
1586static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
1587{
1588        return dev_forward_skb(dev, skb);
1589}
1590
1591static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
1592{
1593        int ret;
1594
1595        if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
1596                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
1597                kfree_skb(skb);
1598                return -ENETDOWN;
1599        }
1600
1601        skb->dev = dev;
1602
1603        __this_cpu_inc(xmit_recursion);
1604        ret = dev_queue_xmit(skb);
1605        __this_cpu_dec(xmit_recursion);
1606
1607        return ret;
1608}
1609
1610static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
1611{
1612        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1613        struct net_device *dev;
1614
1615        if (unlikely(flags & ~(BPF_F_INGRESS)))
1616                return -EINVAL;
1617
1618        dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
1619        if (unlikely(!dev))
1620                return -EINVAL;
1621
1622        skb = skb_clone(skb, GFP_ATOMIC);
1623        if (unlikely(!skb))
1624                return -ENOMEM;
1625
1626        bpf_push_mac_rcsum(skb);
1627
1628        return flags & BPF_F_INGRESS ?
1629               __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1630}
1631
1632static const struct bpf_func_proto bpf_clone_redirect_proto = {
1633        .func           = bpf_clone_redirect,
1634        .gpl_only       = false,
1635        .ret_type       = RET_INTEGER,
1636        .arg1_type      = ARG_PTR_TO_CTX,
1637        .arg2_type      = ARG_ANYTHING,
1638        .arg3_type      = ARG_ANYTHING,
1639};
1640
1641struct redirect_info {
1642        u32 ifindex;
1643        u32 flags;
1644};
1645
1646static DEFINE_PER_CPU(struct redirect_info, redirect_info);
1647
1648static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
1649{
1650        struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1651
1652        if (unlikely(flags & ~(BPF_F_INGRESS)))
1653                return TC_ACT_SHOT;
1654
1655        ri->ifindex = ifindex;
1656        ri->flags = flags;
1657
1658        return TC_ACT_REDIRECT;
1659}
1660
1661int skb_do_redirect(struct sk_buff *skb)
1662{
1663        struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1664        struct net_device *dev;
1665
1666        dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
1667        ri->ifindex = 0;
1668        if (unlikely(!dev)) {
1669                kfree_skb(skb);
1670                return -EINVAL;
1671        }
1672
1673        bpf_push_mac_rcsum(skb);
1674
1675        return ri->flags & BPF_F_INGRESS ?
1676               __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1677}
1678
1679static const struct bpf_func_proto bpf_redirect_proto = {
1680        .func           = bpf_redirect,
1681        .gpl_only       = false,
1682        .ret_type       = RET_INTEGER,
1683        .arg1_type      = ARG_ANYTHING,
1684        .arg2_type      = ARG_ANYTHING,
1685};
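
/* Note on the two redirect flavours (illustrative; ifindex below is a
 * made-up example value): bpf_clone_redirect() clones and forwards the
 * packet right away from within the helper, while bpf_redirect() only
 * records the target in the per-CPU redirect_info and relies on the
 * caller honouring the TC_ACT_REDIRECT return code (see
 * skb_do_redirect() above), which avoids the clone. From a tc/BPF
 * program:
 *
 *	return bpf_redirect(ifindex, 0);
 *
 * redirects to the egress path of ifindex, whereas
 *
 *	return bpf_redirect(ifindex, BPF_F_INGRESS);
 *
 * injects the packet into its ingress path.
 */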
1686
1687static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1688{
1689        return task_get_classid((struct sk_buff *) (unsigned long) r1);
1690}
1691
1692static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
1693        .func           = bpf_get_cgroup_classid,
1694        .gpl_only       = false,
1695        .ret_type       = RET_INTEGER,
1696        .arg1_type      = ARG_PTR_TO_CTX,
1697};
1698
1699static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1700{
1701        return dst_tclassid((struct sk_buff *) (unsigned long) r1);
1702}
1703
1704static const struct bpf_func_proto bpf_get_route_realm_proto = {
1705        .func           = bpf_get_route_realm,
1706        .gpl_only       = false,
1707        .ret_type       = RET_INTEGER,
1708        .arg1_type      = ARG_PTR_TO_CTX,
1709};
1710
1711static u64 bpf_get_hash_recalc(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1712{
1713        /* If skb_clear_hash() was called due to mangling, we can
1714         * trigger a SW recalculation here. Later accesses to the
1715         * hash can then use skb->hash via the context directly
1716         * instead of calling this helper again.
1717         */
1718        return skb_get_hash((struct sk_buff *) (unsigned long) r1);
1719}
1720
1721static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
1722        .func           = bpf_get_hash_recalc,
1723        .gpl_only       = false,
1724        .ret_type       = RET_INTEGER,
1725        .arg1_type      = ARG_PTR_TO_CTX,
1726};
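
/* Example usage (illustrative sketch): after mangling headers, e.g. via
 * bpf_skb_store_bytes() with BPF_F_INVALIDATE_HASH, a program can force
 * an immediate software recalculation:
 *
 *	__u32 hash = bpf_get_hash_recalc(skb);
 *
 * Subsequent reads of the hash member of struct __sk_buff then see the
 * recomputed value without another helper call.
 */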
1727
1728static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
1729{
1730        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1731        __be16 vlan_proto = (__force __be16) r2;
1732        int ret;
1733
1734        if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
1735                     vlan_proto != htons(ETH_P_8021AD)))
1736                vlan_proto = htons(ETH_P_8021Q);
1737
1738        bpf_push_mac_rcsum(skb);
1739        ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
1740        bpf_pull_mac_rcsum(skb);
1741
1742        bpf_compute_data_end(skb);
1743        return ret;
1744}
1745
1746const struct bpf_func_proto bpf_skb_vlan_push_proto = {
1747        .func           = bpf_skb_vlan_push,
1748        .gpl_only       = false,
1749        .ret_type       = RET_INTEGER,
1750        .arg1_type      = ARG_PTR_TO_CTX,
1751        .arg2_type      = ARG_ANYTHING,
1752        .arg3_type      = ARG_ANYTHING,
1753};
1754EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
1755
1756static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1757{
1758        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1759        int ret;
1760
1761        bpf_push_mac_rcsum(skb);
1762        ret = skb_vlan_pop(skb);
1763        bpf_pull_mac_rcsum(skb);
1764
1765        bpf_compute_data_end(skb);
1766        return ret;
1767}
1768
1769const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
1770        .func           = bpf_skb_vlan_pop,
1771        .gpl_only       = false,
1772        .ret_type       = RET_INTEGER,
1773        .arg1_type      = ARG_PTR_TO_CTX,
1774};
1775EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
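
/* Example usage (illustrative sketch; the TCI value is made up):
 *
 *	bpf_skb_vlan_push(skb, __constant_htons(ETH_P_8021Q), 100);
 *	...
 *	bpf_skb_vlan_pop(skb);
 *
 * Both helpers may change the skb data and are therefore listed in
 * bpf_helper_changes_skb_data() below; packet pointers derived from the
 * context before the call must be reloaded afterwards.
 */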
1776
1777static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
1778{
1779        /* Caller already did skb_cow() with len as headroom,
1780         * so no need to do it here.
1781         */
1782        skb_push(skb, len);
1783        memmove(skb->data, skb->data + len, off);
1784        memset(skb->data + off, 0, len);
1785
1786        /* No skb_postpush_rcsum(skb, skb->data + off, len)
1787         * is needed here, since summing over a zeroed block
1788         * does not change the skb->csum result for checksum
1789         * complete.
1790         */
1791        return 0;
1792}
1793
1794static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
1795{
1796        /* skb_ensure_writable() is not needed here, as we're
1797         * already working on an uncloned skb.
1798         */
1799        if (unlikely(!pskb_may_pull(skb, off + len)))
1800                return -ENOMEM;
1801
1802        skb_postpull_rcsum(skb, skb->data + off, len);
1803        memmove(skb->data + len, skb->data, off);
1804        __skb_pull(skb, len);
1805
1806        return 0;
1807}
1808
1809static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
1810{
1811        bool trans_same = skb->transport_header == skb->network_header;
1812        int ret;
1813
1814        /* There's no need for a __skb_push()/__skb_pull() pair to
1815         * get to the start of the mac header, as eBPF programs are
1816         * guaranteed to always start from there.
1817         */
1818        ret = bpf_skb_generic_push(skb, off, len);
1819        if (likely(!ret)) {
1820                skb->mac_header -= len;
1821                skb->network_header -= len;
1822                if (trans_same)
1823                        skb->transport_header = skb->network_header;
1824        }
1825
1826        return ret;
1827}
1828
1829static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
1830{
1831        bool trans_same = skb->transport_header == skb->network_header;
1832        int ret;
1833
1834        /* Same here, __skb_push()/__skb_pull() pair not needed. */
1835        ret = bpf_skb_generic_pop(skb, off, len);
1836        if (likely(!ret)) {
1837                skb->mac_header += len;
1838                skb->network_header += len;
1839                if (trans_same)
1840                        skb->transport_header = skb->network_header;
1841        }
1842
1843        return ret;
1844}
1845
1846static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
1847{
1848        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1849        u32 off = skb->network_header - skb->mac_header;
1850        int ret;
1851
1852        ret = skb_cow(skb, len_diff);
1853        if (unlikely(ret < 0))
1854                return ret;
1855
1856        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
1857        if (unlikely(ret < 0))
1858                return ret;
1859
1860        if (skb_is_gso(skb)) {
1861                /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
1862                 * be changed into SKB_GSO_TCPV6.
1863                 */
1864                if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
1865                        skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
1866                        skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
1867                }
1868
1869                /* Due to the larger IPv6 header, the MSS needs to be lowered. */
1870                skb_shinfo(skb)->gso_size -= len_diff;
1871                /* Header must be checked, and gso_segs recomputed. */
1872                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1873                skb_shinfo(skb)->gso_segs = 0;
1874        }
1875
1876        skb->protocol = htons(ETH_P_IPV6);
1877        skb_clear_hash(skb);
1878
1879        return 0;
1880}
1881
1882static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
1883{
1884        const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1885        u32 off = skb->network_header - skb->mac_header;
1886        int ret;
1887
1888        ret = skb_unclone(skb, GFP_ATOMIC);
1889        if (unlikely(ret < 0))
1890                return ret;
1891
1892        ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
1893        if (unlikely(ret < 0))
1894                return ret;
1895
1896        if (skb_is_gso(skb)) {
1897                /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
1898                 * be changed into SKB_GSO_TCPV4.
1899                 */
1900                if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
1901                        skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
1902                        skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
1903                }
1904
1905                /* Due to the smaller IPv4 header, the MSS can be raised. */
1906                skb_shinfo(skb)->gso_size += len_diff;
1907                /* Header must be checked, and gso_segs recomputed. */
1908                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1909                skb_shinfo(skb)->gso_segs = 0;
1910        }
1911
1912        skb->protocol = htons(ETH_P_IP);
1913        skb_clear_hash(skb);
1914
1915        return 0;
1916}
1917
1918static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
1919{
1920        __be16 from_proto = skb->protocol;
1921
1922        if (from_proto == htons(ETH_P_IP) &&
1923              to_proto == htons(ETH_P_IPV6))
1924                return bpf_skb_proto_4_to_6(skb);
1925
1926        if (from_proto == htons(ETH_P_IPV6) &&
1927              to_proto == htons(ETH_P_IP))
1928                return bpf_skb_proto_6_to_4(skb);
1929
1930        return -ENOTSUPP;
1931}
1932
1933static u64 bpf_skb_change_proto(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
1934{
1935        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1936        __be16 proto = (__force __be16) r2;
1937        int ret;
1938
1939        if (unlikely(flags))
1940                return -EINVAL;
1941
1942        /* The general idea is that this helper does the basic groundwork
1943         * needed for changing the protocol, and the eBPF program fills in
1944         * the rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
1945         * and other helpers, rather than passing a raw buffer here.
1946         *
1947         * The rationale is to keep this minimal and without a need to
1948         * deal with raw packet data. E.g. even if we passed buffers
1949         * here, the program would still need to call the
1950         * bpf_lX_csum_replace() helpers anyway. Plus, this way we also
1951         * keep a separation of concerns, since e.g. bpf_skb_store_bytes()
1952         * should only take care of stores.
1953         *
1954         * Currently, additional options and extension header space are
1955         * not supported, but the flags argument is reserved so that this
1956         * can be adapted later. For offloads, the packet is marked as
1957         * dodgy, so that its headers need to be verified first.
1958         */
1959        ret = bpf_skb_proto_xlat(skb, proto);
1960        bpf_compute_data_end(skb);
1961        return ret;
1962}
1963
1964static const struct bpf_func_proto bpf_skb_change_proto_proto = {
1965        .func           = bpf_skb_change_proto,
1966        .gpl_only       = false,
1967        .ret_type       = RET_INTEGER,
1968        .arg1_type      = ARG_PTR_TO_CTX,
1969        .arg2_type      = ARG_ANYTHING,
1970        .arg3_type      = ARG_ANYTHING,
1971};
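
/* Example usage (illustrative sketch): a NAT64-style program would
 * combine the helper above with the store and checksum helpers:
 *
 *	if (bpf_skb_change_proto(skb, __constant_htons(ETH_P_IPV6), 0) < 0)
 *		return TC_ACT_SHOT;
 *
 * after which the program re-derives its packet pointers, writes the new
 * IPv6 header via bpf_skb_store_bytes() and fixes up the L4 checksum via
 * bpf_csum_diff()/bpf_l4_csum_replace(); the helper itself only resizes
 * and re-positions the headers as described in the comment above.
 */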
1972
1973static u64 bpf_skb_change_type(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
1974{
1975        struct sk_buff *skb = (struct sk_buff *) (long) r1;
1976        u32 pkt_type = r2;
1977
1978        /* We only allow a restricted subset to be changed for now. */
1979        if (unlikely(skb->pkt_type > PACKET_OTHERHOST ||
1980                     pkt_type > PACKET_OTHERHOST))
1981                return -EINVAL;
1982
1983        skb->pkt_type = pkt_type;
1984        return 0;
1985}
1986
1987static const struct bpf_func_proto bpf_skb_change_type_proto = {
1988        .func           = bpf_skb_change_type,
1989        .gpl_only       = false,
1990        .ret_type       = RET_INTEGER,
1991        .arg1_type      = ARG_PTR_TO_CTX,
1992        .arg2_type      = ARG_ANYTHING,
1993};
1994
1995bool bpf_helper_changes_skb_data(void *func)
1996{
1997        if (func == bpf_skb_vlan_push)
1998                return true;
1999        if (func == bpf_skb_vlan_pop)
2000                return true;
2001        if (func == bpf_skb_store_bytes)
2002                return true;
2003        if (func == bpf_skb_change_proto)
2004                return true;
2005        if (func == bpf_l3_csum_replace)
2006                return true;
2007        if (func == bpf_l4_csum_replace)
2008                return true;
2009
2010        return false;
2011}
2012
2013static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
2014                                  unsigned long off, unsigned long len)
2015{
2016        void *ptr = skb_header_pointer(skb, off, len, dst_buff);
2017
2018        if (unlikely(!ptr))
2019                return len;
2020        if (ptr != dst_buff)
2021                memcpy(dst_buff, ptr, len);
2022
2023        return 0;
2024}
2025
2026static u64 bpf_skb_event_output(u64 r1, u64 r2, u64 flags, u64 r4,
2027                                u64 meta_size)
2028{
2029        struct sk_buff *skb = (struct sk_buff *)(long) r1;
2030        struct bpf_map *map = (struct bpf_map *)(long) r2;
2031        u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2032        void *meta = (void *)(long) r4;
2033
2034        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2035                return -EINVAL;
2036        if (unlikely(skb_size > skb->len))
2037                return -EFAULT;
2038
2039        return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
2040                                bpf_skb_copy);
2041}
2042
2043static const struct bpf_func_proto bpf_skb_event_output_proto = {
2044        .func           = bpf_skb_event_output,
2045        .gpl_only       = true,
2046        .ret_type       = RET_INTEGER,
2047        .arg1_type      = ARG_PTR_TO_CTX,
2048        .arg2_type      = ARG_CONST_MAP_PTR,
2049        .arg3_type      = ARG_ANYTHING,
2050        .arg4_type      = ARG_PTR_TO_STACK,
2051        .arg5_type      = ARG_CONST_STACK_SIZE,
2052};
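
/* Example usage (illustrative sketch; "events" and struct meta_t are
 * made-up names): sampling custom metadata plus the first 64 bytes of
 * the packet into a BPF_MAP_TYPE_PERF_EVENT_ARRAY map:
 *
 *	struct meta_t md = { .cookie = 0xcafe };
 *
 *	bpf_perf_event_output(skb, &events,
 *			      BPF_F_CURRENT_CPU | (64ULL << 32),
 *			      &md, sizeof(md));
 *
 * The upper bits of the flags (BPF_F_CTXLEN_MASK) carry the requested
 * packet length, which must not exceed skb->len as checked above.
 */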
2053
2054static unsigned short bpf_tunnel_key_af(u64 flags)
2055{
2056        return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
2057}
2058
2059static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
2060{
2061        struct sk_buff *skb = (struct sk_buff *) (long) r1;
2062        struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
2063        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2064        u8 compat[sizeof(struct bpf_tunnel_key)];
2065        void *to_orig = to;
2066        int err;
2067
2068        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
2069                err = -EINVAL;
2070                goto err_clear;
2071        }
2072        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
2073                err = -EPROTO;
2074                goto err_clear;
2075        }
2076        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2077                err = -EINVAL;
2078                switch (size) {
2079                case offsetof(struct bpf_tunnel_key, tunnel_label):
2080                case offsetof(struct bpf_tunnel_key, tunnel_ext):
2081                        goto set_compat;
2082                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2083                        /* Fixup deprecated structure layouts here, so we have
2084                         * a common path later on.
2085                         */
2086                        if (ip_tunnel_info_af(info) != AF_INET)
2087                                goto err_clear;
2088set_compat:
2089                        to = (struct bpf_tunnel_key *)compat;
2090                        break;
2091                default:
2092                        goto err_clear;
2093                }
2094        }
2095
2096        to->tunnel_id = be64_to_cpu(info->key.tun_id);
2097        to->tunnel_tos = info->key.tos;
2098        to->tunnel_ttl = info->key.ttl;
2099
2100        if (flags & BPF_F_TUNINFO_IPV6) {
2101                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
2102                       sizeof(to->remote_ipv6));
2103                to->tunnel_label = be32_to_cpu(info->key.label);
2104        } else {
2105                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
2106        }
2107
2108        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
2109                memcpy(to_orig, to, size);
2110
2111        return 0;
2112err_clear:
2113        memset(to_orig, 0, size);
2114        return err;
2115}
2116
2117static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2118        .func           = bpf_skb_get_tunnel_key,
2119        .gpl_only       = false,
2120        .ret_type       = RET_INTEGER,
2121        .arg1_type      = ARG_PTR_TO_CTX,
2122        .arg2_type      = ARG_PTR_TO_RAW_STACK,
2123        .arg3_type      = ARG_CONST_STACK_SIZE,
2124        .arg4_type      = ARG_ANYTHING,
2125};
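
/* Example usage (illustrative sketch; the tunnel id is made up): reading
 * the receive-side metadata of a collect_md tunnel device from a tc/BPF
 * program attached to its ingress:
 *
 *	struct bpf_tunnel_key key;
 *
 *	if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) < 0)
 *		return TC_ACT_OK;
 *	if (key.tunnel_id == 42)
 *		return TC_ACT_SHOT;
 *
 * The compat handling above keeps programs working that were compiled
 * against the older, shorter struct bpf_tunnel_key layouts.
 */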
2126
2127static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
2128{
2129        struct sk_buff *skb = (struct sk_buff *) (long) r1;
2130        u8 *to = (u8 *) (long) r2;
2131        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2132        int err;
2133
2134        if (unlikely(!info ||
2135                     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
2136                err = -ENOENT;
2137                goto err_clear;
2138        }
2139        if (unlikely(size < info->options_len)) {
2140                err = -ENOMEM;
2141                goto err_clear;
2142        }
2143
2144        ip_tunnel_info_opts_get(to, info);
2145        if (size > info->options_len)
2146                memset(to + info->options_len, 0, size - info->options_len);
2147
2148        return info->options_len;
2149err_clear:
2150        memset(to, 0, size);
2151        return err;
2152}
2153
2154static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2155        .func           = bpf_skb_get_tunnel_opt,
2156        .gpl_only       = false,
2157        .ret_type       = RET_INTEGER,
2158        .arg1_type      = ARG_PTR_TO_CTX,
2159        .arg2_type      = ARG_PTR_TO_RAW_STACK,
2160        .arg3_type      = ARG_CONST_STACK_SIZE,
2161};
2162
2163static struct metadata_dst __percpu *md_dst;
2164
2165static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
2166{
2167        struct sk_buff *skb = (struct sk_buff *) (long) r1;
2168        struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
2169        struct metadata_dst *md = this_cpu_ptr(md_dst);
2170        u8 compat[sizeof(struct bpf_tunnel_key)];
2171        struct ip_tunnel_info *info;
2172
2173        if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
2174                               BPF_F_DONT_FRAGMENT)))
2175                return -EINVAL;
2176        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2177                switch (size) {
2178                case offsetof(struct bpf_tunnel_key, tunnel_label):
2179                case offsetof(struct bpf_tunnel_key, tunnel_ext):
2180                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2181                        /* Fixup deprecated structure layouts here, so we have
2182                         * a common path later on.
2183                         */
2184                        memcpy(compat, from, size);
2185                        memset(compat + size, 0, sizeof(compat) - size);
2186                        from = (struct bpf_tunnel_key *)compat;
2187                        break;
2188                default:
2189                        return -EINVAL;
2190                }
2191        }
2192        if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
2193                     from->tunnel_ext))
2194                return -EINVAL;
2195
2196        skb_dst_drop(skb);
2197        dst_hold((struct dst_entry *) md);
2198        skb_dst_set(skb, (struct dst_entry *) md);
2199
2200        info = &md->u.tun_info;
2201        info->mode = IP_TUNNEL_INFO_TX;
2202
2203        info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
2204        if (flags & BPF_F_DONT_FRAGMENT)
2205                info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
2206
2207        info->key.tun_id = cpu_to_be64(from->tunnel_id);
2208        info->key.tos = from->tunnel_tos;
2209        info->key.ttl = from->tunnel_ttl;
2210
2211        if (flags & BPF_F_TUNINFO_IPV6) {
2212                info->mode |= IP_TUNNEL_INFO_IPV6;
2213                memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
2214                       sizeof(from->remote_ipv6));
2215                info->key.label = cpu_to_be32(from->tunnel_label) &
2216                                  IPV6_FLOWLABEL_MASK;
2217        } else {
2218                info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
2219                if (flags & BPF_F_ZERO_CSUM_TX)
2220                        info->key.tun_flags &= ~TUNNEL_CSUM;
2221        }
2222
2223        return 0;
2224}
2225
2226static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2227        .func           = bpf_skb_set_tunnel_key,
2228        .gpl_only       = false,
2229        .ret_type       = RET_INTEGER,
2230        .arg1_type      = ARG_PTR_TO_CTX,
2231        .arg2_type      = ARG_PTR_TO_STACK,
2232        .arg3_type      = ARG_CONST_STACK_SIZE,
2233        .arg4_type      = ARG_ANYTHING,
2234};
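
/* Example usage (illustrative sketch; id and address are made up):
 * setting transmit-side metadata before the packet is handed to a
 * collect_md tunnel device:
 *
 *	struct bpf_tunnel_key key = {};
 *
 *	key.tunnel_id = 42;
 *	key.remote_ipv4 = 0xac100164;
 *	key.tunnel_ttl = 64;
 *
 *	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *				   BPF_F_ZERO_CSUM_TX) < 0)
 *		return TC_ACT_SHOT;
 *
 * As can be seen above, the helper attaches the per-CPU metadata_dst to
 * the skb, which the tunnel driver then consumes on transmit;
 * remote_ipv4 is expected in host byte order, see the cpu_to_be32()
 * conversion.
 */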
2235
2236static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
2237{
2238        struct sk_buff *skb = (struct sk_buff *) (long) r1;
2239        u8 *from = (u8 *) (long) r2;
2240        struct ip_tunnel_info *info = skb_tunnel_info(skb);
2241        const struct metadata_dst *md = this_cpu_ptr(md_dst);
2242
2243        if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
2244                return -EINVAL;
2245        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
2246                return -ENOMEM;
2247
2248        ip_tunnel_info_opts_set(info, from, size);
2249
2250        return 0;
2251}
2252
2253static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2254        .func           = bpf_skb_set_tunnel_opt,
2255        .gpl_only       = false,
2256        .ret_type       = RET_INTEGER,
2257        .arg1_type      = ARG_PTR_TO_CTX,
2258        .arg2_type      = ARG_PTR_TO_STACK,
2259        .arg3_type      = ARG_CONST_STACK_SIZE,
2260};
2261
2262static const struct bpf_func_proto *
2263bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
2264{
2265        if (!md_dst) {
2266        /* A race is not possible here, since this is called from
2267         * the verifier, which holds the verifier mutex.
2268         */
2269                md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
2270                                                   GFP_KERNEL);
2271                if (!md_dst)
2272                        return NULL;
2273        }
2274
2275        switch (which) {
2276        case BPF_FUNC_skb_set_tunnel_key:
2277                return &bpf_skb_set_tunnel_key_proto;
2278        case BPF_FUNC_skb_set_tunnel_opt:
2279                return &bpf_skb_set_tunnel_opt_proto;
2280        default:
2281                return NULL;
2282        }
2283}
2284
2285#ifdef CONFIG_SOCK_CGROUP_DATA
2286static u64 bpf_skb_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
2287{
2288        struct sk_buff *skb = (struct sk_buff *)(long)r1;
2289        struct bpf_map *map = (struct bpf_map *)(long)r2;
2290        struct bpf_array *array = container_of(map, struct bpf_array, map);
2291        struct cgroup *cgrp;
2292        struct sock *sk;
2293        u32 i = (u32)r3;
2294
2295        sk = skb->sk;
2296        if (!sk || !sk_fullsock(sk))
2297                return -ENOENT;
2298
2299        if (unlikely(i >= array->map.max_entries))
2300                return -E2BIG;
2301
2302        cgrp = READ_ONCE(array->ptrs[i]);
2303        if (unlikely(!cgrp))
2304                return -EAGAIN;
2305
2306        return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
2307}
2308
2309static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
2310        .func           = bpf_skb_under_cgroup,
2311        .gpl_only       = false,
2312        .ret_type       = RET_INTEGER,
2313        .arg1_type      = ARG_PTR_TO_CTX,
2314        .arg2_type      = ARG_CONST_MAP_PTR,
2315        .arg3_type      = ARG_ANYTHING,
2316};
2317#endif
2318
2319static const struct bpf_func_proto *
2320sk_filter_func_proto(enum bpf_func_id func_id)
2321{
2322        switch (func_id) {
2323        case BPF_FUNC_map_lookup_elem:
2324                return &bpf_map_lookup_elem_proto;
2325        case BPF_FUNC_map_update_elem:
2326                return &bpf_map_update_elem_proto;
2327        case BPF_FUNC_map_delete_elem:
2328                return &bpf_map_delete_elem_proto;
2329        case BPF_FUNC_get_prandom_u32:
2330                return &bpf_get_prandom_u32_proto;
2331        case BPF_FUNC_get_smp_processor_id:
2332                return &bpf_get_raw_smp_processor_id_proto;
2333        case BPF_FUNC_tail_call:
2334                return &bpf_tail_call_proto;
2335        case BPF_FUNC_ktime_get_ns:
2336                return &bpf_ktime_get_ns_proto;
2337        case BPF_FUNC_trace_printk:
2338                if (capable(CAP_SYS_ADMIN))
2339                        return bpf_get_trace_printk_proto();
2340        default:
2341                return NULL;
2342        }
2343}
2344
2345static const struct bpf_func_proto *
2346tc_cls_act_func_proto(enum bpf_func_id func_id)
2347{
2348        switch (func_id) {
2349        case BPF_FUNC_skb_store_bytes:
2350                return &bpf_skb_store_bytes_proto;
2351        case BPF_FUNC_skb_load_bytes:
2352                return &bpf_skb_load_bytes_proto;
2353        case BPF_FUNC_csum_diff:
2354                return &bpf_csum_diff_proto;
2355        case BPF_FUNC_l3_csum_replace:
2356                return &bpf_l3_csum_replace_proto;
2357        case BPF_FUNC_l4_csum_replace:
2358                return &bpf_l4_csum_replace_proto;
2359        case BPF_FUNC_clone_redirect:
2360                return &bpf_clone_redirect_proto;
2361        case BPF_FUNC_get_cgroup_classid:
2362                return &bpf_get_cgroup_classid_proto;
2363        case BPF_FUNC_skb_vlan_push:
2364                return &bpf_skb_vlan_push_proto;
2365        case BPF_FUNC_skb_vlan_pop:
2366                return &bpf_skb_vlan_pop_proto;
2367        case BPF_FUNC_skb_change_proto:
2368                return &bpf_skb_change_proto_proto;
2369        case BPF_FUNC_skb_change_type:
2370                return &bpf_skb_change_type_proto;
2371        case BPF_FUNC_skb_get_tunnel_key:
2372                return &bpf_skb_get_tunnel_key_proto;
2373        case BPF_FUNC_skb_set_tunnel_key:
2374                return bpf_get_skb_set_tunnel_proto(func_id);
2375        case BPF_FUNC_skb_get_tunnel_opt:
2376                return &bpf_skb_get_tunnel_opt_proto;
2377        case BPF_FUNC_skb_set_tunnel_opt:
2378                return bpf_get_skb_set_tunnel_proto(func_id);
2379        case BPF_FUNC_redirect:
2380                return &bpf_redirect_proto;
2381        case BPF_FUNC_get_route_realm:
2382                return &bpf_get_route_realm_proto;
2383        case BPF_FUNC_get_hash_recalc:
2384                return &bpf_get_hash_recalc_proto;
2385        case BPF_FUNC_perf_event_output:
2386                return &bpf_skb_event_output_proto;
2387        case BPF_FUNC_get_smp_processor_id:
2388                return &bpf_get_smp_processor_id_proto;
2389#ifdef CONFIG_SOCK_CGROUP_DATA
2390        case BPF_FUNC_skb_under_cgroup:
2391                return &bpf_skb_under_cgroup_proto;
2392#endif
2393        default:
2394                return sk_filter_func_proto(func_id);
2395        }
2396}
2397
2398static const struct bpf_func_proto *
2399xdp_func_proto(enum bpf_func_id func_id)
2400{
2401        return sk_filter_func_proto(func_id);
2402}
2403
2404static bool __is_valid_access(int off, int size, enum bpf_access_type type)
2405{
2406        if (off < 0 || off >= sizeof(struct __sk_buff))
2407                return false;
2408        /* The verifier guarantees that size > 0. */
2409        if (off % size != 0)
2410                return false;
2411        if (size != sizeof(__u32))
2412                return false;
2413
2414        return true;
2415}
2416
2417static bool sk_filter_is_valid_access(int off, int size,
2418                                      enum bpf_access_type type,
2419                                      enum bpf_reg_type *reg_type)
2420{
2421        switch (off) {
2422        case offsetof(struct __sk_buff, tc_classid):
2423        case offsetof(struct __sk_buff, data):
2424        case offsetof(struct __sk_buff, data_end):
2425                return false;
2426        }
2427
2428        if (type == BPF_WRITE) {
2429                switch (off) {
2430                case offsetof(struct __sk_buff, cb[0]) ...
2431                     offsetof(struct __sk_buff, cb[4]):
2432                        break;
2433                default:
2434                        return false;
2435                }
2436        }
2437
2438        return __is_valid_access(off, size, type);
2439}
2440
2441static bool tc_cls_act_is_valid_access(int off, int size,
2442                                       enum bpf_access_type type,
2443                                       enum bpf_reg_type *reg_type)
2444{
2445        if (type == BPF_WRITE) {
2446                switch (off) {
2447                case offsetof(struct __sk_buff, mark):
2448                case offsetof(struct __sk_buff, tc_index):
2449                case offsetof(struct __sk_buff, priority):
2450                case offsetof(struct __sk_buff, cb[0]) ...
2451                     offsetof(struct __sk_buff, cb[4]):
2452                case offsetof(struct __sk_buff, tc_classid):
2453                        break;
2454                default:
2455                        return false;
2456                }
2457        }
2458
2459        switch (off) {
2460        case offsetof(struct __sk_buff, data):
2461                *reg_type = PTR_TO_PACKET;
2462                break;
2463        case offsetof(struct __sk_buff, data_end):
2464                *reg_type = PTR_TO_PACKET_END;
2465                break;
2466        }
2467
2468        return __is_valid_access(off, size, type);
2469}
2470
2471static bool __is_valid_xdp_access(int off, int size,
2472                                  enum bpf_access_type type)
2473{
2474        if (off < 0 || off >= sizeof(struct xdp_md))
2475                return false;
2476        if (off % size != 0)
2477                return false;
2478        if (size != 4)
2479                return false;
2480
2481        return true;
2482}
2483
2484static bool xdp_is_valid_access(int off, int size,
2485                                enum bpf_access_type type,
2486                                enum bpf_reg_type *reg_type)
2487{
2488        if (type == BPF_WRITE)
2489                return false;
2490
2491        switch (off) {
2492        case offsetof(struct xdp_md, data):
2493                *reg_type = PTR_TO_PACKET;
2494                break;
2495        case offsetof(struct xdp_md, data_end):
2496                *reg_type = PTR_TO_PACKET_END;
2497                break;
2498        }
2499
2500        return __is_valid_xdp_access(off, size, type);
2501}
2502
2503void bpf_warn_invalid_xdp_action(u32 act)
2504{
2505        WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
2506}
2507EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2508
2509static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2510                                      int src_reg, int ctx_off,
2511                                      struct bpf_insn *insn_buf,
2512                                      struct bpf_prog *prog)
2513{
2514        struct bpf_insn *insn = insn_buf;
2515
2516        switch (ctx_off) {
2517        case offsetof(struct __sk_buff, len):
2518                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
2519
2520                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2521                                      offsetof(struct sk_buff, len));
2522                break;
2523
2524        case offsetof(struct __sk_buff, protocol):
2525                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
2526
2527                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2528                                      offsetof(struct sk_buff, protocol));
2529                break;
2530
2531        case offsetof(struct __sk_buff, vlan_proto):
2532                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
2533
2534                *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2535                                      offsetof(struct sk_buff, vlan_proto));
2536                break;
2537
2538        case offsetof(struct __sk_buff, priority):
2539                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
2540
2541                if (type == BPF_WRITE)
2542                        *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2543                                              offsetof(struct sk_buff, priority));
2544                else
2545                        *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2546                                              offsetof(struct sk_buff, priority));
2547                break;
2548
2549        case offsetof(struct __sk_buff, ingress_ifindex):
2550                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
2551
2552                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2553                                      offsetof(struct sk_buff, skb_iif));
2554                break;
2555
2556        case offsetof(struct __sk_buff, ifindex):
2557                BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
2558
2559                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
2560                                      dst_reg, src_reg,
2561                                      offsetof(struct sk_buff, dev));
2562                *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
2563                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
2564                                      offsetof(struct net_device, ifindex));
2565                break;
2566
2567        case offsetof(struct __sk_buff, hash):
2568                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
2569
2570                *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2571                                      offsetof(struct sk_buff, hash));
2572                break;
2573
2574        case offsetof(struct __sk_buff, mark):
2575                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
2576
2577                if (type == BPF_WRITE)
2578                        *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
2579                                              offsetof(struct sk_buff, mark));
2580                else
2581                        *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
2582                                              offsetof(struct sk_buff, mark));
2583                break;
2584
2585        case offsetof(struct __sk_buff, pkt_type):
2586                return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);
2587
2588        case offsetof(struct __sk_buff, queue_mapping):
2589                return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);
2590
2591        case offsetof(struct __sk_buff, vlan_present):
2592                return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
2593                                          dst_reg, src_reg, insn);
2594
2595        case offsetof(struct __sk_buff, vlan_tci):
2596                return convert_skb_access(SKF_AD_VLAN_TAG,
2597                                          dst_reg, src_reg, insn);
2598
2599        case offsetof(struct __sk_buff, cb[0]) ...
2600                offsetof(struct __sk_buff, cb[4]):
2601                BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
2602
2603                prog->cb_access = 1;
2604                ctx_off -= offsetof(struct __sk_buff, cb[0]);
2605                ctx_off += offsetof(struct sk_buff, cb);
2606                ctx_off += offsetof(struct qdisc_skb_cb, data);
2607                if (type == BPF_WRITE)
2608                        *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2609                else
2610                        *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
2611                break;
2612
2613        case offsetof(struct __sk_buff, tc_classid):
2614                ctx_off -= offsetof(struct __sk_buff, tc_classid);
2615                ctx_off += offsetof(struct sk_buff, cb);
2616                ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
2617                if (type == BPF_WRITE)
2618                        *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
2619                else
2620                        *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
2621                break;
2622
2623        case offsetof(struct __sk_buff, data):
2624                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)),
2625                                      dst_reg, src_reg,
2626                                      offsetof(struct sk_buff, data));
2627                break;
2628
2629        case offsetof(struct __sk_buff, data_end):
2630                ctx_off -= offsetof(struct __sk_buff, data_end);
2631                ctx_off += offsetof(struct sk_buff, cb);
2632                ctx_off += offsetof(struct bpf_skb_data_end, data_end);
2633                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)),
2634                                      dst_reg, src_reg, ctx_off);
2635                break;
2636
2637        case offsetof(struct __sk_buff, tc_index):
2638#ifdef CONFIG_NET_SCHED
2639                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
2640
2641                if (type == BPF_WRITE)
2642                        *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
2643                                              offsetof(struct sk_buff, tc_index));
2644                else
2645                        *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
2646                                              offsetof(struct sk_buff, tc_index));
2647                break;
2648#else
2649                if (type == BPF_WRITE)
2650                        *insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
2651                else
2652                        *insn++ = BPF_MOV64_IMM(dst_reg, 0);
2653                break;
2654#endif
2655        }
2656
2657        return insn - insn_buf;
2658}
2659
2660static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
2661                                  int src_reg, int ctx_off,
2662                                  struct bpf_insn *insn_buf,
2663                                  struct bpf_prog *prog)
2664{
2665        struct bpf_insn *insn = insn_buf;
2666
2667        switch (ctx_off) {
2668        case offsetof(struct xdp_md, data):
2669                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data)),
2670                                      dst_reg, src_reg,
2671                                      offsetof(struct xdp_buff, data));
2672                break;
2673        case offsetof(struct xdp_md, data_end):
2674                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct xdp_buff, data_end)),
2675                                      dst_reg, src_reg,
2676                                      offsetof(struct xdp_buff, data_end));
2677                break;
2678        }
2679
2680        return insn - insn_buf;
2681}
2682
2683static const struct bpf_verifier_ops sk_filter_ops = {
2684        .get_func_proto         = sk_filter_func_proto,
2685        .is_valid_access        = sk_filter_is_valid_access,
2686        .convert_ctx_access     = bpf_net_convert_ctx_access,
2687};
2688
2689static const struct bpf_verifier_ops tc_cls_act_ops = {
2690        .get_func_proto         = tc_cls_act_func_proto,
2691        .is_valid_access        = tc_cls_act_is_valid_access,
2692        .convert_ctx_access     = bpf_net_convert_ctx_access,
2693};
2694
2695static const struct bpf_verifier_ops xdp_ops = {
2696        .get_func_proto         = xdp_func_proto,
2697        .is_valid_access        = xdp_is_valid_access,
2698        .convert_ctx_access     = xdp_convert_ctx_access,
2699};
2700
2701static struct bpf_prog_type_list sk_filter_type __read_mostly = {
2702        .ops    = &sk_filter_ops,
2703        .type   = BPF_PROG_TYPE_SOCKET_FILTER,
2704};
2705
2706static struct bpf_prog_type_list sched_cls_type __read_mostly = {
2707        .ops    = &tc_cls_act_ops,
2708        .type   = BPF_PROG_TYPE_SCHED_CLS,
2709};
2710
2711static struct bpf_prog_type_list sched_act_type __read_mostly = {
2712        .ops    = &tc_cls_act_ops,
2713        .type   = BPF_PROG_TYPE_SCHED_ACT,
2714};
2715
2716static struct bpf_prog_type_list xdp_type __read_mostly = {
2717        .ops    = &xdp_ops,
2718        .type   = BPF_PROG_TYPE_XDP,
2719};
2720
2721static int __init register_sk_filter_ops(void)
2722{
2723        bpf_register_prog_type(&sk_filter_type);
2724        bpf_register_prog_type(&sched_cls_type);
2725        bpf_register_prog_type(&sched_act_type);
2726        bpf_register_prog_type(&xdp_type);
2727
2728        return 0;
2729}
2730late_initcall(register_sk_filter_ops);
2731
2732int sk_detach_filter(struct sock *sk)
2733{
2734        int ret = -ENOENT;
2735        struct sk_filter *filter;
2736
2737        if (sock_flag(sk, SOCK_FILTER_LOCKED))
2738                return -EPERM;
2739
2740        filter = rcu_dereference_protected(sk->sk_filter,
2741                                           lockdep_sock_is_held(sk));
2742        if (filter) {
2743                RCU_INIT_POINTER(sk->sk_filter, NULL);
2744                sk_filter_uncharge(sk, filter);
2745                ret = 0;
2746        }
2747
2748        return ret;
2749}
2750EXPORT_SYMBOL_GPL(sk_detach_filter);
2751
2752int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
2753                  unsigned int len)
2754{
2755        struct sock_fprog_kern *fprog;
2756        struct sk_filter *filter;
2757        int ret = 0;
2758
2759        lock_sock(sk);
2760        filter = rcu_dereference_protected(sk->sk_filter,
2761                                           lockdep_sock_is_held(sk));
2762        if (!filter)
2763                goto out;
2764
2765        /* We're copying the filter that was originally attached,
2766         * so no conversion/decoding is needed anymore. eBPF programs
2767         * that have no original program cannot be dumped through this.
2768         */
2769        ret = -EACCES;
2770        fprog = filter->prog->orig_prog;
2771        if (!fprog)
2772                goto out;
2773
2774        ret = fprog->len;
2775        if (!len)
2776                /* User space only asks for the number of filter blocks. */
2777                goto out;
2778
2779        ret = -EINVAL;
2780        if (len < fprog->len)
2781                goto out;
2782
2783        ret = -EFAULT;
2784        if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
2785                goto out;
2786
2787        /* The API expects the number of filter blocks to be
2788         * returned here, not the number of bytes.
2789         */
2790        ret = fprog->len;
2791out:
2792        release_sock(sk);
2793        return ret;
2794}
2795