linux/net/core/filter.c
   1/*
   2 * Linux Socket Filter - Kernel level socket filtering
   3 *
   4 * Based on the design of the Berkeley Packet Filter. The new
   5 * internal format has been designed by PLUMgrid:
   6 *
   7 *      Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
   8 *
   9 * Authors:
  10 *
  11 *      Jay Schulist <jschlst@samba.org>
  12 *      Alexei Starovoitov <ast@plumgrid.com>
  13 *      Daniel Borkmann <dborkman@redhat.com>
  14 *
  15 * This program is free software; you can redistribute it and/or
  16 * modify it under the terms of the GNU General Public License
  17 * as published by the Free Software Foundation; either version
  18 * 2 of the License, or (at your option) any later version.
  19 *
  20 * Andi Kleen - Fix a few bad bugs and races.
  21 * Kris Katterjohn - Added many additional checks in sk_chk_filter()
  22 */
  23
  24#include <linux/module.h>
  25#include <linux/types.h>
  26#include <linux/mm.h>
  27#include <linux/fcntl.h>
  28#include <linux/socket.h>
  29#include <linux/in.h>
  30#include <linux/inet.h>
  31#include <linux/netdevice.h>
  32#include <linux/if_packet.h>
  33#include <linux/gfp.h>
  34#include <net/ip.h>
  35#include <net/protocol.h>
  36#include <net/netlink.h>
  37#include <linux/skbuff.h>
  38#include <net/sock.h>
  39#include <linux/errno.h>
  40#include <linux/timer.h>
  41#include <asm/uaccess.h>
  42#include <asm/unaligned.h>
  43#include <linux/filter.h>
  44#include <linux/ratelimit.h>
  45#include <linux/seccomp.h>
  46#include <linux/if_vlan.h>
  47
  48/* Registers */
  49#define BPF_R0  regs[BPF_REG_0]
  50#define BPF_R1  regs[BPF_REG_1]
  51#define BPF_R2  regs[BPF_REG_2]
  52#define BPF_R3  regs[BPF_REG_3]
  53#define BPF_R4  regs[BPF_REG_4]
  54#define BPF_R5  regs[BPF_REG_5]
  55#define BPF_R6  regs[BPF_REG_6]
  56#define BPF_R7  regs[BPF_REG_7]
  57#define BPF_R8  regs[BPF_REG_8]
  58#define BPF_R9  regs[BPF_REG_9]
  59#define BPF_R10 regs[BPF_REG_10]
  60
  61/* Named registers */
  62#define DST     regs[insn->dst_reg]
  63#define SRC     regs[insn->src_reg]
  64#define FP      regs[BPF_REG_FP]
  65#define ARG1    regs[BPF_REG_ARG1]
  66#define CTX     regs[BPF_REG_CTX]
  67#define IMM     insn->imm
  68
  69/* No hurry in this branch
  70 *
  71 * Exported for the bpf jit load helper.
  72 */
  73void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
  74{
  75        u8 *ptr = NULL;
  76
  77        if (k >= SKF_NET_OFF)
  78                ptr = skb_network_header(skb) + k - SKF_NET_OFF;
  79        else if (k >= SKF_LL_OFF)
  80                ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
  81        if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
  82                return ptr;
  83
  84        return NULL;
  85}
  86
  87static inline void *load_pointer(const struct sk_buff *skb, int k,
  88                                 unsigned int size, void *buffer)
  89{
  90        if (k >= 0)
  91                return skb_header_pointer(skb, k, size, buffer);
  92
  93        return bpf_internal_load_pointer_neg_helper(skb, k, size);
  94}
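
/* Editor's sketch (not in the original file): a classic filter insn like
 * "ldb [SKF_NET_OFF + 9]" reaches load_pointer() with a negative k and is
 * resolved against the network header by the helper above rather than by
 * skb_header_pointer(). The wrapper name below is hypothetical.
 */
static inline u8 example_load_ip_protocol(const struct sk_buff *skb)
{
        u8 buf;
        u8 *ptr = load_pointer(skb, SKF_NET_OFF + 9, 1, &buf);

        /* offset 9 into the IPv4 header is the protocol field */
        return ptr ? *ptr : 0;
}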
  95
  96/**
  97 *      sk_filter - run a packet through a socket filter
  98 *      @sk: sock associated with &sk_buff
  99 *      @skb: buffer to filter
 100 *
  101 * Run the filter code and then cut skb->data to the correct size returned
  102 * by SK_RUN_FILTER. If pkt_len is 0 we toss the packet. If skb->len is
  103 * smaller than pkt_len we keep the whole skb->data. This is the socket
  104 * level wrapper to SK_RUN_FILTER. It returns 0 if the packet should
  105 * be accepted or -EPERM if the packet should be tossed.
 106 *
 107 */
 108int sk_filter(struct sock *sk, struct sk_buff *skb)
 109{
 110        int err;
 111        struct sk_filter *filter;
 112
 113        /*
 114         * If the skb was allocated from pfmemalloc reserves, only
 115         * allow SOCK_MEMALLOC sockets to use it as this socket is
 116         * helping free memory
 117         */
 118        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
 119                return -ENOMEM;
 120
 121        err = security_sock_rcv_skb(sk, skb);
 122        if (err)
 123                return err;
 124
 125        rcu_read_lock();
 126        filter = rcu_dereference(sk->sk_filter);
 127        if (filter) {
 128                unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
 129
 130                err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 131        }
 132        rcu_read_unlock();
 133
 134        return err;
 135}
 136EXPORT_SYMBOL(sk_filter);
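
/* Editor's sketch (not part of the original file): how a hypothetical
 * receive-path caller checks the wrapper above. The function name is made
 * up; real callers such as sock_queue_rcv_skb() check the return value in
 * the same way.
 */
static int example_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
        int err = sk_filter(sk, skb);   /* may also trim skb to pkt_len */

        if (err) {
                /* -EPERM (filtered out), -ENOMEM, or a security/trim error */
                kfree_skb(skb);
                return err;
        }

        /* skb was accepted by the attached filter; queue it up here */
        return 0;
}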
 137
 138/* Base function for offset calculation. Needs to go into .text section,
 139 * therefore keeping it non-static as well; will also be used by JITs
 140 * anyway later on, so do not let the compiler omit it.
 141 */
 142noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 143{
 144        return 0;
 145}
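
/* Editor's note (an assumption, not from the original file): helper calls
 * are encoded relative to this base, i.e. BPF_EMIT_CALL() is expected to
 * store something like
 *
 *      insn->imm = (u32) (some_helper - __bpf_call_base);
 *
 * so that the JMP_CALL handler in __sk_run_filter() below can recover the
 * helper's address as __bpf_call_base + insn->imm.
 */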
 146
 147/**
 148 *      __sk_run_filter - run a filter on a given context
 149 *      @ctx: buffer to run the filter on
 150 *      @insn: filter to apply
 151 *
 152 * Decode and apply filter instructions to the skb->data. Return length to
 153 * keep, 0 for none. @ctx is the data we are operating on, @insn is the
 154 * array of filter instructions.
 155 */
 156static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
 157{
 158        u64 stack[MAX_BPF_STACK / sizeof(u64)];
 159        u64 regs[MAX_BPF_REG], tmp;
 160        static const void *jumptable[256] = {
 161                [0 ... 255] = &&default_label,
 162                /* Now overwrite non-defaults ... */
 163                /* 32 bit ALU operations */
 164                [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
 165                [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
 166                [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
 167                [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
 168                [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
 169                [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
 170                [BPF_ALU | BPF_OR | BPF_X]  = &&ALU_OR_X,
 171                [BPF_ALU | BPF_OR | BPF_K]  = &&ALU_OR_K,
 172                [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
 173                [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
 174                [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
 175                [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
 176                [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
 177                [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
 178                [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
 179                [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
 180                [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
 181                [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
 182                [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
 183                [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
 184                [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
 185                [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
 186                [BPF_ALU | BPF_NEG] = &&ALU_NEG,
 187                [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
 188                [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
 189                /* 64 bit ALU operations */
 190                [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
 191                [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
 192                [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
 193                [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
 194                [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
 195                [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
 196                [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
 197                [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
 198                [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
 199                [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
 200                [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
 201                [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
 202                [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
 203                [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
 204                [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
 205                [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
 206                [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
 207                [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
 208                [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
 209                [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
 210                [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
 211                [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
 212                [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
 213                [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
 214                [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 215                /* Call instruction */
 216                [BPF_JMP | BPF_CALL] = &&JMP_CALL,
 217                /* Jumps */
 218                [BPF_JMP | BPF_JA] = &&JMP_JA,
 219                [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
 220                [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
 221                [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
 222                [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
 223                [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
 224                [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
 225                [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
 226                [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
 227                [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
 228                [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
 229                [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
 230                [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
 231                [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
 232                [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
 233                /* Program return */
 234                [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
 235                /* Store instructions */
 236                [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
 237                [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
 238                [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
 239                [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
 240                [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
 241                [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
 242                [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
 243                [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
 244                [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
 245                [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
 246                /* Load instructions */
 247                [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
 248                [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
 249                [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
 250                [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
 251                [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
 252                [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
 253                [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
 254                [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
 255                [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
 256                [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
 257        };
 258        void *ptr;
 259        int off;
 260
 261#define CONT     ({ insn++; goto select_insn; })
 262#define CONT_JMP ({ insn++; goto select_insn; })
 263
 264        FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
 265        ARG1 = (u64) (unsigned long) ctx;
 266
 267        /* Registers used in classic BPF programs need to be reset first. */
 268        regs[BPF_REG_A] = 0;
 269        regs[BPF_REG_X] = 0;
 270
 271select_insn:
 272        goto *jumptable[insn->code];
 273
 274        /* ALU */
 275#define ALU(OPCODE, OP)                 \
 276        ALU64_##OPCODE##_X:             \
 277                DST = DST OP SRC;       \
 278                CONT;                   \
 279        ALU_##OPCODE##_X:               \
 280                DST = (u32) DST OP (u32) SRC;   \
 281                CONT;                   \
 282        ALU64_##OPCODE##_K:             \
 283                DST = DST OP IMM;               \
 284                CONT;                   \
 285        ALU_##OPCODE##_K:               \
 286                DST = (u32) DST OP (u32) IMM;   \
 287                CONT;
 288
 289        ALU(ADD,  +)
 290        ALU(SUB,  -)
 291        ALU(AND,  &)
 292        ALU(OR,   |)
 293        ALU(LSH, <<)
 294        ALU(RSH, >>)
 295        ALU(XOR,  ^)
 296        ALU(MUL,  *)
 297#undef ALU
 298        ALU_NEG:
 299                DST = (u32) -DST;
 300                CONT;
 301        ALU64_NEG:
 302                DST = -DST;
 303                CONT;
 304        ALU_MOV_X:
 305                DST = (u32) SRC;
 306                CONT;
 307        ALU_MOV_K:
 308                DST = (u32) IMM;
 309                CONT;
 310        ALU64_MOV_X:
 311                DST = SRC;
 312                CONT;
 313        ALU64_MOV_K:
 314                DST = IMM;
 315                CONT;
 316        ALU64_ARSH_X:
 317                (*(s64 *) &DST) >>= SRC;
 318                CONT;
 319        ALU64_ARSH_K:
 320                (*(s64 *) &DST) >>= IMM;
 321                CONT;
 322        ALU64_MOD_X:
 323                if (unlikely(SRC == 0))
 324                        return 0;
 325                tmp = DST;
 326                DST = do_div(tmp, SRC);
 327                CONT;
 328        ALU_MOD_X:
 329                if (unlikely(SRC == 0))
 330                        return 0;
 331                tmp = (u32) DST;
 332                DST = do_div(tmp, (u32) SRC);
 333                CONT;
 334        ALU64_MOD_K:
 335                tmp = DST;
 336                DST = do_div(tmp, IMM);
 337                CONT;
 338        ALU_MOD_K:
 339                tmp = (u32) DST;
 340                DST = do_div(tmp, (u32) IMM);
 341                CONT;
 342        ALU64_DIV_X:
 343                if (unlikely(SRC == 0))
 344                        return 0;
 345                do_div(DST, SRC);
 346                CONT;
 347        ALU_DIV_X:
 348                if (unlikely(SRC == 0))
 349                        return 0;
 350                tmp = (u32) DST;
 351                do_div(tmp, (u32) SRC);
 352                DST = (u32) tmp;
 353                CONT;
 354        ALU64_DIV_K:
 355                do_div(DST, IMM);
 356                CONT;
 357        ALU_DIV_K:
 358                tmp = (u32) DST;
 359                do_div(tmp, (u32) IMM);
 360                DST = (u32) tmp;
 361                CONT;
 362        ALU_END_TO_BE:
 363                switch (IMM) {
 364                case 16:
 365                        DST = (__force u16) cpu_to_be16(DST);
 366                        break;
 367                case 32:
 368                        DST = (__force u32) cpu_to_be32(DST);
 369                        break;
 370                case 64:
 371                        DST = (__force u64) cpu_to_be64(DST);
 372                        break;
 373                }
 374                CONT;
 375        ALU_END_TO_LE:
 376                switch (IMM) {
 377                case 16:
 378                        DST = (__force u16) cpu_to_le16(DST);
 379                        break;
 380                case 32:
 381                        DST = (__force u32) cpu_to_le32(DST);
 382                        break;
 383                case 64:
 384                        DST = (__force u64) cpu_to_le64(DST);
 385                        break;
 386                }
 387                CONT;
 388
 389        /* CALL */
 390        JMP_CALL:
 391                /* Function call scratches BPF_R1-BPF_R5 registers,
 392                 * preserves BPF_R6-BPF_R9, and stores return value
 393                 * into BPF_R0.
 394                 */
 395                BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
 396                                                       BPF_R4, BPF_R5);
 397                CONT;
 398
 399        /* JMP */
 400        JMP_JA:
 401                insn += insn->off;
 402                CONT;
 403        JMP_JEQ_X:
 404                if (DST == SRC) {
 405                        insn += insn->off;
 406                        CONT_JMP;
 407                }
 408                CONT;
 409        JMP_JEQ_K:
 410                if (DST == IMM) {
 411                        insn += insn->off;
 412                        CONT_JMP;
 413                }
 414                CONT;
 415        JMP_JNE_X:
 416                if (DST != SRC) {
 417                        insn += insn->off;
 418                        CONT_JMP;
 419                }
 420                CONT;
 421        JMP_JNE_K:
 422                if (DST != IMM) {
 423                        insn += insn->off;
 424                        CONT_JMP;
 425                }
 426                CONT;
 427        JMP_JGT_X:
 428                if (DST > SRC) {
 429                        insn += insn->off;
 430                        CONT_JMP;
 431                }
 432                CONT;
 433        JMP_JGT_K:
 434                if (DST > IMM) {
 435                        insn += insn->off;
 436                        CONT_JMP;
 437                }
 438                CONT;
 439        JMP_JGE_X:
 440                if (DST >= SRC) {
 441                        insn += insn->off;
 442                        CONT_JMP;
 443                }
 444                CONT;
 445        JMP_JGE_K:
 446                if (DST >= IMM) {
 447                        insn += insn->off;
 448                        CONT_JMP;
 449                }
 450                CONT;
 451        JMP_JSGT_X:
 452                if (((s64) DST) > ((s64) SRC)) {
 453                        insn += insn->off;
 454                        CONT_JMP;
 455                }
 456                CONT;
 457        JMP_JSGT_K:
 458                if (((s64) DST) > ((s64) IMM)) {
 459                        insn += insn->off;
 460                        CONT_JMP;
 461                }
 462                CONT;
 463        JMP_JSGE_X:
 464                if (((s64) DST) >= ((s64) SRC)) {
 465                        insn += insn->off;
 466                        CONT_JMP;
 467                }
 468                CONT;
 469        JMP_JSGE_K:
 470                if (((s64) DST) >= ((s64) IMM)) {
 471                        insn += insn->off;
 472                        CONT_JMP;
 473                }
 474                CONT;
 475        JMP_JSET_X:
 476                if (DST & SRC) {
 477                        insn += insn->off;
 478                        CONT_JMP;
 479                }
 480                CONT;
 481        JMP_JSET_K:
 482                if (DST & IMM) {
 483                        insn += insn->off;
 484                        CONT_JMP;
 485                }
 486                CONT;
 487        JMP_EXIT:
 488                return BPF_R0;
 489
  490        /* STX and ST and LDX */
 491#define LDST(SIZEOP, SIZE)                                              \
 492        STX_MEM_##SIZEOP:                                               \
 493                *(SIZE *)(unsigned long) (DST + insn->off) = SRC;       \
 494                CONT;                                                   \
 495        ST_MEM_##SIZEOP:                                                \
 496                *(SIZE *)(unsigned long) (DST + insn->off) = IMM;       \
 497                CONT;                                                   \
 498        LDX_MEM_##SIZEOP:                                               \
 499                DST = *(SIZE *)(unsigned long) (SRC + insn->off);       \
 500                CONT;
 501
 502        LDST(B,   u8)
 503        LDST(H,  u16)
 504        LDST(W,  u32)
 505        LDST(DW, u64)
 506#undef LDST
 507        STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
 508                atomic_add((u32) SRC, (atomic_t *)(unsigned long)
 509                           (DST + insn->off));
 510                CONT;
 511        STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
 512                atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
 513                             (DST + insn->off));
 514                CONT;
 515        LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
 516                off = IMM;
 517load_word:
  518                /* BPF_LD + BPF_ABS and BPF_LD + BPF_IND insns only
  519                 * appear in programs where ctx == skb. All programs
  520                 * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6;
  521                 * sk_convert_filter() saves it in BPF_R6, and the
  522                 * internal BPF verifier will check that BPF_R6 ==
 523                 * ctx.
 524                 *
 525                 * BPF_ABS and BPF_IND are wrappers of function calls,
 526                 * so they scratch BPF_R1-BPF_R5 registers, preserve
 527                 * BPF_R6-BPF_R9, and store return value into BPF_R0.
 528                 *
 529                 * Implicit input:
 530                 *   ctx == skb == BPF_R6 == CTX
 531                 *
 532                 * Explicit input:
 533                 *   SRC == any register
 534                 *   IMM == 32-bit immediate
 535                 *
 536                 * Output:
 537                 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
 538                 */
 539
 540                ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
 541                if (likely(ptr != NULL)) {
 542                        BPF_R0 = get_unaligned_be32(ptr);
 543                        CONT;
 544                }
 545
 546                return 0;
 547        LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
 548                off = IMM;
 549load_half:
 550                ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
 551                if (likely(ptr != NULL)) {
 552                        BPF_R0 = get_unaligned_be16(ptr);
 553                        CONT;
 554                }
 555
 556                return 0;
 557        LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
 558                off = IMM;
 559load_byte:
 560                ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
 561                if (likely(ptr != NULL)) {
 562                        BPF_R0 = *(u8 *)ptr;
 563                        CONT;
 564                }
 565
 566                return 0;
 567        LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
 568                off = IMM + SRC;
 569                goto load_word;
 570        LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
 571                off = IMM + SRC;
 572                goto load_half;
 573        LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
 574                off = IMM + SRC;
 575                goto load_byte;
 576
 577        default_label:
 578                /* If we ever reach this, we have a bug somewhere. */
 579                WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
 580                return 0;
 581}
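
/* Editor's sketch (not in the original file): the smallest program the
 * interpreter above can run - "keep up to 0xffff bytes of the packet".
 * The array name is hypothetical; the macros come from <linux/filter.h>.
 */
static const struct sock_filter_int example_accept_prog[] = {
        BPF_MOV32_IMM(BPF_REG_0, 0xffff),       /* BPF_R0 = 0xffff */
        BPF_EXIT_INSN(),                        /* return BPF_R0 */
};
/* __sk_run_filter(skb, example_accept_prog) would return 0xffff here. */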
 582
  583/* Helper to find the offset of pkt_type in the sk_buff structure. We want
  584 * to make sure it's still a 3-bit field starting at a byte boundary;
 585 * taken from arch/x86/net/bpf_jit_comp.c.
 586 */
 587#ifdef __BIG_ENDIAN_BITFIELD
 588#define PKT_TYPE_MAX    (7 << 5)
 589#else
 590#define PKT_TYPE_MAX    7
 591#endif
 592static unsigned int pkt_type_offset(void)
 593{
 594        struct sk_buff skb_probe = { .pkt_type = ~0, };
 595        u8 *ct = (u8 *) &skb_probe;
 596        unsigned int off;
 597
 598        for (off = 0; off < sizeof(struct sk_buff); off++) {
 599                if (ct[off] == PKT_TYPE_MAX)
 600                        return off;
 601        }
 602
 603        pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__);
 604        return -1;
 605}
 606
 607static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 608{
 609        return __skb_get_poff((struct sk_buff *)(unsigned long) ctx);
 610}
 611
 612static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 613{
 614        struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 615        struct nlattr *nla;
 616
 617        if (skb_is_nonlinear(skb))
 618                return 0;
 619
 620        if (skb->len < sizeof(struct nlattr))
 621                return 0;
 622
 623        if (a > skb->len - sizeof(struct nlattr))
 624                return 0;
 625
 626        nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
 627        if (nla)
 628                return (void *) nla - (void *) skb->data;
 629
 630        return 0;
 631}
 632
 633static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 634{
 635        struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
 636        struct nlattr *nla;
 637
 638        if (skb_is_nonlinear(skb))
 639                return 0;
 640
 641        if (skb->len < sizeof(struct nlattr))
 642                return 0;
 643
 644        if (a > skb->len - sizeof(struct nlattr))
 645                return 0;
 646
 647        nla = (struct nlattr *) &skb->data[a];
 648        if (nla->nla_len > skb->len - a)
 649                return 0;
 650
 651        nla = nla_find_nested(nla, x);
 652        if (nla)
 653                return (void *) nla - (void *) skb->data;
 654
 655        return 0;
 656}
 657
 658static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 659{
 660        return raw_smp_processor_id();
 661}
 662
 663/* note that this only generates 32-bit random numbers */
 664static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
 665{
 666        return prandom_u32();
 667}
 668
 669static bool convert_bpf_extensions(struct sock_filter *fp,
 670                                   struct sock_filter_int **insnp)
 671{
 672        struct sock_filter_int *insn = *insnp;
 673
 674        switch (fp->k) {
 675        case SKF_AD_OFF + SKF_AD_PROTOCOL:
 676                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
 677
 678                /* A = *(u16 *) (CTX + offsetof(protocol)) */
 679                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 680                                      offsetof(struct sk_buff, protocol));
 681                /* A = ntohs(A) [emitting a nop or swap16] */
 682                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
 683                break;
 684
 685        case SKF_AD_OFF + SKF_AD_PKTTYPE:
 686                *insn = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX,
 687                                    pkt_type_offset());
 688                if (insn->off < 0)
 689                        return false;
 690                insn++;
 691                *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX);
 692#ifdef __BIG_ENDIAN_BITFIELD
 693                insn++;
 694                *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5);
 695#endif
 696                break;
 697
 698        case SKF_AD_OFF + SKF_AD_IFINDEX:
 699        case SKF_AD_OFF + SKF_AD_HATYPE:
 700                BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
 701                BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
 702                BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);
 703
 704                *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
 705                                      BPF_REG_TMP, BPF_REG_CTX,
 706                                      offsetof(struct sk_buff, dev));
 707                /* if (tmp != 0) goto pc + 1 */
 708                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
 709                *insn++ = BPF_EXIT_INSN();
 710                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
 711                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
 712                                            offsetof(struct net_device, ifindex));
 713                else
 714                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
 715                                            offsetof(struct net_device, type));
 716                break;
 717
 718        case SKF_AD_OFF + SKF_AD_MARK:
 719                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
 720
 721                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 722                                    offsetof(struct sk_buff, mark));
 723                break;
 724
 725        case SKF_AD_OFF + SKF_AD_RXHASH:
 726                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
 727
 728                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 729                                    offsetof(struct sk_buff, hash));
 730                break;
 731
 732        case SKF_AD_OFF + SKF_AD_QUEUE:
 733                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
 734
 735                *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 736                                    offsetof(struct sk_buff, queue_mapping));
 737                break;
 738
 739        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
 740        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
 741                BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
 742                BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
 743
 744                /* A = *(u16 *) (CTX + offsetof(vlan_tci)) */
 745                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
 746                                      offsetof(struct sk_buff, vlan_tci));
 747                if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) {
 748                        *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A,
 749                                              ~VLAN_TAG_PRESENT);
 750                } else {
 751                        /* A >>= 12 */
 752                        *insn++ = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 12);
 753                        /* A &= 1 */
 754                        *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 1);
 755                }
 756                break;
 757
 758        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 759        case SKF_AD_OFF + SKF_AD_NLATTR:
 760        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 761        case SKF_AD_OFF + SKF_AD_CPU:
 762        case SKF_AD_OFF + SKF_AD_RANDOM:
 763                /* arg1 = CTX */
 764                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
 765                /* arg2 = A */
 766                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
 767                /* arg3 = X */
 768                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
 769                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
 770                switch (fp->k) {
 771                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
 772                        *insn = BPF_EMIT_CALL(__skb_get_pay_offset);
 773                        break;
 774                case SKF_AD_OFF + SKF_AD_NLATTR:
 775                        *insn = BPF_EMIT_CALL(__skb_get_nlattr);
 776                        break;
 777                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
 778                        *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
 779                        break;
 780                case SKF_AD_OFF + SKF_AD_CPU:
 781                        *insn = BPF_EMIT_CALL(__get_raw_cpu_id);
 782                        break;
 783                case SKF_AD_OFF + SKF_AD_RANDOM:
 784                        *insn = BPF_EMIT_CALL(__get_random_u32);
 785                        break;
 786                }
 787                break;
 788
 789        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
 790                /* A ^= X */
 791                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
 792                break;
 793
 794        default:
 795                /* This is just a dummy call to avoid letting the compiler
 796                 * evict __bpf_call_base() as an optimization. Placed here
 797                 * where no-one bothers.
 798                 */
 799                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
 800                return false;
 801        }
 802
 803        *insnp = insn;
 804        return true;
 805}
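
/* Editor's note (not in the original file): a classic ancillary load such as
 *
 *      BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_CPU)
 *
 * is spotted by sk_convert_filter() below and handed to the function above,
 * which expands it into the argument moves plus a
 * BPF_EMIT_CALL(__get_raw_cpu_id) instead of an actual packet load.
 */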
 806
 807/**
 808 *      sk_convert_filter - convert filter program
 809 *      @prog: the user passed filter program
 810 *      @len: the length of the user passed filter program
 811 *      @new_prog: buffer where converted program will be stored
 812 *      @new_len: pointer to store length of converted program
 813 *
  814 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_int' style.
 815 * Conversion workflow:
 816 *
 817 * 1) First pass for calculating the new program length:
 818 *   sk_convert_filter(old_prog, old_len, NULL, &new_len)
 819 *
  820 * 2) 2nd pass to remap, which itself runs in two passes: the 1st pass
  821 *    finds the new jump offsets, the 2nd pass does the remapping:
 822 *   new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
 823 *   sk_convert_filter(old_prog, old_len, new_prog, &new_len);
 824 *
  825 * User BPF's register A is mapped to our BPF register 0, user BPF
 826 * register X is mapped to BPF register 7; frame pointer is always
 827 * register 10; Context 'void *ctx' is stored in register 1, that is,
 828 * for socket filters: ctx == 'struct sk_buff *', for seccomp:
 829 * ctx == 'struct seccomp_data *'.
 830 */
 831int sk_convert_filter(struct sock_filter *prog, int len,
 832                      struct sock_filter_int *new_prog, int *new_len)
 833{
 834        int new_flen = 0, pass = 0, target, i;
 835        struct sock_filter_int *new_insn;
 836        struct sock_filter *fp;
 837        int *addrs = NULL;
 838        u8 bpf_src;
 839
 840        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
 841        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
 842
 843        if (len <= 0 || len > BPF_MAXINSNS)
 844                return -EINVAL;
 845
 846        if (new_prog) {
 847                addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL);
 848                if (!addrs)
 849                        return -ENOMEM;
 850        }
 851
 852do_pass:
 853        new_insn = new_prog;
 854        fp = prog;
 855
 856        if (new_insn)
 857                *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
 858        new_insn++;
 859
 860        for (i = 0; i < len; fp++, i++) {
 861                struct sock_filter_int tmp_insns[6] = { };
 862                struct sock_filter_int *insn = tmp_insns;
 863
 864                if (addrs)
 865                        addrs[i] = new_insn - new_prog;
 866
 867                switch (fp->code) {
 868                /* All arithmetic insns and skb loads map as-is. */
 869                case BPF_ALU | BPF_ADD | BPF_X:
 870                case BPF_ALU | BPF_ADD | BPF_K:
 871                case BPF_ALU | BPF_SUB | BPF_X:
 872                case BPF_ALU | BPF_SUB | BPF_K:
 873                case BPF_ALU | BPF_AND | BPF_X:
 874                case BPF_ALU | BPF_AND | BPF_K:
 875                case BPF_ALU | BPF_OR | BPF_X:
 876                case BPF_ALU | BPF_OR | BPF_K:
 877                case BPF_ALU | BPF_LSH | BPF_X:
 878                case BPF_ALU | BPF_LSH | BPF_K:
 879                case BPF_ALU | BPF_RSH | BPF_X:
 880                case BPF_ALU | BPF_RSH | BPF_K:
 881                case BPF_ALU | BPF_XOR | BPF_X:
 882                case BPF_ALU | BPF_XOR | BPF_K:
 883                case BPF_ALU | BPF_MUL | BPF_X:
 884                case BPF_ALU | BPF_MUL | BPF_K:
 885                case BPF_ALU | BPF_DIV | BPF_X:
 886                case BPF_ALU | BPF_DIV | BPF_K:
 887                case BPF_ALU | BPF_MOD | BPF_X:
 888                case BPF_ALU | BPF_MOD | BPF_K:
 889                case BPF_ALU | BPF_NEG:
 890                case BPF_LD | BPF_ABS | BPF_W:
 891                case BPF_LD | BPF_ABS | BPF_H:
 892                case BPF_LD | BPF_ABS | BPF_B:
 893                case BPF_LD | BPF_IND | BPF_W:
 894                case BPF_LD | BPF_IND | BPF_H:
 895                case BPF_LD | BPF_IND | BPF_B:
 896                        /* Check for overloaded BPF extension and
 897                         * directly convert it if found, otherwise
 898                         * just move on with mapping.
 899                         */
 900                        if (BPF_CLASS(fp->code) == BPF_LD &&
 901                            BPF_MODE(fp->code) == BPF_ABS &&
 902                            convert_bpf_extensions(fp, &insn))
 903                                break;
 904
 905                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
 906                        break;
 907
 908                /* Jump transformation cannot use BPF block macros
 909                 * everywhere as offset calculation and target updates
 910                 * require a bit more work than the rest, i.e. jump
 911                 * opcodes map as-is, but offsets need adjustment.
 912                 */
 913
 914#define BPF_EMIT_JMP                                                    \
 915        do {                                                            \
 916                if (target >= len || target < 0)                        \
 917                        goto err;                                       \
 918                insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;   \
 919                /* Adjust pc relative offset for 2nd or 3rd insn. */    \
 920                insn->off -= insn - tmp_insns;                          \
 921        } while (0)
 922
 923                case BPF_JMP | BPF_JA:
 924                        target = i + fp->k + 1;
 925                        insn->code = fp->code;
 926                        BPF_EMIT_JMP;
 927                        break;
 928
 929                case BPF_JMP | BPF_JEQ | BPF_K:
 930                case BPF_JMP | BPF_JEQ | BPF_X:
 931                case BPF_JMP | BPF_JSET | BPF_K:
 932                case BPF_JMP | BPF_JSET | BPF_X:
 933                case BPF_JMP | BPF_JGT | BPF_K:
 934                case BPF_JMP | BPF_JGT | BPF_X:
 935                case BPF_JMP | BPF_JGE | BPF_K:
 936                case BPF_JMP | BPF_JGE | BPF_X:
 937                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
 938                                /* BPF immediates are signed, zero extend
 939                                 * immediate into tmp register and use it
 940                                 * in compare insn.
 941                                 */
 942                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
 943
 944                                insn->dst_reg = BPF_REG_A;
 945                                insn->src_reg = BPF_REG_TMP;
 946                                bpf_src = BPF_X;
 947                        } else {
 948                                insn->dst_reg = BPF_REG_A;
 949                                insn->src_reg = BPF_REG_X;
 950                                insn->imm = fp->k;
 951                                bpf_src = BPF_SRC(fp->code);
 952                        }
 953
 954                        /* Common case where 'jump_false' is next insn. */
 955                        if (fp->jf == 0) {
 956                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 957                                target = i + fp->jt + 1;
 958                                BPF_EMIT_JMP;
 959                                break;
 960                        }
 961
 962                        /* Convert JEQ into JNE when 'jump_true' is next insn. */
 963                        if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
 964                                insn->code = BPF_JMP | BPF_JNE | bpf_src;
 965                                target = i + fp->jf + 1;
 966                                BPF_EMIT_JMP;
 967                                break;
 968                        }
 969
 970                        /* Other jumps are mapped into two insns: Jxx and JA. */
 971                        target = i + fp->jt + 1;
 972                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
 973                        BPF_EMIT_JMP;
 974                        insn++;
 975
 976                        insn->code = BPF_JMP | BPF_JA;
 977                        target = i + fp->jf + 1;
 978                        BPF_EMIT_JMP;
 979                        break;
 980
  981                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
 982                case BPF_LDX | BPF_MSH | BPF_B:
 983                        /* tmp = A */
 984                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
 985                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
 986                        *insn++ = BPF_LD_ABS(BPF_B, fp->k);
 987                        /* A &= 0xf */
 988                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
 989                        /* A <<= 2 */
 990                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
 991                        /* X = A */
 992                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
 993                        /* A = tmp */
 994                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
 995                        break;
 996
  997                /* RET_K, RET_A are remapped into 2 insns. */
 998                case BPF_RET | BPF_A:
 999                case BPF_RET | BPF_K:
1000                        *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
1001                                                BPF_K : BPF_X, BPF_REG_0,
1002                                                BPF_REG_A, fp->k);
1003                        *insn = BPF_EXIT_INSN();
1004                        break;
1005
1006                /* Store to stack. */
1007                case BPF_ST:
1008                case BPF_STX:
1009                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
1010                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
1011                                            -(BPF_MEMWORDS - fp->k) * 4);
1012                        break;
1013
1014                /* Load from stack. */
1015                case BPF_LD | BPF_MEM:
1016                case BPF_LDX | BPF_MEM:
1017                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
1018                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
1019                                            -(BPF_MEMWORDS - fp->k) * 4);
1020                        break;
1021
1022                /* A = K or X = K */
1023                case BPF_LD | BPF_IMM:
1024                case BPF_LDX | BPF_IMM:
1025                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
1026                                              BPF_REG_A : BPF_REG_X, fp->k);
1027                        break;
1028
1029                /* X = A */
1030                case BPF_MISC | BPF_TAX:
1031                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
1032                        break;
1033
1034                /* A = X */
1035                case BPF_MISC | BPF_TXA:
1036                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
1037                        break;
1038
1039                /* A = skb->len or X = skb->len */
1040                case BPF_LD | BPF_W | BPF_LEN:
1041                case BPF_LDX | BPF_W | BPF_LEN:
1042                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
1043                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
1044                                            offsetof(struct sk_buff, len));
1045                        break;
1046
1047                /* Access seccomp_data fields. */
1048                case BPF_LDX | BPF_ABS | BPF_W:
1049                        /* A = *(u32 *) (ctx + K) */
1050                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
1051                        break;
1052
 1053                /* Unknown instruction. */
1054                default:
1055                        goto err;
1056                }
1057
1058                insn++;
1059                if (new_prog)
1060                        memcpy(new_insn, tmp_insns,
1061                               sizeof(*insn) * (insn - tmp_insns));
1062                new_insn += insn - tmp_insns;
1063        }
1064
1065        if (!new_prog) {
1066                /* Only calculating new length. */
1067                *new_len = new_insn - new_prog;
1068                return 0;
1069        }
1070
1071        pass++;
1072        if (new_flen != new_insn - new_prog) {
1073                new_flen = new_insn - new_prog;
1074                if (pass > 2)
1075                        goto err;
1076                goto do_pass;
1077        }
1078
1079        kfree(addrs);
1080        BUG_ON(*new_len != new_flen);
1081        return 0;
1082err:
1083        kfree(addrs);
1084        return -EINVAL;
1085}
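
/* Editor's sketch (not part of the original file) of the two-pass calling
 * convention documented above; __sk_migrate_filter() further down does
 * exactly this. Names are hypothetical and error handling is abbreviated.
 */
static struct sock_filter_int *example_convert(struct sock_filter *old_prog,
                                               int old_len)
{
        struct sock_filter_int *new_prog;
        int new_len;

        /* 1st pass: NULL target, only computes the new program length. */
        if (sk_convert_filter(old_prog, old_len, NULL, &new_len))
                return NULL;

        new_prog = kmalloc(sizeof(*new_prog) * new_len, GFP_KERNEL);
        if (!new_prog)
                return NULL;

        /* 2nd pass: emit the remapped instructions. */
        if (sk_convert_filter(old_prog, old_len, new_prog, &new_len)) {
                kfree(new_prog);
                return NULL;
        }

        return new_prog;
}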
1086
1087/* Security:
1088 *
1089 * A BPF program is able to use 16 cells of memory to store intermediate
1090 * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()).
1091 *
 1092 * As we don't want to clear the mem[] array for each packet going through
 1093 * sk_run_filter(), we check that a filter loaded by the user never tries to
 1094 * read a cell that was not previously written, and we check all branches to
 1095 * be sure a malicious user doesn't try to abuse us.
1096 */
1097static int check_load_and_stores(struct sock_filter *filter, int flen)
1098{
1099        u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
1100        int pc, ret = 0;
1101
1102        BUILD_BUG_ON(BPF_MEMWORDS > 16);
1103
1104        masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
1105        if (!masks)
1106                return -ENOMEM;
1107
1108        memset(masks, 0xff, flen * sizeof(*masks));
1109
1110        for (pc = 0; pc < flen; pc++) {
1111                memvalid &= masks[pc];
1112
1113                switch (filter[pc].code) {
1114                case BPF_ST:
1115                case BPF_STX:
1116                        memvalid |= (1 << filter[pc].k);
1117                        break;
1118                case BPF_LD | BPF_MEM:
1119                case BPF_LDX | BPF_MEM:
1120                        if (!(memvalid & (1 << filter[pc].k))) {
1121                                ret = -EINVAL;
1122                                goto error;
1123                        }
1124                        break;
1125                case BPF_JMP | BPF_JA:
1126                        /* A jump must set masks on target */
1127                        masks[pc + 1 + filter[pc].k] &= memvalid;
1128                        memvalid = ~0;
1129                        break;
1130                case BPF_JMP | BPF_JEQ | BPF_K:
1131                case BPF_JMP | BPF_JEQ | BPF_X:
1132                case BPF_JMP | BPF_JGE | BPF_K:
1133                case BPF_JMP | BPF_JGE | BPF_X:
1134                case BPF_JMP | BPF_JGT | BPF_K:
1135                case BPF_JMP | BPF_JGT | BPF_X:
1136                case BPF_JMP | BPF_JSET | BPF_K:
1137                case BPF_JMP | BPF_JSET | BPF_X:
1138                        /* A jump must set masks on targets */
1139                        masks[pc + 1 + filter[pc].jt] &= memvalid;
1140                        masks[pc + 1 + filter[pc].jf] &= memvalid;
1141                        memvalid = ~0;
1142                        break;
1143                }
1144        }
1145error:
1146        kfree(masks);
1147        return ret;
1148}
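
/* Editor's example (not in the original file): a classic program that the
 * check above rejects, since it reads M[0] without ever storing to it.
 * The array name is hypothetical.
 */
static const struct sock_filter example_uninit_read[] = {
        BPF_STMT(BPF_LD | BPF_MEM, 0),  /* A = mem[0], never written */
        BPF_STMT(BPF_RET | BPF_A, 0),   /* return A */
};
/* check_load_and_stores(example_uninit_read, 2) returns -EINVAL. */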
1149
1150static bool chk_code_allowed(u16 code_to_probe)
1151{
1152        static const bool codes[] = {
1153                /* 32 bit ALU operations */
1154                [BPF_ALU | BPF_ADD | BPF_K] = true,
1155                [BPF_ALU | BPF_ADD | BPF_X] = true,
1156                [BPF_ALU | BPF_SUB | BPF_K] = true,
1157                [BPF_ALU | BPF_SUB | BPF_X] = true,
1158                [BPF_ALU | BPF_MUL | BPF_K] = true,
1159                [BPF_ALU | BPF_MUL | BPF_X] = true,
1160                [BPF_ALU | BPF_DIV | BPF_K] = true,
1161                [BPF_ALU | BPF_DIV | BPF_X] = true,
1162                [BPF_ALU | BPF_MOD | BPF_K] = true,
1163                [BPF_ALU | BPF_MOD | BPF_X] = true,
1164                [BPF_ALU | BPF_AND | BPF_K] = true,
1165                [BPF_ALU | BPF_AND | BPF_X] = true,
1166                [BPF_ALU | BPF_OR | BPF_K] = true,
1167                [BPF_ALU | BPF_OR | BPF_X] = true,
1168                [BPF_ALU | BPF_XOR | BPF_K] = true,
1169                [BPF_ALU | BPF_XOR | BPF_X] = true,
1170                [BPF_ALU | BPF_LSH | BPF_K] = true,
1171                [BPF_ALU | BPF_LSH | BPF_X] = true,
1172                [BPF_ALU | BPF_RSH | BPF_K] = true,
1173                [BPF_ALU | BPF_RSH | BPF_X] = true,
1174                [BPF_ALU | BPF_NEG] = true,
1175                /* Load instructions */
1176                [BPF_LD | BPF_W | BPF_ABS] = true,
1177                [BPF_LD | BPF_H | BPF_ABS] = true,
1178                [BPF_LD | BPF_B | BPF_ABS] = true,
1179                [BPF_LD | BPF_W | BPF_LEN] = true,
1180                [BPF_LD | BPF_W | BPF_IND] = true,
1181                [BPF_LD | BPF_H | BPF_IND] = true,
1182                [BPF_LD | BPF_B | BPF_IND] = true,
1183                [BPF_LD | BPF_IMM] = true,
1184                [BPF_LD | BPF_MEM] = true,
1185                [BPF_LDX | BPF_W | BPF_LEN] = true,
1186                [BPF_LDX | BPF_B | BPF_MSH] = true,
1187                [BPF_LDX | BPF_IMM] = true,
1188                [BPF_LDX | BPF_MEM] = true,
1189                /* Store instructions */
1190                [BPF_ST] = true,
1191                [BPF_STX] = true,
1192                /* Misc instructions */
1193                [BPF_MISC | BPF_TAX] = true,
1194                [BPF_MISC | BPF_TXA] = true,
1195                /* Return instructions */
1196                [BPF_RET | BPF_K] = true,
1197                [BPF_RET | BPF_A] = true,
1198                /* Jump instructions */
1199                [BPF_JMP | BPF_JA] = true,
1200                [BPF_JMP | BPF_JEQ | BPF_K] = true,
1201                [BPF_JMP | BPF_JEQ | BPF_X] = true,
1202                [BPF_JMP | BPF_JGE | BPF_K] = true,
1203                [BPF_JMP | BPF_JGE | BPF_X] = true,
1204                [BPF_JMP | BPF_JGT | BPF_K] = true,
1205                [BPF_JMP | BPF_JGT | BPF_X] = true,
1206                [BPF_JMP | BPF_JSET | BPF_K] = true,
1207                [BPF_JMP | BPF_JSET | BPF_X] = true,
1208        };
1209
1210        if (code_to_probe >= ARRAY_SIZE(codes))
1211                return false;
1212
1213        return codes[code_to_probe];
1214}
1215
1216/**
1217 *      sk_chk_filter - verify socket filter code
1218 *      @filter: filter to verify
1219 *      @flen: length of filter
1220 *
1221 * Check the user's filter code. If we let some ugly
1222 * filter code slip through kaboom! The filter must contain
1223 * no references or jumps that are out of range, no illegal
1224 * instructions, and must end with a RET instruction.
1225 *
1226 * All jumps are forward as they are not signed.
1227 *
1228 * Returns 0 if the rule set is legal or -EINVAL if not.
1229 */
1230int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
1231{
1232        bool anc_found;
1233        int pc;
1234
1235        if (flen == 0 || flen > BPF_MAXINSNS)
1236                return -EINVAL;
1237
1238        /* Check the filter code now */
1239        for (pc = 0; pc < flen; pc++) {
1240                struct sock_filter *ftest = &filter[pc];
1241
1242                /* May we actually operate on this code? */
1243                if (!chk_code_allowed(ftest->code))
1244                        return -EINVAL;
1245
1246                /* Some instructions need special checks */
1247                switch (ftest->code) {
1248                case BPF_ALU | BPF_DIV | BPF_K:
1249                case BPF_ALU | BPF_MOD | BPF_K:
1250                        /* Check for division by zero */
1251                        if (ftest->k == 0)
1252                                return -EINVAL;
1253                        break;
1254                case BPF_LD | BPF_MEM:
1255                case BPF_LDX | BPF_MEM:
1256                case BPF_ST:
1257                case BPF_STX:
1258                        /* Check for invalid memory addresses */
1259                        if (ftest->k >= BPF_MEMWORDS)
1260                                return -EINVAL;
1261                        break;
1262                case BPF_JMP | BPF_JA:
1263                        /* Note, the large ftest->k might cause loops.
1264                         * Compare this with conditional jumps below,
1265                         * where offsets are limited. --ANK (981016)
1266                         */
1267                        if (ftest->k >= (unsigned int)(flen - pc - 1))
1268                                return -EINVAL;
1269                        break;
1270                case BPF_JMP | BPF_JEQ | BPF_K:
1271                case BPF_JMP | BPF_JEQ | BPF_X:
1272                case BPF_JMP | BPF_JGE | BPF_K:
1273                case BPF_JMP | BPF_JGE | BPF_X:
1274                case BPF_JMP | BPF_JGT | BPF_K:
1275                case BPF_JMP | BPF_JGT | BPF_X:
1276                case BPF_JMP | BPF_JSET | BPF_K:
1277                case BPF_JMP | BPF_JSET | BPF_X:
1278                        /* Both conditionals must be safe */
1279                        if (pc + ftest->jt + 1 >= flen ||
1280                            pc + ftest->jf + 1 >= flen)
1281                                return -EINVAL;
1282                        break;
1283                case BPF_LD | BPF_W | BPF_ABS:
1284                case BPF_LD | BPF_H | BPF_ABS:
1285                case BPF_LD | BPF_B | BPF_ABS:
1286                        anc_found = false;
1287                        if (bpf_anc_helper(ftest) & BPF_ANC)
1288                                anc_found = true;
1289                        /* Ancillary operation unknown or unsupported */
1290                        if (anc_found == false && ftest->k >= SKF_AD_OFF)
1291                                return -EINVAL;
1292                }
1293        }
1294
1295        /* Last instruction must be a RET code */
1296        switch (filter[flen - 1].code) {
1297        case BPF_RET | BPF_K:
1298        case BPF_RET | BPF_A:
1299                return check_load_and_stores(filter, flen);
1300        }
1301
1302        return -EINVAL;
1303}
1304EXPORT_SYMBOL(sk_chk_filter);
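
/* Editor's example (not in the original file): the canonical "accept all"
 * classic filter, which passes the checks above - a single RET that keeps
 * up to 0xffff bytes of every packet. The array name is hypothetical.
 */
static const struct sock_filter example_accept_all[] = {
        BPF_STMT(BPF_RET | BPF_K, 0xffff),
};
/* sk_chk_filter(example_accept_all, 1) returns 0. */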
1305
1306static int sk_store_orig_filter(struct sk_filter *fp,
1307                                const struct sock_fprog *fprog)
1308{
1309        unsigned int fsize = sk_filter_proglen(fprog);
1310        struct sock_fprog_kern *fkprog;
1311
1312        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1313        if (!fp->orig_prog)
1314                return -ENOMEM;
1315
1316        fkprog = fp->orig_prog;
1317        fkprog->len = fprog->len;
1318        fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL);
1319        if (!fkprog->filter) {
1320                kfree(fp->orig_prog);
1321                return -ENOMEM;
1322        }
1323
1324        return 0;
1325}
1326
1327static void sk_release_orig_filter(struct sk_filter *fp)
1328{
1329        struct sock_fprog_kern *fprog = fp->orig_prog;
1330
1331        if (fprog) {
1332                kfree(fprog->filter);
1333                kfree(fprog);
1334        }
1335}
1336
1337/**
1338 *      sk_filter_release_rcu - Release a socket filter by rcu_head
1339 *      @rcu: rcu_head that contains the sk_filter to free
1340 */
1341static void sk_filter_release_rcu(struct rcu_head *rcu)
1342{
1343        struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1344
1345        sk_release_orig_filter(fp);
1346        sk_filter_free(fp);
1347}
1348
1349/**
1350 *      sk_filter_release - release a socket filter
1351 *      @fp: filter to remove
1352 *
1353 *      Remove a filter from a socket and release its resources.
1354 */
1355static void sk_filter_release(struct sk_filter *fp)
1356{
1357        if (atomic_dec_and_test(&fp->refcnt))
1358                call_rcu(&fp->rcu, sk_filter_release_rcu);
1359}
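
/* Editor's note: hedged sketch of the reader side that the call_rcu()
 * above protects, roughly following what sk_filter() does earlier in
 * this file; the function name is made up. Readers dereference
 * sk->sk_filter under rcu_read_lock(), so a filter whose refcount hits
 * zero must not be freed before the RCU grace period has elapsed.
 */
static inline unsigned int example_run_sk_filter(struct sock *sk,
                                                 struct sk_buff *skb)
{
        struct sk_filter *filter;
        unsigned int pkt_len = 0;

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter)
                pkt_len = SK_RUN_FILTER(filter, skb);
        rcu_read_unlock();

        return pkt_len;
}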
1360
1361void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1362{
1363        atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1364        sk_filter_release(fp);
1365}
1366
1367void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1368{
1369        atomic_inc(&fp->refcnt);
1370        atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
1371}
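
/* Editor's note: the charge/uncharge pair above keeps sk->sk_omem_alloc
 * in sync with the filter's lifetime. In sk_attach_filter() below the
 * initial charge is implicit in sock_kmalloc(); sk_filter_charge() is
 * meant for callers elsewhere in the stack (for instance when a socket
 * is cloned) that take an extra reference on an already attached filter.
 */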
1372
1373static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
1374                                              struct sock *sk,
1375                                              unsigned int len)
1376{
1377        struct sk_filter *fp_new;
1378
1379        if (sk == NULL)
1380                return krealloc(fp, len, GFP_KERNEL);
1381
1382        fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
1383        if (fp_new) {
1384                *fp_new = *fp;
1385                /* Since fp_new keeps the orig_prog pointer, clear it
1386                 * in the old fp so that releasing the old filter below
1387                 * does not also free the original program.
1388                 */
1389                fp->orig_prog = NULL;
1390                sk_filter_uncharge(sk, fp);
1391        }
1392
1393        return fp_new;
1394}
1395
1396static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
1397                                             struct sock *sk)
1398{
1399        struct sock_filter *old_prog;
1400        struct sk_filter *old_fp;
1401        int err, new_len, old_len = fp->len;
1402
1403        /* We are free to overwrite insns et al right here, as the
1404         * classic insns are no longer used internally once the
1405         * migration to the internal BPF instruction representation
1406         * has been done.
1407         */
1408        BUILD_BUG_ON(sizeof(struct sock_filter) !=
1409                     sizeof(struct sock_filter_int));
1410
1411        /* Conversion cannot happen on overlapping memory areas,
1412         * so we need to keep the user BPF around until the 2nd
1413         * pass. At this time, the user BPF is stored in fp->insns.
1414         */
1415        old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1416                           GFP_KERNEL);
1417        if (!old_prog) {
1418                err = -ENOMEM;
1419                goto out_err;
1420        }
1421
1422        /* 1st pass: calculate the new program length. */
1423        err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
1424        if (err)
1425                goto out_err_free;
1426
1427        /* Expand fp for appending the new filter representation. */
1428        old_fp = fp;
1429        fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
1430        if (!fp) {
1431                /* The allocation failed, so old_fp is still around;
1432                 * make sure the error path below uncharges/frees it.
1433                 */
1434                fp = old_fp;
1435                err = -ENOMEM;
1436                goto out_err_free;
1437        }
1438
1439        fp->len = new_len;
1440
1441        /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
1442        err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1443        if (err)
1444                /* The 2nd sk_convert_filter() can only fail due to a
1445                 * memory allocation failure; the remapping itself must
1446                 * succeed. Note that at this point old_fp has already
1447                 * been released by __sk_migrate_realloc().
1448                 */
1449                goto out_err_free;
1450
1451        sk_filter_select_runtime(fp);
1452
1453        kfree(old_prog);
1454        return fp;
1455
1456out_err_free:
1457        kfree(old_prog);
1458out_err:
1459        /* Rollback filter setup. */
1460        if (sk != NULL)
1461                sk_filter_uncharge(sk, fp);
1462        else
1463                kfree(fp);
1464        return ERR_PTR(err);
1465}
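
/* Editor's note: the two sk_convert_filter() calls above are a classic
 * two-pass pattern: the first pass (NULL output buffer) only computes
 * new_len, since a single classic insn may expand into several internal
 * insns, and the second pass emits into the resized fp->insnsi. Both
 * encodings are 8 bytes per insn, which is what the BUILD_BUG_ON() at
 * the top of the function asserts.
 */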
1466
1467void __weak bpf_int_jit_compile(struct sk_filter *prog)
1468{
1469}
1470
1471/**
1472 *      sk_filter_select_runtime - select execution runtime for BPF program
1473 *      @fp: sk_filter populated with internal BPF program
1474 *
1475 * Try to JIT the internal BPF program; if no JIT is available, fall back
1476 * to the interpreter. The program is then run via the SK_RUN_FILTER() macro.
1477 */
1478void sk_filter_select_runtime(struct sk_filter *fp)
1479{
1480        fp->bpf_func = (void *) __sk_run_filter;
1481
1482        /* Probe if internal BPF can be JITed */
1483        bpf_int_jit_compile(fp);
1484}
1485EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
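
/* Editor's note: hedged sketch of what an architecture-specific override
 * of the __weak bpf_int_jit_compile() above might roughly look like;
 * details vary per arch and "image" is a placeholder for generated code:
 *
 *        void bpf_int_jit_compile(struct sk_filter *prog)
 *        {
 *                void *image = ...emit native code for prog->insnsi...;
 *
 *                if (image) {
 *                        prog->bpf_func = (void *)image;
 *                        prog->jited = 1;
 *                }
 *        }
 *
 * If the override does nothing or fails, fp->bpf_func keeps pointing at
 * __sk_run_filter and the program runs in the interpreter.
 */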
1486
1487/* free internal BPF program */
1488void sk_filter_free(struct sk_filter *fp)
1489{
1490        bpf_jit_free(fp);
1491}
1492EXPORT_SYMBOL_GPL(sk_filter_free);
1493
1494static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
1495                                             struct sock *sk)
1496{
1497        int err;
1498
1499        fp->bpf_func = NULL;
1500        fp->jited = 0;
1501
1502        err = sk_chk_filter(fp->insns, fp->len);
1503        if (err) {
1504                if (sk != NULL)
1505                        sk_filter_uncharge(sk, fp);
1506                else
1507                        kfree(fp);
1508                return ERR_PTR(err);
1509        }
1510
1511        /* Probe whether the classic BPF JIT can compile the filter
1512         * and, if so, compile it now.
1513         */
1514        bpf_jit_compile(fp);
1515
1516        /* JIT compiler couldn't process this filter, so do the
1517         * internal BPF translation for the optimized interpreter.
1518         */
1519        if (!fp->jited)
1520                fp = __sk_migrate_filter(fp, sk);
1521
1522        return fp;
1523}
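
/* Editor's note: summary of the decision flow above: the classic BPF JIT
 * (bpf_jit_compile) gets the first shot; only if it declines, leaving
 * fp->jited at 0, is the program migrated to internal BPF, where
 * sk_filter_select_runtime() picks either the internal BPF JIT or the
 * __sk_run_filter interpreter.
 */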
1524
1525/**
1526 *      sk_unattached_filter_create - create an unattached filter
1527 *      @pfp: the unattached filter that is created
1528 *      @fprog: the filter program
1529 *
1530 * Create a filter independent of any socket. We first run some
1531 * sanity checks on it to make sure it does not explode on us later.
1532 * If an error occurs or there is insufficient memory for the filter,
1533 * a negative errno code is returned. On success the return is zero.
1534 */
1535int sk_unattached_filter_create(struct sk_filter **pfp,
1536                                struct sock_fprog_kern *fprog)
1537{
1538        unsigned int fsize = sk_filter_proglen(fprog);
1539        struct sk_filter *fp;
1540
1541        /* Make sure a filter program was actually supplied. */
1542        if (fprog->filter == NULL)
1543                return -EINVAL;
1544
1545        fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
1546        if (!fp)
1547                return -ENOMEM;
1548
1549        memcpy(fp->insns, fprog->filter, fsize);
1550
1551        atomic_set(&fp->refcnt, 1);
1552        fp->len = fprog->len;
1553        /* Since unattached filters are not copied back to user
1554         * space through sk_get_filter(), we do not need to keep a
1555         * copy of the original program and can spare ourselves the work.
1556         */
1557        fp->orig_prog = NULL;
1558
1559        /* __sk_prepare_filter() already takes care of freeing the
1560         * memory in case something goes wrong.
1561         */
1562        fp = __sk_prepare_filter(fp, NULL);
1563        if (IS_ERR(fp))
1564                return PTR_ERR(fp);
1565
1566        *pfp = fp;
1567        return 0;
1568}
1569EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
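
/* Editor's note: hedged usage sketch for the API above; the function and
 * program names are made up. An in-kernel user wraps a classic program
 * in a struct sock_fprog_kern, creates the filter once, runs it via
 * SK_RUN_FILTER(fp, skb), and releases it with
 * sk_unattached_filter_destroy() when done.
 */
static int __maybe_unused example_unattached_filter(struct sk_filter **pfp)
{
        static struct sock_filter example_prog[] = {
                /* unconditionally accept the whole packet */
                BPF_STMT(BPF_RET | BPF_K, 0xffff),
        };
        struct sock_fprog_kern fprog = {
                .len    = ARRAY_SIZE(example_prog),
                .filter = example_prog,
        };

        return sk_unattached_filter_create(pfp, &fprog);
}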
1570
1571void sk_unattached_filter_destroy(struct sk_filter *fp)
1572{
1573        sk_filter_release(fp);
1574}
1575EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
1576
1577/**
1578 *      sk_attach_filter - attach a socket filter
1579 *      @fprog: the filter program
1580 *      @sk: the socket to use
1581 *
1582 * Attach the user's filter code. We first run some sanity checks on
1583 * it to make sure it does not explode on us later. If an error
1584 * occurs or there is insufficient memory for the filter, a negative
1585 * errno code is returned. On success the return is zero.
1586 */
1587int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1588{
1589        struct sk_filter *fp, *old_fp;
1590        unsigned int fsize = sk_filter_proglen(fprog);
1591        unsigned int sk_fsize = sk_filter_size(fprog->len);
1592        int err;
1593
1594        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1595                return -EPERM;
1596
1597        /* Make sure a filter program was actually supplied. */
1598        if (fprog->filter == NULL)
1599                return -EINVAL;
1600
1601        fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
1602        if (!fp)
1603                return -ENOMEM;
1604
1605        if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1606                sock_kfree_s(sk, fp, sk_fsize);
1607                return -EFAULT;
1608        }
1609
1610        atomic_set(&fp->refcnt, 1);
1611        fp->len = fprog->len;
1612
1613        err = sk_store_orig_filter(fp, fprog);
1614        if (err) {
1615                sk_filter_uncharge(sk, fp);
1616                return -ENOMEM;
1617        }
1618
1619        /* __sk_prepare_filter() already takes care of uncharging
1620         * memory in case something goes wrong.
1621         */
1622        fp = __sk_prepare_filter(fp, sk);
1623        if (IS_ERR(fp))
1624                return PTR_ERR(fp);
1625
1626        old_fp = rcu_dereference_protected(sk->sk_filter,
1627                                           sock_owned_by_user(sk));
1628        rcu_assign_pointer(sk->sk_filter, fp);
1629
1630        if (old_fp)
1631                sk_filter_uncharge(sk, old_fp);
1632
1633        return 0;
1634}
1635EXPORT_SYMBOL_GPL(sk_attach_filter);
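
/* Editor's note: hedged sketch of the user-space side that reaches
 * sk_attach_filter() via setsockopt(); error handling omitted and the
 * one-instruction program simply accepts every packet:
 *
 *        struct sock_filter code[] = {
 *                { BPF_RET | BPF_K, 0, 0, 0xffff },
 *        };
 *        struct sock_fprog prog = {
 *                .len    = sizeof(code) / sizeof(code[0]),
 *                .filter = code,
 *        };
 *
 *        setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 *
 * SO_LOCK_FILTER can afterwards be used to set SOCK_FILTER_LOCKED, at
 * which point both attach and detach fail with -EPERM as checked above
 * and in sk_detach_filter() below.
 */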
1636
1637int sk_detach_filter(struct sock *sk)
1638{
1639        int ret = -ENOENT;
1640        struct sk_filter *filter;
1641
1642        if (sock_flag(sk, SOCK_FILTER_LOCKED))
1643                return -EPERM;
1644
1645        filter = rcu_dereference_protected(sk->sk_filter,
1646                                           sock_owned_by_user(sk));
1647        if (filter) {
1648                RCU_INIT_POINTER(sk->sk_filter, NULL);
1649                sk_filter_uncharge(sk, filter);
1650                ret = 0;
1651        }
1652
1653        return ret;
1654}
1655EXPORT_SYMBOL_GPL(sk_detach_filter);
1656
1657int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
1658                  unsigned int len)
1659{
1660        struct sock_fprog_kern *fprog;
1661        struct sk_filter *filter;
1662        int ret = 0;
1663
1664        lock_sock(sk);
1665        filter = rcu_dereference_protected(sk->sk_filter,
1666                                           sock_owned_by_user(sk));
1667        if (!filter)
1668                goto out;
1669
1670        /* We're copying the filter as it was originally attached,
1671         * so no conversion or decoding is needed anymore.
1672         */
1673        fprog = filter->orig_prog;
1674
1675        ret = fprog->len;
1676        if (!len)
1677                /* User space is only asking for the number of filter blocks. */
1678                goto out;
1679
1680        ret = -EINVAL;
1681        if (len < fprog->len)
1682                goto out;
1683
1684        ret = -EFAULT;
1685        if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
1686                goto out;
1687
1688        /* The API requires us to return the number of filter
1689         * blocks rather than the number of bytes copied.
1690         */
1691        ret = fprog->len;
1692out:
1693        release_sock(sk);
1694        return ret;
1695}
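
/* Editor's note: hedged sketch of the matching user-space call sequence
 * (getsockopt(SO_GET_FILTER) lands here). As the comments above note,
 * lengths are counted in filter blocks rather than bytes: a first call
 * with optlen 0 returns the block count, a second call with a large
 * enough buffer copies the program out; error handling omitted:
 *
 *        socklen_t optlen = 0;
 *        struct sock_filter *insns;
 *
 *        getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &optlen);
 *        insns = calloc(optlen, sizeof(*insns));
 *        getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &optlen);
 */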
1696