linux/arch/arm/net/bpf_jit_32.c
   1/*
   2 * Just-In-Time compiler for eBPF filters on 32bit ARM
   3 *
   4 * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
   5 * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
   6 *
   7 * This program is free software; you can redistribute it and/or modify it
   8 * under the terms of the GNU General Public License as published by the
   9 * Free Software Foundation; version 2 of the License.
  10 */
  11
  12#include <linux/bpf.h>
  13#include <linux/bitops.h>
  14#include <linux/compiler.h>
  15#include <linux/errno.h>
  16#include <linux/filter.h>
  17#include <linux/netdevice.h>
  18#include <linux/string.h>
  19#include <linux/slab.h>
  20#include <linux/if_vlan.h>
  21
  22#include <asm/cacheflush.h>
  23#include <asm/hwcap.h>
  24#include <asm/opcodes.h>
  25#include <asm/system_info.h>
  26
  27#include "bpf_jit_32.h"
  28
  29/*
  30 * eBPF prog stack layout:
  31 *
  32 *                         high
  33 * original ARM_SP =>     +-----+
  34 *                        |     | callee saved registers
  35 *                        +-----+ <= (BPF_FP + SCRATCH_SIZE)
  36 *                        | ... | eBPF JIT scratch space
  37 * eBPF fp register =>    +-----+
  38 *   (BPF_FP)             | ... | eBPF prog stack
  39 *                        +-----+
  40 *                        |RSVD | JIT scratchpad
  41 * current ARM_SP =>      +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE)
  42 *                        |     |
  43 *                        | ... | Function call stack
  44 *                        |     |
  45 *                        +-----+
  46 *                          low
  47 *
   48 * The callee saved registers depend on whether frame pointers are enabled.
  49 * With frame pointers (to be compliant with the ABI):
  50 *
  51 *                              high
  52 * original ARM_SP =>     +--------------+ \
  53 *                        |      pc      | |
  54 * current ARM_FP =>      +--------------+ } callee saved registers
  55 *                        |r4-r9,fp,ip,lr| |
  56 *                        +--------------+ /
  57 *                              low
  58 *
  59 * Without frame pointers:
  60 *
  61 *                              high
  62 * original ARM_SP =>     +--------------+
  63 *                        |  r4-r9,fp,lr | callee saved registers
  64 * current ARM_FP =>      +--------------+
  65 *                              low
  66 *
  67 * When popping registers off the stack at the end of a BPF function, we
  68 * reference them via the current ARM_FP register.
  69 */
  70#define CALLEE_MASK     (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \
  71                         1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \
  72                         1 << ARM_FP)
  73#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR)
  74#define CALLEE_POP_MASK  (CALLEE_MASK | 1 << ARM_PC)
  75
  76enum {
  77        /* Stack layout - these are offsets from (top of stack - 4) */
  78        BPF_R2_HI,
  79        BPF_R2_LO,
  80        BPF_R3_HI,
  81        BPF_R3_LO,
  82        BPF_R4_HI,
  83        BPF_R4_LO,
  84        BPF_R5_HI,
  85        BPF_R5_LO,
  86        BPF_R7_HI,
  87        BPF_R7_LO,
  88        BPF_R8_HI,
  89        BPF_R8_LO,
  90        BPF_R9_HI,
  91        BPF_R9_LO,
  92        BPF_FP_HI,
  93        BPF_FP_LO,
  94        BPF_TC_HI,
  95        BPF_TC_LO,
  96        BPF_AX_HI,
  97        BPF_AX_LO,
  98        /* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4,
  99         * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9,
 100         * BPF_REG_FP and Tail call counts.
 101         */
 102        BPF_JIT_SCRATCH_REGS,
 103};
 104
 105/*
  106 * Negative "register" values indicate that the register is stored on the
  107 * stack; the value is the offset from the top of the eBPF JIT scratch space.
 108 */
 109#define STACK_OFFSET(k) (-4 - (k) * 4)
 110#define SCRATCH_SIZE    (BPF_JIT_SCRATCH_REGS * 4)
 111
 112#ifdef CONFIG_FRAME_POINTER
 113#define EBPF_SCRATCH_TO_ARM_FP(x) ((x) - 4 * hweight16(CALLEE_PUSH_MASK) - 4)
 114#else
 115#define EBPF_SCRATCH_TO_ARM_FP(x) (x)
 116#endif
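/*
 * Worked example of the mapping (derived from build_prologue() below):
 * with CONFIG_FRAME_POINTER, ARM_FP addresses the saved pc slot and the
 * nine words below it hold r4-r9, fp, ip and lr, i.e.
 * 4 * hweight16(CALLEE_PUSH_MASK) + 4 = 36 bytes, so a scratch slot such
 * as STACK_OFFSET(BPF_R2_HI) == -4 becomes an FP-relative offset of -40.
 * Without frame pointers, ARM_FP already points at the top of the scratch
 * space and the offset is used unchanged.
 */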
 117
 118#define TMP_REG_1       (MAX_BPF_JIT_REG + 0)   /* TEMP Register 1 */
 119#define TMP_REG_2       (MAX_BPF_JIT_REG + 1)   /* TEMP Register 2 */
 120#define TCALL_CNT       (MAX_BPF_JIT_REG + 2)   /* Tail Call Count */
 121
 122#define FLAG_IMM_OVERFLOW       (1 << 0)
 123
 124/*
 125 * Map eBPF registers to ARM 32bit registers or stack scratch space.
 126 *
  127 * 1. The first argument is passed in ARM 32-bit registers and the rest of
  128 * the arguments are passed on the stack scratch space.
  129 * 2. The first callee-saved register is mapped to ARM 32-bit registers and
  130 * the rest are mapped to scratch space on the stack.
  131 * 3. We need two 64-bit temp registers to do complex operations on eBPF
  132 * registers.
  133 *
  134 * As the eBPF registers are all 64 bits wide and ARM has only 32-bit
  135 * registers, each eBPF register is mapped to a pair of ARM 32-bit registers
  136 * or to scratch memory on the stack, and the 64-bit value is built from those.
 137 *
 138 */
 139static const s8 bpf2a32[][2] = {
 140        /* return value from in-kernel function, and exit value from eBPF */
 141        [BPF_REG_0] = {ARM_R1, ARM_R0},
 142        /* arguments from eBPF program to in-kernel function */
 143        [BPF_REG_1] = {ARM_R3, ARM_R2},
 144        /* Stored on stack scratch space */
 145        [BPF_REG_2] = {STACK_OFFSET(BPF_R2_HI), STACK_OFFSET(BPF_R2_LO)},
 146        [BPF_REG_3] = {STACK_OFFSET(BPF_R3_HI), STACK_OFFSET(BPF_R3_LO)},
 147        [BPF_REG_4] = {STACK_OFFSET(BPF_R4_HI), STACK_OFFSET(BPF_R4_LO)},
 148        [BPF_REG_5] = {STACK_OFFSET(BPF_R5_HI), STACK_OFFSET(BPF_R5_LO)},
 149        /* callee saved registers that in-kernel function will preserve */
 150        [BPF_REG_6] = {ARM_R5, ARM_R4},
 151        /* Stored on stack scratch space */
 152        [BPF_REG_7] = {STACK_OFFSET(BPF_R7_HI), STACK_OFFSET(BPF_R7_LO)},
 153        [BPF_REG_8] = {STACK_OFFSET(BPF_R8_HI), STACK_OFFSET(BPF_R8_LO)},
 154        [BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)},
 155        /* Read only Frame Pointer to access Stack */
 156        [BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)},
  157        /* Temporary registers for the internal BPF JIT; can be used
  158         * for constant blinding and other purposes.
  159         */
 160        [TMP_REG_1] = {ARM_R7, ARM_R6},
 161        [TMP_REG_2] = {ARM_R9, ARM_R8},
 162        /* Tail call count. Stored on stack scratch space. */
 163        [TCALL_CNT] = {STACK_OFFSET(BPF_TC_HI), STACK_OFFSET(BPF_TC_LO)},
 164        /* temporary register for blinding constants.
 165         * Stored on stack scratch space.
 166         */
 167        [BPF_REG_AX] = {STACK_OFFSET(BPF_AX_HI), STACK_OFFSET(BPF_AX_LO)},
 168};
 169
 170#define dst_lo  dst[1]
 171#define dst_hi  dst[0]
 172#define src_lo  src[1]
 173#define src_hi  src[0]
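/*
 * Example: bpf2a32[BPF_REG_6] = {ARM_R5, ARM_R4}, so dst_hi selects ARM_R5
 * and dst_lo selects ARM_R4 for that register.  For a stacked register such
 * as BPF_REG_2, both entries are negative stack offsets rather than core
 * registers, which is what is_stacked() below tests for.
 */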
 174
 175/*
 176 * JIT Context:
 177 *
 178 * prog                 :       bpf_prog
 179 * idx                  :       index of current last JITed instruction.
 180 * prologue_bytes       :       bytes used in prologue.
 181 * epilogue_offset      :       offset of epilogue starting.
 182 * offsets              :       array of eBPF instruction offsets in
 183 *                              JITed code.
 184 * target               :       final JITed code.
  185 * epilogue_bytes       :       number of bytes used in the epilogue.
  186 * imm_count            :       number of constants emitted into the
  187 *                              literal pool (pre-ARMv7 only).
  188 * imms                 :       literal pool; array of constant values.
 189 */
 190
 191struct jit_ctx {
 192        const struct bpf_prog *prog;
 193        unsigned int idx;
 194        unsigned int prologue_bytes;
 195        unsigned int epilogue_offset;
 196        unsigned int cpu_architecture;
 197        u32 flags;
 198        u32 *offsets;
 199        u32 *target;
 200        u32 stack_size;
 201#if __LINUX_ARM_ARCH__ < 7
 202        u16 epilogue_bytes;
 203        u16 imm_count;
 204        u32 *imms;
 205#endif
 206};
 207
 208/*
  209 * Wrappers which handle both OABI and EABI and assure Thumb2 interworking
 210 * (where the assembly routines like __aeabi_uidiv could cause problems).
 211 */
 212static u32 jit_udiv32(u32 dividend, u32 divisor)
 213{
 214        return dividend / divisor;
 215}
 216
 217static u32 jit_mod32(u32 dividend, u32 divisor)
 218{
 219        return dividend % divisor;
 220}
 221
 222static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
 223{
 224        inst |= (cond << 28);
 225        inst = __opcode_to_mem_arm(inst);
 226
 227        if (ctx->target != NULL)
 228                ctx->target[ctx->idx] = inst;
 229
 230        ctx->idx++;
 231}
 232
 233/*
 234 * Emit an instruction that will be executed unconditionally.
 235 */
 236static inline void emit(u32 inst, struct jit_ctx *ctx)
 237{
 238        _emit(ARM_COND_AL, inst, ctx);
 239}
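/*
 * Illustration: _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx) emits a CMP
 * that executes only when the Z flag is set, since the condition code is
 * placed in bits 31:28 of the instruction word.
 */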
 240
 241/*
 242 * This is rather horrid, but necessary to convert an integer constant
 243 * to an immediate operand for the opcodes, and be able to detect at
 244 * build time whether the constant can't be converted (iow, usable in
 245 * BUILD_BUG_ON()).
 246 */
 247#define imm12val(v, s) (rol32(v, (s)) | (s) << 7)
 248#define const_imm8m(x)                                  \
 249        ({ int r;                                       \
 250           u32 v = (x);                                 \
 251           if (!(v & ~0x000000ff))                      \
 252                r = imm12val(v, 0);                     \
 253           else if (!(v & ~0xc000003f))                 \
 254                r = imm12val(v, 2);                     \
 255           else if (!(v & ~0xf000000f))                 \
 256                r = imm12val(v, 4);                     \
 257           else if (!(v & ~0xfc000003))                 \
 258                r = imm12val(v, 6);                     \
 259           else if (!(v & ~0xff000000))                 \
 260                r = imm12val(v, 8);                     \
 261           else if (!(v & ~0x3fc00000))                 \
 262                r = imm12val(v, 10);                    \
 263           else if (!(v & ~0x0ff00000))                 \
 264                r = imm12val(v, 12);                    \
 265           else if (!(v & ~0x03fc0000))                 \
 266                r = imm12val(v, 14);                    \
 267           else if (!(v & ~0x00ff0000))                 \
 268                r = imm12val(v, 16);                    \
 269           else if (!(v & ~0x003fc000))                 \
 270                r = imm12val(v, 18);                    \
 271           else if (!(v & ~0x000ff000))                 \
 272                r = imm12val(v, 20);                    \
 273           else if (!(v & ~0x0003fc00))                 \
 274                r = imm12val(v, 22);                    \
 275           else if (!(v & ~0x0000ff00))                 \
 276                r = imm12val(v, 24);                    \
 277           else if (!(v & ~0x00003fc0))                 \
 278                r = imm12val(v, 26);                    \
 279           else if (!(v & ~0x00000ff0))                 \
 280                r = imm12val(v, 28);                    \
 281           else if (!(v & ~0x000003fc))                 \
 282                r = imm12val(v, 30);                    \
 283           else                                         \
 284                r = -1;                                 \
 285           r; })
 286
 287/*
  288 * Checks if an immediate value can be converted to an imm12 (12-bit) value.
 289 */
 290static int imm8m(u32 x)
 291{
 292        u32 rot;
 293
 294        for (rot = 0; rot < 16; rot++)
 295                if ((x & ~ror32(0xff, 2 * rot)) == 0)
 296                        return rol32(x, 2 * rot) | (rot << 8);
 297        return -1;
 298}
 299
 300#define imm8m(x) (__builtin_constant_p(x) ? const_imm8m(x) : imm8m(x))
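/*
 * Worked example: 0xff000000 is 0xff rotated right by 8 bits, so both
 * helpers encode it with rotation field 4 and imm8 = 0xff, i.e.
 * imm12 = 0x4ff.  A value such as 0x00ff00ff cannot be expressed as a
 * rotated 8-bit constant and yields -1.
 */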
 301
 302static u32 arm_bpf_ldst_imm12(u32 op, u8 rt, u8 rn, s16 imm12)
 303{
 304        op |= rt << 12 | rn << 16;
 305        if (imm12 >= 0)
 306                op |= ARM_INST_LDST__U;
 307        else
 308                imm12 = -imm12;
 309        return op | (imm12 & ARM_INST_LDST__IMM12);
 310}
 311
 312static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8)
 313{
 314        op |= rt << 12 | rn << 16;
 315        if (imm8 >= 0)
 316                op |= ARM_INST_LDST__U;
 317        else
 318                imm8 = -imm8;
 319        return op | (imm8 & 0xf0) << 4 | (imm8 & 0x0f);
 320}
 321
 322#define ARM_LDR_I(rt, rn, off)  arm_bpf_ldst_imm12(ARM_INST_LDR_I, rt, rn, off)
 323#define ARM_LDRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_LDRB_I, rt, rn, off)
 324#define ARM_LDRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off)
 325#define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off)
 326
 327#define ARM_STR_I(rt, rn, off)  arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off)
 328#define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off)
 329#define ARM_STRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off)
 330#define ARM_STRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRH_I, rt, rn, off)
 331
 332/*
 333 * Initializes the JIT space with undefined instructions.
 334 */
 335static void jit_fill_hole(void *area, unsigned int size)
 336{
 337        u32 *ptr;
 338        /* We are guaranteed to have aligned memory. */
 339        for (ptr = area; size >= sizeof(u32); size -= sizeof(u32))
 340                *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF);
 341}
 342
 343#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
 344/* EABI requires the stack to be aligned to 64-bit boundaries */
 345#define STACK_ALIGNMENT 8
 346#else
 347/* Stack must be aligned to 32-bit boundaries */
 348#define STACK_ALIGNMENT 4
 349#endif
 350
 351/* total stack size used in JITed code */
 352#define _STACK_SIZE     (ctx->prog->aux->stack_depth + SCRATCH_SIZE)
 353#define STACK_SIZE      ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
 354
 355#if __LINUX_ARM_ARCH__ < 7
 356
 357static u16 imm_offset(u32 k, struct jit_ctx *ctx)
 358{
 359        unsigned int i = 0, offset;
 360        u16 imm;
 361
 362        /* on the "fake" run we just count them (duplicates included) */
 363        if (ctx->target == NULL) {
 364                ctx->imm_count++;
 365                return 0;
 366        }
 367
 368        while ((i < ctx->imm_count) && ctx->imms[i]) {
 369                if (ctx->imms[i] == k)
 370                        break;
 371                i++;
 372        }
 373
 374        if (ctx->imms[i] == 0)
 375                ctx->imms[i] = k;
 376
 377        /* constants go just after the epilogue */
 378        offset =  ctx->offsets[ctx->prog->len - 1] * 4;
 379        offset += ctx->prologue_bytes;
 380        offset += ctx->epilogue_bytes;
 381        offset += i * 4;
 382
 383        ctx->target[offset / 4] = k;
 384
 385        /* PC in ARM mode == address of the instruction + 8 */
 386        imm = offset - (8 + ctx->idx * 4);
 387
 388        if (imm & ~0xfff) {
 389                /*
 390                 * literal pool is too far, signal it into flags. we
 391                 * can only detect it on the second pass unfortunately.
 392                 */
 393                ctx->flags |= FLAG_IMM_OVERFLOW;
 394                return 0;
 395        }
 396
 397        return imm;
 398}
 399
 400#endif /* __LINUX_ARM_ARCH__ */
 401
 402static inline int bpf2a32_offset(int bpf_to, int bpf_from,
 403                                 const struct jit_ctx *ctx) {
 404        int to, from;
 405
 406        if (ctx->target == NULL)
 407                return 0;
 408        to = ctx->offsets[bpf_to];
 409        from = ctx->offsets[bpf_from];
 410
 411        return to - from - 1;
 412}
 413
 414/*
 415 * Move an immediate that's not an imm8m to a core register.
 416 */
 417static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx)
 418{
 419#if __LINUX_ARM_ARCH__ < 7
 420        emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
 421#else
 422        emit(ARM_MOVW(rd, val & 0xffff), ctx);
 423        if (val > 0xffff)
 424                emit(ARM_MOVT(rd, val >> 16), ctx);
 425#endif
 426}
 427
 428static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx)
 429{
 430        int imm12 = imm8m(val);
 431
 432        if (imm12 >= 0)
 433                emit(ARM_MOV_I(rd, imm12), ctx);
 434        else
 435                emit_mov_i_no8m(rd, val, ctx);
 436}
 437
 438static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx)
 439{
 440        if (elf_hwcap & HWCAP_THUMB)
 441                emit(ARM_BX(tgt_reg), ctx);
 442        else
 443                emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
 444}
 445
 446static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
 447{
 448#if __LINUX_ARM_ARCH__ < 5
 449        emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
 450        emit_bx_r(tgt_reg, ctx);
 451#else
 452        emit(ARM_BLX_R(tgt_reg), ctx);
 453#endif
 454}
 455
 456static inline int epilogue_offset(const struct jit_ctx *ctx)
 457{
 458        int to, from;
 459        /* No need for 1st dummy run */
 460        if (ctx->target == NULL)
 461                return 0;
 462        to = ctx->epilogue_offset;
 463        from = ctx->idx;
 464
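        /*
         * The ARM B offset is encoded relative to PC + 8, i.e. two
         * instructions beyond the branch itself, hence the extra "- 2".
         */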
 465        return to - from - 2;
 466}
 467
 468static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op)
 469{
 470        const s8 *tmp = bpf2a32[TMP_REG_1];
 471
 472#if __LINUX_ARM_ARCH__ == 7
 473        if (elf_hwcap & HWCAP_IDIVA) {
 474                if (op == BPF_DIV)
 475                        emit(ARM_UDIV(rd, rm, rn), ctx);
 476                else {
 477                        emit(ARM_UDIV(ARM_IP, rm, rn), ctx);
 478                        emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx);
 479                }
 480                return;
 481        }
 482#endif
 483
  484        /*
  485         * For BPF_ALU | BPF_DIV | BPF_K instructions:
  486         * as ARM_R1 and ARM_R0 contain the 1st argument of the bpf
  487         * function, we need to save them on the caller side to keep
  488         * them from getting clobbered within the callee.
  489         * After the return from the callee, we restore ARM_R0 and
  490         * ARM_R1.
  491         */
 492        if (rn != ARM_R1) {
 493                emit(ARM_MOV_R(tmp[0], ARM_R1), ctx);
 494                emit(ARM_MOV_R(ARM_R1, rn), ctx);
 495        }
 496        if (rm != ARM_R0) {
 497                emit(ARM_MOV_R(tmp[1], ARM_R0), ctx);
 498                emit(ARM_MOV_R(ARM_R0, rm), ctx);
 499        }
 500
 501        /* Call appropriate function */
 502        emit_mov_i(ARM_IP, op == BPF_DIV ?
 503                   (u32)jit_udiv32 : (u32)jit_mod32, ctx);
 504        emit_blx_r(ARM_IP, ctx);
 505
 506        /* Save return value */
 507        if (rd != ARM_R0)
 508                emit(ARM_MOV_R(rd, ARM_R0), ctx);
 509
 510        /* Restore ARM_R0 and ARM_R1 */
 511        if (rn != ARM_R1)
 512                emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx);
 513        if (rm != ARM_R0)
 514                emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx);
 515}
 516
 517/* Is the translated BPF register on stack? */
 518static bool is_stacked(s8 reg)
 519{
 520        return reg < 0;
 521}
 522
  523/* If a BPF register is stored on the stack (negative mapping value), load it to the
 524 * supplied temporary register and return the temporary register
 525 * for subsequent operations, otherwise just use the CPU register.
 526 */
 527static s8 arm_bpf_get_reg32(s8 reg, s8 tmp, struct jit_ctx *ctx)
 528{
 529        if (is_stacked(reg)) {
 530                emit(ARM_LDR_I(tmp, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
 531                reg = tmp;
 532        }
 533        return reg;
 534}
 535
 536static const s8 *arm_bpf_get_reg64(const s8 *reg, const s8 *tmp,
 537                                   struct jit_ctx *ctx)
 538{
 539        if (is_stacked(reg[1])) {
 540                if (__LINUX_ARM_ARCH__ >= 6 ||
 541                    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
 542                        emit(ARM_LDRD_I(tmp[1], ARM_FP,
 543                                        EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
 544                } else {
 545                        emit(ARM_LDR_I(tmp[1], ARM_FP,
 546                                       EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
 547                        emit(ARM_LDR_I(tmp[0], ARM_FP,
 548                                       EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
 549                }
 550                reg = tmp;
 551        }
 552        return reg;
 553}
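/*
 * Note: LDRD/STRD transfer the register pair {Rt, Rt+1} and require an
 * even-numbered Rt.  This matches the temporary mappings {ARM_R7, ARM_R6}
 * and {ARM_R9, ARM_R8}, whose low words (tmp[1], tmp2[1]) are the even
 * registers R6 and R8.
 */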
 554
  555/* If a BPF register is stored on the stack (negative mapping value), save
  556 * the source register back to its stack slot.  Otherwise, if the source
  557 * register is not the same, move it into the correct register.
 558 */
 559static void arm_bpf_put_reg32(s8 reg, s8 src, struct jit_ctx *ctx)
 560{
 561        if (is_stacked(reg))
 562                emit(ARM_STR_I(src, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
 563        else if (reg != src)
 564                emit(ARM_MOV_R(reg, src), ctx);
 565}
 566
 567static void arm_bpf_put_reg64(const s8 *reg, const s8 *src,
 568                              struct jit_ctx *ctx)
 569{
 570        if (is_stacked(reg[1])) {
 571                if (__LINUX_ARM_ARCH__ >= 6 ||
 572                    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
 573                        emit(ARM_STRD_I(src[1], ARM_FP,
 574                                       EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
 575                } else {
 576                        emit(ARM_STR_I(src[1], ARM_FP,
 577                                       EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
 578                        emit(ARM_STR_I(src[0], ARM_FP,
 579                                       EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
 580                }
 581        } else {
 582                if (reg[1] != src[1])
 583                        emit(ARM_MOV_R(reg[1], src[1]), ctx);
 584                if (reg[0] != src[0])
 585                        emit(ARM_MOV_R(reg[0], src[0]), ctx);
 586        }
 587}
 588
 589static inline void emit_a32_mov_i(const s8 dst, const u32 val,
 590                                  struct jit_ctx *ctx)
 591{
 592        const s8 *tmp = bpf2a32[TMP_REG_1];
 593
 594        if (is_stacked(dst)) {
 595                emit_mov_i(tmp[1], val, ctx);
 596                arm_bpf_put_reg32(dst, tmp[1], ctx);
 597        } else {
 598                emit_mov_i(dst, val, ctx);
 599        }
 600}
 601
 602static void emit_a32_mov_i64(const s8 dst[], u64 val, struct jit_ctx *ctx)
 603{
 604        const s8 *tmp = bpf2a32[TMP_REG_1];
 605        const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
 606
 607        emit_mov_i(rd[1], (u32)val, ctx);
 608        emit_mov_i(rd[0], val >> 32, ctx);
 609
 610        arm_bpf_put_reg64(dst, rd, ctx);
 611}
 612
 613/* Sign extended move */
 614static inline void emit_a32_mov_se_i64(const bool is64, const s8 dst[],
 615                                       const u32 val, struct jit_ctx *ctx) {
 616        u64 val64 = val;
 617
 618        if (is64 && (val & (1<<31)))
 619                val64 |= 0xffffffff00000000ULL;
 620        emit_a32_mov_i64(dst, val64, ctx);
 621}
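/*
 * Example: with is64 set and val == 0xfffffffe (-2), val64 becomes
 * 0xfffffffffffffffe, so dst_hi is written with 0xffffffff and dst_lo
 * with 0xfffffffe.  For 32-bit operations the high word stays zero.
 */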
 622
 623static inline void emit_a32_add_r(const u8 dst, const u8 src,
 624                              const bool is64, const bool hi,
 625                              struct jit_ctx *ctx) {
 626        /* 64 bit :
 627         *      adds dst_lo, dst_lo, src_lo
 628         *      adc dst_hi, dst_hi, src_hi
 629         * 32 bit :
 630         *      add dst_lo, dst_lo, src_lo
 631         */
 632        if (!hi && is64)
 633                emit(ARM_ADDS_R(dst, dst, src), ctx);
 634        else if (hi && is64)
 635                emit(ARM_ADC_R(dst, dst, src), ctx);
 636        else
 637                emit(ARM_ADD_R(dst, dst, src), ctx);
 638}
 639
 640static inline void emit_a32_sub_r(const u8 dst, const u8 src,
 641                                  const bool is64, const bool hi,
 642                                  struct jit_ctx *ctx) {
 643        /* 64 bit :
 644         *      subs dst_lo, dst_lo, src_lo
 645         *      sbc dst_hi, dst_hi, src_hi
 646         * 32 bit :
 647         *      sub dst_lo, dst_lo, src_lo
 648         */
 649        if (!hi && is64)
 650                emit(ARM_SUBS_R(dst, dst, src), ctx);
 651        else if (hi && is64)
 652                emit(ARM_SBC_R(dst, dst, src), ctx);
 653        else
 654                emit(ARM_SUB_R(dst, dst, src), ctx);
 655}
 656
 657static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64,
 658                              const bool hi, const u8 op, struct jit_ctx *ctx){
 659        switch (BPF_OP(op)) {
 660        /* dst = dst + src */
 661        case BPF_ADD:
 662                emit_a32_add_r(dst, src, is64, hi, ctx);
 663                break;
 664        /* dst = dst - src */
 665        case BPF_SUB:
 666                emit_a32_sub_r(dst, src, is64, hi, ctx);
 667                break;
 668        /* dst = dst | src */
 669        case BPF_OR:
 670                emit(ARM_ORR_R(dst, dst, src), ctx);
 671                break;
 672        /* dst = dst & src */
 673        case BPF_AND:
 674                emit(ARM_AND_R(dst, dst, src), ctx);
 675                break;
 676        /* dst = dst ^ src */
 677        case BPF_XOR:
 678                emit(ARM_EOR_R(dst, dst, src), ctx);
 679                break;
 680        /* dst = dst * src */
 681        case BPF_MUL:
 682                emit(ARM_MUL(dst, dst, src), ctx);
 683                break;
 684        /* dst = dst << src */
 685        case BPF_LSH:
 686                emit(ARM_LSL_R(dst, dst, src), ctx);
 687                break;
 688        /* dst = dst >> src */
 689        case BPF_RSH:
 690                emit(ARM_LSR_R(dst, dst, src), ctx);
 691                break;
 692        /* dst = dst >> src (signed)*/
 693        case BPF_ARSH:
 694                emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx);
 695                break;
 696        }
 697}
 698
 699/* ALU operation (32 bit)
 700 * dst = dst (op) src
 701 */
 702static inline void emit_a32_alu_r(const s8 dst, const s8 src,
 703                                  struct jit_ctx *ctx, const bool is64,
 704                                  const bool hi, const u8 op) {
 705        const s8 *tmp = bpf2a32[TMP_REG_1];
 706        s8 rn, rd;
 707
 708        rn = arm_bpf_get_reg32(src, tmp[1], ctx);
 709        rd = arm_bpf_get_reg32(dst, tmp[0], ctx);
 710        /* ALU operation */
 711        emit_alu_r(rd, rn, is64, hi, op, ctx);
 712        arm_bpf_put_reg32(dst, rd, ctx);
 713}
 714
 715/* ALU operation (64 bit) */
 716static inline void emit_a32_alu_r64(const bool is64, const s8 dst[],
 717                                  const s8 src[], struct jit_ctx *ctx,
 718                                  const u8 op) {
 719        const s8 *tmp = bpf2a32[TMP_REG_1];
 720        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 721        const s8 *rd;
 722
 723        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 724        if (is64) {
 725                const s8 *rs;
 726
 727                rs = arm_bpf_get_reg64(src, tmp2, ctx);
 728
 729                /* ALU operation */
 730                emit_alu_r(rd[1], rs[1], true, false, op, ctx);
 731                emit_alu_r(rd[0], rs[0], true, true, op, ctx);
 732        } else {
 733                s8 rs;
 734
 735                rs = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 736
 737                /* ALU operation */
 738                emit_alu_r(rd[1], rs, true, false, op, ctx);
 739                emit_a32_mov_i(rd[0], 0, ctx);
 740        }
 741
 742        arm_bpf_put_reg64(dst, rd, ctx);
 743}
 744
 745/* dst = src (4 bytes)*/
 746static inline void emit_a32_mov_r(const s8 dst, const s8 src,
 747                                  struct jit_ctx *ctx) {
 748        const s8 *tmp = bpf2a32[TMP_REG_1];
 749        s8 rt;
 750
 751        rt = arm_bpf_get_reg32(src, tmp[0], ctx);
 752        arm_bpf_put_reg32(dst, rt, ctx);
 753}
 754
 755/* dst = src */
 756static inline void emit_a32_mov_r64(const bool is64, const s8 dst[],
 757                                  const s8 src[],
 758                                  struct jit_ctx *ctx) {
 759        if (!is64) {
 760                emit_a32_mov_r(dst_lo, src_lo, ctx);
 761                /* Zero out high 4 bytes */
 762                emit_a32_mov_i(dst_hi, 0, ctx);
 763        } else if (__LINUX_ARM_ARCH__ < 6 &&
 764                   ctx->cpu_architecture < CPU_ARCH_ARMv5TE) {
 765                /* complete 8 byte move */
 766                emit_a32_mov_r(dst_lo, src_lo, ctx);
 767                emit_a32_mov_r(dst_hi, src_hi, ctx);
 768        } else if (is_stacked(src_lo) && is_stacked(dst_lo)) {
 769                const u8 *tmp = bpf2a32[TMP_REG_1];
 770
 771                emit(ARM_LDRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
 772                emit(ARM_STRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
 773        } else if (is_stacked(src_lo)) {
 774                emit(ARM_LDRD_I(dst[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
 775        } else if (is_stacked(dst_lo)) {
 776                emit(ARM_STRD_I(src[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
 777        } else {
 778                emit(ARM_MOV_R(dst[0], src[0]), ctx);
 779                emit(ARM_MOV_R(dst[1], src[1]), ctx);
 780        }
 781}
 782
  783/* Shift/negate operations with an immediate */
 784static inline void emit_a32_alu_i(const s8 dst, const u32 val,
 785                                struct jit_ctx *ctx, const u8 op) {
 786        const s8 *tmp = bpf2a32[TMP_REG_1];
 787        s8 rd;
 788
 789        rd = arm_bpf_get_reg32(dst, tmp[0], ctx);
 790
 791        /* Do shift operation */
 792        switch (op) {
 793        case BPF_LSH:
 794                emit(ARM_LSL_I(rd, rd, val), ctx);
 795                break;
 796        case BPF_RSH:
 797                emit(ARM_LSR_I(rd, rd, val), ctx);
 798                break;
 799        case BPF_NEG:
 800                emit(ARM_RSB_I(rd, rd, val), ctx);
 801                break;
 802        }
 803
 804        arm_bpf_put_reg32(dst, rd, ctx);
 805}
 806
  807/* dst = -dst (64 bit) */
 808static inline void emit_a32_neg64(const s8 dst[],
 809                                struct jit_ctx *ctx){
 810        const s8 *tmp = bpf2a32[TMP_REG_1];
 811        const s8 *rd;
 812
 813        /* Setup Operand */
 814        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 815
 816        /* Do Negate Operation */
 817        emit(ARM_RSBS_I(rd[1], rd[1], 0), ctx);
 818        emit(ARM_RSC_I(rd[0], rd[0], 0), ctx);
 819
 820        arm_bpf_put_reg64(dst, rd, ctx);
 821}
 822
 823/* dst = dst << src */
 824static inline void emit_a32_lsh_r64(const s8 dst[], const s8 src[],
 825                                    struct jit_ctx *ctx) {
 826        const s8 *tmp = bpf2a32[TMP_REG_1];
 827        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 828        const s8 *rd;
 829        s8 rt;
 830
 831        /* Setup Operands */
 832        rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 833        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 834
 835        /* Do LSH operation */
 836        emit(ARM_SUB_I(ARM_IP, rt, 32), ctx);
 837        emit(ARM_RSB_I(tmp2[0], rt, 32), ctx);
 838        emit(ARM_MOV_SR(ARM_LR, rd[0], SRTYPE_ASL, rt), ctx);
 839        emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[1], SRTYPE_ASL, ARM_IP), ctx);
 840        emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd[1], SRTYPE_LSR, tmp2[0]), ctx);
 841        emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_ASL, rt), ctx);
 842
 843        arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
 844        arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
 845}
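/*
 * The sequence above exploits ARM register-specified shifts, which take
 * the amount from the bottom byte of the shift register and yield 0 for
 * LSL/LSR amounts of 32 or more:
 *   dst_hi = (hi << n) | (lo << (n - 32)) | (lo >> (32 - n))
 *   dst_lo = lo << n
 * This is correct for any n in 0..63 without a branch; emit_a32_rsh_r64()
 * below uses the mirrored construction.
 */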
 846
 847/* dst = dst >> src (signed)*/
 848static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
 849                                     struct jit_ctx *ctx) {
 850        const s8 *tmp = bpf2a32[TMP_REG_1];
 851        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 852        const s8 *rd;
 853        s8 rt;
 854
 855        /* Setup Operands */
 856        rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 857        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 858
 859        /* Do the ARSH operation */
 860        emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
 861        emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 862        emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
 863        emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
 864        _emit(ARM_COND_MI, ARM_B(0), ctx);
 865        emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
 866        emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);
 867
 868        arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
 869        arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
 870}
 871
 872/* dst = dst >> src */
 873static inline void emit_a32_rsh_r64(const s8 dst[], const s8 src[],
 874                                    struct jit_ctx *ctx) {
 875        const s8 *tmp = bpf2a32[TMP_REG_1];
 876        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 877        const s8 *rd;
 878        s8 rt;
 879
 880        /* Setup Operands */
 881        rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 882        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 883
 884        /* Do RSH operation */
 885        emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
 886        emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
 887        emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
 888        emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
 889        emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_LSR, tmp2[0]), ctx);
 890        emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_LSR, rt), ctx);
 891
 892        arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
 893        arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
 894}
 895
 896/* dst = dst << val */
 897static inline void emit_a32_lsh_i64(const s8 dst[],
 898                                    const u32 val, struct jit_ctx *ctx){
 899        const s8 *tmp = bpf2a32[TMP_REG_1];
 900        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 901        const s8 *rd;
 902
 903        /* Setup operands */
 904        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 905
 906        /* Do LSH operation */
 907        if (val < 32) {
 908                emit(ARM_MOV_SI(tmp2[0], rd[0], SRTYPE_ASL, val), ctx);
 909                emit(ARM_ORR_SI(rd[0], tmp2[0], rd[1], SRTYPE_LSR, 32 - val), ctx);
 910                emit(ARM_MOV_SI(rd[1], rd[1], SRTYPE_ASL, val), ctx);
 911        } else {
 912                if (val == 32)
 913                        emit(ARM_MOV_R(rd[0], rd[1]), ctx);
 914                else
 915                        emit(ARM_MOV_SI(rd[0], rd[1], SRTYPE_ASL, val - 32), ctx);
 916                emit(ARM_EOR_R(rd[1], rd[1], rd[1]), ctx);
 917        }
 918
 919        arm_bpf_put_reg64(dst, rd, ctx);
 920}
 921
 922/* dst = dst >> val */
 923static inline void emit_a32_rsh_i64(const s8 dst[],
 924                                    const u32 val, struct jit_ctx *ctx) {
 925        const s8 *tmp = bpf2a32[TMP_REG_1];
 926        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 927        const s8 *rd;
 928
 929        /* Setup operands */
 930        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 931
 932        /* Do LSR operation */
 933        if (val < 32) {
 934                emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
 935                emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
 936                emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_LSR, val), ctx);
 937        } else if (val == 32) {
 938                emit(ARM_MOV_R(rd[1], rd[0]), ctx);
 939                emit(ARM_MOV_I(rd[0], 0), ctx);
 940        } else {
 941                emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_LSR, val - 32), ctx);
 942                emit(ARM_MOV_I(rd[0], 0), ctx);
 943        }
 944
 945        arm_bpf_put_reg64(dst, rd, ctx);
 946}
 947
 948/* dst = dst >> val (signed) */
 949static inline void emit_a32_arsh_i64(const s8 dst[],
 950                                     const u32 val, struct jit_ctx *ctx){
 951        const s8 *tmp = bpf2a32[TMP_REG_1];
 952        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 953        const s8 *rd;
 954
 955        /* Setup operands */
 956        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 957
 958        /* Do ARSH operation */
 959        if (val < 32) {
 960                emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
 961                emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
 962                emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, val), ctx);
 963        } else if (val == 32) {
 964                emit(ARM_MOV_R(rd[1], rd[0]), ctx);
 965                emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
 966        } else {
 967                emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_ASR, val - 32), ctx);
 968                emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
 969        }
 970
 971        arm_bpf_put_reg64(dst, rd, ctx);
 972}
 973
 974static inline void emit_a32_mul_r64(const s8 dst[], const s8 src[],
 975                                    struct jit_ctx *ctx) {
 976        const s8 *tmp = bpf2a32[TMP_REG_1];
 977        const s8 *tmp2 = bpf2a32[TMP_REG_2];
 978        const s8 *rd, *rt;
 979
 980        /* Setup operands for multiplication */
 981        rd = arm_bpf_get_reg64(dst, tmp, ctx);
 982        rt = arm_bpf_get_reg64(src, tmp2, ctx);
 983
 984        /* Do Multiplication */
 985        emit(ARM_MUL(ARM_IP, rd[1], rt[0]), ctx);
 986        emit(ARM_MUL(ARM_LR, rd[0], rt[1]), ctx);
 987        emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx);
 988
 989        emit(ARM_UMULL(ARM_IP, rd[0], rd[1], rt[1]), ctx);
 990        emit(ARM_ADD_R(rd[0], ARM_LR, rd[0]), ctx);
 991
 992        arm_bpf_put_reg32(dst_lo, ARM_IP, ctx);
 993        arm_bpf_put_reg32(dst_hi, rd[0], ctx);
 994}
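/*
 * Schoolbook multiply keeping only the low 64 bits of the product:
 *   result_lo = low32(dst_lo * src_lo)
 *   result_hi = high32(dst_lo * src_lo)
 *             + low32(dst_lo * src_hi) + low32(dst_hi * src_lo)
 * The dst_hi * src_hi term would only affect bits 64..127 and is dropped.
 */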
 995
 996/* *(size *)(dst + off) = src */
 997static inline void emit_str_r(const s8 dst, const s8 src[],
 998                              s32 off, struct jit_ctx *ctx, const u8 sz){
 999        const s8 *tmp = bpf2a32[TMP_REG_1];
1000        s32 off_max;
1001        s8 rd;
1002
1003        rd = arm_bpf_get_reg32(dst, tmp[1], ctx);
1004
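        /*
         * STRH only has an 8-bit immediate offset form, while STR/STRB take
         * a 12-bit immediate; BPF_DW is emitted as two word stores below,
         * so only BPF_H needs the smaller limit.
         */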
1005        if (sz == BPF_H)
1006                off_max = 0xff;
1007        else
1008                off_max = 0xfff;
1009
1010        if (off < 0 || off > off_max) {
1011                emit_a32_mov_i(tmp[0], off, ctx);
1012                emit(ARM_ADD_R(tmp[0], tmp[0], rd), ctx);
1013                rd = tmp[0];
1014                off = 0;
1015        }
1016        switch (sz) {
1017        case BPF_B:
1018                /* Store a Byte */
1019                emit(ARM_STRB_I(src_lo, rd, off), ctx);
1020                break;
1021        case BPF_H:
1022                /* Store a HalfWord */
1023                emit(ARM_STRH_I(src_lo, rd, off), ctx);
1024                break;
1025        case BPF_W:
1026                /* Store a Word */
1027                emit(ARM_STR_I(src_lo, rd, off), ctx);
1028                break;
1029        case BPF_DW:
1030                /* Store a Double Word */
1031                emit(ARM_STR_I(src_lo, rd, off), ctx);
1032                emit(ARM_STR_I(src_hi, rd, off + 4), ctx);
1033                break;
1034        }
1035}
1036
1037/* dst = *(size*)(src + off) */
1038static inline void emit_ldx_r(const s8 dst[], const s8 src,
1039                              s32 off, struct jit_ctx *ctx, const u8 sz){
1040        const s8 *tmp = bpf2a32[TMP_REG_1];
1041        const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
1042        s8 rm = src;
1043        s32 off_max;
1044
1045        if (sz == BPF_H)
1046                off_max = 0xff;
1047        else
1048                off_max = 0xfff;
1049
1050        if (off < 0 || off > off_max) {
1051                emit_a32_mov_i(tmp[0], off, ctx);
1052                emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
1053                rm = tmp[0];
1054                off = 0;
1055        } else if (rd[1] == rm) {
1056                emit(ARM_MOV_R(tmp[0], rm), ctx);
1057                rm = tmp[0];
1058        }
1059        switch (sz) {
1060        case BPF_B:
1061                /* Load a Byte */
1062                emit(ARM_LDRB_I(rd[1], rm, off), ctx);
1063                emit_a32_mov_i(rd[0], 0, ctx);
1064                break;
1065        case BPF_H:
1066                /* Load a HalfWord */
1067                emit(ARM_LDRH_I(rd[1], rm, off), ctx);
1068                emit_a32_mov_i(rd[0], 0, ctx);
1069                break;
1070        case BPF_W:
1071                /* Load a Word */
1072                emit(ARM_LDR_I(rd[1], rm, off), ctx);
1073                emit_a32_mov_i(rd[0], 0, ctx);
1074                break;
1075        case BPF_DW:
1076                /* Load a Double Word */
1077                emit(ARM_LDR_I(rd[1], rm, off), ctx);
1078                emit(ARM_LDR_I(rd[0], rm, off + 4), ctx);
1079                break;
1080        }
1081        arm_bpf_put_reg64(dst, rd, ctx);
1082}
1083
 1084/* Arithmetic Operation */
1085static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm,
1086                             const u8 rn, struct jit_ctx *ctx, u8 op,
1087                             bool is_jmp64) {
1088        switch (op) {
1089        case BPF_JSET:
1090                if (is_jmp64) {
1091                        emit(ARM_AND_R(ARM_IP, rt, rn), ctx);
1092                        emit(ARM_AND_R(ARM_LR, rd, rm), ctx);
1093                        emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx);
1094                } else {
1095                        emit(ARM_ANDS_R(ARM_IP, rt, rn), ctx);
1096                }
1097                break;
1098        case BPF_JEQ:
1099        case BPF_JNE:
1100        case BPF_JGT:
1101        case BPF_JGE:
1102        case BPF_JLE:
1103        case BPF_JLT:
1104                if (is_jmp64) {
1105                        emit(ARM_CMP_R(rd, rm), ctx);
 1106                        /* Only compare the low halves if the high halves are equal. */
1107                        _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx);
1108                } else {
1109                        emit(ARM_CMP_R(rt, rn), ctx);
1110                }
1111                break;
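        /*
         * For the signed compares below, CMP on the low words followed by
         * SBCS on the high words computes dst - src (or src - dst) with
         * borrow; the resulting N and V flags reflect the signed relation
         * of the full 64-bit values.
         */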
1112        case BPF_JSLE:
1113        case BPF_JSGT:
1114                emit(ARM_CMP_R(rn, rt), ctx);
1115                if (is_jmp64)
1116                        emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx);
1117                break;
1118        case BPF_JSLT:
1119        case BPF_JSGE:
1120                emit(ARM_CMP_R(rt, rn), ctx);
1121                if (is_jmp64)
1122                        emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx);
1123                break;
1124        }
1125}
1126
1127static int out_offset = -1; /* initialized on the first pass of build_body() */
1128static int emit_bpf_tail_call(struct jit_ctx *ctx)
1129{
1130
1131        /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */
1132        const s8 *r2 = bpf2a32[BPF_REG_2];
1133        const s8 *r3 = bpf2a32[BPF_REG_3];
1134        const s8 *tmp = bpf2a32[TMP_REG_1];
1135        const s8 *tmp2 = bpf2a32[TMP_REG_2];
1136        const s8 *tcc = bpf2a32[TCALL_CNT];
1137        const s8 *tc;
1138        const int idx0 = ctx->idx;
1139#define cur_offset (ctx->idx - idx0)
1140#define jmp_offset (out_offset - (cur_offset) - 2)
1141        u32 lo, hi;
1142        s8 r_array, r_index;
1143        int off;
1144
1145        /* if (index >= array->map.max_entries)
1146         *      goto out;
1147         */
1148        BUILD_BUG_ON(offsetof(struct bpf_array, map.max_entries) >
1149                     ARM_INST_LDST__IMM12);
1150        off = offsetof(struct bpf_array, map.max_entries);
1151        r_array = arm_bpf_get_reg32(r2[1], tmp2[0], ctx);
1152        /* index is 32-bit for arrays */
1153        r_index = arm_bpf_get_reg32(r3[1], tmp2[1], ctx);
1154        /* array->map.max_entries */
1155        emit(ARM_LDR_I(tmp[1], r_array, off), ctx);
1156        /* index >= array->map.max_entries */
1157        emit(ARM_CMP_R(r_index, tmp[1]), ctx);
1158        _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
1159
1160        /* tmp2[0] = array, tmp2[1] = index */
1161
1162        /* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
1163         *      goto out;
1164         * tail_call_cnt++;
1165         */
1166        lo = (u32)MAX_TAIL_CALL_CNT;
1167        hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
1168        tc = arm_bpf_get_reg64(tcc, tmp, ctx);
1169        emit(ARM_CMP_I(tc[0], hi), ctx);
1170        _emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx);
1171        _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
1172        emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx);
1173        emit(ARM_ADC_I(tc[0], tc[0], 0), ctx);
1174        arm_bpf_put_reg64(tcc, tmp, ctx);
1175
1176        /* prog = array->ptrs[index]
1177         * if (prog == NULL)
1178         *      goto out;
1179         */
1180        BUILD_BUG_ON(imm8m(offsetof(struct bpf_array, ptrs)) < 0);
1181        off = imm8m(offsetof(struct bpf_array, ptrs));
1182        emit(ARM_ADD_I(tmp[1], r_array, off), ctx);
1183        emit(ARM_LDR_R_SI(tmp[1], tmp[1], r_index, SRTYPE_ASL, 2), ctx);
1184        emit(ARM_CMP_I(tmp[1], 0), ctx);
1185        _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
1186
1187        /* goto *(prog->bpf_func + prologue_size); */
1188        BUILD_BUG_ON(offsetof(struct bpf_prog, bpf_func) >
1189                     ARM_INST_LDST__IMM12);
1190        off = offsetof(struct bpf_prog, bpf_func);
1191        emit(ARM_LDR_I(tmp[1], tmp[1], off), ctx);
1192        emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx);
1193        emit_bx_r(tmp[1], ctx);
1194
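        /*
         * The conditional branches above jump forward to "out:" using
         * out_offset, which is only known after the first pass; the check
         * below therefore verifies that this sequence emits the same
         * number of instructions on every pass.
         */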
1195        /* out: */
1196        if (out_offset == -1)
1197                out_offset = cur_offset;
1198        if (cur_offset != out_offset) {
1199                pr_err_once("tail_call out_offset = %d, expected %d!\n",
1200                            cur_offset, out_offset);
1201                return -1;
1202        }
1203        return 0;
1204#undef cur_offset
1205#undef jmp_offset
1206}
1207
1208/* 0xabcd => 0xcdab */
1209static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx)
1210{
1211#if __LINUX_ARM_ARCH__ < 6
1212        const s8 *tmp2 = bpf2a32[TMP_REG_2];
1213
1214        emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
1215        emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx);
1216        emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
1217        emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx);
1218#else /* ARMv6+ */
1219        emit(ARM_REV16(rd, rn), ctx);
1220#endif
1221}
1222
1223/* 0xabcdefgh => 0xghefcdab */
1224static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx)
1225{
1226#if __LINUX_ARM_ARCH__ < 6
1227        const s8 *tmp2 = bpf2a32[TMP_REG_2];
1228
1229        emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
1230        emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx);
1231        emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx);
1232
1233        emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx);
1234        emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx);
1235        emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx);
1236        emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
1237        emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx);
1238        emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx);
1239        emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx);
1240
1241#else /* ARMv6+ */
1242        emit(ARM_REV(rd, rn), ctx);
1243#endif
1244}
1245
 1246// push a 64-bit BPF register pair onto the stack
1247static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx)
1248{
1249        const s8 *tmp2 = bpf2a32[TMP_REG_2];
1250        const s8 *rt;
1251        u16 reg_set = 0;
1252
1253        rt = arm_bpf_get_reg64(src, tmp2, ctx);
1254
1255        reg_set = (1 << rt[1]) | (1 << rt[0]);
1256        emit(ARM_PUSH(reg_set), ctx);
1257}
1258
1259static void build_prologue(struct jit_ctx *ctx)
1260{
1261        const s8 r0 = bpf2a32[BPF_REG_0][1];
1262        const s8 r2 = bpf2a32[BPF_REG_1][1];
1263        const s8 r3 = bpf2a32[BPF_REG_1][0];
1264        const s8 r4 = bpf2a32[BPF_REG_6][1];
1265        const s8 fplo = bpf2a32[BPF_REG_FP][1];
1266        const s8 fphi = bpf2a32[BPF_REG_FP][0];
1267        const s8 *tcc = bpf2a32[TCALL_CNT];
1268
1269        /* Save callee saved registers. */
1270#ifdef CONFIG_FRAME_POINTER
1271        u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC;
1272        emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
1273        emit(ARM_PUSH(reg_set), ctx);
1274        emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
1275#else
1276        emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
1277        emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
1278#endif
 1279        /* Calculate the BPF frame pointer and keep it in ARM_IP for now */
1280        emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx);
1281
1282        ctx->stack_size = imm8m(STACK_SIZE);
1283
1284        /* Set up function call stack */
1285        emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);
1286
1287        /* Set up BPF prog stack base register */
1288        emit_a32_mov_r(fplo, ARM_IP, ctx);
1289        emit_a32_mov_i(fphi, 0, ctx);
1290
1291        /* mov r4, 0 */
1292        emit(ARM_MOV_I(r4, 0), ctx);
1293
1294        /* Move BPF_CTX to BPF_R1 */
1295        emit(ARM_MOV_R(r3, r4), ctx);
1296        emit(ARM_MOV_R(r2, r0), ctx);
1297        /* Initialize Tail Count */
1298        emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[0])), ctx);
1299        emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[1])), ctx);
1300        /* end of prologue */
1301}
1302
1303/* restore callee saved registers. */
1304static void build_epilogue(struct jit_ctx *ctx)
1305{
1306#ifdef CONFIG_FRAME_POINTER
1307        /* When using frame pointers, some additional registers need to
1308         * be loaded. */
1309        u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP;
1310        emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx);
1311        emit(ARM_LDM(ARM_SP, reg_set), ctx);
1312#else
1313        /* Restore callee saved registers. */
1314        emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx);
1315        emit(ARM_POP(CALLEE_POP_MASK), ctx);
1316#endif
1317}
1318
1319/*
 1320 * Convert an eBPF instruction to a native instruction, i.e.
 1321 * JIT a single eBPF instruction.
1322 * Returns :
1323 *      0  - Successfully JITed an 8-byte eBPF instruction
1324 *      >0 - Successfully JITed a 16-byte eBPF instruction
1325 *      <0 - Failed to JIT.
1326 */
1327static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
1328{
1329        const u8 code = insn->code;
1330        const s8 *dst = bpf2a32[insn->dst_reg];
1331        const s8 *src = bpf2a32[insn->src_reg];
1332        const s8 *tmp = bpf2a32[TMP_REG_1];
1333        const s8 *tmp2 = bpf2a32[TMP_REG_2];
1334        const s16 off = insn->off;
1335        const s32 imm = insn->imm;
1336        const int i = insn - ctx->prog->insnsi;
1337        const bool is64 = BPF_CLASS(code) == BPF_ALU64;
1338        const s8 *rd, *rs;
1339        s8 rd_lo, rt, rm, rn;
1340        s32 jmp_offset;
1341
1342#define check_imm(bits, imm) do {                               \
1343        if ((imm) >= (1 << ((bits) - 1)) ||                     \
1344            (imm) < -(1 << ((bits) - 1))) {                     \
1345                pr_info("[%2d] imm=%d(0x%x) out of range\n",    \
1346                        i, imm, imm);                           \
1347                return -EINVAL;                                 \
1348        }                                                       \
1349} while (0)
1350#define check_imm24(imm) check_imm(24, imm)
1351
1352        switch (code) {
1353        /* ALU operations */
1354
1355        /* dst = src */
1356        case BPF_ALU | BPF_MOV | BPF_K:
1357        case BPF_ALU | BPF_MOV | BPF_X:
1358        case BPF_ALU64 | BPF_MOV | BPF_K:
1359        case BPF_ALU64 | BPF_MOV | BPF_X:
1360                switch (BPF_SRC(code)) {
1361                case BPF_X:
1362                        emit_a32_mov_r64(is64, dst, src, ctx);
1363                        break;
1364                case BPF_K:
1365                        /* Sign-extend immediate value to destination reg */
1366                        emit_a32_mov_se_i64(is64, dst, imm, ctx);
1367                        break;
1368                }
1369                break;
1370        /* dst = dst + src/imm */
1371        /* dst = dst - src/imm */
1372        /* dst = dst | src/imm */
1373        /* dst = dst & src/imm */
1374        /* dst = dst ^ src/imm */
1375        /* dst = dst * src/imm */
1376        /* dst = dst << src */
1377        /* dst = dst >> src */
1378        case BPF_ALU | BPF_ADD | BPF_K:
1379        case BPF_ALU | BPF_ADD | BPF_X:
1380        case BPF_ALU | BPF_SUB | BPF_K:
1381        case BPF_ALU | BPF_SUB | BPF_X:
1382        case BPF_ALU | BPF_OR | BPF_K:
1383        case BPF_ALU | BPF_OR | BPF_X:
1384        case BPF_ALU | BPF_AND | BPF_K:
1385        case BPF_ALU | BPF_AND | BPF_X:
1386        case BPF_ALU | BPF_XOR | BPF_K:
1387        case BPF_ALU | BPF_XOR | BPF_X:
1388        case BPF_ALU | BPF_MUL | BPF_K:
1389        case BPF_ALU | BPF_MUL | BPF_X:
1390        case BPF_ALU | BPF_LSH | BPF_X:
1391        case BPF_ALU | BPF_RSH | BPF_X:
1392        case BPF_ALU | BPF_ARSH | BPF_K:
1393        case BPF_ALU | BPF_ARSH | BPF_X:
1394        case BPF_ALU64 | BPF_ADD | BPF_K:
1395        case BPF_ALU64 | BPF_ADD | BPF_X:
1396        case BPF_ALU64 | BPF_SUB | BPF_K:
1397        case BPF_ALU64 | BPF_SUB | BPF_X:
1398        case BPF_ALU64 | BPF_OR | BPF_K:
1399        case BPF_ALU64 | BPF_OR | BPF_X:
1400        case BPF_ALU64 | BPF_AND | BPF_K:
1401        case BPF_ALU64 | BPF_AND | BPF_X:
1402        case BPF_ALU64 | BPF_XOR | BPF_K:
1403        case BPF_ALU64 | BPF_XOR | BPF_X:
1404                switch (BPF_SRC(code)) {
1405                case BPF_X:
1406                        emit_a32_alu_r64(is64, dst, src, ctx, BPF_OP(code));
1407                        break;
1408                case BPF_K:
 1409                        /* Move the immediate value into the temporary
 1410                         * register first and then do the ALU operation on
 1411                         * the temporary register: this sign-extends the
 1412                         * immediate into a 64-bit value, after which it is
 1413                         * safe to do the operation on it.
 1414                         */
1415                        emit_a32_mov_se_i64(is64, tmp2, imm, ctx);
1416                        emit_a32_alu_r64(is64, dst, tmp2, ctx, BPF_OP(code));
1417                        break;
1418                }
1419                break;
1420        /* dst = dst / src(imm) */
1421        /* dst = dst % src(imm) */
1422        case BPF_ALU | BPF_DIV | BPF_K:
1423        case BPF_ALU | BPF_DIV | BPF_X:
1424        case BPF_ALU | BPF_MOD | BPF_K:
1425        case BPF_ALU | BPF_MOD | BPF_X:
1426                rd_lo = arm_bpf_get_reg32(dst_lo, tmp2[1], ctx);
1427                switch (BPF_SRC(code)) {
1428                case BPF_X:
1429                        rt = arm_bpf_get_reg32(src_lo, tmp2[0], ctx);
1430                        break;
1431                case BPF_K:
1432                        rt = tmp2[0];
1433                        emit_a32_mov_i(rt, imm, ctx);
1434                        break;
1435                default:
1436                        rt = src_lo;
1437                        break;
1438                }
1439                emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code));
1440                arm_bpf_put_reg32(dst_lo, rd_lo, ctx);
1441                emit_a32_mov_i(dst_hi, 0, ctx);
1442                break;
1443        case BPF_ALU64 | BPF_DIV | BPF_K:
1444        case BPF_ALU64 | BPF_DIV | BPF_X:
1445        case BPF_ALU64 | BPF_MOD | BPF_K:
1446        case BPF_ALU64 | BPF_MOD | BPF_X:
1447                goto notyet;
1448        /* dst = dst >> imm */
1449        /* dst = dst << imm */
1450        case BPF_ALU | BPF_RSH | BPF_K:
1451        case BPF_ALU | BPF_LSH | BPF_K:
1452                if (unlikely(imm > 31))
1453                        return -EINVAL;
1454                if (imm)
1455                        emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code));
1456                emit_a32_mov_i(dst_hi, 0, ctx);
1457                break;
1458        /* dst = dst << imm */
1459        case BPF_ALU64 | BPF_LSH | BPF_K:
1460                if (unlikely(imm > 63))
1461                        return -EINVAL;
1462                emit_a32_lsh_i64(dst, imm, ctx);
1463                break;
1464        /* dst = dst >> imm */
1465        case BPF_ALU64 | BPF_RSH | BPF_K:
1466                if (unlikely(imm > 63))
1467                        return -EINVAL;
1468                emit_a32_rsh_i64(dst, imm, ctx);
1469                break;
1470        /* dst = dst << src */
1471        case BPF_ALU64 | BPF_LSH | BPF_X:
1472                emit_a32_lsh_r64(dst, src, ctx);
1473                break;
1474        /* dst = dst >> src */
1475        case BPF_ALU64 | BPF_RSH | BPF_X:
1476                emit_a32_rsh_r64(dst, src, ctx);
1477                break;
1478        /* dst = dst >> src (signed) */
1479        case BPF_ALU64 | BPF_ARSH | BPF_X:
1480                emit_a32_arsh_r64(dst, src, ctx);
1481                break;
1482        /* dst = dst >> imm (signed) */
1483        case BPF_ALU64 | BPF_ARSH | BPF_K:
1484                if (unlikely(imm > 63))
1485                        return -EINVAL;
1486                emit_a32_arsh_i64(dst, imm, ctx);
1487                break;
1488        /* dst = -dst */
1489        case BPF_ALU | BPF_NEG:
1490                emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code));
1491                emit_a32_mov_i(dst_hi, 0, ctx);
1492                break;
1493        /* dst = -dst (64 bit) */
1494        case BPF_ALU64 | BPF_NEG:
1495                emit_a32_neg64(dst, ctx);
1496                break;
1497        /* dst = dst * src/imm */
1498        case BPF_ALU64 | BPF_MUL | BPF_X:
1499        case BPF_ALU64 | BPF_MUL | BPF_K:
1500                switch (BPF_SRC(code)) {
1501                case BPF_X:
1502                        emit_a32_mul_r64(dst, src, ctx);
1503                        break;
1504                case BPF_K:
1505                        /* Sign-extend the immediate value into the
1506                         * temporary register pair and then do the
1507                         * multiplication on that pair, so that the
1508                         * operation is always performed on a correctly
1509                         * sign-extended 64-bit value.
1510                         */
1511                        emit_a32_mov_se_i64(is64, tmp2, imm, ctx);
1512                        emit_a32_mul_r64(dst, tmp2, ctx);
1513                        break;
1514                }
1515                break;
1516        /* dst = htole(dst) */
1517        /* dst = htobe(dst) */
1518        case BPF_ALU | BPF_END | BPF_FROM_LE:
1519        case BPF_ALU | BPF_END | BPF_FROM_BE:
1520                rd = arm_bpf_get_reg64(dst, tmp, ctx);
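                    /* For BPF_FROM_BE the value is byte-swapped to the
                     * requested width; for BPF_FROM_LE no swap is
                     * emitted and only the zero-extension/truncation at
                     * emit_bswap_uxt is needed.
                     */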
1521                if (BPF_SRC(code) == BPF_FROM_LE)
1522                        goto emit_bswap_uxt;
1523                switch (imm) {
1524                case 16:
1525                        emit_rev16(rd[1], rd[1], ctx);
1526                        goto emit_bswap_uxt;
1527                case 32:
1528                        emit_rev32(rd[1], rd[1], ctx);
1529                        goto emit_bswap_uxt;
1530                case 64:
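                            /* Byte-swap both words and exchange them,
                             * using LR as scratch.  On ARMv6+ this is
                             * roughly:
                             *   rev lr,    rd_lo
                             *   rev rd_lo, rd_hi
                             *   mov rd_hi, lr
                             */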
1531                        emit_rev32(ARM_LR, rd[1], ctx);
1532                        emit_rev32(rd[1], rd[0], ctx);
1533                        emit(ARM_MOV_R(rd[0], ARM_LR), ctx);
1534                        break;
1535                }
1536                goto exit;
1537emit_bswap_uxt:
1538                switch (imm) {
1539                case 16:
1540                        /* zero-extend 16 bits into 64 bits */
1541#if __LINUX_ARM_ARCH__ < 6
1542                        emit_a32_mov_i(tmp2[1], 0xffff, ctx);
1543                        emit(ARM_AND_R(rd[1], rd[1], tmp2[1]), ctx);
1544#else /* ARMv6+ */
1545                        emit(ARM_UXTH(rd[1], rd[1]), ctx);
1546#endif
1547                        emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
1548                        break;
1549                case 32:
1550                        /* zero-extend 32 bits into 64 bits */
1551                        emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
1552                        break;
1553                case 64:
1554                        /* nop */
1555                        break;
1556                }
1557exit:
1558                arm_bpf_put_reg64(dst, rd, ctx);
1559                break;
1560        /* dst = imm64 */
1561        case BPF_LD | BPF_IMM | BPF_DW:
1562        {
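                    /* A 64-bit immediate load spans two BPF insns: the
                     * low 32 bits are in this insn's imm, the high 32
                     * bits in the next insn's imm.  Returning 1 tells
                     * build_body() to skip that second slot.
                     */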
1563                u64 val = (u32)imm | (u64)insn[1].imm << 32;
1564
1565                emit_a32_mov_i64(dst, val, ctx);
1566
1567                return 1;
1568        }
1569        /* LDX: dst = *(size *)(src + off) */
1570        case BPF_LDX | BPF_MEM | BPF_W:
1571        case BPF_LDX | BPF_MEM | BPF_H:
1572        case BPF_LDX | BPF_MEM | BPF_B:
1573        case BPF_LDX | BPF_MEM | BPF_DW:
1574                rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
1575                emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
1576                break;
1577        /* ST: *(size *)(dst + off) = imm */
1578        case BPF_ST | BPF_MEM | BPF_W:
1579        case BPF_ST | BPF_MEM | BPF_H:
1580        case BPF_ST | BPF_MEM | BPF_B:
1581        case BPF_ST | BPF_MEM | BPF_DW:
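                    /* The immediate is first loaded into the tmp2
                     * registers (sign-extended to a 64-bit pair for
                     * BPF_DW) and then stored to dst + off.
                     */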
1582                switch (BPF_SIZE(code)) {
1583                case BPF_DW:
1584                        /* Sign-extend immediate value into temp reg */
1585                        emit_a32_mov_se_i64(true, tmp2, imm, ctx);
1586                        break;
1587                case BPF_W:
1588                case BPF_H:
1589                case BPF_B:
1590                        emit_a32_mov_i(tmp2[1], imm, ctx);
1591                        break;
1592                }
1593                emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code));
1594                break;
1595        /* STX XADD: lock *(u32 *)(dst + off) += src */
1596        case BPF_STX | BPF_XADD | BPF_W:
1597        /* STX XADD: lock *(u64 *)(dst + off) += src */
1598        case BPF_STX | BPF_XADD | BPF_DW:
1599                goto notyet;
1600        /* STX: *(size *)(dst + off) = src */
1601        case BPF_STX | BPF_MEM | BPF_W:
1602        case BPF_STX | BPF_MEM | BPF_H:
1603        case BPF_STX | BPF_MEM | BPF_B:
1604        case BPF_STX | BPF_MEM | BPF_DW:
1605                rs = arm_bpf_get_reg64(src, tmp2, ctx);
1606                emit_str_r(dst_lo, rs, off, ctx, BPF_SIZE(code));
1607                break;
1608        /* PC += off if dst == src */
1609        /* PC += off if dst > src */
1610        /* PC += off if dst >= src */
1611        /* PC += off if dst < src */
1612        /* PC += off if dst <= src */
1613        /* PC += off if dst != src */
1614        /* PC += off if dst > src (signed) */
1615        /* PC += off if dst >= src (signed) */
1616        /* PC += off if dst < src (signed) */
1617        /* PC += off if dst <= src (signed) */
1618        /* PC += off if dst & src */
1619        case BPF_JMP | BPF_JEQ | BPF_X:
1620        case BPF_JMP | BPF_JGT | BPF_X:
1621        case BPF_JMP | BPF_JGE | BPF_X:
1622        case BPF_JMP | BPF_JNE | BPF_X:
1623        case BPF_JMP | BPF_JSGT | BPF_X:
1624        case BPF_JMP | BPF_JSGE | BPF_X:
1625        case BPF_JMP | BPF_JSET | BPF_X:
1626        case BPF_JMP | BPF_JLE | BPF_X:
1627        case BPF_JMP | BPF_JLT | BPF_X:
1628        case BPF_JMP | BPF_JSLT | BPF_X:
1629        case BPF_JMP | BPF_JSLE | BPF_X:
1630        case BPF_JMP32 | BPF_JEQ | BPF_X:
1631        case BPF_JMP32 | BPF_JGT | BPF_X:
1632        case BPF_JMP32 | BPF_JGE | BPF_X:
1633        case BPF_JMP32 | BPF_JNE | BPF_X:
1634        case BPF_JMP32 | BPF_JSGT | BPF_X:
1635        case BPF_JMP32 | BPF_JSGE | BPF_X:
1636        case BPF_JMP32 | BPF_JSET | BPF_X:
1637        case BPF_JMP32 | BPF_JLE | BPF_X:
1638        case BPF_JMP32 | BPF_JLT | BPF_X:
1639        case BPF_JMP32 | BPF_JSLT | BPF_X:
1640        case BPF_JMP32 | BPF_JSLE | BPF_X:
1641                /* Set up the source registers */
1642                rm = arm_bpf_get_reg32(src_hi, tmp2[0], ctx);
1643                rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
1644                goto go_jmp;
1645        /* PC += off if dst == imm */
1646        /* PC += off if dst > imm */
1647        /* PC += off if dst >= imm */
1648        /* PC += off if dst < imm */
1649        /* PC += off if dst <= imm */
1650        /* PC += off if dst != imm */
1651        /* PC += off if dst > imm (signed) */
1652        /* PC += off if dst >= imm (signed) */
1653        /* PC += off if dst < imm (signed) */
1654        /* PC += off if dst <= imm (signed) */
1655        /* PC += off if dst & imm */
1656        case BPF_JMP | BPF_JEQ | BPF_K:
1657        case BPF_JMP | BPF_JGT | BPF_K:
1658        case BPF_JMP | BPF_JGE | BPF_K:
1659        case BPF_JMP | BPF_JNE | BPF_K:
1660        case BPF_JMP | BPF_JSGT | BPF_K:
1661        case BPF_JMP | BPF_JSGE | BPF_K:
1662        case BPF_JMP | BPF_JSET | BPF_K:
1663        case BPF_JMP | BPF_JLT | BPF_K:
1664        case BPF_JMP | BPF_JLE | BPF_K:
1665        case BPF_JMP | BPF_JSLT | BPF_K:
1666        case BPF_JMP | BPF_JSLE | BPF_K:
1667        case BPF_JMP32 | BPF_JEQ | BPF_K:
1668        case BPF_JMP32 | BPF_JGT | BPF_K:
1669        case BPF_JMP32 | BPF_JGE | BPF_K:
1670        case BPF_JMP32 | BPF_JNE | BPF_K:
1671        case BPF_JMP32 | BPF_JSGT | BPF_K:
1672        case BPF_JMP32 | BPF_JSGE | BPF_K:
1673        case BPF_JMP32 | BPF_JSET | BPF_K:
1674        case BPF_JMP32 | BPF_JLT | BPF_K:
1675        case BPF_JMP32 | BPF_JLE | BPF_K:
1676        case BPF_JMP32 | BPF_JSLT | BPF_K:
1677        case BPF_JMP32 | BPF_JSLE | BPF_K:
1678                if (off == 0)
1679                        break;
1680                rm = tmp2[0];
1681                rn = tmp2[1];
1682                /* Sign-extend immediate value */
1683                emit_a32_mov_se_i64(true, tmp2, imm, ctx);
1684go_jmp:
1685                /* Set up the destination register pair */
1686                rd = arm_bpf_get_reg64(dst, tmp, ctx);
1687
1688                /* Check for the condition */
1689                emit_ar_r(rd[0], rd[1], rm, rn, ctx, BPF_OP(code),
1690                          BPF_CLASS(code) == BPF_JMP);
1691
1692                /* Set up the JUMP instruction */
1693                jmp_offset = bpf2a32_offset(i+off, i, ctx);
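                    /* emit_ar_r() above emits a 64-bit compare for
                     * BPF_JMP and a 32-bit one for BPF_JMP32.  For the
                     * signed tests it compares src - dst for
                     * BPF_JSGT/BPF_JSLE and dst - src for
                     * BPF_JSLT/BPF_JSGE, which is why those four
                     * collapse onto just the LT and GE conditions
                     * below.
                     */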
1694                switch (BPF_OP(code)) {
1695                case BPF_JNE:
1696                case BPF_JSET:
1697                        _emit(ARM_COND_NE, ARM_B(jmp_offset), ctx);
1698                        break;
1699                case BPF_JEQ:
1700                        _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
1701                        break;
1702                case BPF_JGT:
1703                        _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
1704                        break;
1705                case BPF_JGE:
1706                        _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
1707                        break;
1708                case BPF_JSGT:
1709                        _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
1710                        break;
1711                case BPF_JSGE:
1712                        _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
1713                        break;
1714                case BPF_JLE:
1715                        _emit(ARM_COND_LS, ARM_B(jmp_offset), ctx);
1716                        break;
1717                case BPF_JLT:
1718                        _emit(ARM_COND_CC, ARM_B(jmp_offset), ctx);
1719                        break;
1720                case BPF_JSLT:
1721                        _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
1722                        break;
1723                case BPF_JSLE:
1724                        _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
1725                        break;
1726                }
1727                break;
1728        /* JMP OFF */
1729        case BPF_JMP | BPF_JA:
1730        {
1731                if (off == 0)
1732                        break;
1733                jmp_offset = bpf2a32_offset(i+off, i, ctx);
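                    /* An ARM branch encodes a signed 24-bit word
                     * offset; check_imm24() rejects jumps that do not
                     * fit.
                     */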
1734                check_imm24(jmp_offset);
1735                emit(ARM_B(jmp_offset), ctx);
1736                break;
1737        }
1738        /* tail call */
1739        case BPF_JMP | BPF_TAIL_CALL:
1740                if (emit_bpf_tail_call(ctx))
1741                        return -EFAULT;
1742                break;
1743        /* function call */
1744        case BPF_JMP | BPF_CALL:
1745        {
1746                const s8 *r0 = bpf2a32[BPF_REG_0];
1747                const s8 *r1 = bpf2a32[BPF_REG_1];
1748                const s8 *r2 = bpf2a32[BPF_REG_2];
1749                const s8 *r3 = bpf2a32[BPF_REG_3];
1750                const s8 *r4 = bpf2a32[BPF_REG_4];
1751                const s8 *r5 = bpf2a32[BPF_REG_5];
1752                const u32 func = (u32)__bpf_call_base + (u32)imm;
1753
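                    /* Helper arguments follow the 32-bit calling
                     * convention: the first two 64-bit arguments
                     * (BPF R1 and R2) are moved into the pairs mapping
                     * BPF R0 and R1 (ARM r0-r3), and R3-R5 are pushed
                     * on the stack (3 * 8 = 24 bytes), which the ADD
                     * to SP after the call removes again.
                     */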
1754                emit_a32_mov_r64(true, r0, r1, ctx);
1755                emit_a32_mov_r64(true, r1, r2, ctx);
1756                emit_push_r64(r5, ctx);
1757                emit_push_r64(r4, ctx);
1758                emit_push_r64(r3, ctx);
1759
1760                emit_a32_mov_i(tmp[1], func, ctx);
1761                emit_blx_r(tmp[1], ctx);
1762
1763                emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); /* drop the stacked args */
1764                break;
1765        }
1766        /* function return */
1767        case BPF_JMP | BPF_EXIT:
1768                /* Optimization: when the last instruction is EXIT,
1769                 * simply fall through to the epilogue.
1770                 */
1771                if (i == ctx->prog->len - 1)
1772                        break;
1773                jmp_offset = epilogue_offset(ctx);
1774                check_imm24(jmp_offset);
1775                emit(ARM_B(jmp_offset), ctx);
1776                break;
1777notyet:
1778                pr_info_once("*** NOT YET: opcode %02x ***\n", code);
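                    /* Returning an error makes build_body() fail, and
                     * the program is then left to the interpreter.
                     */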
1779                return -EFAULT;
1780        default:
1781                pr_err_once("unknown opcode %02x\n", code);
1782                return -EINVAL;
1783        }
1784
1785        if (ctx->flags & FLAG_IMM_OVERFLOW)
1786                /*
1787                 * this instruction generated an overflow when
1788                 * trying to access the literal pool, so
1789                 * delegate this filter to the kernel interpreter.
1790                 */
1791                return -1;
1792        return 0;
1793}
1794
1795static int build_body(struct jit_ctx *ctx)
1796{
1797        const struct bpf_prog *prog = ctx->prog;
1798        unsigned int i;
1799
1800        for (i = 0; i < prog->len; i++) {
1801                const struct bpf_insn *insn = &(prog->insnsi[i]);
1802                int ret;
1803
1804                ret = build_insn(insn, ctx);
1805
1806                /* ret > 0 means the insn used two slots (BPF_LD | BPF_IMM | BPF_DW): skip the second one. */
1807                if (ret > 0) {
1808                        i++;
1809                        if (ctx->target == NULL)
1810                                ctx->offsets[i] = ctx->idx;
1811                        continue;
1812                }
1813
1814                if (ctx->target == NULL)
1815                        ctx->offsets[i] = ctx->idx;
1816
1817                /* If unsuccessful, return with the error code */
1818                if (ret)
1819                        return ret;
1820        }
1821        return 0;
1822}
1823
1824static int validate_code(struct jit_ctx *ctx)
1825{
1826        int i;
1827
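            /* The image is pre-filled with UDF words by jit_fill_hole(),
             * so any UDF left inside the emitted code means an
             * instruction slot was never written; reject the image
             * rather than run it.
             */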
1828        for (i = 0; i < ctx->idx; i++) {
1829                if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF))
1830                        return -1;
1831        }
1832
1833        return 0;
1834}
1835
1836void bpf_jit_compile(struct bpf_prog *prog)
1837{
1838        /* Nothing to do here; eBPF programs are JITed via bpf_int_jit_compile() below. */
1839}
1840
1841struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
1842{
1843        struct bpf_prog *tmp, *orig_prog = prog;
1844        struct bpf_binary_header *header;
1845        bool tmp_blinded = false;
1846        struct jit_ctx ctx;
1847        unsigned int tmp_idx;
1848        unsigned int image_size;
1849        u8 *image_ptr;
1850
1851        /* If BPF JIT was not enabled then we must fall back to
1852         * the interpreter.
1853         */
1854        if (!prog->jit_requested)
1855                return orig_prog;
1856
1857        /* If constant blinding was enabled and we failed during blinding
1858         * then we must fall back to the interpreter. Otherwise, we
1859         * continue with the (possibly blinded) program.
1860         */
1861        tmp = bpf_jit_blind_constants(prog);
1862
1863        if (IS_ERR(tmp))
1864                return orig_prog;
1865        if (tmp != prog) {
1866                tmp_blinded = true;
1867                prog = tmp;
1868        }
1869
1870        memset(&ctx, 0, sizeof(ctx));
1871        ctx.prog = prog;
1872        ctx.cpu_architecture = cpu_architecture();
1873
1874        /* If we cannot allocate memory for offsets[], we must
1875         * fall back to the interpreter.
1876         */
1877        ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
1878        if (ctx.offsets == NULL) {
1879                prog = orig_prog;
1880                goto out;
1881        }
1882
1883        /* 1) fake pass to find the length of the JITed code,
1884         * to compute ctx->offsets and the other context variables
1885         * needed to generate the final JITed code.
1886         * The start of the JITed code is later randomized, prefixed
1887         * by a random number of fault instructions.
1888         *
1889         * If the first pass fails then there is no chance of it
1890         * being successful in the second pass, so just fall back
1891         * to the interpreter.
1892         */
1893        if (build_body(&ctx)) {
1894                prog = orig_prog;
1895                goto out_off;
1896        }
1897
1898        tmp_idx = ctx.idx;
1899        build_prologue(&ctx);
1900        ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;
1901
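            /* Remember where the epilogue starts; BPF_EXIT instructions
             * in the body branch here via epilogue_offset().
             */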
1902        ctx.epilogue_offset = ctx.idx;
1903
1904#if __LINUX_ARM_ARCH__ < 7
1905        tmp_idx = ctx.idx;
1906        build_epilogue(&ctx);
1907        ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;
1908
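            /* Pre-ARMv7 (no movw/movt) large immediates are loaded with
             * PC-relative LDRs from a literal pool that is part of the
             * image: reserve one word per constant and an array to
             * record them during the real pass.
             */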
1909        ctx.idx += ctx.imm_count;
1910        if (ctx.imm_count) {
1911                ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL);
1912                if (ctx.imms == NULL) {
1913                        prog = orig_prog;
1914                        goto out_off;
1915                }
1916        }
1917#else
1918        /* nothing special to do for the epilogue on ARMv7 */
1919        build_epilogue(&ctx);
1920#endif
1921        /* Now we can get the actual image size of the JITed ARM code.
1922         * Currently, we are not considering THUMB-2 instructions
1923         * for the JIT, although they could decrease the size of the image.
1924         *
1925         * As each ARM instruction is 32 bits long, we translate the
1926         * number of JITed instructions into the size required to store
1927         * the JITed code.
1928         */
1929        image_size = sizeof(u32) * ctx.idx;
1930
1931        /* Now we know the size of the image to allocate */
1932        header = bpf_jit_binary_alloc(image_size, &image_ptr,
1933                                      sizeof(u32), jit_fill_hole);
1934        /* If we cannot allocate memory for the image, we must
1935         * fall back to the interpreter.
1936         */
1937        if (header == NULL) {
1938                prog = orig_prog;
1939                goto out_imms;
1940        }
1941
1942        /* 2) Actual pass to generate the final JIT code */
1943        ctx.target = (u32 *) image_ptr;
1944        ctx.idx = 0;
1945
1946        build_prologue(&ctx);
1947
1948        /* If building the body of the JITed code fails somehow,
1949         * we fall back to the interpreter.
1950         */
1951        if (build_body(&ctx) < 0) {
1952                image_ptr = NULL;
1953                bpf_jit_binary_free(header);
1954                prog = orig_prog;
1955                goto out_imms;
1956        }
1957        build_epilogue(&ctx);
1958
1959        /* 3) Extra pass to validate the JITed code */
1960        if (validate_code(&ctx)) {
1961                image_ptr = NULL;
1962                bpf_jit_binary_free(header);
1963                prog = orig_prog;
1964                goto out_imms;
1965        }
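            /* Make sure the freshly written instructions are visible to
             * the instruction stream before the program can be run.
             */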
1966        flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));
1967
1968        if (bpf_jit_enable > 1)
1969                /* there are 2 passes here */
1970                bpf_jit_dump(prog->len, image_size, 2, ctx.target);
1971
1972        bpf_jit_binary_lock_ro(header);
1973        prog->bpf_func = (void *)ctx.target;
1974        prog->jited = 1;
1975        prog->jited_len = image_size;
1976
1977out_imms:
1978#if __LINUX_ARM_ARCH__ < 7
1979        if (ctx.imm_count)
1980                kfree(ctx.imms);
1981#endif
1982out_off:
1983        kfree(ctx.offsets);
1984out:
1985        if (tmp_blinded)
1986                bpf_jit_prog_release_other(prog, prog == orig_prog ?
1987                                           tmp : orig_prog);
1988        return prog;
1989}
1990
1991