linux/arch/x86/kernel/kprobes/opt.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes Jump Optimization (Optprobes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 * Copyright (C) Hitachi Ltd., 2012
 */
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/frame.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>

#include "common.h"

unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
        struct optimized_kprobe *op;
        struct kprobe *kp;
        long offs;
        int i;

        for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
                kp = get_kprobe((void *)addr - i);
                /* This function only handles jump-optimized kprobes */
                if (kp && kprobe_optimized(kp)) {
                        op = container_of(kp, struct optimized_kprobe, kp);
                        /* If op->list is not empty, op is still being optimized */
                        if (list_empty(&op->list))
                                goto found;
                }
        }

        return addr;
found:
        /*
         * If the kprobe has been optimized, the original bytes may have been
         * overwritten by the jump destination address. In this case, the
         * original bytes must be recovered from the op->optinsn.copied_insn
         * buffer.
         */
        if (probe_kernel_read(buf, (void *)addr,
                MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
                return 0UL;

        if (addr == (unsigned long)kp->addr) {
                buf[0] = kp->opcode;
                memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
        } else {
                offs = addr - (unsigned long)kp->addr - 1;
                memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
        }

        return (unsigned long)buf;
}
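
/*
 * Worked example (a sketch, with a made-up address): suppose kp->addr is
 * 0xffffffff81010000 and the site currently holds the 5-byte optimized jump.
 * Recovering at 0xffffffff81010000 rebuilds [kp->opcode][4 bytes from
 * copied_insn]; recovering at 0xffffffff81010002 copies copied_insn starting
 * at offset 1, so a decoder walking the function never interprets the jump's
 * displacement bytes as instructions.
 */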

/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
        *addr++ = 0x48;
        *addr++ = 0xbf;
#else
        *addr++ = 0xb8;
#endif
        *(unsigned long *)addr = val;
}
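
/*
 * Encoding note (standard x86, a sketch with a made-up value): on x86-64,
 * synthesize_set_arg1(p, 0xffffffff81234567UL) emits
 * "48 bf 67 45 23 81 ff ff ff ff", i.e. "movabs $0xffffffff81234567, %rdi";
 * on 32-bit it emits "b8 <imm32>", i.e. "mov $imm32, %eax". Either way the
 * struct optimized_kprobe pointer lands in the first argument register of
 * optimized_callback().
 */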

asm (
                        ".pushsection .rodata\n"
                        "optprobe_template_func:\n"
                        ".global optprobe_template_entry\n"
                        "optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
                        /* We don't bother saving the ss register */
                        "       pushq %rsp\n"
                        "       pushfq\n"
                        SAVE_REGS_STRING
                        "       movq %rsp, %rsi\n"
                        ".global optprobe_template_val\n"
                        "optprobe_template_val:\n"
                        ASM_NOP5
                        ASM_NOP5
                        ".global optprobe_template_call\n"
                        "optprobe_template_call:\n"
                        ASM_NOP5
                        /* Move flags to rsp */
                        "       movq 18*8(%rsp), %rdx\n"
                        "       movq %rdx, 19*8(%rsp)\n"
                        RESTORE_REGS_STRING
                        /* Skip flags entry */
                        "       addq $8, %rsp\n"
                        "       popfq\n"
#else /* CONFIG_X86_32 */
                        "       pushl %esp\n"
                        "       pushfl\n"
                        SAVE_REGS_STRING
                        "       movl %esp, %edx\n"
                        ".global optprobe_template_val\n"
                        "optprobe_template_val:\n"
                        ASM_NOP5
                        ".global optprobe_template_call\n"
                        "optprobe_template_call:\n"
                        ASM_NOP5
                        /* Move flags into esp */
                        "       movl 14*4(%esp), %edx\n"
                        "       movl %edx, 15*4(%esp)\n"
                        RESTORE_REGS_STRING
                        /* Skip flags entry */
                        "       addl $4, %esp\n"
                        "       popfl\n"
#endif
                        ".global optprobe_template_end\n"
                        "optprobe_template_end:\n"
                        ".popsection\n");

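/*
 * The template above is copied into each optprobe's out-of-line buffer and
 * then patched: the ASM_NOP5 bytes at optprobe_template_val are replaced by
 * the "load &op into the first argument register" instruction built by
 * synthesize_set_arg1(), and the ASM_NOP5 at optprobe_template_call becomes a
 * relative call to optimized_callback(). The pt_regs frame built by
 * SAVE_REGS_STRING is passed as the second argument (%rsi on 64-bit,
 * %edx on 32-bit).
 */
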
void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);

#define TMPL_MOVE_IDX \
        ((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
        ((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
        ((long)optprobe_template_end - (long)optprobe_template_entry)

#define INT3_SIZE sizeof(kprobe_opcode_t)

/* Optimized kprobe callback function: called from the optinsn slot */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
        /* This is possible if op is under delayed unoptimization */
        if (kprobe_disabled(&op->kp))
                return;

        preempt_disable();
        if (kprobe_running()) {
                kprobes_inc_nmissed_count(&op->kp);
        } else {
                struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
                /* Save skipped registers */
                regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
                regs->cs |= get_kernel_rpl();
                regs->gs = 0;
#endif
                regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
                regs->orig_ax = ~0UL;

                __this_cpu_write(current_kprobe, &op->kp);
                kcb->kprobe_status = KPROBE_HIT_ACTIVE;
                opt_pre_handler(&op->kp, regs);
                __this_cpu_write(current_kprobe, NULL);
        }
        preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);
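
/*
 * Note on the register fixup above: regs->ip is set to kp.addr + INT3_SIZE so
 * the pre-handler sees the same instruction pointer it would see on a regular
 * int3 hit, and orig_ax = ~0UL marks the frame as "not a syscall". cs (and gs
 * on 32-bit) are filled in by hand because the template does not save the
 * segment registers.
 */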

static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
        struct insn insn;
        int len = 0, ret;

        while (len < RELATIVEJUMP_SIZE) {
                ret = __copy_instruction(dest + len, src + len, real + len, &insn);
                if (!ret || !can_boost(&insn, src + len))
                        return -EINVAL;
                len += ret;
        }
        /* Check whether the address range is reserved */
        if (ftrace_text_reserved(src, src + len - 1) ||
            alternatives_text_reserved(src, src + len - 1) ||
            jump_label_text_reserved(src, src + len - 1))
                return -EBUSY;

        return len;
}
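
/*
 * Because the 5-byte jump may end in the middle of an instruction, the loop
 * above keeps copying whole instructions until at least RELATIVEJUMP_SIZE
 * bytes are covered, and gives up if any copied instruction cannot be safely
 * executed out of line (see can_boost()) or the range is owned by ftrace,
 * alternatives or jump labels.
 */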

/* Check whether insn is an indirect jump */
static int __insn_is_indirect_jump(struct insn *insn)
{
        return ((insn->opcode.bytes[0] == 0xff &&
                (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
                insn->opcode.bytes[0] == 0xea); /* Segment based jump */
}
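
/*
 * Encoding note: opcode 0xff with ModRM reg field 4 is a near indirect jump
 * (e.g. "jmp *%rax") and reg field 5 is a far indirect jump through memory;
 * masking the reg field with 6 matches both. 0xea is the legacy direct far
 * jump (ptr16:32).
 */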

/* Check whether insn jumps into the specified address range */
static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
{
        unsigned long target = 0;

        switch (insn->opcode.bytes[0]) {
        case 0xe0:      /* loopne */
        case 0xe1:      /* loope */
        case 0xe2:      /* loop */
        case 0xe3:      /* jcxz */
        case 0xe9:      /* near relative jump */
        case 0xeb:      /* short relative jump */
                break;
        case 0x0f:
                if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
                        break;
                return 0;
        default:
                if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
                        break;
                return 0;
        }
        target = (unsigned long)insn->next_byte + insn->immediate.value;

        return (start <= target && target <= start + len);
}
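
/*
 * Example (a sketch): a short conditional jump "74 10" (je +0x10) decoded at
 * address A has insn->next_byte == A + 2 and insn->immediate.value == 0x10,
 * so the computed target is A + 0x12; the helper reports whether that target
 * falls inside [start, start + len].
 */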

static int insn_is_indirect_jump(struct insn *insn)
{
        int ret = __insn_is_indirect_jump(insn);

#ifdef CONFIG_RETPOLINE
        /*
         * A jump to an x86_indirect_thunk_* function is treated as an
         * indirect jump. Note that even with CONFIG_RETPOLINE=y, a kernel
         * compiled with an older gcc may still emit plain indirect jumps,
         * so this check is added on top of the indirect-jump check above
         * rather than replacing it.
         */
        if (!ret)
                ret = insn_jump_into_range(insn,
                                (unsigned long)__indirect_thunk_start,
                                (unsigned long)__indirect_thunk_end -
                                (unsigned long)__indirect_thunk_start);
#endif
        return ret;
}

/* Decode the whole function to ensure no instruction jumps into the target */
static int can_optimize(unsigned long paddr)
{
        unsigned long addr, size = 0, offset = 0;
        struct insn insn;
        kprobe_opcode_t buf[MAX_INSN_SIZE];

        /* Lookup symbol including addr */
        if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
                return 0;

        /*
         * Do not optimize in the entry code due to the unstable
         * stack handling and registers setup.
         */
        if (((paddr >= (unsigned long)__entry_text_start) &&
             (paddr <  (unsigned long)__entry_text_end)) ||
            ((paddr >= (unsigned long)__irqentry_text_start) &&
             (paddr <  (unsigned long)__irqentry_text_end)))
                return 0;

        /* Check there is enough space for a relative jump. */
        if (size - offset < RELATIVEJUMP_SIZE)
                return 0;

        /* Decode instructions */
        addr = paddr - offset;
        while (addr < paddr - offset + size) { /* Decode until function end */
                unsigned long recovered_insn;
                if (search_exception_tables(addr))
                        /*
                         * Since some fixup code jumps into this function,
                         * we can't optimize a kprobe in this function.
                         */
                        return 0;
                recovered_insn = recover_probed_instruction(buf, addr);
                if (!recovered_insn)
                        return 0;
                kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
                insn_get_length(&insn);
                /* Another subsystem puts a breakpoint */
                if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
                        return 0;
                /* Recover address */
                insn.kaddr = (void *)addr;
                insn.next_byte = (void *)(addr + insn.length);
                /* Check that this instruction doesn't jump into the target */
                if (insn_is_indirect_jump(&insn) ||
                    insn_jump_into_range(&insn, paddr + INT3_SIZE,
                                         RELATIVE_ADDR_SIZE))
                        return 0;
                addr += insn.length;
        }

        return 1;
}

/* Check whether the optimized_kprobe can actually be optimized. */
int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
        int i;
        struct kprobe *p;

        for (i = 1; i < op->optinsn.size; i++) {
                p = get_kprobe(op->kp.addr + i);
                if (p && !kprobe_disabled(p))
                        return -EEXIST;
        }

        return 0;
}

/* Check whether addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
                                 unsigned long addr)
{
        return ((unsigned long)op->kp.addr <= addr &&
                (unsigned long)op->kp.addr + op->optinsn.size > addr);
}

/* Free the optimized instruction slot */
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
        if (op->optinsn.insn) {
                free_optinsn_slot(op->optinsn.insn, dirty);
                op->optinsn.insn = NULL;
                op->optinsn.size = 0;
        }
}

void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
        __arch_remove_optimized_kprobe(op, 1);
}

/*
 * Copy the instructions that the jump will replace.
 * The target instructions MUST be relocatable (checked inside).
 * This is called when a new aggr(opt)probe is allocated or reused.
 */
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
                                  struct kprobe *__unused)
{
        u8 *buf = NULL, *slot;
        int ret, len;
        long rel;

        if (!can_optimize((unsigned long)op->kp.addr))
                return -EILSEQ;

        buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        op->optinsn.insn = slot = get_optinsn_slot();
        if (!slot) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * Verify that the address gap is within the 2GB range, because the
         * detour uses a relative jump.
         */
        rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
        if (abs(rel) > 0x7fffffff) {
                ret = -ERANGE;
                goto err;
        }

        /* Copy the arch-dependent instance from the template */
        memcpy(buf, optprobe_template_entry, TMPL_END_IDX);

        /* Copy instructions into the out-of-line buffer */
        ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
                                          slot + TMPL_END_IDX);
        if (ret < 0)
                goto err;
        op->optinsn.size = ret;
        len = TMPL_END_IDX + op->optinsn.size;

        /* Set probe information */
        synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);

        /* Set probe function call */
        synthesize_relcall(buf + TMPL_CALL_IDX,
                           slot + TMPL_CALL_IDX, optimized_callback);

        /* Set the returning jmp instruction at the tail of the out-of-line buffer */
        synthesize_reljump(buf + len, slot + len,
                           (u8 *)op->kp.addr + op->optinsn.size);
        len += RELATIVEJUMP_SIZE;

        /* We have to use text_poke() for the instruction buffer because it is RO */
        text_poke(slot, buf, len);
        ret = 0;
out:
        kfree(buf);
        return ret;

err:
        __arch_remove_optimized_kprobe(op, 0);
        goto out;
}
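
/*
 * Resulting layout of the out-of-line slot (a sketch):
 *
 *   slot + 0             template: save regs, load &op, call
 *                        optimized_callback(), restore regs
 *   slot + TMPL_END_IDX  copy of the original instructions overwritten by the
 *                        5-byte jump at kp.addr (op->optinsn.size bytes)
 *   slot + len           relative jump back to kp.addr + op->optinsn.size
 */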

/*
 * Replace breakpoints (int3) with relative jumps.
 * The caller must hold kprobe_mutex and text_mutex.
 */
void arch_optimize_kprobes(struct list_head *oplist)
{
        struct optimized_kprobe *op, *tmp;
        u8 insn_buff[RELATIVEJUMP_SIZE];

        list_for_each_entry_safe(op, tmp, oplist, list) {
                s32 rel = (s32)((long)op->optinsn.insn -
                        ((long)op->kp.addr + RELATIVEJUMP_SIZE));

                WARN_ON(kprobe_disabled(&op->kp));

                /* Back up the instructions that will be replaced by the jump address */
                memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
                       RELATIVE_ADDR_SIZE);

                insn_buff[0] = RELATIVEJUMP_OPCODE;
                *(s32 *)(&insn_buff[1]) = rel;

                text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
                             op->optinsn.insn);

                list_del_init(&op->list);
        }
}
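
/*
 * After this, the probe address holds a 5-byte near jump "e9 <rel32>"
 * (RELATIVEJUMP_OPCODE is 0xe9) whose displacement targets the out-of-line
 * slot prepared above. text_poke_bp() does the live patching via a temporary
 * int3 so other CPUs never execute a half-written instruction.
 */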

/* Replace a relative jump with a breakpoint (int3).  */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
        u8 insn_buff[RELATIVEJUMP_SIZE];

        /* Set int3 to first byte for kprobes */
        insn_buff[0] = BREAKPOINT_INSTRUCTION;
        memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
        text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
                     op->optinsn.insn);
}
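
/*
 * This restores the int3-based form of the probe: byte 0 becomes the
 * breakpoint (BREAKPOINT_INSTRUCTION, 0xcc) and the following four bytes are
 * the saved original bytes, so the probe behaves like a regular,
 * non-optimized kprobe again.
 */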

/*
 * Recover original instructions and breakpoints from relative jumps.
 * The caller must hold kprobe_mutex.
 */
extern void arch_unoptimize_kprobes(struct list_head *oplist,
                                    struct list_head *done_list)
{
        struct optimized_kprobe *op, *tmp;

        list_for_each_entry_safe(op, tmp, oplist, list) {
                arch_unoptimize_kprobe(op);
                list_move(&op->list, done_list);
        }
}

int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
        struct optimized_kprobe *op;

        if (p->flags & KPROBE_FLAG_OPTIMIZED) {
                /* This kprobe is really able to run the optimized path. */
                op = container_of(p, struct optimized_kprobe, kp);
                /* Detour through the copied instructions */
                regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
                if (!reenter)
                        reset_current_kprobe();
                return 1;
        }
        return 0;
}
NOKPROBE_SYMBOL(setup_detour_execution);
