linux/arch/x86/kernel/kprobes/opt.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Kernel Probes Jump Optimization (Optprobes)
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 * Copyright (C) Hitachi Ltd., 2012
 */
#include <linux/kprobes.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/pgtable.h>

#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/sections.h>
#include <asm/nospec-branch.h>

#include "common.h"

unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct optimized_kprobe *op;
	struct kprobe *kp;
	long offs;
	int i;

	for (i = 0; i < JMP32_INSN_SIZE; i++) {
		kp = get_kprobe((void *)addr - i);
		/* This function only handles jump-optimized kprobes */
		if (kp && kprobe_optimized(kp)) {
			op = container_of(kp, struct optimized_kprobe, kp);
			/* If op->list is not empty, op is still under optimization */
			if (list_empty(&op->list))
				goto found;
		}
	}

	return addr;
found:
	/*
	 * If the kprobe is optimized, the original bytes at addr may have
	 * been overwritten by the displacement of the jump. In that case,
	 * the original bytes must be recovered from the
	 * op->optinsn.copied_insn buffer.
	 */
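	/*
	 * Illustrative example: the optimized probe replaces the bytes at
	 * kp->addr with a 5-byte JMP32 (opcode + 4-byte displacement), and
	 * copied_insn holds the original 4 bytes that the displacement
	 * clobbered. If, say, addr == kp->addr + 2, then offs below is 1 and
	 * the first 3 bytes of buf are restored from copied_insn[1..3].
	 */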
	if (copy_from_kernel_nofault(buf, (void *)addr,
		MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
		return 0UL;

	if (addr == (unsigned long)kp->addr) {
		buf[0] = kp->opcode;
		memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
	} else {
		offs = addr - (unsigned long)kp->addr - 1;
		memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
	}

	return (unsigned long)buf;
}

static void synthesize_clac(kprobe_opcode_t *addr)
{
	/*
	 * Can't be static_cpu_has() due to how objtool treats this feature bit.
	 * This isn't a fast path anyway.
	 */
	if (!boot_cpu_has(X86_FEATURE_SMAP))
		return;

	/* Replace the NOP3 with CLAC */
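	/* CLAC encodes as 0f 01 ca and clears EFLAGS.AC, re-enabling SMAP checks. */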
	addr[0] = 0x0f;
	addr[1] = 0x01;
	addr[2] = 0xca;
}

/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
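/*
 * The bytes emitted below decode as "movabs $val, %rdi" (48 bf <imm64>) on
 * x86-64 and "mov $val, %eax" (b8 <imm32>) on x86-32, i.e. they load the
 * register used for the first argument of the trampoline's call.
 */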
static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
	*addr++ = 0x48;
	*addr++ = 0xbf;
#else
	*addr++ = 0xb8;
#endif
	*(unsigned long *)addr = val;
}

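/*
 * Rough layout of a completed optprobe slot (filled in by
 * arch_prepare_optimized_kprobe() below):
 *
 *   optprobe_template_entry:  push stack pointer and flags, CLAC,
 *                             save registers, set up the &op and pt_regs
 *                             arguments, call optimized_callback(),
 *                             restore registers/flags
 *   + TMPL_END_IDX:           relocated copy of the original instruction(s)
 *   + TMPL_END_IDX + size:    jmp back to kp.addr + optinsn.size
 */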
asm (
			".pushsection .rodata\n"
			"optprobe_template_func:\n"
			".global optprobe_template_entry\n"
			"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
			/* We don't bother saving the ss register */
			"	pushq %rsp\n"
			"	pushfq\n"
			".global optprobe_template_clac\n"
			"optprobe_template_clac:\n"
			ASM_NOP3
			SAVE_REGS_STRING
			"	movq %rsp, %rsi\n"
			".global optprobe_template_val\n"
			"optprobe_template_val:\n"
			ASM_NOP5
			ASM_NOP5
			".global optprobe_template_call\n"
			"optprobe_template_call:\n"
			ASM_NOP5
			/* Move flags to rsp */
			"	movq 18*8(%rsp), %rdx\n"
			"	movq %rdx, 19*8(%rsp)\n"
			RESTORE_REGS_STRING
			/* Skip flags entry */
			"	addq $8, %rsp\n"
			"	popfq\n"
#else /* CONFIG_X86_32 */
			"	pushl %esp\n"
			"	pushfl\n"
			".global optprobe_template_clac\n"
			"optprobe_template_clac:\n"
			ASM_NOP3
			SAVE_REGS_STRING
			"	movl %esp, %edx\n"
			".global optprobe_template_val\n"
			"optprobe_template_val:\n"
			ASM_NOP5
			".global optprobe_template_call\n"
			"optprobe_template_call:\n"
			ASM_NOP5
			/* Move flags into esp */
			"	movl 14*4(%esp), %edx\n"
			"	movl %edx, 15*4(%esp)\n"
			RESTORE_REGS_STRING
			/* Skip flags entry */
			"	addl $4, %esp\n"
			"	popfl\n"
#endif
			".global optprobe_template_end\n"
			"optprobe_template_end:\n"
			".popsection\n");

void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);

#define TMPL_CLAC_IDX \
	((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
	((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
	((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
	((long)optprobe_template_end - (long)optprobe_template_entry)

/* Optimized kprobe callback function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
	/* This is possible if op is under delayed unoptimization */
	if (kprobe_disabled(&op->kp))
		return;

	preempt_disable();
	if (kprobe_running()) {
		kprobes_inc_nmissed_count(&op->kp);
	} else {
		struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
		/* Save skipped registers */
		regs->cs = __KERNEL_CS;
#ifdef CONFIG_X86_32
		regs->cs |= get_kernel_rpl();
		regs->gs = 0;
#endif
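		/*
		 * Make the saved state look like a trap at the probe point:
		 * ip points just past a (virtual) INT3 at kp.addr and orig_ax
		 * is set to -1 as on an exception entry, roughly matching
		 * what the int3-based kprobe path would present to handlers.
		 */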
		regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
		regs->orig_ax = ~0UL;

		__this_cpu_write(current_kprobe, &op->kp);
		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
		opt_pre_handler(&op->kp, regs);
		__this_cpu_write(current_kprobe, NULL);
	}
	preempt_enable();
}
NOKPROBE_SYMBOL(optimized_callback);

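/*
 * Copy whole instructions from src until at least JMP32_INSN_SIZE bytes are
 * covered, relocating them into the out-of-line buffer. Every copied
 * instruction must be boostable (safe to execute out of line); otherwise the
 * probe cannot be jump-optimized.
 */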
static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
	struct insn insn;
	int len = 0, ret;

	while (len < JMP32_INSN_SIZE) {
		ret = __copy_instruction(dest + len, src + len, real + len, &insn);
		if (!ret || !can_boost(&insn, src + len))
			return -EINVAL;
		len += ret;
	}
	/* Check whether the address range is reserved */
	if (ftrace_text_reserved(src, src + len - 1) ||
	    alternatives_text_reserved(src, src + len - 1) ||
	    jump_label_text_reserved(src, src + len - 1))
		return -EBUSY;

	return len;
}

/* Check whether insn is an indirect jump */
static int __insn_is_indirect_jump(struct insn *insn)
{
	return ((insn->opcode.bytes[0] == 0xff &&
		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
}

/* Check whether insn jumps into the specified address range */
static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
{
	unsigned long target = 0;

	switch (insn->opcode.bytes[0]) {
	case 0xe0:	/* loopne */
	case 0xe1:	/* loope */
	case 0xe2:	/* loop */
	case 0xe3:	/* jcxz */
	case 0xe9:	/* near relative jump */
	case 0xeb:	/* short relative jump */
		break;
	case 0x0f:
		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
			break;
		return 0;
	default:
		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
			break;
		return 0;
	}
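	/*
	 * Relative branch targets are relative to the end of the instruction
	 * (next_byte): e.g. a two-byte "eb 05" short jump at addr lands at
	 * addr + 2 + 5.
	 */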
	target = (unsigned long)insn->next_byte + insn->immediate.value;

	return (start <= target && target <= start + len);
}

static int insn_is_indirect_jump(struct insn *insn)
{
	int ret = __insn_is_indirect_jump(insn);

#ifdef CONFIG_RETPOLINE
	/*
	 * A jump to x86_indirect_thunk_* is treated as an indirect jump.
	 * Note that even with CONFIG_RETPOLINE=y, a kernel compiled with an
	 * older gcc may still emit plain indirect jumps, so this check is
	 * added on top of the indirect-jump check rather than replacing it.
	 */
	if (!ret)
		ret = insn_jump_into_range(insn,
				(unsigned long)__indirect_thunk_start,
				(unsigned long)__indirect_thunk_end -
				(unsigned long)__indirect_thunk_start);
#endif
	return ret;
}

/* Decode the whole function to ensure no instruction jumps into the target */
static int can_optimize(unsigned long paddr)
{
	unsigned long addr, size = 0, offset = 0;
	struct insn insn;
	kprobe_opcode_t buf[MAX_INSN_SIZE];

	/* Look up the symbol containing addr */
	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
		return 0;

	/*
	 * Do not optimize in the entry code due to the unstable
	 * stack handling and register setup.
	 */
	if (((paddr >= (unsigned long)__entry_text_start) &&
	     (paddr <  (unsigned long)__entry_text_end)))
		return 0;

	/* Check there is enough space for a relative jump. */
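	/*
	 * JMP32_INSN_SIZE is 5 bytes ("e9" + rel32); the jump must fit
	 * entirely within this symbol, so at least 5 bytes must remain
	 * after paddr.
	 */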
	if (size - offset < JMP32_INSN_SIZE)
		return 0;

	/* Decode instructions */
	addr = paddr - offset;
	while (addr < paddr - offset + size) { /* Decode until function end */
		unsigned long recovered_insn;
		if (search_exception_tables(addr))
			/*
			 * Since some fixup code may jump into this function,
			 * we can't optimize kprobes in it.
			 */
			return 0;
		recovered_insn = recover_probed_instruction(buf, addr);
		if (!recovered_insn)
			return 0;
		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
		insn_get_length(&insn);
		/* Another subsystem has put a breakpoint there */
		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
			return 0;
		/* Recover address */
		insn.kaddr = (void *)addr;
		insn.next_byte = (void *)(addr + insn.length);
		/* Check that no instruction jumps into the target */
		if (insn_is_indirect_jump(&insn) ||
		    insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
					 DISP32_SIZE))
			return 0;
		addr += insn.length;
	}

	return 1;
}

/* Check whether the optimized_kprobe can actually be optimized. */
int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
	int i;
	struct kprobe *p;

	for (i = 1; i < op->optinsn.size; i++) {
		p = get_kprobe(op->kp.addr + i);
		if (p && !kprobe_disabled(p))
			return -EEXIST;
	}

	return 0;
}

/* Check whether addr is within the optimized instructions. */
int arch_within_optimized_kprobe(struct optimized_kprobe *op,
				 unsigned long addr)
{
	return ((unsigned long)op->kp.addr <= addr &&
		(unsigned long)op->kp.addr + op->optinsn.size > addr);
}

/* Free optimized instruction slot */
static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
	u8 *slot = op->optinsn.insn;
	if (slot) {
		int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;

		/* Record the perf event before freeing the slot */
		if (dirty)
			perf_event_text_poke(slot, slot, len, NULL, 0);

		free_optinsn_slot(slot, dirty);
		op->optinsn.insn = NULL;
		op->optinsn.size = 0;
	}
}

void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
	__arch_remove_optimized_kprobe(op, 1);
}

/*
 * Copy the target instructions that will be replaced by the jump.
 * The target instructions MUST be relocatable (checked inside).
 * This is called when a new aggr(opt)probe is allocated or reused.
 */
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
				  struct kprobe *__unused)
{
	u8 *buf = NULL, *slot;
	int ret, len;
	long rel;

	if (!can_optimize((unsigned long)op->kp.addr))
		return -EILSEQ;

	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	op->optinsn.insn = slot = get_optinsn_slot();
	if (!slot) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Verify that the address gap is within the 2GB range, because this
	 * uses a relative jump.
	 */
	rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
	if (abs(rel) > 0x7fffffff) {
		ret = -ERANGE;
		goto err;
	}
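	/*
	 * i.e. the out-of-line slot must be within roughly +/-2GB of the
	 * probe address, so that the 32-bit displacements of the JMP.d32
	 * written at the probe site and of the slot's return jump can span
	 * the gap.
	 */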

	/* Copy arch-dep-instance from template */
	memcpy(buf, optprobe_template_entry, TMPL_END_IDX);

	/* Copy instructions into the out-of-line buffer */
	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
					  slot + TMPL_END_IDX);
	if (ret < 0)
		goto err;
	op->optinsn.size = ret;
	len = TMPL_END_IDX + op->optinsn.size;

	synthesize_clac(buf + TMPL_CLAC_IDX);

	/* Set probe information */
	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);

	/* Set probe function call */
	synthesize_relcall(buf + TMPL_CALL_IDX,
			   slot + TMPL_CALL_IDX, optimized_callback);

	/* Set returning jmp instruction at the tail of out-of-line buffer */
	synthesize_reljump(buf + len, slot + len,
			   (u8 *)op->kp.addr + op->optinsn.size);
	len += JMP32_INSN_SIZE;

	/*
	 * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
	 * used in __arch_remove_optimized_kprobe().
	 */

	/* We have to use text_poke() for instruction buffer because it is RO */
	perf_event_text_poke(slot, NULL, 0, buf, len);
	text_poke(slot, buf, len);

	ret = 0;
out:
	kfree(buf);
	return ret;

err:
	__arch_remove_optimized_kprobe(op, 0);
	goto out;
}

/*
 * Replace breakpoints (INT3) with relative jumps (JMP.d32).
 * The caller must hold kprobe_mutex and text_mutex.
 *
 * The caller will have installed a regular kprobe and after that issued
 * synchronize_rcu_tasks(); this ensures that the instruction(s) that live in
 * the 4 bytes after the INT3 are unused and can now be overwritten.
 */
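/*
 * For example, after optimization the probe address contains
 *   e9 xx xx xx xx		jmp	op->optinsn.insn
 * where the rel32 computed below is optinsn.insn - (kp.addr + JMP32_INSN_SIZE).
 */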
void arch_optimize_kprobes(struct list_head *oplist)
{
	struct optimized_kprobe *op, *tmp;
	u8 insn_buff[JMP32_INSN_SIZE];

	list_for_each_entry_safe(op, tmp, oplist, list) {
		s32 rel = (s32)((long)op->optinsn.insn -
			((long)op->kp.addr + JMP32_INSN_SIZE));

		WARN_ON(kprobe_disabled(&op->kp));

		/* Back up the instructions that will be replaced by the jump address */
		memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
		       DISP32_SIZE);

		insn_buff[0] = JMP32_INSN_OPCODE;
		*(s32 *)(&insn_buff[1]) = rel;

		text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);

		list_del_init(&op->list);
	}
}

/*
 * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
 *
 * After that, we can restore the 4 bytes after the INT3 to undo what
 * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
 * unused once the INT3 lands.
 */
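/*
 * The restore below is done in two text_poke() steps: the opcode byte is
 * first turned back into INT3 (so concurrent execution traps into the
 * regular kprobe path), and only then are the 4 displacement bytes restored
 * from copied_insn.
 */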
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
	u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
	u8 old[JMP32_INSN_SIZE];
	u8 *addr = op->kp.addr;

	memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
	memcpy(new + INT3_INSN_SIZE,
	       op->optinsn.copied_insn,
	       JMP32_INSN_SIZE - INT3_INSN_SIZE);

	text_poke(addr, new, INT3_INSN_SIZE);
	text_poke_sync();
	text_poke(addr + INT3_INSN_SIZE,
		  new + INT3_INSN_SIZE,
		  JMP32_INSN_SIZE - INT3_INSN_SIZE);
	text_poke_sync();

	perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
}

/*
 * Recover original instructions and breakpoints from relative jumps.
 * The caller must hold kprobe_mutex.
 */
extern void arch_unoptimize_kprobes(struct list_head *oplist,
				    struct list_head *done_list)
{
	struct optimized_kprobe *op, *tmp;

	list_for_each_entry_safe(op, tmp, oplist, list) {
		arch_unoptimize_kprobe(op);
		list_move(&op->list, done_list);
	}
}

int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
	struct optimized_kprobe *op;

	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
		/* This kprobe can really run the optimized path. */
		op = container_of(p, struct optimized_kprobe, kp);
		/* Detour through copied instructions */
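		/*
		 * optinsn.insn + TMPL_END_IDX is where the relocated copy of
		 * the original instruction(s) starts; the slot's trailing
		 * jump then resumes execution after the probed region.
		 */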
		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
		if (!reenter)
			reset_current_kprobe();
		return 1;
	}
	return 0;
}
NOKPROBE_SYMBOL(setup_detour_execution);