linux/arch/x86/kernel/kprobes/core.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *  Kernel Probes (KProbes)
   4 *
   5 * Copyright (C) IBM Corporation, 2002, 2004
   6 *
   7 * 2002-Oct     Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
   8 *              Probes initial implementation ( includes contributions from
   9 *              Rusty Russell).
  10 * 2004-July    Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  11 *              interface to access function arguments.
  12 * 2004-Oct     Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  13 *              <prasanna@in.ibm.com> adapted for x86_64 from i386.
  14 * 2005-Mar     Roland McGrath <roland@redhat.com>
  15 *              Fixed to handle %rip-relative addressing mode correctly.
  16 * 2005-May     Hien Nguyen <hien@us.ibm.com>, Jim Keniston
  17 *              <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  18 *              <prasanna@in.ibm.com> added function-return probes.
  19 * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
  20 *              Added function return probes functionality
  21 * 2006-Feb     Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
  22 *              kprobe-booster and kretprobe-booster for i386.
  23 * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
  24 *              and kretprobe-booster for x86-64
  25 * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
  26 *              <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
  27 *              unified x86 kprobes code.
  28 */
  29#include <linux/kprobes.h>
  30#include <linux/ptrace.h>
  31#include <linux/string.h>
  32#include <linux/slab.h>
  33#include <linux/hardirq.h>
  34#include <linux/preempt.h>
  35#include <linux/sched/debug.h>
  36#include <linux/perf_event.h>
  37#include <linux/extable.h>
  38#include <linux/kdebug.h>
  39#include <linux/kallsyms.h>
  40#include <linux/ftrace.h>
  41#include <linux/kasan.h>
  42#include <linux/moduleloader.h>
  43#include <linux/objtool.h>
  44#include <linux/vmalloc.h>
  45#include <linux/pgtable.h>
  46
  47#include <asm/text-patching.h>
  48#include <asm/cacheflush.h>
  49#include <asm/desc.h>
  50#include <linux/uaccess.h>
  51#include <asm/alternative.h>
  52#include <asm/insn.h>
  53#include <asm/debugreg.h>
  54#include <asm/set_memory.h>
  55
  56#include "common.h"
  57
  58DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
  59DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
  60
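     /* Top of the probed context's stack; at function entry this slot holds the return address. */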
  61#define stack_addr(regs) ((unsigned long *)regs->sp)
  62
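     /*
      * Each W() row packs 16 one-bit "boostable" flags (one per low opcode
      * nibble) and shifts them into position, so that two adjacent rows
      * share one 32-bit word of the bitmap below.
      */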
  63#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
  64        (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
  65          (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
  66          (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
  67          (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
  68         << (row % 32))
  69        /*
   70         * Undefined/reserved opcodes, conditional jumps, Opcode Extension
   71         * Groups, and some special opcodes cannot be boosted.
  72         * This is non-const and volatile to keep gcc from statically
  73         * optimizing it out, as variable_test_bit makes gcc think only
  74         * *(unsigned long*) is used.
  75         */
  76static volatile u32 twobyte_is_boostable[256 / 32] = {
  77        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
  78        /*      ----------------------------------------------          */
  79        W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
  80        W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
  81        W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
  82        W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
  83        W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
  84        W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
  85        W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
  86        W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
  87        W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
  88        W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
  89        W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
  90        W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
  91        W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
  92        W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
  93        W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
  94        W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
  95        /*      -----------------------------------------------         */
  96        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
  97};
  98#undef W
  99
 100struct kretprobe_blackpoint kretprobe_blacklist[] = {
  101        {"__switch_to", }, /* This function only switches the current task, but
  102                              does not switch the kernel stack. */
 103        {NULL, NULL}    /* Terminator */
 104};
 105
 106const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 107
 108static nokprobe_inline void
 109__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
 110{
 111        struct __arch_relative_insn {
 112                u8 op;
 113                s32 raddr;
 114        } __packed *insn;
 115
 116        insn = (struct __arch_relative_insn *)dest;
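             /*
              * The displacement is relative to the end of the 5-byte
              * (opcode + rel32) instruction, as if it were placed at 'from'.
              */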
 117        insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
 118        insn->op = op;
 119}
 120
  121/* Write at 'dest' a jump instruction which, if placed at 'from', jumps to 'to'. */
 122void synthesize_reljump(void *dest, void *from, void *to)
 123{
 124        __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
 125}
 126NOKPROBE_SYMBOL(synthesize_reljump);
 127
  128/* Write at 'dest' a call instruction which, if placed at 'from', calls 'to'. */
 129void synthesize_relcall(void *dest, void *from, void *to)
 130{
 131        __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
 132}
 133NOKPROBE_SYMBOL(synthesize_relcall);
 134
 135/*
 136 * Returns non-zero if INSN is boostable.
  137 * RIP-relative instructions are adjusted at copy time in 64-bit mode.
 138 */
 139int can_boost(struct insn *insn, void *addr)
 140{
 141        kprobe_opcode_t opcode;
 142        insn_byte_t prefix;
 143        int i;
 144
 145        if (search_exception_tables((unsigned long)addr))
 146                return 0;       /* Page fault may occur on this address. */
 147
 148        /* 2nd-byte opcode */
 149        if (insn->opcode.nbytes == 2)
 150                return test_bit(insn->opcode.bytes[1],
 151                                (unsigned long *)twobyte_is_boostable);
 152
 153        if (insn->opcode.nbytes != 1)
 154                return 0;
 155
 156        for_each_insn_prefix(insn, i, prefix) {
 157                insn_attr_t attr;
 158
 159                attr = inat_get_opcode_attribute(prefix);
  160                /* Can't boost instructions with an address-size override or CS override prefix */
 161                if (prefix == 0x2e || inat_is_address_size_prefix(attr))
 162                        return 0;
 163        }
 164
 165        opcode = insn->opcode.bytes[0];
 166
 167        switch (opcode) {
 168        case 0x62:              /* bound */
 169        case 0x70 ... 0x7f:     /* Conditional jumps */
 170        case 0x9a:              /* Call far */
 171        case 0xc0 ... 0xc1:     /* Grp2 */
 172        case 0xcc ... 0xce:     /* software exceptions */
 173        case 0xd0 ... 0xd3:     /* Grp2 */
 174        case 0xd6:              /* (UD) */
 175        case 0xd8 ... 0xdf:     /* ESC */
 176        case 0xe0 ... 0xe3:     /* LOOP*, JCXZ */
 177        case 0xe8 ... 0xe9:     /* near Call, JMP */
 178        case 0xeb:              /* Short JMP */
 179        case 0xf0 ... 0xf4:     /* LOCK/REP, HLT */
 180        case 0xf6 ... 0xf7:     /* Grp3 */
 181        case 0xfe:              /* Grp4 */
 182                /* ... are not boostable */
 183                return 0;
 184        case 0xff:              /* Grp5 */
 185                /* Only indirect jmp is boostable */
 186                return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
 187        default:
 188                return 1;
 189        }
 190}
 191
 192static unsigned long
 193__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 194{
 195        struct kprobe *kp;
 196        unsigned long faddr;
 197
 198        kp = get_kprobe((void *)addr);
 199        faddr = ftrace_location(addr);
 200        /*
 201         * Addresses inside the ftrace location are refused by
 202         * arch_check_ftrace_location(). Something went terribly wrong
 203         * if such an address is checked here.
 204         */
 205        if (WARN_ON(faddr && faddr != addr))
 206                return 0UL;
 207        /*
 208         * Use the current code if it is not modified by Kprobe
 209         * and it cannot be modified by ftrace.
 210         */
 211        if (!kp && !faddr)
 212                return addr;
 213
 214        /*
  215         * Basically, kp->ainsn.insn holds a copy of the original instruction.
  216         * However, a RIP-relative instruction cannot be single-stepped at a
  217         * different location, so __copy_instruction() tweaks its displacement
  218         * in that copy. In that case, we cannot recover the original
  219         * instruction from kp->ainsn.insn.
  220         *
  221         * On the other hand, for a normal Kprobe, kp->opcode holds a copy of
  222         * the first byte of the probed instruction, which is the byte
  223         * overwritten by int3. Since kprobes does not modify the instruction
  224         * at kp->addr beyond that first byte, we can recover the original
  225         * instruction from kp->addr and kp->opcode.
  226         *
  227         * For Kprobes using ftrace, we do not have a copy of
  228         * the original instruction. In fact, the ftrace location might
  229         * be modified at any time and could even be in an inconsistent state.
  230         * Fortunately, we know that the original code is the ideal 5-byte
  231         * long NOP.
 232         */
 233        if (copy_from_kernel_nofault(buf, (void *)addr,
 234                MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
 235                return 0UL;
 236
 237        if (faddr)
 238                memcpy(buf, x86_nops[5], 5);
 239        else
 240                buf[0] = kp->opcode;
 241        return (unsigned long)buf;
 242}
 243
 244/*
 245 * Recover the probed instruction at addr for further analysis.
  246 * Caller must lock kprobes via kprobe_mutex, or disable preemption,
  247 * to prevent the referenced kprobes from being released.
  248 * Returns zero if the instruction cannot be recovered (or access failed).
 249 */
 250unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 251{
 252        unsigned long __addr;
 253
 254        __addr = __recover_optprobed_insn(buf, addr);
 255        if (__addr != addr)
 256                return __addr;
 257
 258        return __recover_probed_insn(buf, addr);
 259}
 260
 261/* Check if paddr is at an instruction boundary */
 262static int can_probe(unsigned long paddr)
 263{
 264        unsigned long addr, __addr, offset = 0;
 265        struct insn insn;
 266        kprobe_opcode_t buf[MAX_INSN_SIZE];
 267
 268        if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
 269                return 0;
 270
 271        /* Decode instructions */
 272        addr = paddr - offset;
 273        while (addr < paddr) {
 274                int ret;
 275
 276                /*
  277                 * Check if the instruction has been modified by another
  278                 * kprobe, in which case we replace the breakpoint with the
  279                 * original instruction in our buffer.
  280                 * Also, jump optimization will change the breakpoint to a
  281                 * relative jump. Since relative jumps also occur in normal
  282                 * code, we simply move on if no kprobe is registered there.
 283                 */
 284                __addr = recover_probed_instruction(buf, addr);
 285                if (!__addr)
 286                        return 0;
 287
 288                ret = insn_decode_kernel(&insn, (void *)__addr);
 289                if (ret < 0)
 290                        return 0;
 291
 292                /*
 293                 * Another debugging subsystem might insert this breakpoint.
 294                 * In that case, we can't recover it.
 295                 */
 296                if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
 297                        return 0;
 298                addr += insn.length;
 299        }
 300
 301        return (addr == paddr);
 302}
 303
 304/*
  305 * Copy an instruction, recovering it if it has been modified by kprobes,
  306 * and adjust the displacement if the instruction uses the %rip-relative
  307 * addressing mode. Note that since @real will be the final location of the
  308 * copied instruction, the displacement must be adjusted against @real, not @dest.
  309 * This returns the length of the copied instruction, or 0 on error.
 310 */
 311int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
 312{
 313        kprobe_opcode_t buf[MAX_INSN_SIZE];
 314        unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
 315        int ret;
 316
 317        if (!recovered_insn || !insn)
 318                return 0;
 319
  320        /* This may access kernel text directly if the given address was not recovered */
 321        if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
 322                        MAX_INSN_SIZE))
 323                return 0;
 324
 325        ret = insn_decode_kernel(insn, dest);
 326        if (ret < 0)
 327                return 0;
 328
  329        /* We cannot probe instructions that carry an emulate prefix (forced emulation) */
 330        if (insn_has_emulate_prefix(insn))
 331                return 0;
 332
  333        /* Another subsystem has put a breakpoint here; we failed to recover the original insn */
 334        if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
 335                return 0;
 336
  337        /* We should not single-step exception-masking instructions */
 338        if (insn_masking_exception(insn))
 339                return 0;
 340
 341#ifdef CONFIG_X86_64
 342        /* Only x86_64 has RIP relative instructions */
 343        if (insn_rip_relative(insn)) {
 344                s64 newdisp;
 345                u8 *disp;
 346                /*
 347                 * The copied instruction uses the %rip-relative addressing
 348                 * mode.  Adjust the displacement for the difference between
 349                 * the original location of this instruction and the location
 350                 * of the copy that will actually be run.  The tricky bit here
 351                 * is making sure that the sign extension happens correctly in
 352                 * this calculation, since we need a signed 32-bit result to
 353                 * be sign-extended to 64 bits when it's added to the %rip
 354                 * value and yield the same 64-bit result that the sign-
 355                 * extension of the original signed 32-bit displacement would
 356                 * have given.
 357                 */
 358                newdisp = (u8 *) src + (s64) insn->displacement.value
 359                          - (u8 *) real;
 360                if ((s64) (s32) newdisp != newdisp) {
 361                        pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
 362                        return 0;
 363                }
 364                disp = (u8 *) dest + insn_offset_displacement(insn);
 365                *(s32 *) disp = (s32) newdisp;
 366        }
 367#endif
 368        return insn->length;
 369}
 370
 371/* Prepare reljump or int3 right after instruction */
 372static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
 373                              struct insn *insn)
 374{
 375        int len = insn->length;
 376
 377        if (!IS_ENABLED(CONFIG_PREEMPTION) &&
 378            !p->post_handler && can_boost(insn, p->addr) &&
 379            MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
 380                /*
  381                 * This instruction can be executed directly if it
  382                 * jumps back to the correct address.
 383                 */
 384                synthesize_reljump(buf + len, p->ainsn.insn + len,
 385                                   p->addr + insn->length);
 386                len += JMP32_INSN_SIZE;
 387                p->ainsn.boostable = 1;
 388        } else {
 389                /* Otherwise, put an int3 for trapping singlestep */
 390                if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
 391                        return -ENOSPC;
 392
 393                buf[len] = INT3_INSN_OPCODE;
 394                len += INT3_INSN_SIZE;
 395        }
 396
 397        return len;
 398}
 399
  400/* Make the page read-only when allocating it */
 401void *alloc_insn_page(void)
 402{
 403        void *page;
 404
 405        page = module_alloc(PAGE_SIZE);
 406        if (!page)
 407                return NULL;
 408
 409        set_vm_flush_reset_perms(page);
 410        /*
 411         * First make the page read-only, and only then make it executable to
 412         * prevent it from being W+X in between.
 413         */
 414        set_memory_ro((unsigned long)page, 1);
 415
 416        /*
 417         * TODO: Once additional kernel code protection mechanisms are set, ensure
 418         * that the page was not maliciously altered and it is still zeroed.
 419         */
 420        set_memory_x((unsigned long)page, 1);
 421
 422        return page;
 423}
 424
 425/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
 426
 427static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
 428{
 429        switch (p->ainsn.opcode) {
 430        case 0xfa:      /* cli */
 431                regs->flags &= ~(X86_EFLAGS_IF);
 432                break;
 433        case 0xfb:      /* sti */
 434                regs->flags |= X86_EFLAGS_IF;
 435                break;
 436        case 0x9c:      /* pushf */
 437                int3_emulate_push(regs, regs->flags);
 438                break;
 439        case 0x9d:      /* popf */
 440                regs->flags = int3_emulate_pop(regs);
 441                break;
 442        }
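             /* regs->ip points just past the int3; advance it past the original instruction. */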
 443        regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 444}
 445NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
 446
 447static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
 448{
 449        int3_emulate_ret(regs);
 450}
 451NOKPROBE_SYMBOL(kprobe_emulate_ret);
 452
 453static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
 454{
 455        unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 456
 457        func += p->ainsn.rel32;
 458        int3_emulate_call(regs, func);
 459}
 460NOKPROBE_SYMBOL(kprobe_emulate_call);
 461
 462static nokprobe_inline
 463void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
 464{
 465        unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
 466
 467        if (cond)
 468                ip += p->ainsn.rel32;
 469        int3_emulate_jmp(regs, ip);
 470}
 471
 472static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
 473{
 474        __kprobe_emulate_jmp(p, regs, true);
 475}
 476NOKPROBE_SYMBOL(kprobe_emulate_jmp);
 477
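     /*
      * EFLAGS bits tested by Jcc condition codes 0x0-0xb, indexed by (type >> 1);
      * odd condition codes invert the test.  Condition codes 0xc-0xf (JL/JGE/
      * JLE/JG) are computed from SF, OF and ZF in kprobe_emulate_jcc() below.
      */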
 478static const unsigned long jcc_mask[6] = {
 479        [0] = X86_EFLAGS_OF,
 480        [1] = X86_EFLAGS_CF,
 481        [2] = X86_EFLAGS_ZF,
 482        [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
 483        [4] = X86_EFLAGS_SF,
 484        [5] = X86_EFLAGS_PF,
 485};
 486
 487static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
 488{
 489        bool invert = p->ainsn.jcc.type & 1;
 490        bool match;
 491
 492        if (p->ainsn.jcc.type < 0xc) {
 493                match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
 494        } else {
 495                match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
 496                        ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
 497                if (p->ainsn.jcc.type >= 0xe)
 498                        match = match && (regs->flags & X86_EFLAGS_ZF);
 499        }
 500        __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
 501}
 502NOKPROBE_SYMBOL(kprobe_emulate_jcc);
 503
 504static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
 505{
 506        bool match;
 507
 508        if (p->ainsn.loop.type != 3) {  /* LOOP* */
 509                if (p->ainsn.loop.asize == 32)
 510                        match = ((*(u32 *)&regs->cx)--) != 0;
 511#ifdef CONFIG_X86_64
 512                else if (p->ainsn.loop.asize == 64)
 513                        match = ((*(u64 *)&regs->cx)--) != 0;
 514#endif
 515                else
 516                        match = ((*(u16 *)&regs->cx)--) != 0;
 517        } else {                        /* JCXZ */
 518                if (p->ainsn.loop.asize == 32)
 519                        match = *(u32 *)(&regs->cx) == 0;
 520#ifdef CONFIG_X86_64
 521                else if (p->ainsn.loop.asize == 64)
 522                        match = *(u64 *)(&regs->cx) == 0;
 523#endif
 524                else
 525                        match = *(u16 *)(&regs->cx) == 0;
 526        }
 527
 528        if (p->ainsn.loop.type == 0)    /* LOOPNE */
 529                match = match && !(regs->flags & X86_EFLAGS_ZF);
 530        else if (p->ainsn.loop.type == 1)       /* LOOPE */
 531                match = match && (regs->flags & X86_EFLAGS_ZF);
 532
 533        __kprobe_emulate_jmp(p, regs, match);
 534}
 535NOKPROBE_SYMBOL(kprobe_emulate_loop);
 536
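     /* pt_regs offsets indexed by the ModRM r/m register number (REX.B adds 8 on x86-64). */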
 537static const int addrmode_regoffs[] = {
 538        offsetof(struct pt_regs, ax),
 539        offsetof(struct pt_regs, cx),
 540        offsetof(struct pt_regs, dx),
 541        offsetof(struct pt_regs, bx),
 542        offsetof(struct pt_regs, sp),
 543        offsetof(struct pt_regs, bp),
 544        offsetof(struct pt_regs, si),
 545        offsetof(struct pt_regs, di),
 546#ifdef CONFIG_X86_64
 547        offsetof(struct pt_regs, r8),
 548        offsetof(struct pt_regs, r9),
 549        offsetof(struct pt_regs, r10),
 550        offsetof(struct pt_regs, r11),
 551        offsetof(struct pt_regs, r12),
 552        offsetof(struct pt_regs, r13),
 553        offsetof(struct pt_regs, r14),
 554        offsetof(struct pt_regs, r15),
 555#endif
 556};
 557
 558static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
 559{
 560        unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
 561
 562        int3_emulate_call(regs, regs_get_register(regs, offs));
 563}
 564NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
 565
 566static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
 567{
 568        unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
 569
 570        int3_emulate_jmp(regs, regs_get_register(regs, offs));
 571}
 572NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
 573
 574static int prepare_emulation(struct kprobe *p, struct insn *insn)
 575{
 576        insn_byte_t opcode = insn->opcode.bytes[0];
 577
 578        switch (opcode) {
 579        case 0xfa:              /* cli */
 580        case 0xfb:              /* sti */
 581        case 0x9c:              /* pushfl */
 582        case 0x9d:              /* popf/popfd */
 583                /*
  584                 * IF modifiers must be emulated, since executing them while
  585                 * int3 single-stepping would enable interrupts.
 586                 */
 587                p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
 588                p->ainsn.opcode = opcode;
 589                break;
 590        case 0xc2:      /* ret/lret */
 591        case 0xc3:
 592        case 0xca:
 593        case 0xcb:
 594                p->ainsn.emulate_op = kprobe_emulate_ret;
 595                break;
 596        case 0x9a:      /* far call absolute -- segment is not supported */
 597        case 0xea:      /* far jmp absolute -- segment is not supported */
 598        case 0xcc:      /* int3 */
 599        case 0xcf:      /* iret -- in-kernel IRET is not supported */
 600                return -EOPNOTSUPP;
 601                break;
 602        case 0xe8:      /* near call relative */
 603                p->ainsn.emulate_op = kprobe_emulate_call;
 604                if (insn->immediate.nbytes == 2)
 605                        p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
 606                else
 607                        p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
 608                break;
 609        case 0xeb:      /* short jump relative */
 610        case 0xe9:      /* near jump relative */
 611                p->ainsn.emulate_op = kprobe_emulate_jmp;
 612                if (insn->immediate.nbytes == 1)
 613                        p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
 614                else if (insn->immediate.nbytes == 2)
 615                        p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
 616                else
 617                        p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
 618                break;
 619        case 0x70 ... 0x7f:
 620                /* 1 byte conditional jump */
 621                p->ainsn.emulate_op = kprobe_emulate_jcc;
 622                p->ainsn.jcc.type = opcode & 0xf;
 623                p->ainsn.rel32 = *(char *)insn->immediate.bytes;
 624                break;
 625        case 0x0f:
 626                opcode = insn->opcode.bytes[1];
 627                if ((opcode & 0xf0) == 0x80) {
 628                        /* 2 bytes Conditional Jump */
 629                        p->ainsn.emulate_op = kprobe_emulate_jcc;
 630                        p->ainsn.jcc.type = opcode & 0xf;
 631                        if (insn->immediate.nbytes == 2)
 632                                p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
 633                        else
 634                                p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
 635                } else if (opcode == 0x01 &&
 636                           X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
 637                           X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
 638                        /* VM extensions - not supported */
 639                        return -EOPNOTSUPP;
 640                }
 641                break;
 642        case 0xe0:      /* Loop NZ */
 643        case 0xe1:      /* Loop */
 644        case 0xe2:      /* Loop */
 645        case 0xe3:      /* J*CXZ */
 646                p->ainsn.emulate_op = kprobe_emulate_loop;
 647                p->ainsn.loop.type = opcode & 0x3;
 648                p->ainsn.loop.asize = insn->addr_bytes * 8;
 649                p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
 650                break;
 651        case 0xff:
 652                /*
  653                 * Since 0xff is an extended group opcode, the actual instruction
 654                 * is determined by the MOD/RM byte.
 655                 */
 656                opcode = insn->modrm.bytes[0];
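                     /* ModRM reg field (bits 5:3) selects the Grp5 op: 2/3 = call near/far, 4/5 = jmp near/far. */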
 657                if ((opcode & 0x30) == 0x10) {
 658                        if ((opcode & 0x8) == 0x8)
 659                                return -EOPNOTSUPP;     /* far call */
 660                        /* call absolute, indirect */
 661                        p->ainsn.emulate_op = kprobe_emulate_call_indirect;
 662                } else if ((opcode & 0x30) == 0x20) {
 663                        if ((opcode & 0x8) == 0x8)
 664                                return -EOPNOTSUPP;     /* far jmp */
 665                        /* jmp near absolute indirect */
 666                        p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
 667                } else
 668                        break;
 669
 670                if (insn->addr_bytes != sizeof(unsigned long))
 671                        return -EOPNOTSUPP;     /* Don't support different size */
 672                if (X86_MODRM_MOD(opcode) != 3)
 673                        return -EOPNOTSUPP;     /* TODO: support memory addressing */
 674
 675                p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
 676#ifdef CONFIG_X86_64
 677                if (X86_REX_B(insn->rex_prefix.value))
 678                        p->ainsn.indirect.reg += 8;
 679#endif
 680                break;
 681        default:
 682                break;
 683        }
 684        p->ainsn.size = insn->length;
 685
 686        return 0;
 687}
 688
 689static int arch_copy_kprobe(struct kprobe *p)
 690{
 691        struct insn insn;
 692        kprobe_opcode_t buf[MAX_INSN_SIZE];
 693        int ret, len;
 694
  695        /* Copy the instruction, recovering it if another kprobe/optprobe has modified it. */
 696        len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
 697        if (!len)
 698                return -EINVAL;
 699
 700        /* Analyze the opcode and setup emulate functions */
 701        ret = prepare_emulation(p, &insn);
 702        if (ret < 0)
 703                return ret;
 704
 705        /* Add int3 for single-step or booster jmp */
 706        len = prepare_singlestep(buf, p, &insn);
 707        if (len < 0)
 708                return len;
 709
 710        /* Also, displacement change doesn't affect the first byte */
 711        p->opcode = buf[0];
 712
 713        p->ainsn.tp_len = len;
 714        perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
 715
 716        /* OK, write back the instruction(s) into ROX insn buffer */
 717        text_poke(p->ainsn.insn, buf, len);
 718
 719        return 0;
 720}
 721
 722int arch_prepare_kprobe(struct kprobe *p)
 723{
 724        int ret;
 725
 726        if (alternatives_text_reserved(p->addr, p->addr))
 727                return -EINVAL;
 728
 729        if (!can_probe((unsigned long)p->addr))
 730                return -EILSEQ;
 731
 732        memset(&p->ainsn, 0, sizeof(p->ainsn));
 733
 734        /* insn: must be on special executable page on x86. */
 735        p->ainsn.insn = get_insn_slot();
 736        if (!p->ainsn.insn)
 737                return -ENOMEM;
 738
 739        ret = arch_copy_kprobe(p);
 740        if (ret) {
 741                free_insn_slot(p->ainsn.insn, 0);
 742                p->ainsn.insn = NULL;
 743        }
 744
 745        return ret;
 746}
 747
 748void arch_arm_kprobe(struct kprobe *p)
 749{
 750        u8 int3 = INT3_INSN_OPCODE;
 751
 752        text_poke(p->addr, &int3, 1);
 753        text_poke_sync();
 754        perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
 755}
 756
 757void arch_disarm_kprobe(struct kprobe *p)
 758{
 759        u8 int3 = INT3_INSN_OPCODE;
 760
 761        perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
 762        text_poke(p->addr, &p->opcode, 1);
 763        text_poke_sync();
 764}
 765
 766void arch_remove_kprobe(struct kprobe *p)
 767{
 768        if (p->ainsn.insn) {
 769                /* Record the perf event before freeing the slot */
 770                perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
 771                                     p->ainsn.tp_len, NULL, 0);
 772                free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
 773                p->ainsn.insn = NULL;
 774        }
 775}
 776
 777static nokprobe_inline void
 778save_previous_kprobe(struct kprobe_ctlblk *kcb)
 779{
 780        kcb->prev_kprobe.kp = kprobe_running();
 781        kcb->prev_kprobe.status = kcb->kprobe_status;
 782        kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
 783        kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
 784}
 785
 786static nokprobe_inline void
 787restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 788{
 789        __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 790        kcb->kprobe_status = kcb->prev_kprobe.status;
 791        kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
 792        kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
 793}
 794
 795static nokprobe_inline void
 796set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 797                   struct kprobe_ctlblk *kcb)
 798{
 799        __this_cpu_write(current_kprobe, p);
 800        kcb->kprobe_saved_flags = kcb->kprobe_old_flags
 801                = (regs->flags & X86_EFLAGS_IF);
 802}
 803
 804void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 805{
 806        unsigned long *sara = stack_addr(regs);
 807
 808        ri->ret_addr = (kprobe_opcode_t *) *sara;
 809        ri->fp = sara;
 810
 811        /* Replace the return addr with trampoline addr */
 812        *sara = (unsigned long) &kretprobe_trampoline;
 813}
 814NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 815
 816static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
 817                               struct kprobe_ctlblk *kcb)
 818{
 819        if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
 820                kcb->kprobe_status = KPROBE_HIT_SSDONE;
 821                cur->post_handler(cur, regs, 0);
 822        }
 823
  824        /* Restore the original saved kprobe variables and continue. */
 825        if (kcb->kprobe_status == KPROBE_REENTER)
 826                restore_previous_kprobe(kcb);
 827        else
 828                reset_current_kprobe();
 829}
 830NOKPROBE_SYMBOL(kprobe_post_process);
 831
 832static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
 833                             struct kprobe_ctlblk *kcb, int reenter)
 834{
 835        if (setup_detour_execution(p, regs, reenter))
 836                return;
 837
 838#if !defined(CONFIG_PREEMPTION)
 839        if (p->ainsn.boostable) {
 840                /* Boost up -- we can execute copied instructions directly */
 841                if (!reenter)
 842                        reset_current_kprobe();
 843                /*
  844                 * Reentering a boosted probe neither resets nor sets
  845                 * current_kprobe, because boosted execution doesn't use
  846                 * single-stepping.
 847                 */
 848                regs->ip = (unsigned long)p->ainsn.insn;
 849                return;
 850        }
 851#endif
 852        if (reenter) {
 853                save_previous_kprobe(kcb);
 854                set_current_kprobe(p, regs, kcb);
 855                kcb->kprobe_status = KPROBE_REENTER;
 856        } else
 857                kcb->kprobe_status = KPROBE_HIT_SS;
 858
 859        if (p->ainsn.emulate_op) {
 860                p->ainsn.emulate_op(p, regs);
 861                kprobe_post_process(p, regs, kcb);
 862                return;
 863        }
 864
  865        /* Disable interrupts, and point the ip register at the copied instruction */
 866        regs->flags &= ~X86_EFLAGS_IF;
 867        regs->ip = (unsigned long)p->ainsn.insn;
 868}
 869NOKPROBE_SYMBOL(setup_singlestep);
 870
 871/*
 872 * Called after single-stepping.  p->addr is the address of the
 873 * instruction whose first byte has been replaced by the "int3"
 874 * instruction.  To avoid the SMP problems that can occur when we
 875 * temporarily put back the original opcode to single-step, we
 876 * single-stepped a copy of the instruction.  The address of this
  877 * copy is p->ainsn.insn.  We also don't use the trap flag; instead, another
  878 * "int3" is placed right after the copied instruction.
  879 * Unlike trap-based single-stepping, "int3" single-stepping cannot
  880 * handle instructions that change the ip register (e.g. jmp, call,
  881 * conditional jmp) nor instructions that change the IF flag,
  882 * because interrupts must stay disabled around the single-stepping.
  883 * Such instructions are emulated in software; the others are single-stepped
  884 * using "int3".
  885 *
  886 * When the second "int3" is handled, regs->ip and regs->flags need to
  887 * be adjusted so that we can resume execution at the correct code.
 888 */
 889static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
 890                              struct kprobe_ctlblk *kcb)
 891{
 892        unsigned long copy_ip = (unsigned long)p->ainsn.insn;
 893        unsigned long orig_ip = (unsigned long)p->addr;
 894
 895        /* Restore saved interrupt flag and ip register */
 896        regs->flags |= kcb->kprobe_saved_flags;
  897        /* regs->ip points just past the executed int3, so step back over it */
 898        regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
 899}
 900NOKPROBE_SYMBOL(resume_singlestep);
 901
 902/*
 903 * We have reentered the kprobe_handler(), since another probe was hit while
 904 * within the handler. We save the original kprobes variables and just single
 905 * step on the instruction of the new probe without calling any user handlers.
 906 */
 907static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 908                          struct kprobe_ctlblk *kcb)
 909{
 910        switch (kcb->kprobe_status) {
 911        case KPROBE_HIT_SSDONE:
 912        case KPROBE_HIT_ACTIVE:
 913        case KPROBE_HIT_SS:
 914                kprobes_inc_nmissed_count(p);
 915                setup_singlestep(p, regs, kcb, 1);
 916                break;
 917        case KPROBE_REENTER:
 918                /* A probe has been hit in the codepath leading up to, or just
 919                 * after, single-stepping of a probed instruction. This entire
  920                 * codepath should strictly reside in the .kprobes.text section.
  921                 * Raise a BUG, or we'll continue in an endless reentry loop
  922                 * and eventually overflow the stack.
 923                 */
 924                pr_err("Unrecoverable kprobe detected.\n");
 925                dump_kprobe(p);
 926                BUG();
 927        default:
 928                /* impossible cases */
 929                WARN_ON(1);
 930                return 0;
 931        }
 932
 933        return 1;
 934}
 935NOKPROBE_SYMBOL(reenter_kprobe);
 936
 937static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
 938{
 939        return (kcb->kprobe_status == KPROBE_HIT_SS ||
 940                kcb->kprobe_status == KPROBE_REENTER);
 941}
 942
 943/*
 944 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 945 * remain disabled throughout this function.
 946 */
 947int kprobe_int3_handler(struct pt_regs *regs)
 948{
 949        kprobe_opcode_t *addr;
 950        struct kprobe *p;
 951        struct kprobe_ctlblk *kcb;
 952
 953        if (user_mode(regs))
 954                return 0;
 955
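             /* regs->ip points just past the int3; step back to get the probed address. */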
 956        addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
 957        /*
 958         * We don't want to be preempted for the entire duration of kprobe
  959         * processing. Since the int3 and debug traps disable irqs, and we
  960         * clear IF while single-stepping, preemption cannot happen here.
 961         */
 962
 963        kcb = get_kprobe_ctlblk();
 964        p = get_kprobe(addr);
 965
 966        if (p) {
 967                if (kprobe_running()) {
 968                        if (reenter_kprobe(p, regs, kcb))
 969                                return 1;
 970                } else {
 971                        set_current_kprobe(p, regs, kcb);
 972                        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 973
 974                        /*
 975                         * If we have no pre-handler or it returned 0, we
 976                         * continue with normal processing.  If we have a
  977                         * pre-handler and it returned non-zero, that means the
  978                         * handler set up the registers to resume at another
  979                         * instruction, so we must skip the single-stepping.
 980                         */
 981                        if (!p->pre_handler || !p->pre_handler(p, regs))
 982                                setup_singlestep(p, regs, kcb, 0);
 983                        else
 984                                reset_current_kprobe();
 985                        return 1;
 986                }
 987        } else if (kprobe_is_ss(kcb)) {
 988                p = kprobe_running();
 989                if ((unsigned long)p->ainsn.insn < regs->ip &&
 990                    (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
  991                        /* Most probably this is the second int3 of the single-step */
 992                        resume_singlestep(p, regs, kcb);
 993                        kprobe_post_process(p, regs, kcb);
 994                        return 1;
 995                }
 996        }
 997
 998        if (*addr != INT3_INSN_OPCODE) {
 999                /*
1000                 * The breakpoint instruction was removed right
1001                 * after we hit it.  Another cpu has removed
1002                 * either a probepoint or a debugger breakpoint
1003                 * at this address.  In either case, no further
1004                 * handling of this interrupt is appropriate.
1005                 * Back up over the (now missing) int3 and run
1006                 * the original instruction.
1007                 */
1008                regs->ip = (unsigned long)addr;
1009                return 1;
1010        } /* else: not a kprobe fault; let the kernel handle it */
1011
1012        return 0;
1013}
1014NOKPROBE_SYMBOL(kprobe_int3_handler);
1015
1016/*
 1017 * When a retprobed function returns, this code saves the registers and
 1018 * calls trampoline_handler(), which in turn calls the kretprobe's handler.
1019 */
1020asm(
1021        ".text\n"
1022        ".global kretprobe_trampoline\n"
1023        ".type kretprobe_trampoline, @function\n"
1024        "kretprobe_trampoline:\n"
1025        /* We don't bother saving the ss register */
1026#ifdef CONFIG_X86_64
1027        "       pushq %rsp\n"
1028        "       pushfq\n"
1029        SAVE_REGS_STRING
1030        "       movq %rsp, %rdi\n"
1031        "       call trampoline_handler\n"
1032        /* Replace saved sp with true return address. */
1033        "       movq %rax, 19*8(%rsp)\n"
1034        RESTORE_REGS_STRING
1035        "       popfq\n"
1036#else
1037        "       pushl %esp\n"
1038        "       pushfl\n"
1039        SAVE_REGS_STRING
1040        "       movl %esp, %eax\n"
1041        "       call trampoline_handler\n"
1042        /* Replace saved sp with true return address. */
1043        "       movl %eax, 15*4(%esp)\n"
1044        RESTORE_REGS_STRING
1045        "       popfl\n"
1046#endif
1047        "       ret\n"
1048        ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
1049);
1050NOKPROBE_SYMBOL(kretprobe_trampoline);
1051STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
1052
1053
1054/*
1055 * Called from kretprobe_trampoline
1056 */
1057__used __visible void *trampoline_handler(struct pt_regs *regs)
1058{
1059        /* fixup registers */
1060        regs->cs = __KERNEL_CS;
1061#ifdef CONFIG_X86_32
1062        regs->gs = 0;
1063#endif
1064        regs->ip = (unsigned long)&kretprobe_trampoline;
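             /* orig_ax = -1 marks this frame as not being a system call. */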
1065        regs->orig_ax = ~0UL;
1066
1067        return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, &regs->sp);
1068}
1069NOKPROBE_SYMBOL(trampoline_handler);
1070
1071int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
1072{
1073        struct kprobe *cur = kprobe_running();
1074        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1075
1076        if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
1077                /* This must happen on single-stepping */
1078                WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
1079                        kcb->kprobe_status != KPROBE_REENTER);
1080                /*
 1081                 * We are here because the instruction being single-
 1082                 * stepped caused a page fault. We reset the current
 1083                 * kprobe, point the ip back at the probe address,
 1084                 * and allow the page fault handler to continue as a
 1085                 * normal page fault.
1086                 */
1087                regs->ip = (unsigned long)cur->addr;
1088
1089                /*
1090                 * If the IF flag was set before the kprobe hit,
1091                 * don't touch it:
1092                 */
1093                regs->flags |= kcb->kprobe_old_flags;
1094
1095                if (kcb->kprobe_status == KPROBE_REENTER)
1096                        restore_previous_kprobe(kcb);
1097                else
1098                        reset_current_kprobe();
1099        }
1100
1101        return 0;
1102}
1103NOKPROBE_SYMBOL(kprobe_fault_handler);
1104
1105int __init arch_populate_kprobe_blacklist(void)
1106{
1107        return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
1108                                         (unsigned long)__entry_text_end);
1109}
1110
1111int __init arch_init_kprobes(void)
1112{
1113        return 0;
1114}
1115
1116int arch_trampoline_kprobe(struct kprobe *p)
1117{
1118        return 0;
1119}
1120