linux/arch/x86/kernel/kprobes/core.c
   1/*
   2 *  Kernel Probes (KProbes)
   3 *
   4 * This program is free software; you can redistribute it and/or modify
   5 * it under the terms of the GNU General Public License as published by
   6 * the Free Software Foundation; either version 2 of the License, or
   7 * (at your option) any later version.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17 *
  18 * Copyright (C) IBM Corporation, 2002, 2004
  19 *
  20 * 2002-Oct     Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
  21 *              Probes initial implementation ( includes contributions from
  22 *              Rusty Russell).
  23 * 2004-July    Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  24 *              interface to access function arguments.
  25 * 2004-Oct     Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  26 *              <prasanna@in.ibm.com> adapted for x86_64 from i386.
  27 * 2005-Mar     Roland McGrath <roland@redhat.com>
  28 *              Fixed to handle %rip-relative addressing mode correctly.
  29 * 2005-May     Hien Nguyen <hien@us.ibm.com>, Jim Keniston
  30 *              <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  31 *              <prasanna@in.ibm.com> added function-return probes.
  32 * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
  33 *              Added function return probes functionality
  34 * 2006-Feb     Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
  35 *              kprobe-booster and kretprobe-booster for i386.
  36 * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
  37 *              and kretprobe-booster for x86-64
  38 * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
  39 *              <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
  40 *              unified x86 kprobes code.
  41 */
  42#include <linux/kprobes.h>
  43#include <linux/ptrace.h>
  44#include <linux/string.h>
  45#include <linux/slab.h>
  46#include <linux/hardirq.h>
  47#include <linux/preempt.h>
  48#include <linux/module.h>
  49#include <linux/kdebug.h>
  50#include <linux/kallsyms.h>
  51#include <linux/ftrace.h>
  52#include <linux/frame.h>
  53
  54#include <asm/text-patching.h>
  55#include <asm/cacheflush.h>
  56#include <asm/desc.h>
  57#include <asm/pgtable.h>
  58#include <asm/uaccess.h>
  59#include <asm/alternative.h>
  60#include <asm/insn.h>
  61#include <asm/debugreg.h>
  62
  63#include "common.h"
  64
  65void jprobe_return_end(void);
  66
  67DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
  68DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
  69
  70#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
  71
  72#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
  73        (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
  74          (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
  75          (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
  76          (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
  77         << (row % 32))
  78        /*
   79         * Undefined/reserved opcodes, conditional jumps, Opcode Extension
   80         * Groups, and some special opcodes cannot be boosted.
  81         * This is non-const and volatile to keep gcc from statically
  82         * optimizing it out, as variable_test_bit makes gcc think only
  83         * *(unsigned long*) is used.
  84         */
  85static volatile u32 twobyte_is_boostable[256 / 32] = {
  86        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
  87        /*      ----------------------------------------------          */
  88        W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
  89        W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
  90        W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
  91        W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
  92        W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
  93        W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
  94        W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
  95        W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
  96        W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
  97        W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
  98        W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
  99        W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
 100        W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
 101        W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
 102        W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
 103        W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
 104        /*      -----------------------------------------------         */
 105        /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
 106};
 107#undef W
 108
 109struct kretprobe_blackpoint kretprobe_blacklist[] = {
  110        {"__switch_to", }, /* This function only switches the current task,
  111                              but doesn't switch the kernel stack. */
 112        {NULL, NULL}    /* Terminator */
 113};
 114
 115const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 116
 117static nokprobe_inline void
 118__synthesize_relative_insn(void *from, void *to, u8 op)
 119{
 120        struct __arch_relative_insn {
 121                u8 op;
 122                s32 raddr;
 123        } __packed *insn;
 124
 125        insn = (struct __arch_relative_insn *)from;
 126        insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
 127        insn->op = op;
 128}
 129
 130/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
 131void synthesize_reljump(void *from, void *to)
 132{
 133        __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
 134}
 135NOKPROBE_SYMBOL(synthesize_reljump);
 136
 137/* Insert a call instruction at address 'from', which calls address 'to'.*/
 138void synthesize_relcall(void *from, void *to)
 139{
 140        __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
 141}
 142NOKPROBE_SYMBOL(synthesize_relcall);
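
/*
 * For illustration only (not used by this file): the two helpers above emit
 * a 5-byte "jmp rel32" / "call rel32", whose rel32 operand is relative to
 * the address just past the 5-byte instruction.  A minimal sketch:
 */
#if 0
static void example_reljump(void *from, void *to)
{
        u8 buf[5];

        buf[0] = 0xe9;          /* RELATIVEJUMP_OPCODE */
        *(s32 *)(buf + 1) = (s32)((long)to - ((long)from + 5));
        memcpy(from, buf, 5);   /* same result as synthesize_reljump(from, to) */
}
#endif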
 143
 144/*
 145 * Skip the prefixes of the instruction.
 146 */
 147static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
 148{
 149        insn_attr_t attr;
 150
 151        attr = inat_get_opcode_attribute((insn_byte_t)*insn);
 152        while (inat_is_legacy_prefix(attr)) {
 153                insn++;
 154                attr = inat_get_opcode_attribute((insn_byte_t)*insn);
 155        }
 156#ifdef CONFIG_X86_64
 157        if (inat_is_rex_prefix(attr))
 158                insn++;
 159#endif
 160        return insn;
 161}
 162NOKPROBE_SYMBOL(skip_prefixes);
 163
 164/*
  165 * Returns non-zero if the opcode is boostable.
  166 * RIP-relative instructions are adjusted at copy time in 64-bit mode.
 167 */
 168int can_boost(kprobe_opcode_t *opcodes)
 169{
 170        kprobe_opcode_t opcode;
 171        kprobe_opcode_t *orig_opcodes = opcodes;
 172
 173        if (search_exception_tables((unsigned long)opcodes))
 174                return 0;       /* Page fault may occur on this address. */
 175
 176retry:
 177        if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
 178                return 0;
 179        opcode = *(opcodes++);
 180
 181        /* 2nd-byte opcode */
 182        if (opcode == 0x0f) {
 183                if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
 184                        return 0;
 185                return test_bit(*opcodes,
 186                                (unsigned long *)twobyte_is_boostable);
 187        }
 188
 189        switch (opcode & 0xf0) {
 190#ifdef CONFIG_X86_64
 191        case 0x40:
 192                goto retry; /* REX prefix is boostable */
 193#endif
 194        case 0x60:
 195                if (0x63 < opcode && opcode < 0x67)
 196                        goto retry; /* prefixes */
 197                /* can't boost Address-size override and bound */
 198                return (opcode != 0x62 && opcode != 0x67);
 199        case 0x70:
 200                return 0; /* can't boost conditional jump */
 201        case 0xc0:
  202                /* can't boost software interrupts */
 203                return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
 204        case 0xd0:
 205                /* can boost AA* and XLAT */
 206                return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
 207        case 0xe0:
 208                /* can boost in/out and absolute jmps */
 209                return ((opcode & 0x04) || opcode == 0xea);
 210        case 0xf0:
 211                if ((opcode & 0x0c) == 0 && opcode != 0xf1)
 212                        goto retry; /* lock/rep(ne) prefix */
 213                /* clear and set flags are boostable */
 214                return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
 215        default:
 216                /* segment override prefixes are boostable */
 217                if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
 218                        goto retry; /* prefixes */
 219                /* CS override prefix and call are not boostable */
 220                return (opcode != 0x2e && opcode != 0x9a);
 221        }
 222}
 223
 224static unsigned long
 225__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 226{
 227        struct kprobe *kp;
 228        unsigned long faddr;
 229
 230        kp = get_kprobe((void *)addr);
 231        faddr = ftrace_location(addr);
 232        /*
 233         * Addresses inside the ftrace location are refused by
 234         * arch_check_ftrace_location(). Something went terribly wrong
 235         * if such an address is checked here.
 236         */
 237        if (WARN_ON(faddr && faddr != addr))
 238                return 0UL;
 239        /*
 240         * Use the current code if it is not modified by Kprobe
 241         * and it cannot be modified by ftrace.
 242         */
 243        if (!kp && !faddr)
 244                return addr;
 245
 246        /*
  247         * Basically, kp->ainsn.insn holds a copy of the original instruction.
  248         * However, a RIP-relative instruction cannot be single-stepped at a
  249         * different location, so __copy_instruction() tweaks the displacement
  250         * of that instruction. In that case, we can't recover the instruction
  251         * from kp->ainsn.insn.
  252         *
  253         * On the other hand, for a normal kprobe, kp->opcode has a copy
  254         * of the first byte of the probed instruction, which is overwritten
  255         * by int3. Since the instruction at kp->addr is not modified by
  256         * kprobes except for that first byte, we can recover the original
  257         * instruction from it and kp->opcode.
  258         *
  259         * In the case of kprobes using ftrace, we do not have a copy of
  260         * the original instruction. In fact, the ftrace location might
  261         * be modified at any time and could even be in an inconsistent state.
  262         * Fortunately, we know that the original code is the ideal 5-byte
  263         * long NOP.
 264         */
 265        memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 266        if (faddr)
 267                memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
 268        else
 269                buf[0] = kp->opcode;
 270        return (unsigned long)buf;
 271}
 272
 273/*
 274 * Recover the probed instruction at addr for further analysis.
  275 * The caller must hold kprobe_mutex or disable preemption to prevent
  276 * the kprobes being referenced from being released.
  277 * Returns zero if the instruction cannot be recovered.
 278 */
 279unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 280{
 281        unsigned long __addr;
 282
 283        __addr = __recover_optprobed_insn(buf, addr);
 284        if (__addr != addr)
 285                return __addr;
 286
 287        return __recover_probed_insn(buf, addr);
 288}
 289
 290/* Check if paddr is at an instruction boundary */
 291static int can_probe(unsigned long paddr)
 292{
 293        unsigned long addr, __addr, offset = 0;
 294        struct insn insn;
 295        kprobe_opcode_t buf[MAX_INSN_SIZE];
 296
 297        if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
 298                return 0;
 299
 300        /* Decode instructions */
 301        addr = paddr - offset;
 302        while (addr < paddr) {
 303                /*
 304                 * Check if the instruction has been modified by another
 305                 * kprobe, in which case we replace the breakpoint by the
 306                 * original instruction in our buffer.
 307                 * Also, jump optimization will change the breakpoint to
  308                 * relative-jump. Since a relative jump can also occur as a
  309                 * normal instruction, we just carry on if there is no kprobe.
 310                 */
 311                __addr = recover_probed_instruction(buf, addr);
 312                if (!__addr)
 313                        return 0;
 314                kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
 315                insn_get_length(&insn);
 316
 317                /*
 318                 * Another debugging subsystem might insert this breakpoint.
 319                 * In that case, we can't recover it.
 320                 */
 321                if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 322                        return 0;
 323                addr += insn.length;
 324        }
 325
 326        return (addr == paddr);
 327}
 328
 329/*
 330 * Returns non-zero if opcode modifies the interrupt flag.
 331 */
 332static int is_IF_modifier(kprobe_opcode_t *insn)
 333{
 334        /* Skip prefixes */
 335        insn = skip_prefixes(insn);
 336
 337        switch (*insn) {
 338        case 0xfa:              /* cli */
 339        case 0xfb:              /* sti */
 340        case 0xcf:              /* iret/iretd */
 341        case 0x9d:              /* popf/popfd */
 342                return 1;
 343        }
 344
 345        return 0;
 346}
 347
 348/*
  349 * Copy an instruction and adjust the displacement if the instruction
  350 * uses the %rip-relative addressing mode (that adjustment only applies
  351 * on 64-bit x86).
  352 * Returns the length of the copied instruction, or 0 if the instruction
  353 * could not be recovered or copied.
 354 */
 355int __copy_instruction(u8 *dest, u8 *src)
 356{
 357        struct insn insn;
 358        kprobe_opcode_t buf[MAX_INSN_SIZE];
 359        int length;
 360        unsigned long recovered_insn =
 361                recover_probed_instruction(buf, (unsigned long)src);
 362
 363        if (!recovered_insn)
 364                return 0;
 365        kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 366        insn_get_length(&insn);
 367        length = insn.length;
 368
  369        /* Another subsystem has inserted a breakpoint; we can't recover it */
 370        if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 371                return 0;
 372        memcpy(dest, insn.kaddr, length);
 373
 374#ifdef CONFIG_X86_64
 375        if (insn_rip_relative(&insn)) {
 376                s64 newdisp;
 377                u8 *disp;
 378                kernel_insn_init(&insn, dest, length);
 379                insn_get_displacement(&insn);
 380                /*
 381                 * The copied instruction uses the %rip-relative addressing
 382                 * mode.  Adjust the displacement for the difference between
 383                 * the original location of this instruction and the location
 384                 * of the copy that will actually be run.  The tricky bit here
 385                 * is making sure that the sign extension happens correctly in
 386                 * this calculation, since we need a signed 32-bit result to
 387                 * be sign-extended to 64 bits when it's added to the %rip
 388                 * value and yield the same 64-bit result that the sign-
 389                 * extension of the original signed 32-bit displacement would
 390                 * have given.
 391                 */
 392                newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
 393                if ((s64) (s32) newdisp != newdisp) {
 394                        pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
 395                        pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
 396                        return 0;
 397                }
 398                disp = (u8 *) dest + insn_offset_displacement(&insn);
 399                *(s32 *) disp = (s32) newdisp;
 400        }
 401#endif
 402        return length;
 403}
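
/*
 * A worked example of the displacement fixup above (illustrative only, with
 * hypothetical addresses): if the original instruction at src is
 * "lea 0x200(%rip),%rax", it references src + insn_len + 0x200.  For the
 * copy at dest to reference the same address we need
 *
 *      dest + insn_len + newdisp == src + insn_len + 0x200
 *  =>  newdisp = (u8 *)src + 0x200 - (u8 *)dest
 *
 * which is exactly what the code computes; the insn_len terms cancel because
 * %rip points past the instruction at both locations.
 */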
 404
 405static int arch_copy_kprobe(struct kprobe *p)
 406{
 407        int ret;
 408
  409        /* Copy the instruction, recovering it if another optprobe has modified it. */
 410        ret = __copy_instruction(p->ainsn.insn, p->addr);
 411        if (!ret)
 412                return -EINVAL;
 413
 414        /*
 415         * __copy_instruction can modify the displacement of the instruction,
  416         * but that doesn't affect the boostability check.
 417         */
 418        if (can_boost(p->ainsn.insn))
 419                p->ainsn.boostable = 0;
 420        else
 421                p->ainsn.boostable = -1;
 422
 423        /* Check whether the instruction modifies Interrupt Flag or not */
 424        p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
 425
 426        /* Also, displacement change doesn't affect the first byte */
 427        p->opcode = p->ainsn.insn[0];
 428
 429        return 0;
 430}
 431
 432int arch_prepare_kprobe(struct kprobe *p)
 433{
 434        if (alternatives_text_reserved(p->addr, p->addr))
 435                return -EINVAL;
 436
 437        if (!can_probe((unsigned long)p->addr))
 438                return -EILSEQ;
 439        /* insn: must be on special executable page on x86. */
 440        p->ainsn.insn = get_insn_slot();
 441        if (!p->ainsn.insn)
 442                return -ENOMEM;
 443
 444        return arch_copy_kprobe(p);
 445}
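
/*
 * For reference, a minimal sketch (not part of this file) of how the
 * prepare/arm hooks above are exercised through the generic kprobes API.
 * The probed symbol is hypothetical.
 */
#if 0
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("pre_handler: probe at %p hit, ip = %lx\n", p->addr, regs->ip);
        return 0;       /* 0: go on to single-step the copied instruction */
}

static struct kprobe example_kp = {
        .symbol_name    = "do_sys_open",        /* hypothetical target */
        .pre_handler    = example_pre,
};

static int __init example_init(void)
{
        return register_kprobe(&example_kp);    /* ends up in arch_prepare_kprobe() */
}

static void __exit example_exit(void)
{
        unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
#endif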
 446
 447void arch_arm_kprobe(struct kprobe *p)
 448{
 449        text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
 450}
 451
 452void arch_disarm_kprobe(struct kprobe *p)
 453{
 454        text_poke(p->addr, &p->opcode, 1);
 455}
 456
 457void arch_remove_kprobe(struct kprobe *p)
 458{
 459        if (p->ainsn.insn) {
 460                free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
 461                p->ainsn.insn = NULL;
 462        }
 463}
 464
 465static nokprobe_inline void
 466save_previous_kprobe(struct kprobe_ctlblk *kcb)
 467{
 468        kcb->prev_kprobe.kp = kprobe_running();
 469        kcb->prev_kprobe.status = kcb->kprobe_status;
 470        kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
 471        kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
 472}
 473
 474static nokprobe_inline void
 475restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 476{
 477        __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 478        kcb->kprobe_status = kcb->prev_kprobe.status;
 479        kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
 480        kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
 481}
 482
 483static nokprobe_inline void
 484set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 485                   struct kprobe_ctlblk *kcb)
 486{
 487        __this_cpu_write(current_kprobe, p);
 488        kcb->kprobe_saved_flags = kcb->kprobe_old_flags
 489                = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
 490        if (p->ainsn.if_modifier)
 491                kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
 492}
 493
 494static nokprobe_inline void clear_btf(void)
 495{
 496        if (test_thread_flag(TIF_BLOCKSTEP)) {
 497                unsigned long debugctl = get_debugctlmsr();
 498
 499                debugctl &= ~DEBUGCTLMSR_BTF;
 500                update_debugctlmsr(debugctl);
 501        }
 502}
 503
 504static nokprobe_inline void restore_btf(void)
 505{
 506        if (test_thread_flag(TIF_BLOCKSTEP)) {
 507                unsigned long debugctl = get_debugctlmsr();
 508
 509                debugctl |= DEBUGCTLMSR_BTF;
 510                update_debugctlmsr(debugctl);
 511        }
 512}
 513
 514void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 515{
 516        unsigned long *sara = stack_addr(regs);
 517
 518        ri->ret_addr = (kprobe_opcode_t *) *sara;
 519
 520        /* Replace the return addr with trampoline addr */
 521        *sara = (unsigned long) &kretprobe_trampoline;
 522}
 523NOKPROBE_SYMBOL(arch_prepare_kretprobe);
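
/*
 * For reference, a minimal kretprobe sketch (not part of this file) showing
 * how the trampoline installed above is consumed; the probed symbol is
 * hypothetical.
 */
#if 0
#include <linux/module.h>
#include <linux/kprobes.h>

static int example_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        /* ri->ret_addr holds the original return address of the probed function */
        pr_info("%pS returned %lu\n", ri->ret_addr, regs_return_value(regs));
        return 0;
}

static struct kretprobe example_krp = {
        .handler        = example_ret,
        .maxactive      = 16,   /* instances that may be in flight at once */
        .kp = {
                .symbol_name    = "do_sys_open",        /* hypothetical target */
        },
};

/* register_kretprobe(&example_krp) / unregister_kretprobe(&example_krp) */
#endif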
 524
 525static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
 526                             struct kprobe_ctlblk *kcb, int reenter)
 527{
 528        if (setup_detour_execution(p, regs, reenter))
 529                return;
 530
 531#if !defined(CONFIG_PREEMPT)
 532        if (p->ainsn.boostable == 1 && !p->post_handler) {
 533                /* Boost up -- we can execute copied instructions directly */
 534                if (!reenter)
 535                        reset_current_kprobe();
 536                /*
 537                 * Reentering boosted probe doesn't reset current_kprobe,
 538                 * nor set current_kprobe, because it doesn't use single
 539                 * stepping.
 540                 */
 541                regs->ip = (unsigned long)p->ainsn.insn;
 542                preempt_enable_no_resched();
 543                return;
 544        }
 545#endif
 546        if (reenter) {
 547                save_previous_kprobe(kcb);
 548                set_current_kprobe(p, regs, kcb);
 549                kcb->kprobe_status = KPROBE_REENTER;
 550        } else
 551                kcb->kprobe_status = KPROBE_HIT_SS;
 552        /* Prepare real single stepping */
 553        clear_btf();
 554        regs->flags |= X86_EFLAGS_TF;
 555        regs->flags &= ~X86_EFLAGS_IF;
 556        /* single step inline if the instruction is an int3 */
 557        if (p->opcode == BREAKPOINT_INSTRUCTION)
 558                regs->ip = (unsigned long)p->addr;
 559        else
 560                regs->ip = (unsigned long)p->ainsn.insn;
 561}
 562NOKPROBE_SYMBOL(setup_singlestep);
 563
 564/*
 565 * We have reentered the kprobe_handler(), since another probe was hit while
 566 * within the handler. We save the original kprobes variables and just single
 567 * step on the instruction of the new probe without calling any user handlers.
 568 */
 569static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 570                          struct kprobe_ctlblk *kcb)
 571{
 572        switch (kcb->kprobe_status) {
 573        case KPROBE_HIT_SSDONE:
 574        case KPROBE_HIT_ACTIVE:
 575        case KPROBE_HIT_SS:
 576                kprobes_inc_nmissed_count(p);
 577                setup_singlestep(p, regs, kcb, 1);
 578                break;
 579        case KPROBE_REENTER:
 580                /* A probe has been hit in the codepath leading up to, or just
 581                 * after, single-stepping of a probed instruction. This entire
 582                 * codepath should strictly reside in .kprobes.text section.
 583                 * Raise a BUG or we'll continue in an endless reentering loop
 584                 * and eventually a stack overflow.
 585                 */
 586                printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
 587                       p->addr);
 588                dump_kprobe(p);
 589                BUG();
 590        default:
 591                /* impossible cases */
 592                WARN_ON(1);
 593                return 0;
 594        }
 595
 596        return 1;
 597}
 598NOKPROBE_SYMBOL(reenter_kprobe);
 599
 600/*
 601 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 602 * remain disabled throughout this function.
 603 */
 604int kprobe_int3_handler(struct pt_regs *regs)
 605{
 606        kprobe_opcode_t *addr;
 607        struct kprobe *p;
 608        struct kprobe_ctlblk *kcb;
 609
 610        if (user_mode(regs))
 611                return 0;
 612
 613        addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
 614        /*
 615         * We don't want to be preempted for the entire
 616         * duration of kprobe processing. We conditionally
 617         * re-enable preemption at the end of this function,
 618         * and also in reenter_kprobe() and setup_singlestep().
 619         */
 620        preempt_disable();
 621
 622        kcb = get_kprobe_ctlblk();
 623        p = get_kprobe(addr);
 624
 625        if (p) {
 626                if (kprobe_running()) {
 627                        if (reenter_kprobe(p, regs, kcb))
 628                                return 1;
 629                } else {
 630                        set_current_kprobe(p, regs, kcb);
 631                        kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 632
 633                        /*
 634                         * If we have no pre-handler or it returned 0, we
 635                         * continue with normal processing.  If we have a
  636                         * pre-handler and it returned non-zero, it prepared
  637                         * for calling the break_handler below on re-entry
  638                         * for jprobe processing, so do nothing more here
  639                         * and just return.
 640                         */
 641                        if (!p->pre_handler || !p->pre_handler(p, regs))
 642                                setup_singlestep(p, regs, kcb, 0);
 643                        return 1;
 644                }
 645        } else if (*addr != BREAKPOINT_INSTRUCTION) {
 646                /*
 647                 * The breakpoint instruction was removed right
 648                 * after we hit it.  Another cpu has removed
 649                 * either a probepoint or a debugger breakpoint
 650                 * at this address.  In either case, no further
 651                 * handling of this interrupt is appropriate.
 652                 * Back up over the (now missing) int3 and run
 653                 * the original instruction.
 654                 */
 655                regs->ip = (unsigned long)addr;
 656                preempt_enable_no_resched();
 657                return 1;
 658        } else if (kprobe_running()) {
 659                p = __this_cpu_read(current_kprobe);
 660                if (p->break_handler && p->break_handler(p, regs)) {
 661                        if (!skip_singlestep(p, regs, kcb))
 662                                setup_singlestep(p, regs, kcb, 0);
 663                        return 1;
 664                }
 665        } /* else: not a kprobe fault; let the kernel handle it */
 666
 667        preempt_enable_no_resched();
 668        return 0;
 669}
 670NOKPROBE_SYMBOL(kprobe_int3_handler);
 671
 672/*
 673 * When a retprobed function returns, this code saves registers and
  674 * calls trampoline_handler(), which in turn calls the kretprobe's handler.
 675 */
 676asm(
 677        ".global kretprobe_trampoline\n"
 678        ".type kretprobe_trampoline, @function\n"
 679        "kretprobe_trampoline:\n"
 680#ifdef CONFIG_X86_64
 681        /* We don't bother saving the ss register */
 682        "       pushq %rsp\n"
 683        "       pushfq\n"
 684        SAVE_REGS_STRING
 685        "       movq %rsp, %rdi\n"
 686        "       call trampoline_handler\n"
 687        /* Replace saved sp with true return address. */
 688        "       movq %rax, 152(%rsp)\n"
 689        RESTORE_REGS_STRING
 690        "       popfq\n"
 691#else
 692        "       pushf\n"
 693        SAVE_REGS_STRING
 694        "       movl %esp, %eax\n"
 695        "       call trampoline_handler\n"
 696        /* Move flags to cs */
 697        "       movl 56(%esp), %edx\n"
 698        "       movl %edx, 52(%esp)\n"
 699        /* Replace saved flags with true return address. */
 700        "       movl %eax, 56(%esp)\n"
 701        RESTORE_REGS_STRING
 702        "       popf\n"
 703#endif
 704        "       ret\n"
 705        ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
 706);
 707NOKPROBE_SYMBOL(kretprobe_trampoline);
 708STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
 709
 710/*
 711 * Called from kretprobe_trampoline
 712 */
 713__visible __used void *trampoline_handler(struct pt_regs *regs)
 714{
 715        struct kretprobe_instance *ri = NULL;
 716        struct hlist_head *head, empty_rp;
 717        struct hlist_node *tmp;
 718        unsigned long flags, orig_ret_address = 0;
 719        unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 720        kprobe_opcode_t *correct_ret_addr = NULL;
 721
 722        INIT_HLIST_HEAD(&empty_rp);
 723        kretprobe_hash_lock(current, &head, &flags);
 724        /* fixup registers */
 725#ifdef CONFIG_X86_64
 726        regs->cs = __KERNEL_CS;
 727#else
 728        regs->cs = __KERNEL_CS | get_kernel_rpl();
 729        regs->gs = 0;
 730#endif
 731        regs->ip = trampoline_address;
 732        regs->orig_ax = ~0UL;
 733
 734        /*
 735         * It is possible to have multiple instances associated with a given
 736         * task either because multiple functions in the call path have
 737         * return probes installed on them, and/or more than one
 738         * return probe was registered for a target function.
 739         *
 740         * We can handle this because:
 741         *     - instances are always pushed into the head of the list
 742         *     - when multiple return probes are registered for the same
 743         *       function, the (chronologically) first instance's ret_addr
 744         *       will be the real return address, and all the rest will
 745         *       point to kretprobe_trampoline.
 746         */
 747        hlist_for_each_entry_safe(ri, tmp, head, hlist) {
 748                if (ri->task != current)
 749                        /* another task is sharing our hash bucket */
 750                        continue;
 751
 752                orig_ret_address = (unsigned long)ri->ret_addr;
 753
 754                if (orig_ret_address != trampoline_address)
 755                        /*
 756                         * This is the real return address. Any other
 757                         * instances associated with this task are for
 758                         * other calls deeper on the call stack
 759                         */
 760                        break;
 761        }
 762
 763        kretprobe_assert(ri, orig_ret_address, trampoline_address);
 764
 765        correct_ret_addr = ri->ret_addr;
 766        hlist_for_each_entry_safe(ri, tmp, head, hlist) {
 767                if (ri->task != current)
 768                        /* another task is sharing our hash bucket */
 769                        continue;
 770
 771                orig_ret_address = (unsigned long)ri->ret_addr;
 772                if (ri->rp && ri->rp->handler) {
 773                        __this_cpu_write(current_kprobe, &ri->rp->kp);
 774                        get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
 775                        ri->ret_addr = correct_ret_addr;
 776                        ri->rp->handler(ri, regs);
 777                        __this_cpu_write(current_kprobe, NULL);
 778                }
 779
 780                recycle_rp_inst(ri, &empty_rp);
 781
 782                if (orig_ret_address != trampoline_address)
 783                        /*
 784                         * This is the real return address. Any other
 785                         * instances associated with this task are for
 786                         * other calls deeper on the call stack
 787                         */
 788                        break;
 789        }
 790
 791        kretprobe_hash_unlock(current, &flags);
 792
 793        hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
 794                hlist_del(&ri->hlist);
 795                kfree(ri);
 796        }
 797        return (void *)orig_ret_address;
 798}
 799NOKPROBE_SYMBOL(trampoline_handler);
 800
 801/*
 802 * Called after single-stepping.  p->addr is the address of the
 803 * instruction whose first byte has been replaced by the "int 3"
 804 * instruction.  To avoid the SMP problems that can occur when we
 805 * temporarily put back the original opcode to single-step, we
 806 * single-stepped a copy of the instruction.  The address of this
 807 * copy is p->ainsn.insn.
 808 *
 809 * This function prepares to return from the post-single-step
 810 * interrupt.  We have to fix up the stack as follows:
 811 *
 812 * 0) Except in the case of absolute or indirect jump or call instructions,
 813 * the new ip is relative to the copied instruction.  We need to make
 814 * it relative to the original instruction.
 815 *
 816 * 1) If the single-stepped instruction was pushfl, then the TF and IF
 817 * flags are set in the just-pushed flags, and may need to be cleared.
 818 *
 819 * 2) If the single-stepped instruction was a call, the return address
 820 * that is atop the stack is the address following the copied instruction.
 821 * We need to make it the address following the original instruction.
 822 *
 823 * If this is the first time we've single-stepped the instruction at
 824 * this probepoint, and the instruction is boostable, boost it: add a
 825 * jump instruction after the copied instruction, that jumps to the next
 826 * instruction after the probepoint.
 827 */
 828static void resume_execution(struct kprobe *p, struct pt_regs *regs,
 829                             struct kprobe_ctlblk *kcb)
 830{
 831        unsigned long *tos = stack_addr(regs);
 832        unsigned long copy_ip = (unsigned long)p->ainsn.insn;
 833        unsigned long orig_ip = (unsigned long)p->addr;
 834        kprobe_opcode_t *insn = p->ainsn.insn;
 835
 836        /* Skip prefixes */
 837        insn = skip_prefixes(insn);
 838
 839        regs->flags &= ~X86_EFLAGS_TF;
 840        switch (*insn) {
 841        case 0x9c:      /* pushfl */
 842                *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
 843                *tos |= kcb->kprobe_old_flags;
 844                break;
 845        case 0xc2:      /* iret/ret/lret */
 846        case 0xc3:
 847        case 0xca:
 848        case 0xcb:
 849        case 0xcf:
 850        case 0xea:      /* jmp absolute -- ip is correct */
 851                /* ip is already adjusted, no more changes required */
 852                p->ainsn.boostable = 1;
 853                goto no_change;
 854        case 0xe8:      /* call relative - Fix return addr */
 855                *tos = orig_ip + (*tos - copy_ip);
 856                break;
 857#ifdef CONFIG_X86_32
  858        case 0x9a:      /* call absolute -- handled like the indirect call below */
 859                *tos = orig_ip + (*tos - copy_ip);
 860                goto no_change;
 861#endif
 862        case 0xff:
 863                if ((insn[1] & 0x30) == 0x10) {
 864                        /*
 865                         * call absolute, indirect
 866                         * Fix return addr; ip is correct.
 867                         * But this is not boostable
 868                         */
 869                        *tos = orig_ip + (*tos - copy_ip);
 870                        goto no_change;
 871                } else if (((insn[1] & 0x31) == 0x20) ||
 872                           ((insn[1] & 0x31) == 0x21)) {
 873                        /*
 874                         * jmp near and far, absolute indirect
 875                         * ip is correct. And this is boostable
 876                         */
 877                        p->ainsn.boostable = 1;
 878                        goto no_change;
 879                }
 880        default:
 881                break;
 882        }
 883
 884        if (p->ainsn.boostable == 0) {
 885                if ((regs->ip > copy_ip) &&
 886                    (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
 887                        /*
  888                         * This instruction can be executed directly if it
  889                         * jumps back to the correct address.
 890                         */
 891                        synthesize_reljump((void *)regs->ip,
 892                                (void *)orig_ip + (regs->ip - copy_ip));
 893                        p->ainsn.boostable = 1;
 894                } else {
 895                        p->ainsn.boostable = -1;
 896                }
 897        }
 898
 899        regs->ip += orig_ip - copy_ip;
 900
 901no_change:
 902        restore_btf();
 903}
 904NOKPROBE_SYMBOL(resume_execution);
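
/*
 * For illustration (not executed here): once resume_execution() has boosted
 * a probe, the instruction slot looks like
 *
 *      p->ainsn.insn:  [ copied instruction                        ]
 *                      [ jmp rel32  ->  p->addr + insn length      ]
 *
 * so subsequent hits can take the boost path in setup_singlestep(), jump
 * straight into the slot, and return to the original code without ever
 * raising the single-step trap.
 */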
 905
 906/*
 907 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 908 * remain disabled throughout this function.
 909 */
 910int kprobe_debug_handler(struct pt_regs *regs)
 911{
 912        struct kprobe *cur = kprobe_running();
 913        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 914
 915        if (!cur)
 916                return 0;
 917
 918        resume_execution(cur, regs, kcb);
 919        regs->flags |= kcb->kprobe_saved_flags;
 920
 921        if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
 922                kcb->kprobe_status = KPROBE_HIT_SSDONE;
 923                cur->post_handler(cur, regs, 0);
 924        }
 925
 926        /* Restore back the original saved kprobes variables and continue. */
 927        if (kcb->kprobe_status == KPROBE_REENTER) {
 928                restore_previous_kprobe(kcb);
 929                goto out;
 930        }
 931        reset_current_kprobe();
 932out:
 933        preempt_enable_no_resched();
 934
 935        /*
  936         * If somebody else is single-stepping across a probe point, flags
  937         * will have TF set; in that case, continue the remaining processing
  938         * of do_debug, as if this were not a probe hit.
 939         */
 940        if (regs->flags & X86_EFLAGS_TF)
 941                return 0;
 942
 943        return 1;
 944}
 945NOKPROBE_SYMBOL(kprobe_debug_handler);
 946
 947int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 948{
 949        struct kprobe *cur = kprobe_running();
 950        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 951
 952        if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
 953                /* This must happen on single-stepping */
 954                WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
 955                        kcb->kprobe_status != KPROBE_REENTER);
 956                /*
 957                 * We are here because the instruction being single
 958                 * stepped caused a page fault. We reset the current
  959                 * kprobe, point the ip back to the probe address, and
  960                 * allow the page fault handler to continue as for a
  961                 * normal page fault.
 962                 */
 963                regs->ip = (unsigned long)cur->addr;
 964                /*
 965                 * Trap flag (TF) has been set here because this fault
  966                 * happened where the single stepping was going to be done,
  967                 * so clear it here:
 968                 */
 969                regs->flags &= ~X86_EFLAGS_TF;
 970
 971                /*
 972                 * If the TF flag was set before the kprobe hit,
 973                 * don't touch it:
 974                 */
 975                regs->flags |= kcb->kprobe_old_flags;
 976
 977                if (kcb->kprobe_status == KPROBE_REENTER)
 978                        restore_previous_kprobe(kcb);
 979                else
 980                        reset_current_kprobe();
 981                preempt_enable_no_resched();
 982        } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
 983                   kcb->kprobe_status == KPROBE_HIT_SSDONE) {
 984                /*
  985                 * We increment the nmissed count for accounting;
  986                 * the npre/npostfault counts could also be used to account
  987                 * for these specific fault cases.
 988                 */
 989                kprobes_inc_nmissed_count(cur);
 990
 991                /*
  992                 * We come here because instructions in the pre/post
  993                 * handler caused the page fault; this can happen
  994                 * if the handler tries to access user space via
 995                 * copy_from_user(), get_user() etc. Let the
 996                 * user-specified handler try to fix it first.
 997                 */
 998                if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
 999                        return 1;
1000
1001                /*
1002                 * In case the user-specified fault handler returned
1003                 * zero, try to fix up.
1004                 */
1005                if (fixup_exception(regs, trapnr))
1006                        return 1;
1007
1008                /*
 1009                 * The fixup routine could not handle it;
 1010                 * let do_page_fault() fix it.
1011                 */
1012        }
1013
1014        return 0;
1015}
1016NOKPROBE_SYMBOL(kprobe_fault_handler);
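
/*
 * A minimal sketch of the optional ->fault_handler callback consulted above
 * (illustrative only; the policy shown is hypothetical):
 */
#if 0
static int example_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
        pr_warn("probe at %p faulted with trap %d\n", p->addr, trapnr);
        return 0;       /* 0: fall back to fixup_exception()/do_page_fault() */
}
#endif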
1017
1018/*
1019 * Wrapper routine for handling exceptions.
1020 */
1021int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
1022                             void *data)
1023{
1024        struct die_args *args = data;
1025        int ret = NOTIFY_DONE;
1026
1027        if (args->regs && user_mode(args->regs))
1028                return ret;
1029
1030        if (val == DIE_GPF) {
1031                /*
1032                 * To be potentially processing a kprobe fault and to
 1033                 * trust the result from kprobe_running(), we have to
 1034                 * be non-preemptible.
1035                 */
1036                if (!preemptible() && kprobe_running() &&
1037                    kprobe_fault_handler(args->regs, args->trapnr))
1038                        ret = NOTIFY_STOP;
1039        }
1040        return ret;
1041}
1042NOKPROBE_SYMBOL(kprobe_exceptions_notify);
1043
1044int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
1045{
1046        struct jprobe *jp = container_of(p, struct jprobe, kp);
1047        unsigned long addr;
1048        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1049
1050        kcb->jprobe_saved_regs = *regs;
1051        kcb->jprobe_saved_sp = stack_addr(regs);
1052        addr = (unsigned long)(kcb->jprobe_saved_sp);
1053
1054        /*
1055         * As Linus pointed out, gcc assumes that the callee
1056         * owns the argument space and could overwrite it, e.g.
1057         * tailcall optimization. So, to be absolutely safe
1058         * we also save and restore enough stack bytes to cover
1059         * the argument area.
1060         */
1061        memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
1062               MIN_STACK_SIZE(addr));
1063        regs->flags &= ~X86_EFLAGS_IF;
1064        trace_hardirqs_off();
1065        regs->ip = (unsigned long)(jp->entry);
1066
1067        /*
1068         * jprobes use jprobe_return() which skips the normal return
 1069         * path of the function, and this causes the accounting of the
 1070         * function graph tracer to get messed up.
1071         *
1072         * Pause function graph tracing while performing the jprobe function.
1073         */
1074        pause_graph_tracing();
1075        return 1;
1076}
1077NOKPROBE_SYMBOL(setjmp_pre_handler);
1078
1079void jprobe_return(void)
1080{
1081        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1082
1083        asm volatile (
1084#ifdef CONFIG_X86_64
1085                        "       xchg   %%rbx,%%rsp      \n"
1086#else
1087                        "       xchgl   %%ebx,%%esp     \n"
1088#endif
1089                        "       int3                    \n"
1090                        "       .globl jprobe_return_end\n"
1091                        "       jprobe_return_end:      \n"
1092                        "       nop                     \n"::"b"
1093                        (kcb->jprobe_saved_sp):"memory");
1094}
1095NOKPROBE_SYMBOL(jprobe_return);
1096NOKPROBE_SYMBOL(jprobe_return_end);
1097
1098int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1099{
1100        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1101        u8 *addr = (u8 *) (regs->ip - 1);
1102        struct jprobe *jp = container_of(p, struct jprobe, kp);
1103        void *saved_sp = kcb->jprobe_saved_sp;
1104
1105        if ((addr > (u8 *) jprobe_return) &&
1106            (addr < (u8 *) jprobe_return_end)) {
1107                if (stack_addr(regs) != saved_sp) {
1108                        struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
1109                        printk(KERN_ERR
1110                               "current sp %p does not match saved sp %p\n",
1111                               stack_addr(regs), saved_sp);
1112                        printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1113                        show_regs(saved_regs);
1114                        printk(KERN_ERR "Current registers\n");
1115                        show_regs(regs);
1116                        BUG();
1117                }
1118                /* It's OK to start function graph tracing again */
1119                unpause_graph_tracing();
1120                *regs = kcb->jprobe_saved_regs;
1121                memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
1122                preempt_enable_no_resched();
1123                return 1;
1124        }
1125        return 0;
1126}
1127NOKPROBE_SYMBOL(longjmp_break_handler);
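
/*
 * For reference, a minimal jprobe sketch (not part of this file) tying the
 * setjmp/longjmp handlers above together.  The probed function and its
 * prototype are hypothetical; the entry handler mirrors the target's
 * signature and must end with jprobe_return().
 */
#if 0
#include <linux/module.h>
#include <linux/kprobes.h>

/* hypothetical target: long do_example(const char *name, int flags); */
static long example_entry(const char *name, int flags)
{
        pr_info("do_example(%s, %d)\n", name, flags);
        jprobe_return();        /* never returns; longjmp_break_handler() restores state */
        return 0;               /* unreachable, keeps the compiler happy */
}

static struct jprobe example_jp = {
        .entry  = example_entry,
        .kp = {
                .symbol_name    = "do_example",
        },
};

/* register_jprobe(&example_jp) / unregister_jprobe(&example_jp) */
#endif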
1128
1129bool arch_within_kprobe_blacklist(unsigned long addr)
1130{
1131        return  (addr >= (unsigned long)__kprobes_text_start &&
1132                 addr < (unsigned long)__kprobes_text_end) ||
1133                (addr >= (unsigned long)__entry_text_start &&
1134                 addr < (unsigned long)__entry_text_end);
1135}
1136
1137int __init arch_init_kprobes(void)
1138{
1139        return 0;
1140}
1141
1142int arch_trampoline_kprobe(struct kprobe *p)
1143{
1144        return 0;
1145}
1146