linux/arch/x86/kernel/traps.c
   1/*
   2 *  Copyright (C) 1991, 1992  Linus Torvalds
   3 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
   4 *
   5 *  Pentium III FXSR, SSE support
   6 *      Gareth Hughes <gareth@valinux.com>, May 2000
   7 */
   8
   9/*
  10 * Handle hardware traps and faults.
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/context_tracking.h>
  16#include <linux/interrupt.h>
  17#include <linux/kallsyms.h>
  18#include <linux/spinlock.h>
  19#include <linux/kprobes.h>
  20#include <linux/uaccess.h>
  21#include <linux/kdebug.h>
  22#include <linux/kgdb.h>
  23#include <linux/kernel.h>
  24#include <linux/export.h>
  25#include <linux/ptrace.h>
  26#include <linux/uprobes.h>
  27#include <linux/string.h>
  28#include <linux/delay.h>
  29#include <linux/errno.h>
  30#include <linux/kexec.h>
  31#include <linux/sched.h>
  32#include <linux/timer.h>
  33#include <linux/init.h>
  34#include <linux/bug.h>
  35#include <linux/nmi.h>
  36#include <linux/mm.h>
  37#include <linux/smp.h>
  38#include <linux/io.h>
  39
  40#ifdef CONFIG_EISA
  41#include <linux/ioport.h>
  42#include <linux/eisa.h>
  43#endif
  44
  45#if defined(CONFIG_EDAC)
  46#include <linux/edac.h>
  47#endif
  48
  49#include <asm/kmemcheck.h>
  50#include <asm/stacktrace.h>
  51#include <asm/processor.h>
  52#include <asm/debugreg.h>
  53#include <linux/atomic.h>
  54#include <asm/text-patching.h>
  55#include <asm/ftrace.h>
  56#include <asm/traps.h>
  57#include <asm/desc.h>
  58#include <asm/fpu/internal.h>
  59#include <asm/mce.h>
  60#include <asm/fixmap.h>
  61#include <asm/mach_traps.h>
  62#include <asm/alternative.h>
  63#include <asm/fpu/xstate.h>
  64#include <asm/trace/mpx.h>
  65#include <asm/mpx.h>
  66#include <asm/vm86.h>
  67
  68#ifdef CONFIG_X86_64
  69#include <asm/x86_init.h>
  70#include <asm/pgalloc.h>
  71#include <asm/proto.h>
  72
  73/* No need to be aligned, but done to keep all IDTs defined the same way. */
  74gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
  75#else
  76#include <asm/processor-flags.h>
  77#include <asm/setup.h>
  78#include <asm/proto.h>
  79#endif
  80
  81/* Must be page-aligned because the real IDT is used in a fixmap. */
  82gate_desc idt_table[NR_VECTORS] __page_aligned_bss;
  83
  84DECLARE_BITMAP(used_vectors, NR_VECTORS);
  85EXPORT_SYMBOL_GPL(used_vectors);
  86
  87static inline void cond_local_irq_enable(struct pt_regs *regs)
  88{
  89        if (regs->flags & X86_EFLAGS_IF)
  90                local_irq_enable();
  91}
  92
  93static inline void cond_local_irq_disable(struct pt_regs *regs)
  94{
  95        if (regs->flags & X86_EFLAGS_IF)
  96                local_irq_disable();
  97}
  98
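/*
 * Illustrative usage (hypothetical handler, following the pattern used
 * by the real handlers below): re-enable interrupts only if the
 * interrupted context had them enabled, do the work, then restore:
 *
 *	cond_local_irq_enable(regs);
 *	... handle the trap, possibly sending a signal ...
 *	cond_local_irq_disable(regs);
 */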
  99/*
 100 * In IST context, we explicitly disable preemption.  This serves two
 101 * purposes: it makes it much less likely that we would accidentally
 102 * schedule in IST context and it will force a warning if we somehow
 103 * manage to schedule by accident.
 104 */
 105void ist_enter(struct pt_regs *regs)
 106{
 107        if (user_mode(regs)) {
 108                RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 109        } else {
 110                /*
 111                 * We might have interrupted pretty much anything.  In
 112                 * fact, if we're a machine check, we can even interrupt
 113                 * NMI processing.  We don't want in_nmi() to return true,
 114                 * but we need to notify RCU.
 115                 */
 116                rcu_nmi_enter();
 117        }
 118
 119        preempt_disable();
 120
 121        /* This code is a bit fragile.  Test it. */
 122        RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
 123}
 124
 125void ist_exit(struct pt_regs *regs)
 126{
 127        preempt_enable_no_resched();
 128
 129        if (!user_mode(regs))
 130                rcu_nmi_exit();
 131}
 132
 133/**
 134 * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
 135 * @regs:       regs passed to the IST exception handler
 136 *
 137 * IST exception handlers normally cannot schedule.  As a special
 138 * exception, if the exception interrupted userspace code (i.e.
 139 * user_mode(regs) would return true) and the exception was not
 140 * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
 141 * begins a non-atomic section within an ist_enter()/ist_exit() region.
 142 * Callers are responsible for enabling interrupts themselves inside
 143 * the non-atomic section, and callers must call ist_end_non_atomic()
 144 * before ist_exit().
 145 */
 146void ist_begin_non_atomic(struct pt_regs *regs)
 147{
 148        BUG_ON(!user_mode(regs));
 149
 150        /*
 151         * Sanity check: we need to be on the normal thread stack.  This
 152         * will catch asm bugs and any attempt to use ist_preempt_enable
 153         * from double_fault.
 154         */
 155        BUG_ON((unsigned long)(current_top_of_stack() -
 156                               current_stack_pointer()) >= THREAD_SIZE);
 157
 158        preempt_enable_no_resched();
 159}
 160
 161/**
 162 * ist_end_non_atomic() - end a non-atomic section in an IST exception
 163 *
 164 * Ends a non-atomic section started with ist_begin_non_atomic().
 165 */
 166void ist_end_non_atomic(void)
 167{
 168        preempt_disable();
 169}
 170
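/*
 * A minimal sketch (not an in-tree handler) of how the four helpers
 * above are meant to be paired.  The handler name is hypothetical; the
 * machine check handler is the real user of the non-atomic section.
 */
#if 0
dotraplinkage void do_example_ist_trap(struct pt_regs *regs, long error_code)
{
	ist_enter(regs);

	/* Atomic part: inspect hardware state, log, decide what to do. */

	if (user_mode(regs)) {
		/* Only safe because we interrupted userspace. */
		ist_begin_non_atomic(regs);
		local_irq_enable();

		/* Non-atomic part: may take locks, send a signal, etc. */

		local_irq_disable();
		ist_end_non_atomic();
	}

	ist_exit(regs);
}
#endif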
 171static nokprobe_inline int
 172do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 173                  struct pt_regs *regs, long error_code)
 174{
 175        if (v8086_mode(regs)) {
 176                /*
 177                 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 178                 * On nmi (interrupt 2), do_trap should not be called.
 179                 */
 180                if (trapnr < X86_TRAP_UD) {
 181                        if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
 182                                                error_code, trapnr))
 183                                return 0;
 184                }
 185                return -1;
 186        }
 187
 188        if (!user_mode(regs)) {
 189                if (!fixup_exception(regs, trapnr)) {
 190                        tsk->thread.error_code = error_code;
 191                        tsk->thread.trap_nr = trapnr;
 192                        die(str, regs, error_code);
 193                }
 194                return 0;
 195        }
 196
 197        return -1;
 198}
 199
 200static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
 201                                siginfo_t *info)
 202{
 203        unsigned long siaddr;
 204        int sicode;
 205
 206        switch (trapnr) {
 207        default:
 208                return SEND_SIG_PRIV;
 209
 210        case X86_TRAP_DE:
 211                sicode = FPE_INTDIV;
 212                siaddr = uprobe_get_trap_addr(regs);
 213                break;
 214        case X86_TRAP_UD:
 215                sicode = ILL_ILLOPN;
 216                siaddr = uprobe_get_trap_addr(regs);
 217                break;
 218        case X86_TRAP_AC:
 219                sicode = BUS_ADRALN;
 220                siaddr = 0;
 221                break;
 222        }
 223
 224        info->si_signo = signr;
 225        info->si_errno = 0;
 226        info->si_code = sicode;
 227        info->si_addr = (void __user *)siaddr;
 228        return info;
 229}
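
/*
 * For example, for X86_TRAP_DE the function above fills in SIGFPE /
 * FPE_INTDIV with si_addr pointing at the faulting instruction, while
 * traps without an entry in the switch fall back to SEND_SIG_PRIV,
 * i.e. a kernel-internal signal with no detailed siginfo.
 */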
 230
 231static void
 232do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 233        long error_code, siginfo_t *info)
 234{
 235        struct task_struct *tsk = current;
 236
 237
 238        if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
 239                return;
 240        /*
 241         * We want error_code and trap_nr set for userspace faults and
 242         * kernelspace faults which result in die(), but not
 243         * kernelspace faults which are fixed up.  die() gives the
 244         * process no chance to handle the signal and notice the
 245         * kernel fault information, so that won't result in polluting
 246         * the information about previously queued, but not yet
 247         * delivered, faults.  See also do_general_protection below.
 248         */
 249        tsk->thread.error_code = error_code;
 250        tsk->thread.trap_nr = trapnr;
 251
 252        if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
 253            printk_ratelimit()) {
 254                pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
 255                        tsk->comm, tsk->pid, str,
 256                        regs->ip, regs->sp, error_code);
 257                print_vma_addr(" in ", regs->ip);
 258                pr_cont("\n");
 259        }
 260
 261        force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
 262}
 263NOKPROBE_SYMBOL(do_trap);
 264
 265static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 266                          unsigned long trapnr, int signr)
 267{
 268        siginfo_t info;
 269
 270        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 271
 272        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 273                        NOTIFY_STOP) {
 274                cond_local_irq_enable(regs);
 275                do_trap(trapnr, signr, str, regs, error_code,
 276                        fill_trap_info(regs, signr, trapnr, &info));
 277        }
 278}
 279
 280#define DO_ERROR(trapnr, signr, str, name)                              \
 281dotraplinkage void do_##name(struct pt_regs *regs, long error_code)     \
 282{                                                                       \
 283        do_error_trap(regs, error_code, str, trapnr, signr);            \
 284}
 285
 286DO_ERROR(X86_TRAP_DE,     SIGFPE,  "divide error",              divide_error)
 287DO_ERROR(X86_TRAP_OF,     SIGSEGV, "overflow",                  overflow)
 288DO_ERROR(X86_TRAP_UD,     SIGILL,  "invalid opcode",            invalid_op)
 289DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",coprocessor_segment_overrun)
 290DO_ERROR(X86_TRAP_TS,     SIGSEGV, "invalid TSS",               invalid_TSS)
 291DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",       segment_not_present)
 292DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",             stack_segment)
 293DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",           alignment_check)
 294
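/*
 * For reference, the first DO_ERROR() invocation above expands to the
 * following (shown unwrapped here purely for illustration):
 */
#if 0
dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code)
{
	do_error_trap(regs, error_code, "divide error", X86_TRAP_DE, SIGFPE);
}
#endif
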
 295#ifdef CONFIG_VMAP_STACK
 296__visible void __noreturn handle_stack_overflow(const char *message,
 297                                                struct pt_regs *regs,
 298                                                unsigned long fault_address)
 299{
 300        printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
 301                 (void *)fault_address, current->stack,
 302                 (char *)current->stack + THREAD_SIZE - 1);
 303        die(message, regs, 0);
 304
 305        /* Be absolutely certain we don't return. */
 306        panic(message);
 307}
 308#endif
 309
 310#ifdef CONFIG_X86_64
 311/* Runs on IST stack */
 312dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 313{
 314        static const char str[] = "double fault";
 315        struct task_struct *tsk = current;
 316#ifdef CONFIG_VMAP_STACK
 317        unsigned long cr2;
 318#endif
 319
 320#ifdef CONFIG_X86_ESPFIX64
 321        extern unsigned char native_irq_return_iret[];
 322
 323        /*
 324         * If IRET takes a non-IST fault on the espfix64 stack, then we
 325         * end up promoting it to a doublefault.  In that case, modify
 326         * the stack to make it look like we just entered the #GP
 327         * handler from user space, similar to bad_iret.
 328         *
 329         * No need for ist_enter here because we don't use RCU.
 330         */
 331        if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
 332                regs->cs == __KERNEL_CS &&
 333                regs->ip == (unsigned long)native_irq_return_iret)
 334        {
 335                struct pt_regs *normal_regs = task_pt_regs(current);
 336
 337                /* Fake a #GP(0) from userspace. */
 338                memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
 339                normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
 340                regs->ip = (unsigned long)general_protection;
 341                regs->sp = (unsigned long)&normal_regs->orig_ax;
 342
 343                return;
 344        }
 345#endif
 346
 347        ist_enter(regs);
 348        notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 349
 350        tsk->thread.error_code = error_code;
 351        tsk->thread.trap_nr = X86_TRAP_DF;
 352
 353#ifdef CONFIG_VMAP_STACK
 354        /*
 355         * If we overflow the stack into a guard page, the CPU will fail
 356         * to deliver #PF and will send #DF instead.  Similarly, if we
 357         * take any non-IST exception while too close to the bottom of
 358         * the stack, the processor will get a page fault while
 359         * delivering the exception and will generate a double fault.
 360         *
 361         * According to the SDM (footnote in 6.15 under "Interrupt 14 -
 362         * Page-Fault Exception (#PF)"):
 363         *
 364         *   Processors update CR2 whenever a page fault is detected. If a
 365         *   second page fault occurs while an earlier page fault is being
 366         *   delivered, the faulting linear address of the second fault will
 367         *   overwrite the contents of CR2 (replacing the previous
 368         *   address). These updates to CR2 occur even if the page fault
 369         *   results in a double fault or occurs during the delivery of a
 370         *   double fault.
 371         *
 372         * The logic below has a small possibility of incorrectly diagnosing
 373         * some errors as stack overflows.  For example, if the IDT or GDT
 374         * gets corrupted such that #GP delivery fails due to a bad descriptor
 375         * causing #GP and we hit this condition while CR2 coincidentally
 376         * points to the stack guard page, we'll think we overflowed the
 377         * stack.  Given that we're going to panic one way or another
 378         * if this happens, this isn't necessarily worth fixing.
 379         *
 380         * If necessary, we could improve the test by only diagnosing
 381         * a stack overflow if the saved RSP points within 47 bytes of
 382         * the bottom of the stack: if RSP == tsk_stack + 48 and we
 383         * take an exception, the stack is already aligned and there
 384         * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
 385         * possible error code, so a stack overflow would *not* double
 386         * fault.  With any less space left, exception delivery could
 387         * fail, and, as a practical matter, we've overflowed the
 388         * stack even if the actual trigger for the double fault was
 389         * something else.
 390         */
 391        cr2 = read_cr2();
 392        if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 393                handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 394#endif
 395
 396#ifdef CONFIG_DOUBLEFAULT
 397        df_debug(regs, error_code);
 398#endif
 399        /*
 400         * This is always a kernel trap and never fixable (and thus must
 401         * never return).
 402         */
 403        for (;;)
 404                die(str, regs, error_code);
 405}
 406#endif
 407
 408dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 409{
 410        const struct mpx_bndcsr *bndcsr;
 411        siginfo_t *info;
 412
 413        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 414        if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 415                        X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 416                return;
 417        cond_local_irq_enable(regs);
 418
 419        if (!user_mode(regs))
 420                die("bounds", regs, error_code);
 421
 422        if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
 423                /* The exception is not from Intel MPX */
 424                goto exit_trap;
 425        }
 426
 427        /*
 428         * We need to look at BNDSTATUS to resolve this exception.
 429         * A NULL here might mean that it is in its 'init state',
 430         * which is all zeros and indicates that MPX was not
 431         * responsible for the exception.
 432         */
 433        bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR);
 434        if (!bndcsr)
 435                goto exit_trap;
 436
 437        trace_bounds_exception_mpx(bndcsr);
 438        /*
 439         * The error code field of the BNDSTATUS register communicates status
 440         * information about a bound range exception (#BR) or an operation
 441         * involving the bound directory.
 442         */
 443        switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
 444        case 2: /* Bound directory has invalid entry. */
 445                if (mpx_handle_bd_fault())
 446                        goto exit_trap;
 447                break; /* Success, it was handled */
 448        case 1: /* Bound violation. */
 449                info = mpx_generate_siginfo(regs);
 450                if (IS_ERR(info)) {
 451                        /*
 452                         * We failed to decode the MPX instruction.  Act as if
 453                         * the exception was not caused by MPX.
 454                         */
 455                        goto exit_trap;
 456                }
 457                /*
 458                 * Success, we decoded the instruction and retrieved
 459                 * an 'info' containing the address being accessed
 460                 * which caused the exception.  This information
 461                  * allows an application to possibly handle the
 462                 * #BR exception itself.
 463                 */
 464                do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info);
 465                kfree(info);
 466                break;
 467        case 0: /* No exception caused by Intel MPX operations. */
 468                goto exit_trap;
 469        default:
 470                die("bounds", regs, error_code);
 471        }
 472
 473        return;
 474
 475exit_trap:
 476        /*
 477         * This path out is for all the cases where we could not
 478         * handle the exception in some way (like allocating a
 479         * table or telling userspace about it).  We will also end
 480         * up here if the kernel has MPX turned off at compile
 481         * time.
 482         */
 483        do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
 484}
 485
 486dotraplinkage void
 487do_general_protection(struct pt_regs *regs, long error_code)
 488{
 489        struct task_struct *tsk;
 490
 491        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 492        cond_local_irq_enable(regs);
 493
 494        if (v8086_mode(regs)) {
 495                local_irq_enable();
 496                handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 497                return;
 498        }
 499
 500        tsk = current;
 501        if (!user_mode(regs)) {
 502                if (fixup_exception(regs, X86_TRAP_GP))
 503                        return;
 504
 505                tsk->thread.error_code = error_code;
 506                tsk->thread.trap_nr = X86_TRAP_GP;
 507                if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 508                               X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 509                        die("general protection fault", regs, error_code);
 510                return;
 511        }
 512
 513        tsk->thread.error_code = error_code;
 514        tsk->thread.trap_nr = X86_TRAP_GP;
 515
 516        if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 517                        printk_ratelimit()) {
 518                pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
 519                        tsk->comm, task_pid_nr(tsk),
 520                        regs->ip, regs->sp, error_code);
 521                print_vma_addr(" in ", regs->ip);
 522                pr_cont("\n");
 523        }
 524
 525        force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
 526}
 527NOKPROBE_SYMBOL(do_general_protection);
 528
 529/* May run on IST stack. */
 530dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 531{
 532#ifdef CONFIG_DYNAMIC_FTRACE
 533        /*
 534         * ftrace must be first, everything else may cause a recursive crash.
 535         * See note by declaration of modifying_ftrace_code in ftrace.c
 536         */
 537        if (unlikely(atomic_read(&modifying_ftrace_code)) &&
 538            ftrace_int3_handler(regs))
 539                return;
 540#endif
 541        if (poke_int3_handler(regs))
 542                return;
 543
 544        ist_enter(regs);
 545        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 546#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 547        if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 548                                SIGTRAP) == NOTIFY_STOP)
 549                goto exit;
 550#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 551
 552#ifdef CONFIG_KPROBES
 553        if (kprobe_int3_handler(regs))
 554                goto exit;
 555#endif
 556
 557        if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 558                        SIGTRAP) == NOTIFY_STOP)
 559                goto exit;
 560
 561        /*
 562         * Let others (NMI) know that the debug stack is in use
 563         * as we may switch to the interrupt stack.
 564         */
 565        debug_stack_usage_inc();
 566        preempt_disable();
 567        cond_local_irq_enable(regs);
 568        do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
 569        cond_local_irq_disable(regs);
 570        preempt_enable_no_resched();
 571        debug_stack_usage_dec();
 572exit:
 573        ist_exit(regs);
 574}
 575NOKPROBE_SYMBOL(do_int3);
 576
 577#ifdef CONFIG_X86_64
 578/*
 579 * Help handler running on IST stack to switch off the IST stack if the
 580 * interrupted code was in user mode. The actual stack switch is done in
 581 * entry_64.S
 582 */
 583asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 584{
 585        struct pt_regs *regs = task_pt_regs(current);
 586        *regs = *eregs;
 587        return regs;
 588}
 589NOKPROBE_SYMBOL(sync_regs);
 590
 591struct bad_iret_stack {
 592        void *error_entry_ret;
 593        struct pt_regs regs;
 594};
 595
 596asmlinkage __visible notrace
 597struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 598{
 599        /*
 600         * This is called from entry_64.S early in handling a fault
 601         * caused by a bad iret to user mode.  To handle the fault
 602         * correctly, we want move our stack frame to task_pt_regs
 603         * and we want to pretend that the exception came from the
 604         * iret target.
 605         */
 606        struct bad_iret_stack *new_stack =
 607                container_of(task_pt_regs(current),
 608                             struct bad_iret_stack, regs);
 609
 610        /* Copy the IRET target to the new stack. */
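	/* The IRET frame is 5 qwords: RIP, CS, RFLAGS, RSP and SS. */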
 611        memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
 612
 613        /* Copy the remainder of the stack from the current stack. */
 614        memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 615
 616        BUG_ON(!user_mode(&new_stack->regs));
 617        return new_stack;
 618}
 619NOKPROBE_SYMBOL(fixup_bad_iret);
 620#endif
 621
 622static bool is_sysenter_singlestep(struct pt_regs *regs)
 623{
 624        /*
 625         * We don't try for precision here.  If we're anywhere in the region of
 626         * code that can be single-stepped in the SYSENTER entry path, then
 627         * assume that this is a useless single-step trap due to SYSENTER
 628         * being invoked with TF set.  (We don't know in advance exactly
 629         * which instructions will be hit because BTF could plausibly
 630         * be set.)
 631         */
 632#ifdef CONFIG_X86_32
 633        return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
 634                (unsigned long)__end_SYSENTER_singlestep_region -
 635                (unsigned long)__begin_SYSENTER_singlestep_region;
 636#elif defined(CONFIG_IA32_EMULATION)
 637        return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
 638                (unsigned long)__end_entry_SYSENTER_compat -
 639                (unsigned long)entry_SYSENTER_compat;
 640#else
 641        return false;
 642#endif
 643}
 644
 645/*
 646 * Our handling of the processor debug registers is non-trivial.
 647 * We do not clear them on entry and exit from the kernel. Therefore
 648 * it is possible to get a watchpoint trap here from inside the kernel.
 649 * However, the code in ./ptrace.c has ensured that the user can
 650 * only set watchpoints on userspace addresses. Therefore the in-kernel
 651 * watchpoint trap can only occur in code which is reading/writing
 652 * from user space. Such code must not hold kernel locks (since it
 653 * can equally take a page fault), therefore it is safe to call
 654 * force_sig_info even though that claims and releases locks.
 655 *
 656 * Code in ./signal.c ensures that the debug control register
 657 * is restored before we deliver any signal, and therefore that
 658 * user code runs with the correct debug control register even though
 659 * we clear it here.
 660 *
 661 * Being careful here means that we don't have to be as careful in a
 662 * lot of more complicated places (task switching can be a bit lazy
 663 * about restoring all the debug state, and ptrace doesn't have to
 664 * find every occurrence of the TF bit that could be saved away even
 665 * by user code)
 666 *
 667 * May run on IST stack.
 668 */
 669dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 670{
 671        struct task_struct *tsk = current;
 672        int user_icebp = 0;
 673        unsigned long dr6;
 674        int si_code;
 675
 676        ist_enter(regs);
 677
 678        get_debugreg(dr6, 6);
 679        /*
 680         * The Intel SDM says:
 681         *
 682         *   Certain debug exceptions may clear bits 0-3. The remaining
 683         *   contents of the DR6 register are never cleared by the
 684         *   processor. To avoid confusion in identifying debug
 685         *   exceptions, debug handlers should clear the register before
 686         *   returning to the interrupted task.
 687         *
 688         * Keep it simple: clear DR6 immediately.
 689         */
 690        set_debugreg(0, 6);
 691
 692        /* Filter out all the reserved bits which are preset to 1 */
 693        dr6 &= ~DR6_RESERVED;
 694
 695        /*
 696         * The SDM says "The processor clears the BTF flag when it
 697         * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
 698         * TIF_BLOCKSTEP in sync with the hardware BTF flag.
 699         */
 700        clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
 701
 702        if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
 703                     is_sysenter_singlestep(regs))) {
 704                dr6 &= ~DR_STEP;
 705                if (!dr6)
 706                        goto exit;
 707                /*
 708                 * else we might have gotten a single-step trap and hit a
 709                 * watchpoint at the same time, in which case we should fall
 710                 * through and handle the watchpoint.
 711                 */
 712        }
 713
 714        /*
 715         * If dr6 gives us no information about the origin of this trap,
 716         * then it's very likely the result of an icebp/int01 trap.
 717         * The user wants a sigtrap for that.
 718         */
 719        if (!dr6 && user_mode(regs))
 720                user_icebp = 1;
 721
 722        /* Catch kmemcheck conditions! */
 723        if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
 724                goto exit;
 725
 726        /* Store the virtualized DR6 value */
 727        tsk->thread.debugreg6 = dr6;
 728
 729#ifdef CONFIG_KPROBES
 730        if (kprobe_debug_handler(regs))
 731                goto exit;
 732#endif
 733
 734        if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
 735                                                        SIGTRAP) == NOTIFY_STOP)
 736                goto exit;
 737
 738        /*
 739         * Let others (NMI) know that the debug stack is in use
 740         * as we may switch to the interrupt stack.
 741         */
 742        debug_stack_usage_inc();
 743
 744        /* It's safe to allow irq's after DR6 has been saved */
 745        preempt_disable();
 746        cond_local_irq_enable(regs);
 747
 748        if (v8086_mode(regs)) {
 749                handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 750                                        X86_TRAP_DB);
 751                cond_local_irq_disable(regs);
 752                preempt_enable_no_resched();
 753                debug_stack_usage_dec();
 754                goto exit;
 755        }
 756
 757        if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
 758                /*
 759                 * Historical junk that used to handle SYSENTER single-stepping.
 760                 * This should be unreachable now.  If we survive for a while
 761                 * without anyone hitting this warning, we'll turn this into
 762                 * an oops.
 763                 */
 764                tsk->thread.debugreg6 &= ~DR_STEP;
 765                set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 766                regs->flags &= ~X86_EFLAGS_TF;
 767        }
 768        si_code = get_si_code(tsk->thread.debugreg6);
 769        if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 770                send_sigtrap(tsk, regs, error_code, si_code);
 771        cond_local_irq_disable(regs);
 772        preempt_enable_no_resched();
 773        debug_stack_usage_dec();
 774
 775exit:
 776#if defined(CONFIG_X86_32)
 777        /*
 778         * This is the most likely code path that involves non-trivial use
 779         * of the SYSENTER stack.  Check that we haven't overrun it.
 780         */
 781        WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
 782             "Overran or corrupted SYSENTER stack\n");
 783#endif
 784        ist_exit(regs);
 785}
 786NOKPROBE_SYMBOL(do_debug);
 787
 788/*
 789 * Note that we play around with the 'TS' bit in an attempt to get
 790 * the correct behaviour even in the presence of the asynchronous
 791 * IRQ13 behaviour
 792 */
 793static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 794{
 795        struct task_struct *task = current;
 796        struct fpu *fpu = &task->thread.fpu;
 797        siginfo_t info;
 798        char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 799                                                "simd exception";
 800
 801        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
 802                return;
 803        cond_local_irq_enable(regs);
 804
 805        if (!user_mode(regs)) {
 806                if (!fixup_exception(regs, trapnr)) {
 807                        task->thread.error_code = error_code;
 808                        task->thread.trap_nr = trapnr;
 809                        die(str, regs, error_code);
 810                }
 811                return;
 812        }
 813
 814        /*
 815         * Save the info for the exception handler and clear the error.
 816         */
 817        fpu__save(fpu);
 818
 819        task->thread.trap_nr    = trapnr;
 820        task->thread.error_code = error_code;
 821        info.si_signo           = SIGFPE;
 822        info.si_errno           = 0;
 823        info.si_addr            = (void __user *)uprobe_get_trap_addr(regs);
 824
 825        info.si_code = fpu__exception_code(fpu, trapnr);
 826
 827        /* Retry when we get spurious exceptions: */
 828        if (!info.si_code)
 829                return;
 830
 831        force_sig_info(SIGFPE, &info, task);
 832}
 833
 834dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 835{
 836        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 837        math_error(regs, error_code, X86_TRAP_MF);
 838}
 839
 840dotraplinkage void
 841do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 842{
 843        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 844        math_error(regs, error_code, X86_TRAP_XF);
 845}
 846
 847dotraplinkage void
 848do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 849{
 850        cond_local_irq_enable(regs);
 851}
 852
 853dotraplinkage void
 854do_device_not_available(struct pt_regs *regs, long error_code)
 855{
 856        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 857
 858#ifdef CONFIG_MATH_EMULATION
 859        if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
 860                struct math_emu_info info = { };
 861
 862                cond_local_irq_enable(regs);
 863
 864                info.regs = regs;
 865                math_emulate(&info);
 866                return;
 867        }
 868#endif
 869        fpu__restore(&current->thread.fpu); /* interrupts still off */
 870#ifdef CONFIG_X86_32
 871        cond_local_irq_enable(regs);
 872#endif
 873}
 874NOKPROBE_SYMBOL(do_device_not_available);
 875
 876#ifdef CONFIG_X86_32
 877dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 878{
 879        siginfo_t info;
 880
 881        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 882        local_irq_enable();
 883
 884        info.si_signo = SIGILL;
 885        info.si_errno = 0;
 886        info.si_code = ILL_BADSTK;
 887        info.si_addr = NULL;
 888        if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
 889                        X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
 890                do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 891                        &info);
 892        }
 893}
 894#endif
 895
 896/* Set of traps needed for early debugging. */
 897void __init early_trap_init(void)
 898{
 899        /*
 900         * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
 901         * is ready in cpu_init() <-- trap_init(). Before trap_init(),
 902         * CPU runs at ring 0 so it is impossible to hit an invalid
 903         * stack.  Using the original stack works well enough at this
 904         * early stage. DEBUG_STACK will be equipped after cpu_init() in
 905         * trap_init().
 906         *
 907         * We don't need to set trace_idt_table like set_intr_gate(),
 908         * since we don't have trace_debug and it will be reset to
 909         * 'debug' in trap_init() by set_intr_gate_ist().
 910         */
 911        set_intr_gate_notrace(X86_TRAP_DB, debug);
 912        /* int3 can be called from all */
 913        set_system_intr_gate(X86_TRAP_BP, &int3);
 914#ifdef CONFIG_X86_32
 915        set_intr_gate(X86_TRAP_PF, page_fault);
 916#endif
 917        load_idt(&idt_descr);
 918}
 919
 920void __init early_trap_pf_init(void)
 921{
 922#ifdef CONFIG_X86_64
 923        set_intr_gate(X86_TRAP_PF, page_fault);
 924#endif
 925}
 926
 927void __init trap_init(void)
 928{
 929        int i;
 930
 931#ifdef CONFIG_EISA
 932        void __iomem *p = early_ioremap(0x0FFFD9, 4);
 933
 934        if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
 935                EISA_bus = 1;
 936        early_iounmap(p, 4);
 937#endif
 938
 939        set_intr_gate(X86_TRAP_DE, divide_error);
 940        set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
 941        /* int4 can be called from all */
 942        set_system_intr_gate(X86_TRAP_OF, &overflow);
 943        set_intr_gate(X86_TRAP_BR, bounds);
 944        set_intr_gate(X86_TRAP_UD, invalid_op);
 945        set_intr_gate(X86_TRAP_NM, device_not_available);
 946#ifdef CONFIG_X86_32
 947        set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
 948#else
 949        set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
 950#endif
 951        set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
 952        set_intr_gate(X86_TRAP_TS, invalid_TSS);
 953        set_intr_gate(X86_TRAP_NP, segment_not_present);
 954        set_intr_gate(X86_TRAP_SS, stack_segment);
 955        set_intr_gate(X86_TRAP_GP, general_protection);
 956        set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
 957        set_intr_gate(X86_TRAP_MF, coprocessor_error);
 958        set_intr_gate(X86_TRAP_AC, alignment_check);
 959#ifdef CONFIG_X86_MCE
 960        set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
 961#endif
 962        set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
 963
 964        /* Reserve all the builtin and the syscall vector: */
 965        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 966                set_bit(i, used_vectors);
 967
 968#ifdef CONFIG_IA32_EMULATION
 969        set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
 970        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 971#endif
 972
 973#ifdef CONFIG_X86_32
 974        set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
 975        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 976#endif
 977
 978        /*
 979         * Set the IDT descriptor to a fixed read-only location, so that the
 980         * "sidt" instruction will not leak the location of the kernel, and
 981         * to defend the IDT against arbitrary memory write vulnerabilities.
 982         * It will be reloaded in cpu_init() */
 983        __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
 984        idt_descr.address = fix_to_virt(FIX_RO_IDT);
 985
 986        /*
 987         * Should be a barrier for any external CPU state:
 988         */
 989        cpu_init();
 990
 991        /*
 992         * X86_TRAP_DB and X86_TRAP_BP have been set
 993         * in early_trap_init(). However, IST works only after
 994         * cpu_init() loads TSS. See comments in early_trap_init().
 995         */
 996        set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
 997        /* int3 can be called from all */
 998        set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
 999
1000        x86_init.irqs.trap_init();
1001
1002#ifdef CONFIG_X86_64
1003        memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
1004        set_nmi_gate(X86_TRAP_DB, &debug);
1005        set_nmi_gate(X86_TRAP_BP, &int3);
1006#endif
1007}
1008