linux/arch/x86/kernel/traps.c
   1/*
   2 *  Copyright (C) 1991, 1992  Linus Torvalds
   3 *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
   4 *
   5 *  Pentium III FXSR, SSE support
   6 *      Gareth Hughes <gareth@valinux.com>, May 2000
   7 */
   8
   9/*
  10 * Handle hardware traps and faults.
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/context_tracking.h>
  16#include <linux/interrupt.h>
  17#include <linux/kallsyms.h>
  18#include <linux/spinlock.h>
  19#include <linux/kprobes.h>
  20#include <linux/uaccess.h>
  21#include <linux/kdebug.h>
  22#include <linux/kgdb.h>
  23#include <linux/kernel.h>
  24#include <linux/export.h>
  25#include <linux/ptrace.h>
  26#include <linux/uprobes.h>
  27#include <linux/string.h>
  28#include <linux/delay.h>
  29#include <linux/errno.h>
  30#include <linux/kexec.h>
  31#include <linux/sched.h>
  32#include <linux/sched/task_stack.h>
  33#include <linux/timer.h>
  34#include <linux/init.h>
  35#include <linux/bug.h>
  36#include <linux/nmi.h>
  37#include <linux/mm.h>
  38#include <linux/smp.h>
  39#include <linux/io.h>
  40
  41#if defined(CONFIG_EDAC)
  42#include <linux/edac.h>
  43#endif
  44#include <linux/hardirq.h>
  45#include <linux/atomic.h>
  46
  47#include <asm/stacktrace.h>
  48#include <asm/processor.h>
  49#include <asm/debugreg.h>
  50#include <asm/realmode.h>
  51#include <asm/text-patching.h>
  52#include <asm/ftrace.h>
  53#include <asm/traps.h>
  54#include <asm/desc.h>
  55#include <asm/fpu/api.h>
  56#include <asm/cpu.h>
  57#include <asm/cpu_entry_area.h>
  58#include <asm/mce.h>
  59#include <asm/fixmap.h>
  60#include <asm/mach_traps.h>
  61#include <asm/alternative.h>
  62#include <asm/fpu/xstate.h>
  63#include <asm/trace/mpx.h>
  64#include <asm/mpx.h>
  65#include <asm/vm86.h>
  66#include <asm/umip.h>
  67#include <asm/vdso.h>
  68
  69#ifdef CONFIG_X86_64
  70#include <asm/x86_init.h>
  71#include <asm/pgalloc.h>
  72#include <asm/proto.h>
  73#else
  74#include <asm/processor-flags.h>
  75#include <asm/setup.h>
  76#include <asm/proto.h>
  77#endif
  78
  79DECLARE_BITMAP(system_vectors, NR_VECTORS);
  80
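     /*
      * Re-enable/disable interrupts only if the interrupted context had them
      * enabled (EFLAGS.IF set).  A trap handler must not unconditionally turn
      * IRQs on when the fault came from an IRQs-off region.
      */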
  81static inline void cond_local_irq_enable(struct pt_regs *regs)
  82{
  83        if (regs->flags & X86_EFLAGS_IF)
  84                local_irq_enable();
  85}
  86
  87static inline void cond_local_irq_disable(struct pt_regs *regs)
  88{
  89        if (regs->flags & X86_EFLAGS_IF)
  90                local_irq_disable();
  91}
  92
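     /*
      * BUG() and WARN() are implemented with a ud2 (or ud0) instruction, so a
      * kernel address is only a plausible bug site if the two opcode bytes at
      * that address actually decode to one of those instructions.
      */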
  93int is_valid_bugaddr(unsigned long addr)
  94{
  95        unsigned short ud;
  96
  97        if (addr < TASK_SIZE_MAX)
  98                return 0;
  99
 100        if (probe_kernel_address((unsigned short *)addr, ud))
 101                return 0;
 102
 103        return ud == INSN_UD0 || ud == INSN_UD2;
 104}
 105
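     /*
      * Handle #UD hits on BUG()/WARN() sites: report_bug() looks the faulting
      * IP up in the bug table; a WARN is handled by skipping the ud2 and
      * resuming, while a real BUG (or an unrecognized #UD) is left to the
      * normal trap path.
      */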
 106int fixup_bug(struct pt_regs *regs, int trapnr)
 107{
 108        if (trapnr != X86_TRAP_UD)
 109                return 0;
 110
 111        switch (report_bug(regs->ip, regs)) {
 112        case BUG_TRAP_TYPE_NONE:
 113        case BUG_TRAP_TYPE_BUG:
 114                break;
 115
 116        case BUG_TRAP_TYPE_WARN:
 117                regs->ip += LEN_UD2;
 118                return 1;
 119        }
 120
 121        return 0;
 122}
 123
 124static nokprobe_inline int
 125do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
 126                  struct pt_regs *regs, long error_code)
 127{
 128        if (v8086_mode(regs)) {
 129                /*
 130                 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 131                 * On nmi (interrupt 2), do_trap should not be called.
 132                 */
 133                if (trapnr < X86_TRAP_UD) {
 134                        if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
 135                                                error_code, trapnr))
 136                                return 0;
 137                }
 138        } else if (!user_mode(regs)) {
 139                if (fixup_exception(regs, trapnr))
 140                        return 0;
 141
 142                tsk->thread.error_code = error_code;
 143                tsk->thread.trap_nr = trapnr;
 144                die(str, regs, error_code);
 145        } else {
 146                if (fixup_vdso_exception(regs, trapnr, error_code, 0))
 147                        return 0;
 148        }
 149
 150        /*
 151         * We want error_code and trap_nr set for userspace faults and
 152         * kernelspace faults which result in die(), but not
 153         * kernelspace faults which are fixed up.  die() gives the
 154         * process no chance to handle the signal and notice the
 155         * kernel fault information, so that won't result in polluting
 156         * the information about previously queued, but not yet
 157         * delivered, faults.  See also do_general_protection below.
 158         */
 159        tsk->thread.error_code = error_code;
 160        tsk->thread.trap_nr = trapnr;
 161
 162        return -1;
 163}
 164
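     /*
      * Rate-limited diagnostic printed when a task receives a signal it has
      * no handler for, and only if show_unhandled_signals is enabled.
      */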
 165static void show_signal(struct task_struct *tsk, int signr,
 166                        const char *type, const char *desc,
 167                        struct pt_regs *regs, long error_code)
 168{
 169        if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
 170            printk_ratelimit()) {
 171                pr_info("%s[%d] %s%s ip:%lx sp:%lx error:%lx",
 172                        tsk->comm, task_pid_nr(tsk), type, desc,
 173                        regs->ip, regs->sp, error_code);
 174                print_vma_addr(KERN_CONT " in ", regs->ip);
 175                pr_cont("\n");
 176        }
 177}
 178
 179static void
 180do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 181        long error_code, int sicode, void __user *addr)
 182{
 183        struct task_struct *tsk = current;
 184
 185        if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
 186                return;
 187
 188        show_signal(tsk, signr, "trap ", str, regs, error_code);
 189
 190        if (!sicode)
 191                force_sig(signr, tsk);
 192        else
 193                force_sig_fault(signr, sicode, addr, tsk);
 194}
 195NOKPROBE_SYMBOL(do_trap);
 196
 197static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 198        unsigned long trapnr, int signr, int sicode, void __user *addr)
 199{
 200        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 201
 202        /*
 203         * WARN*()s end up here; fix them up before we call the
 204         * notifier chain.
 205         */
 206        if (!user_mode(regs) && fixup_bug(regs, trapnr))
 207                return;
 208
 209        if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 210                        NOTIFY_STOP) {
 211                cond_local_irq_enable(regs);
 212                do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
 213        }
 214}
 215
 216#define IP ((void __user *)uprobe_get_trap_addr(regs))
 217#define DO_ERROR(trapnr, signr, sicode, addr, str, name)                   \
 218dotraplinkage void do_##name(struct pt_regs *regs, long error_code)        \
 219{                                                                          \
 220        do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
 221}
 222
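     /*
      * Each DO_ERROR() line below generates one trap handler.  For example,
      * the X86_TRAP_DE entry expands (with IP substituted) to roughly:
      *
      *   dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code)
      *   {
      *           do_error_trap(regs, error_code, "divide error", X86_TRAP_DE,
      *                         SIGFPE, FPE_INTDIV,
      *                         (void __user *)uprobe_get_trap_addr(regs));
      *   }
      */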
 223DO_ERROR(X86_TRAP_DE,     SIGFPE,  FPE_INTDIV,   IP, "divide error",        divide_error)
 224DO_ERROR(X86_TRAP_OF,     SIGSEGV,          0, NULL, "overflow",            overflow)
 225DO_ERROR(X86_TRAP_UD,     SIGILL,  ILL_ILLOPN,   IP, "invalid opcode",      invalid_op)
 226DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,           0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
 227DO_ERROR(X86_TRAP_TS,     SIGSEGV,          0, NULL, "invalid TSS",         invalid_TSS)
 228DO_ERROR(X86_TRAP_NP,     SIGBUS,           0, NULL, "segment not present", segment_not_present)
 229DO_ERROR(X86_TRAP_SS,     SIGBUS,           0, NULL, "stack segment",       stack_segment)
 230#undef IP
 231
 232dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
 233{
 234        char *str = "alignment check";
 235
 236        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 237
 238        if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
 239                return;
 240
 241        local_irq_enable();
 242
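             /*
              * The legacy #AC source (an unaligned access with CR0.AM and
              * EFLAGS.AC set) can only fire at CPL 3, so an alignment-check
              * trap taken in kernel mode can only come from split lock
              * detection.
              */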
 243        if (!user_mode(regs)) {
 244                handle_kernel_split_lock(regs, error_code);
 245                return;
 246        }
 247
 248        if (handle_user_split_lock(regs, error_code))
 249                return;
 250
 251        do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
 252                error_code, BUS_ADRALN, NULL);
 253}
 254
 255#ifdef CONFIG_VMAP_STACK
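     /*
      * With CONFIG_VMAP_STACK, kernel stacks are vmalloc-backed and surrounded
      * by guard pages, so an overflow faults instead of silently corrupting
      * adjacent memory.  This is the terminal reporting path for that case.
      */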
 256__visible void __noreturn handle_stack_overflow(const char *message,
 257                                                struct pt_regs *regs,
 258                                                unsigned long fault_address)
 259{
 260        printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
 261                 (void *)fault_address, current->stack,
 262                 (char *)current->stack + THREAD_SIZE - 1);
 263        die(message, regs, 0);
 264
 265        /* Be absolutely certain we don't return. */
 266        panic(message);
 267}
 268#endif
 269
 270#ifdef CONFIG_X86_64
 271/* Runs on IST stack */
 272dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 273{
 274        static const char str[] = "double fault";
 275        struct task_struct *tsk = current;
 276#ifdef CONFIG_VMAP_STACK
 277        unsigned long cr2;
 278#endif
 279
 280#ifdef CONFIG_X86_ESPFIX64
 281        extern unsigned char native_irq_return_iret[];
 282
 283        /*
 284         * If IRET takes a non-IST fault on the espfix64 stack, then we
 285         * end up promoting it to a doublefault.  In that case, take
 286         * advantage of the fact that we're not using the normal (TSS.sp0)
 287         * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
 288         * and then modify our own IRET frame so that, when we return,
 289         * we land directly at the #GP(0) vector with the stack already
 290         * set up according to its expectations.
 291         *
 292         * The net result is that our #GP handler will think that we
 293         * entered from usermode with the bad user context.
 294         *
 295         * No need for nmi_enter() here because we don't use RCU.
 296         */
 297        if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
 298                regs->cs == __KERNEL_CS &&
 299                regs->ip == (unsigned long)native_irq_return_iret)
 300        {
 301                struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 302
 303                /*
 304                 * regs->sp points to the failing IRET frame on the
 305                 * ESPFIX64 stack.  Copy it to the entry stack.  This fills
 306                 * in gpregs->ss through gpregs->ip.
 307                 *
 308                 */
 309                memmove(&gpregs->ip, (void *)regs->sp, 5*8);
 310                gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */
 311
 312                /*
 313                 * Adjust our frame so that we return straight to the #GP
 314                 * vector with the expected RSP value.  This is safe because
  315                 * we won't enable interrupts or schedule before we invoke
 316                 * general_protection, so nothing will clobber the stack
 317                 * frame we just set up.
 318                 *
 319                 * We will enter general_protection with kernel GSBASE,
 320                 * which is what the stub expects, given that the faulting
 321                 * RIP will be the IRET instruction.
 322                 */
 323                regs->ip = (unsigned long)general_protection;
 324                regs->sp = (unsigned long)&gpregs->orig_ax;
 325
 326                return;
 327        }
 328#endif
 329
 330        nmi_enter();
 331        notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 332
 333        tsk->thread.error_code = error_code;
 334        tsk->thread.trap_nr = X86_TRAP_DF;
 335
 336#ifdef CONFIG_VMAP_STACK
 337        /*
 338         * If we overflow the stack into a guard page, the CPU will fail
 339         * to deliver #PF and will send #DF instead.  Similarly, if we
 340         * take any non-IST exception while too close to the bottom of
 341         * the stack, the processor will get a page fault while
 342         * delivering the exception and will generate a double fault.
 343         *
 344         * According to the SDM (footnote in 6.15 under "Interrupt 14 -
  345         * Page-Fault Exception (#PF)"):
 346         *
 347         *   Processors update CR2 whenever a page fault is detected. If a
 348         *   second page fault occurs while an earlier page fault is being
 349         *   delivered, the faulting linear address of the second fault will
 350         *   overwrite the contents of CR2 (replacing the previous
 351         *   address). These updates to CR2 occur even if the page fault
 352         *   results in a double fault or occurs during the delivery of a
 353         *   double fault.
 354         *
 355         * The logic below has a small possibility of incorrectly diagnosing
 356         * some errors as stack overflows.  For example, if the IDT or GDT
 357         * gets corrupted such that #GP delivery fails due to a bad descriptor
 358         * causing #GP and we hit this condition while CR2 coincidentally
 359         * points to the stack guard page, we'll think we overflowed the
 360         * stack.  Given that we're going to panic one way or another
 361         * if this happens, this isn't necessarily worth fixing.
 362         *
 363         * If necessary, we could improve the test by only diagnosing
 364         * a stack overflow if the saved RSP points within 47 bytes of
 365         * the bottom of the stack: if RSP == tsk_stack + 48 and we
 366         * take an exception, the stack is already aligned and there
  367         * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
 368         * possible error code, so a stack overflow would *not* double
 369         * fault.  With any less space left, exception delivery could
 370         * fail, and, as a practical matter, we've overflowed the
 371         * stack even if the actual trigger for the double fault was
 372         * something else.
 373         */
 374        cr2 = read_cr2();
 375        if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
 376                handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
 377#endif
 378
 379#ifdef CONFIG_DOUBLEFAULT
 380        df_debug(regs, error_code);
 381#endif
 382        /*
 383         * This is always a kernel trap and never fixable (and thus must
 384         * never return).
 385         */
 386        for (;;)
 387                die(str, regs, error_code);
 388}
 389#endif
 390
 391dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 392{
 393        const struct mpx_bndcsr *bndcsr;
 394
 395        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 396        if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 397                        X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 398                return;
 399        cond_local_irq_enable(regs);
 400
 401        if (!user_mode(regs))
 402                die("bounds", regs, error_code);
 403
 404        if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
 405                /* The exception is not from Intel MPX */
 406                goto exit_trap;
 407        }
 408
 409        /*
 410         * We need to look at BNDSTATUS to resolve this exception.
  411         * A NULL here might mean that it is in its 'init state' (all
  412         * zeros), which would indicate that MPX was not responsible
  413         * for the exception.
 414         */
 415        bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
 416        if (!bndcsr)
 417                goto exit_trap;
 418
 419        trace_bounds_exception_mpx(bndcsr);
 420        /*
 421         * The error code field of the BNDSTATUS register communicates status
 422         * information of a bound range exception #BR or operation involving
 423         * bound directory.
 424         */
 425        switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
 426        case 2: /* Bound directory has invalid entry. */
 427                if (mpx_handle_bd_fault())
 428                        goto exit_trap;
 429                break; /* Success, it was handled */
 430        case 1: /* Bound violation. */
 431        {
 432                struct task_struct *tsk = current;
 433                struct mpx_fault_info mpx;
 434
 435                if (mpx_fault_info(&mpx, regs)) {
 436                        /*
 437                         * We failed to decode the MPX instruction.  Act as if
 438                         * the exception was not caused by MPX.
 439                         */
 440                        goto exit_trap;
 441                }
 442                /*
 443                 * Success, we decoded the instruction and retrieved
 444                 * an 'mpx' containing the address being accessed
 445                 * which caused the exception.  This information
  446                 * allows an application to possibly handle the
 447                 * #BR exception itself.
 448                 */
 449                if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs,
 450                                       error_code))
 451                        break;
 452
 453                show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code);
 454
 455                force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper);
 456                break;
 457        }
 458        case 0: /* No exception caused by Intel MPX operations. */
 459                goto exit_trap;
 460        default:
 461                die("bounds", regs, error_code);
 462        }
 463
 464        return;
 465
 466exit_trap:
 467        /*
 468         * This path out is for all the cases where we could not
 469         * handle the exception in some way (like allocating a
  470         * table or telling userspace about it).  We will also end
 471         * up here if the kernel has MPX turned off at compile
  472         * time.
 473         */
 474        do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
 475}
 476
 477dotraplinkage void
 478do_general_protection(struct pt_regs *regs, long error_code)
 479{
 480        const char *desc = "general protection fault";
 481        struct task_struct *tsk;
 482
 483        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 484        cond_local_irq_enable(regs);
 485
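             /*
              * UMIP (User-Mode Instruction Prevention) makes SGDT, SIDT, SLDT,
              * SMSW and STR raise #GP in user mode; fixup_umip_exception()
              * emulates them so that legacy user space keeps working.
              */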
 486        if (static_cpu_has(X86_FEATURE_UMIP)) {
 487                if (user_mode(regs) && fixup_umip_exception(regs))
 488                        return;
 489        }
 490
 491        if (v8086_mode(regs)) {
 492                local_irq_enable();
 493                handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 494                return;
 495        }
 496
 497        tsk = current;
 498        if (!user_mode(regs)) {
 499                if (fixup_exception(regs, X86_TRAP_GP))
 500                        return;
 501
 502                tsk->thread.error_code = error_code;
 503                tsk->thread.trap_nr = X86_TRAP_GP;
 504                if (notify_die(DIE_GPF, desc, regs, error_code,
 505                               X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 506                        die(desc, regs, error_code);
 507                return;
 508        }
 509
 510        tsk->thread.error_code = error_code;
 511        tsk->thread.trap_nr = X86_TRAP_GP;
 512
 513        if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
 514                return;
 515
 516        show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
 517
 518        force_sig(SIGSEGV, tsk);
 519}
 520NOKPROBE_SYMBOL(do_general_protection);
 521
 522dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 523{
 524#ifdef CONFIG_DYNAMIC_FTRACE
 525        /*
 526         * ftrace must be first, everything else may cause a recursive crash.
 527         * See note by declaration of modifying_ftrace_code in ftrace.c
 528         */
 529        if (unlikely(atomic_read(&modifying_ftrace_code)) &&
 530            ftrace_int3_handler(regs))
 531                return;
 532#endif
 533        if (poke_int3_handler(regs))
 534                return;
 535
 536        /*
 537         * Unlike any other non-IST entry, we can be called from pretty much
 538         * any location in the kernel through kprobes -- text_poke() will most
 539         * likely be handled by poke_int3_handler() above. This means this
 540         * handler is effectively NMI-like.
 541         */
 542        if (!user_mode(regs))
 543                nmi_enter();
 544
 545#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 546        if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 547                                SIGTRAP) == NOTIFY_STOP)
 548                goto exit;
 549#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
 550
 551#ifdef CONFIG_KPROBES
 552        if (kprobe_int3_handler(regs))
 553                goto exit;
 554#endif
 555
 556        if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 557                        SIGTRAP) == NOTIFY_STOP)
 558                goto exit;
 559
 560        cond_local_irq_enable(regs);
 561        do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
 562        cond_local_irq_disable(regs);
 563
 564exit:
 565        if (!user_mode(regs))
 566                nmi_exit();
 567}
 568NOKPROBE_SYMBOL(do_int3);
 569
 570#ifdef CONFIG_X86_64
 571/*
 572 * Help handler running on a per-cpu (IST or entry trampoline) stack
 573 * to switch to the normal thread stack if the interrupted code was in
 574 * user mode. The actual stack switch is done in entry_64.S
 575 */
 576asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 577{
 578        struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
 579        if (regs != eregs)
 580                *regs = *eregs;
 581        return regs;
 582}
 583NOKPROBE_SYMBOL(sync_regs);
 584
 585#ifdef CONFIG_AMD_MEM_ENCRYPT
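     /*
      * #VC (VMM Communication Exception) is raised in SEV-ES guests and enters
      * on an IST stack.  This helper moves the handler off that IST stack so a
      * nested #VC cannot clobber the original exception frame: it picks a safe
      * stack (or the VC2 fall-back stack) and copies pt_regs there.
      */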
 586asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
 587{
 588        unsigned long sp, *stack;
 589        struct stack_info info;
 590        struct pt_regs *regs_ret;
 591
 592        /*
 593         * In the SYSCALL entry path the RSP value comes from user-space - don't
 594         * trust it and switch to the current kernel stack
 595         */
 596        if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
 597            regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack) {
 598                sp = this_cpu_read(cpu_current_top_of_stack);
 599                goto sync;
 600        }
 601
 602        /*
 603         * From here on the RSP value is trusted. Now check whether entry
  604         * happened from a safe stack. The entry and unknown stacks are not
  605         * safe; use the fall-back stack instead in that case.
 606         */
 607        sp    = regs->sp;
 608        stack = (unsigned long *)sp;
 609
 610        if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
 611            info.type >= STACK_TYPE_EXCEPTION_LAST)
 612                sp = __this_cpu_ist_top_va(VC2);
 613
 614sync:
 615        /*
 616         * Found a safe stack - switch to it as if the entry didn't happen via
 617         * IST stack. The code below only copies pt_regs, the real switch happens
 618         * in assembly code.
 619         */
 620        sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
 621
 622        regs_ret = (struct pt_regs *)sp;
 623        *regs_ret = *regs;
 624
 625        return regs_ret;
 626}
 627#endif
 628
 629struct bad_iret_stack {
 630        void *error_entry_ret;
 631        struct pt_regs regs;
 632};
 633
 634asmlinkage __visible notrace
 635struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 636{
 637        /*
 638         * This is called from entry_64.S early in handling a fault
 639         * caused by a bad iret to user mode.  To handle the fault
 640         * correctly, we want to move our stack frame to where it would
 641         * be had we entered directly on the entry stack (rather than
 642         * just below the IRET frame) and we want to pretend that the
 643         * exception came from the IRET target.
 644         */
 645        struct bad_iret_stack *new_stack =
 646                (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 647
 648        /* Copy the IRET target to the new stack. */
 649        memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
 650
 651        /* Copy the remainder of the stack from the current stack. */
 652        memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 653
 654        BUG_ON(!user_mode(&new_stack->regs));
 655        return new_stack;
 656}
 657NOKPROBE_SYMBOL(fixup_bad_iret);
 658#endif
 659
 660static bool is_sysenter_singlestep(struct pt_regs *regs)
 661{
 662        /*
 663         * We don't try for precision here.  If we're anywhere in the region of
 664         * code that can be single-stepped in the SYSENTER entry path, then
 665         * assume that this is a useless single-step trap due to SYSENTER
 666         * being invoked with TF set.  (We don't know in advance exactly
 667         * which instructions will be hit because BTF could plausibly
 668         * be set.)
 669         */
 670#ifdef CONFIG_X86_32
 671        return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) <
 672                (unsigned long)__end_SYSENTER_singlestep_region -
 673                (unsigned long)__begin_SYSENTER_singlestep_region;
 674#elif defined(CONFIG_IA32_EMULATION)
 675        return (regs->ip - (unsigned long)entry_SYSENTER_compat) <
 676                (unsigned long)__end_entry_SYSENTER_compat -
 677                (unsigned long)entry_SYSENTER_compat;
 678#else
 679        return false;
 680#endif
 681}
 682
 683static __always_inline unsigned long debug_read_clear_dr6(void)
 684{
 685        unsigned long dr6;
 686
 687        /*
 688         * The Intel SDM says:
 689         *
 690         *   Certain debug exceptions may clear bits 0-3. The remaining
 691         *   contents of the DR6 register are never cleared by the
 692         *   processor. To avoid confusion in identifying debug
 693         *   exceptions, debug handlers should clear the register before
 694         *   returning to the interrupted task.
 695         *
 696         * Keep it simple: clear DR6 immediately.
 697         */
 698        get_debugreg(dr6, 6);
 699        set_debugreg(DR6_RESERVED, 6);
 700        dr6 ^= DR6_RESERVED; /* Flip to positive polarity */
 701
 702        return dr6;
 703}
 704
 705/*
 706 * Our handling of the processor debug registers is non-trivial.
 707 * We do not clear them on entry and exit from the kernel. Therefore
 708 * it is possible to get a watchpoint trap here from inside the kernel.
 709 * However, the code in ./ptrace.c has ensured that the user can
 710 * only set watchpoints on userspace addresses. Therefore the in-kernel
 711 * watchpoint trap can only occur in code which is reading/writing
 712 * from user space. Such code must not hold kernel locks (since it
 713 * can equally take a page fault), therefore it is safe to call
 714 * force_sig_info even though that claims and releases locks.
 715 *
 716 * Code in ./signal.c ensures that the debug control register
 717 * is restored before we deliver any signal, and therefore that
 718 * user code runs with the correct debug control register even though
 719 * we clear it here.
 720 *
 721 * Being careful here means that we don't have to be as careful in a
 722 * lot of more complicated places (task switching can be a bit lazy
 723 * about restoring all the debug state, and ptrace doesn't have to
 724 * find every occurrence of the TF bit that could be saved away even
 725 * by user code)
 726 *
 727 * May run on IST stack.
 728 */
 729dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 730{
 731        struct task_struct *tsk = current;
 732        unsigned long dr6 = debug_read_clear_dr6(), dr7 = 0;
 733        int user_icebp = 0;
 734        int si_code;
 735
 736        /*
 737         * Disable breakpoints during exception handling; recursive exceptions
 738         * are exceedingly 'fun'.
 739         *
 740         * Since this function is NOKPROBE, and that also applies to
 741         * HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
 742         * HW_BREAKPOINT_W on our stack)
 743         *
  744         * Entry text is excluded for HW_BP_X, and cpu_entry_area (which
  745         * includes the entry stack) is excluded for everything.
 746         */
 747        /*
 748         * NB: We can't easily clear DR7 if we're coming from user mode because
 749         * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
 750         * user memory, etc.  This means that a recursive #DB is possible.  If
 751         * this happens, that #DB will hit exc_debug_kernel() and clear DR7.
 752         * Since we're not on the IST stack right now, everything will be
 753         * fine.
 754         */
 755        if (!user_mode(regs))
 756                dr7 = local_db_save();
 757
 758        nmi_enter();
 759
 760        /*
 761         * The SDM says "The processor clears the BTF flag when it
 762         * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
 763         * TIF_BLOCKSTEP in sync with the hardware BTF flag.
 764         */
 765        clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
 766
 767        if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
 768                     is_sysenter_singlestep(regs))) {
 769                dr6 &= ~DR_STEP;
 770                if (!dr6)
 771                        goto exit;
 772                /*
 773                 * else we might have gotten a single-step trap and hit a
 774                 * watchpoint at the same time, in which case we should fall
 775                 * through and handle the watchpoint.
 776                 */
 777        }
 778
 779        /*
  780         * If DR6 gives us no reason for the origin of this trap, then it
  781         * is very likely the result of an icebp/int01 trap, and user space
  782         * wants a SIGTRAP for that.
 783         */
 784        if (!dr6 && user_mode(regs))
 785                user_icebp = 1;
 786
 787        /* Store the virtualized DR6 value */
 788        tsk->thread.debugreg6 = dr6;
 789
 790#ifdef CONFIG_KPROBES
 791        if (kprobe_debug_handler(regs))
 792                goto exit;
 793#endif
 794
 795        if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
 796                       SIGTRAP) == NOTIFY_STOP)
 797                goto exit;
 798
 799        /*
 800         * Let others (NMI) know that the debug stack is in use
 801         * as we may switch to the interrupt stack.
 802         */
 803        debug_stack_usage_inc();
 804
 805        /* It's safe to allow irq's after DR6 has been saved */
 806        cond_local_irq_enable(regs);
 807
 808        if (v8086_mode(regs)) {
 809                handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 810                                        X86_TRAP_DB);
 811                cond_local_irq_disable(regs);
 812                debug_stack_usage_dec();
 813                goto exit;
 814        }
 815
 816        /* #DB for bus lock can only be triggered from userspace. */
 817        if (dr6 & DR_BUS_LOCK)
 818                handle_bus_lock(regs);
 819
 820        if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
 821                /*
 822                 * Historical junk that used to handle SYSENTER single-stepping.
 823                 * This should be unreachable now.  If we survive for a while
 824                 * without anyone hitting this warning, we'll turn this into
 825                 * an oops.
 826                 */
 827                tsk->thread.debugreg6 &= ~DR_STEP;
 828                set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 829                regs->flags &= ~X86_EFLAGS_TF;
 830        }
 831        si_code = get_si_code(tsk->thread.debugreg6);
 832        if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 833                send_sigtrap(tsk, regs, error_code, si_code);
 834        cond_local_irq_disable(regs);
 835        debug_stack_usage_dec();
 836
 837exit:
 838        nmi_exit();
 839
 840        if (!user_mode(regs))
 841                local_db_restore(dr7);
 842}
 843NOKPROBE_SYMBOL(do_debug);
 844
 845/*
 846 * Note that we play around with the 'TS' bit in an attempt to get
 847 * the correct behaviour even in the presence of the asynchronous
 848 * IRQ13 behaviour
 849 */
 850static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 851{
 852        struct task_struct *task = current;
 853        struct fpu *fpu = &task->thread.fpu;
 854        int si_code;
 855        char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 856                                                "simd exception";
 857
 858        cond_local_irq_enable(regs);
 859
 860        if (!user_mode(regs)) {
 861                if (fixup_exception(regs, trapnr))
 862                        return;
 863
 864                task->thread.error_code = error_code;
 865                task->thread.trap_nr = trapnr;
 866
 867                if (notify_die(DIE_TRAP, str, regs, error_code,
 868                                        trapnr, SIGFPE) != NOTIFY_STOP)
 869                        die(str, regs, error_code);
 870                return;
 871        }
 872
 873        /*
 874         * Synchronize the FPU register state to the memory register state
 875         * if necessary. This allows the exception handler to inspect it.
 876         */
 877        fpu_sync_fpstate(fpu);
 878
 879        task->thread.trap_nr    = trapnr;
 880        task->thread.error_code = error_code;
 881
 882        si_code = fpu__exception_code(fpu, trapnr);
 883        /* Retry when we get spurious exceptions: */
 884        if (!si_code)
 885                return;
 886
 887        if (fixup_vdso_exception(regs, trapnr, 0, 0))
 888                return;
 889
 890        force_sig_fault(SIGFPE, si_code,
 891                        (void __user *)uprobe_get_trap_addr(regs), task);
 892}
 893
 894dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 895{
 896        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 897        math_error(regs, error_code, X86_TRAP_MF);
 898}
 899
 900dotraplinkage void
 901do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 902{
 903        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 904        math_error(regs, error_code, X86_TRAP_XF);
 905}
 906
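     /*
      * Vector 15 is architecturally reserved; this handler exists because some
      * ancient CPUs/interrupt controllers could reportedly deliver it
      * spuriously, so it deliberately does nothing beyond the conditional IRQ
      * enable.
      */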
 907dotraplinkage void
 908do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 909{
 910        cond_local_irq_enable(regs);
 911}
 912
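     /*
      * XFD (eXtended Feature Disable) arms #NM for dynamically-enabled XSAVE
      * features such as AMX tile state: the first use of a disarmed feature
      * traps here, MSR_IA32_XFD_ERR identifies which feature, and
      * xfd_enable_feature() then tries to enable it for the task.
      */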
 913static bool handle_xfd_event(struct pt_regs *regs)
 914{
 915        struct task_struct *task = current;
 916        u64 xfd_err;
 917        int err;
 918
 919        if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD))
 920                return false;
 921
 922        rdmsrl(MSR_IA32_XFD_ERR, xfd_err);
 923        if (!xfd_err)
 924                return false;
 925
 926        wrmsrl(MSR_IA32_XFD_ERR, 0);
 927
 928        /* Die if that happens in kernel space */
 929        if (WARN_ON(!user_mode(regs)))
 930                return false;
 931
 932        local_irq_enable();
 933
 934        err = xfd_enable_feature(xfd_err);
 935
 936        switch (err) {
 937        case -EPERM:
 938                force_sig_fault(SIGILL, ILL_ILLOPC,
 939                                (void __user *)uprobe_get_trap_addr(regs), task);
 940                break;
 941        case -EFAULT:
 942                force_sig(SIGSEGV, task);
 943                break;
 944        }
 945
 946        local_irq_disable();
 947        return true;
 948}
 949
 950dotraplinkage void
 951do_device_not_available(struct pt_regs *regs, long error_code)
 952{
 953        unsigned long cr0;
 954
 955        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 956
 957        if (handle_xfd_event(regs))
 958                return;
 959
 960#ifdef CONFIG_MATH_EMULATION
 961        if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
 962                struct math_emu_info info = { };
 963
 964                cond_local_irq_enable(regs);
 965
 966                info.regs = regs;
 967                math_emulate(&info);
 968                return;
 969        }
 970#endif
 971
 972        /* This should not happen. */
 973        cr0 = read_cr0();
 974        if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
 975                /* Try to fix it up and carry on. */
 976                write_cr0(cr0 & ~X86_CR0_TS);
 977        } else {
 978                /*
 979                 * Something terrible happened, and we're better off trying
 980                 * to kill the task than getting stuck in a never-ending
 981                 * loop of #NM faults.
 982                 */
 983                die("unexpected #NM exception", regs, error_code);
 984        }
 985}
 986NOKPROBE_SYMBOL(do_device_not_available);
 987
 988#ifdef CONFIG_X86_32
 989dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 990{
 991        RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 992        local_irq_enable();
 993
 994        if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
 995                        X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
 996                do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 997                        ILL_BADSTK, (void __user *)NULL);
 998        }
 999}
1000#endif
1001
1002void __init trap_init(void)
1003{
1004        /* Init cpu_entry_area before IST entries are set up */
1005        setup_cpu_entry_areas();
1006
1007        /* Init GHCB memory pages when running as an SEV-ES guest */
1008        sev_es_init_vc_handling();
1009
1010        idt_setup_traps();
1011
1012        /*
1013         * Set the IDT descriptor to a fixed read-only location, so that the
1014         * "sidt" instruction will not leak the location of the kernel, and
1015         * to defend the IDT against arbitrary memory write vulnerabilities.
1016         * It will be reloaded in cpu_init() */
1017        cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
1018                    PAGE_KERNEL_RO);
1019        idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
1020
1021        /*
1022         * Should be a barrier for any external CPU state:
1023         */
1024        cpu_init();
1025
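             /*
              * The IST-based vectors (#DB, NMI, #DF, #MC) reference per-CPU
              * exception stacks configured in the TSS by cpu_init() above, so
              * they are installed only at this point.
              */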
1026        idt_setup_ist_traps();
1027
1028        idt_setup_debugidt_traps();
1029}
1030