linux/arch/x86/mm/fault.c
<<
>>
Prefs
   1/*
   2 *  Copyright (C) 1995  Linus Torvalds
   3 *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
   4 *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
   5 */
   6#include <linux/magic.h>                /* STACK_END_MAGIC              */
   7#include <linux/sched.h>                /* test_thread_flag(), ...      */
   8#include <linux/kdebug.h>               /* oops_begin/end, ...          */
   9#include <linux/module.h>               /* search_exception_table       */
  10#include <linux/bootmem.h>              /* max_low_pfn                  */
  11#include <linux/kprobes.h>              /* __kprobes, ...               */
  12#include <linux/mmiotrace.h>            /* kmmio_handler, ...           */
  13#include <linux/perf_event.h>           /* perf_sw_event                */
  14#include <linux/hugetlb.h>              /* hstate_index_to_shift        */
  15#include <linux/prefetch.h>             /* prefetchw                    */
  16#include <linux/context_tracking.h>     /* exception_enter(), ...       */
  17
  18#include <asm/traps.h>                  /* dotraplinkage, ...           */
  19#include <asm/pgalloc.h>                /* pgd_*(), ...                 */
  20#include <asm/kmemcheck.h>              /* kmemcheck_*(), ...           */
  21#include <asm/fixmap.h>                 /* VSYSCALL_START               */
  22
  23/*
  24 * Page fault error code bits:
  25 *
  26 *   bit 0 ==    0: no page found       1: protection fault
  27 *   bit 1 ==    0: read access         1: write access
  28 *   bit 2 ==    0: kernel-mode access  1: user-mode access
  29 *   bit 3 ==                           1: use of reserved bit detected
  30 *   bit 4 ==                           1: fault was an instruction fetch
  31 */
  32enum x86_pf_error_code {
  33
  34        PF_PROT         =               1 << 0,
  35        PF_WRITE        =               1 << 1,
  36        PF_USER         =               1 << 2,
  37        PF_RSVD         =               1 << 3,
  38        PF_INSTR        =               1 << 4,
  39};
  40
  41/*
  42 * Returns 0 if mmiotrace is disabled, or if the fault is not
  43 * handled by mmiotrace:
  44 */
  45static inline int __kprobes
  46kmmio_fault(struct pt_regs *regs, unsigned long addr)
  47{
  48        if (unlikely(is_kmmio_active()))
  49                if (kmmio_handler(regs, addr) == 1)
  50                        return -1;
  51        return 0;
  52}
  53
  54static inline int __kprobes notify_page_fault(struct pt_regs *regs)
  55{
  56        int ret = 0;
  57
  58        /* kprobe_running() needs smp_processor_id() */
  59        if (kprobes_built_in() && !user_mode_vm(regs)) {
  60                preempt_disable();
  61                if (kprobe_running() && kprobe_fault_handler(regs, 14))
  62                        ret = 1;
  63                preempt_enable();
  64        }
  65
  66        return ret;
  67}
  68
  69/*
  70 * Prefetch quirks:
  71 *
  72 * 32-bit mode:
  73 *
  74 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  75 *   Check that here and ignore it.
  76 *
  77 * 64-bit mode:
  78 *
  79 *   Sometimes the CPU reports invalid exceptions on prefetch.
  80 *   Check that here and ignore it.
  81 *
  82 * Opcode checker based on code by Richard Brunner.
  83 */
  84static inline int
  85check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
  86                      unsigned char opcode, int *prefetch)
  87{
  88        unsigned char instr_hi = opcode & 0xf0;
  89        unsigned char instr_lo = opcode & 0x0f;
  90
  91        switch (instr_hi) {
  92        case 0x20:
  93        case 0x30:
  94                /*
  95                 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
  96                 * In X86_64 long mode, the CPU will signal invalid
  97                 * opcode if some of these prefixes are present so
  98                 * X86_64 will never get here anyway
  99                 */
 100                return ((instr_lo & 7) == 0x6);
 101#ifdef CONFIG_X86_64
 102        case 0x40:
 103                /*
 104                 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
 105                 * Need to figure out under what instruction mode the
 106                 * instruction was issued. Could check the LDT for lm,
 107                 * but for now it's good enough to assume that long
 108                 * mode only uses well known segments or kernel.
 109                 */
 110                return (!user_mode(regs) || user_64bit_mode(regs));
 111#endif
 112        case 0x60:
 113                /* 0x64 thru 0x67 are valid prefixes in all modes. */
 114                return (instr_lo & 0xC) == 0x4;
 115        case 0xF0:
 116                /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 117                return !instr_lo || (instr_lo>>1) == 1;
 118        case 0x00:
 119                /* Prefetch instruction is 0x0F0D or 0x0F18 */
 120                if (probe_kernel_address(instr, opcode))
 121                        return 0;
 122
 123                *prefetch = (instr_lo == 0xF) &&
 124                        (opcode == 0x0D || opcode == 0x18);
 125                return 0;
 126        default:
 127                return 0;
 128        }
 129}
 130
 131static int
 132is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 133{
 134        unsigned char *max_instr;
 135        unsigned char *instr;
 136        int prefetch = 0;
 137
 138        /*
 139         * If it was a exec (instruction fetch) fault on NX page, then
 140         * do not ignore the fault:
 141         */
 142        if (error_code & PF_INSTR)
 143                return 0;
 144
 145        instr = (void *)convert_ip_to_linear(current, regs);
 146        max_instr = instr + 15;
 147
 148        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
 149                return 0;
 150
 151        while (instr < max_instr) {
 152                unsigned char opcode;
 153
 154                if (probe_kernel_address(instr, opcode))
 155                        break;
 156
 157                instr++;
 158
 159                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
 160                        break;
 161        }
 162        return prefetch;
 163}
 164
 165static void
 166force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 167                     struct task_struct *tsk, int fault)
 168{
 169        unsigned lsb = 0;
 170        siginfo_t info;
 171
 172        info.si_signo   = si_signo;
 173        info.si_errno   = 0;
 174        info.si_code    = si_code;
 175        info.si_addr    = (void __user *)address;
 176        if (fault & VM_FAULT_HWPOISON_LARGE)
 177                lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 
 178        if (fault & VM_FAULT_HWPOISON)
 179                lsb = PAGE_SHIFT;
 180        info.si_addr_lsb = lsb;
 181
 182        force_sig_info(si_signo, &info, tsk);
 183}
 184
 185DEFINE_SPINLOCK(pgd_lock);
 186LIST_HEAD(pgd_list);
 187
 188#ifdef CONFIG_X86_32
 189static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 190{
 191        unsigned index = pgd_index(address);
 192        pgd_t *pgd_k;
 193        pud_t *pud, *pud_k;
 194        pmd_t *pmd, *pmd_k;
 195
 196        pgd += index;
 197        pgd_k = init_mm.pgd + index;
 198
 199        if (!pgd_present(*pgd_k))
 200                return NULL;
 201
 202        /*
 203         * set_pgd(pgd, *pgd_k); here would be useless on PAE
 204         * and redundant with the set_pmd() on non-PAE. As would
 205         * set_pud.
 206         */
 207        pud = pud_offset(pgd, address);
 208        pud_k = pud_offset(pgd_k, address);
 209        if (!pud_present(*pud_k))
 210                return NULL;
 211
 212        pmd = pmd_offset(pud, address);
 213        pmd_k = pmd_offset(pud_k, address);
 214        if (!pmd_present(*pmd_k))
 215                return NULL;
 216
 217        if (!pmd_present(*pmd))
 218                set_pmd(pmd, *pmd_k);
 219        else
 220                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
 221
 222        return pmd_k;
 223}
 224
 225void vmalloc_sync_all(void)
 226{
 227        unsigned long address;
 228
 229        if (SHARED_KERNEL_PMD)
 230                return;
 231
 232        for (address = VMALLOC_START & PMD_MASK;
 233             address >= TASK_SIZE && address < FIXADDR_TOP;
 234             address += PMD_SIZE) {
 235                struct page *page;
 236
 237                spin_lock(&pgd_lock);
 238                list_for_each_entry(page, &pgd_list, lru) {
 239                        spinlock_t *pgt_lock;
 240                        pmd_t *ret;
 241
 242                        /* the pgt_lock only for Xen */
 243                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 244
 245                        spin_lock(pgt_lock);
 246                        ret = vmalloc_sync_one(page_address(page), address);
 247                        spin_unlock(pgt_lock);
 248
 249                        if (!ret)
 250                                break;
 251                }
 252                spin_unlock(&pgd_lock);
 253        }
 254}
 255
 256/*
 257 * 32-bit:
 258 *
 259 *   Handle a fault on the vmalloc or module mapping area
 260 */
 261static noinline __kprobes int vmalloc_fault(unsigned long address)
 262{
 263        unsigned long pgd_paddr;
 264        pmd_t *pmd_k;
 265        pte_t *pte_k;
 266
 267        /* Make sure we are in vmalloc area: */
 268        if (!(address >= VMALLOC_START && address < VMALLOC_END))
 269                return -1;
 270
 271        WARN_ON_ONCE(in_nmi());
 272
 273        /*
 274         * Synchronize this task's top level page-table
 275         * with the 'reference' page table.
 276         *
 277         * Do _not_ use "current" here. We might be inside
 278         * an interrupt in the middle of a task switch..
 279         */
 280        pgd_paddr = read_cr3();
 281        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 282        if (!pmd_k)
 283                return -1;
 284
 285        pte_k = pte_offset_kernel(pmd_k, address);
 286        if (!pte_present(*pte_k))
 287                return -1;
 288
 289        return 0;
 290}
 291
 292/*
 293 * Did it hit the DOS screen memory VA from vm86 mode?
 294 */
 295static inline void
 296check_v8086_mode(struct pt_regs *regs, unsigned long address,
 297                 struct task_struct *tsk)
 298{
 299        unsigned long bit;
 300
 301        if (!v8086_mode(regs))
 302                return;
 303
 304        bit = (address - 0xA0000) >> PAGE_SHIFT;
 305        if (bit < 32)
 306                tsk->thread.screen_bitmap |= 1 << bit;
 307}
 308
 309static bool low_pfn(unsigned long pfn)
 310{
 311        return pfn < max_low_pfn;
 312}
 313
 314static void dump_pagetable(unsigned long address)
 315{
 316        pgd_t *base = __va(read_cr3());
 317        pgd_t *pgd = &base[pgd_index(address)];
 318        pmd_t *pmd;
 319        pte_t *pte;
 320
 321#ifdef CONFIG_X86_PAE
 322        printk("*pdpt = %016Lx ", pgd_val(*pgd));
 323        if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
 324                goto out;
 325#endif
 326        pmd = pmd_offset(pud_offset(pgd, address), address);
 327        printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 328
 329        /*
 330         * We must not directly access the pte in the highpte
 331         * case if the page table is located in highmem.
 332         * And let's rather not kmap-atomic the pte, just in case
 333         * it's allocated already:
 334         */
 335        if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
 336                goto out;
 337
 338        pte = pte_offset_kernel(pmd, address);
 339        printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
 340out:
 341        printk("\n");
 342}
 343
 344#else /* CONFIG_X86_64: */
 345
 346void vmalloc_sync_all(void)
 347{
 348        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 349}
 350
 351/*
 352 * 64-bit:
 353 *
 354 *   Handle a fault on the vmalloc area
 355 *
 356 * This assumes no large pages in there.
 357 */
 358static noinline __kprobes int vmalloc_fault(unsigned long address)
 359{
 360        pgd_t *pgd, *pgd_ref;
 361        pud_t *pud, *pud_ref;
 362        pmd_t *pmd, *pmd_ref;
 363        pte_t *pte, *pte_ref;
 364
 365        /* Make sure we are in vmalloc area: */
 366        if (!(address >= VMALLOC_START && address < VMALLOC_END))
 367                return -1;
 368
 369        WARN_ON_ONCE(in_nmi());
 370
 371        /*
 372         * Copy kernel mappings over when needed. This can also
 373         * happen within a race in page table update. In the later
 374         * case just flush:
 375         */
 376        pgd = pgd_offset(current->active_mm, address);
 377        pgd_ref = pgd_offset_k(address);
 378        if (pgd_none(*pgd_ref))
 379                return -1;
 380
 381        if (pgd_none(*pgd)) {
 382                set_pgd(pgd, *pgd_ref);
 383                arch_flush_lazy_mmu_mode();
 384        } else {
 385                BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 386        }
 387
 388        /*
 389         * Below here mismatches are bugs because these lower tables
 390         * are shared:
 391         */
 392
 393        pud = pud_offset(pgd, address);
 394        pud_ref = pud_offset(pgd_ref, address);
 395        if (pud_none(*pud_ref))
 396                return -1;
 397
 398        if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
 399                BUG();
 400
 401        pmd = pmd_offset(pud, address);
 402        pmd_ref = pmd_offset(pud_ref, address);
 403        if (pmd_none(*pmd_ref))
 404                return -1;
 405
 406        if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
 407                BUG();
 408
 409        pte_ref = pte_offset_kernel(pmd_ref, address);
 410        if (!pte_present(*pte_ref))
 411                return -1;
 412
 413        pte = pte_offset_kernel(pmd, address);
 414
 415        /*
 416         * Don't use pte_page here, because the mappings can point
 417         * outside mem_map, and the NUMA hash lookup cannot handle
 418         * that:
 419         */
 420        if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 421                BUG();
 422
 423        return 0;
 424}
 425
 426#ifdef CONFIG_CPU_SUP_AMD
 427static const char errata93_warning[] =
 428KERN_ERR 
 429"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
 430"******* Working around it, but it may cause SEGVs or burn power.\n"
 431"******* Please consider a BIOS update.\n"
 432"******* Disabling USB legacy in the BIOS may also help.\n";
 433#endif
 434
 435/*
 436 * No vm86 mode in 64-bit mode:
 437 */
 438static inline void
 439check_v8086_mode(struct pt_regs *regs, unsigned long address,
 440                 struct task_struct *tsk)
 441{
 442}
 443
 444static int bad_address(void *p)
 445{
 446        unsigned long dummy;
 447
 448        return probe_kernel_address((unsigned long *)p, dummy);
 449}
 450
 451static void dump_pagetable(unsigned long address)
 452{
 453        pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
 454        pgd_t *pgd = base + pgd_index(address);
 455        pud_t *pud;
 456        pmd_t *pmd;
 457        pte_t *pte;
 458
 459        if (bad_address(pgd))
 460                goto bad;
 461
 462        printk("PGD %lx ", pgd_val(*pgd));
 463
 464        if (!pgd_present(*pgd))
 465                goto out;
 466
 467        pud = pud_offset(pgd, address);
 468        if (bad_address(pud))
 469                goto bad;
 470
 471        printk("PUD %lx ", pud_val(*pud));
 472        if (!pud_present(*pud) || pud_large(*pud))
 473                goto out;
 474
 475        pmd = pmd_offset(pud, address);
 476        if (bad_address(pmd))
 477                goto bad;
 478
 479        printk("PMD %lx ", pmd_val(*pmd));
 480        if (!pmd_present(*pmd) || pmd_large(*pmd))
 481                goto out;
 482
 483        pte = pte_offset_kernel(pmd, address);
 484        if (bad_address(pte))
 485                goto bad;
 486
 487        printk("PTE %lx", pte_val(*pte));
 488out:
 489        printk("\n");
 490        return;
 491bad:
 492        printk("BAD\n");
 493}
 494
 495#endif /* CONFIG_X86_64 */
 496
 497/*
 498 * Workaround for K8 erratum #93 & buggy BIOS.
 499 *
 500 * BIOS SMM functions are required to use a specific workaround
 501 * to avoid corruption of the 64bit RIP register on C stepping K8.
 502 *
 503 * A lot of BIOS that didn't get tested properly miss this.
 504 *
 505 * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 506 * Try to work around it here.
 507 *
 508 * Note we only handle faults in kernel here.
 509 * Does nothing on 32-bit.
 510 */
 511static int is_errata93(struct pt_regs *regs, unsigned long address)
 512{
 513#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
 514        if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
 515            || boot_cpu_data.x86 != 0xf)
 516                return 0;
 517
 518        if (address != regs->ip)
 519                return 0;
 520
 521        if ((address >> 32) != 0)
 522                return 0;
 523
 524        address |= 0xffffffffUL << 32;
 525        if ((address >= (u64)_stext && address <= (u64)_etext) ||
 526            (address >= MODULES_VADDR && address <= MODULES_END)) {
 527                printk_once(errata93_warning);
 528                regs->ip = address;
 529                return 1;
 530        }
 531#endif
 532        return 0;
 533}
 534
 535/*
 536 * Work around K8 erratum #100 K8 in compat mode occasionally jumps
 537 * to illegal addresses >4GB.
 538 *
 539 * We catch this in the page fault handler because these addresses
 540 * are not reachable. Just detect this case and return.  Any code
 541 * segment in LDT is compatibility mode.
 542 */
 543static int is_errata100(struct pt_regs *regs, unsigned long address)
 544{
 545#ifdef CONFIG_X86_64
 546        if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
 547                return 1;
 548#endif
 549        return 0;
 550}
 551
 552static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 553{
 554#ifdef CONFIG_X86_F00F_BUG
 555        unsigned long nr;
 556
 557        /*
 558         * Pentium F0 0F C7 C8 bug workaround:
 559         */
 560        if (boot_cpu_has_bug(X86_BUG_F00F)) {
 561                nr = (address - idt_descr.address) >> 3;
 562
 563                if (nr == 6) {
 564                        do_invalid_op(regs, 0);
 565                        return 1;
 566                }
 567        }
 568#endif
 569        return 0;
 570}
 571
 572static const char nx_warning[] = KERN_CRIT
 573"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
 574
 575static void
 576show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 577                unsigned long address)
 578{
 579        if (!oops_may_print())
 580                return;
 581
 582        if (error_code & PF_INSTR) {
 583                unsigned int level;
 584
 585                pte_t *pte = lookup_address(address, &level);
 586
 587                if (pte && pte_present(*pte) && !pte_exec(*pte))
 588                        printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 589        }
 590
 591        printk(KERN_ALERT "BUG: unable to handle kernel ");
 592        if (address < PAGE_SIZE)
 593                printk(KERN_CONT "NULL pointer dereference");
 594        else
 595                printk(KERN_CONT "paging request");
 596
 597        printk(KERN_CONT " at %p\n", (void *) address);
 598        printk(KERN_ALERT "IP:");
 599        printk_address(regs->ip, 1);
 600
 601        dump_pagetable(address);
 602}
 603
 604static noinline void
 605pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 606            unsigned long address)
 607{
 608        struct task_struct *tsk;
 609        unsigned long flags;
 610        int sig;
 611
 612        flags = oops_begin();
 613        tsk = current;
 614        sig = SIGKILL;
 615
 616        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 617               tsk->comm, address);
 618        dump_pagetable(address);
 619
 620        tsk->thread.cr2         = address;
 621        tsk->thread.trap_nr     = X86_TRAP_PF;
 622        tsk->thread.error_code  = error_code;
 623
 624        if (__die("Bad pagetable", regs, error_code))
 625                sig = 0;
 626
 627        oops_end(flags, regs, sig);
 628}
 629
 630static noinline void
 631no_context(struct pt_regs *regs, unsigned long error_code,
 632           unsigned long address, int signal, int si_code)
 633{
 634        struct task_struct *tsk = current;
 635        unsigned long *stackend;
 636        unsigned long flags;
 637        int sig;
 638
 639        /* Are we prepared to handle this kernel fault? */
 640        if (fixup_exception(regs)) {
 641                if (current_thread_info()->sig_on_uaccess_error && signal) {
 642                        tsk->thread.trap_nr = X86_TRAP_PF;
 643                        tsk->thread.error_code = error_code | PF_USER;
 644                        tsk->thread.cr2 = address;
 645
 646                        /* XXX: hwpoison faults will set the wrong code. */
 647                        force_sig_info_fault(signal, si_code, address, tsk, 0);
 648                }
 649                return;
 650        }
 651
 652        /*
 653         * 32-bit:
 654         *
 655         *   Valid to do another page fault here, because if this fault
 656         *   had been triggered by is_prefetch fixup_exception would have
 657         *   handled it.
 658         *
 659         * 64-bit:
 660         *
 661         *   Hall of shame of CPU/BIOS bugs.
 662         */
 663        if (is_prefetch(regs, error_code, address))
 664                return;
 665
 666        if (is_errata93(regs, address))
 667                return;
 668
 669        /*
 670         * Oops. The kernel tried to access some bad page. We'll have to
 671         * terminate things with extreme prejudice:
 672         */
 673        flags = oops_begin();
 674
 675        show_fault_oops(regs, error_code, address);
 676
 677        stackend = end_of_stack(tsk);
 678        if (tsk != &init_task && *stackend != STACK_END_MAGIC)
 679                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 680
 681        tsk->thread.cr2         = address;
 682        tsk->thread.trap_nr     = X86_TRAP_PF;
 683        tsk->thread.error_code  = error_code;
 684
 685        sig = SIGKILL;
 686        if (__die("Oops", regs, error_code))
 687                sig = 0;
 688
 689        /* Executive summary in case the body of the oops scrolled away */
 690        printk(KERN_DEFAULT "CR2: %016lx\n", address);
 691
 692        oops_end(flags, regs, sig);
 693}
 694
 695/*
 696 * Print out info about fatal segfaults, if the show_unhandled_signals
 697 * sysctl is set:
 698 */
 699static inline void
 700show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 701                unsigned long address, struct task_struct *tsk)
 702{
 703        if (!unhandled_signal(tsk, SIGSEGV))
 704                return;
 705
 706        if (!printk_ratelimit())
 707                return;
 708
 709        printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 710                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 711                tsk->comm, task_pid_nr(tsk), address,
 712                (void *)regs->ip, (void *)regs->sp, error_code);
 713
 714        print_vma_addr(KERN_CONT " in ", regs->ip);
 715
 716        printk(KERN_CONT "\n");
 717}
 718
 719static void
 720__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 721                       unsigned long address, int si_code)
 722{
 723        struct task_struct *tsk = current;
 724
 725        /* User mode accesses just cause a SIGSEGV */
 726        if (error_code & PF_USER) {
 727                /*
 728                 * It's possible to have interrupts off here:
 729                 */
 730                local_irq_enable();
 731
 732                /*
 733                 * Valid to do another page fault here because this one came
 734                 * from user space:
 735                 */
 736                if (is_prefetch(regs, error_code, address))
 737                        return;
 738
 739                if (is_errata100(regs, address))
 740                        return;
 741
 742#ifdef CONFIG_X86_64
 743                /*
 744                 * Instruction fetch faults in the vsyscall page might need
 745                 * emulation.
 746                 */
 747                if (unlikely((error_code & PF_INSTR) &&
 748                             ((address & ~0xfff) == VSYSCALL_START))) {
 749                        if (emulate_vsyscall(regs, address))
 750                                return;
 751                }
 752#endif
 753                /* Kernel addresses are always protection faults: */
 754                if (address >= TASK_SIZE)
 755                        error_code |= PF_PROT;
 756
 757                if (likely(show_unhandled_signals))
 758                        show_signal_msg(regs, error_code, address, tsk);
 759
 760                tsk->thread.cr2         = address;
 761                tsk->thread.error_code  = error_code;
 762                tsk->thread.trap_nr     = X86_TRAP_PF;
 763
 764                force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 765
 766                return;
 767        }
 768
 769        if (is_f00f_bug(regs, address))
 770                return;
 771
 772        no_context(regs, error_code, address, SIGSEGV, si_code);
 773}
 774
 775static noinline void
 776bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 777                     unsigned long address)
 778{
 779        __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
 780}
 781
 782static void
 783__bad_area(struct pt_regs *regs, unsigned long error_code,
 784           unsigned long address, int si_code)
 785{
 786        struct mm_struct *mm = current->mm;
 787
 788        /*
 789         * Something tried to access memory that isn't in our memory map..
 790         * Fix it, but check if it's kernel or user first..
 791         */
 792        up_read(&mm->mmap_sem);
 793
 794        __bad_area_nosemaphore(regs, error_code, address, si_code);
 795}
 796
 797static noinline void
 798bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 799{
 800        __bad_area(regs, error_code, address, SEGV_MAPERR);
 801}
 802
 803static noinline void
 804bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 805                      unsigned long address)
 806{
 807        __bad_area(regs, error_code, address, SEGV_ACCERR);
 808}
 809
 810static void
 811do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 812          unsigned int fault)
 813{
 814        struct task_struct *tsk = current;
 815        struct mm_struct *mm = tsk->mm;
 816        int code = BUS_ADRERR;
 817
 818        up_read(&mm->mmap_sem);
 819
 820        /* Kernel mode? Handle exceptions or die: */
 821        if (!(error_code & PF_USER)) {
 822                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 823                return;
 824        }
 825
 826        /* User-space => ok to do another page fault: */
 827        if (is_prefetch(regs, error_code, address))
 828                return;
 829
 830        tsk->thread.cr2         = address;
 831        tsk->thread.error_code  = error_code;
 832        tsk->thread.trap_nr     = X86_TRAP_PF;
 833
 834#ifdef CONFIG_MEMORY_FAILURE
 835        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 836                printk(KERN_ERR
 837        "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 838                        tsk->comm, tsk->pid, address);
 839                code = BUS_MCEERR_AR;
 840        }
 841#endif
 842        force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 843}
 844
 845static noinline int
 846mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 847               unsigned long address, unsigned int fault)
 848{
 849        /*
 850         * Pagefault was interrupted by SIGKILL. We have no reason to
 851         * continue pagefault.
 852         */
 853        if (fatal_signal_pending(current)) {
 854                if (!(fault & VM_FAULT_RETRY))
 855                        up_read(&current->mm->mmap_sem);
 856                if (!(error_code & PF_USER))
 857                        no_context(regs, error_code, address, 0, 0);
 858                return 1;
 859        }
 860        if (!(fault & VM_FAULT_ERROR))
 861                return 0;
 862
 863        if (fault & VM_FAULT_OOM) {
 864                /* Kernel mode? Handle exceptions or die: */
 865                if (!(error_code & PF_USER)) {
 866                        up_read(&current->mm->mmap_sem);
 867                        no_context(regs, error_code, address,
 868                                   SIGSEGV, SEGV_MAPERR);
 869                        return 1;
 870                }
 871
 872                up_read(&current->mm->mmap_sem);
 873
 874                /*
 875                 * We ran out of memory, call the OOM killer, and return the
 876                 * userspace (which will retry the fault, or kill us if we got
 877                 * oom-killed):
 878                 */
 879                pagefault_out_of_memory();
 880        } else {
 881                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
 882                             VM_FAULT_HWPOISON_LARGE))
 883                        do_sigbus(regs, error_code, address, fault);
 884                else
 885                        BUG();
 886        }
 887        return 1;
 888}
 889
 890static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 891{
 892        if ((error_code & PF_WRITE) && !pte_write(*pte))
 893                return 0;
 894
 895        if ((error_code & PF_INSTR) && !pte_exec(*pte))
 896                return 0;
 897
 898        return 1;
 899}
 900
 901/*
 902 * Handle a spurious fault caused by a stale TLB entry.
 903 *
 904 * This allows us to lazily refresh the TLB when increasing the
 905 * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
 906 * eagerly is very expensive since that implies doing a full
 907 * cross-processor TLB flush, even if no stale TLB entries exist
 908 * on other processors.
 909 *
 910 * There are no security implications to leaving a stale TLB when
 911 * increasing the permissions on a page.
 912 */
 913static noinline __kprobes int
 914spurious_fault(unsigned long error_code, unsigned long address)
 915{
 916        pgd_t *pgd;
 917        pud_t *pud;
 918        pmd_t *pmd;
 919        pte_t *pte;
 920        int ret;
 921
 922        /* Reserved-bit violation or user access to kernel space? */
 923        if (error_code & (PF_USER | PF_RSVD))
 924                return 0;
 925
 926        pgd = init_mm.pgd + pgd_index(address);
 927        if (!pgd_present(*pgd))
 928                return 0;
 929
 930        pud = pud_offset(pgd, address);
 931        if (!pud_present(*pud))
 932                return 0;
 933
 934        if (pud_large(*pud))
 935                return spurious_fault_check(error_code, (pte_t *) pud);
 936
 937        pmd = pmd_offset(pud, address);
 938        if (!pmd_present(*pmd))
 939                return 0;
 940
 941        if (pmd_large(*pmd))
 942                return spurious_fault_check(error_code, (pte_t *) pmd);
 943
 944        pte = pte_offset_kernel(pmd, address);
 945        if (!pte_present(*pte))
 946                return 0;
 947
 948        ret = spurious_fault_check(error_code, pte);
 949        if (!ret)
 950                return 0;
 951
 952        /*
 953         * Make sure we have permissions in PMD.
 954         * If not, then there's a bug in the page tables:
 955         */
 956        ret = spurious_fault_check(error_code, (pte_t *) pmd);
 957        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 958
 959        return ret;
 960}
 961
/*
 * Global flag, enabled (1) by default.  It is not referenced in this
 * part of the file.
 * NOTE(review): presumably controls whether signals a task does not
 * handle are logged -- confirm against its users.
 */
int show_unhandled_signals = 1;
 963
 964static inline int
 965access_error(unsigned long error_code, struct vm_area_struct *vma)
 966{
 967        if (error_code & PF_WRITE) {
 968                /* write, present and write, not present: */
 969                if (unlikely(!(vma->vm_flags & VM_WRITE)))
 970                        return 1;
 971                return 0;
 972        }
 973
 974        /* read, present: */
 975        if (unlikely(error_code & PF_PROT))
 976                return 1;
 977
 978        /* read, not present: */
 979        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
 980                return 1;
 981
 982        return 0;
 983}
 984
 985static int fault_in_kernel_space(unsigned long address)
 986{
 987        return address >= TASK_SIZE_MAX;
 988}
 989
 990static inline bool smap_violation(int error_code, struct pt_regs *regs)
 991{
 992        if (error_code & PF_USER)
 993                return false;
 994
 995        if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
 996                return false;
 997
 998        return true;
 999}
1000
1001/*
1002 * This routine handles page faults.  It determines the address,
1003 * and the problem, and then passes it off to one of the appropriate
1004 * routines.
1005 */
1006static void __kprobes
1007__do_page_fault(struct pt_regs *regs, unsigned long error_code)
1008{
1009        struct vm_area_struct *vma;
1010        struct task_struct *tsk;
1011        unsigned long address;
1012        struct mm_struct *mm;
1013        int fault;
1014        int write = error_code & PF_WRITE;
1015        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
1016                                        (write ? FAULT_FLAG_WRITE : 0);
1017
1018        tsk = current;
1019        mm = tsk->mm;
1020
1021        /* Get the faulting address: */
1022        address = read_cr2();
1023
1024        /*
1025         * Detect and handle instructions that would cause a page fault for
1026         * both a tracked kernel page and a userspace page.
1027         */
1028        if (kmemcheck_active(regs))
1029                kmemcheck_hide(regs);
1030        prefetchw(&mm->mmap_sem);
1031
1032        if (unlikely(kmmio_fault(regs, address)))
1033                return;
1034
1035        /*
1036         * We fault-in kernel-space virtual memory on-demand. The
1037         * 'reference' page table is init_mm.pgd.
1038         *
1039         * NOTE! We MUST NOT take any locks for this case. We may
1040         * be in an interrupt or a critical region, and should
1041         * only copy the information from the master page table,
1042         * nothing more.
1043         *
1044         * This verifies that the fault happens in kernel space
1045         * (error_code & 4) == 0, and that the fault was not a
1046         * protection error (error_code & 9) == 0.
1047         */
1048        if (unlikely(fault_in_kernel_space(address))) {
1049                if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1050                        if (vmalloc_fault(address) >= 0)
1051                                return;
1052
1053                        if (kmemcheck_fault(regs, address, error_code))
1054                                return;
1055                }
1056
1057                /* Can handle a stale RO->RW TLB: */
1058                if (spurious_fault(error_code, address))
1059                        return;
1060
1061                /* kprobes don't want to hook the spurious faults: */
1062                if (notify_page_fault(regs))
1063                        return;
1064                /*
1065                 * Don't take the mm semaphore here. If we fixup a prefetch
1066                 * fault we could otherwise deadlock:
1067                 */
1068                bad_area_nosemaphore(regs, error_code, address);
1069
1070                return;
1071        }
1072
1073        /* kprobes don't want to hook the spurious faults: */
1074        if (unlikely(notify_page_fault(regs)))
1075                return;
1076        /*
1077         * It's safe to allow irq's after cr2 has been saved and the
1078         * vmalloc fault has been handled.
1079         *
1080         * User-mode registers count as a user access even for any
1081         * potential system fault or CPU buglet:
1082         */
1083        if (user_mode_vm(regs)) {
1084                local_irq_enable();
1085                error_code |= PF_USER;
1086        } else {
1087                if (regs->flags & X86_EFLAGS_IF)
1088                        local_irq_enable();
1089        }
1090
1091        if (unlikely(error_code & PF_RSVD))
1092                pgtable_bad(regs, error_code, address);
1093
1094        if (static_cpu_has(X86_FEATURE_SMAP)) {
1095                if (unlikely(smap_violation(error_code, regs))) {
1096                        bad_area_nosemaphore(regs, error_code, address);
1097                        return;
1098                }
1099        }
1100
1101        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1102
1103        /*
1104         * If we're in an interrupt, have no user context or are running
1105         * in an atomic region then we must not take the fault:
1106         */
1107        if (unlikely(in_atomic() || !mm)) {
1108                bad_area_nosemaphore(regs, error_code, address);
1109                return;
1110        }
1111
1112        /*
1113         * When running in the kernel we expect faults to occur only to
1114         * addresses in user space.  All other faults represent errors in
1115         * the kernel and should generate an OOPS.  Unfortunately, in the
1116         * case of an erroneous fault occurring in a code path which already
1117         * holds mmap_sem we will deadlock attempting to validate the fault
1118         * against the address space.  Luckily the kernel only validly
1119         * references user space from well defined areas of code, which are
1120         * listed in the exceptions table.
1121         *
1122         * As the vast majority of faults will be valid we will only perform
1123         * the source reference check when there is a possibility of a
1124         * deadlock. Attempt to lock the address space, if we cannot we then
1125         * validate the source. If this is invalid we can skip the address
1126         * space check, thus avoiding the deadlock:
1127         */
1128        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1129                if ((error_code & PF_USER) == 0 &&
1130                    !search_exception_tables(regs->ip)) {
1131                        bad_area_nosemaphore(regs, error_code, address);
1132                        return;
1133                }
1134retry:
1135                down_read(&mm->mmap_sem);
1136        } else {
1137                /*
1138                 * The above down_read_trylock() might have succeeded in
1139                 * which case we'll have missed the might_sleep() from
1140                 * down_read():
1141                 */
1142                might_sleep();
1143        }
1144
1145        vma = find_vma(mm, address);
1146        if (unlikely(!vma)) {
1147                bad_area(regs, error_code, address);
1148                return;
1149        }
1150        if (likely(vma->vm_start <= address))
1151                goto good_area;
1152        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1153                bad_area(regs, error_code, address);
1154                return;
1155        }
1156        if (error_code & PF_USER) {
1157                /*
1158                 * Accessing the stack below %sp is always a bug.
1159                 * The large cushion allows instructions like enter
1160                 * and pusha to work. ("enter $65535, $31" pushes
1161                 * 32 pointers and then decrements %sp by 65535.)
1162                 */
1163                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1164                        bad_area(regs, error_code, address);
1165                        return;
1166                }
1167        }
1168        if (unlikely(expand_stack(vma, address))) {
1169                bad_area(regs, error_code, address);
1170                return;
1171        }
1172
1173        /*
1174         * Ok, we have a good vm_area for this memory access, so
1175         * we can handle it..
1176         */
1177good_area:
1178        if (unlikely(access_error(error_code, vma))) {
1179                bad_area_access_error(regs, error_code, address);
1180                return;
1181        }
1182
1183        /*
1184         * If for any reason at all we couldn't handle the fault,
1185         * make sure we exit gracefully rather than endlessly redo
1186         * the fault:
1187         */
1188        fault = handle_mm_fault(mm, vma, address, flags);
1189
1190        if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1191                if (mm_fault_error(regs, error_code, address, fault))
1192                        return;
1193        }
1194
1195        /*
1196         * Major/minor page fault accounting is only done on the
1197         * initial attempt. If we go through a retry, it is extremely
1198         * likely that the page will be found in page cache at that point.
1199         */
1200        if (flags & FAULT_FLAG_ALLOW_RETRY) {
1201                if (fault & VM_FAULT_MAJOR) {
1202                        tsk->maj_flt++;
1203                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
1204                                      regs, address);
1205                } else {
1206                        tsk->min_flt++;
1207                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
1208                                      regs, address);
1209                }
1210                if (fault & VM_FAULT_RETRY) {
1211                        /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1212                         * of starvation. */
1213                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
1214                        flags |= FAULT_FLAG_TRIED;
1215                        goto retry;
1216                }
1217        }
1218
1219        check_v8086_mode(regs, address, tsk);
1220
1221        up_read(&mm->mmap_sem);
1222}
1223
1224dotraplinkage void __kprobes
1225do_page_fault(struct pt_regs *regs, unsigned long error_code)
1226{
1227        enum ctx_state prev_state;
1228
1229        prev_state = exception_enter();
1230        __do_page_fault(regs, error_code);
1231        exception_exit(prev_state);
1232}
1233