1
2
3
4
5
6#include <linux/sched.h>
7#include <linux/kdebug.h>
8#include <linux/module.h>
9#include <linux/bootmem.h>
10#include <linux/kprobes.h>
11#include <linux/mmiotrace.h>
12#include <linux/perf_event.h>
13#include <linux/hugetlb.h>
14#include <linux/prefetch.h>
15#include <linux/context_tracking.h>
16#include <linux/uaccess.h>
17
18#include <asm/traps.h>
19#include <asm/pgalloc.h>
20#include <asm/kmemcheck.h>
21#include <asm/fixmap.h>
22#include <asm/vsyscall.h>
23#include <asm/vm86.h>
24
25#define CREATE_TRACE_POINTS
26#include <asm/trace/exceptions.h>
27
28
29
30
31
32
33
34
35
36
/*
 * Bits of the error code that accompanies a page fault.  The rest of
 * this file tests them as follows:
 *
 *   PF_PROT   set: protection violation on a present page
 *             clear: the page was not present
 *   PF_WRITE  the access was a write
 *   PF_USER   the access was made from user mode
 *   PF_RSVD   a reserved bit was found set in a paging structure
 *   PF_INSTR  the access was an instruction fetch
 */
enum x86_pf_error_code {
	PF_PROT		= 0x01,
	PF_WRITE	= 0x02,
	PF_USER		= 0x04,
	PF_RSVD		= 0x08,
	PF_INSTR	= 0x10,
};
45
46
47
48
49
50static nokprobe_inline int
51kmmio_fault(struct pt_regs *regs, unsigned long addr)
52{
53 if (unlikely(is_kmmio_active()))
54 if (kmmio_handler(regs, addr) == 1)
55 return -1;
56 return 0;
57}
58
59static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
60{
61 int ret = 0;
62
63
64 if (kprobes_built_in() && !user_mode(regs)) {
65 preempt_disable();
66 if (kprobe_running() && kprobe_fault_handler(regs, 14))
67 ret = 1;
68 preempt_enable();
69 }
70
71 return ret;
72}
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/*
 * Classify one instruction byte while is_prefetch() scans an instruction.
 *
 * @regs:     register state, used to tell 64-bit from other modes
 * @instr:    address of the byte following @opcode
 * @opcode:   the byte just read
 * @prefetch: set to non-zero when a 0x0F 0x0D / 0x0F 0x18 sequence
 *            (a prefetch instruction) is recognised
 *
 * Returns non-zero when @opcode is an instruction prefix and scanning
 * should continue, 0 when scanning must stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char lo = opcode & 0x0f;
	unsigned char second;

	switch (opcode >> 4) {
	case 0x2:
	case 0x3:
		/* 0x26, 0x2E, 0x36 and 0x3E are the only prefixes here. */
		return lo == 0x6 || lo == 0xE;
#ifdef CONFIG_X86_64
	case 0x4:
		/*
		 * 0x40-0x4F count as prefixes unless the fault came from
		 * user mode that is not 64-bit user mode.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x6:
		/* 0x64 through 0x67 are prefixes. */
		return lo >= 0x4 && lo <= 0x7;
	case 0xF:
		/* 0xF0, 0xF2 and 0xF3 are prefixes. */
		return lo == 0x0 || lo == 0x2 || lo == 0x3;
	case 0x0:
		/* Two-byte opcode: look at the byte after the 0x0F escape. */
		if (probe_kernel_address(instr, second))
			return 0;

		*prefetch = (lo == 0xF) &&
			    (second == 0x0D || second == 0x18);
		return 0;
	default:
		return 0;
	}
}
135
136static int
137is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
138{
139 unsigned char *max_instr;
140 unsigned char *instr;
141 int prefetch = 0;
142
143
144
145
146
147 if (error_code & PF_INSTR)
148 return 0;
149
150 instr = (void *)convert_ip_to_linear(current, regs);
151 max_instr = instr + 15;
152
153 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
154 return 0;
155
156 while (instr < max_instr) {
157 unsigned char opcode;
158
159 if (probe_kernel_address(instr, opcode))
160 break;
161
162 instr++;
163
164 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
165 break;
166 }
167 return prefetch;
168}
169
170static void
171force_sig_info_fault(int si_signo, int si_code, unsigned long address,
172 struct task_struct *tsk, int fault)
173{
174 unsigned lsb = 0;
175 siginfo_t info;
176
177 info.si_signo = si_signo;
178 info.si_errno = 0;
179 info.si_code = si_code;
180 info.si_addr = (void __user *)address;
181 if (fault & VM_FAULT_HWPOISON_LARGE)
182 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
183 if (fault & VM_FAULT_HWPOISON)
184 lsb = PAGE_SHIFT;
185 info.si_addr_lsb = lsb;
186
187 force_sig_info(si_signo, &info, tsk);
188}
189
/*
 * pgd_list links page-directory pages through page->lru; pgd_lock is
 * held while the list is walked (see the 32-bit vmalloc_sync_all()).
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
192
193#ifdef CONFIG_X86_32
194static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
195{
196 unsigned index = pgd_index(address);
197 pgd_t *pgd_k;
198 pud_t *pud, *pud_k;
199 pmd_t *pmd, *pmd_k;
200
201 pgd += index;
202 pgd_k = init_mm.pgd + index;
203
204 if (!pgd_present(*pgd_k))
205 return NULL;
206
207
208
209
210
211
212 pud = pud_offset(pgd, address);
213 pud_k = pud_offset(pgd_k, address);
214 if (!pud_present(*pud_k))
215 return NULL;
216
217 pmd = pmd_offset(pud, address);
218 pmd_k = pmd_offset(pud_k, address);
219 if (!pmd_present(*pmd_k))
220 return NULL;
221
222 if (!pmd_present(*pmd))
223 set_pmd(pmd, *pmd_k);
224 else
225 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
226
227 return pmd_k;
228}
229
230void vmalloc_sync_all(void)
231{
232 unsigned long address;
233
234 if (SHARED_KERNEL_PMD)
235 return;
236
237 for (address = VMALLOC_START & PMD_MASK;
238 address >= TASK_SIZE && address < FIXADDR_TOP;
239 address += PMD_SIZE) {
240 struct page *page;
241
242 spin_lock(&pgd_lock);
243 list_for_each_entry(page, &pgd_list, lru) {
244 spinlock_t *pgt_lock;
245 pmd_t *ret;
246
247
248 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
249
250 spin_lock(pgt_lock);
251 ret = vmalloc_sync_one(page_address(page), address);
252 spin_unlock(pgt_lock);
253
254 if (!ret)
255 break;
256 }
257 spin_unlock(&pgd_lock);
258 }
259}
260
261
262
263
264
265
266static noinline int vmalloc_fault(unsigned long address)
267{
268 unsigned long pgd_paddr;
269 pmd_t *pmd_k;
270 pte_t *pte_k;
271
272
273 if (!(address >= VMALLOC_START && address < VMALLOC_END))
274 return -1;
275
276 WARN_ON_ONCE(in_nmi());
277
278
279
280
281
282
283
284
285 pgd_paddr = read_cr3();
286 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
287 if (!pmd_k)
288 return -1;
289
290 pte_k = pte_offset_kernel(pmd_k, address);
291 if (!pte_present(*pte_k))
292 return -1;
293
294 return 0;
295}
296NOKPROBE_SYMBOL(vmalloc_fault);
297
298
299
300
/*
 * Record which of the 32 pages starting at 0xA0000 a vm86 task touched.
 *
 * Only acts when the fault came from v8086 mode and the task has vm86
 * state.  Bit n of screen_bitmap is set for a fault in page n of that
 * window (presumably the legacy video memory window - confirm against
 * the vm86 documentation).
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	/* Addresses below 0xA0000 wrap to a huge value and fail the test. */
	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		/* Unsigned constant: bit 31 must not shift into an int's sign bit. */
		tsk->thread.vm86->screen_bitmap |= 1UL << bit;
#endif
}
316
317static bool low_pfn(unsigned long pfn)
318{
319 return pfn < max_low_pfn;
320}
321
322static void dump_pagetable(unsigned long address)
323{
324 pgd_t *base = __va(read_cr3());
325 pgd_t *pgd = &base[pgd_index(address)];
326 pmd_t *pmd;
327 pte_t *pte;
328
329#ifdef CONFIG_X86_PAE
330 printk("*pdpt = %016Lx ", pgd_val(*pgd));
331 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
332 goto out;
333#endif
334 pmd = pmd_offset(pud_offset(pgd, address), address);
335 printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
336
337
338
339
340
341
342
343 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
344 goto out;
345
346 pte = pte_offset_kernel(pmd, address);
347 printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
348out:
349 printk("\n");
350}
351
352#else
353
354void vmalloc_sync_all(void)
355{
356 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
357}
358
359
360
361
362
363
364
365
366static noinline int vmalloc_fault(unsigned long address)
367{
368 pgd_t *pgd, *pgd_ref;
369 pud_t *pud, *pud_ref;
370 pmd_t *pmd, *pmd_ref;
371 pte_t *pte, *pte_ref;
372
373
374 if (!(address >= VMALLOC_START && address < VMALLOC_END))
375 return -1;
376
377 WARN_ON_ONCE(in_nmi());
378
379
380
381
382
383
384 pgd = pgd_offset(current->active_mm, address);
385 pgd_ref = pgd_offset_k(address);
386 if (pgd_none(*pgd_ref))
387 return -1;
388
389 if (pgd_none(*pgd)) {
390 set_pgd(pgd, *pgd_ref);
391 arch_flush_lazy_mmu_mode();
392 } else {
393 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
394 }
395
396
397
398
399
400
401 pud = pud_offset(pgd, address);
402 pud_ref = pud_offset(pgd_ref, address);
403 if (pud_none(*pud_ref))
404 return -1;
405
406 if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
407 BUG();
408
409 pmd = pmd_offset(pud, address);
410 pmd_ref = pmd_offset(pud_ref, address);
411 if (pmd_none(*pmd_ref))
412 return -1;
413
414 if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
415 BUG();
416
417 pte_ref = pte_offset_kernel(pmd_ref, address);
418 if (!pte_present(*pte_ref))
419 return -1;
420
421 pte = pte_offset_kernel(pmd, address);
422
423
424
425
426
427
428 if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
429 BUG();
430
431 return 0;
432}
433NOKPROBE_SYMBOL(vmalloc_fault);
434
#ifdef CONFIG_CPU_SUP_AMD
/* Printed once by is_errata93() when the workaround is applied. */
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
443
444
445
446
/* 64-bit build: intentionally empty, so callers need no #ifdef. */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
}
452
/*
 * Returns non-zero when an unsigned long cannot be safely read from @p
 * (the result of probe_kernel_address()), 0 when it can.
 */
static int bad_address(void *p)
{
	unsigned long *word = p;
	unsigned long scratch;

	return probe_kernel_address(word, scratch);
}
459
460static void dump_pagetable(unsigned long address)
461{
462 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
463 pgd_t *pgd = base + pgd_index(address);
464 pud_t *pud;
465 pmd_t *pmd;
466 pte_t *pte;
467
468 if (bad_address(pgd))
469 goto bad;
470
471 printk("PGD %lx ", pgd_val(*pgd));
472
473 if (!pgd_present(*pgd))
474 goto out;
475
476 pud = pud_offset(pgd, address);
477 if (bad_address(pud))
478 goto bad;
479
480 printk("PUD %lx ", pud_val(*pud));
481 if (!pud_present(*pud) || pud_large(*pud))
482 goto out;
483
484 pmd = pmd_offset(pud, address);
485 if (bad_address(pmd))
486 goto bad;
487
488 printk("PMD %lx ", pmd_val(*pmd));
489 if (!pmd_present(*pmd) || pmd_large(*pmd))
490 goto out;
491
492 pte = pte_offset_kernel(pmd, address);
493 if (bad_address(pte))
494 goto bad;
495
496 printk("PTE %lx", pte_val(*pte));
497out:
498 printk("\n");
499 return;
500bad:
501 printk("BAD\n");
502}
503
504#endif
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
/*
 * Workaround for K8 erratum #93 (64-bit AMD family 0xf only).
 *
 * Applies when the faulting address equals the instruction pointer and
 * its upper 32 bits are zero.  If setting the upper 32 bits yields an
 * address inside the kernel text or the module area, the instruction
 * pointer is rewritten to that address, a one-time warning is printed
 * and 1 is returned.  Returns 0 in every other case.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	bool in_text, in_modules;

	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
	      boot_cpu_data.x86 == 0xf))
		return 0;

	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	address |= 0xffffffffUL << 32;

	in_text = address >= (u64)_stext && address <= (u64)_etext;
	in_modules = address >= MODULES_VADDR && address <= MODULES_END;
	if (!in_text && !in_modules)
		return 0;

	printk_once(errata93_warning);
	regs->ip = address;
	return 1;
#else
	return 0;
#endif
}
543
544
545
546
547
548
549
550
551
/*
 * Returns 1 on 64-bit kernels when the code segment is __USER32_CS or
 * has bit 2 set in its selector, and the faulting address nevertheless
 * has bits set above bit 31.  Returns 0 otherwise.
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	bool cs_matches = regs->cs == __USER32_CS || (regs->cs & (1<<2));

	if (cs_matches && (address >> 32))
		return 1;
#endif
	return 0;
}
560
/*
 * F00F bug workaround.
 *
 * On CPUs flagged with X86_BUG_F00F, a fault whose address falls in IDT
 * slot 6 (8-byte descriptors, hence the shift by 3) is handed to
 * do_invalid_op() and reported as handled (returns 1).  Returns 0
 * otherwise.
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	unsigned long vector;

	if (!boot_cpu_has_bug(X86_BUG_F00F))
		return 0;

	vector = (address - idt_descr.address) >> 3;
	if (vector == 6) {
		do_invalid_op(regs, 0);
		return 1;
	}
#endif
	return 0;
}
580
/*
 * Format strings used by show_fault_oops() for kernel instruction-fetch
 * faults; each takes the current uid as its only argument.
 */
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
"unable to execute userspace code (SMEP?) (uid: %d)\n";
585
586static void
587show_fault_oops(struct pt_regs *regs, unsigned long error_code,
588 unsigned long address)
589{
590 if (!oops_may_print())
591 return;
592
593 if (error_code & PF_INSTR) {
594 unsigned int level;
595 pgd_t *pgd;
596 pte_t *pte;
597
598 pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
599 pgd += pgd_index(address);
600
601 pte = lookup_address_in_pgd(pgd, address, &level);
602
603 if (pte && pte_present(*pte) && !pte_exec(*pte))
604 printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
605 if (pte && pte_present(*pte) && pte_exec(*pte) &&
606 (pgd_flags(*pgd) & _PAGE_USER) &&
607 (__read_cr4() & X86_CR4_SMEP))
608 printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
609 }
610
611 printk(KERN_ALERT "BUG: unable to handle kernel ");
612 if (address < PAGE_SIZE)
613 printk(KERN_CONT "NULL pointer dereference");
614 else
615 printk(KERN_CONT "paging request");
616
617 printk(KERN_CONT " at %p\n", (void *) address);
618 printk(KERN_ALERT "IP:");
619 printk_address(regs->ip);
620
621 dump_pagetable(address);
622}
623
624static noinline void
625pgtable_bad(struct pt_regs *regs, unsigned long error_code,
626 unsigned long address)
627{
628 struct task_struct *tsk;
629 unsigned long flags;
630 int sig;
631
632 flags = oops_begin();
633 tsk = current;
634 sig = SIGKILL;
635
636 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
637 tsk->comm, address);
638 dump_pagetable(address);
639
640 tsk->thread.cr2 = address;
641 tsk->thread.trap_nr = X86_TRAP_PF;
642 tsk->thread.error_code = error_code;
643
644 if (__die("Bad pagetable", regs, error_code))
645 sig = 0;
646
647 oops_end(flags, regs, sig);
648}
649
650static noinline void
651no_context(struct pt_regs *regs, unsigned long error_code,
652 unsigned long address, int signal, int si_code)
653{
654 struct task_struct *tsk = current;
655 unsigned long flags;
656 int sig;
657
658
659 if (fixup_exception(regs)) {
660
661
662
663
664
665 if (in_interrupt())
666 return;
667
668
669
670
671
672
673
674 if (current_thread_info()->sig_on_uaccess_error && signal) {
675 tsk->thread.trap_nr = X86_TRAP_PF;
676 tsk->thread.error_code = error_code | PF_USER;
677 tsk->thread.cr2 = address;
678
679
680 force_sig_info_fault(signal, si_code, address, tsk, 0);
681 }
682
683
684
685
686 return;
687 }
688
689
690
691
692
693
694
695
696
697
698
699
700 if (is_prefetch(regs, error_code, address))
701 return;
702
703 if (is_errata93(regs, address))
704 return;
705
706
707
708
709
710 flags = oops_begin();
711
712 show_fault_oops(regs, error_code, address);
713
714 if (task_stack_end_corrupted(tsk))
715 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
716
717 tsk->thread.cr2 = address;
718 tsk->thread.trap_nr = X86_TRAP_PF;
719 tsk->thread.error_code = error_code;
720
721 sig = SIGKILL;
722 if (__die("Oops", regs, error_code))
723 sig = 0;
724
725
726 printk(KERN_DEFAULT "CR2: %016lx\n", address);
727
728 oops_end(flags, regs, sig);
729}
730
731
732
733
734
735static inline void
736show_signal_msg(struct pt_regs *regs, unsigned long error_code,
737 unsigned long address, struct task_struct *tsk)
738{
739 if (!unhandled_signal(tsk, SIGSEGV))
740 return;
741
742 if (!printk_ratelimit())
743 return;
744
745 printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
746 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
747 tsk->comm, task_pid_nr(tsk), address,
748 (void *)regs->ip, (void *)regs->sp, error_code);
749
750 print_vma_addr(KERN_CONT " in ", regs->ip);
751
752 printk(KERN_CONT "\n");
753}
754
755static void
756__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
757 unsigned long address, int si_code)
758{
759 struct task_struct *tsk = current;
760
761
762 if (error_code & PF_USER) {
763
764
765
766 local_irq_enable();
767
768
769
770
771
772 if (is_prefetch(regs, error_code, address))
773 return;
774
775 if (is_errata100(regs, address))
776 return;
777
778#ifdef CONFIG_X86_64
779
780
781
782
783 if (unlikely((error_code & PF_INSTR) &&
784 ((address & ~0xfff) == VSYSCALL_ADDR))) {
785 if (emulate_vsyscall(regs, address))
786 return;
787 }
788#endif
789
790 if (address >= TASK_SIZE)
791 error_code |= PF_PROT;
792
793 if (likely(show_unhandled_signals))
794 show_signal_msg(regs, error_code, address, tsk);
795
796 tsk->thread.cr2 = address;
797 tsk->thread.error_code = error_code;
798 tsk->thread.trap_nr = X86_TRAP_PF;
799
800 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
801
802 return;
803 }
804
805 if (is_f00f_bug(regs, address))
806 return;
807
808 no_context(regs, error_code, address, SIGSEGV, si_code);
809}
810
/* __bad_area_nosemaphore() with si_code fixed to SEGV_MAPERR. */
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
		     unsigned long address)
{
	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
}
817
818static void
819__bad_area(struct pt_regs *regs, unsigned long error_code,
820 unsigned long address, int si_code)
821{
822 struct mm_struct *mm = current->mm;
823
824
825
826
827
828 up_read(&mm->mmap_sem);
829
830 __bad_area_nosemaphore(regs, error_code, address, si_code);
831}
832
/* __bad_area() with si_code fixed to SEGV_MAPERR; drops mmap_sem. */
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_MAPERR);
}
838
/* __bad_area() with si_code fixed to SEGV_ACCERR; drops mmap_sem. */
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
		      unsigned long address)
{
	__bad_area(regs, error_code, address, SEGV_ACCERR);
}
845
846static void
847do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
848 unsigned int fault)
849{
850 struct task_struct *tsk = current;
851 int code = BUS_ADRERR;
852
853
854 if (!(error_code & PF_USER)) {
855 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
856 return;
857 }
858
859
860 if (is_prefetch(regs, error_code, address))
861 return;
862
863 tsk->thread.cr2 = address;
864 tsk->thread.error_code = error_code;
865 tsk->thread.trap_nr = X86_TRAP_PF;
866
867#ifdef CONFIG_MEMORY_FAILURE
868 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
869 printk(KERN_ERR
870 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
871 tsk->comm, tsk->pid, address);
872 code = BUS_MCEERR_AR;
873 }
874#endif
875 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
876}
877
878static noinline void
879mm_fault_error(struct pt_regs *regs, unsigned long error_code,
880 unsigned long address, unsigned int fault)
881{
882 if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
883 no_context(regs, error_code, address, 0, 0);
884 return;
885 }
886
887 if (fault & VM_FAULT_OOM) {
888
889 if (!(error_code & PF_USER)) {
890 no_context(regs, error_code, address,
891 SIGSEGV, SEGV_MAPERR);
892 return;
893 }
894
895
896
897
898
899
900 pagefault_out_of_memory();
901 } else {
902 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
903 VM_FAULT_HWPOISON_LARGE))
904 do_sigbus(regs, error_code, address, fault);
905 else if (fault & VM_FAULT_SIGSEGV)
906 bad_area_nosemaphore(regs, error_code, address);
907 else
908 BUG();
909 }
910}
911
912static int spurious_fault_check(unsigned long error_code, pte_t *pte)
913{
914 if ((error_code & PF_WRITE) && !pte_write(*pte))
915 return 0;
916
917 if ((error_code & PF_INSTR) && !pte_exec(*pte))
918 return 0;
919
920 return 1;
921}
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944static noinline int
945spurious_fault(unsigned long error_code, unsigned long address)
946{
947 pgd_t *pgd;
948 pud_t *pud;
949 pmd_t *pmd;
950 pte_t *pte;
951 int ret;
952
953
954
955
956
957
958
959
960
961
962 if (error_code != (PF_WRITE | PF_PROT)
963 && error_code != (PF_INSTR | PF_PROT))
964 return 0;
965
966 pgd = init_mm.pgd + pgd_index(address);
967 if (!pgd_present(*pgd))
968 return 0;
969
970 pud = pud_offset(pgd, address);
971 if (!pud_present(*pud))
972 return 0;
973
974 if (pud_large(*pud))
975 return spurious_fault_check(error_code, (pte_t *) pud);
976
977 pmd = pmd_offset(pud, address);
978 if (!pmd_present(*pmd))
979 return 0;
980
981 if (pmd_large(*pmd))
982 return spurious_fault_check(error_code, (pte_t *) pmd);
983
984 pte = pte_offset_kernel(pmd, address);
985 if (!pte_present(*pte))
986 return 0;
987
988 ret = spurious_fault_check(error_code, pte);
989 if (!ret)
990 return 0;
991
992
993
994
995
996 ret = spurious_fault_check(error_code, (pte_t *) pmd);
997 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
998
999 return ret;
1000}
1001NOKPROBE_SYMBOL(spurious_fault);
1002
/* Non-zero: __bad_area_nosemaphore() logs unhandled segfaults. */
int show_unhandled_signals = 1;
1004
1005static inline int
1006access_error(unsigned long error_code, struct vm_area_struct *vma)
1007{
1008 if (error_code & PF_WRITE) {
1009
1010 if (unlikely(!(vma->vm_flags & VM_WRITE)))
1011 return 1;
1012 return 0;
1013 }
1014
1015
1016 if (unlikely(error_code & PF_PROT))
1017 return 1;
1018
1019
1020 if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1021 return 1;
1022
1023 return 0;
1024}
1025
1026static int fault_in_kernel_space(unsigned long address)
1027{
1028 return address >= TASK_SIZE_MAX;
1029}
1030
1031static inline bool smap_violation(int error_code, struct pt_regs *regs)
1032{
1033 if (!IS_ENABLED(CONFIG_X86_SMAP))
1034 return false;
1035
1036 if (!static_cpu_has(X86_FEATURE_SMAP))
1037 return false;
1038
1039 if (error_code & PF_USER)
1040 return false;
1041
1042 if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1043 return false;
1044
1045 return true;
1046}
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
/*
 * Core page-fault handler, shared by do_page_fault() and
 * trace_do_page_fault().
 *
 * @regs:       register state at the time of the fault
 * @error_code: fault error code (PF_* bits)
 * @address:    faulting address, read from CR2 by the caller
 *
 * Faults on kernel addresses are resolved without taking mmap_sem.
 * All other faults look up the VMA under mmap_sem and are passed to
 * handle_mm_fault(); failures end in one of the bad_area*() helpers,
 * mm_fault_error() or no_context().
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
		unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	int fault, major = 0;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	tsk = current;
	mm = tsk->mm;

	/*
	 * mm is only tested for NULL further down; here its address is
	 * merely used as a prefetch hint.
	 */
	if (kmemcheck_active(regs))
		kmemcheck_hide(regs);
	prefetchw(&mm->mmap_sem);

	if (unlikely(kmmio_fault(regs, address)))
		return;

	/*
	 * Kernel-address faults.  mmap_sem is never taken on this path,
	 * so every exit uses the _nosemaphore variant.
	 */
	if (unlikely(fault_in_kernel_space(address))) {
		/*
		 * Only a not-present fault from kernel mode without
		 * reserved-bit trouble can be a vmalloc or kmemcheck fault.
		 */
		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
			if (vmalloc_fault(address) >= 0)
				return;

			if (kmemcheck_fault(regs, address, error_code))
				return;
		}

		/* The page tables may already permit the access. */
		if (spurious_fault(error_code, address))
			return;

		/* kprobes get their turn only after the spurious check. */
		if (kprobes_fault(regs))
			return;

		/* Not handled above: treat as a bad access. */
		bad_area_nosemaphore(regs, error_code, address);

		return;
	}

	/* User-address fault: let kprobes look at it first. */
	if (unlikely(kprobes_fault(regs)))
		return;

	/* Note: no return here; pgtable_bad() oopses via oops_end(). */
	if (unlikely(error_code & PF_RSVD))
		pgtable_bad(regs, error_code, address);

	if (unlikely(smap_violation(error_code, regs))) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * With the fault handler disabled or no user mm there is nothing
	 * to look up.
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, error_code, address);
		return;
	}

	/*
	 * Re-enable interrupts: always for user mode, and for kernel mode
	 * only if they were enabled when the fault occurred.  A user-mode
	 * fault is also marked PF_USER regardless of what the hardware
	 * error code said.
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		error_code |= PF_USER;
		flags |= FAULT_FLAG_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (error_code & PF_WRITE)
		flags |= FAULT_FLAG_WRITE;

	/*
	 * Try to take mmap_sem without blocking first.  If that fails on
	 * a kernel-mode fault whose instruction has no exception-table
	 * entry, blocking is not attempted and the fault is treated as a
	 * bad access.  Otherwise block for the semaphore.
	 *
	 * The retry label sits inside this branch so that a retry after
	 * VM_FAULT_RETRY re-acquires the semaphore with down_read().
	 */
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		if ((error_code & PF_USER) == 0 &&
		    !search_exception_tables(regs->ip)) {
			bad_area_nosemaphore(regs, error_code, address);
			return;
		}
retry:
		down_read(&mm->mmap_sem);
	} else {
		/*
		 * The trylock succeeded, so down_read() was skipped; call
		 * might_sleep() explicitly here.
		 */
		might_sleep();
	}

	/* From here on the bad_area() helpers release mmap_sem for us. */
	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	/* Below the VMA: only acceptable if the VMA can grow downwards. */
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, error_code, address);
		return;
	}
	if (error_code & PF_USER) {
		/*
		 * Refuse user accesses far below the stack pointer.  The
		 * allowance is 65536 bytes plus 32 pointers (presumably
		 * sized for instructions such as "enter" that touch memory
		 * below %sp - confirm).
		 */
		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
			bad_area(regs, error_code, address);
			return;
		}
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, error_code, address);
		return;
	}

	/* A VMA covers the address: check the access type against it. */
good_area:
	if (unlikely(access_error(error_code, vma))) {
		bad_area_access_error(regs, error_code, address);
		return;
	}

	/* Let the generic mm code resolve the fault. */
	fault = handle_mm_fault(mm, vma, address, flags);
	/* Accumulated across a retry so a major fault is not forgotten. */
	major |= fault & VM_FAULT_MAJOR;

	/*
	 * VM_FAULT_RETRY: mmap_sem is not released on this path, which
	 * implies handle_mm_fault() has already dropped it - TODO confirm
	 * against the mm documentation.
	 */
	if (unlikely(fault & VM_FAULT_RETRY)) {
		/* Retry at most once: the ALLOW_RETRY flag is cleared here. */
		if (flags & FAULT_FLAG_ALLOW_RETRY) {
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			flags |= FAULT_FLAG_TRIED;
			if (!fatal_signal_pending(tsk))
				goto retry;
		}

		/* User-mode fault: simply return. */
		if (flags & FAULT_FLAG_USER)
			return;

		/* Kernel-mode fault: fix up through the exception tables or oops. */
		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
		return;
	}

	up_read(&mm->mmap_sem);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, error_code, address, fault);
		return;
	}

	/* Success: account the fault as major or minor. */
	if (major) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	}

	check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(__do_page_fault);
1285
/*
 * Page-fault exception entry point.
 *
 * CR2 is read in the initialiser, before any other call is made, and the
 * saved value is what __do_page_fault() works with.  The handler runs
 * between exception_enter() and exception_exit().
 */
dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = read_cr2(); /* read first; see above */
	enum ctx_state prev_state;

	prev_state = exception_enter();
	__do_page_fault(regs, error_code, address);
	exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_page_fault);
1305
1306#ifdef CONFIG_TRACING
1307static nokprobe_inline void
1308trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1309 unsigned long error_code)
1310{
1311 if (user_mode(regs))
1312 trace_page_fault_user(address, regs, error_code);
1313 else
1314 trace_page_fault_kernel(address, regs, error_code);
1315}
1316
/*
 * Tracing variant of do_page_fault(): identical except that the
 * page-fault tracepoint is emitted before the fault is handled.
 *
 * CR2 is read in the initialiser, before exception_enter() and before
 * the tracepoint runs, so the value passed on is the one captured on
 * entry.
 */
dotraplinkage void notrace
trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	unsigned long address = read_cr2(); /* read first; see above */
	enum ctx_state prev_state;

	prev_state = exception_enter();
	trace_page_fault_entries(address, regs, error_code);
	__do_page_fault(regs, error_code, address);
	exception_exit(prev_state);
}
NOKPROBE_SYMBOL(trace_do_page_fault);
1335#endif
1336