1
2
3
4
5
6
7#include <linux/sched.h>
8#include <linux/sched/task_stack.h>
9#include <linux/kdebug.h>
10#include <linux/extable.h>
11#include <linux/memblock.h>
12#include <linux/kprobes.h>
13#include <linux/mmiotrace.h>
14#include <linux/perf_event.h>
15#include <linux/hugetlb.h>
16#include <linux/prefetch.h>
17#include <linux/context_tracking.h>
18#include <linux/uaccess.h>
19#include <linux/efi.h>
20#include <linux/mm_types.h>
21
22#include <asm/cpufeature.h>
23#include <asm/traps.h>
24#include <asm/fixmap.h>
25#include <asm/vsyscall.h>
26#include <asm/vm86.h>
27#include <asm/mmu_context.h>
28#include <asm/efi.h>
29#include <asm/desc.h>
30#include <asm/cpu_entry_area.h>
31#include <asm/pgtable_areas.h>
32#include <asm/kvm_para.h>
33
34#define CREATE_TRACE_POINTS
35#include <asm/trace/exceptions.h>
36
37
38
39
40
41static nokprobe_inline int
42kmmio_fault(struct pt_regs *regs, unsigned long addr)
43{
44 if (unlikely(is_kmmio_active()))
45 if (kmmio_handler(regs, addr) == 1)
46 return -1;
47 return 0;
48}
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/*
 * Classify one instruction byte while is_prefetch() scans an instruction.
 *
 * @regs:     register state of the faulting context
 * @instr:    address of the byte that follows @opcode
 * @opcode:   the byte being examined
 * @prefetch: written (0 or 1) only when the high nibble of @opcode is 0 and
 *            the following byte could be read; 1 means @opcode is 0x0F and
 *            the following byte is 0x0D or 0x18
 *
 * Returns non-zero when @opcode is accepted as a prefix byte and the caller
 * should keep scanning, 0 when the scan must stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	const unsigned char hi = opcode & 0xf0;
	const unsigned char lo = opcode & 0x0f;
	unsigned char next;

	switch (hi) {
	case 0x20:
	case 0x30:
		/* Of 0x2x/0x3x only 0x26, 0x2E, 0x36 and 0x3E keep the scan going. */
		return (lo & 7) == 0x6;
#ifdef CONFIG_X86_64
	case 0x40:
		/* 0x4x keeps scanning for kernel code or 64-bit user code only. */
		return !user_mode(regs) || user_64bit_mode(regs);
#endif
	case 0x60:
		/* 0x64 through 0x67. */
		return (lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2 and 0xF3. */
		return lo == 0 || (lo >> 1) == 1;
	case 0x00:
		/* Possible two-byte opcode: look at the byte after @opcode. */
		if (get_kernel_nofault(next, instr))
			return 0;

		*prefetch = (lo == 0xF) && (next == 0x0D || next == 0x18);
		return 0;
	default:
		return 0;
	}
}
111
112static int
113is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
114{
115 unsigned char *max_instr;
116 unsigned char *instr;
117 int prefetch = 0;
118
119
120
121
122
123 if (error_code & X86_PF_INSTR)
124 return 0;
125
126 instr = (void *)convert_ip_to_linear(current, regs);
127 max_instr = instr + 15;
128
129 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
130 return 0;
131
132 while (instr < max_instr) {
133 unsigned char opcode;
134
135 if (get_kernel_nofault(opcode, instr))
136 break;
137
138 instr++;
139
140 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
141 break;
142 }
143 return prefetch;
144}
145
/*
 * pgd_list links page-table pages through their 'lru' field; pgd_lock is
 * the spinlock taken around walks of that list (see
 * arch_sync_kernel_mappings() in the 32-bit section of this file).
 */
DEFINE_SPINLOCK(pgd_lock);
LIST_HEAD(pgd_list);
148
149#ifdef CONFIG_X86_32
150static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
151{
152 unsigned index = pgd_index(address);
153 pgd_t *pgd_k;
154 p4d_t *p4d, *p4d_k;
155 pud_t *pud, *pud_k;
156 pmd_t *pmd, *pmd_k;
157
158 pgd += index;
159 pgd_k = init_mm.pgd + index;
160
161 if (!pgd_present(*pgd_k))
162 return NULL;
163
164
165
166
167
168
169 p4d = p4d_offset(pgd, address);
170 p4d_k = p4d_offset(pgd_k, address);
171 if (!p4d_present(*p4d_k))
172 return NULL;
173
174 pud = pud_offset(p4d, address);
175 pud_k = pud_offset(p4d_k, address);
176 if (!pud_present(*pud_k))
177 return NULL;
178
179 pmd = pmd_offset(pud, address);
180 pmd_k = pmd_offset(pud_k, address);
181
182 if (pmd_present(*pmd) != pmd_present(*pmd_k))
183 set_pmd(pmd, *pmd_k);
184
185 if (!pmd_present(*pmd_k))
186 return NULL;
187 else
188 BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
189
190 return pmd_k;
191}
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207static noinline int vmalloc_fault(unsigned long address)
208{
209 unsigned long pgd_paddr;
210 pmd_t *pmd_k;
211 pte_t *pte_k;
212
213
214 if (!(address >= VMALLOC_START && address < VMALLOC_END))
215 return -1;
216
217
218
219
220
221
222
223
224 pgd_paddr = read_cr3_pa();
225 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
226 if (!pmd_k)
227 return -1;
228
229 if (pmd_large(*pmd_k))
230 return 0;
231
232 pte_k = pte_offset_kernel(pmd_k, address);
233 if (!pte_present(*pte_k))
234 return -1;
235
236 return 0;
237}
238NOKPROBE_SYMBOL(vmalloc_fault);
239
240void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
241{
242 unsigned long addr;
243
244 for (addr = start & PMD_MASK;
245 addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
246 addr += PMD_SIZE) {
247 struct page *page;
248
249 spin_lock(&pgd_lock);
250 list_for_each_entry(page, &pgd_list, lru) {
251 spinlock_t *pgt_lock;
252
253
254 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
255
256 spin_lock(pgt_lock);
257 vmalloc_sync_one(page_address(page), addr);
258 spin_unlock(pgt_lock);
259 }
260 spin_unlock(&pgd_lock);
261 }
262}
263
264
265
266
/*
 * Record a vm86-mode fault in the task's vm86 screen bitmap.
 *
 * Does nothing unless CONFIG_VM86 is set, the fault came from v8086 mode
 * and the task has a vm86 structure.  The page index of @address relative
 * to 0xA0000 selects the bit; only the first 32 pages are tracked.
 *
 * @regs:    register state of the faulting context
 * @address: faulting address
 * @tsk:     task whose vm86 state is updated
 */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
#ifdef CONFIG_VM86
	unsigned long bit;

	if (!v8086_mode(regs) || !tsk->thread.vm86)
		return;

	/* Addresses below 0xA0000 wrap to a huge value and fail the range test. */
	bit = (address - 0xA0000) >> PAGE_SHIFT;
	if (bit < 32)
		/* Unsigned constant: a plain int '1 << 31' would overflow. */
		tsk->thread.vm86->screen_bitmap |= 1UL << bit;
#endif
}
282
283static bool low_pfn(unsigned long pfn)
284{
285 return pfn < max_low_pfn;
286}
287
/*
 * Print the page-table entries that translate @address, walking from the
 * page directory currently loaded in CR3 (32-bit variant).
 *
 * The walk stops early, and the output line is terminated, as soon as an
 * entry is not present, is large, or refers to a page frame that is not
 * below max_low_pfn.
 */
static void dump_pagetable(unsigned long address)
{
	pgd_t *base = __va(read_cr3_pa());
	pgd_t *pgd = &base[pgd_index(address)];
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

#ifdef CONFIG_X86_PAE
	/* With PAE the top-level entry is printed first and starts the line. */
	pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
		goto out;
	/* The PDE continues the line started above ... */
#define pr_pde pr_cont
#else
	/* ... otherwise the PDE itself starts the line. */
#define pr_pde pr_info
#endif
	p4d = p4d_offset(pgd, address);
	pud = pud_offset(p4d, address);
	pmd = pmd_offset(pud, address);
	/* Field width is two hex digits per byte of the entry. */
	pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
#undef pr_pde

	/*
	 * Only descend to the PTE level when the PMD is present, is not a
	 * large mapping, and refers to a frame accepted by low_pfn().
	 */
	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
		goto out;

	pte = pte_offset_kernel(pmd, address);
	pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
out:
	pr_cont("\n");
}
325
326#else
327
328#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message emitted (once, via printk_once()) by is_errata93() when it
 * applies the K8 erratum #93 workaround.  The log level is embedded in the
 * string itself.
 */
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
335#endif
336
337
338
339
/*
 * Stub for the configuration selected by this preprocessor branch: there
 * is no v8086 bookkeeping to do here, so the function is intentionally
 * empty.
 */
static inline void check_v8086_mode(struct pt_regs *regs,
				    unsigned long address,
				    struct task_struct *tsk)
{
	/* Nothing to do. */
}
345
/*
 * Probe whether one unsigned long can be read at @p without faulting.
 *
 * Returns the result of get_kernel_nofault(): zero when the read worked,
 * non-zero otherwise.  The value read is discarded.
 */
static int bad_address(void *p)
{
	unsigned long scratch;

	return get_kernel_nofault(scratch, (unsigned long *)p);
}
352
353static void dump_pagetable(unsigned long address)
354{
355 pgd_t *base = __va(read_cr3_pa());
356 pgd_t *pgd = base + pgd_index(address);
357 p4d_t *p4d;
358 pud_t *pud;
359 pmd_t *pmd;
360 pte_t *pte;
361
362 if (bad_address(pgd))
363 goto bad;
364
365 pr_info("PGD %lx ", pgd_val(*pgd));
366
367 if (!pgd_present(*pgd))
368 goto out;
369
370 p4d = p4d_offset(pgd, address);
371 if (bad_address(p4d))
372 goto bad;
373
374 pr_cont("P4D %lx ", p4d_val(*p4d));
375 if (!p4d_present(*p4d) || p4d_large(*p4d))
376 goto out;
377
378 pud = pud_offset(p4d, address);
379 if (bad_address(pud))
380 goto bad;
381
382 pr_cont("PUD %lx ", pud_val(*pud));
383 if (!pud_present(*pud) || pud_large(*pud))
384 goto out;
385
386 pmd = pmd_offset(pud, address);
387 if (bad_address(pmd))
388 goto bad;
389
390 pr_cont("PMD %lx ", pmd_val(*pmd));
391 if (!pmd_present(*pmd) || pmd_large(*pmd))
392 goto out;
393
394 pte = pte_offset_kernel(pmd, address);
395 if (bad_address(pte))
396 goto bad;
397
398 pr_cont("PTE %lx", pte_val(*pte));
399out:
400 pr_cont("\n");
401 return;
402bad:
403 pr_info("BAD\n");
404}
405
406#endif
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/*
 * Detect and repair an instruction pointer whose upper 32 bits were lost
 * (the condition the errata93_warning message describes).
 *
 * Applies only on 64-bit kernels with AMD support, on AMD family 0xf CPUs,
 * when the fault address equals regs->ip and its upper 32 bits are zero.
 * If setting the upper 32 bits yields an address inside [_stext, _etext]
 * or [MODULES_VADDR, MODULES_END], regs->ip is rewritten to that address.
 *
 * Returns 1 when regs->ip was fixed up, 0 otherwise.
 */
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
	      boot_cpu_data.x86 == 0xf))
		return 0;

	/* Must be an instruction fetch from a truncated address. */
	if (address != regs->ip || (address >> 32) != 0)
		return 0;

	/* Restore the missing upper half and see if it lands in kernel text. */
	address |= 0xffffffffUL << 32;
	if (!((address >= (u64)_stext && address <= (u64)_etext) ||
	      (address >= MODULES_VADDR && address <= MODULES_END)))
		return 0;

	printk_once(errata93_warning);
	regs->ip = address;
	return 1;
#else
	return 0;
#endif
}
445
446
447
448
449
450
451
452
453
/*
 * On 64-bit kernels, report a fault whose address has bits above bit 31
 * set while the code segment is __USER32_CS or has bit 2 set.
 *
 * Returns 1 in that case, 0 otherwise (always 0 without CONFIG_X86_64).
 */
static int is_errata100(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
	/* An address that fits in 32 bits is never affected. */
	if (!(address >> 32))
		return 0;

	if (regs->cs == __USER32_CS || (regs->cs & (1<<2)))
		return 1;
#endif
	return 0;
}
462
463
/*
 * F00F bug workaround: when the CPU has X86_BUG_F00F and @address is the
 * special IDT address, handle the fault as an invalid opcode instead.
 *
 * Returns 1 when the fault was handled that way, 0 otherwise (always 0
 * without CONFIG_X86_F00F_BUG).
 */
static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_F00F_BUG
	if (!boot_cpu_has_bug(X86_BUG_F00F) || !idt_is_f00f_address(address))
		return 0;

	handle_invalid_op(regs);
	return 1;
#else
	return 0;
#endif
}
474
475static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
476{
477 u32 offset = (index >> 3) * sizeof(struct desc_struct);
478 unsigned long addr;
479 struct ldttss_desc desc;
480
481 if (index == 0) {
482 pr_alert("%s: NULL\n", name);
483 return;
484 }
485
486 if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
487 pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
488 return;
489 }
490
491 if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
492 sizeof(struct ldttss_desc))) {
493 pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
494 name, index);
495 return;
496 }
497
498 addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
499#ifdef CONFIG_X86_64
500 addr |= ((u64)desc.base3 << 32);
501#endif
502 pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
503 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
504}
505
/*
 * Print the diagnostic header of a page-fault oops: a hint for
 * instruction-fetch faults, the faulting address, a decoded error code,
 * descriptor-table details for one unusual case, and finally the page-table
 * walk for @address.
 *
 * Prints nothing at all when oops_may_print() says printing is not allowed.
 */
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	if (!oops_may_print())
		return;

	/* For instruction fetches, look at the PTE to explain the failure. */
	if (error_code & X86_PF_INSTR) {
		unsigned int level;
		pgd_t *pgd;
		pte_t *pte;

		pgd = __va(read_cr3_pa());
		pgd += pgd_index(address);

		pte = lookup_address_in_pgd(pgd, address, &level);

		/* Present but not executable. */
		if (pte && pte_present(*pte) && !pte_exec(*pte))
			pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
		/* Present and executable, but a user page while CR4.SMEP is set. */
		if (pte && pte_present(*pte) && pte_exec(*pte) &&
		    (pgd_flags(*pgd) & _PAGE_USER) &&
		    (__read_cr4() & X86_CR4_SMEP))
			pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				from_kuid(&init_user_ns, current_uid()));
	}

	/* Kernel-mode faults in the first page are reported as NULL dereferences. */
	if (address < PAGE_SIZE && !user_mode(regs))
		pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
			 (void *)address);
	else
		pr_alert("BUG: unable to handle page fault for address: %px\n",
			 (void *)address);

	/*
	 * First %s comes from the error code's USER bit, the last from the
	 * mode recorded in regs; the two can differ.
	 */
	pr_alert("#PF: %s %s in %s mode\n",
		 (error_code & X86_PF_USER) ? "user" : "supervisor",
		 (error_code & X86_PF_INSTR) ? "instruction fetch" :
		 (error_code & X86_PF_WRITE) ? "write access" :
		 "read access",
		 user_mode(regs) ? "user" : "kernel");
	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
		 !(error_code & X86_PF_PROT) ? "not-present page" :
		 (error_code & X86_PF_RSVD) ? "reserved bit violation" :
		 (error_code & X86_PF_PK) ? "protection keys violation" :
		 "permissions violation");

	/*
	 * Fault taken while regs say user mode, yet the error code lacks the
	 * USER bit: dump the IDT/GDT registers and the LDTR/TR descriptors.
	 */
	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
		struct desc_ptr idt, gdt;
		u16 ldtr, tr;

		store_idt(&idt);

		/* The native accessor is called directly for the GDT. */
		native_store_gdt(&gdt);

		pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
			 idt.address, idt.size, gdt.address, gdt.size);

		store_ldt(ldtr);
		show_ldttss(&gdt, "LDTR", ldtr);

		store_tr(tr);
		show_ldttss(&gdt, "TR", tr);
	}

	dump_pagetable(address);
}
582
583static noinline void
584pgtable_bad(struct pt_regs *regs, unsigned long error_code,
585 unsigned long address)
586{
587 struct task_struct *tsk;
588 unsigned long flags;
589 int sig;
590
591 flags = oops_begin();
592 tsk = current;
593 sig = SIGKILL;
594
595 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
596 tsk->comm, address);
597 dump_pagetable(address);
598
599 if (__die("Bad pagetable", regs, error_code))
600 sig = 0;
601
602 oops_end(flags, regs, sig);
603}
604
605static void set_signal_archinfo(unsigned long address,
606 unsigned long error_code)
607{
608 struct task_struct *tsk = current;
609
610
611
612
613
614
615
616
617
618
619 if (address >= TASK_SIZE_MAX)
620 error_code |= X86_PF_PROT;
621
622 tsk->thread.trap_nr = X86_TRAP_PF;
623 tsk->thread.error_code = error_code | X86_PF_USER;
624 tsk->thread.cr2 = address;
625}
626
/*
 * Handle a fault that cannot be resolved through the normal user-fault
 * path.
 *
 * @regs:       register state of the faulting context
 * @error_code: page-fault error code
 * @address:    faulting address
 * @signal:     signal to raise after a successful exception-table fixup
 *              (0 for none)
 * @si_code:    si_code to go with @signal
 *
 * For kernel-mode faults, recovery is attempted in this order: exception
 * table fixup, kernel stack overflow detection (CONFIG_VMAP_STACK),
 * prefetch and erratum #93 checks, and EFI recovery.  If nothing applies,
 * or if regs indicate user mode, the function oopses.
 */
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
	   unsigned long address, int signal, int si_code)
{
	struct task_struct *tsk = current;
	unsigned long flags;
	int sig;

	if (user_mode(regs)) {
		/*
		 * A user-mode context that ends up here is not recoverable by
		 * any of the kernel-mode fixups below: go straight to the oops.
		 */
		goto oops;
	}

	/* Does the faulting instruction have an exception-table fixup? */
	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
		/*
		 * In interrupt context no signal is raised; the fixup alone
		 * is the whole recovery.
		 */
		if (in_interrupt())
			return;

		/*
		 * Raise the requested signal only when the task has opted in
		 * via sig_on_uaccess_err and a signal was actually requested.
		 */
		if (current->thread.sig_on_uaccess_err && signal) {
			set_signal_archinfo(address, error_code);

			/* The caller-supplied si_code is passed through unchanged. */
			force_sig_fault(signal, si_code, (void __user *)address);
		}

		/* Fixed up: nothing more to do. */
		return;
	}

#ifdef CONFIG_VMAP_STACK
	/*
	 * Stack overflow check: the address is a vmalloc address within one
	 * page below the bottom of the task stack or within one page above
	 * its top.  The unsigned subtractions make out-of-range addresses
	 * wrap to large values that fail the '< PAGE_SIZE' tests.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);

		/*
		 * Switch the stack pointer to the top of the per-CPU DF IST
		 * stack and call handle_stack_overflow(message, regs, address)
		 * with the arguments pinned to RDI, RSI and RDX by the asm
		 * constraints.  The call is not expected to return; the
		 * trailing self-jump traps execution if it does.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

	/*
	 * Faults raised by a prefetch instruction, and faults repaired by
	 * the erratum #93 fixup, are silently ignored.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Let the EFI code try to recover when EFI support is built in.
	 * NOTE(review): presumably this does not return when it recovers -
	 * confirm against efi_recover_from_page_fault().
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_recover_from_page_fault(address);

oops:
	/*
	 * Nothing could handle the fault: produce a full oops report and
	 * terminate through oops_end().
	 */
	flags = oops_begin();

	show_fault_oops(regs, error_code, address);

	if (task_stack_end_corrupted(tsk))
		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");

	/* SIGKILL unless __die() returns non-zero. */
	sig = SIGKILL;
	if (__die("Oops", regs, error_code))
		sig = 0;

	/* Repeat the faulting address at the very end of the report. */
	printk(KERN_DEFAULT "CR2: %016lx\n", address);

	oops_end(flags, regs, sig);
}
749
750
751
752
753
754static inline void
755show_signal_msg(struct pt_regs *regs, unsigned long error_code,
756 unsigned long address, struct task_struct *tsk)
757{
758 const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
759
760 if (!unhandled_signal(tsk, SIGSEGV))
761 return;
762
763 if (!printk_ratelimit())
764 return;
765
766 printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
767 loglvl, tsk->comm, task_pid_nr(tsk), address,
768 (void *)regs->ip, (void *)regs->sp, error_code);
769
770 print_vma_addr(KERN_CONT " in ", regs->ip);
771
772 printk(KERN_CONT "\n");
773
774 show_opcodes(regs, loglvl);
775}
776
777
778
779
780
781static bool is_vsyscall_vaddr(unsigned long vaddr)
782{
783 return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
784}
785
786static void
787__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
788 unsigned long address, u32 pkey, int si_code)
789{
790 struct task_struct *tsk = current;
791
792
793 if (user_mode(regs) && (error_code & X86_PF_USER)) {
794
795
796
797 local_irq_enable();
798
799
800
801
802
803 if (is_prefetch(regs, error_code, address))
804 return;
805
806 if (is_errata100(regs, address))
807 return;
808
809
810
811
812
813
814 if (address >= TASK_SIZE_MAX)
815 error_code |= X86_PF_PROT;
816
817 if (likely(show_unhandled_signals))
818 show_signal_msg(regs, error_code, address, tsk);
819
820 set_signal_archinfo(address, error_code);
821
822 if (si_code == SEGV_PKUERR)
823 force_sig_pkuerr((void __user *)address, pkey);
824
825 force_sig_fault(SIGSEGV, si_code, (void __user *)address);
826
827 local_irq_disable();
828
829 return;
830 }
831
832 if (is_f00f_bug(regs, address))
833 return;
834
835 no_context(regs, error_code, address, SIGSEGV, si_code);
836}
837
838static noinline void
839bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
840 unsigned long address)
841{
842 __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
843}
844
845static void
846__bad_area(struct pt_regs *regs, unsigned long error_code,
847 unsigned long address, u32 pkey, int si_code)
848{
849 struct mm_struct *mm = current->mm;
850
851
852
853
854 mmap_read_unlock(mm);
855
856 __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
857}
858
859static noinline void
860bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
861{
862 __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
863}
864
865static inline bool bad_area_access_from_pkeys(unsigned long error_code,
866 struct vm_area_struct *vma)
867{
868
869 bool foreign = false;
870
871 if (!boot_cpu_has(X86_FEATURE_OSPKE))
872 return false;
873 if (error_code & X86_PF_PK)
874 return true;
875
876 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
877 (error_code & X86_PF_INSTR), foreign))
878 return true;
879 return false;
880}
881
882static noinline void
883bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
884 unsigned long address, struct vm_area_struct *vma)
885{
886
887
888
889
890
891 if (bad_area_access_from_pkeys(error_code, vma)) {
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912 u32 pkey = vma_pkey(vma);
913
914 __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
915 } else {
916 __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
917 }
918}
919
920static void
921do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
922 vm_fault_t fault)
923{
924
925 if (!(error_code & X86_PF_USER)) {
926 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
927 return;
928 }
929
930
931 if (is_prefetch(regs, error_code, address))
932 return;
933
934 set_signal_archinfo(address, error_code);
935
936#ifdef CONFIG_MEMORY_FAILURE
937 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
938 struct task_struct *tsk = current;
939 unsigned lsb = 0;
940
941 pr_err(
942 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
943 tsk->comm, tsk->pid, address);
944 if (fault & VM_FAULT_HWPOISON_LARGE)
945 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
946 if (fault & VM_FAULT_HWPOISON)
947 lsb = PAGE_SHIFT;
948 force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
949 return;
950 }
951#endif
952 force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
953}
954
955static noinline void
956mm_fault_error(struct pt_regs *regs, unsigned long error_code,
957 unsigned long address, vm_fault_t fault)
958{
959 if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
960 no_context(regs, error_code, address, 0, 0);
961 return;
962 }
963
964 if (fault & VM_FAULT_OOM) {
965
966 if (!(error_code & X86_PF_USER)) {
967 no_context(regs, error_code, address,
968 SIGSEGV, SEGV_MAPERR);
969 return;
970 }
971
972
973
974
975
976
977 pagefault_out_of_memory();
978 } else {
979 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
980 VM_FAULT_HWPOISON_LARGE))
981 do_sigbus(regs, error_code, address, fault);
982 else if (fault & VM_FAULT_SIGSEGV)
983 bad_area_nosemaphore(regs, error_code, address);
984 else
985 BUG();
986 }
987}
988
989static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
990{
991 if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
992 return 0;
993
994 if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
995 return 0;
996
997 return 1;
998}
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021static noinline int
1022spurious_kernel_fault(unsigned long error_code, unsigned long address)
1023{
1024 pgd_t *pgd;
1025 p4d_t *p4d;
1026 pud_t *pud;
1027 pmd_t *pmd;
1028 pte_t *pte;
1029 int ret;
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040 if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
1041 error_code != (X86_PF_INSTR | X86_PF_PROT))
1042 return 0;
1043
1044 pgd = init_mm.pgd + pgd_index(address);
1045 if (!pgd_present(*pgd))
1046 return 0;
1047
1048 p4d = p4d_offset(pgd, address);
1049 if (!p4d_present(*p4d))
1050 return 0;
1051
1052 if (p4d_large(*p4d))
1053 return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
1054
1055 pud = pud_offset(p4d, address);
1056 if (!pud_present(*pud))
1057 return 0;
1058
1059 if (pud_large(*pud))
1060 return spurious_kernel_fault_check(error_code, (pte_t *) pud);
1061
1062 pmd = pmd_offset(pud, address);
1063 if (!pmd_present(*pmd))
1064 return 0;
1065
1066 if (pmd_large(*pmd))
1067 return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1068
1069 pte = pte_offset_kernel(pmd, address);
1070 if (!pte_present(*pte))
1071 return 0;
1072
1073 ret = spurious_kernel_fault_check(error_code, pte);
1074 if (!ret)
1075 return 0;
1076
1077
1078
1079
1080
1081 ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1082 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1083
1084 return ret;
1085}
1086NOKPROBE_SYMBOL(spurious_kernel_fault);
1087
/*
 * When non-zero, __bad_area_nosemaphore() logs user-mode segfaults through
 * show_signal_msg().  Defaults to 1.
 */
int show_unhandled_signals = 1;
1089
1090static inline int
1091access_error(unsigned long error_code, struct vm_area_struct *vma)
1092{
1093
1094 bool foreign = false;
1095
1096
1097
1098
1099
1100
1101 if (error_code & X86_PF_PK)
1102 return 1;
1103
1104
1105
1106
1107
1108
1109 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
1110 (error_code & X86_PF_INSTR), foreign))
1111 return 1;
1112
1113 if (error_code & X86_PF_WRITE) {
1114
1115 if (unlikely(!(vma->vm_flags & VM_WRITE)))
1116 return 1;
1117 return 0;
1118 }
1119
1120
1121 if (unlikely(error_code & X86_PF_PROT))
1122 return 1;
1123
1124
1125 if (unlikely(!vma_is_accessible(vma)))
1126 return 1;
1127
1128 return 0;
1129}
1130
1131static int fault_in_kernel_space(unsigned long address)
1132{
1133
1134
1135
1136
1137
1138 if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1139 return false;
1140
1141 return address >= TASK_SIZE_MAX;
1142}
1143
1144
1145
1146
1147
1148
1149static void
1150do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1151 unsigned long address)
1152{
1153
1154
1155
1156
1157
1158 WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1159
1160#ifdef CONFIG_X86_32
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185 if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1186 if (vmalloc_fault(address) >= 0)
1187 return;
1188 }
1189#endif
1190
1191
1192 if (spurious_kernel_fault(hw_error_code, address))
1193 return;
1194
1195
1196 if (kprobe_page_fault(regs, X86_TRAP_PF))
1197 return;
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207 bad_area_nosemaphore(regs, hw_error_code, address);
1208}
1209NOKPROBE_SYMBOL(do_kern_addr_fault);
1210
1211
/*
 * Handle a fault on a user-space address.
 *
 * @regs:          register state of the faulting context
 * @hw_error_code: page-fault error code
 * @address:       faulting address
 *
 * After a series of early sanity checks, looks up the VMA covering
 * @address under the mmap read lock, validates the access and calls
 * handle_mm_fault().  Failures end up in one of the bad_area*() helpers,
 * in no_context(), or in mm_fault_error().
 */
static inline
void do_user_addr_fault(struct pt_regs *regs,
			unsigned long hw_error_code,
			unsigned long address)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault;
	unsigned int flags = FAULT_FLAG_DEFAULT;

	tsk = current;
	mm = tsk->mm;

	/* Let kprobes claim the fault first. */
	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
		return;

	/*
	 * A reserved-bit violation is reported as a corrupted page table.
	 * NOTE(review): there is no return here, so execution continues
	 * below if pgtable_bad() returns.
	 */
	if (unlikely(hw_error_code & X86_PF_RSVD))
		pgtable_bad(regs, hw_error_code, address);

	/*
	 * With SMAP enabled, a fault without X86_PF_USER in the error code
	 * and with EFLAGS.AC clear in the interrupted context is treated as
	 * a bad access.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(hw_error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC)))
	{
		bad_area_nosemaphore(regs, hw_error_code, address);
		return;
	}

	/*
	 * Fault handling is disabled in this context, or there is no mm to
	 * resolve the fault against: treat it as a bad access.
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
		bad_area_nosemaphore(regs, hw_error_code, address);
		return;
	}

	/*
	 * Enable interrupts: unconditionally for user-mode faults (which
	 * are also flagged FAULT_FLAG_USER), and for kernel-mode faults
	 * only if the interrupted context had them enabled.
	 */
	if (user_mode(regs)) {
		local_irq_enable();
		flags |= FAULT_FLAG_USER;
	} else {
		if (regs->flags & X86_EFLAGS_IF)
			local_irq_enable();
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	/* Translate the hardware error bits into generic fault flags. */
	if (hw_error_code & X86_PF_WRITE)
		flags |= FAULT_FLAG_WRITE;
	if (hw_error_code & X86_PF_INSTR)
		flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
	/*
	 * Faults in the vsyscall page are offered to the vsyscall
	 * emulation; if it declines, normal handling continues.
	 */
	if (is_vsyscall_vaddr(address)) {
		if (emulate_vsyscall(hw_error_code, regs, address))
			return;
	}
#endif

	/*
	 * Take the mmap read lock.  If the trylock fails for a kernel-mode
	 * fault whose instruction has no exception-table entry, report a
	 * bad access instead of blocking on the lock.
	 */
	if (unlikely(!mmap_read_trylock(mm))) {
		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
			/*
			 * Kernel fault at an instruction with no fixup:
			 * do not wait for the lock.
			 */
			bad_area_nosemaphore(regs, hw_error_code, address);
			return;
		}
retry:
		/* Also the re-entry point after VM_FAULT_RETRY (see below). */
		mmap_read_lock(mm);
	} else {
		/*
		 * The trylock succeeded; annotate the path as one that may
		 * sleep, like the blocking lock above.
		 */
		might_sleep();
	}

	/* Find the VMA for @address, growing a GROWSDOWN VMA if needed. */
	vma = find_vma(mm, address);
	if (unlikely(!vma)) {
		bad_area(regs, hw_error_code, address);
		return;
	}
	if (likely(vma->vm_start <= address))
		goto good_area;
	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
		bad_area(regs, hw_error_code, address);
		return;
	}
	if (unlikely(expand_stack(vma, address))) {
		bad_area(regs, hw_error_code, address);
		return;
	}

	/*
	 * The address is covered by a VMA; check that the access type is
	 * permitted before handling the fault.
	 */
good_area:
	if (unlikely(access_error(hw_error_code, vma))) {
		bad_area_access_error(regs, hw_error_code, address, vma);
		return;
	}

	/*
	 * Let the generic mm code resolve the fault.  The result is a
	 * bitmask checked below for pending signals, retry and errors.
	 */
	fault = handle_mm_fault(vma, address, flags, regs);

	/*
	 * A signal interrupted the fault.  Kernel-mode faults go through
	 * no_context(); user-mode faults just return.
	 * NOTE(review): the mmap lock is not released on this path -
	 * presumably it was already dropped by the fault path; confirm.
	 */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			no_context(regs, hw_error_code, address, SIGBUS,
				   BUS_ADRERR);
		return;
	}

	/*
	 * Retry requested: mark the attempt as already tried and jump back
	 * to re-take the mmap lock.
	 */
	if (unlikely((fault & VM_FAULT_RETRY) &&
		     (flags & FAULT_FLAG_ALLOW_RETRY))) {
		flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	mmap_read_unlock(mm);
	if (unlikely(fault & VM_FAULT_ERROR)) {
		mm_fault_error(regs, hw_error_code, address, fault);
		return;
	}

	check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);
1402
1403static __always_inline void
1404trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
1405 unsigned long address)
1406{
1407 if (!trace_pagefault_enabled())
1408 return;
1409
1410 if (user_mode(regs))
1411 trace_page_fault_user(address, regs, error_code);
1412 else
1413 trace_page_fault_kernel(address, regs, error_code);
1414}
1415
1416static __always_inline void
1417handle_page_fault(struct pt_regs *regs, unsigned long error_code,
1418 unsigned long address)
1419{
1420 trace_page_fault_entries(regs, error_code, address);
1421
1422 if (unlikely(kmmio_fault(regs, address)))
1423 return;
1424
1425
1426 if (unlikely(fault_in_kernel_space(address))) {
1427 do_kern_addr_fault(regs, error_code, address);
1428 } else {
1429 do_user_addr_fault(regs, error_code, address);
1430
1431
1432
1433
1434
1435
1436
1437 local_irq_disable();
1438 }
1439}
1440
1441DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
1442{
1443 unsigned long address = read_cr2();
1444 irqentry_state_t state;
1445
1446 prefetchw(¤t->mm->mmap_lock);
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466 if (kvm_handle_async_pf(regs, (u32)address))
1467 return;
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479 state = irqentry_enter(regs);
1480
1481 instrumentation_begin();
1482 handle_page_fault(regs, error_code, address);
1483 instrumentation_end();
1484
1485 irqentry_exit(regs, state);
1486}
1487