1
2
3
4
5
6
7#include <linux/sched.h>
8#include <linux/sched/task_stack.h>
9#include <linux/kdebug.h>
10#include <linux/extable.h>
11#include <linux/memblock.h>
12#include <linux/kprobes.h>
13#include <linux/mmiotrace.h>
14#include <linux/perf_event.h>
15#include <linux/hugetlb.h>
16#include <linux/prefetch.h>
17#include <linux/context_tracking.h>
18#include <linux/uaccess.h>
19#include <linux/efi.h>
20#include <linux/mm_types.h>
21
22#include <asm/cpufeature.h>
23#include <asm/traps.h>
24#include <asm/fixmap.h>
25#include <asm/vsyscall.h>
26#include <asm/vm86.h>
27#include <asm/mmu_context.h>
28#include <asm/efi.h>
29#include <asm/desc.h>
30#include <asm/cpu_entry_area.h>
31#include <asm/pgtable_areas.h>
32#include <asm/kvm_para.h>
33#include <asm/vdso.h>
34
35#define CREATE_TRACE_POINTS
36#include <asm/trace/exceptions.h>
37
38
39
40
41
42static nokprobe_inline int
43kmmio_fault(struct pt_regs *regs, unsigned long addr)
44{
45 if (unlikely(is_kmmio_active()))
46 if (kmmio_handler(regs, addr) == 1)
47 return -1;
48 return 0;
49}
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66static inline int
67check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
68 unsigned char opcode, int *prefetch)
69{
70 unsigned char instr_hi = opcode & 0xf0;
71 unsigned char instr_lo = opcode & 0x0f;
72
73 switch (instr_hi) {
74 case 0x20:
75 case 0x30:
76
77
78
79
80
81
82 return ((instr_lo & 7) == 0x6);
83#ifdef CONFIG_X86_64
84 case 0x40:
85
86
87
88
89
90
91
92 return (!user_mode(regs) || user_64bit_mode(regs));
93#endif
94 case 0x60:
95
96 return (instr_lo & 0xC) == 0x4;
97 case 0xF0:
98
99 return !instr_lo || (instr_lo>>1) == 1;
100 case 0x00:
101
102 if (get_kernel_nofault(opcode, instr))
103 return 0;
104
105 *prefetch = (instr_lo == 0xF) &&
106 (opcode == 0x0D || opcode == 0x18);
107 return 0;
108 default:
109 return 0;
110 }
111}
112
113static int
114is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
115{
116 unsigned char *max_instr;
117 unsigned char *instr;
118 int prefetch = 0;
119
120
121
122
123
124 if (error_code & X86_PF_INSTR)
125 return 0;
126
127 instr = (void *)convert_ip_to_linear(current, regs);
128 max_instr = instr + 15;
129
130 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
131 return 0;
132
133 while (instr < max_instr) {
134 unsigned char opcode;
135
136 if (get_kernel_nofault(opcode, instr))
137 break;
138
139 instr++;
140
141 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
142 break;
143 }
144 return prefetch;
145}
146
147DEFINE_SPINLOCK(pgd_lock);
148LIST_HEAD(pgd_list);
149
150#ifdef CONFIG_X86_32
151static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
152{
153 unsigned index = pgd_index(address);
154 pgd_t *pgd_k;
155 p4d_t *p4d, *p4d_k;
156 pud_t *pud, *pud_k;
157 pmd_t *pmd, *pmd_k;
158
159 pgd += index;
160 pgd_k = init_mm.pgd + index;
161
162 if (!pgd_present(*pgd_k))
163 return NULL;
164
165
166
167
168
169
170 p4d = p4d_offset(pgd, address);
171 p4d_k = p4d_offset(pgd_k, address);
172 if (!p4d_present(*p4d_k))
173 return NULL;
174
175 pud = pud_offset(p4d, address);
176 pud_k = pud_offset(p4d_k, address);
177 if (!pud_present(*pud_k))
178 return NULL;
179
180 pmd = pmd_offset(pud, address);
181 pmd_k = pmd_offset(pud_k, address);
182
183 if (pmd_present(*pmd) != pmd_present(*pmd_k))
184 set_pmd(pmd, *pmd_k);
185
186 if (!pmd_present(*pmd_k))
187 return NULL;
188 else
189 BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
190
191 return pmd_k;
192}
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208static noinline int vmalloc_fault(unsigned long address)
209{
210 unsigned long pgd_paddr;
211 pmd_t *pmd_k;
212 pte_t *pte_k;
213
214
215 if (!(address >= VMALLOC_START && address < VMALLOC_END))
216 return -1;
217
218
219
220
221
222
223
224
225 pgd_paddr = read_cr3_pa();
226 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
227 if (!pmd_k)
228 return -1;
229
230 if (pmd_large(*pmd_k))
231 return 0;
232
233 pte_k = pte_offset_kernel(pmd_k, address);
234 if (!pte_present(*pte_k))
235 return -1;
236
237 return 0;
238}
239NOKPROBE_SYMBOL(vmalloc_fault);
240
241void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
242{
243 unsigned long addr;
244
245 for (addr = start & PMD_MASK;
246 addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
247 addr += PMD_SIZE) {
248 struct page *page;
249
250 spin_lock(&pgd_lock);
251 list_for_each_entry(page, &pgd_list, lru) {
252 spinlock_t *pgt_lock;
253
254
255 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
256
257 spin_lock(pgt_lock);
258 vmalloc_sync_one(page_address(page), addr);
259 spin_unlock(pgt_lock);
260 }
261 spin_unlock(&pgd_lock);
262 }
263}
264
265
266
267
268static inline void
269check_v8086_mode(struct pt_regs *regs, unsigned long address,
270 struct task_struct *tsk)
271{
272#ifdef CONFIG_VM86
273 unsigned long bit;
274
275 if (!v8086_mode(regs) || !tsk->thread.vm86)
276 return;
277
278 bit = (address - 0xA0000) >> PAGE_SHIFT;
279 if (bit < 32)
280 tsk->thread.vm86->screen_bitmap |= 1 << bit;
281#endif
282}
283
284static bool low_pfn(unsigned long pfn)
285{
286 return pfn < max_low_pfn;
287}
288
289static void dump_pagetable(unsigned long address)
290{
291 pgd_t *base = __va(read_cr3_pa());
292 pgd_t *pgd = &base[pgd_index(address)];
293 p4d_t *p4d;
294 pud_t *pud;
295 pmd_t *pmd;
296 pte_t *pte;
297
298#ifdef CONFIG_X86_PAE
299 pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
300 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
301 goto out;
302#define pr_pde pr_cont
303#else
304#define pr_pde pr_info
305#endif
306 p4d = p4d_offset(pgd, address);
307 pud = pud_offset(p4d, address);
308 pmd = pmd_offset(pud, address);
309 pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
310#undef pr_pde
311
312
313
314
315
316
317
318 if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
319 goto out;
320
321 pte = pte_offset_kernel(pmd, address);
322 pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
323out:
324 pr_cont("\n");
325}
326
327#else
328
329#ifdef CONFIG_CPU_SUP_AMD
330static const char errata93_warning[] =
331KERN_ERR
332"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
333"******* Working around it, but it may cause SEGVs or burn power.\n"
334"******* Please consider a BIOS update.\n"
335"******* Disabling USB legacy in the BIOS may also help.\n";
336#endif
337
338
339
340
341static inline void
342check_v8086_mode(struct pt_regs *regs, unsigned long address,
343 struct task_struct *tsk)
344{
345}
346
347static int bad_address(void *p)
348{
349 unsigned long dummy;
350
351 return get_kernel_nofault(dummy, (unsigned long *)p);
352}
353
354static void dump_pagetable(unsigned long address)
355{
356 pgd_t *base = __va(read_cr3_pa());
357 pgd_t *pgd = base + pgd_index(address);
358 p4d_t *p4d;
359 pud_t *pud;
360 pmd_t *pmd;
361 pte_t *pte;
362
363 if (bad_address(pgd))
364 goto bad;
365
366 pr_info("PGD %lx ", pgd_val(*pgd));
367
368 if (!pgd_present(*pgd))
369 goto out;
370
371 p4d = p4d_offset(pgd, address);
372 if (bad_address(p4d))
373 goto bad;
374
375 pr_cont("P4D %lx ", p4d_val(*p4d));
376 if (!p4d_present(*p4d) || p4d_large(*p4d))
377 goto out;
378
379 pud = pud_offset(p4d, address);
380 if (bad_address(pud))
381 goto bad;
382
383 pr_cont("PUD %lx ", pud_val(*pud));
384 if (!pud_present(*pud) || pud_large(*pud))
385 goto out;
386
387 pmd = pmd_offset(pud, address);
388 if (bad_address(pmd))
389 goto bad;
390
391 pr_cont("PMD %lx ", pmd_val(*pmd));
392 if (!pmd_present(*pmd) || pmd_large(*pmd))
393 goto out;
394
395 pte = pte_offset_kernel(pmd, address);
396 if (bad_address(pte))
397 goto bad;
398
399 pr_cont("PTE %lx", pte_val(*pte));
400out:
401 pr_cont("\n");
402 return;
403bad:
404 pr_info("BAD\n");
405}
406
407#endif
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423static int is_errata93(struct pt_regs *regs, unsigned long address)
424{
425#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
426 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
427 || boot_cpu_data.x86 != 0xf)
428 return 0;
429
430 if (address != regs->ip)
431 return 0;
432
433 if ((address >> 32) != 0)
434 return 0;
435
436 address |= 0xffffffffUL << 32;
437 if ((address >= (u64)_stext && address <= (u64)_etext) ||
438 (address >= MODULES_VADDR && address <= MODULES_END)) {
439 printk_once(errata93_warning);
440 regs->ip = address;
441 return 1;
442 }
443#endif
444 return 0;
445}
446
447
448
449
450
451
452
453
454
455static int is_errata100(struct pt_regs *regs, unsigned long address)
456{
457#ifdef CONFIG_X86_64
458 if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
459 return 1;
460#endif
461 return 0;
462}
463
464
465static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
466{
467#ifdef CONFIG_X86_F00F_BUG
468 if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
469 handle_invalid_op(regs);
470 return 1;
471 }
472#endif
473 return 0;
474}
475
476static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
477{
478 u32 offset = (index >> 3) * sizeof(struct desc_struct);
479 unsigned long addr;
480 struct ldttss_desc desc;
481
482 if (index == 0) {
483 pr_alert("%s: NULL\n", name);
484 return;
485 }
486
487 if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
488 pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
489 return;
490 }
491
492 if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
493 sizeof(struct ldttss_desc))) {
494 pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
495 name, index);
496 return;
497 }
498
499 addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
500#ifdef CONFIG_X86_64
501 addr |= ((u64)desc.base3 << 32);
502#endif
503 pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
504 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
505}
506
507static void
508show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
509{
510 if (!oops_may_print())
511 return;
512
513 if (error_code & X86_PF_INSTR) {
514 unsigned int level;
515 pgd_t *pgd;
516 pte_t *pte;
517
518 pgd = __va(read_cr3_pa());
519 pgd += pgd_index(address);
520
521 pte = lookup_address_in_pgd(pgd, address, &level);
522
523 if (pte && pte_present(*pte) && !pte_exec(*pte))
524 pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
525 from_kuid(&init_user_ns, current_uid()));
526 if (pte && pte_present(*pte) && pte_exec(*pte) &&
527 (pgd_flags(*pgd) & _PAGE_USER) &&
528 (__read_cr4() & X86_CR4_SMEP))
529 pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
530 from_kuid(&init_user_ns, current_uid()));
531 }
532
533 if (address < PAGE_SIZE && !user_mode(regs))
534 pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
535 (void *)address);
536 else
537 pr_alert("BUG: unable to handle page fault for address: %px\n",
538 (void *)address);
539
540 pr_alert("#PF: %s %s in %s mode\n",
541 (error_code & X86_PF_USER) ? "user" : "supervisor",
542 (error_code & X86_PF_INSTR) ? "instruction fetch" :
543 (error_code & X86_PF_WRITE) ? "write access" :
544 "read access",
545 user_mode(regs) ? "user" : "kernel");
546 pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
547 !(error_code & X86_PF_PROT) ? "not-present page" :
548 (error_code & X86_PF_RSVD) ? "reserved bit violation" :
549 (error_code & X86_PF_PK) ? "protection keys violation" :
550 "permissions violation");
551
552 if (!(error_code & X86_PF_USER) && user_mode(regs)) {
553 struct desc_ptr idt, gdt;
554 u16 ldtr, tr;
555
556
557
558
559
560
561
562
563
564
565
566 store_idt(&idt);
567
568
569 native_store_gdt(&gdt);
570
571 pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
572 idt.address, idt.size, gdt.address, gdt.size);
573
574 store_ldt(ldtr);
575 show_ldttss(&gdt, "LDTR", ldtr);
576
577 store_tr(tr);
578 show_ldttss(&gdt, "TR", tr);
579 }
580
581 dump_pagetable(address);
582}
583
584static noinline void
585pgtable_bad(struct pt_regs *regs, unsigned long error_code,
586 unsigned long address)
587{
588 struct task_struct *tsk;
589 unsigned long flags;
590 int sig;
591
592 flags = oops_begin();
593 tsk = current;
594 sig = SIGKILL;
595
596 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
597 tsk->comm, address);
598 dump_pagetable(address);
599
600 if (__die("Bad pagetable", regs, error_code))
601 sig = 0;
602
603 oops_end(flags, regs, sig);
604}
605
606static void sanitize_error_code(unsigned long address,
607 unsigned long *error_code)
608{
609
610
611
612
613
614
615
616
617
618 if (address >= TASK_SIZE_MAX)
619 *error_code |= X86_PF_PROT;
620}
621
622static void set_signal_archinfo(unsigned long address,
623 unsigned long error_code)
624{
625 struct task_struct *tsk = current;
626
627 tsk->thread.trap_nr = X86_TRAP_PF;
628 tsk->thread.error_code = error_code | X86_PF_USER;
629 tsk->thread.cr2 = address;
630}
631
632static noinline void
633no_context(struct pt_regs *regs, unsigned long error_code,
634 unsigned long address, int signal, int si_code)
635{
636 struct task_struct *tsk = current;
637 unsigned long flags;
638 int sig;
639
640 if (user_mode(regs)) {
641
642
643
644
645
646 goto oops;
647 }
648
649
650 if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
651
652
653
654
655
656 if (in_interrupt())
657 return;
658
659
660
661
662
663
664
665 if (current->thread.sig_on_uaccess_err && signal) {
666 sanitize_error_code(address, &error_code);
667
668 set_signal_archinfo(address, error_code);
669
670
671 force_sig_fault(signal, si_code, (void __user *)address);
672 }
673
674
675
676
677 return;
678 }
679
680#ifdef CONFIG_VMAP_STACK
681
682
683
684
685
686 if (is_vmalloc_addr((void *)address) &&
687 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
688 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
689 unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
690
691
692
693
694
695
696
697
698
699
700 asm volatile ("movq %[stack], %%rsp\n\t"
701 "call handle_stack_overflow\n\t"
702 "1: jmp 1b"
703 : ASM_CALL_CONSTRAINT
704 : "D" ("kernel stack overflow (page fault)"),
705 "S" (regs), "d" (address),
706 [stack] "rm" (stack));
707 unreachable();
708 }
709#endif
710
711
712
713
714
715
716
717
718
719
720
721
722 if (is_prefetch(regs, error_code, address))
723 return;
724
725 if (is_errata93(regs, address))
726 return;
727
728
729
730
731
732 if (IS_ENABLED(CONFIG_EFI))
733 efi_recover_from_page_fault(address);
734
735oops:
736
737
738
739
740 flags = oops_begin();
741
742 show_fault_oops(regs, error_code, address);
743
744 if (task_stack_end_corrupted(tsk))
745 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
746
747 sig = SIGKILL;
748 if (__die("Oops", regs, error_code))
749 sig = 0;
750
751
752 printk(KERN_DEFAULT "CR2: %016lx\n", address);
753
754 oops_end(flags, regs, sig);
755}
756
757
758
759
760
761static inline void
762show_signal_msg(struct pt_regs *regs, unsigned long error_code,
763 unsigned long address, struct task_struct *tsk)
764{
765 const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
766
767 if (!unhandled_signal(tsk, SIGSEGV))
768 return;
769
770 if (!printk_ratelimit())
771 return;
772
773 printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
774 loglvl, tsk->comm, task_pid_nr(tsk), address,
775 (void *)regs->ip, (void *)regs->sp, error_code);
776
777 print_vma_addr(KERN_CONT " in ", regs->ip);
778
779 printk(KERN_CONT "\n");
780
781 show_opcodes(regs, loglvl);
782}
783
784
785
786
787
788static bool is_vsyscall_vaddr(unsigned long vaddr)
789{
790 return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
791}
792
793static void
794__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
795 unsigned long address, u32 pkey, int si_code)
796{
797 struct task_struct *tsk = current;
798
799
800 if (user_mode(regs) && (error_code & X86_PF_USER)) {
801
802
803
804 local_irq_enable();
805
806
807
808
809
810 if (is_prefetch(regs, error_code, address))
811 return;
812
813 if (is_errata100(regs, address))
814 return;
815
816 sanitize_error_code(address, &error_code);
817
818 if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
819 return;
820
821 if (likely(show_unhandled_signals))
822 show_signal_msg(regs, error_code, address, tsk);
823
824 set_signal_archinfo(address, error_code);
825
826 if (si_code == SEGV_PKUERR)
827 force_sig_pkuerr((void __user *)address, pkey);
828
829 force_sig_fault(SIGSEGV, si_code, (void __user *)address);
830
831 local_irq_disable();
832
833 return;
834 }
835
836 if (is_f00f_bug(regs, address))
837 return;
838
839 no_context(regs, error_code, address, SIGSEGV, si_code);
840}
841
842static noinline void
843bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
844 unsigned long address)
845{
846 __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
847}
848
849static void
850__bad_area(struct pt_regs *regs, unsigned long error_code,
851 unsigned long address, u32 pkey, int si_code)
852{
853 struct mm_struct *mm = current->mm;
854
855
856
857
858 mmap_read_unlock(mm);
859
860 __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
861}
862
863static noinline void
864bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
865{
866 __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
867}
868
869static inline bool bad_area_access_from_pkeys(unsigned long error_code,
870 struct vm_area_struct *vma)
871{
872
873 bool foreign = false;
874
875 if (!boot_cpu_has(X86_FEATURE_OSPKE))
876 return false;
877 if (error_code & X86_PF_PK)
878 return true;
879
880 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
881 (error_code & X86_PF_INSTR), foreign))
882 return true;
883 return false;
884}
885
886static noinline void
887bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
888 unsigned long address, struct vm_area_struct *vma)
889{
890
891
892
893
894
895 if (bad_area_access_from_pkeys(error_code, vma)) {
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916 u32 pkey = vma_pkey(vma);
917
918 __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
919 } else {
920 __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
921 }
922}
923
924static void
925do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
926 vm_fault_t fault)
927{
928
929 if (!(error_code & X86_PF_USER)) {
930 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
931 return;
932 }
933
934
935 if (is_prefetch(regs, error_code, address))
936 return;
937
938 sanitize_error_code(address, &error_code);
939
940 if (fixup_vdso_exception(regs, X86_TRAP_PF, error_code, address))
941 return;
942
943 set_signal_archinfo(address, error_code);
944
945#ifdef CONFIG_MEMORY_FAILURE
946 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
947 struct task_struct *tsk = current;
948 unsigned lsb = 0;
949
950 pr_err(
951 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
952 tsk->comm, tsk->pid, address);
953 if (fault & VM_FAULT_HWPOISON_LARGE)
954 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
955 if (fault & VM_FAULT_HWPOISON)
956 lsb = PAGE_SHIFT;
957 force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
958 return;
959 }
960#endif
961 force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
962}
963
964static noinline void
965mm_fault_error(struct pt_regs *regs, unsigned long error_code,
966 unsigned long address, vm_fault_t fault)
967{
968 if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
969 no_context(regs, error_code, address, 0, 0);
970 return;
971 }
972
973 if (fault & VM_FAULT_OOM) {
974
975 if (!(error_code & X86_PF_USER)) {
976 no_context(regs, error_code, address,
977 SIGSEGV, SEGV_MAPERR);
978 return;
979 }
980
981
982
983
984
985
986 pagefault_out_of_memory();
987 } else {
988 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
989 VM_FAULT_HWPOISON_LARGE))
990 do_sigbus(regs, error_code, address, fault);
991 else if (fault & VM_FAULT_SIGSEGV)
992 bad_area_nosemaphore(regs, error_code, address);
993 else
994 BUG();
995 }
996}
997
998static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
999{
1000 if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
1001 return 0;
1002
1003 if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
1004 return 0;
1005
1006 return 1;
1007}
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030static noinline int
1031spurious_kernel_fault(unsigned long error_code, unsigned long address)
1032{
1033 pgd_t *pgd;
1034 p4d_t *p4d;
1035 pud_t *pud;
1036 pmd_t *pmd;
1037 pte_t *pte;
1038 int ret;
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049 if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
1050 error_code != (X86_PF_INSTR | X86_PF_PROT))
1051 return 0;
1052
1053 pgd = init_mm.pgd + pgd_index(address);
1054 if (!pgd_present(*pgd))
1055 return 0;
1056
1057 p4d = p4d_offset(pgd, address);
1058 if (!p4d_present(*p4d))
1059 return 0;
1060
1061 if (p4d_large(*p4d))
1062 return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
1063
1064 pud = pud_offset(p4d, address);
1065 if (!pud_present(*pud))
1066 return 0;
1067
1068 if (pud_large(*pud))
1069 return spurious_kernel_fault_check(error_code, (pte_t *) pud);
1070
1071 pmd = pmd_offset(pud, address);
1072 if (!pmd_present(*pmd))
1073 return 0;
1074
1075 if (pmd_large(*pmd))
1076 return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1077
1078 pte = pte_offset_kernel(pmd, address);
1079 if (!pte_present(*pte))
1080 return 0;
1081
1082 ret = spurious_kernel_fault_check(error_code, pte);
1083 if (!ret)
1084 return 0;
1085
1086
1087
1088
1089
1090 ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
1091 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1092
1093 return ret;
1094}
1095NOKPROBE_SYMBOL(spurious_kernel_fault);
1096
1097int show_unhandled_signals = 1;
1098
1099static inline int
1100access_error(unsigned long error_code, struct vm_area_struct *vma)
1101{
1102
1103 bool foreign = false;
1104
1105
1106
1107
1108
1109
1110 if (error_code & X86_PF_PK)
1111 return 1;
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122 if (unlikely(error_code & X86_PF_SGX))
1123 return 1;
1124
1125
1126
1127
1128
1129
1130 if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
1131 (error_code & X86_PF_INSTR), foreign))
1132 return 1;
1133
1134 if (error_code & X86_PF_WRITE) {
1135
1136 if (unlikely(!(vma->vm_flags & VM_WRITE)))
1137 return 1;
1138 return 0;
1139 }
1140
1141
1142 if (unlikely(error_code & X86_PF_PROT))
1143 return 1;
1144
1145
1146 if (unlikely(!vma_is_accessible(vma)))
1147 return 1;
1148
1149 return 0;
1150}
1151
1152bool fault_in_kernel_space(unsigned long address)
1153{
1154
1155
1156
1157
1158
1159 if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1160 return false;
1161
1162 return address >= TASK_SIZE_MAX;
1163}
1164
1165
1166
1167
1168
1169
1170static void
1171do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1172 unsigned long address)
1173{
1174
1175
1176
1177
1178
1179 WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1180
1181#ifdef CONFIG_X86_32
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206 if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1207 if (vmalloc_fault(address) >= 0)
1208 return;
1209 }
1210#endif
1211
1212
1213 if (spurious_kernel_fault(hw_error_code, address))
1214 return;
1215
1216
1217 if (kprobe_page_fault(regs, X86_TRAP_PF))
1218 return;
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228 bad_area_nosemaphore(regs, hw_error_code, address);
1229}
1230NOKPROBE_SYMBOL(do_kern_addr_fault);
1231
1232
1233static inline
1234void do_user_addr_fault(struct pt_regs *regs,
1235 unsigned long hw_error_code,
1236 unsigned long address)
1237{
1238 struct vm_area_struct *vma;
1239 struct task_struct *tsk;
1240 struct mm_struct *mm;
1241 vm_fault_t fault;
1242 unsigned int flags = FAULT_FLAG_DEFAULT;
1243
1244 tsk = current;
1245 mm = tsk->mm;
1246
1247
1248 if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
1249 return;
1250
1251
1252
1253
1254
1255 if (unlikely(hw_error_code & X86_PF_RSVD))
1256 pgtable_bad(regs, hw_error_code, address);
1257
1258
1259
1260
1261
1262
1263
1264
1265 if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
1266 !(hw_error_code & X86_PF_USER) &&
1267 !(regs->flags & X86_EFLAGS_AC)))
1268 {
1269 bad_area_nosemaphore(regs, hw_error_code, address);
1270 return;
1271 }
1272
1273
1274
1275
1276
1277 if (unlikely(faulthandler_disabled() || !mm)) {
1278 bad_area_nosemaphore(regs, hw_error_code, address);
1279 return;
1280 }
1281
1282
1283
1284
1285
1286
1287
1288
1289 if (user_mode(regs)) {
1290 local_irq_enable();
1291 flags |= FAULT_FLAG_USER;
1292 } else {
1293 if (regs->flags & X86_EFLAGS_IF)
1294 local_irq_enable();
1295 }
1296
1297 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1298
1299 if (hw_error_code & X86_PF_WRITE)
1300 flags |= FAULT_FLAG_WRITE;
1301 if (hw_error_code & X86_PF_INSTR)
1302 flags |= FAULT_FLAG_INSTRUCTION;
1303
1304#ifdef CONFIG_X86_64
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316 if (is_vsyscall_vaddr(address)) {
1317 if (emulate_vsyscall(hw_error_code, regs, address))
1318 return;
1319 }
1320#endif
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334 if (unlikely(!mmap_read_trylock(mm))) {
1335 if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
1336
1337
1338
1339
1340 bad_area_nosemaphore(regs, hw_error_code, address);
1341 return;
1342 }
1343retry:
1344 mmap_read_lock(mm);
1345 } else {
1346
1347
1348
1349
1350
1351 might_sleep();
1352 }
1353
1354 vma = find_vma(mm, address);
1355 if (unlikely(!vma)) {
1356 bad_area(regs, hw_error_code, address);
1357 return;
1358 }
1359 if (likely(vma->vm_start <= address))
1360 goto good_area;
1361 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1362 bad_area(regs, hw_error_code, address);
1363 return;
1364 }
1365 if (unlikely(expand_stack(vma, address))) {
1366 bad_area(regs, hw_error_code, address);
1367 return;
1368 }
1369
1370
1371
1372
1373
1374good_area:
1375 if (unlikely(access_error(hw_error_code, vma))) {
1376 bad_area_access_error(regs, hw_error_code, address, vma);
1377 return;
1378 }
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393 fault = handle_mm_fault(vma, address, flags, regs);
1394
1395
1396 if (fault_signal_pending(fault, regs)) {
1397 if (!user_mode(regs))
1398 no_context(regs, hw_error_code, address, SIGBUS,
1399 BUS_ADRERR);
1400 return;
1401 }
1402
1403
1404
1405
1406
1407
1408 if (unlikely((fault & VM_FAULT_RETRY) &&
1409 (flags & FAULT_FLAG_ALLOW_RETRY))) {
1410 flags |= FAULT_FLAG_TRIED;
1411 goto retry;
1412 }
1413
1414 mmap_read_unlock(mm);
1415 if (unlikely(fault & VM_FAULT_ERROR)) {
1416 mm_fault_error(regs, hw_error_code, address, fault);
1417 return;
1418 }
1419
1420 check_v8086_mode(regs, address, tsk);
1421}
1422NOKPROBE_SYMBOL(do_user_addr_fault);
1423
1424static __always_inline void
1425trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
1426 unsigned long address)
1427{
1428 if (!trace_pagefault_enabled())
1429 return;
1430
1431 if (user_mode(regs))
1432 trace_page_fault_user(address, regs, error_code);
1433 else
1434 trace_page_fault_kernel(address, regs, error_code);
1435}
1436
1437static __always_inline void
1438handle_page_fault(struct pt_regs *regs, unsigned long error_code,
1439 unsigned long address)
1440{
1441 trace_page_fault_entries(regs, error_code, address);
1442
1443 if (unlikely(kmmio_fault(regs, address)))
1444 return;
1445
1446
1447 if (unlikely(fault_in_kernel_space(address))) {
1448 do_kern_addr_fault(regs, error_code, address);
1449 } else {
1450 do_user_addr_fault(regs, error_code, address);
1451
1452
1453
1454
1455
1456
1457
1458 local_irq_disable();
1459 }
1460}
1461
1462DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
1463{
1464 unsigned long address = read_cr2();
1465 irqentry_state_t state;
1466
1467 prefetchw(¤t->mm->mmap_lock);
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490 if (kvm_handle_async_pf(regs, (u32)address))
1491 return;
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503 state = irqentry_enter(regs);
1504
1505 instrumentation_begin();
1506 handle_page_fault(regs, error_code, address);
1507 instrumentation_end();
1508
1509 irqentry_exit(regs, state);
1510}
1511