1
2
3
4
5
6
7
8
9
10
11
12#include <linux/thread_info.h>
13#include <linux/capability.h>
14#include <linux/miscdevice.h>
15#include <linux/ratelimit.h>
16#include <linux/rcupdate.h>
17#include <linux/kobject.h>
18#include <linux/uaccess.h>
19#include <linux/kdebug.h>
20#include <linux/kernel.h>
21#include <linux/percpu.h>
22#include <linux/string.h>
23#include <linux/device.h>
24#include <linux/syscore_ops.h>
25#include <linux/delay.h>
26#include <linux/ctype.h>
27#include <linux/sched.h>
28#include <linux/sysfs.h>
29#include <linux/types.h>
30#include <linux/slab.h>
31#include <linux/init.h>
32#include <linux/kmod.h>
33#include <linux/poll.h>
34#include <linux/nmi.h>
35#include <linux/cpu.h>
36#include <linux/ras.h>
37#include <linux/smp.h>
38#include <linux/fs.h>
39#include <linux/mm.h>
40#include <linux/debugfs.h>
41#include <linux/irq_work.h>
42#include <linux/export.h>
43#include <linux/jump_label.h>
44#include <linux/set_memory.h>
45#include <linux/task_work.h>
46#include <linux/hardirq.h>
47
48#include <asm/intel-family.h>
49#include <asm/processor.h>
50#include <asm/traps.h>
51#include <asm/tlbflush.h>
52#include <asm/mce.h>
53#include <asm/msr.h>
54#include <asm/reboot.h>
55
56#include "internal.h"
57
58
/* Serializes sysfs-driven reconfiguration of MCA bank controls/attributes. */
static DEFINE_MUTEX(mce_sysfs_mutex);

#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>

/* Spin granularity (ns) while waiting for other CPUs in the MCE rendezvous. */
#define SPINUNIT 100

/* Number of machine-check exceptions taken on this CPU. */
DEFINE_PER_CPU(unsigned, mce_exception_count);

/* Number of MCA banks this CPU exposes (read from MCG_CAP at init). */
DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks);

/*
 * Per-bank software state: the MCi_CTL value to program and whether the
 * bank should be (re)initialized at all.
 */
struct mce_bank {
 u64 ctl;
 bool init;
};
static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array);

#define ATTR_LEN 16

/* sysfs device attribute wrapper for one MCA bank ("bankN" files). */
struct mce_bank_dev {
 struct device_attribute attr;
 char attrname[ATTR_LEN];
 u8 bank;
};
static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS];

/* Vendor capability flags (SMCA, SUCCOR, ...), set in __mcheck_cpu_init_early(). */
struct mce_vendor_flags mce_flags __read_mostly;

/*
 * Global MCA configuration.  bootlog/monarch_timeout start at -1 meaning
 * "not configured"; quirks and command line may override them later.
 */
struct mca_config mca_cfg __read_mostly = {
 .bootlog = -1,




 /* tolerant=1: default recovery policy; see mca_cfg users below. */


 .tolerant = 1,
 .monarch_timeout = -1
};

/* Worst MCE seen on each CPU during the current broadcast event (mce_reign()). */
static DEFINE_PER_CPU(struct mce, mces_seen);
/* Bit 0 set when userspace should be notified of new events. */
static unsigned long mce_need_notify;
/* Set when a CPU failed to answer during the rendezvous; reported in mce_panic(). */
static int cpu_missing;



/* Banks polled by the timer / CMCI; all banks enabled by default. */

DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};



/*
 * Banks for which corrected-error reporting has been disabled
 * (see mce_disable_bank() users elsewhere in the file/subsystem).
 */



mce_banks_t mce_banks_ce_disabled;

/* Deferred processing: irq_work kicks the workqueue from NMI-ish context. */
static struct work_struct mce_work;
static struct irq_work mce_irq_work;

/* Optional CPU-model quirk applied per bank in mce_no_way_out(). */
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);


/*
 * Notifier chain through which MCE records are handed to decoders
 * (mcelog, EDAC, default printer, ...), ordered by MCE_PRIO_*.
 */

BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
131
132
/*
 * Initialize a fresh MCE record with invariant per-CPU context: CPU ids,
 * wall-clock time, vendor, CPUID signature, topology ids, MCG_CAP, PPIN
 * (when the feature is present) and microcode revision.
 *
 * noinstr: runs inside the #MC exception path, so only instrumentation-free
 * accessors (__rdmsr, __ktime_get_real_seconds) may be used here.
 */
noinstr void mce_setup(struct mce *m)
{
 memset(m, 0, sizeof(struct mce));
 m->cpu = m->extcpu = smp_processor_id();
 /* Non-instrumented clock read - required in noinstr context. */
 m->time = __ktime_get_real_seconds();
 m->cpuvendor = boot_cpu_data.x86_vendor;
 m->cpuid = cpuid_eax(1);
 m->socketid = cpu_data(m->extcpu).phys_proc_id;
 m->apicid = cpu_data(m->extcpu).initial_apicid;
 m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);

 /* PPIN lives in different MSRs on Intel vs. AMD. */
 if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
  m->ppin = __rdmsr(MSR_PPIN);
 else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
  m->ppin = __rdmsr(MSR_AMD_PPIN);

 m->microcode = boot_cpu_data.microcode;
}
152
/* Per-CPU staging record used by the mce-inject error-injection machinery. */
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);

/*
 * Queue an MCE record for deferred processing.  mce_gen_pool_add()
 * returning 0 means the record was stored; kick irq_work so it gets
 * handed to the decoder chain outside of exception context.
 */
void mce_log(struct mce *m)
{
 if (!mce_gen_pool_add(m))
  irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);

/*
 * Register a decoder on the MCE notifier chain.  Priorities strictly
 * between MCE_PRIO_EDAC and MCE_PRIO_MCELOG are reserved; refuse them.
 */
void mce_register_decode_chain(struct notifier_block *nb)
{
 if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
  return;

 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);

/* Remove a previously registered decoder from the chain. */
void mce_unregister_decode_chain(struct notifier_block *nb)
{
 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
177
/* Legacy MCA MSR layout: per-bank CTL/STATUS/ADDR/MISC register numbers. */
static inline u32 ctl_reg(int bank)
{
 return MSR_IA32_MCx_CTL(bank);
}

static inline u32 status_reg(int bank)
{
 return MSR_IA32_MCx_STATUS(bank);
}

static inline u32 addr_reg(int bank)
{
 return MSR_IA32_MCx_ADDR(bank);
}

static inline u32 misc_reg(int bank)
{
 return MSR_IA32_MCx_MISC(bank);
}

/* AMD Scalable MCA layout: same roles, different MSR ranges. */
static inline u32 smca_ctl_reg(int bank)
{
 return MSR_AMD64_SMCA_MCx_CTL(bank);
}

static inline u32 smca_status_reg(int bank)
{
 return MSR_AMD64_SMCA_MCx_STATUS(bank);
}

static inline u32 smca_addr_reg(int bank)
{
 return MSR_AMD64_SMCA_MCx_ADDR(bank);
}

static inline u32 smca_misc_reg(int bank)
{
 return MSR_AMD64_SMCA_MCx_MISC(bank);
}

/*
 * Active bank-MSR accessors.  Defaults to the legacy layout; switched to
 * the smca_* variants in __mcheck_cpu_init_early() on SMCA-capable CPUs.
 */
struct mca_msr_regs msr_ops = {
 .ctl = ctl_reg,
 .status = status_reg,
 .addr = addr_reg,
 .misc = misc_reg
};
224
/*
 * Dump one MCE record to the console at emergency level: bank/status,
 * faulting RIP (with symbol when it is a kernel address), TSC, auxiliary
 * registers, SMCA extras, and processor identification.
 */
static void __print_mce(struct mce *m)
{
 pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 m->extcpu,
 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 m->mcgstatus, m->bank, m->status);

 if (m->ip) {
  /* Without EIPV the recorded IP is not guaranteed exact. */
  pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
  !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
  m->cs, m->ip);

  /* Symbolize only kernel-CS addresses. */
  if (m->cs == __KERNEL_CS)
   pr_cont("{%pS}", (void *)(unsigned long)m->ip);
  pr_cont("\n");
 }

 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 if (m->addr)
  pr_cont("ADDR %llx ", m->addr);
 if (m->misc)
  pr_cont("MISC %llx ", m->misc);

 /* SYND/IPID registers exist only on Scalable MCA systems. */
 if (mce_flags.smca) {
  if (m->synd)
   pr_cont("SYND %llx ", m->synd);
  if (m->ipid)
   pr_cont("IPID %llx ", m->ipid);
 }

 pr_cont("\n");

 /*
  * Processor identification so the record can be decoded even on a
  * different machine.
  */

 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 m->microcode);
}
265
266static void print_mce(struct mce *m)
267{
268 __print_mce(m);
269
270 if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
271 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
272}
273
/* Seconds to wait for another CPU's panic to take effect before forcing one. */
#define PANIC_TIMEOUT 5

/* Non-zero once some CPU has entered mce_panic() for real. */
static atomic_t mce_panicked;

/* Fake-panic mode (set via debugfs elsewhere): print but do not panic. */
static int fake_panic;
static atomic_t mce_fake_panicked;

/*
 * Another CPU is already panicking; busy-wait (with interrupts enabled so
 * the panic IPI can arrive) and panic ourselves if it never completes.
 */
static void wait_for_panic(void)
{
 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;

 preempt_disable();
 local_irq_enable();
 while (timeout-- > 0)
  udelay(1);
 if (panic_timeout == 0)
  panic_timeout = mca_cfg.panic_timeout;
 panic("Panicing machine check CPU died");
}
294
/*
 * Panic (or fake-panic) after dumping every pending MCE record.
 * Corrected errors print first, then uncorrected ones, then @final (the
 * record that triggered the panic).  Records are also written to the APEI
 * ERST so they survive the reboot.  @exp is an optional explanation string.
 */
static void mce_panic(const char *msg, struct mce *final, char *exp)
{
 int apei_err = 0;
 struct llist_node *pending;
 struct mce_evt_llist *l;

 if (!fake_panic) {
  /* First CPU wins; everyone else waits for its panic. */
  if (atomic_inc_return(&mce_panicked) > 1)
   wait_for_panic();
  barrier();

  bust_spinlocks(1);
  console_verbose();
 } else {
  /* Fake-panic: only report the first event. */
  if (atomic_inc_return(&mce_fake_panicked) > 1)
   return;
 }
 pending = mce_gen_pool_prepare_records();
 /* First pass: corrected errors ('errors we keep'). */
 llist_for_each_entry(l, pending, llnode) {
  struct mce *m = &l->mce;
  if (!(m->status & MCI_STATUS_UC)) {
   print_mce(m);
   if (!apei_err)
    apei_err = apei_write_mce(m);
  }
 }
 /* Second pass: uncorrected errors, skipping a duplicate of @final. */
 llist_for_each_entry(l, pending, llnode) {
  struct mce *m = &l->mce;
  if (!(m->status & MCI_STATUS_UC))
   continue;
  if (!final || mce_cmp(m, final)) {
   print_mce(m);
   if (!apei_err)
    apei_err = apei_write_mce(m);
  }
 }
 if (final) {
  print_mce(final);
  if (!apei_err)
   apei_err = apei_write_mce(final);
 }
 if (cpu_missing)
  pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 if (exp)
  pr_emerg(HW_ERR "Machine check: %s\n", exp);
 if (!fake_panic) {
  if (panic_timeout == 0)
   panic_timeout = mca_cfg.panic_timeout;
  panic(msg);
 } else
  pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
353
354
355
/*
 * For error injection: map an MSR number to the offset of the matching
 * field inside struct mce, so reads/writes can be redirected to the
 * per-CPU injectm record.  Returns -1 for MSRs that are not emulated.
 */
static int msr_to_offset(u32 msr)
{
 unsigned bank = __this_cpu_read(injectm.bank);

 if (msr == mca_cfg.rip_msr)
  return offsetof(struct mce, ip);
 if (msr == msr_ops.status(bank))
  return offsetof(struct mce, status);
 if (msr == msr_ops.addr(bank))
  return offsetof(struct mce, addr);
 if (msr == msr_ops.misc(bank))
  return offsetof(struct mce, misc);
 if (msr == MSR_IA32_MCG_STATUS)
  return offsetof(struct mce, mcgstatus);
 return -1;
}
372
373
/*
 * MSR read used throughout MCE handling.  While an injection is in flight
 * (injectm.finished), reads are served from the per-CPU injectm record
 * instead of hardware.  A faulting rdmsr is reported once and treated as 0.
 */
static u64 mce_rdmsrl(u32 msr)
{
 u64 v;

 if (__this_cpu_read(injectm.finished)) {
  int offset = msr_to_offset(msr);

  if (offset < 0)
   return 0;
  return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 }

 if (rdmsrl_safe(msr, &v)) {
  WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
  /*
   * Return zero so callers see "no valid data" rather than
   * garbage for an inaccessible MSR.
   */
  v = 0;
 }

 return v;
}

/*
 * MSR write counterpart: redirected into injectm during injection,
 * otherwise a real wrmsr.
 */
static void mce_wrmsrl(u32 msr, u64 v)
{
 if (__this_cpu_read(injectm.finished)) {
  int offset = msr_to_offset(msr);

  if (offset >= 0)
   *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
  return;
 }
 wrmsrl(msr, v);
}
410
411
412
413
414
415
/*
 * Collect the basic information for an MCE record: invariant CPU context
 * (mce_setup), MCG_STATUS, and - when called from an exception with @regs -
 * the instruction pointer and code segment of the interrupted context.
 */
static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
{
 mce_setup(m);

 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 if (regs) {
  /*
   * Only record IP/CS if the hardware says they are meaningful
   * (RIPV: restart IP valid, EIPV: error IP valid).
   */
  if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
   m->ip = regs->ip;
   m->cs = regs->cs;

   /*
    * In vm86 mode CS does not carry the CPL; force CPL 3
    * so the record is attributed to user mode.
    */
   if (v8086_mode(regs))
    m->cs |= 3;
  }
  /* Prefer the dedicated RIP MSR when the CPU provides one. */
  if (mca_cfg.rip_msr)
   m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 }
}
443
444int mce_available(struct cpuinfo_x86 *c)
445{
446 if (mca_cfg.disabled)
447 return 0;
448 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
449}
450
451static void mce_schedule_work(void)
452{
453 if (!mce_gen_pool_empty())
454 schedule_work(&mce_work);
455}
456
457static void mce_irq_work_cb(struct irq_work *entry)
458{
459 mce_schedule_work();
460}
461
462
463
464
465
466
467
468int mce_usable_address(struct mce *m)
469{
470 if (!(m->status & MCI_STATUS_ADDRV))
471 return 0;
472
473
474 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
475 boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
476 return 1;
477
478 if (!(m->status & MCI_STATUS_MISCV))
479 return 0;
480
481 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
482 return 0;
483
484 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
485 return 0;
486
487 return 1;
488}
489EXPORT_SYMBOL_GPL(mce_usable_address);
490
/*
 * Classify an MCE record as a memory error.  AMD/Hygon delegate to the
 * vendor helper; Intel/Zhaoxin match MCACOD bit patterns in the status
 * register; anything else is treated as non-memory.
 */
bool mce_is_memory_error(struct mce *m)
{
 switch (m->cpuvendor) {
 case X86_VENDOR_AMD:
 case X86_VENDOR_HYGON:
  return amd_mce_is_memory_error(m);

 case X86_VENDOR_INTEL:
 case X86_VENDOR_ZHAOXIN:
  /*
   * The masks below select the MCACOD field of MCi_STATUS and
   * match the memory-related compound error codes.  NOTE(review):
   * presumably per Intel SDM Vol 3B compound error code encodings
   * (memory controller, cache hierarchy, generic memory) - confirm
   * against the SDM tables.
   */




  return (m->status & 0xef80) == BIT(7) ||
  (m->status & 0xef00) == BIT(8) ||
  (m->status & 0xeffc) == 0xc;

 default:
  return false;
 }
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
522
523static bool whole_page(struct mce *m)
524{
525 if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
526 return true;
527
528 return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
529}
530
531bool mce_is_correctable(struct mce *m)
532{
533 if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
534 return false;
535
536 if (m->cpuvendor == X86_VENDOR_HYGON && m->status & MCI_STATUS_DEFERRED)
537 return false;
538
539 if (m->status & MCI_STATUS_UC)
540 return false;
541
542 return true;
543}
544EXPORT_SYMBOL_GPL(mce_is_correctable);
545
/*
 * Earliest notifier on the decode chain: emits the mce_record tracepoint
 * and arranges for userspace notification via mce_notify_irq().
 */
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
 void *data)
{
 struct mce *m = (struct mce *)data;

 if (!m)
  return NOTIFY_DONE;

 /* Tracepoint first, so consumers see every record. */
 trace_mce_record(m);

 set_bit(0, &mce_need_notify);

 mce_notify_irq();

 return NOTIFY_DONE;
}

static struct notifier_block early_nb = {
 .notifier_call = mce_early_notifier,
 .priority = MCE_PRIO_EARLY,
};

/*
 * Uncorrected-error notifier: for action-optional / deferred errors with
 * a usable address, offline the affected page and mark the record handled.
 */
static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
 void *data)
{
 struct mce *mce = (struct mce *)data;
 unsigned long pfn;

 if (!mce || !mce_usable_address(mce))
  return NOTIFY_DONE;

 if (mce->severity != MCE_AO_SEVERITY &&
 mce->severity != MCE_DEFERRED_SEVERITY)
  return NOTIFY_DONE;

 pfn = mce->addr >> PAGE_SHIFT;
 if (!memory_failure(pfn, 0)) {
  /* Unmap worked: also remove the page from the kernel map. */
  set_mce_nospec(pfn, whole_page(mce));
  mce->kflags |= MCE_HANDLED_UC;
 }

 return NOTIFY_OK;
}

static struct notifier_block mce_uc_nb = {
 .notifier_call = uc_decode_notifier,
 .priority = MCE_PRIO_UC,
};

/*
 * Last-resort notifier: print any record no other subscriber handled
 * (kflags still clear), or everything when print_all is set.
 */
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 void *data)
{
 struct mce *m = (struct mce *)data;

 if (!m)
  return NOTIFY_DONE;

 if (mca_cfg.print_all || !m->kflags)
  __print_mce(m);

 return NOTIFY_DONE;
}

static struct notifier_block mce_default_nb = {
 .notifier_call = mce_default_notifier,
 /* Lowest priority: runs after all real decoders. */
 .priority = MCE_PRIO_LOWEST,
};
615
616
617
618
/*
 * Read the auxiliary registers (MISC, ADDR, and on SMCA: IPID, SYND) for
 * bank @i into @m, normalizing the address to a usable form.
 */
static void mce_read_aux(struct mce *m, int i)
{
 if (m->status & MCI_STATUS_MISCV)
  m->misc = mce_rdmsrl(msr_ops.misc(i));

 if (m->status & MCI_STATUS_ADDRV) {
  m->addr = mce_rdmsrl(msr_ops.addr(i));

  /*
   * With SER the MISC register reports how many low address
   * bits are invalid; mask them off.
   */
  if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
   u8 shift = MCI_MISC_ADDR_LSB(m->misc);
   m->addr >>= shift;
   m->addr <<= shift;
  }

  /*
   * On SMCA, bits 61:56 of ADDR encode the valid LSB; keep
   * only the valid address bits 55:lsb.
   */
  if (mce_flags.smca) {
   u8 lsb = (m->addr >> 56) & 0x3f;

   m->addr &= GENMASK_ULL(55, lsb);
  }
 }

 if (mce_flags.smca) {
  m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));

  if (m->status & MCI_STATUS_SYNDV)
   m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 }
}
654
/* Number of times the poll path ran on this CPU. */
DEFINE_PER_CPU(unsigned, mce_poll_count);

/*
 * Poll the MCA banks in @b for silently-logged (non-exception) errors,
 * typically corrected errors.  @flags controls whether uncorrected errors
 * are also accepted (MCP_UC), whether found errors are logged (MCP_DONTLOG
 * suppresses logging), and whether a timestamp is recorded (MCP_TIMESTAMP).
 *
 * Returns true if at least one valid error was seen.  Banks with errors
 * are cleared on the way out.  Must run with the polled banks' interrupts
 * effectively serialized against the #MC handler by the hardware's
 * per-bank status semantics; the function itself takes no locks.
 */
bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 bool error_seen = false;
 struct mce m;
 int i;

 this_cpu_inc(mce_poll_count);

 mce_gather_info(&m, NULL);

 if (flags & MCP_TIMESTAMP)
  m.tsc = rdtsc();

 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
  /* Skip disabled banks and banks not in the requested set. */
  if (!mce_banks[i].ctl || !test_bit(i, *b))
   continue;

  m.misc = 0;
  m.addr = 0;
  m.bank = i;

  barrier();
  m.status = mce_rdmsrl(msr_ops.status(i));

  /* No valid error in this bank. */
  if (!(m.status & MCI_STATUS_VAL))
   continue;

  /*
   * Corrected errors are always logged; uncorrected ones only
   * when the caller asked for them (MCP_UC).
   */
  if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
   goto log_it;

  /*
   * Without SER there is no signaled/unsignaled distinction:
   * leave uncorrected errors for the exception handler.
   */
  if (!mca_cfg.ser) {
   if (m.status & MCI_STATUS_UC)
    continue;
   goto log_it;
  }

  /* SER: log errors with logging disabled (EN clear). */
  if (!(m.status & MCI_STATUS_EN))
   goto log_it;

  /*
   * SER: UCNA errors (neither PCC nor S set) are not signaled
   * via #MC, so the poller must pick them up.
   */
  if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
   goto log_it;

  /*
   * Anything else (signaled or processor-context-corrupt) is
   * the #MC handler's business - don't touch it here.
   */
  continue;

log_it:
  error_seen = true;

  if (flags & MCP_DONTLOG)
   goto clear_it;

  mce_read_aux(&m, i);
  m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);

  /*
   * With dont_log_ce, still log corrected errors that carry a
   * usable address so page offlining can act on them.
   */
  if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
   goto clear_it;

  mce_log(&m);

clear_it:
  /* Re-arm the bank for the next error. */
  mce_wrmsrl(msr_ops.status(i), 0);
 }

 /*
  * Serialize after the status writes.  NOTE(review): presumably
  * required so subsequent errors are reported against clean banks -
  * confirm against the architectural MCA requirements.
  */
 sync_core();

 return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
773
774
775
776
777
/*
 * Quick scan of all banks to decide whether this machine check is
 * immediately fatal ("no way out"): returns 1 and fills *msg if any bank
 * reaches MCE_PANIC_SEVERITY.  Also records which banks have valid errors
 * in @validp and applies the per-model quirk when installed.
 */
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 struct pt_regs *regs)
{
 char *tmp = *msg;
 int i;

 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
  m->status = mce_rdmsrl(msr_ops.status(i));
  if (!(m->status & MCI_STATUS_VAL))
   continue;

  __set_bit(i, validp);
  if (quirk_no_way_out)
   quirk_no_way_out(i, m, regs);

  m->bank = i;
  if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
   mce_read_aux(m, i);
   *msg = tmp;
   return 1;
  }
 }
 return 0;
}
802
803
804
805
806
/*
 * Rendezvous state for broadcast machine checks: which CPU is currently
 * executing its scan (Monarch/Subject ordering) ...
 */
static atomic_t mce_executing;

/*
 * ... and how many CPUs have entered the handler so far.
 */
static atomic_t mce_callin;

/*
 * Decrement the rendezvous budget *t by one SPINUNIT.  Returns 1 (after
 * possibly panicking, depending on tolerant) when the budget is exhausted,
 * 0 while waiting may continue.
 */
static int mce_timed_out(u64 *t, const char *msg)
{
 /*
  * Make sure we see a concurrent mce_panicked update before
  * deciding to keep spinning.
  */
 rmb();
 if (atomic_read(&mce_panicked))
  wait_for_panic();
 if (!mca_cfg.monarch_timeout)
  goto out;
 if ((s64)*t < SPINUNIT) {
  /* Budget exhausted: panic unless tolerant says otherwise. */
  if (mca_cfg.tolerant <= 1)
   mce_panic(msg, NULL, NULL);
  cpu_missing = 1;
  return 1;
 }
 *t -= SPINUNIT;
out:
 touch_nmi_watchdog();
 return 0;
}
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
/*
 * Executed by the Monarch CPU after all CPUs finished their scans in a
 * broadcast machine check.  Finds the globally worst error across all
 * CPUs' mces_seen records and panics when it is fatal (or when nothing
 * identifiable was found at all), then resets the per-CPU records.
 */
static void mce_reign(void)
{
 int cpu;
 struct mce *m = NULL;
 int global_worst = 0;
 char *msg = NULL;
 char *nmsg = NULL;

 /*
  * Find the worst severity recorded on any CPU during this event.
  * Offline CPUs' records are zeroed, so iterating all possible CPUs
  * is safe.
  */
 for_each_possible_cpu(cpu) {
  int severity = mce_severity(&per_cpu(mces_seen, cpu),
  mca_cfg.tolerant,
  &nmsg, true);
  if (severity > global_worst) {
   msg = nmsg;
   global_worst = severity;
   m = &per_cpu(mces_seen, cpu);
  }
 }

 /*
  * A fatal error on any CPU takes the whole machine down
  * (unless tolerant == 3 forces us onward).
  */
 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
  mce_panic("Fatal machine check", m, msg);

 /*
  * A broadcast #MC where no CPU found anything above KEEP severity
  * means the source could not be identified - that is not safe to
  * continue from either.
  */
 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
  mce_panic("Fatal machine check from unknown source", NULL, NULL);

 /*
  * Clear the records for the next event.
  */
 for_each_possible_cpu(cpu)
  memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
}
918
/* Aggregated no-way-out votes across all CPUs in the current event. */
static atomic_t global_nwo;

/*
 * Entry synchronization for broadcast machine checks.  All CPUs spin until
 * everyone has called in, then execute their bank scans one at a time in
 * call-in order (CPU with order 1 is the Monarch).
 *
 * Returns this CPU's order (>= 1), or -1 when synchronization is disabled
 * (monarch_timeout == 0) or timed out.  *no_way_out is updated to the
 * global consensus on return.
 */
static int mce_start(int *no_way_out)
{
 int order;
 int cpus = num_online_cpus();
 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

 if (!timeout)
  return -1;

 atomic_add(*no_way_out, &global_nwo);

 /*
  * Rely on the implied barrier below, such that global_nwo
  * is updated before mce_callin.
  */
 order = atomic_inc_return(&mce_callin);

 /*
  * Wait for everyone.
  */
 while (atomic_read(&mce_callin) != cpus) {
  if (mce_timed_out(&timeout,
  "Timeout: Not all CPUs entered broadcast exception handler")) {
   atomic_set(&global_nwo, 0);
   return -1;
  }
  ndelay(SPINUNIT);
 }

 /*
  * mce_callin should be read before global_nwo.
  */
 smp_rmb();

 if (order == 1) {
  /*
   * Monarch: starts executing now.
   */
  atomic_set(&mce_executing, 1);
 } else {
  /*
   * Subject: wait until it is our turn (mce_executing reaches
   * our order).  The Monarch will then hand over one by one.
   */
  while (atomic_read(&mce_executing) < order) {
   if (mce_timed_out(&timeout,
   "Timeout: Subject CPUs unable to finish machine check processing")) {
    atomic_set(&global_nwo, 0);
    return -1;
   }
   ndelay(SPINUNIT);
  }
 }

 /*
  * Cache the global no_way_out consensus.
  */
 *no_way_out = atomic_read(&global_nwo);

 return order;
}
990
991
992
993
994
/*
 * Exit synchronization for a broadcast machine check.  The Monarch waits
 * for all Subjects, runs mce_reign() to act on the global result, and
 * resets the rendezvous state; Subjects just signal completion and wait
 * for the Monarch's reset.  Returns 0 on success, -1 on timeout or when
 * synchronization was not active.
 */
static int mce_end(int order)
{
 int ret = -1;
 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;

 if (!timeout)
  goto reset;
 if (order < 0)
  goto reset;

 /*
  * Allow others to run.
  */
 atomic_inc(&mce_executing);

 if (order == 1) {
  /* Monarch: wait for everyone else to finish. */
  int cpus = num_online_cpus();

  /*
   * mce_executing <= cpus means some Subject has not yet
   * incremented past its scan.
   */
  while (atomic_read(&mce_executing) <= cpus) {
   if (mce_timed_out(&timeout,
   "Timeout: Monarch CPU unable to finish machine check processing"))
    goto reset;
   ndelay(SPINUNIT);
  }

  mce_reign();
  barrier();
  ret = 0;
 } else {
  /*
   * Subject: wait for the Monarch to reset mce_executing.
   */
  while (atomic_read(&mce_executing) != 0) {
   if (mce_timed_out(&timeout,
   "Timeout: Monarch CPU did not finish machine check processing"))
    goto reset;
   ndelay(SPINUNIT);
  }

  /*
   * Subjects do not reset the shared state - the Monarch does.
   */
  return 0;
 }

 /*
  * Monarch (or error path): reset the rendezvous for the next event.
  */
reset:
 atomic_set(&global_nwo, 0);
 atomic_set(&mce_callin, 0);
 barrier();

 /*
  * Let others run again.
  */
 atomic_set(&mce_executing, 0);
 return ret;
}
1059
1060static void mce_clear_state(unsigned long *toclear)
1061{
1062 int i;
1063
1064 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
1065 if (test_bit(i, toclear))
1066 mce_wrmsrl(msr_ops.status(i), 0);
1067 }
1068}
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
/*
 * Called on #MC entry: if this CPU is offline or another CPU is crashing,
 * try to swallow the machine check quietly.  Returns true when the event
 * was discarded (MCG_STATUS cleared) and the handler should bail out.
 * noinstr: runs before any instrumentation is allowed.
 */
static noinstr bool mce_check_crashing_cpu(void)
{
 unsigned int cpu = smp_processor_id();

 if (arch_cpu_is_offline(cpu) ||
 (crashing_cpu != -1 && crashing_cpu != cpu)) {
  u64 mcgstatus;

  mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);

  /* A local MCE on Zhaoxin must still be handled normally. */
  if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
   if (mcgstatus & MCG_STATUS_LMCES)
    return false;
  }

  /* Only safe to discard when the restart IP is valid. */
  if (mcgstatus & MCG_STATUS_RIPV) {
   __wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
   return true;
  }
 }
 return false;
}
1104
/*
 * Core bank scan of the #MC handler.  For each enabled, valid bank:
 * grade the error, log it, remember the worst one in *final / *worst,
 * and mark banks to be cleared in @toclear.  @valid_banks restricts the
 * scan to banks mce_no_way_out() found valid.
 */
static void __mc_scan_banks(struct mce *m, struct mce *final,
 unsigned long *toclear, unsigned long *valid_banks,
 int no_way_out, int *worst)
{
 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 struct mca_config *cfg = &mca_cfg;
 int severity, i;

 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
  __clear_bit(i, toclear);
  if (!test_bit(i, valid_banks))
   continue;

  if (!mce_banks[i].ctl)
   continue;

  m->misc = 0;
  m->addr = 0;
  m->bank = i;

  m->status = mce_rdmsrl(msr_ops.status(i));
  if (!(m->status & MCI_STATUS_VAL))
   continue;

  /*
   * Non-signaled (SER) / corrected errors are the poller's
   * job - unless we are going down anyway (no_way_out).
   */
  if (!(m->status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
  !no_way_out)
   continue;

  /* A valid signaled error taints the kernel. */
  add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

  severity = mce_severity(m, cfg->tolerant, NULL, true);

  /*
   * KEEP/UCNA errors are left in the bank for the poller
   * (again, unless we are going down).
   */
  if ((severity == MCE_KEEP_SEVERITY ||
  severity == MCE_UCNA_SEVERITY) && !no_way_out)
   continue;

  __set_bit(i, toclear);

  /* Fully ignorable error: clear it but log nothing. */
  if (severity == MCE_NO_SEVERITY)
   continue;

  mce_read_aux(m, i);

  /* Record the per-bank severity in the logged record. */
  m->severity = severity;

  mce_log(m);

  if (severity > *worst) {
   *final = *m;
   *worst = severity;
  }
 }

 /* Hand the worst record back to the caller via *m. */
 *m = *final;
}
1172
/* task_work callback: unconditionally kill the current task with SIGBUS. */
static void kill_me_now(struct callback_head *ch)
{
 force_sig(SIGBUS);
}

/*
 * task_work callback for recoverable user-mode errors: try to offline the
 * poisoned page via memory_failure(); kill the task only when recovery
 * fails.  Without a valid restart IP (mce_ripv clear) the task must die
 * regardless (MF_MUST_KILL).
 */
static void kill_me_maybe(struct callback_head *cb)
{
 struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
 int flags = MF_ACTION_REQUIRED;

 pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);

 if (!p->mce_ripv)
  flags |= MF_MUST_KILL;

 if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
  /* Page offlined successfully - also unmap it from the kernel. */
  set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
  return;
 }

 pr_err("Memory error not recovered");
 kill_me_now(cb);
}
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215void noinstr do_machine_check(struct pt_regs *regs)
1216{
1217 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1218 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1219 struct mca_config *cfg = &mca_cfg;
1220 struct mce m, *final;
1221 char *msg = NULL;
1222 int worst = 0;
1223
1224
1225
1226
1227
1228 int order = -1;
1229
1230
1231
1232
1233
1234 int no_way_out = 0;
1235
1236
1237
1238
1239
1240 int kill_it = 0;
1241
1242
1243
1244
1245
1246 int lmce = 1;
1247
1248 this_cpu_inc(mce_exception_count);
1249
1250 mce_gather_info(&m, regs);
1251 m.tsc = rdtsc();
1252
1253 final = this_cpu_ptr(&mces_seen);
1254 *final = m;
1255
1256 memset(valid_banks, 0, sizeof(valid_banks));
1257 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1258
1259 barrier();
1260
1261
1262
1263
1264
1265
1266 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1267 kill_it = 1;
1268
1269
1270
1271
1272
1273 if (m.cpuvendor == X86_VENDOR_INTEL ||
1274 m.cpuvendor == X86_VENDOR_ZHAOXIN)
1275 lmce = m.mcgstatus & MCG_STATUS_LMCES;
1276
1277
1278
1279
1280
1281
1282
1283
1284 if (lmce) {
1285 if (no_way_out)
1286 mce_panic("Fatal local machine check", &m, msg);
1287 } else {
1288 order = mce_start(&no_way_out);
1289 }
1290
1291 __mc_scan_banks(&m, final, toclear, valid_banks, no_way_out, &worst);
1292
1293 if (!no_way_out)
1294 mce_clear_state(toclear);
1295
1296
1297
1298
1299
1300 if (!lmce) {
1301 if (mce_end(order) < 0)
1302 no_way_out = worst >= MCE_PANIC_SEVERITY;
1303 } else {
1304
1305
1306
1307
1308
1309
1310
1311
1312 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
1313 mce_severity(&m, cfg->tolerant, &msg, true);
1314 mce_panic("Local fatal machine check!", &m, msg);
1315 }
1316 }
1317
1318
1319
1320
1321
1322 if (cfg->tolerant == 3)
1323 kill_it = 0;
1324 else if (no_way_out)
1325 mce_panic("Fatal machine check on current CPU", &m, msg);
1326
1327 if (worst > 0)
1328 irq_work_queue(&mce_irq_work);
1329
1330 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1331
1332 sync_core();
1333
1334 if (worst != MCE_AR_SEVERITY && !kill_it)
1335 return;
1336
1337
1338 if ((m.cs & 3) == 3) {
1339
1340 BUG_ON(!on_thread_stack() || !user_mode(regs));
1341
1342 current->mce_addr = m.addr;
1343 current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
1344 current->mce_whole_page = whole_page(&m);
1345 current->mce_kill_me.func = kill_me_maybe;
1346 if (kill_it)
1347 current->mce_kill_me.func = kill_me_now;
1348 task_work_add(current, ¤t->mce_kill_me, true);
1349 } else {
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359 if (m.kflags & MCE_IN_KERNEL_RECOV) {
1360 if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
1361 mce_panic("Failed kernel mode recovery", &m, msg);
1362 }
1363 }
1364}
1365EXPORT_SYMBOL_GPL(do_machine_check);
1366
#ifndef CONFIG_MEMORY_FAILURE
/*
 * Stub when the kernel is built without memory-failure handling: report
 * the page and pretend success so callers proceed (action-required
 * failures must never reach this stub - hence the BUG_ON).
 */
int memory_failure(unsigned long pfn, int flags)
{
 /* mce_severity() should not hand us AR errors without recovery support. */
 BUG_ON(flags & MF_ACTION_REQUIRED);
 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
 pfn);

 return 0;
}
#endif
1379
1380
1381
1382
1383
1384
/*
 * Periodic polling: each CPU runs a timer that calls machine_check_poll()
 * for corrected errors.  The interval adapts between runs.
 */
static unsigned long check_interval = INITIAL_CHECK_INTERVAL;

/* Next polling interval (jiffies) and the timer itself, per CPU. */
static DEFINE_PER_CPU(unsigned long, mce_next_interval);
static DEFINE_PER_CPU(struct timer_list, mce_timer);

/* Default interval adjustment: identity (Intel CMCI installs its own). */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
 return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;

/*
 * (Re)arm timer @t to fire after @interval jiffies, but never push back an
 * already-earlier expiry.  IRQs disabled around the timer update.
 */
static void __start_timer(struct timer_list *t, unsigned long interval)
{
 unsigned long when = jiffies + interval;
 unsigned long flags;

 local_irq_save(flags);

 if (!timer_pending(t) || time_before(when, t->expires))
  mod_timer(t, round_jiffies(when));

 local_irq_restore(flags);
}
1409
/*
 * Per-CPU polling timer: poll the banks, then adapt the interval - halve
 * it when events were found, double it (up to check_interval) otherwise.
 * CMCI-capable systems may adjust the interval via mce_adjust_timer.
 */
static void mce_timer_fn(struct timer_list *t)
{
 struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
 unsigned long iv;

 /* The timer must be this CPU's own. */
 WARN_ON(cpu_t != t);

 iv = __this_cpu_read(mce_next_interval);

 if (mce_available(this_cpu_ptr(&cpu_info))) {
  machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));

  if (mce_intel_cmci_poll()) {
   iv = mce_adjust_timer(iv);
   goto done;
  }
 }

 /*
  * Adapt: poll faster while events are being logged, back off
  * (exponentially, capped) when things are quiet.
  */
 if (mce_notify_irq())
  iv = max(iv / 2, (unsigned long) HZ/100);
 else
  iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));

done:
 __this_cpu_write(mce_next_interval, iv);
 __start_timer(t, iv);
}
1441
1442
1443
1444
/*
 * Make this CPU's poll timer fire sooner: arm it for @interval and, when
 * that is earlier than the stored next interval, remember the new value.
 */
void mce_timer_kick(unsigned long interval)
{
 struct timer_list *t = this_cpu_ptr(&mce_timer);
 unsigned long iv = __this_cpu_read(mce_next_interval);

 __start_timer(t, interval);

 if (interval < iv)
  __this_cpu_write(mce_next_interval, interval);
}

/* Synchronously stop the poll timer on every online CPU. */
static void mce_timer_delete_all(void)
{
 int cpu;

 for_each_online_cpu(cpu)
  del_timer_sync(&per_cpu(mce_timer, cpu));
}
1464
1465
1466
1467
1468
1469
/*
 * Notify userspace about newly logged events (if the need-notify bit is
 * set): trigger the mce work and print a ratelimited console note.
 * Returns 1 when a notification happened, 0 otherwise.
 */
int mce_notify_irq(void)
{
 /* At most two console messages per minute. */
 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);

 if (test_and_clear_bit(0, &mce_need_notify)) {
  mce_work_trigger();

  if (__ratelimit(&ratelimit))
   pr_info(HW_ERR "Machine check events logged\n");

  return 1;
 }
 return 0;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
1486
1487static void __mcheck_cpu_mce_banks_init(void)
1488{
1489 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
1490 u8 n_banks = this_cpu_read(mce_num_banks);
1491 int i;
1492
1493 for (i = 0; i < n_banks; i++) {
1494 struct mce_bank *b = &mce_banks[i];
1495
1496
1497
1498
1499
1500
1501 b->ctl = -1ULL;
1502 b->init = 1;
1503 }
1504}
1505
1506
1507
1508
/*
 * Read MCG_CAP and derive: the per-CPU bank count (clamped to
 * MAX_NR_BANKS), the default bank state, the extended-RIP MSR when
 * available, and the SER capability flag.
 */
static void __mcheck_cpu_cap_init(void)
{
 u64 cap;
 u8 b;

 rdmsrl(MSR_IA32_MCG_CAP, cap);

 b = cap & MCG_BANKCNT_MASK;

 if (b > MAX_NR_BANKS) {
  pr_warn("CPU%d: Using only %u machine check banks out of %u\n",
  smp_processor_id(), MAX_NR_BANKS, b);
  b = MAX_NR_BANKS;
 }

 this_cpu_write(mce_num_banks, b);

 __mcheck_cpu_mce_banks_init();

 /* Use accurate RIP reporting when the extended registers exist. */
 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
  mca_cfg.rip_msr = MSR_IA32_MCG_EIP;

 if (cap & MCG_SER_P)
  mca_cfg.ser = 1;
}
1535
/*
 * Generic per-CPU MCA bring-up: flush any errors left over from before
 * boot (optionally without logging them, per bootlog), enable machine
 * checks in CR4, and enable all MCG features via MCG_CTL when present.
 */
static void __mcheck_cpu_init_generic(void)
{
 enum mcp_flags m_fl = 0;
 mce_banks_t all_banks;
 u64 cap;

 if (!mca_cfg.bootlog)
  m_fl = MCP_DONTLOG;

 /*
  * Log the machine checks left over from the previous reset.
  */
 bitmap_fill(all_banks, MAX_NR_BANKS);
 machine_check_poll(MCP_UC | m_fl, &all_banks);

 cr4_set_bits(X86_CR4_MCE);

 rdmsrl(MSR_IA32_MCG_CAP, cap);
 if (cap & MCG_CTL_P)
  wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
}

/*
 * Program the per-bank CTL values and clear stale status in every bank
 * marked for initialization.
 */
static void __mcheck_cpu_init_clear_banks(void)
{
 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 int i;

 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
  struct mce_bank *b = &mce_banks[i];

  if (!b->init)
   continue;
  wrmsrl(msr_ops.ctl(i), b->ctl);
  wrmsrl(msr_ops.status(i), 0);
 }
}
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
/*
 * Verify which banks actually stuck: read back each initialized bank's
 * CTL MSR and clear the init flag for banks the hardware left at zero
 * (e.g. banks locked down by firmware).
 */
static void __mcheck_cpu_check_banks(void)
{
 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 u64 msrval;
 int i;

 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
  struct mce_bank *b = &mce_banks[i];

  if (!b->init)
   continue;

  rdmsrl(msr_ops.ctl(i), msrval);
  b->init = !!msrval;
 }
}
1599
1600
1601
1602
1603
1604
1605
1606
1607
/*
 * Sandy Bridge quirk: a bank-0 instruction-fetch error may be reported
 * without valid RIPV/EIPV bits even though regs->ip is accurate.  When the
 * status matches that exact signature, mark EIPV and fill IP/CS from regs.
 */
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
{
 if (bank != 0)
  return;
 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
  return;
 /* Match the precise IFU error signature, nothing else. */
 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
 MCACOD)) !=
 (MCI_STATUS_UC|MCI_STATUS_EN|
 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
 MCI_STATUS_AR|MCACOD_INSTR))
  return;

 m->mcgstatus |= MCG_STATUS_EIPV;
 m->ip = regs->ip;
 m->cs = regs->cs;
}
1627
1628
/*
 * Apply vendor/model-specific quirks to the MCA configuration before the
 * banks are programmed.  Returns 0, or -EOPNOTSUPP for unknown vendors.
 */
static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
 struct mca_config *cfg = &mca_cfg;

 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
  pr_info("unknown CPU type - not enabling MCE support\n");
  return -EOPNOTSUPP;
 }

 /* AMD-specific quirks. */
 if (c->x86_vendor == X86_VENDOR_AMD) {
  if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
   /*
    * K8: disable GART TLB error reporting in bank 4
    * (bit 10) - these are known to be spurious/noisy.
    */
   clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
  }
  if (c->x86 < 0x11 && cfg->bootlog < 0) {
   /*
    * Older families leave bogus bank contents across
    * reboots; don't log leftovers by default.
    */
   cfg->bootlog = 0;
  }
  /*
   * Family 6 AMD: bank 0 reporting is disabled entirely.
   */
  if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
   mce_banks[0].ctl = 0;

  /*
   * Early Fam15h parts support overflow recovery even though
   * they do not advertise the CPUID bit.
   */
  if (c->x86 == 0x15 && c->x86_model <= 0xf)
   mce_flags.overflow_recov = 1;

 }

 /* Intel-specific quirks. */
 if (c->x86_vendor == X86_VENDOR_INTEL) {
  /*
   * Bank 0 on pre-Nehalem P6 CPUs is shared/firmware-owned
   * and must not be reinitialized by the OS.
   */
  if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
   mce_banks[0].init = 0;

  /*
   * Broadcast machine checks: default the Monarch timeout
   * to one second on CPUs that synchronize.
   */
  if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
  cfg->monarch_timeout < 0)
   cfg->monarch_timeout = USEC_PER_SEC;

  /*
   * Older P6 cores leave bogus bank contents across reboots;
   * don't log leftovers by default.
   */
  if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
   cfg->bootlog = 0;

  /* Sandy Bridge (model 45): IFU error RIP quirk. */
  if (c->x86 == 6 && c->x86_model == 45)
   quirk_no_way_out = quirk_sandybridge_ifu;
 }

 /* Zhaoxin-specific quirks. */
 if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
  /*
   * These models synchronize broadcast machine checks;
   * give them the default Monarch timeout too.
   */
  if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
   if (cfg->monarch_timeout < 0)
    cfg->monarch_timeout = USEC_PER_SEC;
  }
 }

 if (cfg->monarch_timeout < 0)
  cfg->monarch_timeout = 0;
 if (cfg->bootlog != 0)
  cfg->panic_timeout = 30;

 return 0;
}
1722
1723static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1724{
1725 if (c->x86 != 5)
1726 return 0;
1727
1728 switch (c->x86_vendor) {
1729 case X86_VENDOR_INTEL:
1730 intel_p5_mcheck_init(c);
1731 return 1;
1732 break;
1733 case X86_VENDOR_CENTAUR:
1734 winchip_mcheck_init(c);
1735 return 1;
1736 break;
1737 default:
1738 return 0;
1739 }
1740
1741 return 0;
1742}
1743
1744
1745
1746
/*
 * Early per-CPU vendor setup, run before the generic MCA init: latch
 * AMD/Hygon feature flags from CPUID and, on SMCA parts, switch the
 * MSR accessor table to the SMCA register layout.
 */
static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
{
	if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) {
		mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
		mce_flags.succor = !!cpu_has(c, X86_FEATURE_SUCCOR);
		mce_flags.smca = !!cpu_has(c, X86_FEATURE_SMCA);
		mce_flags.amd_threshold = 1;

		if (mce_flags.smca) {
			/* SMCA CPUs use a different MCA MSR numbering. */
			msr_ops.ctl = smca_ctl_reg;
			msr_ops.status = smca_status_reg;
			msr_ops.addr = smca_addr_reg;
			msr_ops.misc = smca_misc_reg;
		}
	}
}
1763
1764static void mce_centaur_feature_init(struct cpuinfo_x86 *c)
1765{
1766 struct mca_config *cfg = &mca_cfg;
1767
1768
1769
1770
1771
1772 if ((c->x86 == 6 && c->x86_model == 0xf && c->x86_stepping >= 0xe) ||
1773 c->x86 > 6) {
1774 if (cfg->monarch_timeout < 0)
1775 cfg->monarch_timeout = USEC_PER_SEC;
1776 }
1777}
1778
/*
 * Zhaoxin vendor init: these CPUs reuse the Intel CMCI/LMCE machinery.
 */
static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
{
	struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);

	/*
	 * On the listed models, disable bank 8 by zeroing its control
	 * word.  NOTE(review): presumably this bank raises unwanted
	 * errors (upstream attributes it to the SVAD address decoder) -
	 * confirm before altering the family/model list.
	 */
	if ((c->x86 == 7 && c->x86_model == 0x1b) ||
	    (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
		if (this_cpu_read(mce_num_banks) > 8)
			mce_banks[8].ctl = 0;
	}

	intel_init_cmci();
	intel_init_lmce();
	/* Use the CMCI-aware polling interval adjustment. */
	mce_adjust_timer = cmci_intel_adjust_timer;
}
1802
/* Undo Zhaoxin vendor setup: only LMCE needs explicit clearing. */
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
{
	intel_clear_lmce();
}
1807
1808static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1809{
1810 switch (c->x86_vendor) {
1811 case X86_VENDOR_INTEL:
1812 mce_intel_feature_init(c);
1813 mce_adjust_timer = cmci_intel_adjust_timer;
1814 break;
1815
1816 case X86_VENDOR_AMD: {
1817 mce_amd_feature_init(c);
1818 break;
1819 }
1820
1821 case X86_VENDOR_HYGON:
1822 mce_hygon_feature_init(c);
1823 break;
1824
1825 case X86_VENDOR_CENTAUR:
1826 mce_centaur_feature_init(c);
1827 break;
1828
1829 case X86_VENDOR_ZHAOXIN:
1830 mce_zhaoxin_feature_init(c);
1831 break;
1832
1833 default:
1834 break;
1835 }
1836}
1837
1838static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1839{
1840 switch (c->x86_vendor) {
1841 case X86_VENDOR_INTEL:
1842 mce_intel_feature_clear(c);
1843 break;
1844
1845 case X86_VENDOR_ZHAOXIN:
1846 mce_zhaoxin_feature_clear(c);
1847 break;
1848
1849 default:
1850 break;
1851 }
1852}
1853
/*
 * Arm the per-CPU polling timer for its initial interval, unless CE
 * handling is disabled (ignore_ce) or the check interval is zero.
 */
static void mce_start_timer(struct timer_list *t)
{
	unsigned long iv = check_interval * HZ;

	if (mca_cfg.ignore_ce || !iv)
		return;

	this_cpu_write(mce_next_interval, iv);
	__start_timer(t, iv);
}
1864
/* Initialize (but do not arm) this CPU's MCE polling timer. */
static void __mcheck_cpu_setup_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	timer_setup(t, mce_timer_fn, TIMER_PINNED);
}
1871
/* Initialize and immediately arm this CPU's MCE polling timer. */
static void __mcheck_cpu_init_timer(void)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	timer_setup(t, mce_timer_fn, TIMER_PINNED);
	mce_start_timer(t);
}
1879
1880bool filter_mce(struct mce *m)
1881{
1882 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1883 return amd_filter_mce(m);
1884 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1885 return intel_filter_mce(m);
1886
1887 return false;
1888}
1889
1890
/*
 * Default #MC handler, installed until mcheck_cpu_init() points the
 * vector at do_machine_check().  Just reports the stray exception.
 */
static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
	instrumentation_begin();
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
	instrumentation_end();
}
1898
1899
1900void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
1901
/*
 * #MC raised while in kernel mode: runs in NMI-like context with
 * instrumentation carefully fenced off (noinstr entry path).
 */
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
	WARN_ON_ONCE(user_mode(regs));

	/*
	 * Only required when coming from kernel mode: bail out early on
	 * a CPU that is already known to be crashing, but only once the
	 * real handler is installed.  See mce_check_crashing_cpu().
	 */
	if (machine_check_vector == do_machine_check &&
	    mce_check_crashing_cpu())
		return;

	nmi_enter();
	/*
	 * The call target is an indirect call, so mark the region as
	 * instrumentable explicitly around it.
	 */
	instrumentation_begin();
	trace_hardirqs_off_finish();
	machine_check_vector(regs);
	if (regs->flags & X86_EFLAGS_IF)
		trace_hardirqs_on_prepare();
	instrumentation_end();
	nmi_exit();
}
1927
/* #MC raised while in user mode: handled via the user idtentry path. */
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
	idtentry_enter_user(regs);
	instrumentation_begin();
	machine_check_vector(regs);
	instrumentation_end();
	idtentry_exit_user(regs);
}
1936
#ifdef CONFIG_X86_64
/* MCE hit kernel mode */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{
	unsigned long dr7;

	/* Mask debug traps around the handler (save/restore via DR7). */
	dr7 = local_db_save();
	exc_machine_check_kernel(regs);
	local_db_restore(dr7);
}

/* The user mode variant. */
DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{
	unsigned long dr7;

	dr7 = local_db_save();
	exc_machine_check_user(regs);
	local_db_restore(dr7);
}
#else
/* 32bit unified entry point: dispatch on the mode #MC arrived in. */
DEFINE_IDTENTRY_RAW(exc_machine_check)
{
	unsigned long dr7;

	dr7 = local_db_save();
	if (user_mode(regs))
		exc_machine_check_user(regs);
	else
		exc_machine_check_kernel(regs);
	local_db_restore(dr7);
}
#endif
1971
1972
1973
1974
1975
/*
 * Called for each booted CPU to set up machine checks.
 * Uses this_cpu state throughout, so must run on the target CPU with
 * preemption disabled.
 */
void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	/* Family 5 CPUs are handled entirely by the ancient init path. */
	if (__mcheck_cpu_ancient_init(c))
		return;

	if (!mce_available(c))
		return;

	__mcheck_cpu_cap_init();

	/* Quirk failure (unknown vendor) disables MCE globally. */
	if (__mcheck_cpu_apply_quirks(c) < 0) {
		mca_cfg.disabled = 1;
		return;
	}

	if (mce_gen_pool_init()) {
		mca_cfg.disabled = 1;
		pr_emerg("Couldn't allocate MCE records pool!\n");
		return;
	}

	/* From here on #MC is routed to the real handler. */
	machine_check_vector = do_machine_check;

	__mcheck_cpu_init_early(c);
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(c);
	__mcheck_cpu_init_clear_banks();
	__mcheck_cpu_check_banks();
	__mcheck_cpu_setup_timer();
}
2009
2010
2011
2012
/*
 * Called for each booted CPU to clear some machine check opt-ins.
 */
void mcheck_cpu_clear(struct cpuinfo_x86 *c)
{
	if (mca_cfg.disabled)
		return;

	if (!mce_available(c))
		return;

	/*
	 * Possibly to clear general settings generic to x86:
	 * __mcheck_cpu_clear_generic(c);
	 */
	__mcheck_cpu_clear_vendor(c);

}
2028
/* IPI callback: stop polling bank *arg and disable its CMCI locally. */
static void __mce_disable_bank(void *arg)
{
	int bank = *((int *)arg);
	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
	cmci_disable_bank(bank);
}
2035
/*
 * Disable corrected-error reporting for @bank on all online CPUs
 * (firmware-bug workaround path).  Rejects out-of-range bank numbers.
 */
void mce_disable_bank(int bank)
{
	if (bank >= this_cpu_read(mce_num_banks)) {
		pr_warn(FW_BUG
			"Ignoring request to disable invalid MCA bank %d.\n",
			bank);
		return;
	}
	set_bit(bank, mce_banks_ce_disabled);
	on_each_cpu(__mce_disable_bank, &bank, 1);
}
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
/*
 * Parse the "mce=" boot option.  Recognized values (per the strcmp
 * chain below):
 *   off, no_cmci, no_lmce, dont_log_ce, print_all, ignore_ce,
 *   bootlog / nobootlog, bios_cmci_threshold, recovery,
 *   or numeric "<tolerant>[,<monarch_timeout>]".
 * A bare "mce" (empty value) enables the P5 machine check handler.
 */
static int __init mcheck_enable(char *str)
{
	struct mca_config *cfg = &mca_cfg;

	if (*str == 0) {
		enable_p5_mce();
		return 1;
	}
	if (*str == '=')
		str++;
	if (!strcmp(str, "off"))
		cfg->disabled = 1;
	else if (!strcmp(str, "no_cmci"))
		cfg->cmci_disabled = true;
	else if (!strcmp(str, "no_lmce"))
		cfg->lmce_disabled = 1;
	else if (!strcmp(str, "dont_log_ce"))
		cfg->dont_log_ce = true;
	else if (!strcmp(str, "print_all"))
		cfg->print_all = true;
	else if (!strcmp(str, "ignore_ce"))
		cfg->ignore_ce = true;
	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
		cfg->bootlog = (str[0] == 'b');
	else if (!strcmp(str, "bios_cmci_threshold"))
		cfg->bios_cmci_threshold = 1;
	else if (!strcmp(str, "recovery"))
		cfg->recovery = 1;
	else if (isdigit(str[0])) {
		if (get_option(&str, &cfg->tolerant) == 2)
			get_option(&str, &(cfg->monarch_timeout));
	} else {
		pr_info("mce argument %s ignored. Please use /sys\n", str);
		return 0;
	}
	return 1;
}
__setup("mce", mcheck_enable);
2102
/*
 * One-time MCE subsystem init: register the decoder-chain notifiers,
 * the vendor severity table, and the deferred-work plumbing.
 */
int __init mcheck_init(void)
{
	mcheck_intel_therm_init();
	mce_register_decode_chain(&early_nb);
	mce_register_decode_chain(&mce_uc_nb);
	mce_register_decode_chain(&mce_default_nb);
	mcheck_vendor_init_severity();

	INIT_WORK(&mce_work, mce_gen_pool_process);
	init_irq_work(&mce_irq_work, mce_irq_work_cb);

	return 0;
}
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125static void mce_disable_error_reporting(void)
2126{
2127 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2128 int i;
2129
2130 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2131 struct mce_bank *b = &mce_banks[i];
2132
2133 if (b->init)
2134 wrmsrl(msr_ops.ctl(i), 0);
2135 }
2136 return;
2137}
2138
static void vendor_disable_error_reporting(void)
{
	/*
	 * Don't clear on Intel, AMD, Hygon or Zhaoxin CPUs.  On these
	 * vendors some MCA control MSRs are socket-wide, so clearing
	 * them for a single offlined CPU would also inhibit reporting
	 * for resources shared across the socket (LLC, IMC, ...).
	 * NOTE(review): rationale carried over from the original
	 * comment block - confirm per-vendor before extending the list.
	 */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
	    boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN)
		return;

	mce_disable_error_reporting();
}
2156
/* syscore suspend hook: quiesce error reporting where vendor-safe. */
static int mce_syscore_suspend(void)
{
	vendor_disable_error_reporting();
	return 0;
}
2162
/* syscore shutdown hook: same quiescing as suspend. */
static void mce_syscore_shutdown(void)
{
	vendor_disable_error_reporting();
}
2167
2168
2169
2170
2171
2172
/*
 * On resume, re-run generic/vendor init and clear the banks so no
 * stale (e.g. firmware-left) state survives.  Only the boot CPU is
 * active at this point; other CPUs are re-added via hotplug.
 */
static void mce_syscore_resume(void)
{
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
	__mcheck_cpu_init_clear_banks();
}
2179
/* Suspend/resume/shutdown hooks so MCA state survives PM transitions. */
static struct syscore_ops mce_syscore_ops = {
	.suspend = mce_syscore_suspend,
	.shutdown = mce_syscore_shutdown,
	.resume = mce_syscore_resume,
};
2185
2186
2187
2188
2189
/* Per-CPU callback for mce_restart(): reprogram banks and the timer. */
static void mce_cpu_restart(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	__mcheck_cpu_init_generic();
	__mcheck_cpu_init_clear_banks();
	__mcheck_cpu_init_timer();
}
2198
2199
/* Reinitialize machine checks on all CPUs after a tunable change. */
static void mce_restart(void)
{
	mce_timer_delete_all();
	on_each_cpu(mce_cpu_restart, NULL, 1);
}
2205
2206
/* IPI callback: turn off CMCI on this CPU. */
static void mce_disable_cmci(void *data)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_clear();
}
2213
/*
 * IPI callback: re-enable CMCI on this CPU; when @all is non-NULL,
 * also restart the polling timer (used when ignore_ce is cleared).
 */
static void mce_enable_ce(void *all)
{
	if (!mce_available(raw_cpu_ptr(&cpu_info)))
		return;
	cmci_reenable();
	cmci_recheck();
	if (all)
		__mcheck_cpu_init_timer();
}
2223
/* sysfs bus: /sys/devices/system/machinecheck */
static struct bus_type mce_subsys = {
	.name = "machinecheck",
	.dev_name = "machinecheck",
};

/* Per-CPU sysfs device; allocated/registered in mce_device_create(). */
DEFINE_PER_CPU(struct device *, mce_device);
2230
/* Map a bankN device_attribute back to its enclosing mce_bank_dev. */
static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr)
{
	return container_of(attr, struct mce_bank_dev, attr);
}
2235
2236static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2237 char *buf)
2238{
2239 u8 bank = attr_to_bank(attr)->bank;
2240 struct mce_bank *b;
2241
2242 if (bank >= per_cpu(mce_num_banks, s->id))
2243 return -EINVAL;
2244
2245 b = &per_cpu(mce_banks_array, s->id)[bank];
2246
2247 if (!b->init)
2248 return -ENODEV;
2249
2250 return sprintf(buf, "%llx\n", b->ctl);
2251}
2252
2253static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2254 const char *buf, size_t size)
2255{
2256 u8 bank = attr_to_bank(attr)->bank;
2257 struct mce_bank *b;
2258 u64 new;
2259
2260 if (kstrtou64(buf, 0, &new) < 0)
2261 return -EINVAL;
2262
2263 if (bank >= per_cpu(mce_num_banks, s->id))
2264 return -EINVAL;
2265
2266 b = &per_cpu(mce_banks_array, s->id)[bank];
2267
2268 if (!b->init)
2269 return -ENODEV;
2270
2271 b->ctl = new;
2272 mce_restart();
2273
2274 return size;
2275}
2276
/*
 * sysfs store for "ignore_ce": when set, stop all corrected-error
 * handling (polling timers and CMCI); when cleared, re-enable both.
 * Serialized against other stores by mce_sysfs_mutex.
 */
static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0<->1 transition. */
	mutex_lock(&mce_sysfs_mutex);
	if (mca_cfg.ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.ignore_ce = true;
		} else {
			/* enable ce features */
			mca_cfg.ignore_ce = false;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	mutex_unlock(&mce_sysfs_mutex);

	return size;
}
2303
/*
 * sysfs store for "cmci_disabled": toggle CMCI on all CPUs (polling
 * timers are left alone).  Serialized by mce_sysfs_mutex.
 */
static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (kstrtou64(buf, 0, &new) < 0)
		return -EINVAL;

	/* Only act on an actual 0<->1 transition. */
	mutex_lock(&mce_sysfs_mutex);
	if (mca_cfg.cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mca_cfg.cmci_disabled = true;
		} else {
			/* enable cmci */
			mca_cfg.cmci_disabled = false;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	mutex_unlock(&mce_sysfs_mutex);

	return size;
}
2329
/*
 * Store a new check_interval and, only if it actually changed, restart
 * the MCE timers on all CPUs so they pick up the new interval.
 */
static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	unsigned long old_check_interval = check_interval;
	ssize_t ret = device_store_ulong(s, attr, buf, size);

	if (check_interval == old_check_interval)
		return ret;

	mutex_lock(&mce_sysfs_mutex);
	mce_restart();
	mutex_unlock(&mce_sysfs_mutex);

	return ret;
}
2346
/* Global tunables exposed under each machinecheckN sysfs device. */
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static DEVICE_BOOL_ATTR(print_all, 0644, mca_cfg.print_all);

/* check_interval needs a custom store to restart timers on change. */
static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
	&mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
	&mca_cfg.cmci_disabled
};

/* NULL-terminated list of common per-CPU device attributes. */
static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
#ifdef CONFIG_X86_MCELOG_LEGACY
	&dev_attr_trigger,
#endif
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_print_all.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};
2380
/* Tracks which CPUs currently have a registered mce sysfs device. */
static cpumask_var_t mce_device_initialized;

/* device core release callback: frees the kzalloc'ed device. */
static void mce_device_release(struct device *dev)
{
	kfree(dev);
}
2387
2388
2389static int mce_device_create(unsigned int cpu)
2390{
2391 struct device *dev;
2392 int err;
2393 int i, j;
2394
2395 if (!mce_available(&boot_cpu_data))
2396 return -EIO;
2397
2398 dev = per_cpu(mce_device, cpu);
2399 if (dev)
2400 return 0;
2401
2402 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2403 if (!dev)
2404 return -ENOMEM;
2405 dev->id = cpu;
2406 dev->bus = &mce_subsys;
2407 dev->release = &mce_device_release;
2408
2409 err = device_register(dev);
2410 if (err) {
2411 put_device(dev);
2412 return err;
2413 }
2414
2415 for (i = 0; mce_device_attrs[i]; i++) {
2416 err = device_create_file(dev, mce_device_attrs[i]);
2417 if (err)
2418 goto error;
2419 }
2420 for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) {
2421 err = device_create_file(dev, &mce_bank_devs[j].attr);
2422 if (err)
2423 goto error2;
2424 }
2425 cpumask_set_cpu(cpu, mce_device_initialized);
2426 per_cpu(mce_device, cpu) = dev;
2427
2428 return 0;
2429error2:
2430 while (--j >= 0)
2431 device_remove_file(dev, &mce_bank_devs[j].attr);
2432error:
2433 while (--i >= 0)
2434 device_remove_file(dev, mce_device_attrs[i]);
2435
2436 device_unregister(dev);
2437
2438 return err;
2439}
2440
/* Tear down the per-CPU sysfs device created by mce_device_create(). */
static void mce_device_remove(unsigned int cpu)
{
	struct device *dev = per_cpu(mce_device, cpu);
	int i;

	/* Nothing to do if the device was never fully created. */
	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < per_cpu(mce_num_banks, cpu); i++)
		device_remove_file(dev, &mce_bank_devs[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	per_cpu(mce_device, cpu) = NULL;
}
2459
2460
2461static void mce_disable_cpu(void)
2462{
2463 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2464 return;
2465
2466 if (!cpuhp_tasks_frozen)
2467 cmci_clear();
2468
2469 vendor_disable_error_reporting();
2470}
2471
2472static void mce_reenable_cpu(void)
2473{
2474 struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
2475 int i;
2476
2477 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2478 return;
2479
2480 if (!cpuhp_tasks_frozen)
2481 cmci_reenable();
2482 for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
2483 struct mce_bank *b = &mce_banks[i];
2484
2485 if (b->init)
2486 wrmsrl(msr_ops.ctl(i), b->ctl);
2487 }
2488}
2489
/* CPUHP_X86_MCE_DEAD callback: CPU is gone; hand off CMCI ownership. */
static int mce_cpu_dead(unsigned int cpu)
{
	mce_intel_hcpu_update(cpu);

	/* intentionally ignoring frozen here */
	if (!cpuhp_tasks_frozen)
		cmci_rediscover();
	return 0;
}
2499
/*
 * CPU online callback: create sysfs devices, then re-arm machine
 * checks and the polling timer.
 * NOTE(review): mce_device_create()'s return value is ignored - the
 * CPU is brought up even if the sysfs registration failed; confirm
 * this is intentional.
 */
static int mce_cpu_online(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);
	int ret;

	mce_device_create(cpu);

	ret = mce_threshold_create_device(cpu);
	if (ret) {
		mce_device_remove(cpu);
		return ret;
	}
	mce_reenable_cpu();
	mce_start_timer(t);
	return 0;
}
2516
/* CPU offline prep: quiesce MCE, stop the timer, tear down devices. */
static int mce_cpu_pre_down(unsigned int cpu)
{
	struct timer_list *t = this_cpu_ptr(&mce_timer);

	mce_disable_cpu();
	del_timer_sync(t);
	mce_threshold_remove_device(cpu);
	mce_device_remove(cpu);
	return 0;
}
2527
2528static __init void mce_init_banks(void)
2529{
2530 int i;
2531
2532 for (i = 0; i < MAX_NR_BANKS; i++) {
2533 struct mce_bank_dev *b = &mce_bank_devs[i];
2534 struct device_attribute *a = &b->attr;
2535
2536 b->bank = i;
2537
2538 sysfs_attr_init(&a->attr);
2539 a->attr.name = b->attrname;
2540 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2541
2542 a->attr.mode = 0644;
2543 a->show = show_bank;
2544 a->store = set_bank;
2545 }
2546}
2547
2548
2549
2550
2551
2552
2553
2554
/*
 * Register the sysfs subsystem, CPU hotplug callbacks and syscore ops
 * for MCE.  Runs as a device_initcall_sync; uses goto-based cleanup on
 * failure.
 */
static __init int mcheck_init_device(void)
{
	int err;

	/*
	 * Check if we have a spare virtual bit. This will only become
	 * a problem if/when we move beyond 5-level page tables.
	 */
	MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);

	if (!mce_available(&boot_cpu_data)) {
		err = -EIO;
		goto err_out;
	}

	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
		err = -ENOMEM;
		goto err_out;
	}

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		goto err_out_mem;

	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
				mce_cpu_dead);
	if (err)
		goto err_out_mem;

	/*
	 * Invokes mce_cpu_online() on all CPUs which are online when
	 * the state is installed.
	 */
	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
				mce_cpu_online, mce_cpu_pre_down);
	if (err < 0)
		goto err_out_online;

	register_syscore_ops(&mce_syscore_ops);

	return 0;

err_out_online:
	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);

err_out_mem:
	free_cpumask_var(mce_device_initialized);

err_out:
	pr_err("Unable to init MCE device (rc: %d)\n", err);

	return err;
}
device_initcall_sync(mcheck_init_device);
2611
2612
2613
2614
/* "nomce" boot option: disable machine check handling entirely. */
static int __init mcheck_disable(char *str)
{
	mca_cfg.disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
2621
2622#ifdef CONFIG_DEBUG_FS
/* Lazily create and return the shared <debugfs>/mce directory. */
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}
2632
/* Reset the global rendezvous/panic bookkeeping (fake-panic testing). */
static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_panicked, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}
2641
/* debugfs read: report the current fake_panic setting. */
static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}
2647
/* debugfs write: reset MCE rendezvous state, then set fake_panic. */
static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}
2654
DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set,
			 "%llu\n");

/* Expose <debugfs>/mce/fake_panic for exercising the panic paths. */
static void __init mcheck_debugfs_init(void)
{
	struct dentry *dmce;

	dmce = mce_get_debugfs_dir();
	debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL,
				   &fake_panic_fops);
}
#else
/* No-op stub when debugfs is not configured. */
static void __init mcheck_debugfs_init(void) { }
#endif
2669
/*
 * Static key enabled when mce=recovery is given (see mcheck_late_init).
 * NOTE(review): presumably gates the mcsafe/recoverable memory-copy
 * path elsewhere in the kernel - confirm at the usage sites.
 */
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);
2672
static int __init mcheck_late_init(void)
{
	/* mce=recovery enables the mcsafe static key. */
	if (mca_cfg.recovery)
		static_branch_inc(&mcsafe_key);

	mcheck_debugfs_init();

	/*
	 * Flush out everything that has been logged during early boot,
	 * now that everything (workqueues, decoders, ...) is set up.
	 */
	mce_schedule_work();

	return 0;
}
late_initcall(mcheck_late_init);
2689