1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22#undef DEBUG
23#define pr_fmt(fmt) "mce: " fmt
24
25#include <linux/hardirq.h>
26#include <linux/types.h>
27#include <linux/ptrace.h>
28#include <linux/percpu.h>
29#include <linux/export.h>
30#include <linux/irq_work.h>
31#include <linux/extable.h>
32#include <linux/memblock.h>
33
34#include <asm/machdep.h>
35#include <asm/mce.h>
36#include <asm/nmi.h>
37#include <asm/extable.h>
38
39#include "setup.h"
40
/* Forward declarations for the deferred MCE processing callbacks below. */
static void machine_check_process_queued_event(struct irq_work *work);
static void machine_check_ue_irq_work(struct irq_work *work);
static void machine_check_ue_event(struct machine_check_event *evt);
static void machine_process_ue_event(struct work_struct *work);

/* irq_work used to print/process queued machine check events. */
static struct irq_work mce_event_process_work = {
	.func = machine_check_process_queued_event,
};

/* irq_work that only schedules mce_ue_event_work (safe-context hop). */
static struct irq_work mce_ue_event_irq_work = {
	.func = machine_check_ue_irq_work,
};

/* Workqueue item that handles queued Uncorrectable Errors (UE). */
DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
55
56static void mce_set_error_info(struct machine_check_event *mce,
57 struct mce_error_info *mce_err)
58{
59 mce->error_type = mce_err->error_type;
60 switch (mce_err->error_type) {
61 case MCE_ERROR_TYPE_UE:
62 mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
63 break;
64 case MCE_ERROR_TYPE_SLB:
65 mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
66 break;
67 case MCE_ERROR_TYPE_ERAT:
68 mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
69 break;
70 case MCE_ERROR_TYPE_TLB:
71 mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
72 break;
73 case MCE_ERROR_TYPE_USER:
74 mce->u.user_error.user_error_type = mce_err->u.user_error_type;
75 break;
76 case MCE_ERROR_TYPE_RA:
77 mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
78 break;
79 case MCE_ERROR_TYPE_LINK:
80 mce->u.link_error.link_error_type = mce_err->u.link_error_type;
81 break;
82 case MCE_ERROR_TYPE_UNKNOWN:
83 default:
84 break;
85 }
86}
87
88
89
90
91
92void save_mce_event(struct pt_regs *regs, long handled,
93 struct mce_error_info *mce_err,
94 uint64_t nip, uint64_t addr, uint64_t phys_addr)
95{
96 int index = local_paca->mce_info->mce_nest_count++;
97 struct machine_check_event *mce;
98
99 mce = &local_paca->mce_info->mce_event[index];
100
101
102
103
104
105 if (index >= MAX_MC_EVT)
106 return;
107
108
109 mce->version = MCE_V1;
110 mce->srr0 = nip;
111 mce->srr1 = regs->msr;
112 mce->gpr3 = regs->gpr[3];
113 mce->in_use = 1;
114 mce->cpu = get_paca()->paca_index;
115
116
117 if (handled && (regs->msr & MSR_RI))
118 mce->disposition = MCE_DISPOSITION_RECOVERED;
119 else
120 mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
121
122 mce->initiator = mce_err->initiator;
123 mce->severity = mce_err->severity;
124 mce->sync_error = mce_err->sync_error;
125 mce->error_class = mce_err->error_class;
126
127
128
129
130 mce_set_error_info(mce, mce_err);
131
132 if (!addr)
133 return;
134
135 if (mce->error_type == MCE_ERROR_TYPE_TLB) {
136 mce->u.tlb_error.effective_address_provided = true;
137 mce->u.tlb_error.effective_address = addr;
138 } else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
139 mce->u.slb_error.effective_address_provided = true;
140 mce->u.slb_error.effective_address = addr;
141 } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
142 mce->u.erat_error.effective_address_provided = true;
143 mce->u.erat_error.effective_address = addr;
144 } else if (mce->error_type == MCE_ERROR_TYPE_USER) {
145 mce->u.user_error.effective_address_provided = true;
146 mce->u.user_error.effective_address = addr;
147 } else if (mce->error_type == MCE_ERROR_TYPE_RA) {
148 mce->u.ra_error.effective_address_provided = true;
149 mce->u.ra_error.effective_address = addr;
150 } else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
151 mce->u.link_error.effective_address_provided = true;
152 mce->u.link_error.effective_address = addr;
153 } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
154 mce->u.ue_error.effective_address_provided = true;
155 mce->u.ue_error.effective_address = addr;
156 if (phys_addr != ULONG_MAX) {
157 mce->u.ue_error.physical_address_provided = true;
158 mce->u.ue_error.physical_address = phys_addr;
159 mce->u.ue_error.ignore_event = mce_err->ignore_event;
160 machine_check_ue_event(mce);
161 }
162 }
163 return;
164}
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
/*
 * Fetch (and optionally release) the most recent machine check event
 * from the per-CPU nest stack.
 *
 * @mce:     if non-NULL, the event is copied out here.
 * @release: if true, mark the slot free and pop it off the stack
 *           (callers pass MCE_EVENT_RELEASE / a bool).
 *
 * Returns 1 if an event was available, 0 otherwise.
 */
int get_mce_event(struct machine_check_event *mce, bool release)
{
	/* Index of the top of the per-CPU event stack. */
	int index = local_paca->mce_info->mce_nest_count - 1;
	struct machine_check_event *mc_evt;
	int ret = 0;

	/* No machine check event pending. */
	if (index < 0)
		return ret;

	/*
	 * save_mce_event() always increments mce_nest_count, so the
	 * index can exceed MAX_MC_EVT; only touch the slot if in range.
	 */
	if (index < MAX_MC_EVT) {
		mc_evt = &local_paca->mce_info->mce_event[index];
		/* Copy the event out if the caller asked for it. */
		if (mce)
			*mce = *mc_evt;
		if (release)
			mc_evt->in_use = 0;
		ret = 1;
	}
	/* Pop the stack even for out-of-range (overflowed) indices. */
	if (release)
		local_paca->mce_info->mce_nest_count--;

	return ret;
}
209
210void release_mce_event(void)
211{
212 get_mce_event(NULL, true);
213}
214
/* irq_work callback: defer UE processing to workqueue (process) context. */
static void machine_check_ue_irq_work(struct irq_work *work)
{
	schedule_work(&mce_ue_event_work);
}
219
220
221
222
/*
 * Queue an Uncorrectable Error event in the per-CPU UE queue so it can
 * be handled later, outside the machine check context.
 */
static void machine_check_ue_event(struct machine_check_event *evt)
{
	int index;

	/* Reserve a slot in the per-CPU UE queue. */
	index = local_paca->mce_info->mce_ue_count++;
	/* Queue full: undo the reservation and drop the event. */
	if (index >= MAX_MC_EVT) {
		local_paca->mce_info->mce_ue_count--;
		return;
	}
	memcpy(&local_paca->mce_info->mce_ue_event_queue[index],
	       evt, sizeof(*evt));

	/* Kick off deferred processing via irq_work -> workqueue. */
	irq_work_queue(&mce_ue_event_irq_work);
}
239
240
241
242
243void machine_check_queue_event(void)
244{
245 int index;
246 struct machine_check_event evt;
247
248 if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
249 return;
250
251 index = local_paca->mce_info->mce_queue_count++;
252
253 if (index >= MAX_MC_EVT) {
254 local_paca->mce_info->mce_queue_count--;
255 return;
256 }
257 memcpy(&local_paca->mce_info->mce_event_queue[index],
258 &evt, sizeof(evt));
259
260
261 irq_work_queue(&mce_event_process_work);
262}
263
264void mce_common_process_ue(struct pt_regs *regs,
265 struct mce_error_info *mce_err)
266{
267 const struct exception_table_entry *entry;
268
269 entry = search_kernel_exception_table(regs->nip);
270 if (entry) {
271 mce_err->ignore_event = true;
272 regs->nip = extable_fixup(entry);
273 }
274}
275
276
277
278
279
/*
 * Workqueue callback (scheduled from machine_check_ue_irq_work()):
 * drain the per-CPU UE queue, handing events with a known physical
 * address to the memory-failure code.
 */
static void machine_process_ue_event(struct work_struct *work)
{
	int index;
	struct machine_check_event *evt;

	/* Drain the per-CPU UE queue from the top down. */
	while (local_paca->mce_info->mce_ue_count > 0) {
		index = local_paca->mce_info->mce_ue_count - 1;
		evt = &local_paca->mce_info->mce_ue_event_queue[index];
#ifdef CONFIG_MEMORY_FAILURE
		/*
		 * Events marked ignore_event were already fixed up via
		 * the exception table (see mce_common_process_ue()) and
		 * are simply discarded.  Otherwise, offline the bad page
		 * if we know where it is.
		 */
		if (evt->error_type == MCE_ERROR_TYPE_UE) {
			if (evt->u.ue_error.ignore_event) {
				local_paca->mce_info->mce_ue_count--;
				continue;
			}

			if (evt->u.ue_error.physical_address_provided) {
				unsigned long pfn;

				pfn = evt->u.ue_error.physical_address >>
					PAGE_SHIFT;
				memory_failure(pfn, 0);
			} else
				pr_warn("Failed to identify bad address from "
					"where the uncorrectable error (UE) "
					"was generated\n");
		}
#endif
		local_paca->mce_info->mce_ue_count--;
	}
}
318
319
320
321
/*
 * irq_work callback: taint the kernel and print every machine check
 * event queued by machine_check_queue_event().
 */
static void machine_check_process_queued_event(struct irq_work *work)
{
	int index;
	struct machine_check_event *evt;

	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

	/* Drain the per-CPU event queue from the top down. */
	while (local_paca->mce_info->mce_queue_count > 0) {
		index = local_paca->mce_info->mce_queue_count - 1;
		evt = &local_paca->mce_info->mce_event_queue[index];

		/* UEs fixed up via the exception table are not printed. */
		if (evt->error_type == MCE_ERROR_TYPE_UE &&
		    evt->u.ue_error.ignore_event) {
			local_paca->mce_info->mce_queue_count--;
			continue;
		}

		machine_check_print_event_info(evt, false, false);
		local_paca->mce_info->mce_queue_count--;
	}
}
347
348void machine_check_print_event_info(struct machine_check_event *evt,
349 bool user_mode, bool in_guest)
350{
351 const char *level, *sevstr, *subtype, *err_type, *initiator;
352 uint64_t ea = 0, pa = 0;
353 int n = 0;
354 char dar_str[50];
355 char pa_str[50];
356 static const char *mc_ue_types[] = {
357 "Indeterminate",
358 "Instruction fetch",
359 "Page table walk ifetch",
360 "Load/Store",
361 "Page table walk Load/Store",
362 };
363 static const char *mc_slb_types[] = {
364 "Indeterminate",
365 "Parity",
366 "Multihit",
367 };
368 static const char *mc_erat_types[] = {
369 "Indeterminate",
370 "Parity",
371 "Multihit",
372 };
373 static const char *mc_tlb_types[] = {
374 "Indeterminate",
375 "Parity",
376 "Multihit",
377 };
378 static const char *mc_user_types[] = {
379 "Indeterminate",
380 "tlbie(l) invalid",
381 "scv invalid",
382 };
383 static const char *mc_ra_types[] = {
384 "Indeterminate",
385 "Instruction fetch (bad)",
386 "Instruction fetch (foreign)",
387 "Page table walk ifetch (bad)",
388 "Page table walk ifetch (foreign)",
389 "Load (bad)",
390 "Store (bad)",
391 "Page table walk Load/Store (bad)",
392 "Page table walk Load/Store (foreign)",
393 "Load/Store (foreign)",
394 };
395 static const char *mc_link_types[] = {
396 "Indeterminate",
397 "Instruction fetch (timeout)",
398 "Page table walk ifetch (timeout)",
399 "Load (timeout)",
400 "Store (timeout)",
401 "Page table walk Load/Store (timeout)",
402 };
403 static const char *mc_error_class[] = {
404 "Unknown",
405 "Hardware error",
406 "Probable Hardware error (some chance of software cause)",
407 "Software error",
408 "Probable Software error (some chance of hardware cause)",
409 };
410
411
412 if (evt->version != MCE_V1) {
413 pr_err("Machine Check Exception, Unknown event version %d !\n",
414 evt->version);
415 return;
416 }
417 switch (evt->severity) {
418 case MCE_SEV_NO_ERROR:
419 level = KERN_INFO;
420 sevstr = "Harmless";
421 break;
422 case MCE_SEV_WARNING:
423 level = KERN_WARNING;
424 sevstr = "Warning";
425 break;
426 case MCE_SEV_SEVERE:
427 level = KERN_ERR;
428 sevstr = "Severe";
429 break;
430 case MCE_SEV_FATAL:
431 default:
432 level = KERN_ERR;
433 sevstr = "Fatal";
434 break;
435 }
436
437 switch(evt->initiator) {
438 case MCE_INITIATOR_CPU:
439 initiator = "CPU";
440 break;
441 case MCE_INITIATOR_PCI:
442 initiator = "PCI";
443 break;
444 case MCE_INITIATOR_ISA:
445 initiator = "ISA";
446 break;
447 case MCE_INITIATOR_MEMORY:
448 initiator = "Memory";
449 break;
450 case MCE_INITIATOR_POWERMGM:
451 initiator = "Power Management";
452 break;
453 case MCE_INITIATOR_UNKNOWN:
454 default:
455 initiator = "Unknown";
456 break;
457 }
458
459 switch (evt->error_type) {
460 case MCE_ERROR_TYPE_UE:
461 err_type = "UE";
462 subtype = evt->u.ue_error.ue_error_type <
463 ARRAY_SIZE(mc_ue_types) ?
464 mc_ue_types[evt->u.ue_error.ue_error_type]
465 : "Unknown";
466 if (evt->u.ue_error.effective_address_provided)
467 ea = evt->u.ue_error.effective_address;
468 if (evt->u.ue_error.physical_address_provided)
469 pa = evt->u.ue_error.physical_address;
470 break;
471 case MCE_ERROR_TYPE_SLB:
472 err_type = "SLB";
473 subtype = evt->u.slb_error.slb_error_type <
474 ARRAY_SIZE(mc_slb_types) ?
475 mc_slb_types[evt->u.slb_error.slb_error_type]
476 : "Unknown";
477 if (evt->u.slb_error.effective_address_provided)
478 ea = evt->u.slb_error.effective_address;
479 break;
480 case MCE_ERROR_TYPE_ERAT:
481 err_type = "ERAT";
482 subtype = evt->u.erat_error.erat_error_type <
483 ARRAY_SIZE(mc_erat_types) ?
484 mc_erat_types[evt->u.erat_error.erat_error_type]
485 : "Unknown";
486 if (evt->u.erat_error.effective_address_provided)
487 ea = evt->u.erat_error.effective_address;
488 break;
489 case MCE_ERROR_TYPE_TLB:
490 err_type = "TLB";
491 subtype = evt->u.tlb_error.tlb_error_type <
492 ARRAY_SIZE(mc_tlb_types) ?
493 mc_tlb_types[evt->u.tlb_error.tlb_error_type]
494 : "Unknown";
495 if (evt->u.tlb_error.effective_address_provided)
496 ea = evt->u.tlb_error.effective_address;
497 break;
498 case MCE_ERROR_TYPE_USER:
499 err_type = "User";
500 subtype = evt->u.user_error.user_error_type <
501 ARRAY_SIZE(mc_user_types) ?
502 mc_user_types[evt->u.user_error.user_error_type]
503 : "Unknown";
504 if (evt->u.user_error.effective_address_provided)
505 ea = evt->u.user_error.effective_address;
506 break;
507 case MCE_ERROR_TYPE_RA:
508 err_type = "Real address";
509 subtype = evt->u.ra_error.ra_error_type <
510 ARRAY_SIZE(mc_ra_types) ?
511 mc_ra_types[evt->u.ra_error.ra_error_type]
512 : "Unknown";
513 if (evt->u.ra_error.effective_address_provided)
514 ea = evt->u.ra_error.effective_address;
515 break;
516 case MCE_ERROR_TYPE_LINK:
517 err_type = "Link";
518 subtype = evt->u.link_error.link_error_type <
519 ARRAY_SIZE(mc_link_types) ?
520 mc_link_types[evt->u.link_error.link_error_type]
521 : "Unknown";
522 if (evt->u.link_error.effective_address_provided)
523 ea = evt->u.link_error.effective_address;
524 break;
525 case MCE_ERROR_TYPE_DCACHE:
526 err_type = "D-Cache";
527 subtype = "Unknown";
528 break;
529 case MCE_ERROR_TYPE_ICACHE:
530 err_type = "I-Cache";
531 subtype = "Unknown";
532 break;
533 default:
534 case MCE_ERROR_TYPE_UNKNOWN:
535 err_type = "Unknown";
536 subtype = "";
537 break;
538 }
539
540 dar_str[0] = pa_str[0] = '\0';
541 if (ea && evt->srr0 != ea) {
542
543 n = sprintf(dar_str, "DAR: %016llx ", ea);
544 if (pa)
545 sprintf(dar_str + n, "paddr: %016llx ", pa);
546 } else if (pa) {
547 sprintf(pa_str, " paddr: %016llx", pa);
548 }
549
550 printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n",
551 level, evt->cpu, sevstr, in_guest ? "Guest" : "Host",
552 err_type, subtype, dar_str,
553 evt->disposition == MCE_DISPOSITION_RECOVERED ?
554 "Recovered" : "Not recovered");
555
556 if (in_guest || user_mode) {
557 printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n",
558 level, evt->cpu, current->pid, current->comm,
559 in_guest ? "Guest " : "", evt->srr0, pa_str);
560 } else {
561 printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
562 level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
563 }
564
565 printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator);
566
567 subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
568 mc_error_class[evt->error_class] : "Unknown";
569 printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
570}
571EXPORT_SYMBOL_GPL(machine_check_print_event_info);
572
573
574
575
576
577
578long machine_check_early(struct pt_regs *regs)
579{
580 long handled = 0;
581
582 hv_nmi_check_nonrecoverable(regs);
583
584
585
586
587 if (ppc_md.machine_check_early)
588 handled = ppc_md.machine_check_early(regs);
589 return handled;
590}
591
592
/*
 * What the HMI "debug trigger" (HMER_DEBUG_TRIG) is used for on this
 * CPU, determined at boot by init_debug_trig_function() from the
 * device tree or the POWER9 PVR revision.
 */
static enum {
	DTRIG_UNKNOWN,		/* trigger function not identified */
	DTRIG_VECTOR_CI,	/* trigger fires for vector CI load */
	DTRIG_SUSPEND_ESCAPE,	/* trigger fires for TM suspend escape */
} hmer_debug_trig_function;
598
/*
 * Work out what the HMI debug trigger is used for on this CPU:
 * first from the "ibm,hmi-special-triggers" device-tree property,
 * then, failing that, from the POWER9 PVR revision.
 */
static int init_debug_trig_function(void)
{
	int pvr;
	struct device_node *cpun;
	struct property *prop = NULL;
	const char *str;

	/* First look in the device tree. */
	preempt_disable();
	cpun = of_get_cpu_node(smp_processor_id(), NULL);
	if (cpun) {
		of_property_for_each_string(cpun, "ibm,hmi-special-triggers",
					    prop, str) {
			if (strcmp(str, "bit17-vector-ci-load") == 0)
				hmer_debug_trig_function = DTRIG_VECTOR_CI;
			else if (strcmp(str, "bit17-tm-suspend-escape") == 0)
				hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
		}
		of_node_put(cpun);
	}
	preempt_enable();

	/* If the property existed (prop non-NULL), don't consult the PVR. */
	if (prop)
		goto out;

	pvr = mfspr(SPRN_PVR);
	/* POWER9 with (pvr & 0xe000) == 0 — presumably the scale-out
	 * (Nimbus) variant; TODO confirm against PVR documentation. */
	if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) {
		/* Revision 2.02 and later */
		if ((pvr & 0xfff) >= 0x202)
			hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
		/* Revision 2.00 / 2.01 */
		else if ((pvr & 0xfff) >= 0x200)
			hmer_debug_trig_function = DTRIG_VECTOR_CI;
	}

 out:
	switch (hmer_debug_trig_function) {
	case DTRIG_VECTOR_CI:
		pr_debug("HMI debug trigger used for vector CI load\n");
		break;
	case DTRIG_SUSPEND_ESCAPE:
		pr_debug("HMI debug trigger used for TM suspend escape\n");
		break;
	default:
		break;
	}
	return 0;
}
__initcall(init_debug_trig_function);
650
651
652
653
654
655
656
657
/*
 * Handle an HMI caused by the debug trigger (HMER_DEBUG_TRIG).
 *
 * Returns:
 *   -1 - trigger bit not set / function unknown, or other enabled
 *        HMER causes remain: let the normal HMI path handle it.
 *    0 - trigger handled, nothing further to do.
 *    1 - vector CI load trigger from host user mode; hmi_p9_special_emu
 *        is set so the return path can emulate the instruction.
 */
long hmi_handle_debugtrig(struct pt_regs *regs)
{
	unsigned long hmer = mfspr(SPRN_HMER);
	long ret = 0;

	/* Bail out unless the trigger fired and we know what it means. */
	if (!((hmer & HMER_DEBUG_TRIG)
	      && hmer_debug_trig_function != DTRIG_UNKNOWN))
		return -1;

	hmer &= ~HMER_DEBUG_TRIG;
	/* Ack only the debug-trigger bit (all other HMER bits written as 1). */
	mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG);

	switch (hmer_debug_trig_function) {
	case DTRIG_VECTOR_CI:
		/* Only emulate when coming from host user space. */
		if (regs && user_mode(regs))
			ret = local_paca->hmi_p9_special_emu = 1;

		break;

	default:
		break;
	}

	/* Any other enabled HMI causes still pending? */
	if (hmer & mfspr(SPRN_HMEER))
		return -1;

	return ret;
}
696
697
698
699
/*
 * Real-mode (early) Hypervisor Maintenance Interrupt handler.
 * Returns hmi_handle_debugtrig()'s result (>= 0) if the debug trigger
 * disposed of it, else 1 after running the platform early handler.
 */
long hmi_exception_realmode(struct pt_regs *regs)
{
	int ret;

	/* Per-CPU HMI statistics counter. */
	local_paca->hmi_irqs++;

	/* Debug-trigger HMIs can be handled entirely here. */
	ret = hmi_handle_debugtrig(regs);
	if (ret >= 0)
		return ret;

	wait_for_subcore_guest_exit();

	if (ppc_md.hmi_exception_early)
		ppc_md.hmi_exception_early(regs);

	wait_for_tb_resync();

	return 1;
}
719
720void __init mce_init(void)
721{
722 struct mce_info *mce_info;
723 u64 limit;
724 int i;
725
726 limit = min(ppc64_bolted_size(), ppc64_rma_size);
727 for_each_possible_cpu(i) {
728 mce_info = memblock_alloc_try_nid(sizeof(*mce_info),
729 __alignof__(*mce_info),
730 MEMBLOCK_LOW_LIMIT,
731 limit, cpu_to_node(i));
732 if (!mce_info)
733 goto err;
734 paca_ptrs[i]->mce_info = mce_info;
735 }
736 return;
737err:
738 panic("Failed to allocate memory for MCE event data\n");
739}
740