1
2
3
4
5
6
7
8
9#undef DEBUG
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/of.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16
17#include <asm/opal.h>
18#include <asm/cputable.h>
19#include <asm/machdep.h>
20
21#include "powernv.h"
22
/*
 * Set to 1 by opal_hmi_handler_init() once the HMI notifier has been
 * registered, so that a repeated call does not register it again.
 */
static int opal_hmi_handler_nb_init;
/*
 * One queued HMI event. opal_handle_hmi_event() allocates a node, copies
 * the event into it and links it on opal_hmi_evt_list; hmi_event_handler()
 * unlinks, prints and frees it.
 */
struct OpalHmiEvtNode {
	struct list_head list;		/* linkage on opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* private copy of the event payload */
};
28
/*
 * One row of a checkstop-reason lookup table. The print helpers AND the
 * event's (CPU-endian) xstop_reason with the row's mask and, on a match,
 * print the unit name and description.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* reason bit mask tested against the event */
	const char *unit_failed;	/* name of the unit printed as "Unit: ..." */
	const char *description;	/* human-readable text for this reason */
};
34
/* Queue of pending OpalHmiEvtNode entries awaiting hmi_event_handler(). */
static LIST_HEAD(opal_hmi_evt_list);
/* Protects opal_hmi_evt_list; always taken with interrupts disabled. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37
38static void print_core_checkstop_reason(const char *level,
39 struct OpalHMIEvent *hmi_evt)
40{
41 int i;
42 static const struct xstop_reason xstop_reason[] = {
43 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44 "RegFile core check stop" },
45 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47 "Core checkstop during recovery" },
48 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49 "RegFile core check stop (mapper error)" },
50 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54 "Recovery in maintenance mode" },
55 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56 "RegFile core check stop" },
57 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58 "Forward Progress Error" },
59 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62 "Hypervisor Resource error - core check stop" },
63 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64 "Hang Recovery Failed (core check stop)" },
65 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66 "Ambiguous Hang Detected (unknown source)" },
67 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68 "Debug Trigger Error inject" },
69 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70 "Hypervisor check stop via SPRC/SPRD" },
71 };
72
73
74 if (!hmi_evt->u.xstop_error.xstop_reason) {
75 printk("%s Unknown Core check stop.\n", level);
76 return;
77 }
78
79 printk("%s CPU PIR: %08x\n", level,
80 be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83 xstop_reason[i].xstop_reason)
84 printk("%s [Unit: %-3s] %s\n", level,
85 xstop_reason[i].unit_failed,
86 xstop_reason[i].description);
87}
88
89static void print_nx_checkstop_reason(const char *level,
90 struct OpalHMIEvent *hmi_evt)
91{
92 int i;
93 static const struct xstop_reason xstop_reason[] = {
94 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95 "SHM invalid state error" },
96 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97 "DMA invalid state error bit 15" },
98 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99 "DMA invalid state error bit 16" },
100 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101 "Channel 0 invalid state error" },
102 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103 "Channel 1 invalid state error" },
104 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105 "Channel 2 invalid state error" },
106 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107 "Channel 3 invalid state error" },
108 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109 "Channel 4 invalid state error" },
110 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111 "Channel 5 invalid state error" },
112 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113 "Channel 6 invalid state error" },
114 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115 "Channel 7 invalid state error" },
116 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117 "UE error on CRB(CSB address, CCB)" },
118 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119 "SUE error on CRB(CSB address, CCB)" },
120 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121 "CRB Kill ISN received while holding ISN with UE error" },
122 };
123
124
125 if (!hmi_evt->u.xstop_error.xstop_reason) {
126 printk("%s Unknown NX check stop.\n", level);
127 return;
128 }
129
130 printk("%s NX checkstop on CHIP ID: %x\n", level,
131 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132 for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134 xstop_reason[i].xstop_reason)
135 printk("%s [Unit: %-3s] %s\n", level,
136 xstop_reason[i].unit_failed,
137 xstop_reason[i].description);
138}
139
140static void print_checkstop_reason(const char *level,
141 struct OpalHMIEvent *hmi_evt)
142{
143 uint8_t type = hmi_evt->u.xstop_error.xstop_type;
144 switch (type) {
145 case CHECKSTOP_TYPE_CORE:
146 print_core_checkstop_reason(level, hmi_evt);
147 break;
148 case CHECKSTOP_TYPE_NX:
149 print_nx_checkstop_reason(level, hmi_evt);
150 break;
151 default:
152 printk("%s Unknown Malfunction Alert of type %d\n",
153 level, type);
154 break;
155 }
156}
157
158static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
159{
160 const char *level, *sevstr, *error_info;
161 static const char *hmi_error_types[] = {
162 "Malfunction Alert",
163 "Processor Recovery done",
164 "Processor recovery occurred again",
165 "Processor recovery occurred for masked error",
166 "Timer facility experienced an error",
167 "TFMR SPR is corrupted",
168 "UPS (Uninterrupted Power System) Overflow indication",
169 "An XSCOM operation failure",
170 "An XSCOM operation completed",
171 "SCOM has set a reserved FIR bit to cause recovery",
172 "Debug trigger has set a reserved FIR bit to cause recovery",
173 "A hypervisor resource error occurred",
174 "CAPP recovery process is in progress",
175 };
176
177
178 if (hmi_evt->version < OpalHMIEvt_V1) {
179 pr_err("HMI Interrupt, Unknown event version %d !\n",
180 hmi_evt->version);
181 return;
182 }
183 switch (hmi_evt->severity) {
184 case OpalHMI_SEV_NO_ERROR:
185 level = KERN_INFO;
186 sevstr = "Harmless";
187 break;
188 case OpalHMI_SEV_WARNING:
189 level = KERN_WARNING;
190 sevstr = "";
191 break;
192 case OpalHMI_SEV_ERROR_SYNC:
193 level = KERN_ERR;
194 sevstr = "Severe";
195 break;
196 case OpalHMI_SEV_FATAL:
197 default:
198 level = KERN_ERR;
199 sevstr = "Fatal";
200 break;
201 }
202
203 printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
204 level, sevstr,
205 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
206 "Recovered" : "Not recovered");
207 error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
208 hmi_error_types[hmi_evt->type]
209 : "Unknown";
210 printk("%s Error detail: %s\n", level, error_info);
211 printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
212 if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
213 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
214 printk("%s TFMR: %016llx\n", level,
215 be64_to_cpu(hmi_evt->tfmr));
216
217 if (hmi_evt->version < OpalHMIEvt_V2)
218 return;
219
220
221 if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
222 print_checkstop_reason(level, hmi_evt);
223}
224
225static void hmi_event_handler(struct work_struct *work)
226{
227 unsigned long flags;
228 struct OpalHMIEvent *hmi_evt;
229 struct OpalHmiEvtNode *msg_node;
230 uint8_t disposition;
231 struct opal_msg msg;
232 int unrecoverable = 0;
233
234 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
235 while (!list_empty(&opal_hmi_evt_list)) {
236 msg_node = list_entry(opal_hmi_evt_list.next,
237 struct OpalHmiEvtNode, list);
238 list_del(&msg_node->list);
239 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
240
241 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
242 print_hmi_event_info(hmi_evt);
243 disposition = hmi_evt->disposition;
244 kfree(msg_node);
245
246
247
248
249
250
251
252 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
253 unrecoverable = 1;
254
255 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
256 }
257 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
258
259 if (unrecoverable) {
260
261 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
262 u32 type;
263
264 type = be32_to_cpu(msg.msg_type);
265
266
267 if (type != OPAL_MSG_HMI_EVT)
268 continue;
269
270
271 hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
272 print_hmi_event_info(hmi_evt);
273 }
274
275 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
276 }
277}
278
/* Work item running hmi_event_handler(); scheduled by opal_handle_hmi_event(). */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
280
281
282
283
284static int opal_handle_hmi_event(struct notifier_block *nb,
285 unsigned long msg_type, void *msg)
286{
287 unsigned long flags;
288 struct OpalHMIEvent *hmi_evt;
289 struct opal_msg *hmi_msg = msg;
290 struct OpalHmiEvtNode *msg_node;
291
292
293 if (msg_type != OPAL_MSG_HMI_EVT)
294 return 0;
295
296
297 hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
298
299
300 msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
301 if (!msg_node) {
302 pr_err("HMI: out of memory, Opal message event not handled\n");
303 return -ENOMEM;
304 }
305 memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
306
307 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
308 list_add(&msg_node->list, &opal_hmi_evt_list);
309 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
310
311 schedule_work(&hmi_event_work);
312 return 0;
313}
314
/*
 * Notifier block registered for OPAL_MSG_HMI_EVT by opal_hmi_handler_init();
 * routes matching messages to opal_handle_hmi_event().
 */
static struct notifier_block opal_hmi_handler_nb = {
	.notifier_call	= opal_handle_hmi_event,
	.next		= NULL,
	.priority	= 0,
};
320
321int __init opal_hmi_handler_init(void)
322{
323 int ret;
324
325 if (!opal_hmi_handler_nb_init) {
326 ret = opal_message_notifier_register(
327 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
328 if (ret) {
329 pr_err("%s: Can't register OPAL event notifier (%d)\n",
330 __func__, ret);
331 return ret;
332 }
333 opal_hmi_handler_nb_init = 1;
334 }
335 return 0;
336}
337