1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19#include <linux/sched.h>
20#include <linux/interrupt.h>
21#include <linux/irq.h>
22#include <linux/of.h>
23#include <linux/fs.h>
24#include <linux/reboot.h>
25
26#include <asm/machdep.h>
27#include <asm/rtas.h>
28#include <asm/firmware.h>
29
30#include "pseries.h"
31
32static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX];
33static DEFINE_SPINLOCK(ras_log_buf_lock);
34
35static char global_mce_data_buf[RTAS_ERROR_LOG_MAX];
36static DEFINE_PER_CPU(__u64, mce_data_buf);
37
38static int ras_check_exception_token;
39
40#define EPOW_SENSOR_TOKEN 9
41#define EPOW_SENSOR_INDEX 0
42
43static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
44static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
45
46
47
48
49
50
51static int __init init_ras_IRQ(void)
52{
53 struct device_node *np;
54
55 ras_check_exception_token = rtas_token("check-exception");
56
57
58 np = of_find_node_by_path("/event-sources/internal-errors");
59 if (np != NULL) {
60 request_event_sources_irqs(np, ras_error_interrupt,
61 "RAS_ERROR");
62 of_node_put(np);
63 }
64
65
66 np = of_find_node_by_path("/event-sources/epow-events");
67 if (np != NULL) {
68 request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW");
69 of_node_put(np);
70 }
71
72 return 0;
73}
74machine_subsys_initcall(pseries, init_ras_IRQ);
75
76#define EPOW_SHUTDOWN_NORMAL 1
77#define EPOW_SHUTDOWN_ON_UPS 2
78#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3
79#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4
80
81static void handle_system_shutdown(char event_modifier)
82{
83 switch (event_modifier) {
84 case EPOW_SHUTDOWN_NORMAL:
85 pr_emerg("Firmware initiated power off");
86 orderly_poweroff(true);
87 break;
88
89 case EPOW_SHUTDOWN_ON_UPS:
90 pr_emerg("Loss of power reported by firmware, system is "
91 "running on UPS/battery");
92 break;
93
94 case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS:
95 pr_emerg("Loss of system critical functions reported by "
96 "firmware");
97 pr_emerg("Check RTAS error log for details");
98 orderly_poweroff(true);
99 break;
100
101 case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH:
102 pr_emerg("Ambient temperature too high reported by firmware");
103 pr_emerg("Check RTAS error log for details");
104 orderly_poweroff(true);
105 break;
106
107 default:
108 pr_err("Unknown power/cooling shutdown event (modifier %d)",
109 event_modifier);
110 }
111}
112
/*
 * Layout of the data carried by the EPOW section of a pSeries error log,
 * as read by rtas_parse_epow_errlog().
 */
struct epow_errorlog {
	unsigned char sensor_value;		/* low 4 bits: EPOW action code */
	unsigned char event_modifier;		/* low 4 bits: event modifier */
	unsigned char extended_modifier;	/* not read in this file */
	unsigned char reserved;
	unsigned char platform_reason;		/* not read in this file */
};
120
121#define EPOW_RESET 0
122#define EPOW_WARN_COOLING 1
123#define EPOW_WARN_POWER 2
124#define EPOW_SYSTEM_SHUTDOWN 3
125#define EPOW_SYSTEM_HALT 4
126#define EPOW_MAIN_ENCLOSURE 5
127#define EPOW_POWER_OFF 7
128
129static void rtas_parse_epow_errlog(struct rtas_error_log *log)
130{
131 struct pseries_errorlog *pseries_log;
132 struct epow_errorlog *epow_log;
133 char action_code;
134 char modifier;
135
136 pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW);
137 if (pseries_log == NULL)
138 return;
139
140 epow_log = (struct epow_errorlog *)pseries_log->data;
141 action_code = epow_log->sensor_value & 0xF;
142 modifier = epow_log->event_modifier & 0xF;
143
144 switch (action_code) {
145 case EPOW_RESET:
146 pr_err("Non critical power or cooling issue cleared");
147 break;
148
149 case EPOW_WARN_COOLING:
150 pr_err("Non critical cooling issue reported by firmware");
151 pr_err("Check RTAS error log for details");
152 break;
153
154 case EPOW_WARN_POWER:
155 pr_err("Non critical power issue reported by firmware");
156 pr_err("Check RTAS error log for details");
157 break;
158
159 case EPOW_SYSTEM_SHUTDOWN:
160 handle_system_shutdown(epow_log->event_modifier);
161 break;
162
163 case EPOW_SYSTEM_HALT:
164 pr_emerg("Firmware initiated power off");
165 orderly_poweroff(true);
166 break;
167
168 case EPOW_MAIN_ENCLOSURE:
169 case EPOW_POWER_OFF:
170 pr_emerg("Critical power/cooling issue reported by firmware");
171 pr_emerg("Check RTAS error log for details");
172 pr_emerg("Immediate power off");
173 emergency_sync();
174 kernel_power_off();
175 break;
176
177 default:
178 pr_err("Unknown power/cooling event (action code %d)",
179 action_code);
180 }
181}
182
183
184static irqreturn_t ras_epow_interrupt(int irq, void *dev_id)
185{
186 int status;
187 int state;
188 int critical;
189
190 status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state);
191
192 if (state > 3)
193 critical = 1;
194 else
195 critical = 0;
196
197 spin_lock(&ras_log_buf_lock);
198
199 status = rtas_call(ras_check_exception_token, 6, 1, NULL,
200 RTAS_VECTOR_EXTERNAL_INTERRUPT,
201 virq_to_hw(irq),
202 RTAS_EPOW_WARNING,
203 critical, __pa(&ras_log_buf),
204 rtas_get_error_log_max());
205
206 log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0);
207
208 rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf);
209
210 spin_unlock(&ras_log_buf_lock);
211 return IRQ_HANDLED;
212}
213
214
215
216
217
218
219
220
221
222static irqreturn_t ras_error_interrupt(int irq, void *dev_id)
223{
224 struct rtas_error_log *rtas_elog;
225 int status;
226 int fatal;
227
228 spin_lock(&ras_log_buf_lock);
229
230 status = rtas_call(ras_check_exception_token, 6, 1, NULL,
231 RTAS_VECTOR_EXTERNAL_INTERRUPT,
232 virq_to_hw(irq),
233 RTAS_INTERNAL_ERROR, 1 ,
234 __pa(&ras_log_buf),
235 rtas_get_error_log_max());
236
237 rtas_elog = (struct rtas_error_log *)ras_log_buf;
238
239 if (status == 0 &&
240 rtas_error_severity(rtas_elog) >= RTAS_SEVERITY_ERROR_SYNC)
241 fatal = 1;
242 else
243 fatal = 0;
244
245
246 log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal);
247
248 if (fatal) {
249 pr_emerg("Fatal hardware error reported by firmware");
250 pr_emerg("Check RTAS error log for details");
251 pr_emerg("Immediate power off");
252 emergency_sync();
253 kernel_power_off();
254 } else {
255 pr_err("Recoverable hardware error reported by firmware");
256 }
257
258 spin_unlock(&ras_log_buf_lock);
259 return IRQ_HANDLED;
260}
261
262
263
264
265
/*
 * True when address A lies in one of the two ranges accepted for the FWNMI
 * save area: [0x7000, 0x7ff0) or inside the RTAS region, leaving 16 bytes
 * of room before its end.
 *
 * A is evaluated several times; do not pass an expression with side
 * effects.
 */
#define VALID_FWNMI_BUFFER(A)						\
	((((A) >= 0x7000) && ((A) < 0x7ff0)) ||				\
	 (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16))))
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs)
287{
288 unsigned long *savep;
289 struct rtas_error_log *h, *errhdr = NULL;
290
291
292 regs->gpr[3] &= ~(0x3UL << 62);
293
294 if (!VALID_FWNMI_BUFFER(regs->gpr[3])) {
295 printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]);
296 return NULL;
297 }
298
299 savep = __va(regs->gpr[3]);
300 regs->gpr[3] = savep[0];
301
302
303 h = (struct rtas_error_log *)&savep[1];
304 if (!rtas_error_extended(h)) {
305 memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64));
306 errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf);
307 } else {
308 int len, error_log_length;
309
310 error_log_length = 8 + rtas_error_extended_log_length(h);
311 len = max_t(int, error_log_length, RTAS_ERROR_LOG_MAX);
312 memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX);
313 memcpy(global_mce_data_buf, h, len);
314 errhdr = (struct rtas_error_log *)global_mce_data_buf;
315 }
316
317 return errhdr;
318}
319
320
321
322
323
324static void fwnmi_release_errinfo(void)
325{
326 int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL);
327 if (ret != 0)
328 printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret);
329}
330
331int pSeries_system_reset_exception(struct pt_regs *regs)
332{
333 if (fwnmi_active) {
334 struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs);
335 if (errhdr) {
336
337 }
338 fwnmi_release_errinfo();
339 }
340 return 0;
341}
342
343
344
345
346
347
348
349
350
351
352static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
353{
354 int recovered = 0;
355 int disposition = rtas_error_disposition(err);
356
357 if (!(regs->msr & MSR_RI)) {
358
359 recovered = 0;
360
361 } else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
362
363 recovered = 1;
364
365 } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
366
367 printk(KERN_ERR "MCE: limited recovery, system may "
368 "be degraded\n");
369 recovered = 1;
370
371 } else if (user_mode(regs) && !is_global_init(current) &&
372 rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {
373
374
375
376
377
378
379
380 printk(KERN_ERR "MCE: uncorrectable error, killing task "
381 "%s:%d\n", current->comm, current->pid);
382
383 _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
384 recovered = 1;
385 }
386
387 log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
388
389 return recovered;
390}
391
392
393
394
395
396
397
398
399
400
401
402int pSeries_machine_check_exception(struct pt_regs *regs)
403{
404 struct rtas_error_log *errp;
405
406 if (fwnmi_active) {
407 errp = fwnmi_get_errinfo(regs);
408 fwnmi_release_errinfo();
409 if (errp && recover_mce(regs, errp))
410 return 1;
411 }
412
413 return 0;
414}
415