1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33#define DEBUG 1
34#include <linux/kernel.h>
35#include <linux/mm.h>
36#include <linux/page-flags.h>
37#include <linux/sched.h>
38#include <linux/ksm.h>
39#include <linux/rmap.h>
40#include <linux/pagemap.h>
41#include <linux/swap.h>
42#include <linux/backing-dev.h>
43#include "internal.h"
44
/*
 * When non-zero, processes mapping a corrupted page are signalled as
 * soon as the error is detected ("early kill") instead of when they
 * actually touch the poisoned data.  Per-process override: PF_MCE_PROCESS
 * / PF_MCE_EARLY (see task_early_kill()).  Sysctl registration is not
 * in this file.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/* When zero, panic on memory failure instead of attempting recovery. */
int sysctl_memory_failure_recovery __read_mostly = 1;

/* Running count of pages marked hardware-poisoned by this code. */
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
50
51
52
53
54
55static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
56 unsigned long pfn)
57{
58 struct siginfo si;
59 int ret;
60
61 printk(KERN_ERR
62 "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n",
63 pfn, t->comm, t->pid);
64 si.si_signo = SIGBUS;
65 si.si_errno = 0;
66 si.si_code = BUS_MCEERR_AO;
67 si.si_addr = (void *)addr;
68#ifdef __ARCH_SI_TRAPNO
69 si.si_trapno = trapno;
70#endif
71 si.si_addr_lsb = PAGE_SHIFT;
72
73
74
75
76
77
78 ret = send_sig_info(SIGBUS, &si, t);
79 if (ret < 0)
80 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
81 t->comm, t->pid, ret);
82 return ret;
83}
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/*
 * One entry in the list of tasks to be signalled/killed because they
 * map a corrupted page.  Entries are built by add_to_kill() and
 * consumed (and freed) by kill_procs_ao().
 */
struct to_kill {
	struct list_head nd;		/* linkage in the caller's list */
	struct task_struct *tsk;	/* task (reference held, see add_to_kill()) */
	unsigned long addr;		/* user virtual address of the page */
	unsigned addr_valid:1;		/* addr successfully resolved? */
};
113
114
115
116
117
118
119
120
121
122
123
/*
 * Record that @tsk maps the corrupted page @p through @vma, appending
 * a to_kill entry to @to_kill.
 *
 * @tkc points at one pre-allocated entry (allocated by the caller
 * outside the locks we may be running under); it is consumed first.
 * Once it is used up, further entries fall back to GFP_ATOMIC since
 * this function is called with spinlocks held by the collect_procs_*
 * walkers.  Takes a reference on @tsk, dropped in kill_procs_ao().
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		/* Use (and consume) the caller's pre-allocated entry. */
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			/* Best effort: the task is simply not recorded. */
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * page_address_in_vma() returns -EFAULT when the page is not
	 * (or no longer) mapped at a resolvable address in this VMA.
	 * Keep the entry but mark the address invalid; kill_procs_ao()
	 * force-kills such tasks since it cannot signal a precise address.
	 */
	if (tk->addr == -EFAULT) {
		pr_debug("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}
160
161
162
163
164
165
166
167
168
/*
 * Walk the to_kill list and dispose of every entry: signal (or kill)
 * the task when @doit is set, then drop the task reference and free
 * the entry.  The list is empty on return.
 *
 * @doit:   actually send signals (0 = only clean up the list)
 * @trapno: trap number, passed through to the siginfo
 * @fail:   unmapping the page failed, so it may still be accessible
 * @pfn:    page frame number of the corrupted page, for logging
 */
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
			  int fail, unsigned long pfn)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (doit) {
			/*
			 * If the unmap failed, or we never resolved a
			 * user address for this task, the task could
			 * still touch the poisoned data: SIGKILL is the
			 * only safe option.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * Otherwise send the advisory BUS_MCEERR_AO
			 * signal for the recorded address.
			 */
			else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
					      pfn) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}
205
206static int task_early_kill(struct task_struct *tsk)
207{
208 if (!tsk->mm)
209 return 0;
210 if (tsk->flags & PF_MCE_PROCESS)
211 return !!(tsk->flags & PF_MCE_EARLY);
212 return sysctl_memory_failure_early_kill;
213}
214
215
216
217
/*
 * Collect the processes mapping the given anonymous page into @to_kill.
 * Lock order: tasklist_lock (read), then the page's anon_vma lock.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;

	read_lock(&tasklist_lock);
	av = page_lock_anon_vma(page);
	if (av == NULL)	/* no anon_vma: page is not anon-mapped (anymore) */
		goto out;
	for_each_process (tsk) {
		if (!task_early_kill(tsk))
			continue;
		list_for_each_entry (vma, &av->head, anon_vma_node) {
			if (!page_mapped_in_vma(page, vma))
				continue;
			/* Only record the VMA belonging to this task's mm. */
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	page_unlock_anon_vma(av);
out:
	read_unlock(&tasklist_lock);
}
243
244
245
246
/*
 * Collect the processes mapping the given file-backed page into
 * @to_kill: for each early-kill task, search the mapping's prio tree
 * for VMAs covering the page's file offset.
 * Lock order: tasklist_lock (read), then mapping->i_mmap_lock.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct prio_tree_iter iter;
	struct address_space *mapping = page->mapping;

	read_lock(&tasklist_lock);
	spin_lock(&mapping->i_mmap_lock);
	for_each_process(tsk) {
		/* File offset of the page, in PAGE_SIZE units. */
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

		if (!task_early_kill(tsk))
			continue;

		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * The prio tree is indexed by file offset, so a
			 * matching VMA may belong to any process; only
			 * record it for the task that owns the mm.
			 * Unlike the anonymous case there is no
			 * page_mapped_in_vma() check here, so tasks
			 * that map the range without the page resident
			 * may also be recorded.
			 */
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
	read_unlock(&tasklist_lock);
}
288
289
290
291
292
293
294
295static void collect_procs(struct page *page, struct list_head *tokill)
296{
297 struct to_kill *tk;
298
299 if (!page->mapping)
300 return;
301
302 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
303 if (!tk)
304 return;
305 if (PageAnon(page))
306 collect_procs_anon(page, tokill, &tk);
307 else
308 collect_procs_file(page, tokill, &tk);
309 kfree(tk);
310}
311
312
313
314
315
/*
 * Possible outcomes of a page-state handler; logged via action_name[]
 * and mapped to a return code in page_action() (only RECOVERED maps
 * to success there).
 */
enum outcome {
	FAILED,		/* recovery attempted but did not succeed */
	DELAYED,	/* nothing done now; dealt with later */
	IGNORED,	/* error deliberately not acted upon */
	RECOVERED,	/* page successfully isolated */
};
322
/* Human-readable names for enum outcome, used in log messages. */
static const char *action_name[] = {
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[IGNORED] = "Ignored",
	[RECOVERED] = "Recovered",
};
329
330
331
332
333
334
/*
 * Page in use by the kernel (e.g. slab, matched by the table below).
 * No recovery action is taken here; report DELAYED.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return DELAYED;
}
339
340
341
342
/* Page state we deliberately do not act on (e.g. reserved pages). */
static int me_ignore(struct page *p, unsigned long pfn)
{
	return IGNORED;
}
347
348
349
350
/* Catch-all for page states not matched by any earlier table entry. */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}
356
357
358
359
/*
 * Free (buddy) page: nothing to do now; the poison bit set earlier
 * marks the frame, so report DELAYED.
 */
static int me_free(struct page *p, unsigned long pfn)
{
	return DELAYED;
}
364
365
366
367
/*
 * Clean page cache page (also the tail end of the dirty case): try to
 * evict the page from the page cache so the poisoned frame is no
 * longer reachable through the mapping.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	/*
	 * Anonymous pages: the user mappings were already handled in
	 * hwpoison_user_mappings(); nothing page-cache-side to do.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * No mapping anymore: someone else already truncated or
	 * invalidated the page, so there is nothing left for us to
	 * remove; counted as FAILED.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		return FAILED;
	}

	/*
	 * Prefer the filesystem's dedicated ->error_remove_page
	 * operation when it provides one; otherwise fall back to a
	 * generic invalidate.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			/* fs metadata (buffers) still pinned the page */
			pr_debug("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * invalidate_inode_page() returns non-zero only when it
		 * actually dropped the page; it refuses dirty or still
		 * mapped pages.
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}
425
426
427
428
429
430
431static int me_pagecache_dirty(struct page *p, unsigned long pfn)
432{
433 struct address_space *mapping = page_mapping(p);
434
435 SetPageError(p);
436
437 if (mapping) {
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472 mapping_set_error(mapping, EIO);
473 }
474
475 return me_pagecache_clean(p, pfn);
476}
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
/*
 * Dirty swap cache page.  The page stays in the swap cache; clear
 * PG_dirty so the poisoned data is not written out to swap, and clear
 * PG_uptodate so a later fault-in sees the page as not containing
 * valid data instead of silently reading corruption.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	/* don't let the poisoned contents reach the swap device */
	ClearPageDirty(p);

	/* mark the data invalid for anyone who faults it back in */
	ClearPageUptodate(p);

	return DELAYED;
}
505
/*
 * Clean swap cache page: a copy of the data exists on the swap device,
 * so the page can simply be dropped from the swap cache.
 */
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	return RECOVERED;
}
512
513
514
515
516
517
518
519
520
521
522
523
/* Huge/compound pages: no recovery implemented; always report FAILED. */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	return FAILED;
}
528
529
530
531
532
533
534
535
536
537
538
539
540
541
/*
 * Shorthand page-flag masks used only to build the error_states[]
 * table below.
 */
#define dirty (1UL << PG_dirty)
#define sc (1UL << PG_swapcache)
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
#define tail (1UL << PG_tail)
#define compound (1UL << PG_compound)
#define slab (1UL << PG_slab)
#define buddy (1UL << PG_buddy)
#define reserved (1UL << PG_reserved)

/*
 * Table mapping page states to recovery actions.  Scanned in order by
 * __memory_failure(): the first entry where (flags & mask) == res wins,
 * so more specific states must precede more general ones, and the
 * mask==0 catch-all at the end always matches.  Note the clean entries
 * include the dirty bit in their mask (with res lacking it) so a dirty
 * page cannot fall through to a clean handler.
 */
static struct page_state {
	unsigned long mask;	/* which page flags to look at */
	unsigned long res;	/* required value of those flags */
	char *msg;		/* state name for log messages */
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved, reserved, "reserved kernel", me_ignore },
	{ buddy, buddy, "free kernel", me_free },

	{ slab, slab, "kernel slab", me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head, head, "huge", me_huge_page },
	{ tail, tail, "huge", me_huge_page },
#else
	{ compound, compound, "huge", me_huge_page },
#endif

	{ sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
	{ sc|dirty, sc, "swapcache", me_swapcache_clean },

	{ unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
	{ unevict, unevict, "unevictable LRU", me_pagecache_clean},

#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
	{ mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
	{ mlock, mlock, "mlocked LRU", me_pagecache_clean },
#endif

	{ lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
	{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
	{ swapbacked, swapbacked, "anonymous", me_pagecache_clean },

	/* Catch-all: must stay last. */
	{ 0, 0, "unknown page state", me_unknown },
};
599
600static void action_result(unsigned long pfn, char *msg, int result)
601{
602 struct page *page = NULL;
603 if (pfn_valid(pfn))
604 page = pfn_to_page(pfn);
605
606 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
607 pfn,
608 page && PageDirty(page) ? "dirty " : "",
609 msg, action_name[result]);
610}
611
/*
 * Run the handler for a matched page state, log the outcome, and
 * sanity-check the remaining reference count.
 *
 * @ref: extra page references the caller holds (subtracted from the
 *	 leak check below).
 * Returns 0 when the handler reported RECOVERED, -EBUSY otherwise.
 */
static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn, int ref)
{
	int result;
	int count;

	result = ps->action(p, pfn);
	action_result(pfn, ps->msg, result);

	/*
	 * One reference is the one taken in __memory_failure(); any
	 * count beyond that plus @ref means other users still hold the
	 * poisoned page.  This is only reported, not acted upon.
	 */
	count = page_count(p) - 1 - ref;
	if (count != 0)
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);

	return result == RECOVERED ? 0 : -EBUSY;
}
634
/* How often to retry try_to_unmap() before giving up. */
#define N_UNMAP_TRIES 5

/*
 * Unmap the corrupted page from all user processes that map it, and
 * signal/kill those processes according to the early-kill policy.
 * Kernel-owned and unmapped pages are left alone.
 */
static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
				int trapno)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int i;
	int kill = 1;

	/* Page types we do not attempt to unmap. */
	if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
		return;

	/* Not mapped by anyone: no users to unmap or signal. */
	if (!page_mapped(p))
		return;

	/*
	 * Swap cache pages are kept in the cache; TTU_IGNORE_HWPOISON
	 * makes try_to_unmap() treat them normally despite the poison
	 * bit.
	 */
	if (PageSwapCache(p)) {
		printk(KERN_ERR
		"MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * A clean, writeback-capable file page whose PTEs carry no
	 * dirty bits (page_mkclean() returns 0) can be dropped without
	 * data loss: no one needs to be signalled (kill = 0).  If
	 * page_mkclean() did find dirty PTEs, propagate that into
	 * PG_dirty so the page is treated as dirty below and by the
	 * state handlers.
	 */
	mapping = page_mapping(p);
	if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(p)) {
			SetPageDirty(p);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * Collect the tasks to signal BEFORE unmapping: the reverse-map
	 * walk needed to find them stops working once the page is
	 * unmapped.
	 */
	if (kill)
		collect_procs(p, &tokill);

	/*
	 * try_to_unmap() may fail transiently; retry a bounded number
	 * of times.
	 */
	for (i = 0; i < N_UNMAP_TRIES; i++) {
		ret = try_to_unmap(p, ttu);
		if (ret == SWAP_SUCCESS)
			break;
		pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
	}

	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(p));

	/*
	 * Dispose of the collected tasks.  Advisory signals are only
	 * sent for a dirty page (doit = PageDirty); if the unmap failed
	 * the tasks are force-killed instead (see kill_procs_ao()).
	 */
	kill_procs_ao(&tokill, !!PageDirty(p), trapno,
		      ret != SWAP_SUCCESS, pfn);
}
722
/*
 * Handle a hardware memory corruption on the page at @pfn.
 *
 * @trapno: trap number, passed through to signals sent to user tasks.
 * @ref:    extra page reference the caller holds, accounted for in the
 *          reference check in page_action().
 *
 * Returns 0 on success/ignore, negative errno when the page could not
 * be handled.  Panics instead when recovery is disabled via
 * sysctl_memory_failure_recovery.
 */
int __memory_failure(unsigned long pfn, int trapno, int ref)
{
	unsigned long lru_flag;
	struct page_state *ps;
	struct page *p;
	int res;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		action_result(pfn, "memory outside kernel control", IGNORED);
		return -EIO;
	}

	p = pfn_to_page(pfn);
	/* Atomically mark the page poisoned; bail if it already was. */
	if (TestSetPageHWPoison(p)) {
		action_result(pfn, "already hardware poisoned", IGNORED);
		return 0;
	}

	atomic_long_add(1, &mce_bad_pages);

	/*
	 * Take a reference so the page cannot be freed under us.  If
	 * the count was already zero the page is free (or part of a
	 * higher-order allocation): nothing more to do here, the
	 * poison bit set above marks the frame.
	 */
	if (!get_page_unless_zero(compound_head(p))) {
		action_result(pfn, "free or high order kernel", IGNORED);
		return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
	}

	/*
	 * Pull the page off the LRU so normal reclaim leaves it alone.
	 * Drain the per-cpu LRU pagevecs first in case the page is
	 * still sitting in one.  Remember the LRU bit in lru_flag:
	 * isolation clears it, but the state-table match below still
	 * needs it.
	 */
	if (!PageLRU(p))
		lru_add_drain_all();
	lru_flag = p->flags & lru;
	if (isolate_lru_page(p)) {
		action_result(pfn, "non LRU", IGNORED);
		put_page(p);
		return -EBUSY;
	}
	/*
	 * isolate_lru_page() took its own reference; drop one so we
	 * are back to holding a single reference.
	 */
	page_cache_release(p);

	/*
	 * Lock the page and wait for pending writeback: the page state
	 * must be stable before deciding how to handle it.
	 */
	lock_page_nosync(p);
	wait_on_page_writeback(p);

	/* Unmap from (and signal) all user processes first. */
	hwpoison_user_mappings(p, pfn, trapno);

	/*
	 * An LRU page no longer in the swap cache and without a mapping
	 * was already truncated; nothing further to do.
	 */
	if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = 0;
		goto out;
	}

	/*
	 * Match the page state against the table (first hit wins; the
	 * catch-all entry terminates the scan) and run its handler.
	 * The saved lru_flag is OR'ed back in for the comparison since
	 * isolation cleared PG_lru.
	 */
	res = -EBUSY;
	for (ps = error_states;; ps++) {
		if (((p->flags | lru_flag)& ps->mask) == ps->res) {
			res = page_action(ps, p, pfn, ref);
			break;
		}
	}
out:
	unlock_page(p);
	return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
/*
 * memory_failure - public entry point for handling a corrupted page
 * when the caller holds no extra page reference (ref = 0).
 */
void memory_failure(unsigned long pfn, int trapno)
{
	__memory_failure(pfn, trapno, 0);
}
836