1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38#include <linux/kernel.h>
39#include <linux/mm.h>
40#include <linux/page-flags.h>
41#include <linux/kernel-page-flags.h>
42#include <linux/sched.h>
43#include <linux/ksm.h>
44#include <linux/rmap.h>
45#include <linux/export.h>
46#include <linux/pagemap.h>
47#include <linux/swap.h>
48#include <linux/backing-dev.h>
49#include <linux/migrate.h>
50#include <linux/page-isolation.h>
51#include <linux/suspend.h>
52#include <linux/slab.h>
53#include <linux/swapops.h>
54#include <linux/hugetlb.h>
55#include <linux/memory_hotplug.h>
56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
58#include "internal.h"
59
/*
 * Default early-kill policy: task_early_kill() returns this value for
 * tasks that have not set PF_MCE_PROCESS themselves.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/* When zero, memory_failure() panics instead of attempting recovery. */
int sysctl_memory_failure_recovery __read_mostly = 1;

/* Running count of pages this file has marked hardware-poisoned. */
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67
/*
 * Knobs consulted by hwpoison_filter(). They are exported (GPL-only) so
 * code outside this file can set them -- presumably the hwpoison
 * injector module; confirm against its source.
 */

/* Master switch: when zero, hwpoison_filter() accepts every page. */
u32 hwpoison_filter_enable = 0;
/* Device major/minor to match; ~0U means "do not filter on this field". */
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
/*
 * A page passes when (stable_page_flags(p) & mask) == value.
 * A zero mask disables the flags check.
 */
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
78
79static int hwpoison_filter_dev(struct page *p)
80{
81 struct address_space *mapping;
82 dev_t dev;
83
84 if (hwpoison_filter_dev_major == ~0U &&
85 hwpoison_filter_dev_minor == ~0U)
86 return 0;
87
88
89
90
91 if (PageSlab(p))
92 return -EINVAL;
93
94 mapping = page_mapping(p);
95 if (mapping == NULL || mapping->host == NULL)
96 return -EINVAL;
97
98 dev = mapping->host->i_sb->s_dev;
99 if (hwpoison_filter_dev_major != ~0U &&
100 hwpoison_filter_dev_major != MAJOR(dev))
101 return -EINVAL;
102 if (hwpoison_filter_dev_minor != ~0U &&
103 hwpoison_filter_dev_minor != MINOR(dev))
104 return -EINVAL;
105
106 return 0;
107}
108
109static int hwpoison_filter_flags(struct page *p)
110{
111 if (!hwpoison_filter_flags_mask)
112 return 0;
113
114 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
115 hwpoison_filter_flags_value)
116 return 0;
117 else
118 return -EINVAL;
119}
120
121
122
123
124
125
126
127
128
129
130
#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/*
 * Inode number of the memory cgroup directory to match.
 * Zero disables the memcg filter.
 */
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);

/*
 * Memory-cgroup filter: accept @p only if it is charged to the cgroup
 * whose dentry inode number equals hwpoison_filter_memcg.
 *
 * Returns 0 when the page passes (or no memcg filter is set), else
 * -EINVAL.
 */
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);

	/*
	 * A cgroup without a dentry cannot match any inode number. Drop
	 * the css reference here too, exactly as the normal path below
	 * does, so it is not leaked on this early exit.
	 */
	if (!css->cgroup->dentry) {
		css_put(css);
		return -EINVAL;
	}

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
/* Without memcg swap accounting there is nothing to filter on. */
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif
163
164int hwpoison_filter(struct page *p)
165{
166 if (!hwpoison_filter_enable)
167 return 0;
168
169 if (hwpoison_filter_dev(p))
170 return -EINVAL;
171
172 if (hwpoison_filter_flags(p))
173 return -EINVAL;
174
175 if (hwpoison_filter_task(p))
176 return -EINVAL;
177
178 return 0;
179}
180#else
181int hwpoison_filter(struct page *p)
182{
183 return 0;
184}
185#endif
186
187EXPORT_SYMBOL_GPL(hwpoison_filter);
188
189
190
191
192
193
194static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
195 unsigned long pfn, struct page *page, int flags)
196{
197 struct siginfo si;
198 int ret;
199
200 printk(KERN_ERR
201 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
202 pfn, t->comm, t->pid);
203 si.si_signo = SIGBUS;
204 si.si_errno = 0;
205 si.si_addr = (void *)addr;
206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno;
208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
210
211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 si.si_code = BUS_MCEERR_AR;
213 ret = force_sig_info(SIGBUS, &si, t);
214 } else {
215
216
217
218
219
220
221 si.si_code = BUS_MCEERR_AO;
222 ret = send_sig_info(SIGBUS, &si, t);
223 }
224 if (ret < 0)
225 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
226 t->comm, t->pid, ret);
227 return ret;
228}
229
230
231
232
233
234void shake_page(struct page *p, int access)
235{
236 if (!PageSlab(p)) {
237 lru_add_drain_all();
238 if (PageLRU(p))
239 return;
240 drain_all_pages();
241 if (PageLRU(p) || is_free_buddy_page(p))
242 return;
243 }
244
245
246
247
248
249 if (access) {
250 int nr;
251 do {
252 struct shrink_control shrink = {
253 .gfp_mask = GFP_KERNEL,
254 };
255
256 nr = shrink_slab(&shrink, 1000, 1000);
257 if (page_count(p) == 1)
258 break;
259 } while (nr > 10);
260 }
261}
262EXPORT_SYMBOL_GPL(shake_page);
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/*
 * One entry per task that has the poisoned page mapped and should be
 * signalled. Entries are built by add_to_kill() and consumed (and
 * freed) by kill_procs().
 */
struct to_kill {
	struct list_head nd;		/* link in the caller's kill list */
	struct task_struct *tsk;	/* task to signal; a reference is held */
	unsigned long addr;		/* user address of the page in the task's vma */
	char addr_valid;		/* 0 when addr could not be determined */
};
292
293
294
295
296
297
298
299
300
301
302
303static void add_to_kill(struct task_struct *tsk, struct page *p,
304 struct vm_area_struct *vma,
305 struct list_head *to_kill,
306 struct to_kill **tkc)
307{
308 struct to_kill *tk;
309
310 if (*tkc) {
311 tk = *tkc;
312 *tkc = NULL;
313 } else {
314 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
315 if (!tk) {
316 printk(KERN_ERR
317 "MCE: Out of memory while machine check handling\n");
318 return;
319 }
320 }
321 tk->addr = page_address_in_vma(p, vma);
322 tk->addr_valid = 1;
323
324
325
326
327
328
329
330 if (tk->addr == -EFAULT) {
331 pr_info("MCE: Unable to find user space address %lx in %s\n",
332 page_to_pfn(p), tsk->comm);
333 tk->addr_valid = 0;
334 }
335 get_task_struct(tsk);
336 tk->tsk = tsk;
337 list_add_tail(&tk->nd, to_kill);
338}
339
340
341
342
343
344
345
346
347
348static void kill_procs(struct list_head *to_kill, int doit, int trapno,
349 int fail, struct page *page, unsigned long pfn,
350 int flags)
351{
352 struct to_kill *tk, *next;
353
354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) {
356
357
358
359
360
361 if (fail || tk->addr_valid == 0) {
362 printk(KERN_ERR
363 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
364 pfn, tk->tsk->comm, tk->tsk->pid);
365 force_sig(SIGKILL, tk->tsk);
366 }
367
368
369
370
371
372
373
374 else if (kill_proc(tk->tsk, tk->addr, trapno,
375 pfn, page, flags) < 0)
376 printk(KERN_ERR
377 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
378 pfn, tk->tsk->comm, tk->tsk->pid);
379 }
380 put_task_struct(tk->tsk);
381 kfree(tk);
382 }
383}
384
385static int task_early_kill(struct task_struct *tsk)
386{
387 if (!tsk->mm)
388 return 0;
389 if (tsk->flags & PF_MCE_PROCESS)
390 return !!(tsk->flags & PF_MCE_EARLY);
391 return sysctl_memory_failure_early_kill;
392}
393
394
395
396
397static void collect_procs_anon(struct page *page, struct list_head *to_kill,
398 struct to_kill **tkc)
399{
400 struct vm_area_struct *vma;
401 struct task_struct *tsk;
402 struct anon_vma *av;
403
404 av = page_lock_anon_vma(page);
405 if (av == NULL)
406 return;
407
408 read_lock(&tasklist_lock);
409 for_each_process (tsk) {
410 struct anon_vma_chain *vmac;
411
412 if (!task_early_kill(tsk))
413 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) {
415 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma))
417 continue;
418 if (vma->vm_mm == tsk->mm)
419 add_to_kill(tsk, page, vma, to_kill, tkc);
420 }
421 }
422 read_unlock(&tasklist_lock);
423 page_unlock_anon_vma(av);
424}
425
426
427
428
429static void collect_procs_file(struct page *page, struct list_head *to_kill,
430 struct to_kill **tkc)
431{
432 struct vm_area_struct *vma;
433 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping;
436
437 mutex_lock(&mapping->i_mmap_mutex);
438 read_lock(&tasklist_lock);
439 for_each_process(tsk) {
440 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
441
442 if (!task_early_kill(tsk))
443 continue;
444
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
446 pgoff) {
447
448
449
450
451
452
453
454 if (vma->vm_mm == tsk->mm)
455 add_to_kill(tsk, page, vma, to_kill, tkc);
456 }
457 }
458 read_unlock(&tasklist_lock);
459 mutex_unlock(&mapping->i_mmap_mutex);
460}
461
462
463
464
465
466
467
468static void collect_procs(struct page *page, struct list_head *tokill)
469{
470 struct to_kill *tk;
471
472 if (!page->mapping)
473 return;
474
475 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
476 if (!tk)
477 return;
478 if (PageAnon(page))
479 collect_procs_anon(page, tokill, &tk);
480 else
481 collect_procs_file(page, tokill, &tk);
482 kfree(tk);
483}
484
485
486
487
488
/*
 * Result codes returned by the per-page-state handlers (me_*).
 * page_action() treats RECOVERED and DELAYED as success.
 */
enum outcome {
	IGNORED,
	FAILED,
	DELAYED,
	RECOVERED,
};

/* Human-readable names for enum outcome, printed by action_result(). */
static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};
502
503
504
505
506
507
508
509static int delete_from_lru_cache(struct page *p)
510{
511 if (!isolate_lru_page(p)) {
512
513
514
515
516 ClearPageActive(p);
517 ClearPageUnevictable(p);
518
519
520
521 page_cache_release(p);
522 return 0;
523 }
524 return -EIO;
525}
526
527
528
529
530
531
/*
 * Handler for pages owned by the kernel (reserved and slab entries in
 * error_states): no recovery is attempted, the error is reported as
 * IGNORED.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}
536
537
538
539
/*
 * Catch-all handler for the final error_states entry: log that the page
 * state was not recognised and report FAILED.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}
545
546
547
548
549static int me_pagecache_clean(struct page *p, unsigned long pfn)
550{
551 int err;
552 int ret = FAILED;
553 struct address_space *mapping;
554
555 delete_from_lru_cache(p);
556
557
558
559
560
561 if (PageAnon(p))
562 return RECOVERED;
563
564
565
566
567
568
569
570
571 mapping = page_mapping(p);
572 if (!mapping) {
573
574
575
576 return FAILED;
577 }
578
579
580
581
582
583
584 if (mapping->a_ops->error_remove_page) {
585 err = mapping->a_ops->error_remove_page(mapping, p);
586 if (err != 0) {
587 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
588 pfn, err);
589 } else if (page_has_private(p) &&
590 !try_to_release_page(p, GFP_NOIO)) {
591 pr_info("MCE %#lx: failed to release buffers\n", pfn);
592 } else {
593 ret = RECOVERED;
594 }
595 } else {
596
597
598
599
600 if (invalidate_inode_page(p))
601 ret = RECOVERED;
602 else
603 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
604 pfn);
605 }
606 return ret;
607}
608
609
610
611
612
613
614static int me_pagecache_dirty(struct page *p, unsigned long pfn)
615{
616 struct address_space *mapping = page_mapping(p);
617
618 SetPageError(p);
619
620 if (mapping) {
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655 mapping_set_error(mapping, EIO);
656 }
657
658 return me_pagecache_clean(p, pfn);
659}
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680static int me_swapcache_dirty(struct page *p, unsigned long pfn)
681{
682 ClearPageDirty(p);
683
684 ClearPageUptodate(p);
685
686 if (!delete_from_lru_cache(p))
687 return DELAYED;
688 else
689 return FAILED;
690}
691
692static int me_swapcache_clean(struct page *p, unsigned long pfn)
693{
694 delete_from_swap_cache(p);
695
696 if (!delete_from_lru_cache(p))
697 return RECOVERED;
698 else
699 return FAILED;
700}
701
702
703
704
705
706
707
708static int me_huge_page(struct page *p, unsigned long pfn)
709{
710 int res = 0;
711 struct page *hpage = compound_head(p);
712
713
714
715
716
717
718
719
720
721
722 if (!(page_mapping(hpage) || PageAnon(hpage))) {
723 res = dequeue_hwpoisoned_huge_page(hpage);
724 if (!res)
725 return RECOVERED;
726 }
727 return DELAYED;
728}
729
730
731
732
733
734
735
736
737
738
739
740
741
742
/*
 * Short local aliases for page-flag bits, used only to keep the table
 * below readable. They are #undef'ed right after the table.
 * Note: writeback and swapbacked are defined but not referenced by any
 * table entry.
 */
#define dirty (1UL << PG_dirty)
#define sc (1UL << PG_swapcache)
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
#define tail (1UL << PG_tail)
#define compound (1UL << PG_compound)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)

/*
 * Page-state dispatch table.
 *
 * memory_failure() scans the entries in order and runs the action of
 * the first one for which (page->flags & mask) == res, so the ORDER OF
 * ENTRIES MATTERS: more specific states must come before more general
 * ones. The final { 0, 0 } entry matches every page and terminates the
 * scan.
 */
static struct page_state {
	unsigned long mask;	/* flag bits to examine */
	unsigned long res;	/* required value of those bits */
	char *msg;		/* state name used in log messages */
	int (*action)(struct page *p, unsigned long pfn); /* returns enum outcome */
} error_states[] = {
	{ reserved, reserved, "reserved kernel", me_kernel },

	{ slab, slab, "kernel slab", me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head, head, "huge", me_huge_page },
	{ tail, tail, "huge", me_huge_page },
#else
	{ compound, compound, "huge", me_huge_page },
#endif

	{ sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
	{ sc|dirty, sc, "swapcache", me_swapcache_clean },

	{ unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
	{ unevict, unevict, "unevictable LRU", me_pagecache_clean},

	{ mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
	{ mlock, mlock, "mlocked LRU", me_pagecache_clean },

	{ lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
	{ lru|dirty, lru, "clean LRU", me_pagecache_clean },

	/* Catch-all: must stay last. */
	{ 0, 0, "unknown page state", me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved
812
813static void action_result(unsigned long pfn, char *msg, int result)
814{
815 struct page *page = pfn_to_page(pfn);
816
817 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
818 pfn,
819 PageDirty(page) ? "dirty " : "",
820 msg, action_name[result]);
821}
822
823static int page_action(struct page_state *ps, struct page *p,
824 unsigned long pfn)
825{
826 int result;
827 int count;
828
829 result = ps->action(p, pfn);
830 action_result(pfn, ps->msg, result);
831
832 count = page_count(p) - 1;
833 if (ps->action == me_swapcache_dirty && result == DELAYED)
834 count--;
835 if (count != 0) {
836 printk(KERN_ERR
837 "MCE %#lx: %s page still referenced by %d users\n",
838 pfn, ps->msg, count);
839 result = FAILED;
840 }
841
842
843
844
845
846
847 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
848}
849
850
851
852
853
854static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
855 int trapno, int flags)
856{
857 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
858 struct address_space *mapping;
859 LIST_HEAD(tokill);
860 int ret;
861 int kill = 1;
862 struct page *hpage = compound_head(p);
863 struct page *ppage;
864
865 if (PageReserved(p) || PageSlab(p))
866 return SWAP_SUCCESS;
867
868
869
870
871
872 if (!page_mapped(hpage))
873 return SWAP_SUCCESS;
874
875 if (PageKsm(p))
876 return SWAP_FAIL;
877
878 if (PageSwapCache(p)) {
879 printk(KERN_ERR
880 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
881 ttu |= TTU_IGNORE_HWPOISON;
882 }
883
884
885
886
887
888
889
890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage);
895 } else {
896 kill = 0;
897 ttu |= TTU_IGNORE_HWPOISON;
898 printk(KERN_INFO
899 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
900 pfn);
901 }
902 }
903
904
905
906
907
908
909
910 ppage = hpage;
911
912 if (PageTransHuge(hpage)) {
913
914
915
916
917
918
919
920
921
922
923 if (!PageHuge(hpage) && PageAnon(hpage)) {
924 if (unlikely(split_huge_page(hpage))) {
925
926
927
928
929
930
931 printk(KERN_INFO
932 "MCE %#lx: failed to split THP\n", pfn);
933
934 BUG_ON(!PageHWPoison(p));
935 return SWAP_FAIL;
936 }
937
938 ppage = p;
939 }
940 }
941
942
943
944
945
946
947
948
949
950 if (kill)
951 collect_procs(ppage, &tokill);
952
953 if (hpage != ppage)
954 lock_page(ppage);
955
956 ret = try_to_unmap(ppage, ttu);
957 if (ret != SWAP_SUCCESS)
958 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
959 pfn, page_mapcount(ppage));
960
961 if (hpage != ppage)
962 unlock_page(ppage);
963
964
965
966
967
968
969
970
971
972
973 kill_procs(&tokill, !!PageDirty(ppage), trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags);
975
976 return ret;
977}
978
979static void set_page_hwpoison_huge_page(struct page *hpage)
980{
981 int i;
982 int nr_pages = 1 << compound_trans_order(hpage);
983 for (i = 0; i < nr_pages; i++)
984 SetPageHWPoison(hpage + i);
985}
986
987static void clear_page_hwpoison_huge_page(struct page *hpage)
988{
989 int i;
990 int nr_pages = 1 << compound_trans_order(hpage);
991 for (i = 0; i < nr_pages; i++)
992 ClearPageHWPoison(hpage + i);
993}
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013int memory_failure(unsigned long pfn, int trapno, int flags)
1014{
1015 struct page_state *ps;
1016 struct page *p;
1017 struct page *hpage;
1018 int res;
1019 unsigned int nr_pages;
1020
1021 if (!sysctl_memory_failure_recovery)
1022 panic("Memory failure from trap %d on page %lx", trapno, pfn);
1023
1024 if (!pfn_valid(pfn)) {
1025 printk(KERN_ERR
1026 "MCE %#lx: memory outside kernel control\n",
1027 pfn);
1028 return -ENXIO;
1029 }
1030
1031 p = pfn_to_page(pfn);
1032 hpage = compound_head(p);
1033 if (TestSetPageHWPoison(p)) {
1034 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1035 return 0;
1036 }
1037
1038 nr_pages = 1 << compound_trans_order(hpage);
1039 atomic_long_add(nr_pages, &mce_bad_pages);
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055 if (!(flags & MF_COUNT_INCREASED) &&
1056 !get_page_unless_zero(hpage)) {
1057 if (is_free_buddy_page(p)) {
1058 action_result(pfn, "free buddy", DELAYED);
1059 return 0;
1060 } else if (PageHuge(hpage)) {
1061
1062
1063
1064
1065 lock_page(hpage);
1066 if (!PageHWPoison(hpage)
1067 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1068 || (p != hpage && TestSetPageHWPoison(hpage))) {
1069 atomic_long_sub(nr_pages, &mce_bad_pages);
1070 return 0;
1071 }
1072 set_page_hwpoison_huge_page(hpage);
1073 res = dequeue_hwpoisoned_huge_page(hpage);
1074 action_result(pfn, "free huge",
1075 res ? IGNORED : DELAYED);
1076 unlock_page(hpage);
1077 return res;
1078 } else {
1079 action_result(pfn, "high order kernel", IGNORED);
1080 return -EBUSY;
1081 }
1082 }
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092 if (!PageHuge(p) && !PageTransTail(p)) {
1093 if (!PageLRU(p))
1094 shake_page(p, 0);
1095 if (!PageLRU(p)) {
1096
1097
1098
1099 if (is_free_buddy_page(p)) {
1100 action_result(pfn, "free buddy, 2nd try",
1101 DELAYED);
1102 return 0;
1103 }
1104 action_result(pfn, "non LRU", IGNORED);
1105 put_page(p);
1106 return -EBUSY;
1107 }
1108 }
1109
1110
1111
1112
1113
1114
1115 lock_page(hpage);
1116
1117
1118
1119
1120 if (!PageHWPoison(p)) {
1121 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1122 res = 0;
1123 goto out;
1124 }
1125 if (hwpoison_filter(p)) {
1126 if (TestClearPageHWPoison(p))
1127 atomic_long_sub(nr_pages, &mce_bad_pages);
1128 unlock_page(hpage);
1129 put_page(hpage);
1130 return 0;
1131 }
1132
1133
1134
1135
1136
1137 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1138 action_result(pfn, "hugepage already hardware poisoned",
1139 IGNORED);
1140 unlock_page(hpage);
1141 put_page(hpage);
1142 return 0;
1143 }
1144
1145
1146
1147
1148
1149
1150 if (PageHuge(p))
1151 set_page_hwpoison_huge_page(hpage);
1152
1153 wait_on_page_writeback(p);
1154
1155
1156
1157
1158
1159 if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
1160 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1161 res = -EBUSY;
1162 goto out;
1163 }
1164
1165
1166
1167
1168 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1169 action_result(pfn, "already truncated LRU", IGNORED);
1170 res = -EBUSY;
1171 goto out;
1172 }
1173
1174 res = -EBUSY;
1175 for (ps = error_states;; ps++) {
1176 if ((p->flags & ps->mask) == ps->res) {
1177 res = page_action(ps, p, pfn);
1178 break;
1179 }
1180 }
1181out:
1182 unlock_page(hpage);
1183 return res;
1184}
1185EXPORT_SYMBOL_GPL(memory_failure);
1186
/* Each per-CPU queue holds 1 << MEMORY_FAILURE_FIFO_ORDER entries. */
#define MEMORY_FAILURE_FIFO_ORDER 4
#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)

/* One queued request: the arguments of a deferred memory_failure() call. */
struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

/*
 * Per-CPU state for deferred error handling: a fixed-size FIFO of
 * pending requests, the spinlock protecting it, and the work item that
 * drains it (see memory_failure_work_func()).
 */
struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1223{
1224 struct memory_failure_cpu *mf_cpu;
1225 unsigned long proc_flags;
1226 struct memory_failure_entry entry = {
1227 .pfn = pfn,
1228 .trapno = trapno,
1229 .flags = flags,
1230 };
1231
1232 mf_cpu = &get_cpu_var(memory_failure_cpu);
1233 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1234 if (kfifo_put(&mf_cpu->fifo, &entry))
1235 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1236 else
1237 pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
1238 pfn);
1239 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1240 put_cpu_var(memory_failure_cpu);
1241}
1242EXPORT_SYMBOL_GPL(memory_failure_queue);
1243
1244static void memory_failure_work_func(struct work_struct *work)
1245{
1246 struct memory_failure_cpu *mf_cpu;
1247 struct memory_failure_entry entry = { 0, };
1248 unsigned long proc_flags;
1249 int gotten;
1250
1251 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1252 for (;;) {
1253 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1254 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1255 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1256 if (!gotten)
1257 break;
1258 memory_failure(entry.pfn, entry.trapno, entry.flags);
1259 }
1260}
1261
1262static int __init memory_failure_init(void)
1263{
1264 struct memory_failure_cpu *mf_cpu;
1265 int cpu;
1266
1267 for_each_possible_cpu(cpu) {
1268 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1269 spin_lock_init(&mf_cpu->lock);
1270 INIT_KFIFO(mf_cpu->fifo);
1271 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1272 }
1273
1274 return 0;
1275}
1276core_initcall(memory_failure_init);
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290int unpoison_memory(unsigned long pfn)
1291{
1292 struct page *page;
1293 struct page *p;
1294 int freeit = 0;
1295 unsigned int nr_pages;
1296
1297 if (!pfn_valid(pfn))
1298 return -ENXIO;
1299
1300 p = pfn_to_page(pfn);
1301 page = compound_head(p);
1302
1303 if (!PageHWPoison(p)) {
1304 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1305 return 0;
1306 }
1307
1308 nr_pages = 1 << compound_trans_order(page);
1309
1310 if (!get_page_unless_zero(page)) {
1311
1312
1313
1314
1315
1316
1317 if (PageHuge(page)) {
1318 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1319 return 0;
1320 }
1321 if (TestClearPageHWPoison(p))
1322 atomic_long_sub(nr_pages, &mce_bad_pages);
1323 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1324 return 0;
1325 }
1326
1327 lock_page(page);
1328
1329
1330
1331
1332
1333
1334 if (TestClearPageHWPoison(page)) {
1335 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1336 atomic_long_sub(nr_pages, &mce_bad_pages);
1337 freeit = 1;
1338 if (PageHuge(page))
1339 clear_page_hwpoison_huge_page(page);
1340 }
1341 unlock_page(page);
1342
1343 put_page(page);
1344 if (freeit)
1345 put_page(page);
1346
1347 return 0;
1348}
1349EXPORT_SYMBOL(unpoison_memory);
1350
1351static struct page *new_page(struct page *p, unsigned long private, int **x)
1352{
1353 int nid = page_to_nid(p);
1354 if (PageHuge(p))
1355 return alloc_huge_page_node(page_hstate(compound_head(p)),
1356 nid);
1357 else
1358 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1359}
1360
1361
1362
1363
1364
1365
1366
1367static int get_any_page(struct page *p, unsigned long pfn, int flags)
1368{
1369 int ret;
1370
1371 if (flags & MF_COUNT_INCREASED)
1372 return 1;
1373
1374
1375
1376
1377
1378 lock_memory_hotplug();
1379
1380
1381
1382
1383
1384 set_migratetype_isolate(p);
1385
1386
1387
1388
1389 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1395
1396 SetPageHWPoison(p);
1397 ret = 0;
1398 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags);
1401 ret = -EIO;
1402 }
1403 } else {
1404
1405 ret = 1;
1406 }
1407 unset_migratetype_isolate(p);
1408 unlock_memory_hotplug();
1409 return ret;
1410}
1411
1412static int soft_offline_huge_page(struct page *page, int flags)
1413{
1414 int ret;
1415 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418
1419 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0)
1421 return ret;
1422 if (ret == 0)
1423 goto done;
1424
1425 if (PageHWPoison(hpage)) {
1426 put_page(hpage);
1427 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1428 return -EBUSY;
1429 }
1430
1431
1432
1433 list_add(&hpage->lru, &pagelist);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1435 true);
1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret;
1446 }
1447done:
1448 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage);
1452
1453 return ret;
1454}
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478int soft_offline_page(struct page *page, int flags)
1479{
1480 int ret;
1481 unsigned long pfn = page_to_pfn(page);
1482
1483 if (PageHuge(page))
1484 return soft_offline_huge_page(page, flags);
1485
1486 ret = get_any_page(page, pfn, flags);
1487 if (ret < 0)
1488 return ret;
1489 if (ret == 0)
1490 goto done;
1491
1492
1493
1494
1495 if (!PageLRU(page)) {
1496
1497
1498
1499 put_page(page);
1500 shake_page(page, 1);
1501
1502
1503
1504
1505 ret = get_any_page(page, pfn, 0);
1506 if (ret < 0)
1507 return ret;
1508 if (ret == 0)
1509 goto done;
1510 }
1511 if (!PageLRU(page)) {
1512 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1513 pfn, page->flags);
1514 return -EIO;
1515 }
1516
1517 lock_page(page);
1518 wait_on_page_writeback(page);
1519
1520
1521
1522
1523 if (PageHWPoison(page)) {
1524 unlock_page(page);
1525 put_page(page);
1526 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1527 return -EBUSY;
1528 }
1529
1530
1531
1532
1533
1534 ret = invalidate_inode_page(page);
1535 unlock_page(page);
1536
1537
1538
1539
1540 if (ret == 1) {
1541 put_page(page);
1542 ret = 0;
1543 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1544 goto done;
1545 }
1546
1547
1548
1549
1550
1551
1552 ret = isolate_lru_page(page);
1553
1554
1555
1556
1557 put_page(page);
1558 if (!ret) {
1559 LIST_HEAD(pagelist);
1560 inc_zone_page_state(page, NR_ISOLATED_ANON +
1561 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC);
1565 if (ret) {
1566 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1568 pfn, ret, page->flags);
1569 if (ret > 0)
1570 ret = -EIO;
1571 }
1572 } else {
1573 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1574 pfn, ret, page_count(page), page->flags);
1575 }
1576 if (ret)
1577 return ret;
1578
1579done:
1580 atomic_long_add(1, &mce_bad_pages);
1581 SetPageHWPoison(page);
1582
1583 return ret;
1584}
1585