/*
 * High level memory failure ("hardware poison") handling: recovery for
 * pages that the hardware has reported as corrupted, e.g. by multi-bit
 * ECC errors.  This includes unmapping the page, signalling or killing
 * affected processes, and soft-offlining of pages.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"
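
/*
 * sysctl_memory_failure_early_kill: if set, processes that map a
 * corrupted page are signalled as soon as the corruption is detected;
 * otherwise they are only killed when they actually consume the data.
 *
 * sysctl_memory_failure_recovery: if clear, any memory failure causes a
 * panic instead of an attempted recovery.
 */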
int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

/*
 * The filter knobs below narrow hwpoison handling to pages on a given
 * device, with given page flags, or in a given memcg.  They exist for
 * fault-injection testing and are only compiled in when the hwpoison
 * injector is enabled.
 */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

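/*
 * Filter by block device: when a major/minor pair is configured, only
 * pages backed by that device are poisoned.  Slab pages and pages
 * without a host inode cannot be matched and are skipped.
 */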
79static int hwpoison_filter_dev(struct page *p)
80{
81 struct address_space *mapping;
82 dev_t dev;
83
84 if (hwpoison_filter_dev_major == ~0U &&
85 hwpoison_filter_dev_minor == ~0U)
86 return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
91 if (PageSlab(p))
92 return -EINVAL;
93
94 mapping = page_mapping(p);
95 if (mapping == NULL || mapping->host == NULL)
96 return -EINVAL;
97
98 dev = mapping->host->i_sb->s_dev;
99 if (hwpoison_filter_dev_major != ~0U &&
100 hwpoison_filter_dev_major != MAJOR(dev))
101 return -EINVAL;
102 if (hwpoison_filter_dev_minor != ~0U &&
103 hwpoison_filter_dev_minor != MINOR(dev))
104 return -EINVAL;
105
106 return 0;
107}
108
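/*
 * Filter by page flags: only poison pages whose stable_page_flags()
 * bits selected by the mask match the configured value.
 */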
109static int hwpoison_filter_flags(struct page *p)
110{
111 if (!hwpoison_filter_flags_mask)
112 return 0;
113
114 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
115 hwpoison_filter_flags_value)
116 return 0;
117 else
118 return -EINVAL;
119}

/*
 * When hwpoison_filter_memcg is set, poisoning is restricted to pages
 * charged to the memory cgroup whose inode number matches the filter.
 * This makes it practical to stress-test hwpoison against a single
 * workload without affecting the rest of the system.
 */
131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p)
135{
136 struct mem_cgroup *mem;
137 struct cgroup_subsys_state *css;
138 unsigned long ino;
139
140 if (!hwpoison_filter_memcg)
141 return 0;
142
143 mem = try_get_mem_cgroup_from_page(p);
144 if (!mem)
145 return -EINVAL;
146
147 css = mem_cgroup_css(mem);
148 ino = cgroup_ino(css->cgroup);
149 css_put(css);
150
151 if (ino != hwpoison_filter_memcg)
152 return -EINVAL;
153
154 return 0;
155}
156#else
157static int hwpoison_filter_task(struct page *p) { return 0; }
158#endif
159
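/*
 * hwpoison_filter - decide whether a page should really be poisoned.
 * Returns 0 to proceed, -EINVAL if one of the enabled filters rejects
 * the page.
 */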
160int hwpoison_filter(struct page *p)
161{
162 if (!hwpoison_filter_enable)
163 return 0;
164
165 if (hwpoison_filter_dev(p))
166 return -EINVAL;
167
168 if (hwpoison_filter_flags(p))
169 return -EINVAL;
170
171 if (hwpoison_filter_task(p))
172 return -EINVAL;
173
174 return 0;
175}
176#else
177int hwpoison_filter(struct page *p)
178{
179 return 0;
180}
181#endif
182
183EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send a SIGBUS signal to the task that has the corrupted page mapped.
 * For action-required errors hitting the current task the signal is
 * forced (BUS_MCEERR_AR) so it cannot be blocked; otherwise an advisory
 * BUS_MCEERR_AO signal is queued.
 */
190static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
191 unsigned long pfn, struct page *page, int flags)
192{
193 struct siginfo si;
194 int ret;
195
196 printk(KERN_ERR
197 "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
198 pfn, t->comm, t->pid);
199 si.si_signo = SIGBUS;
200 si.si_errno = 0;
201 si.si_addr = (void *)addr;
202#ifdef __ARCH_SI_TRAPNO
203 si.si_trapno = trapno;
204#endif
205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
206
207 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
208 si.si_code = BUS_MCEERR_AR;
209 ret = force_sig_info(SIGBUS, &si, current);
210 } else {
		/*
		 * Advisory (action optional) case: send a normal,
		 * catchable SIGBUS.  The task did not necessarily
		 * trigger the error itself and may handle or ignore
		 * the signal.
		 */
217 si.si_code = BUS_MCEERR_AO;
218 ret = send_sig_info(SIGBUS, &si, t);
219 }
220 if (ret < 0)
221 printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
222 t->comm, t->pid, ret);
223 return ret;
224}

/*
 * When an unknown page type is encountered, drain as many buffers as
 * possible in the hope of turning the page into an LRU or free page,
 * which we can then handle.
 */
230void shake_page(struct page *p, int access)
231{
232 if (!PageSlab(p)) {
233 lru_add_drain_all();
234 if (PageLRU(p))
235 return;
236 drain_all_pages(page_zone(p));
237 if (PageLRU(p) || is_free_buddy_page(p))
238 return;
239 }

	/*
	 * When the page is going to be accessed (access != 0), also
	 * shrink the slab caches on the page's node until the page
	 * becomes free or progress stalls.
	 */
245 if (access) {
246 int nr;
247 int nid = page_to_nid(p);
248 do {
249 nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
250 if (page_count(p) == 1)
251 break;
252 } while (nr > 10);
253 }
254}
255EXPORT_SYMBOL_GPL(shake_page);

/*
 * Machinery to kill the processes that have the corrupted page mapped.
 *
 * The error is signalled with SIGBUS, with si_addr and si_addr_lsb set
 * to the corrupted virtual address and the mapping granularity, so that
 * recovery aware applications can react in a fine grained way.  Tasks
 * for which no usable address could be found, or whose mappings could
 * not be torn down, are killed with SIGKILL instead, to keep them from
 * consuming corrupted data.
 *
 * The tasks to kill are collected into a list of struct to_kill while
 * the page is still mapped; the signals are delivered after the
 * mappings have been removed.
 */
279struct to_kill {
280 struct list_head nd;
281 struct task_struct *tsk;
282 unsigned long addr;
283 char addr_valid;
284};

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM;
 * the caller pre-allocates one entry (*tkc) so that at least one
 * process can be recorded even if the allocation here fails.
 */
296static void add_to_kill(struct task_struct *tsk, struct page *p,
297 struct vm_area_struct *vma,
298 struct list_head *to_kill,
299 struct to_kill **tkc)
300{
301 struct to_kill *tk;
302
303 if (*tkc) {
304 tk = *tkc;
305 *tkc = NULL;
306 } else {
307 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
308 if (!tk) {
309 printk(KERN_ERR
310 "MCE: Out of memory while machine check handling\n");
311 return;
312 }
313 }
314 tk->addr = page_address_in_vma(p, vma);
315 tk->addr_valid = 1;

	/*
	 * page_address_in_vma() can fail, e.g. when the page has
	 * already been unmapped.  Remember that, so that kill_procs()
	 * falls back to SIGKILL instead of sending a SIGBUS with a
	 * useless address when killing is required.
	 */
323 if (tk->addr == -EFAULT) {
324 pr_info("MCE: Unable to find user space address %lx in %s\n",
325 page_to_pfn(p), tsk->comm);
326 tk->addr_valid = 0;
327 }
328 get_task_struct(tsk);
329 tk->tsk = tsk;
330 list_add_tail(&tk->nd, to_kill);
331}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when @forcekill is set; otherwise just free the
 * list (clean pages do not require killing anyone).  When unmapping
 * failed (@fail) or no usable fault address was found, fall back to
 * SIGKILL because the process could touch the corrupted data later.
 */
341static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
342 int fail, struct page *page, unsigned long pfn,
343 int flags)
344{
345 struct to_kill *tk, *next;
346
347 list_for_each_entry_safe (tk, next, to_kill, nd) {
348 if (forcekill) {
349
350
351
352
353
354 if (fail || tk->addr_valid == 0) {
355 printk(KERN_ERR
356 "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
357 pfn, tk->tsk->comm, tk->tsk->pid);
358 force_sig(SIGKILL, tk->tsk);
359 }
360
361
362
363
364
365
366
367 else if (kill_proc(tk->tsk, tk->addr, trapno,
368 pfn, page, flags) < 0)
369 printk(KERN_ERR
370 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
371 pfn, tk->tsk->comm, tk->tsk->pid);
372 }
373 put_task_struct(tk->tsk);
374 kfree(tk);
375 }
376}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group.  Returns the first thread that opted in
 * via PF_MCE_PROCESS | PF_MCE_EARLY, or NULL if there is none.
 */
386static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
387{
388 struct task_struct *t;
389
390 for_each_thread(tsk, t)
391 if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
392 return t;
393 return NULL;
394}

/*
 * Determine whether a given process should be signalled early.  Returns
 * the task/thread to signal for an early (advisory) kill, or NULL if the
 * process should only be killed when it actually consumes the corrupted
 * data.
 */
402static struct task_struct *task_early_kill(struct task_struct *tsk,
403 int force_early)
404{
405 struct task_struct *t;
406 if (!tsk->mm)
407 return NULL;
408 if (force_early)
409 return tsk;
410 t = find_early_kill_thread(tsk);
411 if (t)
412 return t;
413 if (sysctl_memory_failure_early_kill)
414 return tsk;
415 return NULL;
416}

/*
 * Collect processes when the error hit an anonymous page.
 */
421static void collect_procs_anon(struct page *page, struct list_head *to_kill,
422 struct to_kill **tkc, int force_early)
423{
424 struct vm_area_struct *vma;
425 struct task_struct *tsk;
426 struct anon_vma *av;
427 pgoff_t pgoff;
428
429 av = page_lock_anon_vma_read(page);
430 if (av == NULL)
431 return;
432
433 pgoff = page_to_pgoff(page);
434 read_lock(&tasklist_lock);
435 for_each_process (tsk) {
436 struct anon_vma_chain *vmac;
437 struct task_struct *t = task_early_kill(tsk, force_early);
438
439 if (!t)
440 continue;
441 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
442 pgoff, pgoff) {
443 vma = vmac->vma;
444 if (!page_mapped_in_vma(page, vma))
445 continue;
446 if (vma->vm_mm == t->mm)
447 add_to_kill(t, page, vma, to_kill, tkc);
448 }
449 }
450 read_unlock(&tasklist_lock);
451 page_unlock_anon_vma_read(av);
452}

/*
 * Collect processes when the error hit a file mapped page.
 */
457static void collect_procs_file(struct page *page, struct list_head *to_kill,
458 struct to_kill **tkc, int force_early)
459{
460 struct vm_area_struct *vma;
461 struct task_struct *tsk;
462 struct address_space *mapping = page->mapping;
463
464 i_mmap_lock_read(mapping);
465 read_lock(&tasklist_lock);
466 for_each_process(tsk) {
467 pgoff_t pgoff = page_to_pgoff(page);
468 struct task_struct *t = task_early_kill(tsk, force_early);
469
470 if (!t)
471 continue;
472 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
473 pgoff) {
			/*
			 * Send an early kill signal to tasks whose vma
			 * covers the page even if the page is not
			 * necessarily mapped (the pte may be absent);
			 * applications that asked for early kill still
			 * want to be informed.
			 */
481 if (vma->vm_mm == t->mm)
482 add_to_kill(t, page, vma, to_kill, tkc);
483 }
484 }
485 read_unlock(&tasklist_lock);
486 i_mmap_unlock_read(mapping);
487}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons: one to_kill entry is
 * pre-allocated outside the locks so that at least one process can be
 * killed even if further allocations fail.
 */
495static void collect_procs(struct page *page, struct list_head *tokill,
496 int force_early)
497{
498 struct to_kill *tk;
499
500 if (!page->mapping)
501 return;
502
503 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
504 if (!tk)
505 return;
506 if (PageAnon(page))
507 collect_procs_anon(page, tokill, &tk, force_early);
508 else
509 collect_procs_file(page, tokill, &tk, force_early);
510 kfree(tk);
511}

/*
 * Error handlers for various types of pages.
 */
517enum outcome {
518 IGNORED,
519 FAILED,
520 DELAYED,
521 RECOVERED,
522};
523
524static const char *action_name[] = {
525 [IGNORED] = "Ignored",
526 [FAILED] = "Failed",
527 [DELAYED] = "Delayed",
528 [RECOVERED] = "Recovered",
529};

/*
 * Try to pull the page off the LRU so that it cannot be reclaimed or
 * reused through the normal paths while the error is being handled.
 * Returns 0 on success, -EIO if the page was not on the LRU.
 */
537static int delete_from_lru_cache(struct page *p)
538{
539 if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system
		 * does not complain when the page is freed later.
		 */
544 ClearPageActive(p);
545 ClearPageUnevictable(p);

		/*
		 * Drop the reference taken by isolate_lru_page().
		 */
549 page_cache_release(p);
550 return 0;
551 }
552 return -EIO;
553}

/*
 * Error hit a kernel page.  Do nothing and hope the kernel never
 * touches the corrupted data; for a few cases we could be smarter.
 */
560static int me_kernel(struct page *p, unsigned long pfn)
561{
562 return IGNORED;
563}

/*
 * Page in an unknown state.  Nothing we can do but report failure.
 */
568static int me_unknown(struct page *p, unsigned long pfn)
569{
570 printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
571 return FAILED;
572}

/*
 * Clean (or cleaned) page cache page.
 */
577static int me_pagecache_clean(struct page *p, unsigned long pfn)
578{
579 int err;
580 int ret = FAILED;
581 struct address_space *mapping;
582
583 delete_from_lru_cache(p);
584
585
586
587
588
589 if (PageAnon(p))
590 return RECOVERED;
591
592
593
594
595
596
597
598
599 mapping = page_mapping(p);
600 if (!mapping) {
601
602
603
604 return FAILED;
605 }
606
607
608
609
610
611
612 if (mapping->a_ops->error_remove_page) {
613 err = mapping->a_ops->error_remove_page(mapping, p);
614 if (err != 0) {
615 printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
616 pfn, err);
617 } else if (page_has_private(p) &&
618 !try_to_release_page(p, GFP_NOIO)) {
619 pr_info("MCE %#lx: failed to release buffers\n", pfn);
620 } else {
621 ret = RECOVERED;
622 }
623 } else {
624
625
626
627
628 if (invalidate_inode_page(p))
629 ret = RECOVERED;
630 else
631 printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
632 pfn);
633 }
634 return ret;
635}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
642static int me_pagecache_dirty(struct page *p, unsigned long pfn)
643{
644 struct address_space *mapping = page_mapping(p);
645
646 SetPageError(p);
647
648 if (mapping) {
		/*
		 * The dirty data has been lost: report the error to the
		 * owner of the mapping so that a later fsync()/close()
		 * returns -EIO, the same way a failed writeback would.
		 *
		 * This is imperfect: the application may never call
		 * fsync() and so never learn about the loss, and the
		 * mapping error flag is consumed by the first reporter.
		 * Doing better would require a persistent per-mapping
		 * error state.
		 *
		 * After flagging the error the page is handled like a
		 * clean page cache page, i.e. truncated from the mapping.
		 */
683 mapping_set_error(mapping, EIO);
684 }
685
686 return me_pagecache_clean(p, pfn);
687}

/*
 * Clean and dirty swap cache.
 *
 * A dirty swap cache page may be referenced through both normal and swap
 * PTEs; the unmap step converts the normal PTEs to swap PTEs.  Here we
 * clear the dirty bit to prevent writeback and drop the page from the
 * LRU, but keep it in the swap cache so that a later fault on it hits
 * the poisoned entry and the accessing task can be killed, instead of
 * silently reading corrupted data.
 *
 * A clean swap cache page can simply be dropped: a later fault brings
 * the known-good data back in from swap.
 */
708static int me_swapcache_dirty(struct page *p, unsigned long pfn)
709{
710 ClearPageDirty(p);
711
712 ClearPageUptodate(p);
713
714 if (!delete_from_lru_cache(p))
715 return DELAYED;
716 else
717 return FAILED;
718}
719
720static int me_swapcache_clean(struct page *p, unsigned long pfn)
721{
722 delete_from_swap_cache(p);
723
724 if (!delete_from_lru_cache(p))
725 return RECOVERED;
726 else
727 return FAILED;
728}

/*
 * Huge pages: the error is accounted against the whole huge page, so
 * the entire huge page is taken out of use.
 */
736static int me_huge_page(struct page *p, unsigned long pfn)
737{
738 int res = 0;
739 struct page *hpage = compound_head(p);

	/*
	 * A free (or reserved, i.e. not in-use) hugepage has neither a
	 * mapping nor is anonymous.  Such a page can be recovered simply
	 * by dequeuing it from the hugepage freelist so it is never
	 * allocated again.  In-use hugepages are left alone for now
	 * (DELAYED); containing the error further needs more work.
	 */
750 if (!(page_mapping(hpage) || PageAnon(hpage))) {
751 res = dequeue_hwpoisoned_huge_page(hpage);
752 if (!res)
753 return RECOVERED;
754 }
755 return DELAYED;
756}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.  The table
 * below is scanned in order and the first entry whose mask/result pair
 * matches decides how the page is handled, so more specific states must
 * come before more general ones.  The last, catch-all entry matches
 * everything that fell through.
 */
771#define dirty (1UL << PG_dirty)
772#define sc (1UL << PG_swapcache)
773#define unevict (1UL << PG_unevictable)
774#define mlock (1UL << PG_mlocked)
775#define writeback (1UL << PG_writeback)
776#define lru (1UL << PG_lru)
777#define swapbacked (1UL << PG_swapbacked)
778#define head (1UL << PG_head)
779#define tail (1UL << PG_tail)
780#define compound (1UL << PG_compound)
781#define slab (1UL << PG_slab)
782#define reserved (1UL << PG_reserved)
783
784static struct page_state {
785 unsigned long mask;
786 unsigned long res;
787 char *msg;
788 int (*action)(struct page *p, unsigned long pfn);
789} error_states[] = {
790 { reserved, reserved, "reserved kernel", me_kernel },

	/*
	 * Free buddy pages are detected separately (is_free_buddy_page())
	 * before we ever get here, so they are not listed in this table.
	 *
	 * Slab pages could in theory be checked for being free or for
	 * droppable unused objects, but are simply treated as kernel
	 * pages for now.
	 */
801 { slab, slab, "kernel slab", me_kernel },
802
803#ifdef CONFIG_PAGEFLAGS_EXTENDED
804 { head, head, "huge", me_huge_page },
805 { tail, tail, "huge", me_huge_page },
806#else
807 { compound, compound, "huge", me_huge_page },
808#endif
809
810 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
811 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
812
813 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
814 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
815
816 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
817 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
818
819 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
820 { lru|dirty, lru, "clean LRU", me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
825 { 0, 0, "unknown page state", me_unknown },
826};
827
828#undef dirty
829#undef sc
830#undef unevict
831#undef mlock
832#undef writeback
833#undef lru
834#undef swapbacked
835#undef head
836#undef tail
837#undef compound
838#undef slab
839#undef reserved

/*
 * Report the action taken on a corrupted page and its outcome.
 */
845static void action_result(unsigned long pfn, char *msg, int result)
846{
847 pr_err("MCE %#lx: %s page recovery: %s\n",
848 pfn, msg, action_name[result]);
849}
850
851static int page_action(struct page_state *ps, struct page *p,
852 unsigned long pfn)
853{
854 int result;
855 int count;
856
857 result = ps->action(p, pfn);
858
859 count = page_count(p) - 1;
860 if (ps->action == me_swapcache_dirty && result == DELAYED)
861 count--;
862 if (count != 0) {
863 printk(KERN_ERR
864 "MCE %#lx: %s page still referenced by %d users\n",
865 pfn, ps->msg, count);
866 result = FAILED;
867 }
868 action_result(pfn, ps->msg, result);

	/*
	 * Could adjust the zone counters here to account for the
	 * permanently unusable page.
	 */
875 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
876}

/*
 * Do all that is necessary to remove user space mappings of the page.
 * Unmap them and check that the mapcount dropped to zero, i.e. the page
 * is no longer mapped anywhere.
 */
882static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
883 int trapno, int flags, struct page **hpagep)
884{
885 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
886 struct address_space *mapping;
887 LIST_HEAD(tokill);
888 int ret;
889 int kill = 1, forcekill;
890 struct page *hpage = *hpagep;
891 struct page *ppage;
892
893
894
895
896
897 if (PageReserved(p) || PageSlab(p))
898 return SWAP_SUCCESS;
899 if (!(PageLRU(hpage) || PageHuge(p)))
900 return SWAP_SUCCESS;
901
902
903
904
905
906 if (!page_mapped(hpage))
907 return SWAP_SUCCESS;
908
909 if (PageKsm(p)) {
910 pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
911 return SWAP_FAIL;
912 }
913
914 if (PageSwapCache(p)) {
915 printk(KERN_ERR
916 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
917 ttu |= TTU_IGNORE_HWPOISON;
918 }

	/*
	 * Propagate the dirty bit from the PTEs to the struct page first,
	 * because we need it to decide whether to kill the mapping
	 * processes.  When a clean file-backed page turns out to be dirty
	 * only in the PTEs, re-mark it dirty; otherwise the corruption
	 * can be dropped without any visible side effect.
	 */
926 mapping = page_mapping(hpage);
927 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
928 mapping_cap_writeback_dirty(mapping)) {
929 if (page_mkclean(hpage)) {
930 SetPageDirty(hpage);
931 } else {
932 kill = 0;
933 ttu |= TTU_IGNORE_HWPOISON;
934 printk(KERN_INFO
935 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
936 pfn);
937 }
938 }

	/*
	 * ppage is the page whose mappings will actually be torn down:
	 * normally the compound head, but for an anonymous THP the huge
	 * page is split first and only the corrupted base page is
	 * unmapped.
	 */
946 ppage = hpage;
947
948 if (PageTransHuge(hpage)) {
949
950
951
952
953
954
955
956
957
958
959 if (!PageHuge(hpage) && PageAnon(hpage)) {
960 if (unlikely(split_huge_page(hpage))) {
961
962
963
964
965
966
967 printk(KERN_INFO
968 "MCE %#lx: failed to split THP\n", pfn);
969
970 BUG_ON(!PageHWPoison(p));
971 return SWAP_FAIL;
972 }
973
974
975
976
977
978
979 if (hpage != p) {
980 if (!(flags & MF_COUNT_INCREASED)) {
981 put_page(hpage);
982 get_page(p);
983 }
984 lock_page(p);
985 unlock_page(hpage);
986 *hpagep = p;
987 }
988
989 ppage = p;
990 }
991 }

	/*
	 * First collect all the processes that have the page mapped in
	 * dirty form.  This has to be done before try_to_unmap, because
	 * afterwards the mappings (and with them the information about
	 * who mapped the page) are gone.
	 */
1001 if (kill)
1002 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
1003
1004 ret = try_to_unmap(ppage, ttu);
1005 if (ret != SWAP_SUCCESS)
1006 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
1007 pfn, page_mapcount(ppage));

	/*
	 * Now that the dirty bit has been propagated to the struct page
	 * and all mappings are (hopefully) gone, signal or kill the
	 * collected processes.  This only happens when the page was dirty
	 * (the data was really lost) or the caller insists (MF_MUST_KILL);
	 * for clean pages the to_kill list is simply freed.
	 */
1019 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
1020 kill_procs(&tokill, forcekill, trapno,
1021 ret != SWAP_SUCCESS, p, pfn, flags);
1022
1023 return ret;
1024}
1025
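/*
 * Hugepage errors are tracked on every base page of the compound page:
 * set or clear PageHWPoison on all of them.
 */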
1026static void set_page_hwpoison_huge_page(struct page *hpage)
1027{
1028 int i;
1029 int nr_pages = 1 << compound_order(hpage);
1030 for (i = 0; i < nr_pages; i++)
1031 SetPageHWPoison(hpage + i);
1032}
1033
1034static void clear_page_hwpoison_huge_page(struct page *hpage)
1035{
1036 int i;
1037 int nr_pages = 1 << compound_order(hpage);
1038 for (i = 0; i < nr_pages; i++)
1039 ClearPageHWPoison(hpage + i);
1040}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code of an
 * architecture when it detects hardware memory corruption of a page.
 * It tries its best to recover, which includes dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that happen outside
 * the current execution context (e.g. when detected by a background
 * scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
1060int memory_failure(unsigned long pfn, int trapno, int flags)
1061{
1062 struct page_state *ps;
1063 struct page *p;
1064 struct page *hpage;
1065 int res;
1066 unsigned int nr_pages;
1067 unsigned long page_flags;
1068
1069 if (!sysctl_memory_failure_recovery)
1070 panic("Memory failure from trap %d on page %lx", trapno, pfn);
1071
1072 if (!pfn_valid(pfn)) {
1073 printk(KERN_ERR
1074 "MCE %#lx: memory outside kernel control\n",
1075 pfn);
1076 return -ENXIO;
1077 }
1078
1079 p = pfn_to_page(pfn);
1080 hpage = compound_head(p);
1081 if (TestSetPageHWPoison(p)) {
1082 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1083 return 0;
1084 }

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage
	 * units, so nr_pages should be 1 << compound_order.  Errors on
	 * transparent huge pages are handled after splitting, i.e. in
	 * normal page units, so nr_pages is one in that case.
	 */
1093 if (PageHuge(p))
1094 nr_pages = 1 << compound_order(hpage);
1095 else
1096 nr_pages = 1;
1097 atomic_long_add(nr_pages, &num_poisoned_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage
	 *    freelist, so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: we cannot stop them from reading
	 *    or writing the page; hope it has already been, or will soon
	 *    be, freed.
	 *
	 * Directly bumping a page count up from 0 would be dangerous,
	 * which is why get_page_unless_zero() is used.
	 */
1113 if (!(flags & MF_COUNT_INCREASED) &&
1114 !get_page_unless_zero(hpage)) {
1115 if (is_free_buddy_page(p)) {
1116 action_result(pfn, "free buddy", DELAYED);
1117 return 0;
1118 } else if (PageHuge(hpage)) {
1119
1120
1121
1122 lock_page(hpage);
1123 if (PageHWPoison(hpage)) {
1124 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1125 || (p != hpage && TestSetPageHWPoison(hpage))) {
1126 atomic_long_sub(nr_pages, &num_poisoned_pages);
1127 unlock_page(hpage);
1128 return 0;
1129 }
1130 }
1131 set_page_hwpoison_huge_page(hpage);
1132 res = dequeue_hwpoisoned_huge_page(hpage);
1133 action_result(pfn, "free huge",
1134 res ? IGNORED : DELAYED);
1135 unlock_page(hpage);
1136 return res;
1137 } else {
1138 action_result(pfn, "high order kernel", IGNORED);
1139 return -EBUSY;
1140 }
1141 }
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151 if (!PageHuge(p) && !PageTransTail(p)) {
1152 if (!PageLRU(p))
1153 shake_page(p, 0);
1154 if (!PageLRU(p)) {
1155
1156
1157
1158 if (is_free_buddy_page(p)) {
1159 if (flags & MF_COUNT_INCREASED)
1160 action_result(pfn, "free buddy", DELAYED);
1161 else
1162 action_result(pfn, "free buddy, 2nd try", DELAYED);
1163 return 0;
1164 }
1165 }
1166 }
1167
1168 lock_page(hpage);
1169
1170
1171
1172
1173
1174 if (compound_head(p) != hpage) {
1175 action_result(pfn, "different compound page after locking", IGNORED);
1176 res = -EBUSY;
1177 goto out;
1178 }
1179
1180
1181
1182
1183
1184
1185
1186
1187 page_flags = p->flags;
1188
1189
1190
1191
1192 if (!PageHWPoison(p)) {
1193 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1194 atomic_long_sub(nr_pages, &num_poisoned_pages);
1195 put_page(hpage);
1196 res = 0;
1197 goto out;
1198 }
1199 if (hwpoison_filter(p)) {
1200 if (TestClearPageHWPoison(p))
1201 atomic_long_sub(nr_pages, &num_poisoned_pages);
1202 unlock_page(hpage);
1203 put_page(hpage);
1204 return 0;
1205 }
1206
1207 if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
1208 goto identify_page_state;
1209
1210
1211
1212
1213
1214 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1215 action_result(pfn, "hugepage already hardware poisoned",
1216 IGNORED);
1217 unlock_page(hpage);
1218 put_page(hpage);
1219 return 0;
1220 }
1221
1222
1223
1224
1225
1226
1227 if (PageHuge(p))
1228 set_page_hwpoison_huge_page(hpage);
1229
1230
1231
1232
1233
1234 wait_on_page_writeback(p);
1235
1236
1237
1238
1239
1240
1241
1242
1243 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1244 != SWAP_SUCCESS) {
1245 action_result(pfn, "unmapping failed", IGNORED);
1246 res = -EBUSY;
1247 goto out;
1248 }
1249
1250
1251
1252
1253 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1254 action_result(pfn, "already truncated LRU", IGNORED);
1255 res = -EBUSY;
1256 goto out;
1257 }
1258
1259identify_page_state:
1260 res = -EBUSY;
1261
1262
1263
1264
1265
1266 for (ps = error_states;; ps++)
1267 if ((p->flags & ps->mask) == ps->res)
1268 break;
1269
1270 page_flags |= (p->flags & (1UL << PG_dirty));
1271
1272 if (!ps->mask)
1273 for (ps = error_states;; ps++)
1274 if ((page_flags & ps->mask) == ps->res)
1275 break;
1276 res = page_action(ps, p, pfn);
1277out:
1278 unlock_page(hpage);
1279 return res;
1280}
1281EXPORT_SYMBOL_GPL(memory_failure);
1282
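/*
 * memory_failure() must run in process context, but errors are often
 * detected in NMI or interrupt context.  The entries below are queued
 * on a small per-CPU kfifo and handled later from a workqueue.
 */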
1283#define MEMORY_FAILURE_FIFO_ORDER 4
1284#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1285
1286struct memory_failure_entry {
1287 unsigned long pfn;
1288 int trapno;
1289 int flags;
1290};
1291
1292struct memory_failure_cpu {
1293 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1294 MEMORY_FAILURE_FIFO_SIZE);
1295 spinlock_t lock;
1296 struct work_struct work;
1297};
1298
1299static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page.  It schedules
 * the recovery of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that happen outside
 * the current execution context (e.g. when detected by a background
 * scrubber).
 *
 * Can run in IRQ context.
 */
1318void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1319{
1320 struct memory_failure_cpu *mf_cpu;
1321 unsigned long proc_flags;
1322 struct memory_failure_entry entry = {
1323 .pfn = pfn,
1324 .trapno = trapno,
1325 .flags = flags,
1326 };
1327
1328 mf_cpu = &get_cpu_var(memory_failure_cpu);
1329 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1330 if (kfifo_put(&mf_cpu->fifo, entry))
1331 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1332 else
1333 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1334 pfn);
1335 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1336 put_cpu_var(memory_failure_cpu);
1337}
1338EXPORT_SYMBOL_GPL(memory_failure_queue);
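
/*
 * Example (hypothetical caller): a machine-check handler that cannot
 * sleep can defer the handling of a corrupted pfn to process context:
 *
 *	memory_failure_queue(pfn, trapno, MF_ACTION_REQUIRED);
 */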
1339
1340static void memory_failure_work_func(struct work_struct *work)
1341{
1342 struct memory_failure_cpu *mf_cpu;
1343 struct memory_failure_entry entry = { 0, };
1344 unsigned long proc_flags;
1345 int gotten;
1346
1347 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1348 for (;;) {
1349 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1350 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1351 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1352 if (!gotten)
1353 break;
1354 if (entry.flags & MF_SOFT_OFFLINE)
1355 soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1356 else
1357 memory_failure(entry.pfn, entry.trapno, entry.flags);
1358 }
1359}
1360
1361static int __init memory_failure_init(void)
1362{
1363 struct memory_failure_cpu *mf_cpu;
1364 int cpu;
1365
1366 for_each_possible_cpu(cpu) {
1367 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1368 spin_lock_init(&mf_cpu->lock);
1369 INIT_KFIFO(mf_cpu->fifo);
1370 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1371 }
1372
1373 return 0;
1374}
1375core_initcall(memory_failure_init);

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by memory_failure()
 * earlier.  This only works on the software level, i.e. for errors
 * injected by the hwpoison test code, not for real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
1389int unpoison_memory(unsigned long pfn)
1390{
1391 struct page *page;
1392 struct page *p;
1393 int freeit = 0;
1394 unsigned int nr_pages;
1395
1396 if (!pfn_valid(pfn))
1397 return -ENXIO;
1398
1399 p = pfn_to_page(pfn);
1400 page = compound_head(p);
1401
1402 if (!PageHWPoison(p)) {
1403 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1404 return 0;
1405 }
1406
1407
1408
1409
1410
1411
1412 if (!PageHuge(page) && PageTransHuge(page)) {
1413 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1414 return 0;
1415 }
1416
1417 nr_pages = 1 << compound_order(page);
1418
1419 if (!get_page_unless_zero(page)) {
1420
1421
1422
1423
1424
1425
1426 if (PageHuge(page)) {
1427 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1428 return 0;
1429 }
1430 if (TestClearPageHWPoison(p))
1431 atomic_long_dec(&num_poisoned_pages);
1432 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1433 return 0;
1434 }
1435
1436 lock_page(page);
1437
1438
1439
1440
1441
1442
1443 if (TestClearPageHWPoison(page)) {
1444 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1445 atomic_long_sub(nr_pages, &num_poisoned_pages);
1446 freeit = 1;
1447 if (PageHuge(page))
1448 clear_page_hwpoison_huge_page(page);
1449 }
1450 unlock_page(page);
1451
1452 put_page(page);
1453 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1454 put_page(page);
1455
1456 return 0;
1457}
1458EXPORT_SYMBOL(unpoison_memory);
1459
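/*
 * Allocation callback for migrate_pages(): allocate the replacement page
 * on the same node as the source, using the hugepage allocator when the
 * source is a huge page.
 */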
1460static struct page *new_page(struct page *p, unsigned long private, int **x)
1461{
1462 int nid = page_to_nid(p);
1463 if (PageHuge(p))
1464 return alloc_huge_page_node(page_hstate(compound_head(p)),
1465 nid);
1466 else
1467 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1468}

/*
 * Safely take a reference on an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero-refcount page that is not
 * free, and 1 for any other page type; only in the latter case is the
 * page returned with its refcount raised.
 */
1476static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1477{
1478 int ret;
1479
1480 if (flags & MF_COUNT_INCREASED)
1481 return 1;
1482
1483
1484
1485
1486
1487 if (!get_page_unless_zero(compound_head(p))) {
1488 if (PageHuge(p)) {
1489 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1490 ret = 0;
1491 } else if (is_free_buddy_page(p)) {
1492 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1493 ret = 0;
1494 } else {
1495 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1496 __func__, pfn, p->flags);
1497 ret = -EIO;
1498 }
1499 } else {
1500
1501 ret = 1;
1502 }
1503 return ret;
1504}
1505
1506static int get_any_page(struct page *page, unsigned long pfn, int flags)
1507{
1508 int ret = __get_any_page(page, pfn, flags);
1509
1510 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1511
1512
1513
1514 put_page(page);
1515 shake_page(page, 1);
1516
1517
1518
1519
1520 ret = __get_any_page(page, pfn, 0);
1521 if (!PageLRU(page)) {
1522 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1523 pfn, page->flags);
1524 return -EIO;
1525 }
1526 }
1527 return ret;
1528}
1529
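/*
 * Soft-offline a huge page: migrate its contents to a freshly allocated
 * hugepage, then mark the original poisoned and pull it off the free
 * list so it is never handed out again.
 */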
1530static int soft_offline_huge_page(struct page *page, int flags)
1531{
1532 int ret;
1533 unsigned long pfn = page_to_pfn(page);
1534 struct page *hpage = compound_head(page);
1535 LIST_HEAD(pagelist);
1536
1537
1538
1539
1540
1541 lock_page(hpage);
1542 if (PageHWPoison(hpage)) {
1543 unlock_page(hpage);
1544 put_page(hpage);
1545 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1546 return -EBUSY;
1547 }
1548 unlock_page(hpage);
1549
1550
1551 list_move(&hpage->lru, &pagelist);
1552 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1553 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1554 if (ret) {
1555 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1556 pfn, ret, page->flags);
1557
1558
1559
1560
1561
1562 putback_active_hugepage(hpage);
1563 if (ret > 0)
1564 ret = -EIO;
1565 } else {
1566
1567 if (PageHuge(page)) {
1568 set_page_hwpoison_huge_page(hpage);
1569 dequeue_hwpoisoned_huge_page(hpage);
1570 atomic_long_add(1 << compound_order(hpage),
1571 &num_poisoned_pages);
1572 } else {
1573 SetPageHWPoison(page);
1574 atomic_long_inc(&num_poisoned_pages);
1575 }
1576 }
1577 return ret;
1578}
1579
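/*
 * Soft-offline a base page: first try the cheap path of invalidating it
 * (works for clean pagecache pages), otherwise isolate it from the LRU
 * and migrate the contents to a new page.  In either case the original
 * page is marked HWPoison so it will not be used again.
 */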
1580static int __soft_offline_page(struct page *page, int flags)
1581{
1582 int ret;
1583 unsigned long pfn = page_to_pfn(page);
1584
1585
1586
1587
1588
1589
1590
1591 lock_page(page);
1592 wait_on_page_writeback(page);
1593 if (PageHWPoison(page)) {
1594 unlock_page(page);
1595 put_page(page);
1596 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1597 return -EBUSY;
1598 }
1599
1600
1601
1602
1603 ret = invalidate_inode_page(page);
1604 unlock_page(page);
1605
1606
1607
1608
1609 if (ret == 1) {
1610 put_page(page);
1611 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1612 SetPageHWPoison(page);
1613 atomic_long_inc(&num_poisoned_pages);
1614 return 0;
1615 }
1616
1617
1618
1619
1620
1621
1622 ret = isolate_lru_page(page);
1623
1624
1625
1626
1627 put_page(page);
1628 if (!ret) {
1629 LIST_HEAD(pagelist);
1630 inc_zone_page_state(page, NR_ISOLATED_ANON +
1631 page_is_file_cache(page));
1632 list_add(&page->lru, &pagelist);
1633 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1634 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1635 if (ret) {
1636 if (!list_empty(&pagelist)) {
1637 list_del(&page->lru);
1638 dec_zone_page_state(page, NR_ISOLATED_ANON +
1639 page_is_file_cache(page));
1640 putback_lru_page(page);
1641 }
1642
1643 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1644 pfn, ret, page->flags);
1645 if (ret > 0)
1646 ret = -EIO;
1647 } else {
1648
1649
1650
1651
1652
1653
1654
1655
1656 if (!is_free_buddy_page(page))
1657 lru_add_drain_all();
1658 if (!is_free_buddy_page(page))
1659 drain_all_pages(page_zone(page));
1660 SetPageHWPoison(page);
1661 if (!is_free_buddy_page(page))
1662 pr_info("soft offline: %#lx: page leaked\n",
1663 pfn);
1664 atomic_long_inc(&num_poisoned_pages);
1665 }
1666 } else {
1667 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1668 pfn, ret, page_count(page), page->flags);
1669 }
1670 return ret;
1671}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation, without killing
 * anything.  This is for the case when a page is not corrupted yet (so
 * it is still valid to access), but has had a number of corrected
 * errors and is better taken out.
 *
 * The actual policy on when to do that is maintained by user space.
 *
 * This should never impact any application or cause data loss, however
 * it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be "good
 * enough" for the majority of memory.
 */
1695int soft_offline_page(struct page *page, int flags)
1696{
1697 int ret;
1698 unsigned long pfn = page_to_pfn(page);
1699 struct page *hpage = compound_head(page);
1700
1701 if (PageHWPoison(page)) {
1702 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1703 return -EBUSY;
1704 }
1705 if (!PageHuge(page) && PageTransHuge(hpage)) {
1706 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1707 pr_info("soft offline: %#lx: failed to split THP\n",
1708 pfn);
1709 return -EBUSY;
1710 }
1711 }
1712
1713 get_online_mems();
1714
1715
1716
1717
1718
1719
1720 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1721 set_migratetype_isolate(page, true);
1722
1723 ret = get_any_page(page, pfn, flags);
1724 put_online_mems();
1725 if (ret > 0) {
1726 if (PageHuge(page))
1727 ret = soft_offline_huge_page(page, flags);
1728 else
1729 ret = __soft_offline_page(page, flags);
1730 } else if (ret == 0) {
1731 if (PageHuge(page)) {
1732 set_page_hwpoison_huge_page(hpage);
1733 dequeue_hwpoisoned_huge_page(hpage);
1734 atomic_long_add(1 << compound_order(hpage),
1735 &num_poisoned_pages);
1736 } else {
1737 SetPageHWPoison(page);
1738 atomic_long_inc(&num_poisoned_pages);
1739 }
1740 }
1741 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1742 return ret;
1743}
1744