/*
 * High level machine check handler.  Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or
 * cache failure.
 *
 * Pages are marked with PG_hwpoison and, depending on their state,
 * dropped from the page cache, unmapped from processes (killing them
 * with SIGBUS when the dirty data cannot be recovered), or simply kept
 * away from the page allocator.
 *
 * In addition there is a "soft offline" entry point that allows us to
 * stop using a not-yet-corrupted page without killing anything.
 *
 * The tricky part is that we can access any page asynchronously with
 * respect to other VM users, because memory failures can happen at any
 * time and anywhere, so we cannot rely on much locking.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

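/*
 * Filter by device major/minor numbers: only poison pages whose backing
 * inode lives on the configured block device.
 */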
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

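/*
 * Filter by page flags: only poison pages whose stable_page_flags()
 * match the configured mask/value pair.
 */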
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress testing the error injector against a specific memory
 * cgroup: when hwpoison_filter_memcg is set, only pages belonging to the
 * cgroup whose dentry inode number matches it are poisoned.
 */
#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	/* root_mem_cgroup has NULL dentries */
	if (!css->cgroup->dentry)
		return -EINVAL;

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

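/*
 * Return 0 if the page passes all configured injection filters and may
 * be poisoned, -EINVAL otherwise.
 */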
int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send a SIGBUS to a process that has the poisoned page mapped:
 * "action optional" (BUS_MCEERR_AO) if it is not immediately affected by
 * the error, "action required" (BUS_MCEERR_AR) if the error happened in
 * its current execution context.
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered drain as many buffers as
 * possible in the hope to turn the page into an LRU or free page, which
 * we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_slab here (which would also shrink other caches)
	 * if access is not potentially fatal.
	 */
	if (access) {
		int nr;
		do {
			struct shrink_control shrink = {
				.gfp_mask = GFP_KERNEL,
			};

			nr = shrink_slab(&shrink, 1000, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually removed until all processes are done with it.
 *
 * This is done in two steps for locking reasons.
 * First we collect all the tasks that have the page mapped (under the
 * tasklist and rmap locks), then the signals are sent without holding
 * any locks.
 *
 * There is no convenient way to get back to a process from a struct mm,
 * so we only collect tasks whose ->mm matches the vma owning the mapping.
 */
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was munmapped.
	 * But it could also be a mremap, so assume mremap for now.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when forcekill is set; otherwise just free the list
 * (this is used for clean pages which do not need killing).
 * Also when fail is set do a force kill because something went wrong
 * earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  int fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
				      pfn, page, flags) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}


/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task for early kill, NULL otherwise.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	mutex_unlock(&mapping->i_mmap_mutex);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

/*
 * Error handlers for various types of pages.
 */
enum outcome {
	IGNORED,	/* Error: cannot be handled */
	FAILED,		/* Error: handling failed */
	DELAYED,	/* Will be handled later */
	RECOVERED,	/* Successfully recovered */
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be better off instead of doing nothing.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * Normally Linux has two mechanisms to pass an IO error up:
		 * the AS_EIO flag in the address space and the PageError
		 * flag on the page.  Since we drop pages on memory failure
		 * handling, the only mechanism left is AS_EIO, which is
		 * cleared by the first operation that returns an error.
		 * An application that only checks fsync() after doing other
		 * operations on the fd may therefore miss the error, but
		 * the same can already happen with metadata IO errors, so
		 * we are no worse than other parts of the kernel.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Dirty swap cache page is tricky to handle. The page could live both in
 * page cache and swap cache (i.e. the page was freshly swapped in). So it
 * could be referenced concurrently by two types of PTEs: normal PTEs and
 * swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap
 * PTEs, and then
 *      - clear the dirty bit to prevent IO
 *      - remove the page from the LRU
 *      - but keep it in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down the kill region to one page, we need to break up the pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);

	if (!PageHuge(hpage))
		return DELAYED;

	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from the freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return RECOVERED;
	}
	return DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */
#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just treat
	 * it as standard kernel for now.
	 */
	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside the page lock. See also the comment above
 * set_page_dirty().
 */
static void action_result(unsigned long pfn, char *msg, int result)
{
	pr_err("MCE %#lx: %s page recovery: %s\n",
		pfn, msg, action_name[result]);
}

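/*
 * Run the per-state recovery action and check that the page was really
 * freed (no stray references left).  Returns 0 if recovery succeeded or
 * was delayed, -EBUSY otherwise.
 */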
static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}
	action_result(pfn, ps->msg, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling
 * @page:	raw error page (hit by memory error)
 *
 * Return: 0 if we failed to grab a reference, non-zero otherwise.
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (PageHuge(head))
		return get_page_unless_zero(head);

	/*
	 * THP tail pages have special refcounting rules: grab a reference
	 * on the head page first, then pin the tail page itself.
	 */
	if (PageTransHuge(head)) {
		if (get_page_unless_zero(head)) {
			if (PageTail(page))
				get_page(page);
			return 1;
		} else {
			return 0;
		}
	}

	return get_page_unless_zero(page);
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p)) {
		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
		return SWAP_FAIL;
	}

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	ret = try_to_unmap(hpage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put the mlocked page in the lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely freed.  When there
	 * was a problem unmapping earlier use a more force-full
	 * uncatchable kill to prevent any accesses to the
	 * poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

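/*
 * Set/clear PG_hwpoison on every subpage of a hugepage; error containment
 * for hugetlb pages is currently done in hugepage units.
 */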
static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	orig_head = hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage units,
	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
	 * transparent hugepages, they are supposed to be split and error
	 * measurement is done in normal page units.  So nr_pages should be one
	 * in this case.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else	/* normal page or thp */
		nr_pages = 1;
	atomic_long_add(nr_pages, &num_poisoned_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage freelist,
	 *    so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					atomic_long_sub(nr_pages, &num_poisoned_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, "free huge",
				      res ? IGNORED : DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	if (!PageHuge(p) && PageTransHuge(hpage)) {
		if (!PageAnon(hpage)) {
			pr_err("MCE: %#lx: non anonymous thp\n", pfn);
			if (TestClearPageHWPoison(p))
				atomic_long_sub(nr_pages, &num_poisoned_pages);
			put_page(p);
			if (p != hpage)
				put_page(hpage);
			return -EBUSY;
		}
		if (unlikely(split_huge_page(hpage))) {
			pr_err("MCE: %#lx: thp split failed\n", pfn);
			if (TestClearPageHWPoison(p))
				atomic_long_sub(nr_pages, &num_poisoned_pages);
			put_page(p);
			if (p != hpage)
				put_page(hpage);
			return -EBUSY;
		}
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __set_page_locked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p, 0);
	/* shake_page could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, "free buddy", DELAYED);
		else
			action_result(pfn, "free buddy, 2nd try", DELAYED);
		return 0;
	}

	/*
	 * Lock the page and wait for writeback to finish.
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	lock_page(hpage);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, "different compound page after locking", IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page
	 * status correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		put_page(hpage);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &num_poisoned_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * For error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
				IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * Error recovery is only supported for PMD-sized huge pages.
	 * Gigantic pages (hugepage sizes above PMD_SIZE) are not handled,
	 * so just give up here.
	 */
	if (PageHuge(p) && huge_page_size(page_hstate(hpage)) > PMD_SIZE) {
		action_result(pfn, "non-pmd-sized huge page", IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return -EBUSY;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is a thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
		action_result(pfn, "unmapping failed", IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;
	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn = pfn,
		.trapno = trapno,
		.flags = flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, &entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

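/*
 * Work function that drains the per-cpu memory failure fifo and handles
 * each queued page in process context.
 */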
static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

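/*
 * Initialize the per-cpu fifos and work structs used by
 * memory_failure_queue().
 */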
static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_hwpoison_page(p)) {
		/*
		 * Since a HWPoisoned hugepage should have non-zero refcount,
		 * a race between memory failure and unpoison seems to have
		 * happened.  In such case unpoison fails and memory failure
		 * runs to the end.
		 */
		if (PageHuge(page)) {
			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_dec(&num_poisoned_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

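/*
 * Allocation callback for migrate_pages(): allocate the replacement page
 * on the same node as the poisoned one.
 */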
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise not.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * When the target page is a free hugepage, just remove it
	 * from the free hugepage list.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

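/*
 * As __get_any_page(), but additionally shake the LRU caches so that a
 * pagecache or anonymous page ends up back on the LRU, where we can
 * handle it.
 */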
static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (!PageLRU(page)) {
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

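/*
 * Soft offline an in-use hugetlb page: migrate its contents away and mark
 * the old hugepage as poisoned so it is never reused.
 */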
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure() that called this function successfully.
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() each take a refcount,
	 * so drop one here regardless of whether isolation succeeded.
	 */
	put_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		/*
		 * We know that soft_offline_huge_page() tries to migrate
		 * only one hugepage pointed to by hpage, so we need not
		 * run through the pagelist here.
		 */
		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
	} else {
		/* overcommit hugetlb page will be freed to buddy */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags);

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * "good enough" for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	if (!PageHuge(page) && PageTransHuge(hpage)) {
		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
			pr_info("soft offline: %#lx: failed to split THP\n",
				pfn);
			return -EBUSY;
		}
	}

	get_online_mems();

	ret = get_any_page(page, pfn, flags);
	put_online_mems();
	if (ret > 0) { /* for in-use pages */
		if (PageHuge(page))
			ret = soft_offline_huge_page(page, flags);
		else
			ret = __soft_offline_page(page, flags);
	} else if (ret == 0) { /* for free pages */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			if (!dequeue_hwpoisoned_huge_page(hpage))
				atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			if (!TestSetPageHWPoison(page))
				atomic_long_inc(&num_poisoned_pages);
		}
	}
	return ret;
}

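/*
 * Soft offline a normal page: try invalidation first, then fall back to
 * migrating the contents to a freshly allocated page.
 */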
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		atomic_long_inc(&num_poisoned_pages);
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	/*
	 * Drop the page reference we got from get_any_page();
	 * a successful isolate_lru_page() already took another one.
	 */
	put_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			putback_lru_pages(&pagelist);
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}