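/*
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted (hardware memory poisoning): tries to
 * recover by dropping clean pages, unmapping and killing the processes
 * that have the page mapped, and also supports soft-offlining pages by
 * migrating their contents away.
 */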
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

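/*
 * Filter by the device major/minor of the page's backing inode, so that
 * error injection can be restricted to a specific block device.
 */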
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

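/*
 * Filter on the page flags, using the same stable bit encoding that is
 * exported to user space via /proc/kpageflags.
 */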
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

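/*
 * Filter by memory cgroup: only pages charged to the cgroup whose inode
 * number matches hwpoison_filter_memcg are poisoned. This allows error
 * injection to be confined to one workload for stress testing.
 */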
#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	ino = cgroup_ino(css->cgroup);
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

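/*
 * Decide whether to act on a poisoned page: returns 0 to handle it,
 * -EINVAL to skip it when any of the configured filters rejects it.
 */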
int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

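/*
 * Send a SIGBUS to the current process if the error happened in its own
 * execution context (BUS_MCEERR_AR), otherwise send an advisory
 * "action optional" SIGBUS (BUS_MCEERR_AO) to the task mapping the page.
 */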
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}

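/*
 * When an unknown page type is encountered, drain the per-cpu LRU and
 * buddy caches (and optionally shrink the slab) in the hope of turning
 * the page into an LRU or free page, which we can then handle.
 */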
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	if (access) {
		int nr;
		int nid = page_to_nid(p);
		do {
			struct shrink_control shrink = {
				.gfp_mask = GFP_KERNEL,
			};
			node_set(nid, shrink.nodes_to_scan);

			nr = shrink_slab(&shrink, 1000, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);

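/*
 * Kill all processes that have a poisoned page mapped and then kill the
 * process itself. There is no convenient way to get back from a page to
 * the VMAs that map it, so we do a brute-force search over all running
 * processes; machine checks are rare enough that this is not a
 * performance concern. Victims are collected on a list first so the
 * actual killing can happen after the page has been unmapped.
 */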
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

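/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations (via the preallocated *tkc slot when it is
 * available) to avoid recursing into the VM from the error handler.
 */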
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while handling machine check\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

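/*
 * Kill the processes collected earlier. The signals are only sent when
 * @forcekill is set: tasks whose mapping could not be removed (or whose
 * fault address is unknown) get SIGKILL, the rest get the advisory
 * SIGBUS from kill_proc(). The list entries are freed in either case.
 */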
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  int fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			} else if (kill_proc(tk->tsk, tk->addr, trapno,
					     pfn, page, flags) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

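/*
 * Find a dedicated thread which is supposed to handle the advisory SIGBUS
 * (BUS_MCEERR_AO) on behalf of the process: one that has set both
 * PF_MCE_PROCESS and PF_MCE_EARLY via prctl(PR_MCE_KILL).
 */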
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

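/*
 * Determine whether a given process is "early kill", i.e. expects to be
 * signaled as soon as one of its pages is hwpoisoned. Returns the
 * task_struct of the thread to signal if so, otherwise NULL.
 */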
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

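/*
 * Collect processes when the error hit an anonymous page.
 */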
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

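/*
 * Collect processes when the error hit a file mapped page.
 */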
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	mutex_unlock(&mapping->i_mmap_mutex);
}

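/*
 * Collect the processes that have the corrupted page mapped to kill.
 * One struct to_kill is preallocated up front (with GFP_NOIO) so that at
 * least one victim can be recorded even under memory pressure once the
 * tasklist and rmap locks are held.
 */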
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

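/*
 * Error handlers for various types of pages. The outcome of each handler
 * is one of the states below, reported via action_result().
 */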
enum outcome {
	IGNORED,
	FAILED,
	DELAYED,
	RECOVERED,
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

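/*
 * Isolate the page from the LRU and drop the extra reference taken by
 * isolate_lru_page(), clearing flags that would otherwise confuse the
 * later freeing. Returns 0 on success, -EIO if isolation failed.
 */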
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		ClearPageActive(p);
		ClearPageUnevictable(p);
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}

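/*
 * Error hit a kernel page: do nothing, try to be lucky and hope the page
 * is never touched again.
 */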
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

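/*
 * Page in unknown state. Do nothing.
 */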
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

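/*
 * Clean (or cleaned) page cache page: try to punch it out of the page
 * cache, or at least invalidate it, so the corrupted data is never read
 * again.
 */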
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	if (PageAnon(p))
		return RECOVERED;

	mapping = page_mapping(p);
	if (!mapping) {
		return FAILED;
	}

	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

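/*
 * Dirty pagecache page: the data is lost. Report the error to the address
 * space via mapping_set_error() so a later fsync()/close() returns -EIO,
 * then handle the page like a clean one.
 */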
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);

	if (mapping) {
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

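/*
 * Dirty swap cache page: the application data is lost. Clear the dirty
 * and uptodate bits but keep the page in the swap cache, so that a later
 * page fault on it sees the poison and the task can be killed. A clean
 * swap cache page can simply be isolated; good data will be re-read from
 * the swap device on the next fault.
 */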
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}

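/*
 * Huge pages: the error is contained in the whole hugepage unit. A free
 * hugepage (no mapping and not anonymous) is pulled off the free list via
 * dequeue_hwpoisoned_huge_page(); anything else is left in place and
 * dealt with when the hugepage is freed (DELAYED).
 */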
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);

	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return RECOVERED;
	}
	return DELAYED;
}

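/*
 * Various page states we can handle. A page state is defined by its
 * current page->flags bits; the table below is matched in order and the
 * first entry whose mask/result pair matches selects the handler. For any
 * state not listed, no recovery is attempted (me_unknown).
 */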
#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },

	{ slab,		slab,		"kernel slab",		me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",			me_huge_page },
	{ tail,		tail,		"huge",			me_huge_page },
#else
	{ compound,	compound,	"huge",			me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"dirty LRU",		me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",		me_pagecache_clean },

	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved

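/*
 * action_result() prints the outcome of recovery for a pfn; page_action()
 * runs the selected handler and checks that no unexpected references to
 * the page remain afterwards.
 */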
static void action_result(unsigned long pfn, char *msg, int result)
{
	pr_err("MCE %#lx: %s page recovery: %s\n",
		pfn, msg, action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);
	action_result(pfn, ps->msg, result);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}

	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}

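/*
 * Do all that is necessary to remove user space mappings: unmap the page
 * from every process that has it mapped, mark a clean pagecache copy so it
 * can be dropped without killing anyone, split THPs when needed, and
 * finally deliver the collected signals.
 */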
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	struct page *ppage;

	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;

	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p)) {
		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
		return SWAP_FAIL;
	}

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	ppage = hpage;

	if (PageTransHuge(hpage)) {
		if (!PageHuge(hpage) && PageAnon(hpage)) {
			if (unlikely(split_huge_page(hpage))) {
				printk(KERN_INFO
					"MCE %#lx: failed to split THP\n", pfn);

				BUG_ON(!PageHWPoison(p));
				return SWAP_FAIL;
			}

			if (hpage != p) {
				if (!(flags & MF_COUNT_INCREASED)) {
					put_page(hpage);
					get_page(p);
				}
				lock_page(p);
				unlock_page(hpage);
				*hpagep = p;
			}

			ppage = p;
		}
	}

	if (kill)
		collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);

	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(ppage));

	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

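/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code of an
 * architecture when it detects hardware memory corruption of a page.
 * It tries its best to recover, which includes dropping pages, killing
 * processes etc.  Must run in process context with no spinlocks held.
 */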
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else
		nr_pages = 1;
	atomic_long_add(nr_pages, &num_poisoned_pages);

	if (!(flags & MF_COUNT_INCREASED) &&
		!get_page_unless_zero(hpage)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					atomic_long_sub(nr_pages, &num_poisoned_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, "free huge",
				      res ? IGNORED : DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	if (!PageHuge(p) && !PageTransTail(p)) {
		if (!PageLRU(p))
			shake_page(p, 0);
		if (!PageLRU(p)) {
			if (is_free_buddy_page(p)) {
				if (flags & MF_COUNT_INCREASED)
					action_result(pfn, "free buddy", DELAYED);
				else
					action_result(pfn, "free buddy, 2nd try", DELAYED);
				return 0;
			}
		}
	}

	lock_page(hpage);

	if (compound_head(p) != hpage) {
		action_result(pfn, "different compound page after locking", IGNORED);
		res = -EBUSY;
		goto out;
	}

	page_flags = p->flags;

	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		put_page(hpage);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &num_poisoned_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
			      IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	wait_on_page_writeback(p);

	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
		action_result(pfn, "unmapping failed", IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;

	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

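/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * Queue the pfn on a per-cpu kfifo and kick a work item, so the actual
 * recovery runs later in process context. Because only an irq-safe
 * spinlock and schedule_work_on() are used here, this can be called from
 * the low level hardware error handler itself.
 */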
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn = pfn,
		.trapno = trapno,
		.flags = flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

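/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by memory_failure()
 * earlier. This only works at the software level, i.e. for injected
 * failures, not for real hardware errors. Returns 0 for success.
 */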
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	if (!PageHuge(page) && PageTransHuge(page)) {
		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_page_unless_zero(page)) {
		if (PageHuge(page)) {
			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_dec(&num_poisoned_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page(page);

	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);

	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

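/*
 * Safely get a reference on an arbitrary page. Returns 1 when the caller
 * ends up holding a reference (taken here unless MF_COUNT_INCREASED says
 * one was already taken), 0 for a free page, and -EIO for a zero-refcount
 * page in an unknown state.
 */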
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		put_page(page);
		shake_page(page, 1);

		ret = __get_any_page(page, pfn, 0);
		if (!PageLRU(page)) {
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

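/*
 * Soft offline a hugepage: migrate its contents to a fresh hugepage and
 * mark the old one as poisoned so it is never allocated again.
 */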
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	list_move(&hpage->lru, &pagelist);
	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
	} else {
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	return ret;
}

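/*
 * Soft offline a base page: first try to invalidate it (cheap, works for
 * clean pagecache), otherwise isolate it from the LRU and migrate its
 * contents away, then mark the original page as poisoned.
 */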
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	ret = invalidate_inode_page(page);
	unlock_page(page);

	if (ret == 1) {
		put_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		atomic_long_inc(&num_poisoned_pages);
		return 0;
	}

	ret = isolate_lru_page(page);

	put_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist)) {
				list_del(&page->lru);
				dec_zone_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
				putback_lru_page(page);
			}

			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		} else {
			if (!is_free_buddy_page(page))
				lru_add_drain_all();
			if (!is_free_buddy_page(page))
				drain_all_pages();
			SetPageHWPoison(page);
			if (!is_free_buddy_page(page))
				pr_info("soft offline: %#lx: page leaked\n",
					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}

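/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation, without killing
 * anything. This is for the case when a page is not corrupted yet (so it
 * is still valid to access), but has had a number of corrected errors and
 * is better taken out. The actual policy on when to do that is maintained
 * by user space.
 */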
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	if (!PageHuge(page) && PageTransHuge(hpage)) {
		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
			pr_info("soft offline: %#lx: failed to split THP\n",
				pfn);
			return -EBUSY;
		}
	}

	get_online_mems();

	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
		set_migratetype_isolate(page, true);

	ret = get_any_page(page, pfn, flags);
	put_online_mems();
	if (ret > 0) {
		if (PageHuge(page))
			ret = soft_offline_huge_page(page, flags);
		else
			ret = __soft_offline_page(page, flags);
	} else if (ret == 0) {
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
	return ret;
}