/*
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory
 * or cache failure.  The page is isolated and any processes that still
 * have it mapped are killed or signalled, depending on policy.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"

/*
 * Kill processes that have the corrupted page mapped as soon as the
 * error is detected, instead of waiting until the data is actually
 * consumed (/proc/sys/vm/memory_failure_early_kill).
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/*
 * Attempt recovery instead of panicking on an uncorrected memory
 * error (/proc/sys/vm/memory_failure_recovery).
 */
int sysctl_memory_failure_recovery __read_mostly = 1;

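/* Number of pages currently flagged as hardware poisoned. */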
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * Restrict hwpoison injection to pages charged to the memory cgroup
 * whose directory inode number matches hwpoison_filter_memcg.
 * Zero (the default) disables this filter.
 */
#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	struct mem_cgroup *mem;
	struct cgroup_subsys_state *css;
	unsigned long ino;

	if (!hwpoison_filter_memcg)
		return 0;

	mem = try_get_mem_cgroup_from_page(p);
	if (!mem)
		return -EINVAL;

	css = mem_cgroup_css(mem);
	/* root_mem_cgroup has NULL dentries */
	if (!css->cgroup->dentry)
		return -EINVAL;

	ino = css->cgroup->dentry->d_inode->i_ino;
	css_put(css);

	if (ino != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send all the processes who have the page mapped a signal.
 * "action optional" if they are not immediately affected by the error.
 * "action required" if the error happened in the current execution context.
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	printk(KERN_ERR
		"MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t == current) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, t);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
		       t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered drain as many buffers as
 * possible in the hope of turning the page into an LRU or free page,
 * which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_slab here (which would also shrink other caches)
	 * if access is not potentially fatal.
	 */
	if (access) {
		int nr;
		do {
			struct shrink_control shrink = {
				.gfp_mask = GFP_KERNEL,
			};

			nr = shrink_slab(&shrink, 1000, 1000);
			if (page_count(p) == 1)
				break;
		} while (nr > 10);
	}
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.  We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			printk(KERN_ERR
		"MCE: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmapped. But it could also be a mremap. Since that's
	 * likely very rare kill anyways just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("MCE: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
		       int fail, struct page *page, unsigned long pfn,
		       int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				printk(KERN_ERR
		"MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
					   pfn, page, flags) < 0)
				printk(KERN_ERR
		"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
					pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

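/*
 * Decide whether a task should be killed "early", i.e. as soon as the
 * error is detected rather than when the corrupted data is consumed.
 * The per-process PF_MCE_PROCESS/PF_MCE_EARLY flags (set via
 * prctl(PR_MCE_KILL)) override the global sysctl.
 */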
static int task_early_kill(struct task_struct *tsk)
{
	if (!tsk->mm)
		return 0;
	if (tsk->flags & PF_MCE_PROCESS)
		return !!(tsk->flags & PF_MCE_EARLY);
	return sysctl_memory_failure_early_kill;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		struct anon_vma_chain *vmac;

		if (!task_early_kill(tsk))
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);

		if (!task_early_kill(tsk))
			continue;

		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == tsk->mm)
				add_to_kill(tsk, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	mutex_unlock(&mapping->i_mmap_mutex);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk);
	else
		collect_procs_file(page, tokill, &tk);
	kfree(tk);
}

/*
 * Error handlers for various types of pages.
 */

enum outcome {
	IGNORED,	/* Error: cannot be handled */
	FAILED,		/* Error: handling failed */
	DELAYED,	/* Will be handled later */
	RECOVERED,	/* Successfully recovered */
};

static const char *action_name[] = {
	[IGNORED] = "Ignored",
	[FAILED] = "Failed",
	[DELAYED] = "Delayed",
	[RECOVERED] = "Recovered",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * Drop the page count elevated by isolate_lru_page().
		 */
		page_cache_release(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be lucky.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
	return FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
					pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("MCE %#lx: failed to release buffers\n", pfn);
		} else {
			ret = RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = RECOVERED;
		else
			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped in between, then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return DELAYED;
	else
		return FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return RECOVERED;
	else
		return FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down the kill region to one page, we need to break up the pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from the freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 * We assume that this function is called with page lock held,
	 * so there is no race between isolation and mapping/unmapping.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return RECOVERED;
	}
	return DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define tail		(1UL << PG_tail)
#define compound	(1UL << PG_compound)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	char *msg;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	"reserved kernel",	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		"kernel slab",	me_kernel },

#ifdef CONFIG_PAGEFLAGS_EXTENDED
	{ head,		head,		"huge",		me_huge_page },
	{ tail,		tail,		"huge",		me_huge_page },
#else
	{ compound,	compound,	"huge",		me_huge_page },
#endif

	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		"unknown page state",	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef tail
#undef compound
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside the page lock. See also the comment above
 * set_page_dirty().
 */
static void action_result(unsigned long pfn, char *msg, int result)
{
	pr_err("MCE %#lx: %s page recovery: %s\n",
		pfn, msg, action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);
	action_result(pfn, ps->msg, result);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == DELAYED)
		count--;
	if (count != 0) {
		printk(KERN_ERR
		       "MCE %#lx: %s page still referenced by %d users\n",
		       pfn, ps->msg, count);
		result = FAILED;
	}

	/* Could do more checks here if page looks ok */

	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */
	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = compound_head(p);
	struct page *ppage;

	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p))
		return SWAP_FAIL;

	if (PageSwapCache(p)) {
		printk(KERN_ERR
		       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			printk(KERN_INFO
	"MCE %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * ppage: poisoned page
	 *   if p is a regular page (4k page),
	 *        ppage == real poisoned page;
	 *   else p is hugetlb or THP, ppage == head page.
	 */
	ppage = hpage;

	if (PageTransHuge(hpage)) {
		/*
		 * Verify that this isn't a hugetlbfs head page; the check for
		 * PageAnon is just to avoid tripping a split_huge_page
		 * internal debug check, as split_huge_page refuses to deal
		 * with anything that isn't an anon page. PageAnon can't go
		 * away from under us because we hold a refcount on the hpage,
		 * unlike as we do for tail hugetlbfs pages.
		 */
		if (!PageHuge(hpage) && PageAnon(hpage)) {
			if (unlikely(split_huge_page(hpage))) {
				/*
				 * FIXME: if splitting THP fails, it is
				 * better to stop the following operation
				 * rather than causing panic by unmapping.
				 * The system might survive if the page is
				 * freed later.
				 */
				printk(KERN_INFO
					"MCE %#lx: failed to split THP\n", pfn);

				BUG_ON(!PageHWPoison(p));
				return SWAP_FAIL;
			}
			/* THP is split, so ppage should be the real poisoned page. */
			ppage = p;
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(ppage, &tokill);

	if (hpage != ppage)
		lock_page(ppage);

	ret = try_to_unmap(ppage, ttu);
	if (ret != SWAP_SUCCESS)
		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
				pfn, page_mapcount(ppage));

	if (hpage != ppage)
		unlock_page(ppage);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

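/*
 * Hugepages are poisoned in hugepage units: set or clear PageHWPoison
 * on every subpage so the flag is seen regardless of which tail page
 * a later lookup lands on.
 */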
static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_trans_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_trans_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		printk(KERN_ERR
		       "MCE %#lx: memory outside kernel control\n",
		       pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
		return 0;
	}

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage units,
	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
	 * transparent hugepages, they are supposed to be split and error
	 * measurement is done in normal page units.  So nr_pages should be one
	 * in this case.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else	/* normal page or thp */
		nr_pages = 1;
	atomic_long_add(nr_pages, &num_poisoned_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage freelist,
	 *    so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up the page count from 0,
	 * as that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) &&
		!get_page_unless_zero(hpage)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, "free buddy", DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(hpage);
			if (!PageHWPoison(hpage)
			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != hpage && TestSetPageHWPoison(hpage))) {
				atomic_long_sub(nr_pages, &num_poisoned_pages);
				return 0;
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, "free huge",
				      res ? IGNORED : DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, "high order kernel", IGNORED);
			return -EBUSY;
		}
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __set_page_locked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	if (!PageHuge(p) && !PageTransTail(p)) {
		if (!PageLRU(p))
			shake_page(p, 0);
		if (!PageLRU(p)) {
			/*
			 * shake_page could have turned it free.
			 */
			if (is_free_buddy_page(p)) {
				action_result(pfn, "free buddy, 2nd try",
						DELAYED);
				return 0;
			}
			action_result(pfn, "non LRU", IGNORED);
			put_page(p);
			return -EBUSY;
		}
	}

	/*
	 * Lock the page and wait for writeback to finish.
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	lock_page(hpage);

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page
	 * status correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside the page lock
	 */
	if (!PageHWPoison(p)) {
		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
		res = 0;
		goto out;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &num_poisoned_pages);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * For error on a tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned.
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, "hugepage already hardware poisoned",
				IGNORED);
		unlock_page(hpage);
		put_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage units for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes an unmapped page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, "already truncated LRU", IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = -EBUSY;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;
	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

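/*
 * memory_failure() must run in process context, but machine check
 * exceptions are often raised in interrupt or NMI context.  The
 * helpers below therefore queue the faulting pfn into a small per-CPU
 * kfifo, and a work item drains the queue later from process context.
 */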
#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, &entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

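/* Drain this CPU's memory failure fifo, handling each queued page. */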
static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = &__get_cpu_var(memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

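/* Set up the per-CPU fifos and work items at boot. */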
static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to-be-unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for Linux-injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
		return 0;
	}

	nr_pages = 1 << compound_trans_order(page);

	if (!get_page_unless_zero(page)) {
		/*
		 * Since an HWPoisoned hugepage should have a non-zero
		 * refcount, a race between memory failure and unpoison
		 * seems to have happened.  In such a case unpoison fails
		 * and memory failure runs to the end.
		 */
		if (PageHuge(page)) {
			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			atomic_long_sub(nr_pages, &num_poisoned_pages);
		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of the page
	 * lock. That's acceptable because that won't trigger a kernel panic.
	 * Instead, the PG_hwpoison page will be caught and isolated on the
	 * entrance to the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
		atomic_long_sub(nr_pages, &num_poisoned_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_page(page);
	if (freeit)
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

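/*
 * Migration allocation callback for soft offline: allocate the
 * replacement page on the same node as the page being offlined.
 */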
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get the reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with an increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * The lock_memory_hotplug prevents a race with memory hotplug.
	 * This is a big hammer, a better fix would be nicer.
	 */
	lock_memory_hotplug();

	/*
	 * Isolate the page, so that it doesn't get reallocated if it
	 * was free. This flag should be kept set until the source page
	 * is freed and PG_hwpoison on it is set.
	 */
	set_migratetype_isolate(p, true);
	/*
	 * When the target page is a free hugepage, just remove it
	 * from the free hugepage list.
	 */
	if (!get_page_unless_zero(compound_head(p))) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	unlock_memory_hotplug();
	return ret;
}

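/*
 * Retry variant of __get_any_page(): if the page is neither a hugepage
 * nor on the LRU, shake it once to flush per-CPU lists and try again.
 */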
static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (!PageLRU(page)) {
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

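/*
 * Soft offline a hugepage: migrate its contents to a fresh hugepage,
 * then poison the original and remove it from the free lists.
 */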
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also the comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	/* Keep page count to indicate a given hugepage is isolated. */
	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC);
	put_page(hpage);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
	} else {
		set_page_hwpoison_huge_page(hpage);
		dequeue_hwpoisoned_huge_page(hpage);
		atomic_long_add(1 << compound_trans_order(hpage),
				&num_poisoned_pages);
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags);

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy decision (whether and when a page should be
 * taken offline) is made by the caller, typically based on a
 * corrected-error threshold.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_trans_head(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	if (!PageHuge(page) && PageTransHuge(hpage)) {
		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
			pr_info("soft offline: %#lx: failed to split THP\n",
				pfn);
			return -EBUSY;
		}
	}

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	if (ret) { /* for in-use pages */
		if (PageHuge(page))
			ret = soft_offline_huge_page(page, flags);
		else
			ret = __soft_offline_page(page, flags);
	} else { /* for free pages */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			atomic_long_add(1 << compound_trans_order(hpage),
					&num_poisoned_pages);
		} else {
			SetPageHWPoison(page);
			atomic_long_inc(&num_poisoned_pages);
		}
	}
	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
	return ret;
}

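/*
 * Soft offline a base page: try a lightweight page cache invalidation
 * first, and fall back to isolating and migrating the page.
 */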
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside the page lock because PageHWPoison
	 * is set by memory_failure() outside the page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside the page
	 * lock, so there's no race between soft_offline_page() and
	 * memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	/*
	 * Try to invalidate first. This should work for
	 * non-dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * RED-PEN: would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		atomic_long_inc(&num_poisoned_pages);
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	/*
	 * Drop the page reference taken by get_any_page(); a
	 * successful isolate_lru_page() already took another one.
	 */
	put_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			putback_lru_pages(&pagelist);
			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		} else {
			/*
			 * After page migration succeeds, the source page can
			 * be trapped in a pagevec and actual freeing is
			 * delayed.  Freeing code works differently based on
			 * PG_hwpoison, so there's a race. We need to make
			 * sure that the source page is freed back to buddy
			 * before setting PG_hwpoison.
			 */
			if (!is_free_buddy_page(page))
				lru_add_drain_all();
			if (!is_free_buddy_page(page))
				drain_all_pages();
			SetPageHWPoison(page);
			if (!is_free_buddy_page(page))
				pr_info("soft offline: %#lx: page leaked\n",
					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}