/*
 * Memory failure handling for pages reported as corrupted by the
 * hardware (HWPoison).
 *
 * Such pages are marked with PG_hwpoison and, as far as possible,
 * isolated so that the damage stays contained in the affected page:
 * clean page cache and swap cache pages are dropped, dirty pages are
 * unmapped and the owning processes get a SIGBUS, and free pages are
 * simply kept out of circulation.
 *
 * The main entry points are memory_failure() for synchronous handling,
 * memory_failure_queue() for deferring the work out of NMI-like
 * context, soft_offline_page() for preventively offlining pages that
 * are still readable, and unpoison_memory() for testing.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"

/*
 * 0: kill only the processes that actually access the corrupted page
 *    (late kill).
 * 1: send SIGBUS to all processes that have the page mapped as soon as
 *    the corruption is detected (early kill).
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/* When 0, panic on an uncorrected memory error instead of trying to recover. */
int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

/*
 * Filters used by the hwpoison injector to restrict error handling to
 * specific devices, page flag combinations or memory cgroups.
 */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * When set, only pages charged to the memory cgroup whose inode number
 * (in the cgroup filesystem) matches hwpoison_filter_memcg are handled.
 * This allows restricting hwpoison stress testing to a dedicated cgroup.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send a SIGBUS to a process that has the poisoned page mapped:
 * "action required" (BUS_MCEERR_AR) if the error happened in the current
 * execution context, "action optional" (BUS_MCEERR_AO) if the process is
 * not immediately affected by the error.
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered, try to flush it out of the
 * per-CPU LRU and page allocator caches so that it has a better chance
 * of turning into a state we know how to handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only shrink the node's slab caches if the access is not
	 * potentially fatal to the caller.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * page_address_in_vma() can fail when the page has already been
	 * unmapped from this VMA.  Keep the task on the list anyway, but
	 * mark the address invalid so that kill_procs() falls back to an
	 * unconditional SIGKILL when the unmap failed.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when forcekill is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when fail is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  bool fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
				      pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which will
 * be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Unlike the anonymous case we cannot tell here
			 * whether the page is really mapped into this VMA;
			 * the VMA only tells us it covers the file offset,
			 * so err on the side of notifying the task.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one to_kill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to free.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we
		 * have to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * Drop the page count elevated by isolate_lru_page().
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = MF_FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done, the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty pages or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);

	if (mapping) {
		/*
		 * The error is reported to the application through the
		 * AS_EIO flag in the address space: the next write(),
		 * fsync(), etc. on this mapping will return -EIO, so the
		 * application can learn that some of its dirty file data
		 * was lost.
		 *
		 * Open issue: AS_EIO is cleared by the first operation that
		 * reports it, unlike the stickier PageError bit, and since
		 * the page itself is dropped below, an application that
		 * only checks a later fsync() may miss the error.  This is
		 * no worse than other sources of metadata IO errors, so we
		 * assume the application does the right thing on the first
		 * EIO it sees.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);

	if (!PageHuge(hpage))
		return MF_DELAYED;

	/*
	 * We can safely recover from an error on a free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from the freelist.
	 * Whether a hugepage is free is approximated by checking that it
	 * is neither mapped nor anonymous; otherwise treat it as in use
	 * and delay the handling.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return MF_RECOVERED;
	}
	return MF_DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,	me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * action_result() logs (and traces) which recovery action was taken for a
 * page and with what result.  The "dirty"/"clean" part of the page type
 * string is a best-effort snapshot and may race with concurrent dirtying.
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count != 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/*
	 * Could do more checks here if the page looks ok.  The caller
	 * treats MF_RECOVERED and MF_DELAYED as success and everything
	 * else as -EBUSY.
	 */
	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
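
/*
 * get_hwpoison_page() above is used throughout this file instead of a bare
 * get_page(): it refuses to pin tail pages and non-anonymous transparent
 * huge pages, whose intermediate states this code cannot handle safely.
 */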

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from the PTEs to the struct page first,
	 * because we need it to decide whether to kill or just drop the
	 * page.  A clean page cache page can be dropped without any side
	 * effects as it can be re-read from backing storage.
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page mapped in dirty
	 * form.  This has to be done before try_to_unmap, because after that
	 * the rmap walk would no longer find them.
	 *
	 * Error handling: We ignore errors here because there's nothing
	 * that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put an mlocked page back into the LRU cache,
	 * so call shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);

	return unmap_success;
}

static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}
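
/*
 * Note: the two helpers above set/clear PG_hwpoison on every subpage of a
 * hugepage because error containment currently works in hugepage units;
 * see the comments in memory_failure() below.
 */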

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	orig_head = hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage units,
	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
	 * transparent hugepages, they are supposed to be split and error
	 * measurement is done in normal page units.  So nr_pages should be
	 * one in this case.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else /* normal page or thp */
		nr_pages = 1;
	num_poisoned_pages_add(nr_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage
	 *    freelist, so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					num_poisoned_pages_sub(nr_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, MF_MSG_FREE_HUGE,
				      res ? MF_IGNORED : MF_DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (!PageHuge(p) && PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_sub(nr_pages);
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We only handle LRU pages (and hugepages) from here on: PG_lru
	 * tells us the page is a page cache or anonymous page this code
	 * knows how to handle, and without it we could race with all kinds
	 * of other page users.  shake_page() flushes the per-CPU caches so
	 * that a page only held there gets a chance to show up on the LRU
	 * (or turn into a free buddy page) before we give up on it.
	 */
	shake_page(p, 0);

	/* shake_page() could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
	}

	lock_page(hpage);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page
	 * status correctly, we save a copy of the page flags at this time.
	 */
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * For an error on a tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned.
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, e.g. by filesystems with callbacks.
	 * So wait for the IO to complete before continuing.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is a thp tail page, hpage points to the
	 * raw page after thp split.
	 */
	if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;
	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
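
/*
 * The per-CPU kfifo above lets hardware error handlers that run in an
 * NMI-like context hand a (pfn, trapno, flags) triple over to process
 * context: memory_failure_queue() pushes an entry and schedules the work
 * item, and memory_failure_work_func() later drains the fifo and calls
 * memory_failure() or soft_offline_page() for each entry.
 */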

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovery of the error so that it can be handled later in normal
 * process context instead of the NMI-like context in which the error
 * is typically detected.
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
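
/*
 * As a rough usage sketch (not code from this file): an architecture's
 * corrected-error or machine-check handler that runs with interrupts
 * disabled would typically just do
 *
 *	memory_failure_queue(pfn, 0, 0);
 *
 * and let the work item invoke memory_failure() later in process context,
 * whereas a handler already running in process context may call
 * memory_failure() directly.
 */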

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked on by memory_failure() and the page lock is not held yet.
	 * In such a case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_hwpoison_page(p)) {
		/*
		 * Since a HWPoisoned hugepage should have a non-zero refcount,
		 * a race between memory failure and unpoison seems to have
		 * happened.  In such a case unpoison fails and memory failure
		 * runs to the end.
		 */
		if (PageHuge(page)) {
			unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
					 pfn, &unpoison_rs);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_sub(nr_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
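
/*
 * new_page() above is the allocation callback handed to migrate_pages() by
 * the soft-offline paths below: it allocates the replacement page on the
 * same node as the page being offlined, using a hugepage from the hstate
 * pool when the source is a hugetlb page.
 */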

/*
 * Safely get the reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * If we can't pin the page it is either genuinely free (a free
	 * buddy or free hugetlb page) or has a zero refcount for some
	 * other reason we cannot handle.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) &&
	    !PageLRU(page) && !__PageMovable(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
				pfn, page->flags, &page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() take a refcount each,
	 * so drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
			pfn, ret, page->flags, &page->flags);
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
		if (ret > 0)
			ret = -EIO;
	} else {
		/* overcommitted hugetlb page will be freed to buddy */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			num_poisoned_pages_add(1 << compound_order(hpage));
		} else {
			SetPageHWPoison(page);
			num_poisoned_pages_inc();
		}
	}
	return ret;
}
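
/*
 * __soft_offline_page() handles the base (non-hugetlb) page case: it first
 * tries to simply invalidate a clean page cache page, and only if that is
 * not possible falls back to migrating the page away.  The caller must have
 * taken a page reference via get_any_page().
 */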

static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	if (PageLRU(page))
		ret = isolate_lru_page(page);
	else
		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
	/*
	 * Drop the page reference taken by get_any_page(); a successful
	 * isolate_lru_page() already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		/*
		 * After isolation the LRU flag is cleared, so check
		 * !__PageMovable() instead: an LRU page's mapping cannot
		 * have PAGE_MAPPING_MOVABLE set.
		 */
		if (!__PageMovable(page))
			inc_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
				pfn, ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
			pfn, ret, page_count(page), page->flags, &page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);

	return ret;
}

static void soft_offline_free_page(struct page *page)
{
	if (PageHuge(page)) {
		struct page *hpage = compound_head(page);

		set_page_hwpoison_huge_page(hpage);
		if (!dequeue_hwpoisoned_huge_page(hpage))
			num_poisoned_pages_add(1 << compound_order(hpage));
	} else {
		if (!TestSetPageHWPoison(page))
			num_poisoned_pages_inc();
	}
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		soft_offline_free_page(page);

	return ret;
}