/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or
 * cache failure.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
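
/*
 * The two sysctls above are exposed as /proc/sys/vm/memory_failure_early_kill
 * and /proc/sys/vm/memory_failure_recovery, and num_poisoned_pages is what
 * /proc/meminfo reports as "HardwareCorrupted" (in kB). A minimal userspace
 * sketch (not part of this file) for reading that counter:
 *
 *	#include <stdio.h>
 *
 *	static long hardware_corrupted_kb(void)
 *	{
 *		FILE *f = fopen("/proc/meminfo", "r");
 *		char line[256];
 *		long kb = -1;
 *
 *		if (!f)
 *			return -1;
 *		while (fgets(line, sizeof(line), f))
 *			if (sscanf(line, "HardwareCorrupted: %ld kB", &kb) == 1)
 *				break;
 *		fclose(f);
 *		return kb;
 *	}
 */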
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * Restrict hwpoison injection to a single memory cgroup: when
 * hwpoison_filter_memcg is set to the inode number of a memcg, only pages
 * charged to that memcg pass the filter. Useful for stress testing error
 * handling against a confined test workload.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);
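
/*
 * A minimal sketch (in the spirit of the debugfs injector in
 * mm/hwpoison-inject.c, not a copy of it) of how an error injector consults
 * hwpoison_filter() so that only pages matching the knobs above are poisoned.
 * The function name is illustrative only and nothing calls it here.
 */
static int __maybe_unused hwpoison_inject_example(unsigned long pfn)
{
	struct page *p;
	int err;

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	if (!get_hwpoison_page(p)) {
		/* Page was free: nothing to filter on, poison it directly. */
		return memory_failure(pfn, 0);
	}

	err = hwpoison_filter(p);
	put_hwpoison_page(p);
	if (err)
		return 0;	/* filtered out: silently ignore */

	pr_info("Injecting memory failure at pfn %#lx\n", pfn);
	return memory_failure(pfn, 0);
}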

/*
 * Send a SIGBUS to a process that has the corrupted page mapped:
 * BUS_MCEERR_AR ("action required") if the error hit the current execution
 * context, BUS_MCEERR_AO ("action optional") for everybody else.
 */
static int kill_proc(struct task_struct *t, unsigned long addr,
		     unsigned long pfn, struct page *page, int flags)
{
	short addr_lsb;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
				       addr_lsb, current);
	} else {
		/*
		 * Action-optional notifications are sent without force:
		 * the target may have blocked or ignored SIGBUS on purpose,
		 * and the error did not happen in its execution context,
		 * so do not override that choice.
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
				      addr_lsb, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}
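
/*
 * On the receiving end, a process that wants to survive these signals
 * installs a SIGBUS handler with SA_SIGINFO and inspects si_code
 * (BUS_MCEERR_AR vs. BUS_MCEERR_AO), si_addr and si_addr_lsb. A userspace
 * sketch, not part of this file (a real handler must stick to
 * async-signal-safe functions):
 *
 *	#define _GNU_SOURCE
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		unsigned long bad_len = 1UL << si->si_addr_lsb;
 *
 *		if (si->si_code == BUS_MCEERR_AO) {
 *			// advisory: bad_len bytes at si->si_addr are lost;
 *			// drop or rebuild that range, then carry on
 *			return;
 *		}
 *		// BUS_MCEERR_AR: the faulting access cannot be completed
 *		_exit(1);
 *	}
 *
 *	int main(void)
 *	{
 *		struct sigaction sa = { .sa_flags = SA_SIGINFO };
 *
 *		sa.sa_sigaction = sigbus_handler;
 *		sigaction(SIGBUS, &sa, NULL);
 *		pause();
 *		return 0;
 *	}
 */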

/*
 * When an unknown page type is encountered, drain as many buffers as possible
 * in the hope of turning the page into an LRU or free page, which we can
 * handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_node_slabs here (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmaped. But it could be also a mremap. Since that's
	 * likely very rare kill anyways just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill,
			  bool fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr,
					   pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the dedicated thread
 * if found, return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}
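
/*
 * The PF_MCE_PROCESS/PF_MCE_EARLY flags tested above are set from userspace
 * with prctl(PR_MCE_KILL). A process that prefers an early SIGBUS
 * (BUS_MCEERR_AO) over being killed lazily opts in like this (userspace
 * sketch, not part of this file; PR_MCE_KILL_GET reads the policy back):
 *
 *	#include <stdio.h>
 *	#include <sys/prctl.h>
 *
 *	int main(void)
 *	{
 *		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY,
 *			  0, 0)) {
 *			perror("prctl(PR_MCE_KILL)");
 *			return 1;
 *		}
 *		return 0;
 *	}
 */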
395
396
397
398
399static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct to_kill **tkc, int force_early)
401{
402 struct vm_area_struct *vma;
403 struct task_struct *tsk;
404 struct anon_vma *av;
405 pgoff_t pgoff;
406
407 av = page_lock_anon_vma_read(page);
408 if (av == NULL)
409 return;
410
411 pgoff = page_to_pgoff(page);
412 read_lock(&tasklist_lock);
413 for_each_process (tsk) {
414 struct anon_vma_chain *vmac;
415 struct task_struct *t = task_early_kill(tsk, force_early);
416
417 if (!t)
418 continue;
419 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
420 pgoff, pgoff) {
421 vma = vmac->vma;
422 if (!page_mapped_in_vma(page, vma))
423 continue;
424 if (vma->vm_mm == t->mm)
425 add_to_kill(t, page, vma, to_kill, tkc);
426 }
427 }
428 read_unlock(&tasklist_lock);
429 page_unlock_anon_vma_read(av);
430}
431
432
433
434
435static void collect_procs_file(struct page *page, struct list_head *to_kill,
436 struct to_kill **tkc, int force_early)
437{
438 struct vm_area_struct *vma;
439 struct task_struct *tsk;
440 struct address_space *mapping = page->mapping;
441
442 i_mmap_lock_read(mapping);
443 read_lock(&tasklist_lock);
444 for_each_process(tsk) {
445 pgoff_t pgoff = page_to_pgoff(page);
446 struct task_struct *t = task_early_kill(tsk, force_early);
447
448 if (!t)
449 continue;
450 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
451 pgoff) {
452
453
454
455
456
457
458
459 if (vma->vm_mm == t->mm)
460 add_to_kill(t, page, vma, to_kill, tkc);
461 }
462 }
463 read_unlock(&tasklist_lock);
464 i_mmap_unlock_read(mapping);
465}
466
467
468
469
470
471
472
473static void collect_procs(struct page *page, struct list_head *tokill,
474 int force_early)
475{
476 struct to_kill *tk;
477
478 if (!page->mapping)
479 return;
480
481 tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
482 if (!tk)
483 return;
484 if (PageAnon(page))
485 collect_procs_anon(page, tokill, &tk, force_early);
486 else
487 collect_procs_file(page, tokill, &tk, force_early);
488 kfree(tk);
489}
490
491static const char *action_name[] = {
492 [MF_IGNORED] = "Ignored",
493 [MF_FAILED] = "Failed",
494 [MF_DELAYED] = "Delayed",
495 [MF_RECOVERED] = "Recovered",
496};
497
498static const char * const action_page_types[] = {
499 [MF_MSG_KERNEL] = "reserved kernel page",
500 [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
501 [MF_MSG_SLAB] = "kernel slab page",
502 [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
503 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
504 [MF_MSG_HUGE] = "huge page",
505 [MF_MSG_FREE_HUGE] = "free huge page",
506 [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
507 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
508 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
509 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
510 [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
511 [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
512 [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
513 [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
514 [MF_MSG_DIRTY_LRU] = "dirty LRU page",
515 [MF_MSG_CLEAN_LRU] = "clean LRU page",
516 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
517 [MF_MSG_BUDDY] = "free buddy page",
518 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
519 [MF_MSG_UNKNOWN] = "unknown page",
520};
521
522
523
524
525
526
527
528static int delete_from_lru_cache(struct page *p)
529{
530 if (!isolate_lru_page(p)) {
531
532
533
534
535 ClearPageActive(p);
536 ClearPageUnevictable(p);
537
538
539
540
541
542 mem_cgroup_uncharge(p);
543
544
545
546
547 put_page(p);
548 return 0;
549 }
550 return -EIO;
551}
552
553static int truncate_error_page(struct page *p, unsigned long pfn,
554 struct address_space *mapping)
555{
556 int ret = MF_FAILED;
557
558 if (mapping->a_ops->error_remove_page) {
559 int err = mapping->a_ops->error_remove_page(mapping, p);
560
561 if (err != 0) {
562 pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
563 pfn, err);
564 } else if (page_has_private(p) &&
565 !try_to_release_page(p, GFP_NOIO)) {
566 pr_info("Memory failure: %#lx: failed to release buffers\n",
567 pfn);
568 } else {
569 ret = MF_RECOVERED;
570 }
571 } else {
572
573
574
575
576 if (invalidate_inode_page(p))
577 ret = MF_RECOVERED;
578 else
579 pr_info("Memory failure: %#lx: Failed to invalidate\n",
580 pfn);
581 }
582
583 return ret;
584}
585
586
587
588
589
590
591static int me_kernel(struct page *p, unsigned long pfn)
592{
593 return MF_IGNORED;
594}
595
596
597
598
599static int me_unknown(struct page *p, unsigned long pfn)
600{
601 pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
602 return MF_FAILED;
603}
604
605
606
607
608static int me_pagecache_clean(struct page *p, unsigned long pfn)
609{
610 struct address_space *mapping;
611
612 delete_from_lru_cache(p);
613
614
615
616
617
618 if (PageAnon(p))
619 return MF_RECOVERED;
620
621
622
623
624
625
626
627
628 mapping = page_mapping(p);
629 if (!mapping) {
630
631
632
633 return MF_FAILED;
634 }
635
636
637
638
639
640
641 return truncate_error_page(p, pfn, mapping);
642}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);

	if (mapping) {
		/*
		 * The IO error will be reported to the application by the
		 * next write(), fsync(), etc. that checks this mapping:
		 * since the dirty data is dropped here, AS_EIO on the
		 * address space is the only channel left to report it.
		 *
		 * Open issue: AS_EIO is cleared by the first operation
		 * that sees it, unlike the stickier PageError bit, so an
		 * application that does other I/O on the fd before fsync()
		 * may never observe the error.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Dirty swap cache page is tricky to handle. The page could live both in
 * page cache and swap cache (i.e. the page was freshly swapped in), so it
 * can be referenced concurrently by two types of PTEs: normal PTEs and swap
 * PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *	- clear the dirty bit to prevent IO
 *	- remove from the LRU
 *	- but keep it in the swap cache, so that when we return to it on
 *	  a later page fault, we know the application is accessing
 *	  corrupted data and shall be killed (interception code in
 *	  do_swap_page catches it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}
736
737
738
739
740
741
742
743static int me_huge_page(struct page *p, unsigned long pfn)
744{
745 int res = 0;
746 struct page *hpage = compound_head(p);
747 struct address_space *mapping;
748
749 if (!PageHuge(hpage))
750 return MF_DELAYED;
751
752 mapping = page_mapping(hpage);
753 if (mapping) {
754 res = truncate_error_page(hpage, pfn, mapping);
755 } else {
756 unlock_page(hpage);
757
758
759
760
761
762 if (PageAnon(hpage))
763 put_page(hpage);
764 dissolve_free_huge_page(p);
765 res = MF_RECOVERED;
766 lock_page(hpage);
767 }
768
769 return res;
770}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access a page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */
785#define dirty (1UL << PG_dirty)
786#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
787#define unevict (1UL << PG_unevictable)
788#define mlock (1UL << PG_mlocked)
789#define writeback (1UL << PG_writeback)
790#define lru (1UL << PG_lru)
791#define head (1UL << PG_head)
792#define slab (1UL << PG_slab)
793#define reserved (1UL << PG_reserved)
794
795static struct page_state {
796 unsigned long mask;
797 unsigned long res;
798 enum mf_action_page_type type;
799 int (*action)(struct page *p, unsigned long pfn);
800} error_states[] = {
801 { reserved, reserved, MF_MSG_KERNEL, me_kernel },
802
803
804
805
806
807
808
809
810
811
812 { slab, slab, MF_MSG_SLAB, me_kernel },
813
814 { head, head, MF_MSG_HUGE, me_huge_page },
815
816 { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
817 { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
818
819 { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
820 { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
821
822 { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
823 { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
824
825 { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
826 { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
827
828
829
830
831 { 0, 0, MF_MSG_UNKNOWN, me_unknown },
832};
833
834#undef dirty
835#undef sc
836#undef unevict
837#undef mlock
838#undef writeback
839#undef lru
840#undef head
841#undef slab
842#undef reserved
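
/*
 * To make the matching rule concrete: identify_page_state() walks
 * error_states[] in order and picks the first entry whose masked flag bits
 * equal ->res, with the all-zero terminator acting as a catch-all. A
 * standalone illustration of that lookup (not kernel code; names made up):
 *
 *	#define DEMO_DIRTY	(1UL << 0)
 *	#define DEMO_LRU	(1UL << 1)
 *
 *	struct demo_state { unsigned long mask, res; const char *name; };
 *
 *	static const struct demo_state demo_states[] = {
 *		{ DEMO_LRU | DEMO_DIRTY, DEMO_LRU | DEMO_DIRTY, "dirty LRU" },
 *		{ DEMO_LRU | DEMO_DIRTY, DEMO_LRU,              "clean LRU" },
 *		{ 0, 0, "unknown" },	// mask 0 matches anything
 *	};
 *
 *	static const char *demo_classify(unsigned long flags)
 *	{
 *		const struct demo_state *ps;
 *
 *		for (ps = demo_states; ; ps++)
 *			if ((flags & ps->mask) == ps->res)
 *				return ps->name;
 *	}
 *
 * demo_classify(DEMO_LRU) returns "clean LRU": the first entry fails because
 * DEMO_DIRTY is in its mask but not set in flags.
 */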
843
844
845
846
847
848static void action_result(unsigned long pfn, enum mf_action_page_type type,
849 enum mf_result result)
850{
851 trace_memory_failure_event(pfn, type, result);
852
853 pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
854 pfn, action_page_types[type], action_name[result]);
855}
856
857static int page_action(struct page_state *ps, struct page *p,
858 unsigned long pfn)
859{
860 int result;
861 int count;
862
863 result = ps->action(p, pfn);
864
865 count = page_count(p) - 1;
866 if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
867 count--;
868 if (count > 0) {
869 pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
870 pfn, action_page_types[ps->type], count);
871 result = MF_FAILED;
872 }
873 action_result(pfn, ps->type, result);
874
875
876
877
878
879
880 return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
881}
882
883
884
885
886
887
888
889
890int get_hwpoison_page(struct page *page)
891{
892 struct page *head = compound_head(page);
893
894 if (!PageHuge(head) && PageTransHuge(head)) {
895
896
897
898
899
900
901 if (!PageAnon(head)) {
902 pr_err("Memory failure: %#lx: non anonymous thp\n",
903 page_to_pfn(page));
904 return 0;
905 }
906 }
907
908 if (get_page_unless_zero(head)) {
909 if (head == compound_head(page))
910 return 1;
911
912 pr_info("Memory failure: %#lx cannot catch tail\n",
913 page_to_pfn(page));
914 put_page(head);
915 }
916
917 return 0;
918}
919EXPORT_SYMBOL_GPL(get_hwpoison_page);
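
/*
 * Typical caller pattern for the helper above (sketch only): try to pin the
 * page unless the caller already holds a reference (MF_COUNT_INCREASED), and
 * drop it with put_hwpoison_page() when done; put_hwpoison_page() is the
 * matching release helper from <linux/mm.h> and is simply put_page() here.
 *
 *	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
 *		// no reference could be taken: the page is free (or an
 *		// unhandleable THP tail) and must go through the
 *		// free-page paths instead
 *	}
 *	...
 *	put_hwpoison_page(p);
 */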
920
921
922
923
924
925static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
926 int flags, struct page **hpagep)
927{
928 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
929 struct address_space *mapping;
930 LIST_HEAD(tokill);
931 bool unmap_success;
932 int kill = 1, forcekill;
933 struct page *hpage = *hpagep;
934 bool mlocked = PageMlocked(hpage);
935
936
937
938
939
940 if (PageReserved(p) || PageSlab(p))
941 return true;
942 if (!(PageLRU(hpage) || PageHuge(p)))
943 return true;
944
945
946
947
948
949 if (!page_mapped(hpage))
950 return true;
951
952 if (PageKsm(p)) {
953 pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
954 return false;
955 }
956
957 if (PageSwapCache(p)) {
958 pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
959 pfn);
960 ttu |= TTU_IGNORE_HWPOISON;
961 }
962
963
964
965
966
967
968
969 mapping = page_mapping(hpage);
970 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
971 mapping_cap_writeback_dirty(mapping)) {
972 if (page_mkclean(hpage)) {
973 SetPageDirty(hpage);
974 } else {
975 kill = 0;
976 ttu |= TTU_IGNORE_HWPOISON;
977 pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
978 pfn);
979 }
980 }
981
982
983
984
985
986
987
988
989
990 if (kill)
991 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
992
993 unmap_success = try_to_unmap(hpage, ttu);
994 if (!unmap_success)
995 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
996 pfn, page_mapcount(hpage));
997
998
999
1000
1001
1002 if (mlocked)
1003 shake_page(hpage, 0);
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1016 kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);
1017
1018 return unmap_success;
1019}
1020
1021static int identify_page_state(unsigned long pfn, struct page *p,
1022 unsigned long page_flags)
1023{
1024 struct page_state *ps;
1025
1026
1027
1028
1029
1030
1031 for (ps = error_states;; ps++)
1032 if ((p->flags & ps->mask) == ps->res)
1033 break;
1034
1035 page_flags |= (p->flags & (1UL << PG_dirty));
1036
1037 if (!ps->mask)
1038 for (ps = error_states;; ps++)
1039 if ((page_flags & ps->mask) == ps->res)
1040 break;
1041 return page_action(ps, p, pfn);
1042}
1043
1044static int memory_failure_hugetlb(unsigned long pfn, int flags)
1045{
1046 struct page *p = pfn_to_page(pfn);
1047 struct page *head = compound_head(p);
1048 int res;
1049 unsigned long page_flags;
1050
1051 if (TestSetPageHWPoison(head)) {
1052 pr_err("Memory failure: %#lx: already hardware poisoned\n",
1053 pfn);
1054 return 0;
1055 }
1056
1057 num_poisoned_pages_inc();
1058
1059 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1060
1061
1062
1063 lock_page(head);
1064 if (PageHWPoison(head)) {
1065 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1066 || (p != head && TestSetPageHWPoison(head))) {
1067 num_poisoned_pages_dec();
1068 unlock_page(head);
1069 return 0;
1070 }
1071 }
1072 unlock_page(head);
1073 dissolve_free_huge_page(p);
1074 action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
1075 return 0;
1076 }
1077
1078 lock_page(head);
1079 page_flags = head->flags;
1080
1081 if (!PageHWPoison(head)) {
1082 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1083 num_poisoned_pages_dec();
1084 unlock_page(head);
1085 put_hwpoison_page(head);
1086 return 0;
1087 }
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098 if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
1099 action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
1100 res = -EBUSY;
1101 goto out;
1102 }
1103
1104 if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
1105 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1106 res = -EBUSY;
1107 goto out;
1108 }
1109
1110 res = identify_page_state(pfn, p, page_flags);
1111out:
1112 unlock_page(head);
1113 return res;
1114}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
1133int memory_failure(unsigned long pfn, int flags)
1134{
1135 struct page *p;
1136 struct page *hpage;
1137 struct page *orig_head;
1138 int res;
1139 unsigned long page_flags;
1140
1141 if (!sysctl_memory_failure_recovery)
1142 panic("Memory failure on page %lx", pfn);
1143
1144 if (!pfn_valid(pfn)) {
1145 pr_err("Memory failure: %#lx: memory outside kernel control\n",
1146 pfn);
1147 return -ENXIO;
1148 }
1149
1150 p = pfn_to_page(pfn);
1151 if (PageHuge(p))
1152 return memory_failure_hugetlb(pfn, flags);
1153 if (TestSetPageHWPoison(p)) {
1154 pr_err("Memory failure: %#lx: already hardware poisoned\n",
1155 pfn);
1156 return 0;
1157 }
1158
1159 orig_head = hpage = compound_head(p);
1160 num_poisoned_pages_inc();
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1174 if (is_free_buddy_page(p)) {
1175 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1176 return 0;
1177 } else {
1178 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1179 return -EBUSY;
1180 }
1181 }
1182
1183 if (PageTransHuge(hpage)) {
1184 lock_page(p);
1185 if (!PageAnon(p) || unlikely(split_huge_page(p))) {
1186 unlock_page(p);
1187 if (!PageAnon(p))
1188 pr_err("Memory failure: %#lx: non anonymous thp\n",
1189 pfn);
1190 else
1191 pr_err("Memory failure: %#lx: thp split failed\n",
1192 pfn);
1193 if (TestClearPageHWPoison(p))
1194 num_poisoned_pages_dec();
1195 put_hwpoison_page(p);
1196 return -EBUSY;
1197 }
1198 unlock_page(p);
1199 VM_BUG_ON_PAGE(!page_count(p), p);
1200 hpage = compound_head(p);
1201 }
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211 shake_page(p, 0);
1212
1213 if (!PageLRU(p) && is_free_buddy_page(p)) {
1214 if (flags & MF_COUNT_INCREASED)
1215 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1216 else
1217 action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
1218 return 0;
1219 }
1220
1221 lock_page(p);
1222
1223
1224
1225
1226
1227 if (PageCompound(p) && compound_head(p) != orig_head) {
1228 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1229 res = -EBUSY;
1230 goto out;
1231 }
1232
1233
1234
1235
1236
1237
1238
1239
1240 if (PageHuge(p))
1241 page_flags = hpage->flags;
1242 else
1243 page_flags = p->flags;
1244
1245
1246
1247
1248 if (!PageHWPoison(p)) {
1249 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1250 num_poisoned_pages_dec();
1251 unlock_page(p);
1252 put_hwpoison_page(p);
1253 return 0;
1254 }
1255 if (hwpoison_filter(p)) {
1256 if (TestClearPageHWPoison(p))
1257 num_poisoned_pages_dec();
1258 unlock_page(p);
1259 put_hwpoison_page(p);
1260 return 0;
1261 }
1262
1263 if (!PageTransTail(p) && !PageLRU(p))
1264 goto identify_page_state;
1265
1266
1267
1268
1269
1270 wait_on_page_writeback(p);
1271
1272
1273
1274
1275
1276
1277
1278
1279 if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1280 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1281 res = -EBUSY;
1282 goto out;
1283 }
1284
1285
1286
1287
1288 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1289 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1290 res = -EBUSY;
1291 goto out;
1292 }
1293
1294identify_page_state:
1295 res = identify_page_state(pfn, p, page_flags);
1296out:
1297 unlock_page(p);
1298 return res;
1299}
1300EXPORT_SYMBOL_GPL(memory_failure);
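
/*
 * Besides the architecture machine-check paths, memory_failure() can be
 * exercised from userspace (CAP_SYS_ADMIN, CONFIG_MEMORY_FAILURE) through
 * madvise(MADV_HWPOISON), which poisons one page of the caller's own
 * mapping. Userspace test sketch, not part of this file:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		long sz = sysconf(_SC_PAGESIZE);
 *		char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		memset(buf, 0xaa, sz);			// back the page
 *		if (madvise(buf, sz, MADV_HWPOISON)) {	// needs CAP_SYS_ADMIN
 *			perror("madvise(MADV_HWPOISON)");
 *			return 1;
 *		}
 *		printf("poisoned %p; the next touch raises SIGBUS\n", buf);
 *		return 0;
 *	}
 */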
1301
1302#define MEMORY_FAILURE_FIFO_ORDER 4
1303#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1304
1305struct memory_failure_entry {
1306 unsigned long pfn;
1307 int flags;
1308};
1309
1310struct memory_failure_cpu {
1311 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1312 MEMORY_FAILURE_FIFO_SIZE);
1313 spinlock_t lock;
1314 struct work_struct work;
1315};
1316
1317static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of memory failure; if recovery cannot be done,
 * abort.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
1335void memory_failure_queue(unsigned long pfn, int flags)
1336{
1337 struct memory_failure_cpu *mf_cpu;
1338 unsigned long proc_flags;
1339 struct memory_failure_entry entry = {
1340 .pfn = pfn,
1341 .flags = flags,
1342 };
1343
1344 mf_cpu = &get_cpu_var(memory_failure_cpu);
1345 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1346 if (kfifo_put(&mf_cpu->fifo, entry))
1347 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1348 else
1349 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1350 pfn);
1351 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1352 put_cpu_var(memory_failure_cpu);
1353}
1354EXPORT_SYMBOL_GPL(memory_failure_queue);
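
/*
 * memory_failure() itself must run in process context, so error sources that
 * fire in interrupt or NMI context (machine-check handlers, firmware error
 * notifications) report the pfn through memory_failure_queue() and let the
 * per-CPU work item (memory_failure_work_func() below) do the real handling.
 * A sketch of such a caller; the function name and the way the physical
 * address arrives are illustrative only, and nothing calls it here:
 */
static void __maybe_unused example_report_corrupted_paddr(u64 paddr)
{
	unsigned long pfn = paddr >> PAGE_SHIFT;

	if (!pfn_valid(pfn))
		return;

	/* 0: neither MF_ACTION_REQUIRED nor MF_SOFT_OFFLINE */
	memory_failure_queue(pfn, 0);
}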
1355
1356static void memory_failure_work_func(struct work_struct *work)
1357{
1358 struct memory_failure_cpu *mf_cpu;
1359 struct memory_failure_entry entry = { 0, };
1360 unsigned long proc_flags;
1361 int gotten;
1362
1363 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1364 for (;;) {
1365 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1366 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1367 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1368 if (!gotten)
1369 break;
1370 if (entry.flags & MF_SOFT_OFFLINE)
1371 soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1372 else
1373 memory_failure(entry.pfn, entry.flags);
1374 }
1375}
1376
1377static int __init memory_failure_init(void)
1378{
1379 struct memory_failure_cpu *mf_cpu;
1380 int cpu;
1381
1382 for_each_possible_cpu(cpu) {
1383 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1384 spin_lock_init(&mf_cpu->lock);
1385 INIT_KFIFO(mf_cpu->fifo);
1386 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1387 }
1388
1389 return 0;
1390}
1391core_initcall(memory_failure_init);
1392
1393#define unpoison_pr_info(fmt, pfn, rs) \
1394({ \
1395 if (__ratelimit(rs)) \
1396 pr_info(fmt, pfn); \
1397})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
1411int unpoison_memory(unsigned long pfn)
1412{
1413 struct page *page;
1414 struct page *p;
1415 int freeit = 0;
1416 static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
1417 DEFAULT_RATELIMIT_BURST);
1418
1419 if (!pfn_valid(pfn))
1420 return -ENXIO;
1421
1422 p = pfn_to_page(pfn);
1423 page = compound_head(p);
1424
1425 if (!PageHWPoison(p)) {
1426 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
1427 pfn, &unpoison_rs);
1428 return 0;
1429 }
1430
1431 if (page_count(page) > 1) {
1432 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
1433 pfn, &unpoison_rs);
1434 return 0;
1435 }
1436
1437 if (page_mapped(page)) {
1438 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
1439 pfn, &unpoison_rs);
1440 return 0;
1441 }
1442
1443 if (page_mapping(page)) {
1444 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
1445 pfn, &unpoison_rs);
1446 return 0;
1447 }
1448
1449
1450
1451
1452
1453
1454 if (!PageHuge(page) && PageTransHuge(page)) {
1455 unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
1456 pfn, &unpoison_rs);
1457 return 0;
1458 }
1459
1460 if (!get_hwpoison_page(p)) {
1461 if (TestClearPageHWPoison(p))
1462 num_poisoned_pages_dec();
1463 unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
1464 pfn, &unpoison_rs);
1465 return 0;
1466 }
1467
1468 lock_page(page);
1469
1470
1471
1472
1473
1474
1475 if (TestClearPageHWPoison(page)) {
1476 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
1477 pfn, &unpoison_rs);
1478 num_poisoned_pages_dec();
1479 freeit = 1;
1480 }
1481 unlock_page(page);
1482
1483 put_hwpoison_page(page);
1484 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1485 put_hwpoison_page(page);
1486
1487 return 0;
1488}
1489EXPORT_SYMBOL(unpoison_memory);
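
/*
 * unpoison_memory() is normally reached by writing a pfn to the debugfs file
 * provided by the hwpoison injector module,
 * /sys/kernel/debug/hwpoison/unpoison-pfn, and only makes sense for pages
 * that were software-poisoned for testing. Userspace sketch, not part of
 * this file:
 *
 *	#include <stdio.h>
 *
 *	static int unpoison_pfn(unsigned long pfn)
 *	{
 *		FILE *f = fopen("/sys/kernel/debug/hwpoison/unpoison-pfn", "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%#lx\n", pfn);
 *		return fclose(f);
 *	}
 */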
1490
1491static struct page *new_page(struct page *p, unsigned long private)
1492{
1493 int nid = page_to_nid(p);
1494
1495 return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
1496}
1497
1498
1499
1500
1501
1502
1503
1504static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1505{
1506 int ret;
1507
1508 if (flags & MF_COUNT_INCREASED)
1509 return 1;
1510
1511
1512
1513
1514
1515 if (!get_hwpoison_page(p)) {
1516 if (PageHuge(p)) {
1517 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1518 ret = 0;
1519 } else if (is_free_buddy_page(p)) {
1520 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1521 ret = 0;
1522 } else {
1523 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1524 __func__, pfn, p->flags);
1525 ret = -EIO;
1526 }
1527 } else {
1528
1529 ret = 1;
1530 }
1531 return ret;
1532}
1533
1534static int get_any_page(struct page *page, unsigned long pfn, int flags)
1535{
1536 int ret = __get_any_page(page, pfn, flags);
1537
1538 if (ret == 1 && !PageHuge(page) &&
1539 !PageLRU(page) && !__PageMovable(page)) {
1540
1541
1542
1543 put_hwpoison_page(page);
1544 shake_page(page, 1);
1545
1546
1547
1548
1549 ret = __get_any_page(page, pfn, 0);
1550 if (ret == 1 && !PageLRU(page)) {
1551
1552 put_hwpoison_page(page);
1553 pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
1554 pfn, page->flags, &page->flags);
1555 return -EIO;
1556 }
1557 }
1558 return ret;
1559}
1560
1561static int soft_offline_huge_page(struct page *page, int flags)
1562{
1563 int ret;
1564 unsigned long pfn = page_to_pfn(page);
1565 struct page *hpage = compound_head(page);
1566 LIST_HEAD(pagelist);
1567
1568
1569
1570
1571
1572 lock_page(hpage);
1573 if (PageHWPoison(hpage)) {
1574 unlock_page(hpage);
1575 put_hwpoison_page(hpage);
1576 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1577 return -EBUSY;
1578 }
1579 unlock_page(hpage);
1580
1581 ret = isolate_huge_page(hpage, &pagelist);
1582
1583
1584
1585
1586 put_hwpoison_page(hpage);
1587 if (!ret) {
1588 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1589 return -EBUSY;
1590 }
1591
1592 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1593 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1594 if (ret) {
1595 pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1596 pfn, ret, page->flags, &page->flags);
1597 if (!list_empty(&pagelist))
1598 putback_movable_pages(&pagelist);
1599 if (ret > 0)
1600 ret = -EIO;
1601 } else {
1602
1603
1604
1605
1606
1607
1608
1609 ret = dissolve_free_huge_page(page);
1610 if (!ret) {
1611 if (set_hwpoison_free_buddy_page(page))
1612 num_poisoned_pages_inc();
1613 else
1614 ret = -EBUSY;
1615 }
1616 }
1617 return ret;
1618}
1619
1620static int __soft_offline_page(struct page *page, int flags)
1621{
1622 int ret;
1623 unsigned long pfn = page_to_pfn(page);
1624
1625
1626
1627
1628
1629
1630
1631 lock_page(page);
1632 wait_on_page_writeback(page);
1633 if (PageHWPoison(page)) {
1634 unlock_page(page);
1635 put_hwpoison_page(page);
1636 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1637 return -EBUSY;
1638 }
1639
1640
1641
1642
1643 ret = invalidate_inode_page(page);
1644 unlock_page(page);
1645
1646
1647
1648
1649 if (ret == 1) {
1650 put_hwpoison_page(page);
1651 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1652 SetPageHWPoison(page);
1653 num_poisoned_pages_inc();
1654 return 0;
1655 }
1656
1657
1658
1659
1660
1661
1662 if (PageLRU(page))
1663 ret = isolate_lru_page(page);
1664 else
1665 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1666
1667
1668
1669
1670 put_hwpoison_page(page);
1671 if (!ret) {
1672 LIST_HEAD(pagelist);
1673
1674
1675
1676
1677
1678 if (!__PageMovable(page))
1679 inc_node_page_state(page, NR_ISOLATED_ANON +
1680 page_is_file_cache(page));
1681 list_add(&page->lru, &pagelist);
1682 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1683 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1684 if (ret) {
1685 if (!list_empty(&pagelist))
1686 putback_movable_pages(&pagelist);
1687
1688 pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
1689 pfn, ret, page->flags, &page->flags);
1690 if (ret > 0)
1691 ret = -EIO;
1692 }
1693 } else {
1694 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
1695 pfn, ret, page_count(page), page->flags, &page->flags);
1696 }
1697 return ret;
1698}
1699
1700static int soft_offline_in_use_page(struct page *page, int flags)
1701{
1702 int ret;
1703 int mt;
1704 struct page *hpage = compound_head(page);
1705
1706 if (!PageHuge(page) && PageTransHuge(hpage)) {
1707 lock_page(hpage);
1708 if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
1709 unlock_page(hpage);
1710 if (!PageAnon(hpage))
1711 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1712 else
1713 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1714 put_hwpoison_page(hpage);
1715 return -EBUSY;
1716 }
1717 unlock_page(hpage);
1718 get_hwpoison_page(page);
1719 put_hwpoison_page(hpage);
1720 }
1721
1722
1723
1724
1725
1726
1727
1728
1729 mt = get_pageblock_migratetype(page);
1730 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
1731 if (PageHuge(page))
1732 ret = soft_offline_huge_page(page, flags);
1733 else
1734 ret = __soft_offline_page(page, flags);
1735 set_pageblock_migratetype(page, mt);
1736 return ret;
1737}
1738
1739static int soft_offline_free_page(struct page *page)
1740{
1741 int rc = dissolve_free_huge_page(page);
1742
1743 if (!rc) {
1744 if (set_hwpoison_free_buddy_page(page))
1745 num_poisoned_pages_inc();
1746 else
1747 rc = -EBUSY;
1748 }
1749 return rc;
1750}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * "good enough" for the majority of memory.
 */
1774int soft_offline_page(struct page *page, int flags)
1775{
1776 int ret;
1777 unsigned long pfn = page_to_pfn(page);
1778
1779 if (PageHWPoison(page)) {
1780 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1781 if (flags & MF_COUNT_INCREASED)
1782 put_hwpoison_page(page);
1783 return -EBUSY;
1784 }
1785
1786 get_online_mems();
1787 ret = get_any_page(page, pfn, flags);
1788 put_online_mems();
1789
1790 if (ret > 0)
1791 ret = soft_offline_in_use_page(page, flags);
1792 else if (ret == 0)
1793 ret = soft_offline_free_page(page);
1794
1795 return ret;
1796}
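
/*
 * Soft offlining is requested from userspace either with
 * madvise(MADV_SOFT_OFFLINE) on a page of the caller's own mapping, or by
 * writing a physical address to /sys/devices/system/memory/soft_offline_page
 * (both need CAP_SYS_ADMIN). Userspace sketch, not part of this file:
 *
 *	#include <stdio.h>
 *	#include <sys/mman.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		long sz = sysconf(_SC_PAGESIZE);
 *		char *buf = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *
 *		if (buf == MAP_FAILED)
 *			return 1;
 *		buf[0] = 1;				// populate the page
 *		if (madvise(buf, sz, MADV_SOFT_OFFLINE)) {
 *			perror("madvise(MADV_SOFT_OFFLINE)");
 *			return 1;
 *		}
 *		// contents were migrated to a healthy page before the old
 *		// one was taken out of service
 *		printf("still readable: %d\n", buf[0]);
 *		return 0;
 *	}
 */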
1797