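/*
 * High-level handling of hardware memory corruption ("hwpoison"): pages
 * reported bad by the hardware (e.g. by a machine check or a background
 * scrubber) are marked with PG_hwpoison, unmapped from user space, and the
 * processes using them are signalled or killed so the corrupted data is
 * never consumed.
 */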
36#include <linux/kernel.h>
37#include <linux/mm.h>
38#include <linux/page-flags.h>
39#include <linux/kernel-page-flags.h>
40#include <linux/sched/signal.h>
41#include <linux/sched/task.h>
42#include <linux/ksm.h>
43#include <linux/rmap.h>
44#include <linux/export.h>
45#include <linux/pagemap.h>
46#include <linux/swap.h>
47#include <linux/backing-dev.h>
48#include <linux/migrate.h>
49#include <linux/suspend.h>
50#include <linux/slab.h>
51#include <linux/swapops.h>
52#include <linux/hugetlb.h>
53#include <linux/memory_hotplug.h>
54#include <linux/mm_inline.h>
55#include <linux/memremap.h>
56#include <linux/kfifo.h>
57#include <linux/ratelimit.h>
58#include <linux/page-isolation.h>
59#include "internal.h"
60#include "ras/ras_event.h"
61
62int sysctl_memory_failure_early_kill __read_mostly = 0;
63
64int sysctl_memory_failure_recovery __read_mostly = 1;
65
66atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
67
68#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
69
70u32 hwpoison_filter_enable = 0;
71u32 hwpoison_filter_dev_major = ~0U;
72u32 hwpoison_filter_dev_minor = ~0U;
73u64 hwpoison_filter_flags_mask;
74u64 hwpoison_filter_flags_value;
75EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
76EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
77EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
78EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
79EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
80
81static int hwpoison_filter_dev(struct page *p)
82{
83 struct address_space *mapping;
84 dev_t dev;
85
86 if (hwpoison_filter_dev_major == ~0U &&
87 hwpoison_filter_dev_minor == ~0U)
88 return 0;
89
	/* page_mapping() does not accept slab pages */
93 if (PageSlab(p))
94 return -EINVAL;
95
96 mapping = page_mapping(p);
97 if (mapping == NULL || mapping->host == NULL)
98 return -EINVAL;
99
100 dev = mapping->host->i_sb->s_dev;
101 if (hwpoison_filter_dev_major != ~0U &&
102 hwpoison_filter_dev_major != MAJOR(dev))
103 return -EINVAL;
104 if (hwpoison_filter_dev_minor != ~0U &&
105 hwpoison_filter_dev_minor != MINOR(dev))
106 return -EINVAL;
107
108 return 0;
109}
110
111static int hwpoison_filter_flags(struct page *p)
112{
113 if (!hwpoison_filter_flags_mask)
114 return 0;
115
116 if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
117 hwpoison_filter_flags_value)
118 return 0;
119 else
120 return -EINVAL;
121}
122
/*
 * When hwpoison_filter_memcg is set, error injection is limited to pages
 * charged to that memory cgroup (identified by the cgroup inode number);
 * everything else is skipped.
 */
133#ifdef CONFIG_MEMCG
134u64 hwpoison_filter_memcg;
135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
136static int hwpoison_filter_task(struct page *p)
137{
138 if (!hwpoison_filter_memcg)
139 return 0;
140
141 if (page_cgroup_ino(p) != hwpoison_filter_memcg)
142 return -EINVAL;
143
144 return 0;
145}
146#else
147static int hwpoison_filter_task(struct page *p) { return 0; }
148#endif
149
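/*
 * hwpoison_filter() decides whether an injected error on this page should be
 * processed or ignored, based on the filter knobs exported above (device,
 * page-flag mask and memcg).  Returns 0 to handle the page, -EINVAL to skip.
 */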
150int hwpoison_filter(struct page *p)
151{
152 if (!hwpoison_filter_enable)
153 return 0;
154
155 if (hwpoison_filter_dev(p))
156 return -EINVAL;
157
158 if (hwpoison_filter_flags(p))
159 return -EINVAL;
160
161 if (hwpoison_filter_task(p))
162 return -EINVAL;
163
164 return 0;
165}
166#else
167int hwpoison_filter(struct page *p)
168{
169 return 0;
170}
171#endif
172
173EXPORT_SYMBOL_GPL(hwpoison_filter);
174
/*
 * Kill all processes that have the poisoned page mapped.
 *
 * There is no convenient way to get back from a page to the processes
 * mapping it, so collect_procs() does a brute-force walk over all running
 * tasks.  Memory failures are rare, so this is not a performance concern.
 * Candidates are queued on a list of struct to_kill and signalled later,
 * after the page has been unmapped.
 */
197struct to_kill {
198 struct list_head nd;
199 struct task_struct *tsk;
200 unsigned long addr;
201 short size_shift;
202};
203
/*
 * Send a SIGBUS with the error details to a process that has the corrupted
 * page mapped.  BUS_MCEERR_AR is forced on the current task for
 * action-required errors; other tasks get an action-optional BUS_MCEERR_AO.
 */
209static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
210{
211 struct task_struct *t = tk->tsk;
212 short addr_lsb = tk->size_shift;
213 int ret;
214
215 pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
216 pfn, t->comm, t->pid);
217
218 if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
219 ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
220 addr_lsb);
221 } else {
		/*
		 * Don't force the signal for action-optional errors: the
		 * target may legitimately have SIGBUS blocked temporarily
		 * and handle it later.
		 */
228 ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
229 addr_lsb, t);
230 }
231 if (ret < 0)
232 pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
233 t->comm, t->pid, ret);
234 return ret;
235}
236
/*
 * When a page of unknown type is encountered, try to drain per-CPU caches
 * and slab objects in the hope of turning it into an LRU or free page,
 * which we know how to handle.
 */
241void shake_page(struct page *p, int access)
242{
243 if (PageHuge(p))
244 return;
245
246 if (!PageSlab(p)) {
247 lru_add_drain_all();
248 if (PageLRU(p))
249 return;
250 drain_all_pages(page_zone(p));
251 if (PageLRU(p) || is_free_buddy_page(p))
252 return;
253 }
254
255
256
257
258
259 if (access)
260 drop_slab_node(page_to_nid(p));
261}
262EXPORT_SYMBOL_GPL(shake_page);
263
264static unsigned long dev_pagemap_mapping_shift(struct page *page,
265 struct vm_area_struct *vma)
266{
267 unsigned long address = vma_address(page, vma);
268 pgd_t *pgd;
269 p4d_t *p4d;
270 pud_t *pud;
271 pmd_t *pmd;
272 pte_t *pte;
273
274 pgd = pgd_offset(vma->vm_mm, address);
275 if (!pgd_present(*pgd))
276 return 0;
277 p4d = p4d_offset(pgd, address);
278 if (!p4d_present(*p4d))
279 return 0;
280 pud = pud_offset(p4d, address);
281 if (!pud_present(*pud))
282 return 0;
283 if (pud_devmap(*pud))
284 return PUD_SHIFT;
285 pmd = pmd_offset(pud, address);
286 if (!pmd_present(*pmd))
287 return 0;
288 if (pmd_devmap(*pmd))
289 return PMD_SHIFT;
290 pte = pte_offset_map(pmd, address);
291 if (!pte_present(*pte))
292 return 0;
293 if (pte_devmap(*pte))
294 return PAGE_SHIFT;
295 return 0;
296}
297
/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 */
307static void add_to_kill(struct task_struct *tsk, struct page *p,
308 struct vm_area_struct *vma,
309 struct list_head *to_kill)
310{
311 struct to_kill *tk;
312
313 tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
314 if (!tk) {
315 pr_err("Memory failure: Out of memory while machine check handling\n");
316 return;
317 }
318
319 tk->addr = page_address_in_vma(p, vma);
320 if (is_zone_device_page(p))
321 tk->size_shift = dev_pagemap_mapping_shift(p, vma);
322 else
323 tk->size_shift = page_shift(compound_head(p));
324
	/*
	 * page_address_in_vma() may return -EFAULT if the page is no longer
	 * mapped at the expected address; such entries are kept so the task
	 * can still be killed with SIGKILL later.  A size_shift of zero
	 * means the device mapping could not be resolved, so drop the entry.
	 */
335 if (tk->addr == -EFAULT) {
336 pr_info("Memory failure: Unable to find user space address %lx in %s\n",
337 page_to_pfn(p), tsk->comm);
338 } else if (tk->size_shift == 0) {
339 kfree(tk);
340 return;
341 }
342
343 get_task_struct(tsk);
344 tk->tsk = tsk;
345 list_add_tail(&tk->nd, to_kill);
346}
347
/*
 * Kill the processes collected earlier.
 *
 * Only do anything when FORCEKILL is set; otherwise the entries are simply
 * freed (used when the page turned out not to need killing).  When the
 * unmap failed, or no address could be found, the task is killed outright
 * with SIGKILL instead of a recoverable SIGBUS.
 */
356static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
357 unsigned long pfn, int flags)
358{
359 struct to_kill *tk, *next;
360
361 list_for_each_entry_safe (tk, next, to_kill, nd) {
362 if (forcekill) {
			/*
			 * If unmapping failed (or no address is known) the
			 * task may still touch the corrupted memory, so do
			 * not rely on a catchable SIGBUS: kill it outright.
			 */
368 if (fail || tk->addr == -EFAULT) {
369 pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
370 pfn, tk->tsk->comm, tk->tsk->pid);
371 do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
372 tk->tsk, PIDTYPE_PID);
373 }
			/*
			 * Otherwise send a recoverable SIGBUS with the error
			 * details so the task gets a chance to handle it.
			 */
381 else if (kill_proc(tk, pfn, flags) < 0)
382 pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
383 pfn, tk->tsk->comm, tk->tsk->pid);
384 }
385 put_task_struct(tk->tsk);
386 kfree(tk);
387 }
388}
389
/*
 * Find a dedicated thread in the process that wants to take over handling
 * of memory errors (PF_MCE_PROCESS and PF_MCE_EARLY set).
 */
398static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
399{
400 struct task_struct *t;
401
402 for_each_thread(tsk, t)
403 if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
404 return t;
405 return NULL;
406}
407
/*
 * Determine whether a given process should be killed early: either forced
 * by the caller, requested by the task itself via PF_MCE_EARLY, or enabled
 * globally by the memory_failure_early_kill sysctl.
 */
414static struct task_struct *task_early_kill(struct task_struct *tsk,
415 int force_early)
416{
417 struct task_struct *t;
418 if (!tsk->mm)
419 return NULL;
420 if (force_early)
421 return tsk;
422 t = find_early_kill_thread(tsk);
423 if (t)
424 return t;
425 if (sysctl_memory_failure_early_kill)
426 return tsk;
427 return NULL;
428}
429
/*
 * Collect processes when the error hit an anonymous page.
 */
433static void collect_procs_anon(struct page *page, struct list_head *to_kill,
434 int force_early)
435{
436 struct vm_area_struct *vma;
437 struct task_struct *tsk;
438 struct anon_vma *av;
439 pgoff_t pgoff;
440
441 av = page_lock_anon_vma_read(page);
442 if (av == NULL)
443 return;
444
445 pgoff = page_to_pgoff(page);
446 read_lock(&tasklist_lock);
447 for_each_process (tsk) {
448 struct anon_vma_chain *vmac;
449 struct task_struct *t = task_early_kill(tsk, force_early);
450
451 if (!t)
452 continue;
453 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
454 pgoff, pgoff) {
455 vma = vmac->vma;
456 if (!page_mapped_in_vma(page, vma))
457 continue;
458 if (vma->vm_mm == t->mm)
459 add_to_kill(t, page, vma, to_kill);
460 }
461 }
462 read_unlock(&tasklist_lock);
463 page_unlock_anon_vma_read(av);
464}
465
/*
 * Collect processes when the error hit a file mapped page.
 */
469static void collect_procs_file(struct page *page, struct list_head *to_kill,
470 int force_early)
471{
472 struct vm_area_struct *vma;
473 struct task_struct *tsk;
474 struct address_space *mapping = page->mapping;
475
476 i_mmap_lock_read(mapping);
477 read_lock(&tasklist_lock);
478 for_each_process(tsk) {
479 pgoff_t pgoff = page_to_pgoff(page);
480 struct task_struct *t = task_early_kill(tsk, force_early);
481
482 if (!t)
483 continue;
484 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
485 pgoff) {
			/*
			 * Early-kill tasks are notified for every vma that
			 * covers the page's offset in the file, even if the
			 * page is not currently mapped there: applications
			 * that requested early kill want to hear about all
			 * such corruption.
			 */
493 if (vma->vm_mm == t->mm)
494 add_to_kill(t, page, vma, to_kill);
495 }
496 }
497 read_unlock(&tasklist_lock);
498 i_mmap_unlock_read(mapping);
499}
500
/*
 * Collect the processes that have the corrupted page mapped to kill.
 */
504static void collect_procs(struct page *page, struct list_head *tokill,
505 int force_early)
506{
507 if (!page->mapping)
508 return;
509
510 if (PageAnon(page))
511 collect_procs_anon(page, tokill, force_early);
512 else
513 collect_procs_file(page, tokill, force_early);
514}
515
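/*
 * Human-readable strings for the recovery outcome and the page type, used in
 * action_result() messages and tracepoints.
 */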
516static const char *action_name[] = {
517 [MF_IGNORED] = "Ignored",
518 [MF_FAILED] = "Failed",
519 [MF_DELAYED] = "Delayed",
520 [MF_RECOVERED] = "Recovered",
521};
522
523static const char * const action_page_types[] = {
524 [MF_MSG_KERNEL] = "reserved kernel page",
525 [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
526 [MF_MSG_SLAB] = "kernel slab page",
527 [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
528 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
529 [MF_MSG_HUGE] = "huge page",
530 [MF_MSG_FREE_HUGE] = "free huge page",
531 [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
532 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
533 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
534 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
535 [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
536 [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
537 [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
538 [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
539 [MF_MSG_DIRTY_LRU] = "dirty LRU page",
540 [MF_MSG_CLEAN_LRU] = "clean LRU page",
541 [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
542 [MF_MSG_BUDDY] = "free buddy page",
543 [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
544 [MF_MSG_DAX] = "dax page",
545 [MF_MSG_UNKNOWN] = "unknown page",
546};
547
/*
 * Take the page off the LRU so that it cannot be faulted back in or
 * reclaimed while the error is being handled.  Returns 0 on success,
 * -EIO if the page could not be isolated.
 */
554static int delete_from_lru_cache(struct page *p)
555{
556 if (!isolate_lru_page(p)) {
557
558
559
560
561 ClearPageActive(p);
562 ClearPageUnevictable(p);
563
564
565
566
567
568 mem_cgroup_uncharge(p);
569
570
571
572
573 put_page(p);
574 return 0;
575 }
576 return -EIO;
577}
578
579static int truncate_error_page(struct page *p, unsigned long pfn,
580 struct address_space *mapping)
581{
582 int ret = MF_FAILED;
583
584 if (mapping->a_ops->error_remove_page) {
585 int err = mapping->a_ops->error_remove_page(mapping, p);
586
587 if (err != 0) {
588 pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
589 pfn, err);
590 } else if (page_has_private(p) &&
591 !try_to_release_page(p, GFP_NOIO)) {
592 pr_info("Memory failure: %#lx: failed to release buffers\n",
593 pfn);
594 } else {
595 ret = MF_RECOVERED;
596 }
597 } else {
598
599
600
601
602 if (invalidate_inode_page(p))
603 ret = MF_RECOVERED;
604 else
605 pr_info("Memory failure: %#lx: Failed to invalidate\n",
606 pfn);
607 }
608
609 return ret;
610}
611
/*
 * Error hit a reserved kernel page: there is nothing sensible that can be
 * done, so leave it alone and hope the page is not actually in use.
 */
617static int me_kernel(struct page *p, unsigned long pfn)
618{
619 return MF_IGNORED;
620}
621
/*
 * Page in an unknown state: just report the failure.
 */
625static int me_unknown(struct page *p, unsigned long pfn)
626{
627 pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
628 return MF_FAILED;
629}
630
/*
 * Clean (or cleaned) page cache page.
 */
634static int me_pagecache_clean(struct page *p, unsigned long pfn)
635{
636 struct address_space *mapping;
637
638 delete_from_lru_cache(p);
639
640
641
642
643
644 if (PageAnon(p))
645 return MF_RECOVERED;
646
647
648
649
650
651
652
653
654 mapping = page_mapping(p);
655 if (!mapping) {
656
657
658
659 return MF_FAILED;
660 }
661
662
663
664
665
666
667 return truncate_error_page(p, pfn, mapping);
668}
669
/*
 * Dirty page cache page: record an IO error on the mapping (so user space
 * can still learn about the lost data), then handle it like a clean page.
 */
675static int me_pagecache_dirty(struct page *p, unsigned long pfn)
676{
677 struct address_space *mapping = page_mapping(p);
678
679 SetPageError(p);
680
681 if (mapping) {
		/*
		 * The dirty data has been lost before it could be written
		 * back.  The best we can do is record the loss in the
		 * mapping's error state, so that a later fsync()/msync()/
		 * close() on an open file descriptor returns -EIO.  There is
		 * no way to notify a process that already closed the file,
		 * and the error is not tracked per page, so some loss of
		 * information is unavoidable here.
		 */
716 mapping_set_error(mapping, -EIO);
717 }
718
719 return me_pagecache_clean(p, pfn);
720}
721
/*
 * Dirty swap cache page.
 *
 * The page contents are lost: clear the dirty bit so the page is never
 * written out to swap, and clear uptodate so that anything that later
 * touches the page sees it as invalid instead of silently reading corrupted
 * data.  The page stays in the swap cache, so handling is reported as
 * delayed.
 */
741static int me_swapcache_dirty(struct page *p, unsigned long pfn)
742{
743 ClearPageDirty(p);
744
745 ClearPageUptodate(p);
746
747 if (!delete_from_lru_cache(p))
748 return MF_DELAYED;
749 else
750 return MF_FAILED;
751}
752
753static int me_swapcache_clean(struct page *p, unsigned long pfn)
754{
755 delete_from_swap_cache(p);
756
757 if (!delete_from_lru_cache(p))
758 return MF_RECOVERED;
759 else
760 return MF_FAILED;
761}
762
/*
 * Huge pages: truncate the error page via its mapping when possible;
 * otherwise the (already unmapped) hugepage is dissolved so the healthy
 * subpages can be freed back to the buddy allocator.
 */
769static int me_huge_page(struct page *p, unsigned long pfn)
770{
771 int res = 0;
772 struct page *hpage = compound_head(p);
773 struct address_space *mapping;
774
775 if (!PageHuge(hpage))
776 return MF_DELAYED;
777
778 mapping = page_mapping(hpage);
779 if (mapping) {
780 res = truncate_error_page(hpage, pfn, mapping);
781 } else {
782 unlock_page(hpage);
		/*
		 * An anonymous hugepage is already fully unmapped here, so
		 * drop the last reference and dissolve it to salvage the
		 * healthy subpages.
		 */
788 if (PageAnon(hpage))
789 put_page(hpage);
790 dissolve_free_huge_page(p);
791 res = MF_RECOVERED;
792 lock_page(hpage);
793 }
794
795 return res;
796}
797
/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits; the table below
 * matches them in order and calls the corresponding handler.  The list is
 * not complete: for any state not listed the catchall entry reports an
 * unknown page and no recovery is attempted.
 */
811#define dirty (1UL << PG_dirty)
812#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
813#define unevict (1UL << PG_unevictable)
814#define mlock (1UL << PG_mlocked)
815#define writeback (1UL << PG_writeback)
816#define lru (1UL << PG_lru)
817#define head (1UL << PG_head)
818#define slab (1UL << PG_slab)
819#define reserved (1UL << PG_reserved)
820
821static struct page_state {
822 unsigned long mask;
823 unsigned long res;
824 enum mf_action_page_type type;
825 int (*action)(struct page *p, unsigned long pfn);
826} error_states[] = {
827 { reserved, reserved, MF_MSG_KERNEL, me_kernel },
	/*
	 * Free buddy pages are detected and handled outside this table.
	 *
	 * Slab pages could in theory be checked for unused objects, but are
	 * simply treated like other kernel pages for now.
	 */
838 { slab, slab, MF_MSG_SLAB, me_kernel },
839
840 { head, head, MF_MSG_HUGE, me_huge_page },
841
842 { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
843 { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
844
845 { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
846 { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
847
848 { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
849 { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
850
851 { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
852 { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
853
	/*
	 * Catchall entry: must be at the end.
	 */
857 { 0, 0, MF_MSG_UNKNOWN, me_unknown },
858};
859
860#undef dirty
861#undef sc
862#undef unevict
863#undef mlock
864#undef writeback
865#undef lru
866#undef head
867#undef slab
868#undef reserved
869
/*
 * Report the result of the recovery action via printk and a tracepoint.
 */
874static void action_result(unsigned long pfn, enum mf_action_page_type type,
875 enum mf_result result)
876{
877 trace_memory_failure_event(pfn, type, result);
878
879 pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
880 pfn, action_page_types[type], action_name[result]);
881}
882
883static int page_action(struct page_state *ps, struct page *p,
884 unsigned long pfn)
885{
886 int result;
887 int count;
888
889 result = ps->action(p, pfn);
890
891 count = page_count(p) - 1;
892 if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
893 count--;
894 if (count > 0) {
895 pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
896 pfn, action_page_types[ps->type], count);
897 result = MF_FAILED;
898 }
899 action_result(pfn, ps->type, result);
900
901
902
903
904
905
906 return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
907}
908
/**
 * get_hwpoison_page() - Get a refcount for memory error handling.
 * @page: raw error page (hit by memory error)
 *
 * Returns 1 if the refcount of the head page was successfully raised,
 * 0 otherwise.
 */
916int get_hwpoison_page(struct page *page)
917{
918 struct page *head = compound_head(page);
919
920 if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * A non-anonymous THP only exists transiently during
		 * allocation or freeing; it cannot be handled reliably, so
		 * give up rather than interfere with its teardown.
		 */
927 if (!PageAnon(head)) {
928 pr_err("Memory failure: %#lx: non anonymous thp\n",
929 page_to_pfn(page));
930 return 0;
931 }
932 }
933
934 if (get_page_unless_zero(head)) {
935 if (head == compound_head(page))
936 return 1;
937
938 pr_info("Memory failure: %#lx cannot catch tail\n",
939 page_to_pfn(page));
940 put_page(head);
941 }
942
943 return 0;
944}
945EXPORT_SYMBOL_GPL(get_hwpoison_page);
946
/*
 * Do all that is necessary to remove user space mappings: unmap the page
 * and send SIGBUS to the mapping processes if the data was dirty.
 * Returns true if the page is no longer mapped (or never was).
 */
951static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
952 int flags, struct page **hpagep)
953{
954 enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
955 struct address_space *mapping;
956 LIST_HEAD(tokill);
957 bool unmap_success = true;
958 int kill = 1, forcekill;
959 struct page *hpage = *hpagep;
960 bool mlocked = PageMlocked(hpage);
961
962
963
964
965
966 if (PageReserved(p) || PageSlab(p))
967 return true;
968 if (!(PageLRU(hpage) || PageHuge(p)))
969 return true;
970
971
972
973
974
975 if (!page_mapped(hpage))
976 return true;
977
978 if (PageKsm(p)) {
979 pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
980 return false;
981 }
982
983 if (PageSwapCache(p)) {
984 pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
985 pfn);
986 ttu |= TTU_IGNORE_HWPOISON;
987 }
988
	/*
	 * Propagate the dirty bit from the PTEs to the struct page first:
	 * whether the page is dirty decides below if the mapping processes
	 * must be killed or if the clean page-cache copy can be dropped
	 * without side effects.
	 */
995 mapping = page_mapping(hpage);
996 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
997 mapping_cap_writeback_dirty(mapping)) {
998 if (page_mkclean(hpage)) {
999 SetPageDirty(hpage);
1000 } else {
1001 kill = 0;
1002 ttu |= TTU_IGNORE_HWPOISON;
1003 pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
1004 pfn);
1005 }
1006 }
1007
	/*
	 * Collect the processes that have the page mapped before unmapping:
	 * try_to_unmap() tears down the rmap structures that must be walked.
	 * Errors are ignored here, there is nothing else to be done.
	 */
1016 if (kill)
1017 collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1018
1019 if (!PageHuge(hpage)) {
1020 unmap_success = try_to_unmap(hpage, ttu);
1021 } else {
		/*
		 * For hugetlb pages in shared mappings, try_to_unmap() may
		 * call huge_pmd_unshare(), so the i_mmap_rwsem must be taken
		 * in write mode here and TTU_RMAP_LOCKED passed to indicate
		 * the lock is already held.
		 */
1033 mapping = hugetlb_page_mapping_lock_write(hpage);
1034
1035 if (mapping) {
1036 unmap_success = try_to_unmap(hpage,
1037 ttu|TTU_RMAP_LOCKED);
1038 i_mmap_unlock_write(mapping);
1039 } else {
1040 pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n",
1041 pfn);
1042 unmap_success = false;
1043 }
1044 }
1045 if (!unmap_success)
1046 pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
1047 pfn, page_mapcount(hpage));
1048
	/*
	 * try_to_unmap() may have moved an mlocked page to the LRU cache,
	 * so shake the page again to make sure it is flushed out.
	 */
1053 if (mlocked)
1054 shake_page(hpage, 0);
1055
	/*
	 * Now that the dirty bit has been propagated to struct page and all
	 * unmapping is done, decide whether killing is needed: only kill
	 * when the page was dirty or the caller insists (MF_MUST_KILL),
	 * otherwise the collected list is simply freed.  If unmapping failed
	 * the tasks get an uncatchable SIGKILL instead.
	 */
1066 forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1067 kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1068
1069 return unmap_success;
1070}
1071
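/*
 * Match the page's flags against the error_states table and run the
 * corresponding recovery action.
 */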
1072static int identify_page_state(unsigned long pfn, struct page *p,
1073 unsigned long page_flags)
1074{
1075 struct page_state *ps;
1076
	/*
	 * The current page flags are checked first; if they do not match any
	 * known state, retry with the page flags saved by the caller before
	 * the containment actions modified them.
	 */
1082 for (ps = error_states;; ps++)
1083 if ((p->flags & ps->mask) == ps->res)
1084 break;
1085
1086 page_flags |= (p->flags & (1UL << PG_dirty));
1087
1088 if (!ps->mask)
1089 for (ps = error_states;; ps++)
1090 if ((page_flags & ps->mask) == ps->res)
1091 break;
1092 return page_action(ps, p, pfn);
1093}
1094
1095static int memory_failure_hugetlb(unsigned long pfn, int flags)
1096{
1097 struct page *p = pfn_to_page(pfn);
1098 struct page *head = compound_head(p);
1099 int res;
1100 unsigned long page_flags;
1101
1102 if (TestSetPageHWPoison(head)) {
1103 pr_err("Memory failure: %#lx: already hardware poisoned\n",
1104 pfn);
1105 return 0;
1106 }
1107
1108 num_poisoned_pages_inc();
1109
1110 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1111
1112
1113
1114 lock_page(head);
1115 if (PageHWPoison(head)) {
1116 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1117 || (p != head && TestSetPageHWPoison(head))) {
1118 num_poisoned_pages_dec();
1119 unlock_page(head);
1120 return 0;
1121 }
1122 }
1123 unlock_page(head);
1124 dissolve_free_huge_page(p);
1125 action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
1126 return 0;
1127 }
1128
1129 lock_page(head);
1130 page_flags = head->flags;
1131
1132 if (!PageHWPoison(head)) {
1133 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1134 num_poisoned_pages_dec();
1135 unlock_page(head);
1136 put_hwpoison_page(head);
1137 return 0;
1138 }
1139
	/*
	 * Containment is currently only implemented for hugetlb pages that
	 * are mapped at PMD granularity; larger (e.g. PUD-sized) hugepages
	 * are not handled, so report the failure as ignored.
	 */
1149 if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
1150 action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
1151 res = -EBUSY;
1152 goto out;
1153 }
1154
1155 if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
1156 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1157 res = -EBUSY;
1158 goto out;
1159 }
1160
1161 res = identify_page_state(pfn, p, page_flags);
1162out:
1163 unlock_page(head);
1164 return res;
1165}
1166
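/*
 * Handle memory failures on ZONE_DEVICE (e.g. fsdax/devdax) pages: the page
 * cannot be migrated or returned to the page allocator, so mark it poisoned,
 * unmap it and signal the mapping processes.
 */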
1167static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1168 struct dev_pagemap *pgmap)
1169{
1170 struct page *page = pfn_to_page(pfn);
1171 const bool unmap_success = true;
1172 unsigned long size = 0;
1173 struct to_kill *tk;
1174 LIST_HEAD(tokill);
1175 int rc = -EBUSY;
1176 loff_t start;
1177 dax_entry_t cookie;
1178
	/*
	 * Pin the inode and the dax mapping for this pfn: dax pages do not
	 * use the page lock, so dax_lock_page() is what keeps the mapping
	 * stable while the poison is being signalled.
	 */
1186 cookie = dax_lock_page(page);
1187 if (!cookie)
1188 goto out;
1189
1190 if (hwpoison_filter(page)) {
1191 rc = 0;
1192 goto unlock;
1193 }
1194
1195 if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
		/*
		 * TODO: device-private pages need coordination with the
		 * owning device driver and are not handled yet.
		 */
1200 goto unlock;
1201 }
1202
1203
1204
1205
1206
1207 SetPageHWPoison(page);
1208
	/*
	 * Unlike system RAM, a different physical page cannot be swapped in
	 * at the same virtual address, so any user of this ZONE_DEVICE page
	 * has to receive SIGBUS: force MF_MUST_KILL.
	 */
1215 flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1216 collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
1217
1218 list_for_each_entry(tk, &tokill, nd)
1219 if (tk->size_shift)
1220 size = max(size, 1UL << tk->size_shift);
1221 if (size) {
		/*
		 * Unmap the largest mapping covering the page to avoid
		 * breaking up constant-size device-dax mappings; the actual
		 * size being torn down is communicated to the tasks via
		 * siginfo (see kill_proc()).
		 */
1228 start = (page->index << PAGE_SHIFT) & ~(size - 1);
1229 unmap_mapping_range(page->mapping, start, start + size, 0);
1230 }
1231 kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
1232 rc = 0;
1233unlock:
1234 dax_unlock_page(page, cookie);
1235out:
1236
1237 put_dev_pagemap(pgmap);
1238 action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1239 return rc;
1240}
1241
/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page number of the corrupted page.
 * @flags: fine tune action taken.
 *
 * This function is called by the low level machine check code of an
 * architecture when it detects hardware memory corruption of a page.  It
 * tries its best to recover, which includes dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that happen outside the
 * current execution context (e.g. when detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts enabled
 * and no spinlocks held.
 */
1259int memory_failure(unsigned long pfn, int flags)
1260{
1261 struct page *p;
1262 struct page *hpage;
1263 struct page *orig_head;
1264 struct dev_pagemap *pgmap;
1265 int res;
1266 unsigned long page_flags;
1267
1268 if (!sysctl_memory_failure_recovery)
1269 panic("Memory failure on page %lx", pfn);
1270
1271 p = pfn_to_online_page(pfn);
1272 if (!p) {
1273 if (pfn_valid(pfn)) {
1274 pgmap = get_dev_pagemap(pfn, NULL);
1275 if (pgmap)
1276 return memory_failure_dev_pagemap(pfn, flags,
1277 pgmap);
1278 }
1279 pr_err("Memory failure: %#lx: memory outside kernel control\n",
1280 pfn);
1281 return -ENXIO;
1282 }
1283
1284 if (PageHuge(p))
1285 return memory_failure_hugetlb(pfn, flags);
1286 if (TestSetPageHWPoison(p)) {
1287 pr_err("Memory failure: %#lx: already hardware poisoned\n",
1288 pfn);
1289 return 0;
1290 }
1291
1292 orig_head = hpage = compound_head(p);
1293 num_poisoned_pages_inc();
1294
	/*
	 * Nothing can be done about pages whose refcount is already zero:
	 * 1) a genuinely free page is in safe hands, the page allocator acts
	 *    as gatekeeper when it is handed out again;
	 * 2) part of a non-compound high-order allocation implies a kernel
	 *    user we cannot stop, so just report and hope it is freed later.
	 * get_hwpoison_page() is used instead of get_page() because bumping
	 * a zero refcount directly would be unsafe.
	 */
1306 if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1307 if (is_free_buddy_page(p)) {
1308 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1309 return 0;
1310 } else {
1311 action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1312 return -EBUSY;
1313 }
1314 }
1315
1316 if (PageTransHuge(hpage)) {
1317 lock_page(p);
1318 if (!PageAnon(p) || unlikely(split_huge_page(p))) {
1319 unlock_page(p);
1320 if (!PageAnon(p))
1321 pr_err("Memory failure: %#lx: non anonymous thp\n",
1322 pfn);
1323 else
1324 pr_err("Memory failure: %#lx: thp split failed\n",
1325 pfn);
1326 if (TestClearPageHWPoison(p))
1327 num_poisoned_pages_dec();
1328 put_hwpoison_page(p);
1329 return -EBUSY;
1330 }
1331 unlock_page(p);
1332 VM_BUG_ON_PAGE(!page_count(p), p);
1333 hpage = compound_head(p);
1334 }
1335
	/*
	 * Shake the page once more: transiently cached pages (e.g. sitting
	 * in per-CPU pagevecs) may become LRU or free pages, which we know
	 * how to handle.  If the page turns out to be a free buddy page now,
	 * we are done.
	 */
1344 shake_page(p, 0);
1345
1346 if (!PageLRU(p) && is_free_buddy_page(p)) {
1347 if (flags & MF_COUNT_INCREASED)
1348 action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1349 else
1350 action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
1351 return 0;
1352 }
1353
1354 lock_page(p);
1355
1356
1357
1358
1359
1360 if (PageCompound(p) && compound_head(p) != orig_head) {
1361 action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1362 res = -EBUSY;
1363 goto out;
1364 }
1365
	/*
	 * Snapshot the page flags now: the containment actions below (e.g.
	 * try_to_unmap() clearing PG_mlocked) may modify them, and the saved
	 * copy is what identify_page_state() should see.
	 */
1373 if (PageHuge(p))
1374 page_flags = hpage->flags;
1375 else
1376 page_flags = p->flags;
1377
1378
1379
1380
1381 if (!PageHWPoison(p)) {
1382 pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1383 num_poisoned_pages_dec();
1384 unlock_page(p);
1385 put_hwpoison_page(p);
1386 return 0;
1387 }
1388 if (hwpoison_filter(p)) {
1389 if (TestClearPageHWPoison(p))
1390 num_poisoned_pages_dec();
1391 unlock_page(p);
1392 put_hwpoison_page(p);
1393 return 0;
1394 }
1395
1396 if (!PageTransTail(p) && !PageLRU(p))
1397 goto identify_page_state;
1398
	/*
	 * Pages under writeback are very hard (often impossible) to deal
	 * with, so just wait for the IO to finish.
	 */
1403 wait_on_page_writeback(p);
1404
	/*
	 * Now take care of the user space mappings.  Abort on failure:
	 * truncating the page cache entry assumes the page is unmapped.
	 */
1412 if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1413 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1414 res = -EBUSY;
1415 goto out;
1416 }
1417
1418
1419
1420
1421 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1422 action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1423 res = -EBUSY;
1424 goto out;
1425 }
1426
1427identify_page_state:
1428 res = identify_page_state(pfn, p, page_flags);
1429out:
1430 unlock_page(p);
1431 return res;
1432}
1433EXPORT_SYMBOL_GPL(memory_failure);
1434
1435#define MEMORY_FAILURE_FIFO_ORDER 4
1436#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1437
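/* One queued memory failure event, stored in the per-CPU kfifo below. */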
1438struct memory_failure_entry {
1439 unsigned long pfn;
1440 int flags;
1441};
1442
1443struct memory_failure_cpu {
1444 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1445 MEMORY_FAILURE_FIFO_SIZE);
1446 spinlock_t lock;
1447 struct work_struct work;
1448};
1449
1450static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1451
/**
 * memory_failure_queue - Schedule handling of a memory failure.
 * @pfn: Page number of the corrupted page.
 * @flags: Flags for memory failure handling, as for memory_failure().
 *
 * Queue the pfn on a per-CPU fifo and schedule a work item to process it.
 * memory_failure() must run in process context, but errors are often
 * reported from contexts that cannot sleep, so this is the entry point for
 * such reporters.
 *
 * Can run in IRQ context.
 */
1468void memory_failure_queue(unsigned long pfn, int flags)
1469{
1470 struct memory_failure_cpu *mf_cpu;
1471 unsigned long proc_flags;
1472 struct memory_failure_entry entry = {
1473 .pfn = pfn,
1474 .flags = flags,
1475 };
1476
1477 mf_cpu = &get_cpu_var(memory_failure_cpu);
1478 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1479 if (kfifo_put(&mf_cpu->fifo, entry))
1480 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1481 else
1482 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1483 pfn);
1484 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1485 put_cpu_var(memory_failure_cpu);
1486}
1487EXPORT_SYMBOL_GPL(memory_failure_queue);
1488
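/*
 * Deferred work: drain the per-CPU fifo and handle each queued pfn in
 * process context, either as a hard failure or as a soft offline request.
 */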
1489static void memory_failure_work_func(struct work_struct *work)
1490{
1491 struct memory_failure_cpu *mf_cpu;
1492 struct memory_failure_entry entry = { 0, };
1493 unsigned long proc_flags;
1494 int gotten;
1495
1496 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1497 for (;;) {
1498 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1499 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1500 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1501 if (!gotten)
1502 break;
1503 if (entry.flags & MF_SOFT_OFFLINE)
1504 soft_offline_page(entry.pfn, entry.flags);
1505 else
1506 memory_failure(entry.pfn, entry.flags);
1507 }
1508}
1509
1510static int __init memory_failure_init(void)
1511{
1512 struct memory_failure_cpu *mf_cpu;
1513 int cpu;
1514
1515 for_each_possible_cpu(cpu) {
1516 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1517 spin_lock_init(&mf_cpu->lock);
1518 INIT_KFIFO(mf_cpu->fifo);
1519 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1520 }
1521
1522 return 0;
1523}
1524core_initcall(memory_failure_init);
1525
1526#define unpoison_pr_info(fmt, pfn, rs) \
1527({ \
1528 if (__ratelimit(rs)) \
1529 pr_info(fmt, pfn); \
1530})
1531
/**
 * unpoison_memory - Unpoison a previously poisoned page.
 * @pfn: Page number of the page to be unpoisoned.
 *
 * Software-unpoison a page that was poisoned by memory_failure() earlier.
 * This only works for software-injected failures used in testing, not for
 * real hardware errors.
 *
 * Returns 0 for success, otherwise -errno.
 */
1544int unpoison_memory(unsigned long pfn)
1545{
1546 struct page *page;
1547 struct page *p;
1548 int freeit = 0;
1549 static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
1550 DEFAULT_RATELIMIT_BURST);
1551
1552 if (!pfn_valid(pfn))
1553 return -ENXIO;
1554
1555 p = pfn_to_page(pfn);
1556 page = compound_head(p);
1557
1558 if (!PageHWPoison(p)) {
1559 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
1560 pfn, &unpoison_rs);
1561 return 0;
1562 }
1563
1564 if (page_count(page) > 1) {
1565 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
1566 pfn, &unpoison_rs);
1567 return 0;
1568 }
1569
1570 if (page_mapped(page)) {
1571 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
1572 pfn, &unpoison_rs);
1573 return 0;
1574 }
1575
1576 if (page_mapping(page)) {
1577 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
1578 pfn, &unpoison_rs);
1579 return 0;
1580 }
1581
	/*
	 * A transparent huge page here means memory_failure() is still in
	 * the middle of handling it (the page lock is not held yet); back
	 * off and let it finish.
	 */
1587 if (!PageHuge(page) && PageTransHuge(page)) {
1588 unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
1589 pfn, &unpoison_rs);
1590 return 0;
1591 }
1592
1593 if (!get_hwpoison_page(p)) {
1594 if (TestClearPageHWPoison(p))
1595 num_poisoned_pages_dec();
1596 unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
1597 pfn, &unpoison_rs);
1598 return 0;
1599 }
1600
1601 lock_page(page);
	/*
	 * This test is racy because PG_hwpoison can be set outside the page
	 * lock, but that is harmless: a page that becomes poisoned again is
	 * caught when it reaches the free lists.
	 */
1608 if (TestClearPageHWPoison(page)) {
1609 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
1610 pfn, &unpoison_rs);
1611 num_poisoned_pages_dec();
1612 freeit = 1;
1613 }
1614 unlock_page(page);
1615
1616 put_hwpoison_page(page);
1617 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1618 put_hwpoison_page(page);
1619
1620 return 0;
1621}
1622EXPORT_SYMBOL(unpoison_memory);
1623
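/*
 * Allocation callback for migrate_pages(): allocate the replacement page,
 * preferring the node of the page being offlined.
 */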
1624static struct page *new_page(struct page *p, unsigned long private)
1625{
1626 int nid = page_to_nid(p);
1627
1628 return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
1629}
1630
/*
 * Safely get a reference count on an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero-refcount page that is not
 * free, and 1 for any other page type (with the refcount raised).
 */
1637static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1638{
1639 int ret;
1640
1641 if (flags & MF_COUNT_INCREASED)
1642 return 1;
1643
1644
1645
1646
1647
1648 if (!get_hwpoison_page(p)) {
1649 if (PageHuge(p)) {
1650 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1651 ret = 0;
1652 } else if (is_free_buddy_page(p)) {
1653 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1654 ret = 0;
1655 } else {
1656 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1657 __func__, pfn, p->flags);
1658 ret = -EIO;
1659 }
1660 } else {
1661
1662 ret = 1;
1663 }
1664 return ret;
1665}
1666
1667static int get_any_page(struct page *page, unsigned long pfn, int flags)
1668{
1669 int ret = __get_any_page(page, pfn, flags);
1670
1671 if (ret == 1 && !PageHuge(page) &&
1672 !PageLRU(page) && !__PageMovable(page)) {
1673
1674
1675
1676 put_hwpoison_page(page);
1677 shake_page(page, 1);
1678
1679
1680
1681
1682 ret = __get_any_page(page, pfn, 0);
1683 if (ret == 1 && !PageLRU(page)) {
1684
1685 put_hwpoison_page(page);
1686 pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
1687 pfn, page->flags, &page->flags);
1688 return -EIO;
1689 }
1690 }
1691 return ret;
1692}
1693
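/*
 * Soft-offline a hugetlb page: migrate its contents away, then dissolve the
 * hugepage and mark the underlying raw page as poisoned in the buddy
 * allocator so it is never handed out again.
 */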
1694static int soft_offline_huge_page(struct page *page, int flags)
1695{
1696 int ret;
1697 unsigned long pfn = page_to_pfn(page);
1698 struct page *hpage = compound_head(page);
1699 LIST_HEAD(pagelist);
1700
	/*
	 * Recheck PageHWPoison under the page lock to close the race with a
	 * concurrent memory_failure() on the same hugepage.
	 */
1705 lock_page(hpage);
1706 if (PageHWPoison(hpage)) {
1707 unlock_page(hpage);
1708 put_hwpoison_page(hpage);
1709 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1710 return -EBUSY;
1711 }
1712 unlock_page(hpage);
1713
1714 ret = isolate_huge_page(hpage, &pagelist);
1715
1716
1717
1718
1719 put_hwpoison_page(hpage);
1720 if (!ret) {
1721 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1722 return -EBUSY;
1723 }
1724
1725 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1726 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1727 if (ret) {
1728 pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1729 pfn, ret, page->flags, &page->flags);
1730 if (!list_empty(&pagelist))
1731 putback_movable_pages(&pagelist);
1732 if (ret > 0)
1733 ret = -EIO;
1734 } else {
		/*
		 * Mark the raw page as poisoned only after the source
		 * hugepage has been dissolved: a poisoned page left on the
		 * free hugepage list would cause spurious SIGBUS on later
		 * allocations.
		 */
1742 ret = dissolve_free_huge_page(page);
1743 if (!ret) {
1744 if (set_hwpoison_free_buddy_page(page))
1745 num_poisoned_pages_inc();
1746 else
1747 ret = -EBUSY;
1748 }
1749 }
1750 return ret;
1751}
1752
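/*
 * Soft-offline a page that is still in use: try a cheap page-cache
 * invalidation first, otherwise isolate the page and migrate its contents
 * to a new page.
 */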
1753static int __soft_offline_page(struct page *page, int flags)
1754{
1755 int ret;
1756 unsigned long pfn = page_to_pfn(page);
1757
	/*
	 * Check PageHWPoison again under the page lock: soft offline must
	 * not race with memory_failure() poisoning the same page.
	 */
1764 lock_page(page);
1765 wait_on_page_writeback(page);
1766 if (PageHWPoison(page)) {
1767 unlock_page(page);
1768 put_hwpoison_page(page);
1769 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1770 return -EBUSY;
1771 }
1772
	/*
	 * Try the cheap route first: invalidation works for clean, unmapped
	 * page cache pages.
	 */
1776 ret = invalidate_inode_page(page);
1777 unlock_page(page);
1778
1779
1780
1781
1782 if (ret == 1) {
1783 put_hwpoison_page(page);
1784 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1785 SetPageHWPoison(page);
1786 num_poisoned_pages_inc();
1787 return 0;
1788 }
1789
	/*
	 * Invalidation did not work, so migrate the page contents to a new
	 * page instead; migrate.c handles most page types for us.
	 */
1795 if (PageLRU(page))
1796 ret = isolate_lru_page(page);
1797 else
1798 ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1799
1800
1801
1802
1803 put_hwpoison_page(page);
1804 if (!ret) {
1805 LIST_HEAD(pagelist);
		/*
		 * Once isolated, PageLRU is cleared, so distinguish LRU pages
		 * from movable ones via !__PageMovable() when updating the
		 * NR_ISOLATED counters.
		 */
1811 if (!__PageMovable(page))
1812 inc_node_page_state(page, NR_ISOLATED_ANON +
1813 page_is_file_lru(page));
1814 list_add(&page->lru, &pagelist);
1815 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1816 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1817 if (ret) {
1818 if (!list_empty(&pagelist))
1819 putback_movable_pages(&pagelist);
1820
1821 pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
1822 pfn, ret, page->flags, &page->flags);
1823 if (ret > 0)
1824 ret = -EIO;
1825 }
1826 } else {
1827 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
1828 pfn, ret, page_count(page), page->flags, &page->flags);
1829 }
1830 return ret;
1831}
1832
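/*
 * Split THPs first (only anonymous THPs can be split here), then dispatch to
 * the hugetlb or base-page soft-offline path with the pageblock temporarily
 * marked MIGRATE_ISOLATE.
 */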
1833static int soft_offline_in_use_page(struct page *page, int flags)
1834{
1835 int ret;
1836 int mt;
1837 struct page *hpage = compound_head(page);
1838
1839 if (!PageHuge(page) && PageTransHuge(hpage)) {
1840 lock_page(page);
1841 if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1842 unlock_page(page);
1843 if (!PageAnon(page))
1844 pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1845 else
1846 pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1847 put_hwpoison_page(page);
1848 return -EBUSY;
1849 }
1850 unlock_page(page);
1851 }
1852
	/*
	 * Temporarily mark the pageblock MIGRATE_ISOLATE so that, once the
	 * last reference is dropped, the page goes straight back to the
	 * buddy lists (not a pcplist) and set_hwpoison_free_buddy_page()
	 * can find it there.
	 */
1860 mt = get_pageblock_migratetype(page);
1861 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
1862 if (PageHuge(page))
1863 ret = soft_offline_huge_page(page, flags);
1864 else
1865 ret = __soft_offline_page(page, flags);
1866 set_pageblock_migratetype(page, mt);
1867 return ret;
1868}
1869
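/*
 * Soft-offline a free page: no migration needed, just dissolve a free
 * hugepage if necessary and mark the buddy page as poisoned.
 */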
1870static int soft_offline_free_page(struct page *page)
1871{
1872 int rc = dissolve_free_huge_page(page);
1873
1874 if (!rc) {
1875 if (set_hwpoison_free_buddy_page(page))
1876 num_poisoned_pages_inc();
1877 else
1878 rc = -EBUSY;
1879 }
1880 return rc;
1881}
1882
/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn of the page to soft-offline
 * @flags: flags, as for memory_failure()
 *
 * Soft offline a page, by migration or invalidation, without killing
 * anything.  This is meant for pages that are not (yet) corrupted but have
 * seen a number of corrected errors and are better taken out of service;
 * the policy of when to do so is maintained by user space.
 *
 * Returns 0 on success, otherwise a negative errno.
 */
1905int soft_offline_page(unsigned long pfn, int flags)
1906{
1907 int ret;
1908 struct page *page;
1909
1910 if (!pfn_valid(pfn))
1911 return -ENXIO;
1912
1913 page = pfn_to_online_page(pfn);
1914 if (!page)
1915 return -EIO;
1916
1917 if (PageHWPoison(page)) {
1918 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1919 if (flags & MF_COUNT_INCREASED)
1920 put_hwpoison_page(page);
1921 return -EBUSY;
1922 }
1923
1924 get_online_mems();
1925 ret = get_any_page(page, pfn, flags);
1926 put_online_mems();
1927
1928 if (ret > 0)
1929 ret = soft_offline_in_use_page(page, flags);
1930 else if (ret == 0)
1931 ret = soft_offline_free_page(page);
1932
1933 return ret;
1934}
1935