/*
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory
 * or cache failure.
 *
 * In addition there is a "soft offline" entry point that allows
 * stopping the use of a (not yet corrupted) page without killing
 * anything.
 *
 * Handles page cache pages in various states. The tricky part here
 * is that we can access any page asynchronously with respect to
 * other VM users, because memory failures could happen anytime and
 * anywhere. So we must be careful with locking.
 *
 * Several operations here have exponential complexity because of
 * unsuitable VM data structures. For example, the operation to map
 * back from RMAP chains to processes has to walk the complete process
 * list and has non-linear complexity in the number of processes.
 * Since memory corruptions are rare, we hope to get away with this;
 * it avoids impacting the core VM.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}
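
/*
 * These filter knobs exist only for hwpoison injection testing; they
 * are typically driven through debugfs (the hwpoison-inject module
 * exposes them, e.g. under /sys/kernel/debug/hwpoison/). When
 * hwpoison_filter_enable is set, injected errors are ignored unless
 * the page matches the configured device, page-flag and memcg
 * filters.
 */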

/*
 * This allows stress tests to limit all memory failures to one
 * class of pages, e.g. to test corruptions of the pagecache.
 */

/*
 * The memcg filter restricts handling to pages charged to a given
 * memory cgroup, identified by its inode number in the cgroup
 * filesystem.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;
	char addr_valid;
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
	       pfn, t->comm, t->pid);

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
				       addr_lsb, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}
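
/*
 * The addr_lsb value passed to the MCE signal helpers above ends up in
 * siginfo's si_addr_lsb: it tells the signal handler the least
 * significant bit of the corrupted address, i.e. log2 of the size of
 * the poisoned mapping (PAGE_SHIFT for a base page, larger for huge
 * and device mappings).
 */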

/*
 * When an unknown page type is encountered, drain as many buffers as
 * possible in the hope of turning the page into an LRU or free page,
 * which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only shrink the node's slab caches (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

static unsigned long dev_pagemap_mapping_shift(struct page *page,
				struct vm_area_struct *vma)
{
	unsigned long address = vma_address(page, vma);
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if (pte_devmap(*pte))
		return PAGE_SHIFT;
	return 0;
}

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do. We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;
	if (is_zone_device_page(p))
		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
	else
		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;

	/*
	 * In theory we don't have to kill when the page was munmapped.
	 * But it could also have been mremapped. Since that's likely
	 * very rare, kill anyway just out of paranoia, but use a SIGKILL
	 * because the error is not contained anymore. A zero size_shift
	 * means no mapping could be found for a ZONE_DEVICE page and is
	 * treated the same way.
	 */
	if (tk->addr == -EFAULT || tk->size_shift == 0) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping,
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyway.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which
 * expects to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache,
 * and then kept in swap cache or fails to be removed from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * Drop the page count elevated by isolate_lru_page().
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it, just invalidate.
		 * This fails on dirty pages or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be lucky and the error is never actually consumed.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd before
		 * and the page is dropped in between, then the error
		 * will not be properly reported.
		 *
		 * This can already happen even without hwpoisoned
		 * pages: first on metadata IO errors (which only
		 * report through AS_EIO) or when the page is dropped
		 * at the wrong time.
		 *
		 * So right now we assume that the application DTRT on
		 * the first EIO, but we're not worse than other parts
		 * of the kernel.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (i.e. the page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep it in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		unlock_page(hpage);
		/*
		 * The migration entry prevents later access to the error
		 * anonymous hugepage, so we can free and dissolve it into
		 * the buddy allocator to save the healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its life cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};
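
/*
 * Matching in error_states is first-hit, so the order matters: for
 * example, a page with PG_lru, PG_mlocked and PG_dirty all set is
 * handled as a "dirty mlocked LRU page" rather than a plain "dirty
 * LRU page", because the mlock entries precede the lru entries.
 */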

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	/* me_swapcache_dirty() keeps the page in swap cache, which
	 * accounts for one additional expected reference. */
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put the mlocked page in the lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done, we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more forceful uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);

	return unmap_success;
}

static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return 0;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_hwpoison_page(head);
		return 0;
	}

	/*
	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
	 * simply disable it. In order to make it work properly, we need
	 * to make sure that:
	 *  - conversion of a pud that maps an error hugetlb into hwpoison
	 *    entry properly works, and
	 *  - other mm code walking over page table is aware of pud-aligned
	 *    hwpoison entries.
	 */
	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	struct page *page = pfn_to_page(pfn);
	const bool unmap_success = true;
	unsigned long size = 0;
	struct to_kill *tk;
	LIST_HEAD(tokill);
	int rc = -EBUSY;
	loff_t start;

	/*
	 * Prevent the inode from being freed while we are interrogating
	 * the address_space, typically this would be handled by
	 * lock_page(), but dax pages do not use the page lock. This
	 * also prevents changes to the mapping of this pfn until
	 * poison signaling is complete.
	 */
	if (!dax_lock_mapping_entry(page))
		goto out;

	if (hwpoison_filter(page)) {
		rc = 0;
		goto unlock;
	}

	switch (pgmap->type) {
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_PUBLIC:
		/*
		 * TODO: Handle HMM pages which may need coordination
		 * with device-side memory.
		 */
		goto unlock;
	default:
		break;
	}

	/*
	 * Use this flag as an indication that the dax page has been
	 * remapped UC to prevent speculative consumption of poison.
	 */
	SetPageHWPoison(page);

	/*
	 * Unlike System-RAM there is no possibility to swap in a
	 * different physical page at a given virtual address, so all
	 * userspace consumption of ZONE_DEVICE memory necessitates
	 * SIGBUS (i.e. MF_MUST_KILL)
	 */
	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);

	list_for_each_entry(tk, &tokill, nd)
		if (tk->size_shift)
			size = max(size, 1UL << tk->size_shift);
	if (size) {
		/*
		 * Unmap the largest mapping to avoid breaking up
		 * device-dax mappings which are constant size. The
		 * actual size of the mapping being torn down is
		 * communicated in siginfo, see kill_proc()
		 */
		start = (page->index << PAGE_SHIFT) & ~(size - 1);
		unmap_mapping_range(page->mapping, start, start + size, 0);
	}
	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
	rc = 0;
unlock:
	dax_unlock_mapping_entry(page);
out:
	/* drop pgmap ref acquired in caller */
	put_dev_pagemap(pgmap);
	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
	return rc;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	struct dev_pagemap *pgmap;
	int res;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	pgmap = get_dev_pagemap(pfn, NULL);
	if (pgmap)
		return memory_failure_dev_pagemap(pfn, flags, pgmap);

	p = pfn_to_page(pfn);
	if (PageHuge(p))
		return memory_failure_hugetlb(pfn, flags);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	orig_head = hpage = compound_head(p);
	num_poisoned_pages_inc();

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_dec();
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p, 0);
	/* shake_page could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
	}

	lock_page(p);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * munlock_vma_page() in try_to_unmap() before the page is dropped.
	 * So take a snapshot of the flags while we still hold the page.
	 */
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}

	if (!PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is a thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(p);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
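
/*
 * Each CPU gets a small fixed-size FIFO (16 entries with the order
 * above) so that memory_failure_queue() can be called safely from
 * interrupt context; queued entries are processed later from the work
 * function below in process context. On overflow the event is dropped
 * with an error message rather than blocking the error reporter.
 */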

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovery of the memory failure.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked on by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (!get_hwpoison_page(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_dec();
		freeit = 1;
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
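
/*
 * The remainder of this file implements soft offlining: new_page() is
 * the migration target allocator (it tries to stay on the same node),
 * get_any_page()/__get_any_page() safely take a reference on an
 * arbitrary page, and the soft_offline_* helpers migrate or invalidate
 * the page before marking it hwpoisoned in the buddy allocator.
 */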

static struct page *new_page(struct page *p, unsigned long private)
{
	int nid = page_to_nid(p);

	return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * When the target page is a free hugepage, just remove it
	 * from the free hugepage list.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) &&
	    !PageLRU(page) && !__PageMovable(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
				pfn, page->flags, &page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() take a refcount each,
	 * so we need to drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
			pfn, ret, page->flags, &page->flags);
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
		if (ret > 0)
			ret = -EIO;
	} else {
		/*
		 * We set PG_hwpoison only when the migration source hugepage
		 * was successfully dissolved, because otherwise the hwpoisoned
		 * hugepage remains on the free hugepage list, and then userspace
		 * will find it as SIGBUS by allocation failure. That's not
		 * expected in soft-offlining.
		 */
		ret = dissolve_free_huge_page(page);
		if (!ret) {
			if (set_hwpoison_free_buddy_page(page))
				num_poisoned_pages_inc();
		}
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	if (PageLRU(page))
		ret = isolate_lru_page(page);
	else
		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
	/*
	 * get_any_page() and isolate_lru_page() take a refcount each,
	 * so we need to drop one here.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		/*
		 * After the lru page is isolated, PageLRU is cleared,
		 * so use !__PageMovable instead, since an LRU page's
		 * mapping cannot have PAGE_MAPPING_MOVABLE.
		 */
		if (!__PageMovable(page))
			inc_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
				pfn, ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
			pfn, ret, page_count(page), page->flags, &page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	int mt;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	/*
	 * Setting MIGRATE_ISOLATE here ensures that the page will be linked
	 * to the free list immediately (in release_pages()) after migration,
	 * so that set_hwpoison_free_buddy_page() can see it in the buddy
	 * allocator before anyone else can grab it.
	 */
	mt = get_pageblock_migratetype(page);
	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);
	set_pageblock_migratetype(page, mt);
	return ret;
}

static int soft_offline_free_page(struct page *page)
{
	int rc = 0;
	struct page *head = compound_head(page);

	if (PageHuge(head))
		rc = dissolve_free_huge_page(page);
	if (!rc) {
		if (set_hwpoison_free_buddy_page(page))
			num_poisoned_pages_inc();
		else
			rc = -EBUSY;
	}
	return rc;
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (is_zone_device_page(page)) {
		pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
				pfn);
		if (flags & MF_COUNT_INCREASED)
			put_page(page);
		return -EIO;
	}

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		ret = soft_offline_free_page(page);

	return ret;
}
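
/*
 * User space typically reaches soft_offline_page() via
 * madvise(MADV_SOFT_OFFLINE) on a mapped address or by writing a PFN
 * to /sys/devices/system/memory/soft_offline_page; RAS drivers can
 * also queue it through memory_failure_queue() with MF_SOFT_OFFLINE.
 */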