// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory
 * or cache failure.
 *
 * In addition there is a "soft offline" entry point that allows
 * stopping the use of a page without killing anything.
 *
 * Handles page cache pages in various states. The tricky part here
 * is that we can access any page asynchronously with respect to
 * other VM users, because memory failures can happen anytime and
 * anywhere, possibly violating some of their assumptions. This is
 * why this code has to be extremely careful. Generally it tries to
 * use normal locking rules, as in getting the standard locks, even
 * if that means the error handling takes potentially a long time.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

static bool __page_handle_poison(struct page *page)
{
	int ret;

	zone_pcp_disable(page_zone(page));
	ret = dissolve_free_huge_page(page);
	if (!ret)
		ret = take_page_off_buddy(page);
	zone_pcp_enable(page_zone(page));

	return ret > 0;
}

static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
	if (hugepage_or_freepage) {
		/*
		 * Doing this check for free pages is also fine since
		 * dissolve_free_huge_page() returns 0 for non-hugetlb
		 * pages as well.
		 */
		if (!__page_handle_poison(page))
			/*
			 * We could fail to take off the target page from
			 * buddy, for example due to racy page allocation,
			 * but that's acceptable because a soft-offlined
			 * page is not broken and access to it is not
			 * blocked.
			 */
			return false;
	}

	SetPageHWPoison(page);
	if (release)
		put_page(page);
	page_ref_inc(page);
	num_poisoned_pages_inc();

	return true;
}

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress testing of hwpoison with multiple processes,
 * without having to take care of how they are mapped: the memcg
 * filter restricts error injection to pages charged to one memory
 * cgroup, selected via hwpoison_filter_memcg.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * for the corrupted page, so rmap is used to walk from the page
 * to every process that has it mapped. That walk has to scan the
 * complete process list and so has non-linear complexity, but since
 * memory corruptions are rare we hope to get away with this.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret = 0;

	pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
			pfn, t->comm, t->pid);

	if (flags & MF_ACTION_REQUIRED) {
		if (t == current)
			ret = force_sig_mceerr(BUS_MCEERR_AR,
					 (void __user *)tk->addr, addr_lsb);
		else
			/*
			 * Signal other processes sharing the page if they
			 * have PF_MCE_EARLY set.
			 */
			ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				addr_lsb, t);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * Unknown page type encountered. Try to check whether it can turn PageLRU
 * by lru_add_drain_all.
 */
void shake_page(struct page *p)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * TODO: Could shrink slab caches here if a lightweight range-based
	 * shrinker will be available.
	 */
}
EXPORT_SYMBOL_GPL(shake_page);

static unsigned long dev_pagemap_mapping_shift(struct page *page,
		struct vm_area_struct *vma)
{
	unsigned long address = vma_address(page, vma);
	unsigned long ret = 0;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (pte_present(*pte) && pte_devmap(*pte))
		ret = PAGE_SHIFT;
	pte_unmap(pte);
	return ret;
}

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill)
{
	struct to_kill *tk;

	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
	if (!tk) {
		pr_err("Memory failure: Out of memory while machine check handling\n");
		return;
	}

	tk->addr = page_address_in_vma(p, vma);
	if (is_zone_device_page(p))
		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
	else
		tk->size_shift = page_shift(compound_head(p));

	/*
	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
	 * so "tk->size_shift == 0" effectively checks no mapping on
	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
	 * to a process' address space, it's possible not all N VMAs
	 * contain mappings for the page, but at least one VMA does.
	 * Only deliver SIGBUS with payload derived from the VMA that
	 * has a mapping for the page.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
	} else if (tk->size_shift == 0) {
		kfree(tk);
		return;
	}

	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr == -EFAULT) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t) {
		if (t->flags & PF_MCE_PROCESS) {
			if (t->flags & PF_MCE_EARLY)
				return t;
		} else {
			if (sysctl_memory_failure_early_kill)
				return t;
		}
	}
	return NULL;
}

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill" and otherwise returns NULL.
 *
 * Note that the above is true for Action Optional case. For Action Required
 * case, it's only meaningful to the current thread which needs to be signaled
 * with SIGBUS. The error is Action Optional for other non-current
 * processes sharing the same error page; if such a process is "early kill",
 * the task_struct of its dedicated thread will also be returned.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	if (!tsk->mm)
		return NULL;
	/*
	 * Comparing ->mm here because current task might represent
	 * a subthread, while tsk always points to the main thread.
	 */
	if (force_early && tsk->mm == current->mm)
		return current;

	return find_early_kill_thread(tsk);
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
				int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;
	pgoff_t pgoff;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	pgoff = page_to_pgoff(page);
	for_each_process(tsk) {
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its address space. Assume applications
			 * who requested early kill want to be informed of all
			 * such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	if (!page->mapping)
		return;

	if (PageAnon(page))
		collect_procs_anon(page, tokill, force_early);
	else
		collect_procs_file(page, tokill, force_early);
}

struct hwp_walk {
	struct to_kill tk;
	unsigned long pfn;
	int flags;
};

static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
{
	tk->addr = addr;
	tk->size_shift = shift;
}

static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
				unsigned long poisoned_pfn, struct to_kill *tk)
{
	unsigned long pfn = 0;

	if (pte_present(pte)) {
		pfn = pte_pfn(pte);
	} else {
		swp_entry_t swp = pte_to_swp_entry(pte);

		if (is_hwpoison_entry(swp))
			pfn = hwpoison_entry_to_pfn(swp);
	}

	if (!pfn || pfn != poisoned_pfn)
		return 0;

	set_to_kill(tk, addr, shift);
	return 1;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwp_walk *hwp)
{
	pmd_t pmd = *pmdp;
	unsigned long pfn;
	unsigned long hwpoison_vaddr;

	if (!pmd_present(pmd))
		return 0;
	pfn = pmd_pfn(pmd);
	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
		return 1;
	}
	return 0;
}
#else
static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      struct hwp_walk *hwp)
{
	return 0;
}
#endif

static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
			      unsigned long end, struct mm_walk *walk)
{
	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
	int ret = 0;
	pte_t *ptep, *mapped_pte;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
	if (ptl) {
		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
		spin_unlock(ptl);
		goto out;
	}

	if (pmd_trans_unstable(pmdp))
		goto out;

	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
						addr, &ptl);
	for (; addr != end; ptep++, addr += PAGE_SIZE) {
		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
					     hwp->pfn, &hwp->tk);
		if (ret == 1)
			break;
	}
	pte_unmap_unlock(mapped_pte, ptl);
out:
	cond_resched();
	return ret;
}

#ifdef CONFIG_HUGETLB_PAGE
static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
			    unsigned long addr, unsigned long end,
			    struct mm_walk *walk)
{
	struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
	pte_t pte = huge_ptep_get(ptep);
	struct hstate *h = hstate_vma(walk->vma);

	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
				      hwp->pfn, &hwp->tk);
}
#else
#define hwpoison_hugetlb_range	NULL
#endif

static const struct mm_walk_ops hwp_walk_ops = {
	.pmd_entry = hwpoison_pte_range,
	.hugetlb_entry = hwpoison_hugetlb_range,
};

/*
 * Walk the current process's page tables for a mapping of the already
 * hwpoisoned pfn (either a present pte or a hwpoison swap entry) and,
 * if one is found, send SIGBUS with an accurate address/size payload.
 * This is called when memory_failure() hits a page that is already
 * poisoned, so that the accessing process is still killed.
 */
static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
				  int flags)
{
	int ret;
	struct hwp_walk priv = {
		.pfn = pfn,
	};
	priv.tk.tsk = p;

	mmap_read_lock(p->mm);
	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
			      (void *)&priv);
	if (ret == 1 && priv.tk.addr)
		kill_proc(&priv.tk, pfn, flags);
	mmap_read_unlock(p->mm);
	return ret ? -EFAULT : -EHWPOISON;
}
static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory_failure_recovery
 * issue.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate
		 * This fails on dirty or anything with private pages
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be lucky and the page is detached from normal kernel use before
 * anything bad happens.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	unlock_page(p);
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	unlock_page(p);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int ret;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p)) {
		ret = MF_RECOVERED;
		goto out;
	}

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch"
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile
		 */
		ret = MF_FAILED;
		goto out;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_rwsem or not for this? Right now we don't.
	 */
	ret = truncate_error_page(p, pfn, mapping);
out:
	unlock_page(p);
	return ret;
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO is only reported on the next IO operation and
		 * then cleared through the IO map. Normally Linux has
		 * two mechanisms to pass IO errors: first through the
		 * AS_EIO flag in the address space and then through the
		 * PageError flag in the page. Since we drop pages on
		 * memory failure handling the only mechanism open to
		 * use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the expectation is that the error should stick for
		 * as long as the page is dirty.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *	- clear dirty bit to prevent IO
 *	- remove from LRU
 *	- but keep in the swap cache, so that when we return to it on
 *	  a later page fault, we know the application is accessing
 *	  corrupted data and shall be killed (we installed simple
 *	  interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	int ret;

	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
	unlock_page(p);
	return ret;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	int ret;

	delete_from_swap_cache(p);

	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
	unlock_page(p);
	return ret;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
		unlock_page(hpage);
	} else {
		res = MF_FAILED;
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error anonymous
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		if (__page_handle_poison(p)) {
			page_ref_inc(p);
			res = MF_RECOVERED;
		}
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;

	/* Callback ->action() has to unlock the relevant page inside it. */
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	/* page p should be unlocked after returning from ps->action().  */
	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/*
 * Return true if a page type of a given page is supported by hwpoison
 * mechanism (while handling could fail), otherwise false.  This function
 * does not return true for hugetlb or device memory pages, so it's assumed
 * to be called only in the context where we never have such pages.
 */
static inline bool HWPoisonHandlable(struct page *page)
{
	return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page);
}

static int __get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);
	int ret = 0;
	bool hugetlb = false;

	ret = get_hwpoison_huge_page(head, &hugetlb);
	if (hugetlb)
		return ret;

	/*
	 * This check prevents from calling get_page_unless_zero()
	 * for any unsupported type of page in order to reduce the risk of
	 * unexpected races caused by taking a page refcount.
	 */
	if (!HWPoisonHandlable(head))
		return -EBUSY;

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}

static int get_any_page(struct page *p, unsigned long flags)
{
	int ret = 0, pass = 0;
	bool count_increased = false;

	if (flags & MF_COUNT_INCREASED)
		count_increased = true;

try_again:
	if (!count_increased) {
		ret = __get_hwpoison_page(p);
		if (!ret) {
			if (page_count(p)) {
				/* We raced with an allocation, retry. */
				if (pass++ < 3)
					goto try_again;
				ret = -EBUSY;
			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
				/* We raced with put_page, retry. */
				if (pass++ < 3)
					goto try_again;
				ret = -EIO;
			}
			goto out;
		} else if (ret == -EBUSY) {
			/*
			 * We raced with (possibly temporary) unhandlable
			 * page, retry.
			 */
			if (pass++ < 3) {
				shake_page(p);
				goto try_again;
			}
			ret = -EIO;
			goto out;
		}
	}

	if (PageHuge(p) || HWPoisonHandlable(p)) {
		ret = 1;
	} else {
		/*
		 * A page we cannot handle. Check whether we can turn
		 * it into something we can handle.
		 */
		if (pass++ < 3) {
			put_page(p);
			shake_page(p);
			count_increased = false;
			goto try_again;
		}
		put_page(p);
		ret = -EIO;
	}
out:
	if (ret == -EIO)
		dump_page(p, "hwpoison: unhandlable page");

	return ret;
}

/*
 * get_hwpoison_page() - Get refcount for memory error handling
 * @p:		Raw error page (hit by memory error)
 * @flags:	Flags controlling behavior of memory failure handling
 *
 * get_hwpoison_page() takes a page refcount of an error page to handle memory
 * error on it, after checking that the error page is in a well-defined state
 * (defined as a page-type we can successfully handle the memory error on it,
 * such as LRU page and hugetlb page).
 *
 * Memory error handling could be triggered at any time on any type of page,
 * so it's prone to race with typical memory management lifecycle (like
 * allocation and free).  So to avoid such races, get_hwpoison_page() takes
 * extra care for the error page's state (as done in __get_hwpoison_page()),
 * and has some retry logic in get_any_page().
 *
 * Return: 0 on failure,
 *         1 on success for in-use pages in a well-defined state,
 *         -EIO for pages on which we can not handle memory errors,
 *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
 *         operations like allocation and free.
 */
static int get_hwpoison_page(struct page *p, unsigned long flags)
{
	int ret;

	zone_pcp_disable(page_zone(p));
	ret = get_any_page(p, flags);
	zone_pcp_enable(page_zone(p));

	return ret;
}

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int flags, struct page *hpage)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_can_writeback(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	if (!PageHuge(hpage)) {
		try_to_unmap(hpage, ttu);
	} else {
		if (!PageAnon(hpage)) {
			/*
			 * For hugetlb pages in shared mappings, try_to_unmap
			 * could potentially call huge_pmd_unshare.  Because of
			 * this, take semaphore in write mode here and set
			 * TTU_RMAP_LOCKED to indicate we have taken the lock
			 * at this higher level.
			 */
			mapping = hugetlb_page_mapping_lock_write(hpage);
			if (mapping) {
				try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
				i_mmap_unlock_write(mapping);
			} else
				pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
		} else {
			try_to_unmap(hpage, ttu);
		}
	}

	unmap_success = !page_mapped(hpage);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);

	return unmap_success;
}

static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

static int try_to_split_thp_page(struct page *page, const char *msg)
{
	lock_page(page);
	if (!PageAnon(page) || unlikely(split_huge_page(page))) {
		unsigned long pfn = page_to_pfn(page);

		unlock_page(page);
		if (!PageAnon(page))
			pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
		else
			pr_info("%s: %#lx: thp split failed\n", msg, pfn);
		put_page(page);
		return -EBUSY;
	}
	unlock_page(page);

	return 0;
}

static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		res = -EHWPOISON;
		if (flags & MF_ACTION_REQUIRED)
			res = kill_accessing_process(current, page_to_pfn(head), flags);
		return res;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED)) {
		res = get_hwpoison_page(p, flags);
		if (!res) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(head);
			if (PageHWPoison(head)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != head && TestSetPageHWPoison(head))) {
					num_poisoned_pages_dec();
					unlock_page(head);
					return 0;
				}
			}
			unlock_page(head);
			res = MF_FAILED;
			if (__page_handle_poison(p)) {
				page_ref_inc(p);
				res = MF_RECOVERED;
			}
			action_result(pfn, MF_MSG_FREE_HUGE, res);
			return res == MF_RECOVERED ? 0 : -EBUSY;
		} else if (res < 0) {
			action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
			return -EBUSY;
		}
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_page(head);
		return 0;
	}

	/*
	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
	 * simply disable it. In order to make it work properly, we need
	 * make sure that:
	 *  - conversion of a pud that maps an error hugetlb into hwpoison
	 *    entry properly works, and
	 *  - other mm code walking over page table is aware of pud-aligned
	 *    hwpoison entries.
	 */
	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	return identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	struct page *page = pfn_to_page(pfn);
	unsigned long size = 0;
	struct to_kill *tk;
	LIST_HEAD(tokill);
	int rc = -EBUSY;
	loff_t start;
	dax_entry_t cookie;

	if (flags & MF_COUNT_INCREASED)
		/*
		 * Drop the extra refcount in case we come from madvise().
		 */
		put_page(page);

	/* device metadata space is not recoverable */
	if (!pgmap_pfn_valid(pgmap, pfn)) {
		rc = -ENXIO;
		goto out;
	}

	/*
	 * Prevent the inode from being freed while we are interrogating
	 * the address_space, typically this would be handled by
	 * lock_page(), but dax pages do not use the page lock. This
	 * also prevents changes to the mapping of this pfn until
	 * poison signaling is complete.
	 */
	cookie = dax_lock_page(page);
	if (!cookie)
		goto out;

	if (hwpoison_filter(page)) {
		rc = 0;
		goto unlock;
	}

	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
		/*
		 * TODO: Handle HMM pages which may need coordination
		 * of device mutexes.
		 */
		goto unlock;
	}

	/*
	 * Use this flag as an indication that the dax page has been
	 * remapped UC to prevent speculative consumption of poison.
	 */
	SetPageHWPoison(page);

	/*
	 * Unlike System-RAM there is no possibility to swap in a
	 * different physical page at a given virtual address, so all
	 * userspace consumption of ZONE_DEVICE memory necessitates
	 * SIGBUS (i.e. MF_MUST_KILL)
	 */
	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);

	list_for_each_entry(tk, &tokill, nd)
		if (tk->size_shift)
			size = max(size, 1UL << tk->size_shift);
	if (size) {
		/*
		 * Unmap the largest mapping to avoid breaking up
		 * device-dax mappings which are constant size. The
		 * actual size of the mapping being torn down is
		 * communicated in siginfo, see kill_proc()
		 */
		start = (page->index << PAGE_SHIFT) & ~(size - 1);
		unmap_mapping_range(page->mapping, start, size, 0);
	}
	kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags);
	rc = 0;
unlock:
	dax_unlock_page(page, cookie);
out:
	/* drop pgmap ref acquired in caller */
	put_dev_pagemap(pgmap);
	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
	return rc;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	struct dev_pagemap *pgmap;
	int res = 0;
	unsigned long page_flags;
	bool retry = true;
	static DEFINE_MUTEX(mf_mutex);

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	p = pfn_to_online_page(pfn);
	if (!p) {
		if (pfn_valid(pfn)) {
			pgmap = get_dev_pagemap(pfn, NULL);
			if (pgmap)
				return memory_failure_dev_pagemap(pfn, flags,
								  pgmap);
		}
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	mutex_lock(&mf_mutex);

try_again:
	if (PageHuge(p)) {
		res = memory_failure_hugetlb(pfn, flags);
		goto unlock_mutex;
	}

	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		res = -EHWPOISON;
		if (flags & MF_ACTION_REQUIRED)
			res = kill_accessing_process(current, pfn, flags);
		goto unlock_mutex;
	}

	orig_head = hpage = compound_head(p);
	num_poisoned_pages_inc();

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED)) {
		res = get_hwpoison_page(p, flags);
		if (!res) {
			if (is_free_buddy_page(p)) {
				if (take_page_off_buddy(p)) {
					page_ref_inc(p);
					res = MF_RECOVERED;
				} else {
					/* We lost the race. Try again. */
					if (retry) {
						ClearPageHWPoison(p);
						num_poisoned_pages_dec();
						retry = false;
						goto try_again;
					}
					res = MF_FAILED;
				}
				action_result(pfn, MF_MSG_BUDDY, res);
				res = res == MF_RECOVERED ? 0 : -EBUSY;
			} else {
				action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
				res = -EBUSY;
			}
			goto unlock_mutex;
		} else if (res < 0) {
			action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
			res = -EBUSY;
			goto unlock_mutex;
		}
	}

	if (PageTransHuge(hpage)) {
		/*
		 * The flag must be set after the refcount is bumped
		 * otherwise it may race with THP split.
		 * And the flag can't be set in get_hwpoison_page() since
		 * it is called by soft offline too and it is just called
		 * for !MF_COUNT_INCREASED.  So here seems to be the best
		 * position to set the flag before explicitly dropping the
		 * refcount.
		 *
		 * Don't need care about the above error handling paths for
		 * get_hwpoison_page() since they handle either free page
		 * or unhandlable page.  The refcount is bumped iff the
		 * page is a valid handlable page.
		 */
		SetPageHasHWPoisoned(hpage);
		if (try_to_split_thp_page(p, "Memory Failure") < 0) {
			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
			res = -EBUSY;
			goto unlock_mutex;
		}
		VM_BUG_ON_PAGE(!page_count(p), p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_lock is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p);

	lock_page(p);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto unlock_page;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page
	 * status correctly, we save a copy of the page flags at this time.
	 */
	page_flags = p->flags;

	/*
	 * unpoison always clear PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(p);
		put_page(p);
		goto unlock_mutex;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unlock_page(p);
		put_page(p);
		goto unlock_mutex;
	}

	/*
	 * __munlock_pagevec may clear a writeback page's LRU flag without
	 * page_lock. We need wait writeback completion for this page or it
	 * may trigger vfs BUG while evict inode.
	 */
	if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto unlock_page;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto unlock_page;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
	mutex_unlock(&mf_mutex);
	return res;
unlock_page:
	unlock_page(p);
unlock_mutex:
	mutex_unlock(&mf_mutex);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
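
/*
 * Example (illustrative only, not a caller in this file): an
 * architecture's synchronous error handler, having translated the
 * reported physical address to a pfn, would typically do:
 *
 *	memory_failure(paddr >> PAGE_SHIFT, MF_ACTION_REQUIRED);
 *
 * Asynchronous reports (e.g. from a patrol scrubber) would pass
 * flags of 0 instead, making the resulting SIGBUS "action optional".
 */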

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
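
/*
 * Example (illustrative only): because memory_failure_queue() may run
 * in IRQ context, an error handler that cannot sleep just queues the
 * pfn and lets the per-cpu work item call memory_failure() later:
 *
 *	void my_irq_error_handler(u64 paddr)	(hypothetical caller)
 *	{
 *		memory_failure_queue(paddr >> PAGE_SHIFT, 0);
 *	}
 */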

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = container_of(work, struct memory_failure_cpu, work);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(entry.pfn, entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

/*
 * Process memory_failure work queued on the specified CPU.
 * Used to avoid return-to-userspace racing with the memory_failure workqueue.
 */
void memory_failure_queue_kick(int cpu)
{
	struct memory_failure_cpu *mf_cpu;

	mf_cpu = &per_cpu(memory_failure_cpu, cpu);
	cancel_work_sync(&mf_cpu->work);
	memory_failure_work_func(&mf_cpu->work);
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software-level, so it only works
 * for linux injected failures, not real hardware failures
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned long flags = 0;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (!get_hwpoison_page(p, flags)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_dec();
		freeit = 1;
	}
	unlock_page(page);

	put_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
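
/*
 * Note: with CONFIG_HWPOISON_INJECT, unpoison_memory() is reachable
 * from user space through the debugfs hwpoison injector, e.g. (sketch):
 *
 *	echo $pfn > /sys/kernel/debug/hwpoison/unpoison-pfn
 */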

static bool isolate_page(struct page *page, struct list_head *pagelist)
{
	bool isolated = false;
	bool lru = PageLRU(page);

	if (PageHuge(page)) {
		isolated = isolate_huge_page(page, pagelist);
	} else {
		if (lru)
			isolated = !isolate_lru_page(page);
		else
			isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);

		if (isolated)
			list_add(&page->lru, pagelist);
	}

	if (isolated && lru)
		inc_node_page_state(page, NR_ISOLATED_ANON +
				    page_is_file_lru(page));

	/*
	 * If we succeed to isolate the page, we grabbed another refcount on
	 * the page, so we can safely drop the one we got from get_any_pages().
	 * If we failed to isolate the page, it means that we cannot go further
	 * and we will return an error, so drop the reference we got from
	 * get_any_pages() as well.
	 */
	put_page(page);
	return isolated;
}

/*
 * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
 * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
 * If the page is mapped, it migrates the contents over.
 */
static int __soft_offline_page(struct page *page)
{
	int ret = 0;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	char const *msg_page[] = {"page", "hugepage"};
	bool huge = PageHuge(page);
	LIST_HEAD(pagelist);
	struct migration_target_control mtc = {
		.nid = NUMA_NO_NODE,
		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
	};

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	if (!PageHuge(page))
		wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return 0;
	}

	if (!PageHuge(page))
		/*
		 * Try to invalidate first. This should work for
		 * non dirty unmapped page cache pages.
		 */
		ret = invalidate_inode_page(page);
	unlock_page(page);

	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret) {
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		page_handle_poison(page, false, true);
		return 0;
	}

	if (isolate_page(hpage, &pagelist)) {
		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
		if (!ret) {
			bool release = !huge;

			if (!page_handle_poison(page, huge, release))
				ret = -EBUSY;
		} else {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
				pfn, msg_page[huge], ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EBUSY;
		}
	} else {
		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
			pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
		ret = -EBUSY;
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page)
{
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage))
		if (try_to_split_thp_page(page, "soft offline") < 0)
			return -EBUSY;
	return __soft_offline_page(page);
}

static int soft_offline_free_page(struct page *page)
{
	int rc = 0;

	if (!page_handle_poison(page, true, false))
		rc = -EBUSY;

	return rc;
}

static void put_ref_page(struct page *page)
{
	if (page)
		put_page(page);
}

/**
 * soft_offline_page - Soft offline a page.
 * @pfn: pfn to soft-offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(unsigned long pfn, int flags)
{
	int ret;
	bool try_again = true;
	struct page *page, *ref_page = NULL;

	WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));

	if (!pfn_valid(pfn))
		return -ENXIO;
	if (flags & MF_COUNT_INCREASED)
		ref_page = pfn_to_page(pfn);

	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
	page = pfn_to_online_page(pfn);
	if (!page) {
		put_ref_page(ref_page);
		return -EIO;
	}

	if (PageHWPoison(page)) {
		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
		put_ref_page(ref_page);
		return 0;
	}

retry:
	get_online_mems();
	ret = get_hwpoison_page(page, flags);
	put_online_mems();

	if (ret > 0) {
		ret = soft_offline_in_use_page(page);
	} else if (ret == 0) {
		if (soft_offline_free_page(page) && try_again) {
			try_again = false;
			goto retry;
		}
	}

	return ret;
}