// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory
 * or cache failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping
 * the use of a page without killing anything.
 *
 * Handles page cache pages in various states. The tricky part here is
 * that we can access any page asynchronously with respect to other VM
 * users, because memory failures can happen at any time and anywhere.
 * In principle all code paths have to tolerate the page changing state
 * under them; the page lock and rechecks after taking it are used to
 * narrow the races.
 *
 * Several operations here have non-linear complexity because of
 * unsuitable VM data structures: for example, mapping back from RMAP
 * chains to processes requires walking the complete process list. Since
 * memory corruptions are rare we hope to get away with this.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/memremap.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"

/*
 * Exposed as /proc/sys/vm/memory_failure_early_kill: kill processes that
 * have the poisoned page mapped as soon as the error is detected, instead
 * of waiting until they actually access it.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/*
 * Exposed as /proc/sys/vm/memory_failure_recovery: when 0, panic on any
 * memory failure instead of attempting recovery.
 */
int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * When hwpoison_filter_memcg is set, only pages charged to the matching
 * memory cgroup pass the filter. This lets stress tests confine injected
 * errors to the tasks in one cgroup instead of the whole system.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);
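
/*
 * Usage note (a sketch, assuming the usual test setup): these filter
 * knobs are typically poked through debugfs by the hwpoison-inject test
 * module (CONFIG_HWPOISON_INJECT), e.g. by writing a pfn to
 * /sys/kernel/debug/hwpoison/corrupt-pfn after enabling the filter.
 * The exact debugfs files belong to that module, not to this file.
 */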

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handle it.
 */

struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	short size_shift;	/* log2 of the affected mapping size */
	char addr_valid;	/* 0 when no usable user address was found */
};

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
	struct task_struct *t = tk->tsk;
	short addr_lsb = tk->size_shift;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
				       addr_lsb, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
				      addr_lsb, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}
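
/*
 * User space sees the above as SIGBUS with si_code BUS_MCEERR_AR (error
 * hit the current execution context) or BUS_MCEERR_AO (action optional,
 * e.g. found by a scrubber), with si_addr_lsb set from tk->size_shift so
 * the handler knows how much memory around si_addr is affected. Processes
 * opt in to the early AO signals via prctl(PR_MCE_KILL) or the
 * memory_failure_early_kill sysctl above.
 */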

/*
 * When a unknown page type is encountered drain as many buffers as possible
 * in the hope to turn the page into a LRU or free page, which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only call shrink_node_slabs here (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

static unsigned long dev_pagemap_mapping_shift(struct page *page,
		struct vm_area_struct *vma)
{
	unsigned long address = vma_address(page, vma);
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset(vma->vm_mm, address);
	if (!pgd_present(*pgd))
		return 0;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		return 0;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		return 0;
	if (pud_devmap(*pud))
		return PUD_SHIFT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return 0;
	if (pmd_devmap(*pmd))
		return PMD_SHIFT;
	pte = pte_offset_map(pmd, address);
	if (!pte_present(*pte))
		return 0;
	if (pte_devmap(*pte))
		return PAGE_SHIFT;
	return 0;
}
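
/*
 * The shift computed above ends up in to_kill.size_shift and is reported
 * to user space as si_addr_lsb in the SIGBUS siginfo (see kill_proc()),
 * telling the process how large a region around si_addr is affected.
 */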

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.  We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;
	if (is_zone_device_page(p))
		tk->size_shift = dev_pagemap_mapping_shift(p, vma);
	else
		tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;

	/*
	 * If we could not find a user space address inside the VMA, or a
	 * ZONE_DEVICE page turned out to have no mapping left
	 * (size_shift == 0), there is no precise address to report. Mark
	 * the entry invalid; kill_procs() then treats it as unmappable
	 * and force-kills the task when a kill is required.
	 */
	if (tk->addr == -EFAULT || tk->size_shift == 0) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
		unsigned long pfn, int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
						 tk->tsk, PIDTYPE_PID);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk, pfn, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group: a thread with both PF_MCE_PROCESS and
 * PF_MCE_EARLY set, as configured via prctl(PR_MCE_KILL). Return the
 * task_struct of the first such thread found, or NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill," and otherwise returns NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped it in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliable.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_DAX]			= "dax page",
	[MF_MSG_UNKNOWN]		= "unknown page",
};
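
/*
 * The two tables above feed the "recovery action" log line printed by
 * action_result() below, e.g. (the pfn here is illustrative):
 *
 *	Memory failure: 0x12345: recovery action for dirty LRU page: Recovered
 */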

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate
		 * This fails on dirty or anything with private pages
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be better off instead of doing nothing.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done the only reference left
	 * should be the one m_f() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch"
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been teared down in the meanwhile
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd first
		 * like write() or lseek() there is a chance that anything
		 * can be lost. The application may also get confused
		 * by the error and consider the file as corrupted.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache(ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error anonymous
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved
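
/*
 * Worked example of the table lookup in identify_page_state(): an
 * ordinary dirty page-cache page has PG_lru and PG_dirty set, so the
 * first row whose masked bits all match is { lru|dirty, lru|dirty, ... }
 * and the action is me_pagecache_dirty(). A clean page in the swap cache
 * matches { sc|dirty, sc, ... } instead, because
 * (flags & (sc|dirty)) == sc when PG_dirty is clear.
 */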

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not. Only kill when the page
	 * was dirty or the caller demands it; otherwise the
	 * tokill list is simply freed without sending any signal
	 * (see kill_procs()).
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);

	return unmap_success;
}

static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return 0;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_hwpoison_page(head);
		return 0;
	}

	/*
	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
	 * simply disable it. In order to make it work properly, we need
	 * make sure that:
	 *  - conversion of a pud that maps an error hugetlb into hwpoison
	 *    entry properly works, and
	 *  - other mm code walking over page table is aware of pud-aligned
	 *    hwpoison entries.
	 */
	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
		struct dev_pagemap *pgmap)
{
	struct page *page = pfn_to_page(pfn);
	const bool unmap_success = true;
	unsigned long size = 0;
	struct to_kill *tk;
	LIST_HEAD(tokill);
	int rc = -EBUSY;
	loff_t start;
	dax_entry_t cookie;

	/*
	 * Prevent the inode from being freed while we are interrogating
	 * the address_space, typically this would be handled by
	 * lock_page(), but dax pages do not use the page lock. This
	 * also prevents changes to the mapping of this pfn until
	 * poison signaling is complete.
	 */
	cookie = dax_lock_page(page);
	if (!cookie)
		goto out;

	if (hwpoison_filter(page)) {
		rc = 0;
		goto unlock;
	}

	switch (pgmap->type) {
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_PUBLIC:
		/*
		 * TODO: Handle HMM pages which may need coordination
		 * with device-side memory.
		 */
		goto unlock;
	default:
		break;
	}

	/*
	 * Use this flag as an indication that the dax page has been
	 * remapped UC to prevent speculative consumption of poison.
	 */
	SetPageHWPoison(page);

	/*
	 * Unlike System-RAM there is no possibility to swap in a
	 * different physical page at a given virtual address, so all
	 * userspace consumption of ZONE_DEVICE memory necessitates
	 * SIGBUS (i.e. MF_MUST_KILL)
	 */
	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
	collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);

	list_for_each_entry(tk, &tokill, nd)
		if (tk->size_shift)
			size = max(size, 1UL << tk->size_shift);
	if (size) {
		/*
		 * Unmap the largest mapping to avoid breaking up
		 * device-dax mappings which are constant size. The
		 * actual size of the mapping being torn down is
		 * communicated in siginfo, see kill_proc()
		 */
		start = (page->index << PAGE_SHIFT) & ~(size - 1);
		unmap_mapping_range(page->mapping, start, start + size, 0);
	}
	kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
	rc = 0;
unlock:
	dax_unlock_page(page, cookie);
out:
	/* drop pgmap ref acquired in caller */
	put_dev_pagemap(pgmap);
	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
	return rc;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks hold.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	struct dev_pagemap *pgmap;
	int res;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	pgmap = get_dev_pagemap(pfn, NULL);
	if (pgmap)
		return memory_failure_dev_pagemap(pfn, flags, pgmap);

	p = pfn_to_page(pfn);
	if (PageHuge(p))
		return memory_failure_hugetlb(pfn, flags);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	orig_head = hpage = compound_head(p);
	num_poisoned_pages_inc();

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_dec();
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p, 0);
	/* shake_page could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
	}

	lock_page(p);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
	 * correctly, we save a copy of the page flags at this time.
	 */
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;

	/*
	 * unpoison always clear PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}

	if (!PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(p);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
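
/*
 * Illustrative caller (a sketch, not code from this file): an arch MCE
 * handler that has already translated the reported physical address to
 * a pfn could defer the handling from IRQ context like this:
 *
 *	memory_failure_queue(PHYS_PFN(paddr), 0);
 *
 * The queued entry is then drained by memory_failure_work_func() below
 * in process context, where memory_failure() is allowed to sleep.
 */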

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software-level, so it only works
 * for linux injected failures, not real hardware failures
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (!get_hwpoison_page(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_dec();
		freeit = 1;
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
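
/*
 * Usage note (a sketch, assuming CONFIG_HWPOISON_INJECT): unpoison_memory()
 * is normally exercised through the hwpoison-inject debugfs interface to
 * undo software-injected poison during testing. Real hardware errors
 * cannot be undone this way, as the kernel-doc above states.
 */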

static struct page *new_page(struct page *p, unsigned long private)
{
	int nid = page_to_nid(p);

	return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * A page with zero refcount may be a free hugepage or a free buddy
	 * page; there is nothing to grab a reference on, so report it as
	 * free and let the caller take it offline directly.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) &&
	    !PageLRU(page) && !__PageMovable(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
				pfn, page->flags, &page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() takes a refcount each,
	 * so need to drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
			pfn, ret, page->flags, &page->flags);
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
		if (ret > 0)
			ret = -EIO;
	} else {
		/*
		 * We set PG_hwpoison only when the migration source hugepage
		 * was successfully dissolved, because otherwise hwpoisoned
		 * hugepage remains on free hugepage list, then userspace will
		 * find it as SIGBUS by allocation failure. That's not expected
		 * in soft-offlining.
		 */
		ret = dissolve_free_huge_page(page);
		if (!ret) {
			if (set_hwpoison_free_buddy_page(page))
				num_poisoned_pages_inc();
			else
				ret = -EBUSY;
		}
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	if (PageLRU(page))
		ret = isolate_lru_page(page);
	else
		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
	/*
	 * Drop page reference which came from get_any_page();
	 * a successful isolate_lru_page() already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		/*
		 * After isolating an lru page, PageLRU is cleared, so
		 * use !__PageMovable instead: an LRU page's mapping
		 * cannot have PAGE_MAPPING_MOVABLE.
		 */
		if (!__PageMovable(page))
			inc_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
				pfn, ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
			pfn, ret, page_count(page), page->flags, &page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	int mt;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(page);
		if (!PageAnon(page) || unlikely(split_huge_page(page))) {
			unlock_page(page);
			if (!PageAnon(page))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(page);
			return -EBUSY;
		}
		unlock_page(page);
	}

	/*
	 * Setting MIGRATE_ISOLATE here ensures that the page will be linked
	 * to the free list immediately (not via pcplist) when released after
	 * successful page migration. Otherwise we can't guarantee that the
	 * page is really free after put_page() returns, so
	 * set_hwpoison_free_buddy_page() highly likely fails.
	 */
	mt = get_pageblock_migratetype(page);
	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);
	set_pageblock_migratetype(page, mt);
	return ret;
}

static int soft_offline_free_page(struct page *page)
{
	int rc = dissolve_free_huge_page(page);

	if (!rc) {
		if (set_hwpoison_free_buddy_page(page))
			num_poisoned_pages_inc();
		else
			rc = -EBUSY;
	}
	return rc;
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (is_zone_device_page(page)) {
		pr_debug_ratelimited("soft_offline: %#lx page is device page\n",
				pfn);
		if (flags & MF_COUNT_INCREASED)
			put_page(page);
		return -EIO;
	}

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		ret = soft_offline_free_page(page);

	return ret;
}