/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by the
 * Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or
 * cache failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping
 * pages from being used without actually touching them first.
 *
 * Handles page cache pages in various states. The tricky part here is
 * that we can access any page asynchronously with respect to other VM
 * users, because memory failures could happen anytime and anywhere,
 * possibly violating some of their assumptions. This is why this code
 * has to be extremely careful. Generally it tries to use normal locking
 * rules, as in getting the standard locks, even if that means the error
 * handling takes potentially a long time.
 *
 * It can be very tempting to add handling for obscure cases here.
 * In general any code for handling new cases should only be added iff:
 * - You know how to test it.
 * - You have a test that can be added to mce-test
 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
 * - The case actually shows up as a frequent (top 10) page state in
 *   tools/vm/page-types when running a real workload.
 *
 * There are several operations here with exponential complexity because
 * of unsuitable VM data structures. For example the operation to map back
 * from RMAP chains to processes has to walk the complete process list and
 * has non linear complexity with the number of processes. But since memory
 * corruptions are rare we hope to get away with this. This avoids
 * impacting the core VM.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * The memcg filter restricts error injection to pages charged to one
 * memory cgroup: set hwpoison_filter_memcg to the inode number of the
 * cgroup (as reported by the cgroup filesystem) and only pages owned
 * by that cgroup will be poisoned. This allows stress testing hwpoison
 * against a dedicated test workload without disturbing the rest of the
 * system.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);
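
/*
 * Usage sketch (not part of the handler itself): with CONFIG_HWPOISON_INJECT,
 * the hwpoison-inject module (mm/hwpoison-inject.c) exposes these filter
 * knobs through debugfs, so a test run can restrict injection, e.g. to one
 * block device (exact file names are the injector's, not this file's):
 *
 *	echo 1    > /sys/kernel/debug/hwpoison/corrupt-filter-enable
 *	echo 8    > /sys/kernel/debug/hwpoison/corrupt-filter-dev-major
 *	echo 0    > /sys/kernel/debug/hwpoison/corrupt-filter-dev-minor
 *	echo $pfn > /sys/kernel/debug/hwpoison/corrupt-pfn
 */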

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}
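
/*
 * For reference, a sketch of the receiving side of these signals in user
 * space (illustrative, not kernel code): a recovery-aware application
 * installs a SIGBUS handler with SA_SIGINFO and reads the poisoned range
 * from siginfo:
 *
 *	void handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		size_t len = (size_t)1 << si->si_addr_lsb;
 *		// si->si_addr .. si->si_addr + len is unusable;
 *		// si->si_code is BUS_MCEERR_AO or BUS_MCEERR_AR.
 *	}
 */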

/*
 * When a unknown page type is encountered drain as many buffers as possible
 * in the hope to turn the page into a LRU or free page, which we can handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only drop the slab caches (which could also shrink other,
	 * unrelated caches) if the access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually removed until the process exits.
 *
 * There are two kill policies:
 * - Early kill (PF_MCE_EARLY on a thread, or the system-wide
 *   vm.memory_failure_early_kill sysctl): processes are signalled as
 *   soon as the error is detected, which lets checkpointing or
 *   recovery-aware applications react before they can consume
 *   corrupted data.
 * - Late kill (the default): a process is only killed when it actually
 *   touches the corrupted page.
 *
 * Either way the signal is SIGBUS, with the faulting address in
 * si_addr, the granularity of the corruption in si_addr_lsb and
 * BUS_MCEERR_AO/BUS_MCEERR_AR in si_code.
 *
 * The list of processes scheduled to be killed is built up first,
 * while the various locks needed to find them are still held; the
 * actual killing happens later in kill_procs().
 */
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was munmapped.
	 * But it could also have been mremapped, so assume the worst.
	 * The task is still queued; kill_procs() falls back to SIGKILL
	 * for an invalid address when a kill is forced.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  bool fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
					      pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill," and otherwise returns NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}
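
/*
 * Illustrative user-space counterpart: a thread opts in to early kill
 * with the PR_MCE_KILL prctl, which sets the PF_MCE_PROCESS and
 * PF_MCE_EARLY flags checked above:
 *
 *	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 *
 * System-wide early kill is the vm.memory_failure_early_kill sysctl.
 */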

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * Drop the page count elevated by isolate_lru_page().
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
				struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO will be only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get error on
		 * fsync, but does other operations on the fd first
		 * this can confuse it.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down kill region to one page, we need to break up pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		unlock_page(hpage);
		/*
		 * migration entry prevents later access on error anonymous
		 * hugepage, so we can free and dissolve it into buddy to
		 * save healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved
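
/*
 * Worked example of the table lookup (commentary only): a dirty page that
 * is both in the swap cache and on the LRU has PG_dirty, PG_swapcache,
 * PG_swapbacked and PG_lru set. Scanning error_states[] in order, the
 * MF_MSG_DIRTY_SWAPCACHE entry is the first whose masked bits match, so
 * me_swapcache_dirty() runs and the later, more generic LRU entries are
 * never consulted. Ordering in this table is therefore load-bearing.
 */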

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */

	/*
	 * Could adjust isolation ratelimits, per node.
	 */
	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
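
/*
 * Typical calling pattern (sketch): take the reference first, then
 * re-check whatever state matters, and drop it with put_hwpoison_page()
 * (which is simply put_page()) when done:
 *
 *	if (get_hwpoison_page(p)) {
 *		... inspect or isolate p ...
 *		put_hwpoison_page(p);
 *	}
 */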

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put mlocked page in lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);

	return unmap_success;
}

static int identify_page_state(unsigned long pfn, struct page *p,
				unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}

static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return 0;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_hwpoison_page(head);
		return 0;
	}

	if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	if (PageHuge(p))
		return memory_failure_hugetlb(pfn, trapno, flags);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	arch_unmap_kpfn(pfn);

	orig_head = hpage = compound_head(p);
	num_poisoned_pages_inc();

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hand:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_dec();
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_lock is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	shake_page(p, 0);
	/* shake_page could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
	}

	lock_page(p);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * page_remove_rmap() in try_to_unmap_one(). So to determine page
	 * status correctly, we save a copy of the page flags at this time.
	 */
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;

	/*
	 * unpoison always clear PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}

	if (!PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(p);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
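
/*
 * Call-site sketch (illustrative; the exact call sites live in arch and
 * firmware-error code, not here): a machine check handler that must
 * recover before returning to the faulting context uses
 *
 *	memory_failure(pfn, trapno, MF_ACTION_REQUIRED);
 *
 * while errors reported out of context (e.g. by a patrol scrubber) are
 * handled with flags == 0.
 */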

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of memory failure.
 *
 * Corrupted pages are always poisoned no matter what.
 *
 * This function is designed to be called from IRQ context, where
 * allocating memory or taking sleeping locks is not allowed, so the
 * entry is stashed in a per-cpu lock-protected kfifo and handed to
 * a work queue.
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
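
/*
 * Sketch of the intended use (illustrative): a hardware error driver
 * running in interrupt context, e.g. a firmware-first error handler,
 * defers the heavy recovery work to process context with
 *
 *	memory_failure_queue(PHYS_PFN(paddr), 0, 0);
 *
 * and memory_failure_work_func() below picks the entry up from the
 * per-cpu kfifo.
 */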

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (!get_hwpoison_page(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);

	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable: this function is only a best-effort debugging
	 * aid, used by error injection tests where the race window is small.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_dec();
		freeit = 1;
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
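
/*
 * Usage note (sketch): unpoisoning pairs with software injection. With
 * the hwpoison-inject module loaded, a test can undo a previous injection
 * by writing the pfn back (file name is the injector's):
 *
 *	echo $pfn > /sys/kernel/debug/hwpoison/unpoison-pfn
 */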

static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);

	return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * A zero refcount is not necessarily an error: tell apart free
	 * hugepages and free buddy pages from genuinely unknown
	 * zero-refcount page types.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) &&
	    !PageLRU(page) && !__PageMovable(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
				pfn, page->flags, &page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() take a refcount each,
	 * so we need to drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
			pfn, ret, page->flags, &page->flags);
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
		if (ret > 0)
			ret = -EIO;
	} else {
		if (PageHuge(page))
			dissolve_free_huge_page(page);
	}
	return ret;
}

static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	if (PageLRU(page))
		ret = isolate_lru_page(page);
	else
		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
	/*
	 * Drop the page reference which came from get_any_page():
	 * a successful isolation already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		/*
		 * After an isolated LRU page has PageLRU cleared, use
		 * !__PageMovable instead, since an LRU page's mapping
		 * cannot have PAGE_MAPPING_MOVABLE.
		 */
		if (!__PageMovable(page))
			inc_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
				pfn, ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
			pfn, ret, page_count(page), page->flags, &page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);

	return ret;
}

static void soft_offline_free_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!TestSetPageHWPoison(head)) {
		num_poisoned_pages_inc();
		if (PageHuge(head))
			dissolve_free_huge_page(page);
	}
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * "good enough" for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		soft_offline_free_page(page);

	return ret;
}
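
/*
 * User-visible entry point (sketch): with CONFIG_MEMORY_FAILURE, the
 * memory sysfs class wires soft offlining up to
 *
 *	echo $phys_addr > /sys/devices/system/memory/soft_offline_page
 *
 * (see drivers/base/memory.c), which resolves the address to a pfn and
 * calls soft_offline_page() with flags == 0.
 */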