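/*
 * High level machine check handler: handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory
 * or cache failure.  Pages are marked with the PG_hwpoison bit and,
 * where possible, isolated so they are never handed out again.
 */
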
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"

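/*
 * Global tunables, exported as /proc/sys/vm/memory_failure_early_kill
 * and /proc/sys/vm/memory_failure_recovery: early_kill selects whether
 * processes mapping a corrupted page are killed as soon as the error is
 * found rather than when they touch it, and recovery selects recovery
 * attempts over an immediate panic.
 */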
int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

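/*
 * The filter knobs below are set through the hwpoison-inject debugfs
 * module and restrict error injection to pages matching a given block
 * device, page-flag mask or memory cgroup, so that testing can be aimed
 * at a specific workload.
 */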
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * When set, hwpoison_filter_memcg restricts injected failures to pages
 * charged to the memory cgroup whose inode number on the cgroup
 * filesystem matches the value, so error injection can be aimed at one
 * test workload without disturbing the rest of the system.
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send a signal to all the processes that have the page mapped:
 * "action optional" (BUS_MCEERR_AO) if they are not immediately affected
 * by the error, "action required" (BUS_MCEERR_AR) if the error happened
 * in the current execution context.
 */
static int kill_proc(struct task_struct *t, unsigned long addr,
		     unsigned long pfn, struct page *page, int flags)
{
	short addr_lsb;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
	       pfn, t->comm, t->pid);
	addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr,
				       addr_lsb, current);
	} else {
		/*
		 * Don't use force here: it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIG_IGN, but hopefully no one will do that.
		 */
		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)addr,
				      addr_lsb, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered, drain as many buffers as
 * possible in the hope of turning the page into an LRU or free page,
 * which we can then handle.
 */
void shake_page(struct page *p, int access)
{
	if (PageHuge(p))
		return;

	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only shrink the slab caches (which can also shrink other
	 * caches) if access to the page is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually removed until all references to it are gone.
 *
 * There's no convenient way to get back to the mapped processes
 * from the VMAs, so do a brute-force search over all running
 * processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */

struct to_kill {
	struct list_head nd;		/* node in the to-kill list */
	struct task_struct *tsk;	/* task to signal */
	unsigned long addr;		/* user virtual address of the poison */
	char addr_valid;		/* zero if addr could not be resolved */
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.  We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmapped. But it could also have been freed by the VM and
	 * then reused, so if the address cannot be resolved mark it
	 * invalid, which later forces an unconditional kill.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill,
		       bool fail, struct page *page, unsigned long pfn,
		       int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyway.
			 */
			else if (kill_proc(tk->tsk, tk->addr,
					   pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the thread group. Return task_struct of the (first found)
 * dedicated thread if found, and return NULL otherwise.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which will
 * be sent SIGBUS when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise returns NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			       struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			       struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
			  int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

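/*
 * Human-readable strings for the recovery outcomes and page types;
 * these appear verbatim in the "recovery action" line that
 * action_result() prints and traces below.
 */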
static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_NON_PMD_HUGE]		= "non-pmd-sized huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from the LRU cache,
 * and then kept in swap cache or failed to be removed from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this behavior.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);

		/*
		 * Poisoned page might never drop its ref count to 0 so we have
		 * to uncharge it manually from its memcg.
		 */
		mem_cgroup_uncharge(p);

		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

static int truncate_error_page(struct page *p, unsigned long pfn,
			       struct address_space *mapping)
{
	int ret = MF_FAILED;

	if (mapping->a_ops->error_remove_page) {
		int err = mapping->a_ops->error_remove_page(mapping, p);

		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
			   !try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty pages or anything with private data.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}

	return ret;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be more sophisticated.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done: the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	return truncate_error_page(p, pfn, mapping);
}

/*
 * Dirty pagecache page.
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	/* TBD: print more information about the file. */
	if (mapping) {
		/*
		 * IO error will be reported by write(), fsync(), etc.
		 * who check the mapping.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * There's one open issue:
		 *
		 * The EIO is only reported on the next IO
		 * operation and then cleared through the IO map.
		 * Normally Linux has two mechanisms to pass IO error,
		 * first through the AS_EIO flag in the address space
		 * and then through the PageError flag in the page.
		 * Since we drop pages on memory failure handling the
		 * only mechanism open to use is through AS_EIO.
		 *
		 * This has the disadvantage that it gets cleared on
		 * the first operation that returns an error, while
		 * the PageError bit is more sticky and only cleared
		 * when the page is reread or dropped.  If an
		 * application assumes it will always get an error on
		 * fsync, but does other operations on the fd first
		 * like write() or lseek(), there is a chance that
		 * the error is lost.
		 */
		mapping_set_error(mapping, -EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (ie. page is freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 *   To narrow down the kill region to one page, we need to break up the pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);
	struct address_space *mapping;

	if (!PageHuge(hpage))
		return MF_DELAYED;

	mapping = page_mapping(hpage);
	if (mapping) {
		res = truncate_error_page(hpage, pfn, mapping);
	} else {
		unlock_page(hpage);
		/*
		 * The migration entry prevents later access to the error
		 * anonymous hugepage, so we can free and dissolve it into
		 * the buddy allocator to save the healthy subpages.
		 */
		if (PageAnon(hpage))
			put_page(hpage);
		dissolve_free_huge_page(p);
		res = MF_RECOVERED;
		lock_page(hpage);
	}

	return res;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access the page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */

#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,	me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count > 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
				   int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	bool unmap_success;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;
	bool mlocked = PageMlocked(hpage);

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return true;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return true;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return true;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return false;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * Only a page that stayed clean can be dropped without side
	 * effects; anything redirtied through a mapping must keep its
	 * dirty bit so the error gets reported.
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	unmap_success = try_to_unmap(hpage, ttu);
	if (!unmap_success)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * try_to_unmap() might put the mlocked page in the lru cache, so call
	 * shake_page() again to ensure that it's flushed.
	 */
	if (mlocked)
		shake_page(hpage, 0);

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more forceful uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, !unmap_success, p, pfn, flags);

	return unmap_success;
}

static int identify_page_state(unsigned long pfn, struct page *p,
			       unsigned long page_flags)
{
	struct page_state *ps;

	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	return page_action(ps, p, pfn);
}
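
/*
 * Hugetlb pages are handled separately from base pages: poison is
 * recorded on the head page for the whole compound page, and free huge
 * pages are dissolved back into the buddy allocator so that the healthy
 * subpages can be reused.
 */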
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
	struct page *p = pfn_to_page(pfn);
	struct page *head = compound_head(p);
	int res;
	unsigned long page_flags;

	if (TestSetPageHWPoison(head)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
		       pfn);
		return 0;
	}

	num_poisoned_pages_inc();

	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		/*
		 * Check "filter hit" and "race with other subpage."
		 */
		lock_page(head);
		if (PageHWPoison(head)) {
			if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
			    || (p != head && TestSetPageHWPoison(head))) {
				num_poisoned_pages_dec();
				unlock_page(head);
				return 0;
			}
		}
		unlock_page(head);
		dissolve_free_huge_page(p);
		action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
		return 0;
	}

	lock_page(head);
	page_flags = head->flags;

	if (!PageHWPoison(head)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(head);
		put_hwpoison_page(head);
		return 0;
	}

	/*
	 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
	 * simply disable it. In order to make it work properly, we need to
	 * make sure that:
	 *  - conversion of a pud that maps an error hugetlb into hwpoison
	 *    entry properly works, and
	 *  - other mm code walking over page table is aware of pud-aligned
	 *    hwpoison entries.
	 */
	if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
		action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(head);
	return res;
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping pages, killing processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int flags)
{
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure on page %lx", pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	if (PageHuge(p))
		return memory_failure_hugetlb(pfn, flags);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	orig_head = hpage = compound_head(p);
	num_poisoned_pages_inc();

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up page count from 0,
	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_dec();
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * The page could still sit in a per-cpu LRU cache, in which case
	 * PageLRU is not yet set.  shake_page() drains those caches so
	 * that the page shows up either on the LRU or as a free buddy
	 * page, both of which the handlers below can deal with.
	 */
	shake_page(p, 0);

	/* shake_page() could have turned it free. */
	if (!PageLRU(p) && is_free_buddy_page(p)) {
		if (flags & MF_COUNT_INCREASED)
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
		else
			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
		return 0;
	}

	lock_page(p);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * munlock_vma_page() in try_to_unmap_one(). So to determine page
	 * state correctly, we save a copy of the page flags at this time.
	 */
	if (PageHuge(p))
		page_flags = hpage->flags;
	else
		page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unlock_page(p);
		put_hwpoison_page(p);
		return 0;
	}

	if (!PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is a thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = identify_page_state(pfn, p, page_flags);
out:
	unlock_page(p);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
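
/*
 * Memory failures can be reported from NMI or IRQ context where
 * memory_failure() must not be called directly, so the events are
 * buffered in a small per-cpu kfifo and drained from process context
 * by memory_failure_work_func() below.
 */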

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of memory failure system wide.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);
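
/*
 * Drain the per-cpu fifo of queued events; runs from the workqueue,
 * i.e. process context, where memory_failure() and soft_offline_page()
 * are allowed to sleep.
 */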
static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software level, so it only works
 * for Linux-injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (!get_hwpoison_page(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_dec();
		freeit = 1;
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);
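
/*
 * Allocation callback handed to migrate_pages() by the soft-offline
 * paths below: allocate the migration target on the same node as the
 * failing page if possible.
 */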
static struct page *new_page(struct page *p, unsigned long private)
{
	int nid = page_to_nid(p);

	return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * A refcount of zero means the page is free: report it as such
	 * so the caller can poison it in place; free huge pages are only
	 * distinguished from free buddy pages for the message.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) &&
	    !PageLRU(page) && !__PageMovable(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
				pfn, page->flags, &page->flags);
			return -EIO;
		}
	}
	return ret;
}

static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure(). See also comment in __soft_offline_page().
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() take a refcount each,
	 * so we need to drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
			pfn, ret, page->flags, &page->flags);
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
		if (ret > 0)
			ret = -EIO;
	} else {
		if (PageHuge(page))
			dissolve_free_huge_page(page);
	}
	return ret;
}
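
/*
 * Soft-offline a base (non-huge) page: try a cheap page cache
 * invalidation first, then fall back to migrating the contents away so
 * that the physical page can be taken out of service.
 */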
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	if (PageLRU(page))
		ret = isolate_lru_page(page);
	else
		ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
	/*
	 * Drop the page reference which came from get_any_page();
	 * a successful isolate_lru_page() already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		/*
		 * After isolating the page its PageLRU bit is cleared,
		 * so use !__PageMovable instead: an LRU page's mapping
		 * cannot have PAGE_MAPPING_MOVABLE set.
		 */
		if (!__PageMovable(page))
			inc_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist))
				putback_movable_pages(&pagelist);

			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
				pfn, ret, page->flags, &page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
			pfn, ret, page_count(page), page->flags, &page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);

	return ret;
}

static void soft_offline_free_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!TestSetPageHWPoison(head)) {
		num_poisoned_pages_inc();
		if (PageHuge(head))
			dissolve_free_huge_page(page);
	}
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 *
 * This is not a 100% solution for all memory, but tries to be
 * ``good enough'' for the majority of memory.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		soft_offline_free_page(page);

	return ret;
}