/*
 * Copyright (C) 2008, 2009 Intel Corporation
 * Authors: Andi Kleen, Fengguang Wu
 *
 * This software may be redistributed and/or modified under the terms of
 * the GNU General Public License ("GPL") version 2 only as published by
 * the Free Software Foundation.
 *
 * High level machine check handler. Handles pages reported by the
 * hardware as being corrupted, usually due to a multi-bit ECC memory or
 * cache failure.
 *
 * In addition there is a "soft offline" entry point that allows stopping
 * the use of not-yet-corrupted pages without killing anything.
 *
 * Handles page cache pages in various states. The tricky part here is
 * that we can access any page asynchronously with respect to other VM
 * users, because memory failures could happen at any time and anywhere.
 * This is why this code has to be extremely careful. Generally it tries
 * to use normal locking rules, as in getting the standard locks, even
 * if that means the error handling takes potentially a long time.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"

int sysctl_memory_failure_early_kill __read_mostly = 0;

int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

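/*
 * Filter by device: only handle errors on pages backed by the configured
 * block device (major/minor). A value of ~0U means "don't filter on this
 * field". Used by the hwpoison injector for targeted testing.
 */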
static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

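/*
 * Filter by page flags: only handle errors on pages whose stable flags,
 * masked by hwpoison_filter_flags_mask, match hwpoison_filter_flags_value.
 */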
static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

/*
 * This allows stress testing of hwpoison and injection while limiting
 * injection to pages owned by a specific memory cgroup. The cgroup is
 * identified by the inode number of its cgroupfs directory, as returned
 * by page_cgroup_ino().
 */
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

/*
 * Send all the processes who have the page mapped a signal.
 * ``action optional'' if they are not immediately affected by the error
 * ``action required'' if error happened in current execution context
 */
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		/*
		 * Don't use force here, it's convenient if the signal
		 * can be temporarily blocked.
		 * This could cause a loop when the user sets SIGBUS
		 * to SIGIGN, but hopefully no one will do that?
		 */
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

/*
 * When an unknown page type is encountered, drain as many buffers as
 * possible in the hope of turning the page into an LRU or free page,
 * which we can then handle.
 */
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	/*
	 * Only shrink the slab caches (which would also shrink
	 * other caches) if access is not potentially fatal.
	 */
	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
 * General strategy:
 * Find all processes having the page mapped and kill them.
 * But we keep a page reference around so that the page is not
 * actually freed yet.
 * Then stash the page away.
 *
 * There's no convenient way to get back to mapped processes
 * from the VMAs. So do a brute-force search over all
 * running processes.
 *
 * Remember that machine checks are not common (or rather
 * if they are common you have other problems), so this shouldn't
 * be a performance issue.
 *
 * Also there are some races possible while we get from the
 * error detection to actually handling it.
 */
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

/*
 * Failure handling: if we can't find or can't kill a process there's
 * not much we can do.  We just print a message and ignore otherwise.
 */

/*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 * TBD would GFP_NOIO be enough?
 */
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * In theory we don't have to kill when the page was
	 * munmaped. But it could be also a mremap. Since that's
	 * likely very rare kill anyways just out of paranoia, but use
	 * a SIGKILL because the error is not contained anymore.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

/*
 * Kill the processes that have been collected earlier.
 *
 * Only do anything when FORCEKILL is set, otherwise just free the
 * list (this is used for clean pages which do not need killing).
 * Also when FAIL is set do a force kill because something went
 * wrong earlier.
 */
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  int fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe(tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * In case something went wrong with munmapping
			 * make sure the process doesn't catch the
			 * signal and then access the memory. Just kill it.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			}

			/*
			 * In theory the process could have mapped
			 * something else on the address in-between. We could
			 * check for that, but we need to tell the
			 * process anyways.
			 */
			else if (kill_proc(tk->tsk, tk->addr, trapno,
					      pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

/*
 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 * on behalf of the process: the first thread found with both PF_MCE_PROCESS
 * and PF_MCE_EARLY set. Return NULL if there is none.
 *
 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 * have to call rcu_read_lock/unlock() in this function.
 */
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

/*
 * Determine whether a given process is an "early kill" process which expects
 * to be signaled when some page under the process is hwpoisoned.
 * Return task_struct of the dedicated thread (main thread unless explicitly
 * specified) if the process is "early kill", and otherwise return NULL.
 */
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;

	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

/*
 * Collect processes when the error hit an anonymous page.
 */
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)	/* Not actually mapped anymore */
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

/*
 * Collect processes when the error hit a file mapped page.
 */
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			/*
			 * Send early kill signal to tasks where a vma covers
			 * the page but the corrupted page is not necessarily
			 * mapped in its pte.
			 * Assume applications who requested early kill want
			 * to be informed of all such data corruptions.
			 */
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

/*
 * Collect the processes who have the corrupted page mapped to kill.
 * This is done in two steps for locking reasons.
 * First preallocate one tokill structure outside the spin locks,
 * so that we can kill at least one process reasonably reliably.
 */
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

/*
 * XXX: It is possible that a page is isolated from LRU cache,
 * and then kept in swap cache or failed to remove from page cache.
 * The page count will stop it from being freed by unpoison.
 * Stress tests should be aware of this memory leak problem.
 */
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is unpoison-and-freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * drop the page count elevated by isolate_lru_page()
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

/*
 * Error hit kernel page.
 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 * could be better off instead.
 */
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

/*
 * Page in unknown state. Do nothing.
 */
static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

/*
 * Clean (or cleaned) page cache page.
 */
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = MF_FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages we're done; the only reference left
	 * should be the one memory_failure() holds.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	/*
	 * Now truncate the page in the page cache. This is really
	 * more like a "temporary hole punch".
	 * Don't do this for block devices when someone else
	 * has a reference, because it could be file system metadata
	 * and that's not safe to truncate.
	 */
	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meanwhile.
		 */
		return MF_FAILED;
	}

	/*
	 * Truncation is a bit tricky. Enable it per file system for now.
	 *
	 * Open: to take i_mutex or not for this? Right now we don't.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		/*
		 * If the file system doesn't support it just invalidate.
		 * This fails on dirty or anything with private pages.
		 */
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

/*
 * Dirty pagecache page
 * Issues: when the error hit a hole page the error is not properly
 * propagated.
 */
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);
	if (mapping) {
		/*
		 * Report the error through the mapping so that it is
		 * visible to write(), fsync(), etc., which check AS_EIO.
		 * This way the application knows that something went
		 * wrong with its dirty file data.
		 *
		 * Open issue: AS_EIO is cleared by the first operation
		 * that returns an error, while the PageError bit is
		 * stickier and only cleared when the page is reread or
		 * dropped. Since we drop the page here, an application
		 * that only checks a later fsync() may miss the error.
		 * This can already happen without hwpoisoned pages
		 * (e.g. on metadata IO errors, which only report through
		 * AS_EIO), so right now we assume that the application
		 * DOES understand the EIO semantics.
		 */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

/*
 * Clean and dirty swap cache.
 *
 * Dirty swap cache page is tricky to handle. The page could live both in page
 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
 * referenced concurrently by 2 types of PTEs:
 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 * and then
 *      - clear dirty bit to prevent IO
 *      - remove from LRU
 *      - but keep in the swap cache, so that when we return to it on
 *        a later page fault, we know the application is accessing
 *        corrupted data and shall be killed (we installed simple
 *        interception code in do_swap_page to catch it).
 *
 * Clean swap cache pages can be directly isolated. A later page fault will
 * bring in the known good data from disk.
 */
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	/* Trigger EIO in shmem: */
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

/*
 * Huge pages. Needs work.
 * Issues:
 * - Error on hugepage is contained in hugepage unit (not in raw page unit).
 *   To narrow down the kill region to one page, we need to break up the pmd.
 */
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);

	if (!PageHuge(hpage))
		return MF_DELAYED;

	/*
	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from the freelist.
	 * To check whether a hugepage is in-use or not, we can't use
	 * page->lru because it can be used in other hugepage operations,
	 * such as __unmap_hugepage_range() and gather_surplus_pages().
	 * So instead we use page_mapping() and PageAnon().
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return MF_RECOVERED;
	}
	return MF_DELAYED;
}

/*
 * Various page states we can handle.
 *
 * A page state is defined by its current page->flags bits.
 * The table matches them in order and calls the right handler.
 *
 * This is quite tricky because we can access a page at any time
 * in its live cycle, so all accesses have to be extremely careful.
 *
 * This is not complete. More states could be added.
 * For any missing state don't attempt recovery.
 */
#define dirty		(1UL << PG_dirty)
#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
	/*
	 * free pages are specially detected outside this table:
	 * PG_buddy pages only make a small fraction of all free pages.
	 */

	/*
	 * Could in theory check if the slab page is free or if we can drop
	 * currently unused objects without touching them. But just
	 * treat it as standard kernel for now.
	 */
	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,		me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	/*
	 * Catchall entry: must be at end.
	 */
	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef head
#undef slab
#undef reserved

/*
 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 */
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count != 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	/* Could do more checks here if page looks ok */
	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value.)
 */
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so let's give it up.
		 * This should be better than triggering BUG_ON when kernel
		 * tries to touch the "partially handled" page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

/*
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;

	/*
	 * Here we are interested only in user-mapped pages, so skip any
	 * other types of pages.
	 */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return SWAP_FAIL;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide if we should kill or just drop the page.
	 * XXX: the dirty test could be racy: set_page_dirty() may not always
	 * be called inside page lock (it's recommended but not enforced).
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * First collect all the processes that have the page
	 * mapped in dirty form.  This has to be done before try_to_unmap,
	 * because ttu takes the rmap data structures down.
	 *
	 * Error handling: We ignore errors here because
	 * there's nothing that can be done.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	ret = try_to_unmap(hpage, ttu);
	if (ret != SWAP_SUCCESS)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * Now that the dirty bit has been propagated to the
	 * struct page and all unmaps are done we can decide if
	 * killing is needed or not.  Only kill when the page
	 * was dirty or the process is not restartable,
	 * otherwise the tokill list is merely
	 * freed.  When there was a problem unmapping earlier
	 * use a more force-full uncatchable kill to prevent
	 * any accesses to the poisoned memory.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

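/*
 * Mark/unmark PG_hwpoison on every sub-page of a huge page: error
 * containment for hugetlb pages is currently done in whole-hugepage units.
 */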
static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries best effort recovery: dropping the page,
 * killing processes if necessary and so on.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	orig_head = hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	/*
	 * Currently errors on hugetlbfs pages are measured in hugepage units,
	 * so nr_pages should be 1 << compound_order.  OTOH when errors are on
	 * transparent hugepages, they are supposed to be split and error
	 * measurement is done in normal page units.  So nr_pages should be one
	 * in this case.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else /* normal page or thp */
		nr_pages = 1;
	num_poisoned_pages_add(nr_pages);

	/*
	 * We need/can do nothing about count=0 pages.
	 * 1) it's a free page, and therefore in safe hands:
	 *    prep_new_page() will be the gate keeper.
	 * 2) it's a free hugepage, which is also safe:
	 *    an affected hugepage will be dequeued from the hugepage freelist,
	 *    so there's no concern about reusing it ever after.
	 * 3) it's part of a non-compound high order page.
	 *    Implies some kernel user: cannot stop them from
	 *    R/W the page; let's pray that the page has been
	 *    used and will be freed some time later.
	 * In fact it's dangerous to directly bump up the page count from 0,
	 * as that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check "filter hit" and "race with other subpage."
			 */
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					num_poisoned_pages_sub(nr_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, MF_MSG_FREE_HUGE,
				      res ? MF_IGNORED : MF_DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (!PageHuge(p) && PageTransHuge(hpage)) {
		lock_page(p);
		if (!PageAnon(p) || unlikely(split_huge_page(p))) {
			unlock_page(p);
			if (!PageAnon(p))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_sub(nr_pages);
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(p);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * We ignore non-LRU pages for good reasons.
	 * - PG_locked is only well defined for LRU pages and a few others
	 * - to avoid races with __SetPageLocked()
	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
	 * The check (unnecessarily) ignores LRU pages being isolated and
	 * walked by the page reclaim code, however that's not a big loss.
	 */
	if (!PageHuge(p)) {
		if (!PageLRU(p))
			shake_page(p, 0);
		if (!PageLRU(p)) {
			/*
			 * shake_page could have turned it free.
			 */
			if (is_free_buddy_page(p)) {
				if (flags & MF_COUNT_INCREASED)
					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
				else
					action_result(pfn, MF_MSG_BUDDY_2ND,
						      MF_DELAYED);
				return 0;
			}
		}
	}

	lock_page(hpage);

	/*
	 * The page could have changed compound pages during the locking.
	 * If this happens just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * We use page flags to determine what action should be taken, but
	 * the flags can be modified by the error containment action.  One
	 * example is an mlocked page, where PG_mlocked is cleared by
	 * munlock_vma_page() in try_to_unmap() before this point.  So we
	 * need to save the original page flags here to choose the right
	 * action later.
	 */
	page_flags = p->flags;

	/*
	 * unpoison always clears PG_hwpoison inside page lock
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * For error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all pages in an error hugepage,
	 * because containment is done in hugepage unit for now.
	 * Since we have done TestSetPageHWPoison() for the head page with
	 * page lock held, we can safely set PG_hwpoison bits on tail pages.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	/*
	 * It's very difficult to mess with pages currently under IO
	 * and in many cases impossible, so we just avoid it here.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of user space mappings.
	 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
	 *
	 * When the raw error page is a thp tail page, hpage points to the raw
	 * page after thp split.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;
	/*
	 * The first check uses the current page flags which may not have any
	 * relevant information. The second check with the saved page flags is
	 * carried out only if the first check can't determine the page status.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

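/*
 * Machine check events can be reported from contexts where memory_failure()
 * cannot run directly, so they are buffered in a small per-CPU kfifo and
 * handled later from a work queue.
 */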
#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of memory failure job.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber)
 *
 * Can run in IRQ context.
 */
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

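/*
 * Work queue handler: drain this CPU's fifo and run the appropriate
 * handler (memory_failure() or soft_offline_page()) for each entry.
 */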
static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

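/*
 * Set up the per-CPU fifos and work structs used by memory_failure_queue().
 */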
static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

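/* Ratelimited pr_info() helper for the unpoisoning path below. */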
#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by
 * memory_failure() earlier.
 *
 * This is only done on the software-level, so it only works
 * for linux injected failures, not real hardware failures.
 *
 * Returns 0 for success, otherwise -errno.
 */
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter thp only when the thp is being
	 * worked by memory_failure() and the page lock is not held yet.
	 * In such case, we yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_hwpoison_page(p)) {
		/*
		 * Since a HWPoisoned hugepage should have non-zero refcount,
		 * a race between memory failure and unpoison seems to have
		 * happened. In such case unpoison fails and memory failure
		 * runs to the end.
		 */
		if (PageHuge(page)) {
			unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
					 pfn, &unpoison_rs);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because that won't trigger kernel panic. Instead,
	 * the PG_hwpoison page will be caught and isolated on the entrance to
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_sub(nr_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

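/*
 * Allocation callback for migrate_pages(): allocate the replacement page
 * on the same node as the poisoned one, as a hugetlb page if needed.
 */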
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);

	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

/*
 * Safely get reference count of an arbitrary page.
 * Returns 0 for a free page, -EIO for a zero refcount page
 * that is not free, and 1 for any other page type.
 * For 1 the page is returned with increased page count, otherwise 0.
 */
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	/*
	 * When the target page is a free hugepage or a free buddy page,
	 * report it as such without taking a reference; the caller can
	 * then deal with the free page directly.
	 */
	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page */
		ret = 1;
	}
	return ret;
}

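/*
 * Like __get_any_page(), but if the page is not on the LRU yet, try to
 * drain the per-CPU caches with shake_page() and check again before
 * giving up.
 */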
static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		/*
		 * Try to free it.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop page reference which is from __get_any_page() */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

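/*
 * Soft offline a hugetlb page: migrate its contents to a fresh hugepage
 * and mark the original one hwpoisoned, without killing anything.
 */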
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * This double-check of PageHWPoison is to avoid the race with
	 * memory_failure() that happened on another CPU.
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() take a refcount each,
	 * so we need to drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		/*
		 * We know that soft_offline_huge_page() tries to migrate
		 * only one hugepage pointed to by hpage, so we need not
		 * run through the pagelist here.
		 */
		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
	} else {
		/* overcommit hugetlb page will be freed to buddy */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			num_poisoned_pages_add(1 << compound_order(hpage));
		} else {
			SetPageHWPoison(page);
			num_poisoned_pages_inc();
		}
	}
	return ret;
}

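/*
 * Soft offline a base page: try a lightweight invalidation first, and
 * fall back to migrating the contents to a fresh page.
 */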
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside page lock because PageHWPoison
	 * is set by memory_failure() outside page lock. Note that
	 * memory_failure() also double-checks PageHWPoison inside page lock,
	 * so there's no race between soft_offline_page() and memory_failure().
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}
	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);
	/*
	 * RED-PEN would be better to keep it isolated here, but we
	 * would need to fix isolation locking first.
	 */
	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead. migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	/*
	 * Drop the page reference which came from get_any_page();
	 * a successful isolate_lru_page() already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_node_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist)) {
				list_del(&page->lru);
				dec_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
				putback_lru_page(page);
			}

			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}

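/*
 * Soft offline an in-use page. A transparent huge page is split first;
 * the rest is dispatched to the hugetlb or base-page implementation.
 */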
static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);

	return ret;
}

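/*
 * Soft offline a page that is already free: just mark it hwpoisoned so
 * the allocator will never hand it out again.
 */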
static void soft_offline_free_page(struct page *page)
{
	if (PageHuge(page)) {
		struct page *hpage = compound_head(page);

		set_page_hwpoison_huge_page(hpage);
		if (!dequeue_hwpoisoned_huge_page(hpage))
			num_poisoned_pages_add(1 << compound_order(hpage));
	} else {
		if (!TestSetPageHWPoison(page))
			num_poisoned_pages_inc();
	}
}

/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy decision whether a specific page should be
 * offlined is made by the memory device managing software
 * (e.g. the MCE handler), this just implements the mechanics.
 *
 * This function may sleep, so it must be called from process
 * context.
 */
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		soft_offline_free_page(page);

	return ret;
}