/*
 * Memory failure handling: recover from hardware memory errors reported
 * for individual pages (HWPoison). The affected page is isolated and the
 * processes mapping it are killed or notified, depending on the page
 * state and the configured policy.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/kernel-page-flags.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/backing-dev.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memory_hotplug.h>
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
#include "internal.h"
#include "ras/ras_event.h"

/*
 * vm.memory_failure_early_kill: if set, processes mapping a corrupted page
 * are signalled (SIGBUS) as soon as the error is detected rather than when
 * they next touch the page.
 */
int sysctl_memory_failure_early_kill __read_mostly = 0;

/*
 * vm.memory_failure_recovery: if 0, the kernel panics on any memory
 * failure instead of attempting recovery.
 */
int sysctl_memory_failure_recovery __read_mostly = 1;

atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);

#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)

/* Filter knobs used by the hwpoison injector to restrict error injection. */
u32 hwpoison_filter_enable = 0;
u32 hwpoison_filter_dev_major = ~0U;
u32 hwpoison_filter_dev_minor = ~0U;
u64 hwpoison_filter_flags_mask;
u64 hwpoison_filter_flags_value;
EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);

static int hwpoison_filter_dev(struct page *p)
{
	struct address_space *mapping;
	dev_t dev;

	if (hwpoison_filter_dev_major == ~0U &&
	    hwpoison_filter_dev_minor == ~0U)
		return 0;

	/*
	 * page_mapping() does not accept slab pages.
	 */
	if (PageSlab(p))
		return -EINVAL;

	mapping = page_mapping(p);
	if (mapping == NULL || mapping->host == NULL)
		return -EINVAL;

	dev = mapping->host->i_sb->s_dev;
	if (hwpoison_filter_dev_major != ~0U &&
	    hwpoison_filter_dev_major != MAJOR(dev))
		return -EINVAL;
	if (hwpoison_filter_dev_minor != ~0U &&
	    hwpoison_filter_dev_minor != MINOR(dev))
		return -EINVAL;

	return 0;
}

static int hwpoison_filter_flags(struct page *p)
{
	if (!hwpoison_filter_flags_mask)
		return 0;

	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
				    hwpoison_filter_flags_value)
		return 0;
	else
		return -EINVAL;
}

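/*
 * hwpoison_filter_memcg limits error injection to pages charged to one
 * memory cgroup, identified by the inode number of its cgroup directory
 * (compared against page_cgroup_ino()).
 */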
#ifdef CONFIG_MEMCG
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
{
	if (!hwpoison_filter_memcg)
		return 0;

	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
		return -EINVAL;

	return 0;
}
#else
static int hwpoison_filter_task(struct page *p) { return 0; }
#endif

int hwpoison_filter(struct page *p)
{
	if (!hwpoison_filter_enable)
		return 0;

	if (hwpoison_filter_dev(p))
		return -EINVAL;

	if (hwpoison_filter_flags(p))
		return -EINVAL;

	if (hwpoison_filter_task(p))
		return -EINVAL;

	return 0;
}
#else
int hwpoison_filter(struct page *p)
{
	return 0;
}
#endif

EXPORT_SYMBOL_GPL(hwpoison_filter);

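/*
 * Send a SIGBUS to the given task for the corrupted page. BUS_MCEERR_AR is
 * delivered synchronously to the current task when action is required;
 * BUS_MCEERR_AO is sent asynchronously to other tasks that map the page.
 */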
static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
			unsigned long pfn, struct page *page, int flags)
{
	struct siginfo si;
	int ret;

	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
		pfn, t->comm, t->pid);
	si.si_signo = SIGBUS;
	si.si_errno = 0;
	si.si_addr = (void *)addr;
#ifdef __ARCH_SI_TRAPNO
	si.si_trapno = trapno;
#endif
	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;

	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
		si.si_code = BUS_MCEERR_AR;
		ret = force_sig_info(SIGBUS, &si, current);
	} else {
		si.si_code = BUS_MCEERR_AO;
		ret = send_sig_info(SIGBUS, &si, t);
	}
	if (ret < 0)
		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
			t->comm, t->pid, ret);
	return ret;
}

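/*
 * When an unknown page type is encountered, drain the per-cpu LRU and page
 * allocator caches (and, if the caller allows, shrink slab caches) in the
 * hope that the page becomes free or moves onto the LRU where it can be
 * handled.
 */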
void shake_page(struct page *p, int access)
{
	if (!PageSlab(p)) {
		lru_add_drain_all();
		if (PageLRU(p))
			return;
		drain_all_pages(page_zone(p));
		if (PageLRU(p) || is_free_buddy_page(p))
			return;
	}

	if (access)
		drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);

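/*
 * Processes that map the poisoned page are collected on a list of
 * struct to_kill entries so they can be signalled after the page has been
 * unmapped. addr_valid is cleared when no user space address for the page
 * could be resolved in the task's mappings.
 */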
struct to_kill {
	struct list_head nd;
	struct task_struct *tsk;
	unsigned long addr;
	char addr_valid;
};

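/*
 * Schedule a process for a later kill. The pre-allocated entry passed in
 * via tkc is consumed first; further entries are allocated with GFP_ATOMIC
 * because this runs under tasklist_lock and rmap locks.
 */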
static void add_to_kill(struct task_struct *tsk, struct page *p,
		       struct vm_area_struct *vma,
		       struct list_head *to_kill,
		       struct to_kill **tkc)
{
	struct to_kill *tk;

	if (*tkc) {
		tk = *tkc;
		*tkc = NULL;
	} else {
		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
		if (!tk) {
			pr_err("Memory failure: Out of memory while machine check handling\n");
			return;
		}
	}
	tk->addr = page_address_in_vma(p, vma);
	tk->addr_valid = 1;

	/*
	 * If no user space address can be resolved, mark the entry invalid;
	 * the process will be killed unconditionally if the unmap later
	 * fails.
	 */
	if (tk->addr == -EFAULT) {
		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
			page_to_pfn(p), tsk->comm);
		tk->addr_valid = 0;
	}
	get_task_struct(tsk);
	tk->tsk = tsk;
	list_add_tail(&tk->nd, to_kill);
}

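/*
 * Kill the processes collected earlier. Signals are only sent when
 * forcekill is set (the page was dirty or the caller insisted); otherwise
 * the entries are simply released.
 */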
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
			  int fail, struct page *page, unsigned long pfn,
			  int flags)
{
	struct to_kill *tk, *next;

	list_for_each_entry_safe (tk, next, to_kill, nd) {
		if (forcekill) {
			/*
			 * If the unmap failed, or no address is known, the
			 * process could still access the poisoned memory,
			 * so send SIGKILL rather than a catchable SIGBUS.
			 */
			if (fail || tk->addr_valid == 0) {
				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
				force_sig(SIGKILL, tk->tsk);
			} else if (kill_proc(tk->tsk, tk->addr, trapno,
					     pfn, page, flags) < 0)
				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
				       pfn, tk->tsk->comm, tk->tsk->pid);
		}
		put_task_struct(tk->tsk);
		kfree(tk);
	}
}

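/*
 * Find a thread in the process that has opted in to early kill handling
 * (PF_MCE_PROCESS and PF_MCE_EARLY, set via prctl(PR_MCE_KILL)); such a
 * thread is the preferred recipient of BUS_MCEERR_AO signals.
 */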
static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
	struct task_struct *t;

	for_each_thread(tsk, t)
		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
			return t;
	return NULL;
}

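/*
 * Decide whether the task should be killed early (while the error is still
 * only advisory). Per-thread opt-in and the force_early flag take
 * precedence over the global vm.memory_failure_early_kill sysctl.
 */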
static struct task_struct *task_early_kill(struct task_struct *tsk,
					   int force_early)
{
	struct task_struct *t;
	if (!tsk->mm)
		return NULL;
	if (force_early)
		return tsk;
	t = find_early_kill_thread(tsk);
	if (t)
		return t;
	if (sysctl_memory_failure_early_kill)
		return tsk;
	return NULL;
}

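/*
 * Collect the processes that map an anonymous page, walking the anon_vma
 * interval tree under the anon_vma read lock and tasklist_lock.
 */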
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct anon_vma *av;
	pgoff_t pgoff;

	av = page_lock_anon_vma_read(page);
	if (av == NULL)
		return;

	pgoff = page_to_pgoff(page);
	read_lock(&tasklist_lock);
	for_each_process (tsk) {
		struct anon_vma_chain *vmac;
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
					       pgoff, pgoff) {
			vma = vmac->vma;
			if (!page_mapped_in_vma(page, vma))
				continue;
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	page_unlock_anon_vma_read(av);
}

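/*
 * Collect the processes that map a file-backed page, walking the address
 * space's i_mmap interval tree. Only the file offset is matched here; the
 * VMA may not actually have the page mapped, but there is no cheap way to
 * verify that and the process has to be told anyway.
 */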
static void collect_procs_file(struct page *page, struct list_head *to_kill,
			      struct to_kill **tkc, int force_early)
{
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct address_space *mapping = page->mapping;

	i_mmap_lock_read(mapping);
	read_lock(&tasklist_lock);
	for_each_process(tsk) {
		pgoff_t pgoff = page_to_pgoff(page);
		struct task_struct *t = task_early_kill(tsk, force_early);

		if (!t)
			continue;
		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
				      pgoff) {
			if (vma->vm_mm == t->mm)
				add_to_kill(t, page, vma, to_kill, tkc);
		}
	}
	read_unlock(&tasklist_lock);
	i_mmap_unlock_read(mapping);
}

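/*
 * Collect all processes that have the corrupted page mapped. One to_kill
 * entry is pre-allocated outside the locked sections so that the walkers
 * above always have at least one entry available without sleeping.
 */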
static void collect_procs(struct page *page, struct list_head *tokill,
				int force_early)
{
	struct to_kill *tk;

	if (!page->mapping)
		return;

	tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
	if (!tk)
		return;
	if (PageAnon(page))
		collect_procs_anon(page, tokill, &tk, force_early);
	else
		collect_procs_file(page, tokill, &tk, force_early);
	kfree(tk);
}

static const char *action_name[] = {
	[MF_IGNORED] = "Ignored",
	[MF_FAILED] = "Failed",
	[MF_DELAYED] = "Delayed",
	[MF_RECOVERED] = "Recovered",
};

static const char * const action_page_types[] = {
	[MF_MSG_KERNEL]			= "reserved kernel page",
	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
	[MF_MSG_SLAB]			= "kernel slab page",
	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
	[MF_MSG_HUGE]			= "huge page",
	[MF_MSG_FREE_HUGE]		= "free huge page",
	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
	[MF_MSG_BUDDY]			= "free buddy page",
	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
	[MF_MSG_UNKNOWN]		= "unknown page",
};

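/*
 * Take the page off the LRU so that further handling does not race with
 * page reclaim. The extra reference taken by isolation is dropped again;
 * the poisoned page keeps the reference held by the caller.
 */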
static int delete_from_lru_cache(struct page *p)
{
	if (!isolate_lru_page(p)) {
		/*
		 * Clear sensible page flags, so that the buddy system won't
		 * complain when the page is freed.
		 */
		ClearPageActive(p);
		ClearPageUnevictable(p);
		/*
		 * Drop the refcount elevated by isolate_lru_page().
		 */
		put_page(p);
		return 0;
	}
	return -EIO;
}

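/*
 * Error handlers for the individual page states follow; each returns an
 * mf_result describing what was done.
 *
 * Error hit a page owned by the kernel: nothing safe can be done with it,
 * so leave it alone and hope it is never accessed again.
 */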
static int me_kernel(struct page *p, unsigned long pfn)
{
	return MF_IGNORED;
}

static int me_unknown(struct page *p, unsigned long pfn)
{
	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
	return MF_FAILED;
}

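/*
 * Clean (or already cleaned) page cache page: try to drop it from the page
 * cache entirely so that a fresh copy can be read back from storage later.
 */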
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
	int err;
	int ret = MF_FAILED;
	struct address_space *mapping;

	delete_from_lru_cache(p);

	/*
	 * For anonymous pages the only reference left should be the one
	 * held by memory_failure() itself, so we are done.
	 */
	if (PageAnon(p))
		return MF_RECOVERED;

	mapping = page_mapping(p);
	if (!mapping) {
		/*
		 * Page has been torn down in the meantime.
		 */
		return MF_FAILED;
	}

	/*
	 * Let the filesystem punch out the page if it knows how; otherwise
	 * fall back to a generic invalidate.
	 */
	if (mapping->a_ops->error_remove_page) {
		err = mapping->a_ops->error_remove_page(mapping, p);
		if (err != 0) {
			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
				pfn, err);
		} else if (page_has_private(p) &&
				!try_to_release_page(p, GFP_NOIO)) {
			pr_info("Memory failure: %#lx: failed to release buffers\n",
				pfn);
		} else {
			ret = MF_RECOVERED;
		}
	} else {
		if (invalidate_inode_page(p))
			ret = MF_RECOVERED;
		else
			pr_info("Memory failure: %#lx: Failed to invalidate\n",
				pfn);
	}
	return ret;
}

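/*
 * Dirty pagecache page: the data in the page is lost. Mark the page and
 * its mapping with an I/O error so that a later fsync()/close() reports
 * EIO to the application, then handle it like a clean pagecache page.
 */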
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
{
	struct address_space *mapping = page_mapping(p);

	SetPageError(p);

	if (mapping) {
		/* Report the error on the next fsync()/msync() of the file. */
		mapping_set_error(mapping, EIO);
	}

	return me_pagecache_clean(p, pfn);
}

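/*
 * Dirty swap cache page: the data is lost and cannot be written back.
 * Clear the dirty and uptodate bits so that a later swap-in fault returns
 * an error instead of silently handing out corrupted data. Clean swap
 * cache pages can simply be dropped; good data is re-read from swap.
 */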
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
	ClearPageDirty(p);
	ClearPageUptodate(p);

	if (!delete_from_lru_cache(p))
		return MF_DELAYED;
	else
		return MF_FAILED;
}

static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
	delete_from_swap_cache(p);

	if (!delete_from_lru_cache(p))
		return MF_RECOVERED;
	else
		return MF_FAILED;
}

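/*
 * Huge pages: an error in any subpage poisons the whole huge page. A free
 * huge page can simply be dequeued from the allocator; an in-use one is
 * left alone for now (handling is delayed).
 */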
static int me_huge_page(struct page *p, unsigned long pfn)
{
	int res = 0;
	struct page *hpage = compound_head(p);

	if (!PageHuge(hpage))
		return MF_DELAYED;

	/*
	 * A hugepage with neither a mapping nor an anon_vma is not in use,
	 * so it can be recovered by removing it from the free list.
	 */
	if (!(page_mapping(hpage) || PageAnon(hpage))) {
		res = dequeue_hwpoisoned_huge_page(hpage);
		if (!res)
			return MF_RECOVERED;
	}
	return MF_DELAYED;
}

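/*
 * Table of page states that can be handled. A state is defined by a mask
 * and an expected value of page->flags; the first matching entry wins, so
 * more specific states must come before more general ones, and the
 * catch-all MF_MSG_UNKNOWN entry must stay last.
 */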
#define dirty		(1UL << PG_dirty)
#define sc		(1UL << PG_swapcache)
#define unevict		(1UL << PG_unevictable)
#define mlock		(1UL << PG_mlocked)
#define writeback	(1UL << PG_writeback)
#define lru		(1UL << PG_lru)
#define swapbacked	(1UL << PG_swapbacked)
#define head		(1UL << PG_head)
#define slab		(1UL << PG_slab)
#define reserved	(1UL << PG_reserved)

static struct page_state {
	unsigned long mask;
	unsigned long res;
	enum mf_action_page_type type;
	int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },

	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },

	{ head,		head,		MF_MSG_HUGE,	me_huge_page },

	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },

	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },

	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },

	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },

	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
};

#undef dirty
#undef sc
#undef unevict
#undef mlock
#undef writeback
#undef lru
#undef swapbacked
#undef head
#undef slab
#undef reserved

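/*
 * Record the recovery action taken for a pfn in the trace buffer and the
 * kernel log.
 */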
static void action_result(unsigned long pfn, enum mf_action_page_type type,
			  enum mf_result result)
{
	trace_memory_failure_event(pfn, type, result);

	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
		pfn, action_page_types[type], action_name[result]);
}

static int page_action(struct page_state *ps, struct page *p,
			unsigned long pfn)
{
	int result;
	int count;

	result = ps->action(p, pfn);

	/*
	 * The handler is expected to leave only our own reference on the
	 * page; any remaining references mean the page is still in use.
	 */
	count = page_count(p) - 1;
	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
		count--;
	if (count != 0) {
		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
		       pfn, action_page_types[ps->type], count);
		result = MF_FAILED;
	}
	action_result(pfn, ps->type, result);

	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}

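/**
 * get_hwpoison_page() - Get refcount for memory error handling:
 * @page:	raw error page (hit by memory error)
 *
 * Return: return 0 if failed to grab the refcount, otherwise true (some
 * non-zero value).
 */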
int get_hwpoison_page(struct page *page)
{
	struct page *head = compound_head(page);

	if (!PageHuge(head) && PageTransHuge(head)) {
		/*
		 * Non anonymous thp exists only in allocation/free time. We
		 * can't handle such a case correctly, so give up rather than
		 * risk touching a half-constructed huge page.
		 */
		if (!PageAnon(head)) {
			pr_err("Memory failure: %#lx: non anonymous thp\n",
				page_to_pfn(page));
			return 0;
		}
	}

	if (get_page_unless_zero(head)) {
		if (head == compound_head(page))
			return 1;

		pr_info("Memory failure: %#lx cannot catch tail\n",
			page_to_pfn(page));
		put_page(head);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);

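/*
 * Unmap the corrupted page from all user space mappings and, when
 * necessary, queue the affected processes for killing. Returns one of the
 * SWAP_* codes from try_to_unmap().
 */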
static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
				  int trapno, int flags, struct page **hpagep)
{
	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
	struct address_space *mapping;
	LIST_HEAD(tokill);
	int ret;
	int kill = 1, forcekill;
	struct page *hpage = *hpagep;

	/*
	 * Only user-mapped pages are of interest here; skip anything else.
	 */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;

	/*
	 * This check implies we don't kill processes if their pages
	 * are in the swap cache early. Those are always late kills.
	 */
	if (!page_mapped(hpage))
		return SWAP_SUCCESS;

	if (PageKsm(p)) {
		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
		return SWAP_FAIL;
	}

	if (PageSwapCache(p)) {
		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
			pfn);
		ttu |= TTU_IGNORE_HWPOISON;
	}

	/*
	 * Propagate the dirty bit from PTEs to struct page first, because we
	 * need this to decide whether to kill or just drop the page.
	 */
	mapping = page_mapping(hpage);
	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
	    mapping_cap_writeback_dirty(mapping)) {
		if (page_mkclean(hpage)) {
			SetPageDirty(hpage);
		} else {
			kill = 0;
			ttu |= TTU_IGNORE_HWPOISON;
			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
				pfn);
		}
	}

	/*
	 * Collect the processes that map the page before unmapping it,
	 * because try_to_unmap() tears down the rmap data structures.
	 */
	if (kill)
		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);

	ret = try_to_unmap(hpage, ttu);
	if (ret != SWAP_SUCCESS)
		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
		       pfn, page_mapcount(hpage));

	/*
	 * Now that the mappings are gone, kill the processes. A forced kill
	 * is only done when the page was dirty or the caller insisted;
	 * a clean page can simply be dropped.
	 */
	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
	kill_procs(&tokill, forcekill, trapno,
		      ret != SWAP_SUCCESS, p, pfn, flags);

	return ret;
}

static void set_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		SetPageHWPoison(hpage + i);
}

static void clear_page_hwpoison_huge_page(struct page *hpage)
{
	int i;
	int nr_pages = 1 << compound_order(hpage);
	for (i = 0; i < nr_pages; i++)
		ClearPageHWPoison(hpage + i);
}

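/**
 * memory_failure - Handle memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: fine tune action taken
 *
 * This function is called by the low level machine check code
 * of an architecture when it detects hardware memory corruption
 * of a page. It tries its best to recover, which includes
 * dropping the page or killing all affected processes.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Must run in process context (e.g. a work queue) with interrupts
 * enabled and no spinlocks held.
 */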
int memory_failure(unsigned long pfn, int trapno, int flags)
{
	struct page_state *ps;
	struct page *p;
	struct page *hpage;
	struct page *orig_head;
	int res;
	unsigned int nr_pages;
	unsigned long page_flags;

	if (!sysctl_memory_failure_recovery)
		panic("Memory failure from trap %d on page %lx", trapno, pfn);

	if (!pfn_valid(pfn)) {
		pr_err("Memory failure: %#lx: memory outside kernel control\n",
			pfn);
		return -ENXIO;
	}

	p = pfn_to_page(pfn);
	orig_head = hpage = compound_head(p);
	if (TestSetPageHWPoison(p)) {
		pr_err("Memory failure: %#lx: already hardware poisoned\n",
			pfn);
		return 0;
	}

	/*
	 * Errors on hugetlbfs pages are accounted in hugepage units, while
	 * transparent hugepages are split and accounted per base page.
	 */
	if (PageHuge(p))
		nr_pages = 1 << compound_order(hpage);
	else
		nr_pages = 1;
	num_poisoned_pages_add(nr_pages);

	/*
	 * Nothing can or needs to be done about pages with a zero refcount:
	 * a free buddy page or free hugepage is safe (the allocator will
	 * catch the poison bit), and a non-compound high-order kernel page
	 * cannot be handled at all. Do not bump the count from zero here,
	 * that could confuse page_ref_freeze()/page_ref_unfreeze() users.
	 */
	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
		if (is_free_buddy_page(p)) {
			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
			return 0;
		} else if (PageHuge(hpage)) {
			/*
			 * Check for a filter hit or a race with handling of
			 * another subpage of the same hugepage.
			 */
			lock_page(hpage);
			if (PageHWPoison(hpage)) {
				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
				    || (p != hpage && TestSetPageHWPoison(hpage))) {
					num_poisoned_pages_sub(nr_pages);
					unlock_page(hpage);
					return 0;
				}
			}
			set_page_hwpoison_huge_page(hpage);
			res = dequeue_hwpoisoned_huge_page(hpage);
			action_result(pfn, MF_MSG_FREE_HUGE,
				      res ? MF_IGNORED : MF_DELAYED);
			unlock_page(hpage);
			return res;
		} else {
			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
			return -EBUSY;
		}
	}

	if (!PageHuge(p) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_err("Memory failure: %#lx: non anonymous thp\n",
					pfn);
			else
				pr_err("Memory failure: %#lx: thp split failed\n",
					pfn);
			if (TestClearPageHWPoison(p))
				num_poisoned_pages_sub(nr_pages);
			put_hwpoison_page(p);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(p);
		put_hwpoison_page(hpage);
		VM_BUG_ON_PAGE(!page_count(p), p);
		hpage = compound_head(p);
	}

	/*
	 * Non-LRU pages are generally not handled (page lock semantics are
	 * only well defined for LRU pages and a few others); try shaking the
	 * page in case it is about to become free or move onto the LRU.
	 */
	if (!PageHuge(p)) {
		if (!PageLRU(p))
			shake_page(p, 0);
		if (!PageLRU(p)) {
			/*
			 * shake_page() could have turned it free.
			 */
			if (is_free_buddy_page(p)) {
				if (flags & MF_COUNT_INCREASED)
					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
				else
					action_result(pfn, MF_MSG_BUDDY_2ND,
						      MF_DELAYED);
				return 0;
			}
		}
	}

	lock_page(hpage);

	/*
	 * The page could have been split or collapsed into a different
	 * compound page while we waited for the lock. If so, just bail out.
	 */
	if (PageCompound(p) && compound_head(p) != orig_head) {
		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Save a snapshot of the page flags now: the containment actions
	 * below (e.g. unmapping an mlocked page) modify them, and the state
	 * table is consulted with this snapshot as a fallback.
	 */
	page_flags = p->flags;

	/*
	 * unpoison_memory() always clears PG_hwpoison inside the page lock.
	 */
	if (!PageHWPoison(p)) {
		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
		num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}
	if (hwpoison_filter(p)) {
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_sub(nr_pages);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
		goto identify_page_state;

	/*
	 * For an error on a tail page, set PG_hwpoison on the head page as
	 * well so the whole hugepage is known to be poisoned.
	 */
	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
		action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		return 0;
	}

	/*
	 * Set PG_hwpoison on all subpages of an error hugepage, because
	 * containment is done in hugepage units here.
	 */
	if (PageHuge(p))
		set_page_hwpoison_huge_page(hpage);

	/*
	 * Pages under I/O are too difficult to handle safely; wait for
	 * writeback to finish first.
	 */
	wait_on_page_writeback(p);

	/*
	 * Now take care of the user space mappings.
	 * Abort on failure: __delete_from_page_cache() assumes an unmapped page.
	 */
	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

	/*
	 * Torn down by someone else?
	 */
	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
		res = -EBUSY;
		goto out;
	}

identify_page_state:
	res = -EBUSY;
	/*
	 * The current page flags are checked first; if they no longer match
	 * any state, the snapshot taken before containment is used instead.
	 */
	for (ps = error_states;; ps++)
		if ((p->flags & ps->mask) == ps->res)
			break;

	page_flags |= (p->flags & (1UL << PG_dirty));

	if (!ps->mask)
		for (ps = error_states;; ps++)
			if ((page_flags & ps->mask) == ps->res)
				break;
	res = page_action(ps, p, pfn);
out:
	unlock_page(hpage);
	return res;
}
EXPORT_SYMBOL_GPL(memory_failure);

#define MEMORY_FAILURE_FIFO_ORDER	4
#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)

struct memory_failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

struct memory_failure_cpu {
	DECLARE_KFIFO(fifo, struct memory_failure_entry,
		      MEMORY_FAILURE_FIFO_SIZE);
	spinlock_t lock;
	struct work_struct work;
};

static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);

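/**
 * memory_failure_queue - Schedule handling memory failure of a page.
 * @pfn: Page Number of the corrupted page
 * @trapno: Trap number reported in the signal to user space.
 * @flags: Flags for memory failure handling
 *
 * This function is called by the low level hardware error handler
 * when it detects hardware memory corruption of a page. It schedules
 * the recovering of the error page, including dropping pages, killing
 * processes etc.
 *
 * The function is primarily of use for corruptions that
 * happen outside the current execution context (e.g. when
 * detected by a background scrubber).
 *
 * Can run in IRQ context.
 */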
void memory_failure_queue(unsigned long pfn, int trapno, int flags)
{
	struct memory_failure_cpu *mf_cpu;
	unsigned long proc_flags;
	struct memory_failure_entry entry = {
		.pfn =		pfn,
		.trapno =	trapno,
		.flags =	flags,
	};

	mf_cpu = &get_cpu_var(memory_failure_cpu);
	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
	if (kfifo_put(&mf_cpu->fifo, entry))
		schedule_work_on(smp_processor_id(), &mf_cpu->work);
	else
		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
		       pfn);
	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
	put_cpu_var(memory_failure_cpu);
}
EXPORT_SYMBOL_GPL(memory_failure_queue);

static void memory_failure_work_func(struct work_struct *work)
{
	struct memory_failure_cpu *mf_cpu;
	struct memory_failure_entry entry = { 0, };
	unsigned long proc_flags;
	int gotten;

	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
	for (;;) {
		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
		gotten = kfifo_get(&mf_cpu->fifo, &entry);
		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
		if (!gotten)
			break;
		if (entry.flags & MF_SOFT_OFFLINE)
			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
		else
			memory_failure(entry.pfn, entry.trapno, entry.flags);
	}
}

static int __init memory_failure_init(void)
{
	struct memory_failure_cpu *mf_cpu;
	int cpu;

	for_each_possible_cpu(cpu) {
		mf_cpu = &per_cpu(memory_failure_cpu, cpu);
		spin_lock_init(&mf_cpu->lock);
		INIT_KFIFO(mf_cpu->fifo);
		INIT_WORK(&mf_cpu->work, memory_failure_work_func);
	}

	return 0;
}
core_initcall(memory_failure_init);

#define unpoison_pr_info(fmt, pfn, rs)			\
({							\
	if (__ratelimit(rs))				\
		pr_info(fmt, pfn);			\
})

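/**
 * unpoison_memory - Unpoison a previously poisoned page
 * @pfn: Page number of the to be unpoisoned page
 *
 * Software-unpoison a page that has been poisoned by memory_failure()
 * earlier. This only works for software-injected failures, not for real
 * hardware corruption.
 *
 * Returns 0 for success, otherwise -errno.
 */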
int unpoison_memory(unsigned long pfn)
{
	struct page *page;
	struct page *p;
	int freeit = 0;
	unsigned int nr_pages;
	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
					DEFAULT_RATELIMIT_BURST);

	if (!pfn_valid(pfn))
		return -ENXIO;

	p = pfn_to_page(pfn);
	page = compound_head(p);

	if (!PageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_count(page) > 1) {
		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapped(page)) {
		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	if (page_mapping(page)) {
		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	/*
	 * unpoison_memory() can encounter a thp only while the thp is being
	 * handled by memory_failure() and the page lock is not held yet.
	 * In that case, yield to memory_failure() and make unpoison fail.
	 */
	if (!PageHuge(page) && PageTransHuge(page)) {
		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	nr_pages = 1 << compound_order(page);

	if (!get_hwpoison_page(p)) {
		/*
		 * A hwpoisoned page should have a non-zero refcount, so a
		 * zero count means a race with memory_failure(); let that
		 * path win and fail the unpoison here.
		 */
		if (PageHuge(page)) {
			unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
					 pfn, &unpoison_rs);
			return 0;
		}
		if (TestClearPageHWPoison(p))
			num_poisoned_pages_dec();
		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
				 pfn, &unpoison_rs);
		return 0;
	}

	lock_page(page);
	/*
	 * This test is racy because PG_hwpoison is set outside of page lock.
	 * That's acceptable because it won't trigger a kernel panic; a
	 * lingering PG_hwpoison page is caught and isolated when it enters
	 * the free buddy page pool.
	 */
	if (TestClearPageHWPoison(page)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 pfn, &unpoison_rs);
		num_poisoned_pages_sub(nr_pages);
		freeit = 1;
		if (PageHuge(page))
			clear_page_hwpoison_huge_page(page);
	}
	unlock_page(page);

	put_hwpoison_page(page);
	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
		put_hwpoison_page(page);

	return 0;
}
EXPORT_SYMBOL(unpoison_memory);

static struct page *new_page(struct page *p, unsigned long private, int **x)
{
	int nid = page_to_nid(p);
	if (PageHuge(p))
		return alloc_huge_page_node(page_hstate(compound_head(p)),
						   nid);
	else
		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}

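/*
 * Safely get a reference on an arbitrary page.
 * Returns 0 for a free page, 1 once a reference has been taken on an
 * in-use page, and -EIO for a zero-refcount page of a type that cannot
 * be handled.
 */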
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
{
	int ret;

	if (flags & MF_COUNT_INCREASED)
		return 1;

	if (!get_hwpoison_page(p)) {
		if (PageHuge(p)) {
			pr_info("%s: %#lx free huge page\n", __func__, pfn);
			ret = 0;
		} else if (is_free_buddy_page(p)) {
			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
			ret = 0;
		} else {
			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
				__func__, pfn, p->flags);
			ret = -EIO;
		}
	} else {
		/* Not a free page: the reference was taken. */
		ret = 1;
	}
	return ret;
}

static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
	int ret = __get_any_page(page, pfn, flags);

	if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
		/*
		 * Try to shake the page onto the LRU (or free it) first.
		 */
		put_hwpoison_page(page);
		shake_page(page, 1);

		/*
		 * Did it turn free?
		 */
		ret = __get_any_page(page, pfn, 0);
		if (ret == 1 && !PageLRU(page)) {
			/* Drop the page reference taken by __get_any_page(). */
			put_hwpoison_page(page);
			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
				pfn, page->flags);
			return -EIO;
		}
	}
	return ret;
}

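/*
 * Soft-offline a huge page: migrate its contents to a fresh huge page and
 * mark the original as hwpoisoned so it is never allocated again. No
 * application is killed and no data is lost; this is a preventive action.
 */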
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	/*
	 * Check PageHWPoison again under the page lock to close the race
	 * with memory_failure() poisoning the page concurrently.
	 */
	lock_page(hpage);
	if (PageHWPoison(hpage)) {
		unlock_page(hpage);
		put_hwpoison_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}
	unlock_page(hpage);

	ret = isolate_huge_page(hpage, &pagelist);
	/*
	 * get_any_page() and isolate_huge_page() each took a refcount,
	 * so drop one here.
	 */
	put_hwpoison_page(hpage);
	if (!ret) {
		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
		return -EBUSY;
	}

	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
				MIGRATE_SYNC, MR_MEMORY_FAILURE);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		/*
		 * Migration failed: put the hugepage back on the active list.
		 */
		putback_active_hugepage(hpage);
		if (ret > 0)
			ret = -EIO;
	} else {
		/*
		 * A surplus hugepage may already have been released to the
		 * buddy allocator after migration; in that case only the
		 * single raw page is poisoned.
		 */
		if (PageHuge(page)) {
			set_page_hwpoison_huge_page(hpage);
			dequeue_hwpoisoned_huge_page(hpage);
			num_poisoned_pages_add(1 << compound_order(hpage));
		} else {
			SetPageHWPoison(page);
			num_poisoned_pages_inc();
		}
	}
	return ret;
}

/*
 * Soft-offline a base page: invalidate it from the page cache if possible,
 * otherwise migrate the contents away and mark the old page hwpoisoned so
 * it is never used again. No application is killed and no data is lost.
 */
static int __soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	/*
	 * Check PageHWPoison again inside the page lock because another
	 * CPU may run memory_failure() on the page concurrently.
	 */
	lock_page(page);
	wait_on_page_writeback(page);
	if (PageHWPoison(page)) {
		unlock_page(page);
		put_hwpoison_page(page);
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		return -EBUSY;
	}

	/*
	 * Try to invalidate first. This should work for
	 * non dirty unmapped page cache pages.
	 */
	ret = invalidate_inode_page(page);
	unlock_page(page);

	if (ret == 1) {
		put_hwpoison_page(page);
		pr_info("soft_offline: %#lx: invalidated\n", pfn);
		SetPageHWPoison(page);
		num_poisoned_pages_inc();
		return 0;
	}

	/*
	 * Simple invalidation didn't work.
	 * Try to migrate to a new page instead; migrate.c
	 * handles a large number of cases for us.
	 */
	ret = isolate_lru_page(page);
	/*
	 * Drop the page reference taken by get_any_page(); a successful
	 * isolate_lru_page() already took another one.
	 */
	put_hwpoison_page(page);
	if (!ret) {
		LIST_HEAD(pagelist);
		inc_node_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
		list_add(&page->lru, &pagelist);
		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
		if (ret) {
			if (!list_empty(&pagelist)) {
				list_del(&page->lru);
				dec_node_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
				putback_lru_page(page);
			}

			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
				pfn, ret, page->flags);
			if (ret > 0)
				ret = -EIO;
		}
	} else {
		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
			pfn, ret, page_count(page), page->flags);
	}
	return ret;
}

static int soft_offline_in_use_page(struct page *page, int flags)
{
	int ret;
	struct page *hpage = compound_head(page);

	if (!PageHuge(page) && PageTransHuge(hpage)) {
		lock_page(hpage);
		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
			unlock_page(hpage);
			if (!PageAnon(hpage))
				pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
			else
				pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
			put_hwpoison_page(hpage);
			return -EBUSY;
		}
		unlock_page(hpage);
		get_hwpoison_page(page);
		put_hwpoison_page(hpage);
	}

	if (PageHuge(page))
		ret = soft_offline_huge_page(page, flags);
	else
		ret = __soft_offline_page(page, flags);

	return ret;
}

static void soft_offline_free_page(struct page *page)
{
	if (PageHuge(page)) {
		struct page *hpage = compound_head(page);

		set_page_hwpoison_huge_page(hpage);
		if (!dequeue_hwpoisoned_huge_page(hpage))
			num_poisoned_pages_add(1 << compound_order(hpage));
	} else {
		if (!TestSetPageHWPoison(page))
			num_poisoned_pages_inc();
	}
}

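/**
 * soft_offline_page - Soft offline a page.
 * @page: page to offline
 * @flags: flags. Same as memory_failure().
 *
 * Returns 0 on success, otherwise negated errno.
 *
 * Soft offline a page, by migration or invalidation,
 * without killing anything. This is for the case when
 * a page is not corrupted yet (so it's still valid to access),
 * but has had a number of corrected errors and is better taken
 * out.
 *
 * The actual policy on when to do that is maintained by
 * user space.
 *
 * This should never impact any application or cause data loss,
 * however it might take some time.
 */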
int soft_offline_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);

	if (PageHWPoison(page)) {
		pr_info("soft offline: %#lx page already poisoned\n", pfn);
		if (flags & MF_COUNT_INCREASED)
			put_hwpoison_page(page);
		return -EBUSY;
	}

	get_online_mems();
	ret = get_any_page(page, pfn, flags);
	put_online_mems();

	if (ret > 0)
		ret = soft_offline_in_use_page(page, flags);
	else if (ret == 0)
		soft_offline_free_page(page);

	return ret;
}