linux/mm/memory-failure.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2008, 2009 Intel Corporation
   4 * Authors: Andi Kleen, Fengguang Wu
   5 *
   6 * High level machine check handler. Handles pages reported by the
   7 * hardware as being corrupted, usually due to a multi-bit ECC memory or cache
   8 * failure.
   9 * 
   10 * In addition there is a "soft offline" entry point that allows us to stop
   11 * using not-yet-corrupted but suspicious pages without killing anything.
  12 *
  13 * Handles page cache pages in various states.  The tricky part
   14 * here is that we can access any page asynchronously with respect to
   15 * other VM users, because memory failures could happen anytime and
   16 * anywhere, possibly violating some of their assumptions. This is why
   17 * this code has to be extremely careful. Generally it tries to use
   18 * normal locking rules, i.e. it takes the standard locks, even if that
   19 * means the error handling potentially takes a long time.
  20 *
  21 * It can be very tempting to add handling for obscure cases here.
  22 * In general any code for handling new cases should only be added iff:
  23 * - You know how to test it.
  24 * - You have a test that can be added to mce-test
  25 *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
  26 * - The case actually shows up as a frequent (top 10) page state in
  27 *   tools/vm/page-types when running a real workload.
  28 * 
   29 * There are several operations here with non-trivial complexity because
   30 * of unsuitable VM data structures. For example the operation to map back
   31 * from RMAP chains to processes has to walk the complete process list and
   32 * has non-linear complexity in the number of processes. But since memory
   33 * corruptions are rare we hope to get away with this. This avoids impacting
   34 * the core VM.
  35 */
  36#include <linux/kernel.h>
  37#include <linux/mm.h>
  38#include <linux/page-flags.h>
  39#include <linux/kernel-page-flags.h>
  40#include <linux/sched/signal.h>
  41#include <linux/sched/task.h>
  42#include <linux/ksm.h>
  43#include <linux/rmap.h>
  44#include <linux/export.h>
  45#include <linux/pagemap.h>
  46#include <linux/swap.h>
  47#include <linux/backing-dev.h>
  48#include <linux/migrate.h>
  49#include <linux/suspend.h>
  50#include <linux/slab.h>
  51#include <linux/swapops.h>
  52#include <linux/hugetlb.h>
  53#include <linux/memory_hotplug.h>
  54#include <linux/mm_inline.h>
  55#include <linux/memremap.h>
  56#include <linux/kfifo.h>
  57#include <linux/ratelimit.h>
  58#include <linux/page-isolation.h>
  59#include "internal.h"
  60#include "ras/ras_event.h"
  61
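/*
 * The two knobs below are normally exposed via sysctl as
 * vm.memory_failure_early_kill and vm.memory_failure_recovery
 * (see Documentation/admin-guide/sysctl/vm.rst), e.g.:
 *
 *	# echo 1 > /proc/sys/vm/memory_failure_early_kill
 *
 * early_kill selects whether mapping processes are signalled as soon as the
 * corruption is found (1) or only when they actually touch the poisoned page
 * (0); recovery == 0 makes memory_failure() panic instead of recovering.
 */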
  62int sysctl_memory_failure_early_kill __read_mostly = 0;
  63
  64int sysctl_memory_failure_recovery __read_mostly = 1;
  65
  66atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  67
  68#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
  69
  70u32 hwpoison_filter_enable = 0;
  71u32 hwpoison_filter_dev_major = ~0U;
  72u32 hwpoison_filter_dev_minor = ~0U;
  73u64 hwpoison_filter_flags_mask;
  74u64 hwpoison_filter_flags_value;
  75EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
  76EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
  77EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
  78EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
  79EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
  80
  81static int hwpoison_filter_dev(struct page *p)
  82{
  83        struct address_space *mapping;
  84        dev_t dev;
  85
  86        if (hwpoison_filter_dev_major == ~0U &&
  87            hwpoison_filter_dev_minor == ~0U)
  88                return 0;
  89
  90        /*
  91         * page_mapping() does not accept slab pages.
  92         */
  93        if (PageSlab(p))
  94                return -EINVAL;
  95
  96        mapping = page_mapping(p);
  97        if (mapping == NULL || mapping->host == NULL)
  98                return -EINVAL;
  99
 100        dev = mapping->host->i_sb->s_dev;
 101        if (hwpoison_filter_dev_major != ~0U &&
 102            hwpoison_filter_dev_major != MAJOR(dev))
 103                return -EINVAL;
 104        if (hwpoison_filter_dev_minor != ~0U &&
 105            hwpoison_filter_dev_minor != MINOR(dev))
 106                return -EINVAL;
 107
 108        return 0;
 109}
 110
 111static int hwpoison_filter_flags(struct page *p)
 112{
 113        if (!hwpoison_filter_flags_mask)
 114                return 0;
 115
 116        if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
 117                                    hwpoison_filter_flags_value)
 118                return 0;
 119        else
 120                return -EINVAL;
 121}
 122
 123/*
 124 * This allows stress tests to limit test scope to a collection of tasks
 125 * by putting them under some memcg. This prevents killing unrelated/important
 126 * processes such as /sbin/init. Note that the target task may share clean
 127 * pages with init (eg. libc text), which is harmless. If the target task
  128 * shares _dirty_ pages with another task B, the test scheme must make sure B
  129 * is also included in the memcg. Finally, due to race conditions this filter
 130 * can only guarantee that the page either belongs to the memcg tasks, or is
 131 * a freed page.
 132 */
 133#ifdef CONFIG_MEMCG
 134u64 hwpoison_filter_memcg;
 135EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 136static int hwpoison_filter_task(struct page *p)
 137{
 138        if (!hwpoison_filter_memcg)
 139                return 0;
 140
 141        if (page_cgroup_ino(p) != hwpoison_filter_memcg)
 142                return -EINVAL;
 143
 144        return 0;
 145}
 146#else
 147static int hwpoison_filter_task(struct page *p) { return 0; }
 148#endif
 149
 150int hwpoison_filter(struct page *p)
 151{
 152        if (!hwpoison_filter_enable)
 153                return 0;
 154
 155        if (hwpoison_filter_dev(p))
 156                return -EINVAL;
 157
 158        if (hwpoison_filter_flags(p))
 159                return -EINVAL;
 160
 161        if (hwpoison_filter_task(p))
 162                return -EINVAL;
 163
 164        return 0;
 165}
 166#else
 167int hwpoison_filter(struct page *p)
 168{
 169        return 0;
 170}
 171#endif
 172
 173EXPORT_SYMBOL_GPL(hwpoison_filter);
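
/*
 * The filter knobs above are typically driven through the hwpoison-inject
 * debugfs interface (CONFIG_HWPOISON_INJECT, mm/hwpoison-inject.c), which
 * exposes them as corrupt-filter-* files. A rough test sketch, where the
 * device numbers and pfn are made-up example values:
 *
 *	# cd /sys/kernel/debug/hwpoison
 *	# echo 1      > corrupt-filter-enable
 *	# echo 8      > corrupt-filter-dev-major
 *	# echo 0      > corrupt-filter-dev-minor
 *	# echo 0x1234 > corrupt-pfn
 *
 * Injections that do not match the configured filter are skipped.
 */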
 174
 175/*
 176 * Kill all processes that have a poisoned page mapped and then isolate
 177 * the page.
 178 *
 179 * General strategy:
 180 * Find all processes having the page mapped and kill them.
 181 * But we keep a page reference around so that the page is not
 182 * actually freed yet.
 183 * Then stash the page away
 184 *
 185 * There's no convenient way to get back to mapped processes
 186 * from the VMAs. So do a brute-force search over all
 187 * running processes.
 188 *
 189 * Remember that machine checks are not common (or rather
 190 * if they are common you have other problems), so this shouldn't
 191 * be a performance issue.
 192 *
 193 * Also there are some races possible while we get from the
 194 * error detection to actually handle it.
 195 */
 196
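/*
 * One pending kill: the task to signal, the user virtual address the poisoned
 * page is mapped at in that task, and log2 of the mapping size (reported to
 * userspace as si_addr_lsb in the SIGBUS siginfo).
 */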
 197struct to_kill {
 198        struct list_head nd;
 199        struct task_struct *tsk;
 200        unsigned long addr;
 201        short size_shift;
 202};
 203
 204/*
 205 * Send all the processes who have the page mapped a signal.
 206 * ``action optional'' if they are not immediately affected by the error
 207 * ``action required'' if error happened in current execution context
 208 */
 209static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
 210{
 211        struct task_struct *t = tk->tsk;
 212        short addr_lsb = tk->size_shift;
 213        int ret;
 214
 215        pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
 216                pfn, t->comm, t->pid);
 217
 218        if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
 219                ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
 220                                       addr_lsb);
 221        } else {
 222                /*
 223                 * Don't use force here, it's convenient if the signal
 224                 * can be temporarily blocked.
 225                 * This could cause a loop when the user sets SIGBUS
 226                 * to SIG_IGN, but hopefully no one will do that?
 227                 */
 228                ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
 229                                      addr_lsb, t);  /* synchronous? */
 230        }
 231        if (ret < 0)
 232                pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
 233                        t->comm, t->pid, ret);
 234        return ret;
 235}
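
/*
 * For reference, the receiving side: a recovery-aware application installs a
 * SIGBUS handler with SA_SIGINFO and looks at si_code/si_addr/si_addr_lsb.
 * A minimal userspace sketch (illustrative only, not part of the kernel):
 *
 *	static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
 *	{
 *		if (si->si_code == BUS_MCEERR_AO || si->si_code == BUS_MCEERR_AR) {
 *			size_t lost = (size_t)1 << si->si_addr_lsb;
 *
 *			... the range [si_addr, si_addr + lost) is unusable:
 *			... re-fetch the data from its source, or exit.
 *		}
 *	}
 */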
 236
 237/*
  238 * When an unknown page type is encountered, drain as many buffers as possible
  239 * in the hope of turning the page into an LRU or free page, which we can handle.
 240 */
 241void shake_page(struct page *p, int access)
 242{
 243        if (PageHuge(p))
 244                return;
 245
 246        if (!PageSlab(p)) {
 247                lru_add_drain_all();
 248                if (PageLRU(p))
 249                        return;
 250                drain_all_pages(page_zone(p));
 251                if (PageLRU(p) || is_free_buddy_page(p))
 252                        return;
 253        }
 254
 255        /*
  256         * Only call drop_slab_node() here (which would also shrink
  257         * other caches) if access is not potentially fatal.
 258         */
 259        if (access)
 260                drop_slab_node(page_to_nid(p));
 261}
 262EXPORT_SYMBOL_GPL(shake_page);
 263
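/*
 * Walk the page table of @vma to find at which level (PTE, PMD or PUD) the
 * poisoned ZONE_DEVICE page is mapped; the returned shift ends up in
 * tk->size_shift and hence in si_addr_lsb. Returns 0 if no mapping is present.
 */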
 264static unsigned long dev_pagemap_mapping_shift(struct page *page,
 265                struct vm_area_struct *vma)
 266{
 267        unsigned long address = vma_address(page, vma);
 268        pgd_t *pgd;
 269        p4d_t *p4d;
 270        pud_t *pud;
 271        pmd_t *pmd;
 272        pte_t *pte;
 273
 274        pgd = pgd_offset(vma->vm_mm, address);
 275        if (!pgd_present(*pgd))
 276                return 0;
 277        p4d = p4d_offset(pgd, address);
 278        if (!p4d_present(*p4d))
 279                return 0;
 280        pud = pud_offset(p4d, address);
 281        if (!pud_present(*pud))
 282                return 0;
 283        if (pud_devmap(*pud))
 284                return PUD_SHIFT;
 285        pmd = pmd_offset(pud, address);
 286        if (!pmd_present(*pmd))
 287                return 0;
 288        if (pmd_devmap(*pmd))
 289                return PMD_SHIFT;
 290        pte = pte_offset_map(pmd, address);
 291        if (!pte_present(*pte))
 292                return 0;
 293        if (pte_devmap(*pte))
 294                return PAGE_SHIFT;
 295        return 0;
 296}
 297
 298/*
 299 * Failure handling: if we can't find or can't kill a process there's
  300 * not much we can do.  We just print a message and otherwise ignore the error.
 301 */
 302
 303/*
 304 * Schedule a process for later kill.
 305 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 306 */
 307static void add_to_kill(struct task_struct *tsk, struct page *p,
 308                       struct vm_area_struct *vma,
 309                       struct list_head *to_kill)
 310{
 311        struct to_kill *tk;
 312
 313        tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 314        if (!tk) {
 315                pr_err("Memory failure: Out of memory while machine check handling\n");
 316                return;
 317        }
 318
 319        tk->addr = page_address_in_vma(p, vma);
 320        if (is_zone_device_page(p))
 321                tk->size_shift = dev_pagemap_mapping_shift(p, vma);
 322        else
 323                tk->size_shift = page_shift(compound_head(p));
 324
 325        /*
  326         * Send SIGKILL if "tk->addr == -EFAULT". Also, since
  327         * "tk->size_shift" is always non-zero for !is_zone_device_page(),
  328         * "tk->size_shift == 0" effectively checks for no mapping on
 329         * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
 330         * to a process' address space, it's possible not all N VMAs
 331         * contain mappings for the page, but at least one VMA does.
 332         * Only deliver SIGBUS with payload derived from the VMA that
 333         * has a mapping for the page.
 334         */
 335        if (tk->addr == -EFAULT) {
 336                pr_info("Memory failure: Unable to find user space address %lx in %s\n",
 337                        page_to_pfn(p), tsk->comm);
 338        } else if (tk->size_shift == 0) {
 339                kfree(tk);
 340                return;
 341        }
 342
 343        get_task_struct(tsk);
 344        tk->tsk = tsk;
 345        list_add_tail(&tk->nd, to_kill);
 346}
 347
 348/*
 349 * Kill the processes that have been collected earlier.
 350 *
  351 * Only do anything when FORCEKILL is set, otherwise just free the list
  352 * (this is used for clean pages which do not need killing).
 353 * Also when FAIL is set do a force kill because something went
 354 * wrong earlier.
 355 */
 356static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
 357                unsigned long pfn, int flags)
 358{
 359        struct to_kill *tk, *next;
 360
 361        list_for_each_entry_safe (tk, next, to_kill, nd) {
 362                if (forcekill) {
 363                        /*
  364                         * In case something went wrong with unmapping,
 365                         * make sure the process doesn't catch the
 366                         * signal and then access the memory. Just kill it.
 367                         */
 368                        if (fail || tk->addr == -EFAULT) {
 369                                pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 370                                       pfn, tk->tsk->comm, tk->tsk->pid);
 371                                do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
 372                                                 tk->tsk, PIDTYPE_PID);
 373                        }
 374
 375                        /*
 376                         * In theory the process could have mapped
  377                         * something else at the address in the meantime. We
  378                         * could check for that, but we need to tell the
  379                         * process anyway.
 380                         */
 381                        else if (kill_proc(tk, pfn, flags) < 0)
 382                                pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
 383                                       pfn, tk->tsk->comm, tk->tsk->pid);
 384                }
 385                put_task_struct(tk->tsk);
 386                kfree(tk);
 387        }
 388}
 389
 390/*
 391 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 392 * on behalf of the thread group. Return task_struct of the (first found)
 393 * dedicated thread if found, and return NULL otherwise.
 394 *
 395 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 396 * have to call rcu_read_lock/unlock() in this function.
 397 */
 398static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 399{
 400        struct task_struct *t;
 401
 402        for_each_thread(tsk, t)
 403                if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
 404                        return t;
 405        return NULL;
 406}
 407
 408/*
  409 * Determine whether a given process is an "early kill" process, i.e. one that
  410 * expects to be signaled when some page under the process is hwpoisoned.
  411 * Return the task_struct of the dedicated thread (main thread unless explicitly
  412 * specified) if the process is "early kill", and otherwise return NULL.
 413 */
 414static struct task_struct *task_early_kill(struct task_struct *tsk,
 415                                           int force_early)
 416{
 417        struct task_struct *t;
 418        if (!tsk->mm)
 419                return NULL;
 420        if (force_early)
 421                return tsk;
 422        t = find_early_kill_thread(tsk);
 423        if (t)
 424                return t;
 425        if (sysctl_memory_failure_early_kill)
 426                return tsk;
 427        return NULL;
 428}
 429
 430/*
 431 * Collect processes when the error hit an anonymous page.
 432 */
 433static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 434                                int force_early)
 435{
 436        struct vm_area_struct *vma;
 437        struct task_struct *tsk;
 438        struct anon_vma *av;
 439        pgoff_t pgoff;
 440
 441        av = page_lock_anon_vma_read(page);
 442        if (av == NULL) /* Not actually mapped anymore */
 443                return;
 444
 445        pgoff = page_to_pgoff(page);
 446        read_lock(&tasklist_lock);
 447        for_each_process (tsk) {
 448                struct anon_vma_chain *vmac;
 449                struct task_struct *t = task_early_kill(tsk, force_early);
 450
 451                if (!t)
 452                        continue;
 453                anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 454                                               pgoff, pgoff) {
 455                        vma = vmac->vma;
 456                        if (!page_mapped_in_vma(page, vma))
 457                                continue;
 458                        if (vma->vm_mm == t->mm)
 459                                add_to_kill(t, page, vma, to_kill);
 460                }
 461        }
 462        read_unlock(&tasklist_lock);
 463        page_unlock_anon_vma_read(av);
 464}
 465
 466/*
 467 * Collect processes when the error hit a file mapped page.
 468 */
 469static void collect_procs_file(struct page *page, struct list_head *to_kill,
 470                                int force_early)
 471{
 472        struct vm_area_struct *vma;
 473        struct task_struct *tsk;
 474        struct address_space *mapping = page->mapping;
 475
 476        i_mmap_lock_read(mapping);
 477        read_lock(&tasklist_lock);
 478        for_each_process(tsk) {
 479                pgoff_t pgoff = page_to_pgoff(page);
 480                struct task_struct *t = task_early_kill(tsk, force_early);
 481
 482                if (!t)
 483                        continue;
 484                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 485                                      pgoff) {
 486                        /*
 487                         * Send early kill signal to tasks where a vma covers
 488                         * the page but the corrupted page is not necessarily
  489                         * mapped in its pte.
  490                         * Assume applications that requested early kill want
 491                         * to be informed of all such data corruptions.
 492                         */
 493                        if (vma->vm_mm == t->mm)
 494                                add_to_kill(t, page, vma, to_kill);
 495                }
 496        }
 497        read_unlock(&tasklist_lock);
 498        i_mmap_unlock_read(mapping);
 499}
 500
 501/*
  502 * Collect the processes that have the corrupted page mapped, so they can be killed.
 503 */
 504static void collect_procs(struct page *page, struct list_head *tokill,
 505                                int force_early)
 506{
 507        if (!page->mapping)
 508                return;
 509
 510        if (PageAnon(page))
 511                collect_procs_anon(page, tokill, force_early);
 512        else
 513                collect_procs_file(page, tokill, force_early);
 514}
 515
 516static const char *action_name[] = {
 517        [MF_IGNORED] = "Ignored",
 518        [MF_FAILED] = "Failed",
 519        [MF_DELAYED] = "Delayed",
 520        [MF_RECOVERED] = "Recovered",
 521};
 522
 523static const char * const action_page_types[] = {
 524        [MF_MSG_KERNEL]                 = "reserved kernel page",
 525        [MF_MSG_KERNEL_HIGH_ORDER]      = "high-order kernel page",
 526        [MF_MSG_SLAB]                   = "kernel slab page",
 527        [MF_MSG_DIFFERENT_COMPOUND]     = "different compound page after locking",
 528        [MF_MSG_POISONED_HUGE]          = "huge page already hardware poisoned",
 529        [MF_MSG_HUGE]                   = "huge page",
 530        [MF_MSG_FREE_HUGE]              = "free huge page",
 531        [MF_MSG_NON_PMD_HUGE]           = "non-pmd-sized huge page",
 532        [MF_MSG_UNMAP_FAILED]           = "unmapping failed page",
 533        [MF_MSG_DIRTY_SWAPCACHE]        = "dirty swapcache page",
 534        [MF_MSG_CLEAN_SWAPCACHE]        = "clean swapcache page",
 535        [MF_MSG_DIRTY_MLOCKED_LRU]      = "dirty mlocked LRU page",
 536        [MF_MSG_CLEAN_MLOCKED_LRU]      = "clean mlocked LRU page",
 537        [MF_MSG_DIRTY_UNEVICTABLE_LRU]  = "dirty unevictable LRU page",
 538        [MF_MSG_CLEAN_UNEVICTABLE_LRU]  = "clean unevictable LRU page",
 539        [MF_MSG_DIRTY_LRU]              = "dirty LRU page",
 540        [MF_MSG_CLEAN_LRU]              = "clean LRU page",
 541        [MF_MSG_TRUNCATED_LRU]          = "already truncated LRU page",
 542        [MF_MSG_BUDDY]                  = "free buddy page",
 543        [MF_MSG_BUDDY_2ND]              = "free buddy page (2nd try)",
 544        [MF_MSG_DAX]                    = "dax page",
 545        [MF_MSG_UNKNOWN]                = "unknown page",
 546};
 547
 548/*
 549 * XXX: It is possible that a page is isolated from LRU cache,
  550 * and then kept in the swap cache, or fails to be removed from the page cache.
 551 * The page count will stop it from being freed by unpoison.
 552 * Stress tests should be aware of this memory leak problem.
 553 */
 554static int delete_from_lru_cache(struct page *p)
 555{
 556        if (!isolate_lru_page(p)) {
 557                /*
  558                 * Clear sensitive page flags, so that the buddy system won't
  559                 * complain when the page is unpoisoned and freed.
 560                 */
 561                ClearPageActive(p);
 562                ClearPageUnevictable(p);
 563
 564                /*
 565                 * Poisoned page might never drop its ref count to 0 so we have
 566                 * to uncharge it manually from its memcg.
 567                 */
 568                mem_cgroup_uncharge(p);
 569
 570                /*
 571                 * drop the page count elevated by isolate_lru_page()
 572                 */
 573                put_page(p);
 574                return 0;
 575        }
 576        return -EIO;
 577}
 578
 579static int truncate_error_page(struct page *p, unsigned long pfn,
 580                                struct address_space *mapping)
 581{
 582        int ret = MF_FAILED;
 583
 584        if (mapping->a_ops->error_remove_page) {
 585                int err = mapping->a_ops->error_remove_page(mapping, p);
 586
 587                if (err != 0) {
 588                        pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
 589                                pfn, err);
 590                } else if (page_has_private(p) &&
 591                           !try_to_release_page(p, GFP_NOIO)) {
 592                        pr_info("Memory failure: %#lx: failed to release buffers\n",
 593                                pfn);
 594                } else {
 595                        ret = MF_RECOVERED;
 596                }
 597        } else {
 598                /*
  599                 * If the file system doesn't support it, just invalidate.
  600                 * This fails on dirty pages or anything with private data.
 601                 */
 602                if (invalidate_inode_page(p))
 603                        ret = MF_RECOVERED;
 604                else
 605                        pr_info("Memory failure: %#lx: Failed to invalidate\n",
 606                                pfn);
 607        }
 608
 609        return ret;
 610}
 611
 612/*
 613 * Error hit kernel page.
  614 * Do nothing; try to be lucky and just not touch it. For a few cases we
 615 * could be more sophisticated.
 616 */
 617static int me_kernel(struct page *p, unsigned long pfn)
 618{
 619        return MF_IGNORED;
 620}
 621
 622/*
 623 * Page in unknown state. Do nothing.
 624 */
 625static int me_unknown(struct page *p, unsigned long pfn)
 626{
 627        pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
 628        return MF_FAILED;
 629}
 630
 631/*
 632 * Clean (or cleaned) page cache page.
 633 */
 634static int me_pagecache_clean(struct page *p, unsigned long pfn)
 635{
 636        struct address_space *mapping;
 637
 638        delete_from_lru_cache(p);
 639
 640        /*
  641         * For anonymous pages we're done: the only reference left
  642         * should be the one memory_failure() holds.
 643         */
 644        if (PageAnon(p))
 645                return MF_RECOVERED;
 646
 647        /*
 648         * Now truncate the page in the page cache. This is really
  649         * more like a "temporary hole punch".
 650         * Don't do this for block devices when someone else
 651         * has a reference, because it could be file system metadata
 652         * and that's not safe to truncate.
 653         */
 654        mapping = page_mapping(p);
 655        if (!mapping) {
 656                /*
  657                 * Page has been torn down in the meantime.
 658                 */
 659                return MF_FAILED;
 660        }
 661
 662        /*
 663         * Truncation is a bit tricky. Enable it per file system for now.
 664         *
 665         * Open: to take i_mutex or not for this? Right now we don't.
 666         */
 667        return truncate_error_page(p, pfn, mapping);
 668}
 669
 670/*
 671 * Dirty pagecache page
  672 * Issues: when the error hits a hole page the error is not properly
 673 * propagated.
 674 */
 675static int me_pagecache_dirty(struct page *p, unsigned long pfn)
 676{
 677        struct address_space *mapping = page_mapping(p);
 678
 679        SetPageError(p);
 680        /* TBD: print more information about the file. */
 681        if (mapping) {
 682                /*
 683                 * IO error will be reported by write(), fsync(), etc.
 684                 * who check the mapping.
 685                 * This way the application knows that something went
 686                 * wrong with its dirty file data.
 687                 *
 688                 * There's one open issue:
 689                 *
  690                 * The EIO will only be reported on the next IO
 691                 * operation and then cleared through the IO map.
  692                 * Normally Linux has two mechanisms to pass IO errors:
 693                 * first through the AS_EIO flag in the address space
 694                 * and then through the PageError flag in the page.
 695                 * Since we drop pages on memory failure handling the
  696                 * only mechanism open to use is through AS_EIO.
 697                 *
 698                 * This has the disadvantage that it gets cleared on
 699                 * the first operation that returns an error, while
 700                 * the PageError bit is more sticky and only cleared
 701                 * when the page is reread or dropped.  If an
  702                 * application assumes it will always get an error on
  703                 * fsync, but does other operations on the fd in between,
  704                 * and the page is dropped in that window, the error
 705                 * will not be properly reported.
 706                 *
 707                 * This can already happen even without hwpoisoned
 708                 * pages: first on metadata IO errors (which only
 709                 * report through AS_EIO) or when the page is dropped
 710                 * at the wrong time.
 711                 *
 712                 * So right now we assume that the application DTRT on
 713                 * the first EIO, but we're not worse than other parts
 714                 * of the kernel.
 715                 */
 716                mapping_set_error(mapping, -EIO);
 717        }
 718
 719        return me_pagecache_clean(p, pfn);
 720}
 721
 722/*
 723 * Clean and dirty swap cache.
 724 *
 725 * Dirty swap cache page is tricky to handle. The page could live both in page
  726 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
 727 * referenced concurrently by 2 types of PTEs:
 728 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 729 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 730 * and then
 731 *      - clear dirty bit to prevent IO
 732 *      - remove from LRU
 733 *      - but keep in the swap cache, so that when we return to it on
 734 *        a later page fault, we know the application is accessing
 735 *        corrupted data and shall be killed (we installed simple
 736 *        interception code in do_swap_page to catch it).
 737 *
 738 * Clean swap cache pages can be directly isolated. A later page fault will
 739 * bring in the known good data from disk.
 740 */
 741static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 742{
 743        ClearPageDirty(p);
 744        /* Trigger EIO in shmem: */
 745        ClearPageUptodate(p);
 746
 747        if (!delete_from_lru_cache(p))
 748                return MF_DELAYED;
 749        else
 750                return MF_FAILED;
 751}
 752
 753static int me_swapcache_clean(struct page *p, unsigned long pfn)
 754{
 755        delete_from_swap_cache(p);
 756
 757        if (!delete_from_lru_cache(p))
 758                return MF_RECOVERED;
 759        else
 760                return MF_FAILED;
 761}
 762
 763/*
 764 * Huge pages. Needs work.
 765 * Issues:
 766 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
 767 *   To narrow down kill region to one page, we need to break up pmd.
 768 */
 769static int me_huge_page(struct page *p, unsigned long pfn)
 770{
 771        int res = 0;
 772        struct page *hpage = compound_head(p);
 773        struct address_space *mapping;
 774
 775        if (!PageHuge(hpage))
 776                return MF_DELAYED;
 777
 778        mapping = page_mapping(hpage);
 779        if (mapping) {
 780                res = truncate_error_page(hpage, pfn, mapping);
 781        } else {
 782                unlock_page(hpage);
 783                /*
  784                 * The migration entry prevents later access to the error
  785                 * anonymous hugepage, so we can free and dissolve it into
  786                 * the buddy allocator to save the healthy subpages.
 787                 */
 788                if (PageAnon(hpage))
 789                        put_page(hpage);
 790                dissolve_free_huge_page(p);
 791                res = MF_RECOVERED;
 792                lock_page(hpage);
 793        }
 794
 795        return res;
 796}
 797
 798/*
 799 * Various page states we can handle.
 800 *
 801 * A page state is defined by its current page->flags bits.
 802 * The table matches them in order and calls the right handler.
 803 *
  804 * This is quite tricky because we can access the page at any time
  805 * in its life cycle, so all accesses have to be extremely careful.
 806 *
 807 * This is not complete. More states could be added.
 808 * For any missing state don't attempt recovery.
 809 */
 810
 811#define dirty           (1UL << PG_dirty)
 812#define sc              ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
 813#define unevict         (1UL << PG_unevictable)
 814#define mlock           (1UL << PG_mlocked)
 815#define writeback       (1UL << PG_writeback)
 816#define lru             (1UL << PG_lru)
 817#define head            (1UL << PG_head)
 818#define slab            (1UL << PG_slab)
 819#define reserved        (1UL << PG_reserved)
 820
 821static struct page_state {
 822        unsigned long mask;
 823        unsigned long res;
 824        enum mf_action_page_type type;
 825        int (*action)(struct page *p, unsigned long pfn);
 826} error_states[] = {
 827        { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
 828        /*
 829         * free pages are specially detected outside this table:
 830         * PG_buddy pages only make a small fraction of all free pages.
 831         */
 832
 833        /*
 834         * Could in theory check if slab page is free or if we can drop
 835         * currently unused objects without touching them. But just
 836         * treat it as standard kernel for now.
 837         */
 838        { slab,         slab,           MF_MSG_SLAB,    me_kernel },
 839
 840        { head,         head,           MF_MSG_HUGE,            me_huge_page },
 841
 842        { sc|dirty,     sc|dirty,       MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
 843        { sc|dirty,     sc,             MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
 844
 845        { mlock|dirty,  mlock|dirty,    MF_MSG_DIRTY_MLOCKED_LRU,       me_pagecache_dirty },
 846        { mlock|dirty,  mlock,          MF_MSG_CLEAN_MLOCKED_LRU,       me_pagecache_clean },
 847
 848        { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU,   me_pagecache_dirty },
 849        { unevict|dirty, unevict,       MF_MSG_CLEAN_UNEVICTABLE_LRU,   me_pagecache_clean },
 850
 851        { lru|dirty,    lru|dirty,      MF_MSG_DIRTY_LRU,       me_pagecache_dirty },
 852        { lru|dirty,    lru,            MF_MSG_CLEAN_LRU,       me_pagecache_clean },
 853
 854        /*
 855         * Catchall entry: must be at end.
 856         */
 857        { 0,            0,              MF_MSG_UNKNOWN, me_unknown },
 858};
 859
 860#undef dirty
 861#undef sc
 862#undef unevict
 863#undef mlock
 864#undef writeback
 865#undef lru
 866#undef head
 867#undef slab
 868#undef reserved
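
/*
 * Matching is first-hit: identify_page_state() scans error_states[] in order
 * and picks the first entry where (page->flags & mask) == res. For example a
 * dirty page in the swap cache has PG_swapcache, PG_swapbacked, PG_dirty (and
 * usually PG_lru) set, so it matches the MF_MSG_DIRTY_SWAPCACHE entry before
 * the plain LRU entries; the all-zero catchall at the end matches everything
 * else.
 */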
 869
 870/*
 871 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 872 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 873 */
 874static void action_result(unsigned long pfn, enum mf_action_page_type type,
 875                          enum mf_result result)
 876{
 877        trace_memory_failure_event(pfn, type, result);
 878
 879        pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
 880                pfn, action_page_types[type], action_name[result]);
 881}
 882
 883static int page_action(struct page_state *ps, struct page *p,
 884                        unsigned long pfn)
 885{
 886        int result;
 887        int count;
 888
 889        result = ps->action(p, pfn);
 890
 891        count = page_count(p) - 1;
 892        if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
 893                count--;
 894        if (count > 0) {
 895                pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
 896                       pfn, action_page_types[ps->type], count);
 897                result = MF_FAILED;
 898        }
 899        action_result(pfn, ps->type, result);
 900
 901        /* Could do more checks here if page looks ok */
 902        /*
 903         * Could adjust zone counters here to correct for the missing page.
 904         */
 905
 906        return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 907}
 908
 909/**
 910 * get_hwpoison_page() - Get refcount for memory error handling:
 911 * @page:       raw error page (hit by memory error)
 912 *
  913 * Return: 0 if we failed to grab the refcount, otherwise a non-zero
  914 * value (currently 1).
 915 */
 916int get_hwpoison_page(struct page *page)
 917{
 918        struct page *head = compound_head(page);
 919
 920        if (!PageHuge(head) && PageTransHuge(head)) {
 921                /*
  922                 * A non-anonymous thp exists only at allocation/free time. We
  923                 * can't handle such a case correctly, so let's give up on it.
 924                 * This should be better than triggering BUG_ON when kernel
 925                 * tries to touch the "partially handled" page.
 926                 */
 927                if (!PageAnon(head)) {
 928                        pr_err("Memory failure: %#lx: non anonymous thp\n",
 929                                page_to_pfn(page));
 930                        return 0;
 931                }
 932        }
 933
 934        if (get_page_unless_zero(head)) {
 935                if (head == compound_head(page))
 936                        return 1;
 937
 938                pr_info("Memory failure: %#lx cannot catch tail\n",
 939                        page_to_pfn(page));
 940                put_page(head);
 941        }
 942
 943        return 0;
 944}
 945EXPORT_SYMBOL_GPL(get_hwpoison_page);
 946
 947/*
 948 * Do all that is necessary to remove user space mappings. Unmap
 949 * the pages and send SIGBUS to the processes if the data was dirty.
 950 */
 951static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 952                                  int flags, struct page **hpagep)
 953{
 954        enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 955        struct address_space *mapping;
 956        LIST_HEAD(tokill);
 957        bool unmap_success = true;
 958        int kill = 1, forcekill;
 959        struct page *hpage = *hpagep;
 960        bool mlocked = PageMlocked(hpage);
 961
 962        /*
 963         * Here we are interested only in user-mapped pages, so skip any
 964         * other types of pages.
 965         */
 966        if (PageReserved(p) || PageSlab(p))
 967                return true;
 968        if (!(PageLRU(hpage) || PageHuge(p)))
 969                return true;
 970
 971        /*
  972         * This check implies we don't kill processes early if their pages
  973         * are in the swap cache. Those are always late kills.
 974         */
 975        if (!page_mapped(hpage))
 976                return true;
 977
 978        if (PageKsm(p)) {
 979                pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
 980                return false;
 981        }
 982
 983        if (PageSwapCache(p)) {
 984                pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
 985                        pfn);
 986                ttu |= TTU_IGNORE_HWPOISON;
 987        }
 988
 989        /*
 990         * Propagate the dirty bit from PTEs to struct page first, because we
 991         * need this to decide if we should kill or just drop the page.
 992         * XXX: the dirty test could be racy: set_page_dirty() may not always
 993         * be called inside page lock (it's recommended but not enforced).
 994         */
 995        mapping = page_mapping(hpage);
 996        if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
 997            mapping_cap_writeback_dirty(mapping)) {
 998                if (page_mkclean(hpage)) {
 999                        SetPageDirty(hpage);
1000                } else {
1001                        kill = 0;
1002                        ttu |= TTU_IGNORE_HWPOISON;
1003                        pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
1004                                pfn);
1005                }
1006        }
1007
1008        /*
1009         * First collect all the processes that have the page
1010         * mapped in dirty form.  This has to be done before try_to_unmap,
1011         * because ttu takes the rmap data structures down.
1012         *
1013         * Error handling: We ignore errors here because
1014         * there's nothing that can be done.
1015         */
1016        if (kill)
1017                collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
1018
1019        if (!PageHuge(hpage)) {
1020                unmap_success = try_to_unmap(hpage, ttu);
1021        } else {
1022                /*
1023                 * For hugetlb pages, try_to_unmap could potentially call
1024                 * huge_pmd_unshare.  Because of this, take semaphore in
1025                 * write mode here and set TTU_RMAP_LOCKED to indicate we
 1026                 * have taken the lock at this higher level.
1027                 *
1028                 * Note that the call to hugetlb_page_mapping_lock_write
1029                 * is necessary even if mapping is already set.  It handles
1030                 * ugliness of potentially having to drop page lock to obtain
1031                 * i_mmap_rwsem.
1032                 */
1033                mapping = hugetlb_page_mapping_lock_write(hpage);
1034
1035                if (mapping) {
1036                        unmap_success = try_to_unmap(hpage,
1037                                                     ttu|TTU_RMAP_LOCKED);
1038                        i_mmap_unlock_write(mapping);
1039                } else {
1040                        pr_info("Memory failure: %#lx: could not find mapping for mapped huge page\n",
1041                                pfn);
1042                        unmap_success = false;
1043                }
1044        }
1045        if (!unmap_success)
1046                pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
1047                       pfn, page_mapcount(hpage));
1048
1049        /*
1050         * try_to_unmap() might put mlocked page in lru cache, so call
1051         * shake_page() again to ensure that it's flushed.
1052         */
1053        if (mlocked)
1054                shake_page(hpage, 0);
1055
1056        /*
1057         * Now that the dirty bit has been propagated to the
1058         * struct page and all unmaps done we can decide if
1059         * killing is needed or not.  Only kill when the page
1060         * was dirty or the process is not restartable,
1061         * otherwise the tokill list is merely
1062         * freed.  When there was a problem unmapping earlier
 1063         * use a more forceful, uncatchable kill to prevent
1064         * any accesses to the poisoned memory.
1065         */
1066        forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
1067        kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
1068
1069        return unmap_success;
1070}
1071
1072static int identify_page_state(unsigned long pfn, struct page *p,
1073                                unsigned long page_flags)
1074{
1075        struct page_state *ps;
1076
1077        /*
1078         * The first check uses the current page flags which may not have any
1079         * relevant information. The second check with the saved page flags is
1080         * carried out only if the first check can't determine the page status.
1081         */
1082        for (ps = error_states;; ps++)
1083                if ((p->flags & ps->mask) == ps->res)
1084                        break;
1085
1086        page_flags |= (p->flags & (1UL << PG_dirty));
1087
1088        if (!ps->mask)
1089                for (ps = error_states;; ps++)
1090                        if ((page_flags & ps->mask) == ps->res)
1091                                break;
1092        return page_action(ps, p, pfn);
1093}
1094
1095static int memory_failure_hugetlb(unsigned long pfn, int flags)
1096{
1097        struct page *p = pfn_to_page(pfn);
1098        struct page *head = compound_head(p);
1099        int res;
1100        unsigned long page_flags;
1101
1102        if (TestSetPageHWPoison(head)) {
1103                pr_err("Memory failure: %#lx: already hardware poisoned\n",
1104                       pfn);
1105                return 0;
1106        }
1107
1108        num_poisoned_pages_inc();
1109
1110        if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1111                /*
1112                 * Check "filter hit" and "race with other subpage."
1113                 */
1114                lock_page(head);
1115                if (PageHWPoison(head)) {
1116                        if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1117                            || (p != head && TestSetPageHWPoison(head))) {
1118                                num_poisoned_pages_dec();
1119                                unlock_page(head);
1120                                return 0;
1121                        }
1122                }
1123                unlock_page(head);
1124                dissolve_free_huge_page(p);
1125                action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
1126                return 0;
1127        }
1128
1129        lock_page(head);
1130        page_flags = head->flags;
1131
1132        if (!PageHWPoison(head)) {
1133                pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1134                num_poisoned_pages_dec();
1135                unlock_page(head);
1136                put_hwpoison_page(head);
1137                return 0;
1138        }
1139
1140        /*
1141         * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
 1142         * simply disable it. In order to make it work properly, we need to
 1143         * make sure that:
1144         *  - conversion of a pud that maps an error hugetlb into hwpoison
1145         *    entry properly works, and
1146         *  - other mm code walking over page table is aware of pud-aligned
1147         *    hwpoison entries.
1148         */
1149        if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
1150                action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
1151                res = -EBUSY;
1152                goto out;
1153        }
1154
1155        if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
1156                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1157                res = -EBUSY;
1158                goto out;
1159        }
1160
1161        res = identify_page_state(pfn, p, page_flags);
1162out:
1163        unlock_page(head);
1164        return res;
1165}
1166
1167static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
1168                struct dev_pagemap *pgmap)
1169{
1170        struct page *page = pfn_to_page(pfn);
1171        const bool unmap_success = true;
1172        unsigned long size = 0;
1173        struct to_kill *tk;
1174        LIST_HEAD(tokill);
1175        int rc = -EBUSY;
1176        loff_t start;
1177        dax_entry_t cookie;
1178
1179        /*
1180         * Prevent the inode from being freed while we are interrogating
1181         * the address_space, typically this would be handled by
1182         * lock_page(), but dax pages do not use the page lock. This
1183         * also prevents changes to the mapping of this pfn until
1184         * poison signaling is complete.
1185         */
1186        cookie = dax_lock_page(page);
1187        if (!cookie)
1188                goto out;
1189
1190        if (hwpoison_filter(page)) {
1191                rc = 0;
1192                goto unlock;
1193        }
1194
1195        if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
1196                /*
1197                 * TODO: Handle HMM pages which may need coordination
1198                 * with device-side memory.
1199                 */
1200                goto unlock;
1201        }
1202
1203        /*
1204         * Use this flag as an indication that the dax page has been
1205         * remapped UC to prevent speculative consumption of poison.
1206         */
1207        SetPageHWPoison(page);
1208
1209        /*
1210         * Unlike System-RAM there is no possibility to swap in a
1211         * different physical page at a given virtual address, so all
1212         * userspace consumption of ZONE_DEVICE memory necessitates
1213         * SIGBUS (i.e. MF_MUST_KILL)
1214         */
1215        flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
1216        collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
1217
1218        list_for_each_entry(tk, &tokill, nd)
1219                if (tk->size_shift)
1220                        size = max(size, 1UL << tk->size_shift);
1221        if (size) {
1222                /*
1223                 * Unmap the largest mapping to avoid breaking up
1224                 * device-dax mappings which are constant size. The
1225                 * actual size of the mapping being torn down is
1226                 * communicated in siginfo, see kill_proc()
1227                 */
1228                start = (page->index << PAGE_SHIFT) & ~(size - 1);
1229                unmap_mapping_range(page->mapping, start, start + size, 0);
1230        }
1231        kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
1232        rc = 0;
1233unlock:
1234        dax_unlock_page(page, cookie);
1235out:
1236        /* drop pgmap ref acquired in caller */
1237        put_dev_pagemap(pgmap);
1238        action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
1239        return rc;
1240}
1241
1242/**
1243 * memory_failure - Handle memory failure of a page.
1244 * @pfn: Page Number of the corrupted page
1245 * @flags: fine tune action taken
1246 *
1247 * This function is called by the low level machine check code
1248 * of an architecture when it detects hardware memory corruption
1249 * of a page. It tries its best to recover, which includes
1250 * dropping pages, killing processes etc.
1251 *
1252 * The function is primarily of use for corruptions that
1253 * happen outside the current execution context (e.g. when
1254 * detected by a background scrubber)
1255 *
1256 * Must run in process context (e.g. a work queue) with interrupts
 1257 * enabled and no spinlocks held.
1258 */
1259int memory_failure(unsigned long pfn, int flags)
1260{
1261        struct page *p;
1262        struct page *hpage;
1263        struct page *orig_head;
1264        struct dev_pagemap *pgmap;
1265        int res;
1266        unsigned long page_flags;
1267
1268        if (!sysctl_memory_failure_recovery)
1269                panic("Memory failure on page %lx", pfn);
1270
1271        p = pfn_to_online_page(pfn);
1272        if (!p) {
1273                if (pfn_valid(pfn)) {
1274                        pgmap = get_dev_pagemap(pfn, NULL);
1275                        if (pgmap)
1276                                return memory_failure_dev_pagemap(pfn, flags,
1277                                                                  pgmap);
1278                }
1279                pr_err("Memory failure: %#lx: memory outside kernel control\n",
1280                        pfn);
1281                return -ENXIO;
1282        }
1283
1284        if (PageHuge(p))
1285                return memory_failure_hugetlb(pfn, flags);
1286        if (TestSetPageHWPoison(p)) {
1287                pr_err("Memory failure: %#lx: already hardware poisoned\n",
1288                        pfn);
1289                return 0;
1290        }
1291
1292        orig_head = hpage = compound_head(p);
1293        num_poisoned_pages_inc();
1294
1295        /*
1296         * We need/can do nothing about count=0 pages.
 1297         * 1) it's a free page, and therefore in safe hands:
1298         *    prep_new_page() will be the gate keeper.
1299         * 2) it's part of a non-compound high order page.
1300         *    Implies some kernel user: cannot stop them from
1301         *    R/W the page; let's pray that the page has been
1302         *    used and will be freed some time later.
 1303         * In fact it's dangerous to directly bump up the page count from 0,
 1304         * as that may cause a page_ref_freeze()/page_ref_unfreeze() mismatch.
1305         */
1306        if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
1307                if (is_free_buddy_page(p)) {
1308                        action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1309                        return 0;
1310                } else {
1311                        action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
1312                        return -EBUSY;
1313                }
1314        }
1315
1316        if (PageTransHuge(hpage)) {
1317                lock_page(p);
1318                if (!PageAnon(p) || unlikely(split_huge_page(p))) {
1319                        unlock_page(p);
1320                        if (!PageAnon(p))
1321                                pr_err("Memory failure: %#lx: non anonymous thp\n",
1322                                        pfn);
1323                        else
1324                                pr_err("Memory failure: %#lx: thp split failed\n",
1325                                        pfn);
1326                        if (TestClearPageHWPoison(p))
1327                                num_poisoned_pages_dec();
1328                        put_hwpoison_page(p);
1329                        return -EBUSY;
1330                }
1331                unlock_page(p);
1332                VM_BUG_ON_PAGE(!page_count(p), p);
1333                hpage = compound_head(p);
1334        }
1335
1336        /*
1337         * We ignore non-LRU pages for good reasons.
1338         * - PG_locked is only well defined for LRU pages and a few others
1339         * - to avoid races with __SetPageLocked()
1340         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1341         * The check (unnecessarily) ignores LRU pages being isolated and
1342         * walked by the page reclaim code, however that's not a big loss.
1343         */
1344        shake_page(p, 0);
1345        /* shake_page could have turned it free. */
1346        if (!PageLRU(p) && is_free_buddy_page(p)) {
1347                if (flags & MF_COUNT_INCREASED)
1348                        action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
1349                else
1350                        action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
1351                return 0;
1352        }
1353
1354        lock_page(p);
1355
1356        /*
 1357         * The page could have become part of a different compound page
 1358         * while we were taking the lock. If this happens just bail out.
1359         */
1360        if (PageCompound(p) && compound_head(p) != orig_head) {
1361                action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
1362                res = -EBUSY;
1363                goto out;
1364        }
1365
1366        /*
1367         * We use page flags to determine what action should be taken, but
1368         * the flags can be modified by the error containment action.  One
1369         * example is an mlocked page, where PG_mlocked is cleared by
1370         * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1371         * correctly, we save a copy of the page flags at this time.
1372         */
1373        if (PageHuge(p))
1374                page_flags = hpage->flags;
1375        else
1376                page_flags = p->flags;
1377
1378        /*
 1379         * unpoison always clears PG_hwpoison inside the page lock
1380         */
1381        if (!PageHWPoison(p)) {
1382                pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
1383                num_poisoned_pages_dec();
1384                unlock_page(p);
1385                put_hwpoison_page(p);
1386                return 0;
1387        }
1388        if (hwpoison_filter(p)) {
1389                if (TestClearPageHWPoison(p))
1390                        num_poisoned_pages_dec();
1391                unlock_page(p);
1392                put_hwpoison_page(p);
1393                return 0;
1394        }
1395
1396        if (!PageTransTail(p) && !PageLRU(p))
1397                goto identify_page_state;
1398
1399        /*
1400         * It's very difficult to mess with pages currently under IO
1401         * and in many cases impossible, so we just avoid it here.
1402         */
1403        wait_on_page_writeback(p);
1404
1405        /*
1406         * Now take care of user space mappings.
1407         * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1408         *
1409         * When the raw error page is thp tail page, hpage points to the raw
1410         * page after thp split.
1411         */
1412        if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
1413                action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1414                res = -EBUSY;
1415                goto out;
1416        }
1417
1418        /*
1419         * Torn down by someone else?
1420         */
1421        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1422                action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
1423                res = -EBUSY;
1424                goto out;
1425        }
1426
1427identify_page_state:
1428        res = identify_page_state(pfn, p, page_flags);
1429out:
1430        unlock_page(p);
1431        return res;
1432}
1433EXPORT_SYMBOL_GPL(memory_failure);
1434
1435#define MEMORY_FAILURE_FIFO_ORDER       4
1436#define MEMORY_FAILURE_FIFO_SIZE        (1 << MEMORY_FAILURE_FIFO_ORDER)
1437
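/* One deferred request for memory_failure()/soft_offline_page(), see below. */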
1438struct memory_failure_entry {
1439        unsigned long pfn;
1440        int flags;
1441};
1442
1443struct memory_failure_cpu {
1444        DECLARE_KFIFO(fifo, struct memory_failure_entry,
1445                      MEMORY_FAILURE_FIFO_SIZE);
1446        spinlock_t lock;
1447        struct work_struct work;
1448};
1449
1450static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1451
1452/**
1453 * memory_failure_queue - Schedule handling memory failure of a page.
1454 * @pfn: Page Number of the corrupted page
1455 * @flags: Flags for memory failure handling
1456 *
1457 * This function is called by the low level hardware error handler
1458 * when it detects hardware memory corruption of a page. It schedules
1459 * the recovery of the error page, including dropping pages, killing
1460 * processes etc.
1461 *
1462 * The function is primarily of use for corruptions that
1463 * happen outside the current execution context (e.g. when
1464 * detected by a background scrubber).
1465 *
1466 * Can run in IRQ context.
1467 */
1468void memory_failure_queue(unsigned long pfn, int flags)
1469{
1470        struct memory_failure_cpu *mf_cpu;
1471        unsigned long proc_flags;
1472        struct memory_failure_entry entry = {
1473                .pfn =          pfn,
1474                .flags =        flags,
1475        };
1476
1477        mf_cpu = &get_cpu_var(memory_failure_cpu);
1478        spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1479        if (kfifo_put(&mf_cpu->fifo, entry))
1480                schedule_work_on(smp_processor_id(), &mf_cpu->work);
1481        else
1482                pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1483                       pfn);
1484        spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1485        put_cpu_var(memory_failure_cpu);
1486}
1487EXPORT_SYMBOL_GPL(memory_failure_queue);
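
/*
 * Editorial sketch (not part of the original file): deferred reporting from
 * a context that cannot sleep, e.g. a hypothetical scrubber interrupt
 * handler. memory_failure_queue() only pushes the pfn into the per-cpu
 * kfifo and schedules the work item, so it may be called in IRQ context.
 */
#if 0	/* illustration only */
static void example_scrubber_report(unsigned long pfn)
{
        /* All real handling happens later in memory_failure_work_func(). */
        memory_failure_queue(pfn, 0);
}
#endif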
1488
1489static void memory_failure_work_func(struct work_struct *work)
1490{
1491        struct memory_failure_cpu *mf_cpu;
1492        struct memory_failure_entry entry = { 0, };
1493        unsigned long proc_flags;
1494        int gotten;
1495
1496        mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1497        for (;;) {
1498                spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1499                gotten = kfifo_get(&mf_cpu->fifo, &entry);
1500                spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1501                if (!gotten)
1502                        break;
1503                if (entry.flags & MF_SOFT_OFFLINE)
1504                        soft_offline_page(entry.pfn, entry.flags);
1505                else
1506                        memory_failure(entry.pfn, entry.flags);
1507        }
1508}
1509
1510static int __init memory_failure_init(void)
1511{
1512        struct memory_failure_cpu *mf_cpu;
1513        int cpu;
1514
1515        for_each_possible_cpu(cpu) {
1516                mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1517                spin_lock_init(&mf_cpu->lock);
1518                INIT_KFIFO(mf_cpu->fifo);
1519                INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1520        }
1521
1522        return 0;
1523}
1524core_initcall(memory_failure_init);
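
/*
 * Note that the fifos and work items above are initialized for every
 * possible CPU, not just the online ones, so CPUs brought online later
 * already have a usable queue and no CPU-hotplug callback is needed.
 */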
1525
1526#define unpoison_pr_info(fmt, pfn, rs)                  \
1527({                                                      \
1528        if (__ratelimit(rs))                            \
1529                pr_info(fmt, pfn);                      \
1530})
1531
1532/**
1533 * unpoison_memory - Unpoison a previously poisoned page
1534 * @pfn: Page number of the page to be unpoisoned
1535 *
1536 * Software-unpoison a page that has been poisoned by
1537 * memory_failure() earlier.
1538 *
1539 * This is only done on the software level, so it only works
1540 * for Linux-injected failures, not real hardware failures.
1541 *
1542 * Returns 0 for success, otherwise -errno.
1543 */
1544int unpoison_memory(unsigned long pfn)
1545{
1546        struct page *page;
1547        struct page *p;
1548        int freeit = 0;
1549        static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
1550                                        DEFAULT_RATELIMIT_BURST);
1551
1552        if (!pfn_valid(pfn))
1553                return -ENXIO;
1554
1555        p = pfn_to_page(pfn);
1556        page = compound_head(p);
1557
1558        if (!PageHWPoison(p)) {
1559                unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
1560                                 pfn, &unpoison_rs);
1561                return 0;
1562        }
1563
1564        if (page_count(page) > 1) {
1565                unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
1566                                 pfn, &unpoison_rs);
1567                return 0;
1568        }
1569
1570        if (page_mapped(page)) {
1571                unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
1572                                 pfn, &unpoison_rs);
1573                return 0;
1574        }
1575
1576        if (page_mapping(page)) {
1577                unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
1578                                 pfn, &unpoison_rs);
1579                return 0;
1580        }
1581
1582        /*
1583         * unpoison_memory() can encounter a thp only when the thp is being
1584         * handled by memory_failure() and the page lock is not held yet.
1585         * In that case, we yield to memory_failure() and make unpoison fail.
1586         */
1587        if (!PageHuge(page) && PageTransHuge(page)) {
1588                unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
1589                                 pfn, &unpoison_rs);
1590                return 0;
1591        }
1592
1593        if (!get_hwpoison_page(p)) {
1594                if (TestClearPageHWPoison(p))
1595                        num_poisoned_pages_dec();
1596                unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
1597                                 pfn, &unpoison_rs);
1598                return 0;
1599        }
1600
1601        lock_page(page);
1602        /*
1603         * This test is racy because PG_hwpoison is set outside of the page
1604         * lock. That's acceptable because it won't trigger a kernel panic.
1605         * Instead, the PG_hwpoison page will be caught and isolated when it
1606         * enters the free buddy page pool.
1607         */
1608        if (TestClearPageHWPoison(page)) {
1609                unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
1610                                 pfn, &unpoison_rs);
1611                num_poisoned_pages_dec();
1612                freeit = 1;
1613        }
1614        unlock_page(page);
1615
1616        put_hwpoison_page(page);
1617        if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1618                put_hwpoison_page(page);
1619
1620        return 0;
1621}
1622EXPORT_SYMBOL(unpoison_memory);
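
/*
 * Editorial sketch (not part of the original file): unpoison_memory() is
 * mainly useful for test code such as the hwpoison injector
 * (CONFIG_HWPOISON_INJECT), which can pair a software-only injection with a
 * later software-only recovery of the same pfn. Real injectors do extra
 * filtering and reference handling; this shows only the basic shape.
 */
#if 0	/* illustration only */
static void example_sw_inject_then_recover(unsigned long pfn)
{
        memory_failure(pfn, 0);         /* software "poisoning" of the page */
        unpoison_memory(pfn);           /* undo it, on the software level only */
}
#endif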
1623
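/* Allocation callback for migrate_pages(): prefer the source page's node. */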
1624static struct page *new_page(struct page *p, unsigned long private)
1625{
1626        int nid = page_to_nid(p);
1627
1628        return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
1629}
1630
1631/*
1632 * Safely take a reference on an arbitrary page.
1633 * Returns 0 for a free page, -EIO for a zero refcount page
1634 * that is not free, and 1 for any other page type.
1635 * In the 1 case the page is returned with its refcount increased, otherwise not.
1636 */
1637static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1638{
1639        int ret;
1640
1641        if (flags & MF_COUNT_INCREASED)
1642                return 1;
1643
1644        /*
1645         * When the target page is a free hugepage, just remove it
1646         * from free hugepage list.
1647         */
1648        if (!get_hwpoison_page(p)) {
1649                if (PageHuge(p)) {
1650                        pr_info("%s: %#lx free huge page\n", __func__, pfn);
1651                        ret = 0;
1652                } else if (is_free_buddy_page(p)) {
1653                        pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1654                        ret = 0;
1655                } else {
1656                        pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1657                                __func__, pfn, p->flags);
1658                        ret = -EIO;
1659                }
1660        } else {
1661                /* Not a free page */
1662                ret = 1;
1663        }
1664        return ret;
1665}
1666
1667static int get_any_page(struct page *page, unsigned long pfn, int flags)
1668{
1669        int ret = __get_any_page(page, pfn, flags);
1670
1671        if (ret == 1 && !PageHuge(page) &&
1672            !PageLRU(page) && !__PageMovable(page)) {
1673                /*
1674                 * Try to free it.
1675                 */
1676                put_hwpoison_page(page);
1677                shake_page(page, 1);
1678
1679                /*
1680                 * Did it turn free?
1681                 */
1682                ret = __get_any_page(page, pfn, 0);
1683                if (ret == 1 && !PageLRU(page)) {
1684                        /* Drop the page reference which came from __get_any_page() */
1685                        put_hwpoison_page(page);
1686                        pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
1687                                pfn, page->flags, &page->flags);
1688                        return -EIO;
1689                }
1690        }
1691        return ret;
1692}
1693
1694static int soft_offline_huge_page(struct page *page, int flags)
1695{
1696        int ret;
1697        unsigned long pfn = page_to_pfn(page);
1698        struct page *hpage = compound_head(page);
1699        LIST_HEAD(pagelist);
1700
1701        /*
1702         * This double-check of PageHWPoison is to avoid the race with
1703         * memory_failure(). See also comment in __soft_offline_page().
1704         */
1705        lock_page(hpage);
1706        if (PageHWPoison(hpage)) {
1707                unlock_page(hpage);
1708                put_hwpoison_page(hpage);
1709                pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1710                return -EBUSY;
1711        }
1712        unlock_page(hpage);
1713
1714        ret = isolate_huge_page(hpage, &pagelist);
1715        /*
1716         * get_any_page() and isolate_huge_page() each take a refcount,
1717         * so we need to drop one here.
1718         */
1719        put_hwpoison_page(hpage);
1720        if (!ret) {
1721                pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1722                return -EBUSY;
1723        }
1724
1725        ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1726                                MIGRATE_SYNC, MR_MEMORY_FAILURE);
1727        if (ret) {
1728                pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
1729                        pfn, ret, page->flags, &page->flags);
1730                if (!list_empty(&pagelist))
1731                        putback_movable_pages(&pagelist);
1732                if (ret > 0)
1733                        ret = -EIO;
1734        } else {
1735                /*
1736                 * We set PG_hwpoison only when the migration source hugepage
1737                 * was successfully dissolved, because otherwise the hwpoisoned
1738                 * hugepage remains on the free hugepage list, and userspace will
1739                 * later hit SIGBUS when the page is handed out again. That's not
1740                 * expected in soft-offlining.
1741                 */
1742                ret = dissolve_free_huge_page(page);
1743                if (!ret) {
1744                        if (set_hwpoison_free_buddy_page(page))
1745                                num_poisoned_pages_inc();
1746                        else
1747                                ret = -EBUSY;
1748                }
1749        }
1750        return ret;
1751}
1752
1753static int __soft_offline_page(struct page *page, int flags)
1754{
1755        int ret;
1756        unsigned long pfn = page_to_pfn(page);
1757
1758        /*
1759         * Check PageHWPoison again under the page lock because PageHWPoison
1760         * is set by memory_failure() outside the page lock. Note that
1761         * memory_failure() also double-checks PageHWPoison under the page lock,
1762         * so there's no race between soft_offline_page() and memory_failure().
1763         */
1764        lock_page(page);
1765        wait_on_page_writeback(page);
1766        if (PageHWPoison(page)) {
1767                unlock_page(page);
1768                put_hwpoison_page(page);
1769                pr_info("soft offline: %#lx page already poisoned\n", pfn);
1770                return -EBUSY;
1771        }
1772        /*
1773         * Try to invalidate first. This should work for
1774         * non dirty unmapped page cache pages.
1775         */
1776        ret = invalidate_inode_page(page);
1777        unlock_page(page);
1778        /*
1779         * RED-PEN: it would be better to keep the page isolated here, but
1780         * we would need to fix the isolation locking first.
1781         */
1782        if (ret == 1) {
1783                put_hwpoison_page(page);
1784                pr_info("soft_offline: %#lx: invalidated\n", pfn);
1785                SetPageHWPoison(page);
1786                num_poisoned_pages_inc();
1787                return 0;
1788        }
1789
1790        /*
1791         * Simple invalidation didn't work.
1792         * Try to migrate to a new page instead. migrate.c
1793         * handles a large number of cases for us.
1794         */
1795        if (PageLRU(page))
1796                ret = isolate_lru_page(page);
1797        else
1798                ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
1799        /*
1800         * Drop the page reference that came from get_any_page();
1801         * a successful isolate_lru_page() already took another one.
1802         */
1803        put_hwpoison_page(page);
1804        if (!ret) {
1805                LIST_HEAD(pagelist);
1806                /*
1807                 * After the page has been isolated from the LRU, PG_lru is
1808                 * cleared, so check !__PageMovable() instead: an LRU page's
1809                 * mapping cannot have PAGE_MAPPING_MOVABLE set.
1810                 */
1811                if (!__PageMovable(page))
1812                        inc_node_page_state(page, NR_ISOLATED_ANON +
1813                                                page_is_file_lru(page));
1814                list_add(&page->lru, &pagelist);
1815                ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1816                                        MIGRATE_SYNC, MR_MEMORY_FAILURE);
1817                if (ret) {
1818                        if (!list_empty(&pagelist))
1819                                putback_movable_pages(&pagelist);
1820
1821                        pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
1822                                pfn, ret, page->flags, &page->flags);
1823                        if (ret > 0)
1824                                ret = -EIO;
1825                }
1826        } else {
1827                pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
1828                        pfn, ret, page_count(page), page->flags, &page->flags);
1829        }
1830        return ret;
1831}
1832
1833static int soft_offline_in_use_page(struct page *page, int flags)
1834{
1835        int ret;
1836        int mt;
1837        struct page *hpage = compound_head(page);
1838
1839        if (!PageHuge(page) && PageTransHuge(hpage)) {
1840                lock_page(page);
1841                if (!PageAnon(page) || unlikely(split_huge_page(page))) {
1842                        unlock_page(page);
1843                        if (!PageAnon(page))
1844                                pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
1845                        else
1846                                pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
1847                        put_hwpoison_page(page);
1848                        return -EBUSY;
1849                }
1850                unlock_page(page);
1851        }
1852
1853        /*
1854         * Setting MIGRATE_ISOLATE here ensures that the page will be linked
1855         * to the free list immediately (not via the pcplist) when released
1856         * after a successful page migration. Otherwise we can't guarantee
1857         * that the page is really free after put_page() returns, and
1858         * set_hwpoison_free_buddy_page() would very likely fail.
1859         */
1860        mt = get_pageblock_migratetype(page);
1861        set_pageblock_migratetype(page, MIGRATE_ISOLATE);
1862        if (PageHuge(page))
1863                ret = soft_offline_huge_page(page, flags);
1864        else
1865                ret = __soft_offline_page(page, flags);
1866        set_pageblock_migratetype(page, mt);
1867        return ret;
1868}
1869
1870static int soft_offline_free_page(struct page *page)
1871{
1872        int rc = dissolve_free_huge_page(page);
1873
1874        if (!rc) {
1875                if (set_hwpoison_free_buddy_page(page))
1876                        num_poisoned_pages_inc();
1877                else
1878                        rc = -EBUSY;
1879        }
1880        return rc;
1881}
1882
1883/**
1884 * soft_offline_page - Soft offline a page.
1885 * @pfn: pfn to soft-offline
1886 * @flags: flags. Same as memory_failure().
1887 *
1888 * Returns 0 on success, otherwise negated errno.
1889 *
1890 * Soft offline a page, by migration or invalidation,
1891 * without killing anything. This is for the case when
1892 * a page is not corrupted yet (so it's still valid to access),
1893 * but has had a number of corrected errors and is better taken
1894 * out.
1895 *
1896 * The actual policy on when to do that is maintained by
1897 * user space.
1898 *
1899 * This should never impact any application or cause data loss,
1900 * however it might take some time.
1901 *
1902 * This is not a 100% solution for all memory, but tries to be
1903 * ``good enough'' for the majority of memory.
1904 */
1905int soft_offline_page(unsigned long pfn, int flags)
1906{
1907        int ret;
1908        struct page *page;
1909
1910        if (!pfn_valid(pfn))
1911                return -ENXIO;
1912        /* Only online pages can be soft-offlined (in particular, not ZONE_DEVICE). */
1913        page = pfn_to_online_page(pfn);
1914        if (!page)
1915                return -EIO;
1916
1917        if (PageHWPoison(page)) {
1918                pr_info("soft offline: %#lx page already poisoned\n", pfn);
1919                if (flags & MF_COUNT_INCREASED)
1920                        put_hwpoison_page(page);
1921                return -EBUSY;
1922        }
1923
1924        get_online_mems();
1925        ret = get_any_page(page, pfn, flags);
1926        put_online_mems();
1927
1928        if (ret > 0)
1929                ret = soft_offline_in_use_page(page, flags);
1930        else if (ret == 0)
1931                ret = soft_offline_free_page(page);
1932
1933        return ret;
1934}
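
/*
 * Editorial sketch (not part of the original file): the policy deciding when
 * to soft-offline a page lives in userspace or in a RAS driver; the kernel
 * only provides the mechanism. An in-kernel user that learns about a
 * suspicious pfn from interrupt context can defer the request through
 * memory_failure_queue(), whose work function dispatches MF_SOFT_OFFLINE
 * entries to soft_offline_page() from process context.
 */
#if 0	/* illustration only */
static void example_predictive_offline(unsigned long pfn)
{
        /* soft_offline_page() may sleep (page lock, migration), so defer. */
        memory_failure_queue(pfn, MF_SOFT_OFFLINE);
}
#endif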
1935