linux/mm/memory-failure.c
   1/*
   2 * Copyright (C) 2008, 2009 Intel Corporation
   3 * Authors: Andi Kleen, Fengguang Wu
   4 *
   5 * This software may be redistributed and/or modified under the terms of
   6 * the GNU General Public License ("GPL") version 2 only as published by the
   7 * Free Software Foundation.
   8 *
   9 * High level machine check handler. Handles pages reported by the
   10 * hardware as being corrupted, usually due to a multi-bit ECC memory or cache
   11 * failure.
   12 *
   13 * In addition there is a "soft offline" entry point that allows taking
   14 * suspicious, not-yet-corrupted pages out of use without killing anything.
  15 *
  16 * Handles page cache pages in various states.  The tricky part
   17 * here is that we can access any page asynchronously with respect to
   18 * other VM users, because memory failures could happen anytime and
   19 * anywhere. This could violate some of their assumptions. This is why
   20 * this code has to be extremely careful. Generally it tries to use
   21 * normal locking rules, as in take the standard locks, even if that means
  22 * the error handling takes potentially a long time.
  23 * 
   24 * There are several operations here with worse-than-linear complexity
   25 * because of unsuitable VM data structures. For example the operation to
   26 * map back from RMAP chains to processes has to walk the complete process
   27 * list and has non-linear complexity in the number of processes. But since
   28 * memory corruptions are rare we hope to get away with this. This avoids
   29 * impacting the core VM.
  30 */
  31
  32/*
  33 * Notebook:
  34 * - hugetlb needs more code
  35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
  36 * - pass bad pages to kdump next kernel
  37 */
  38#include <linux/kernel.h>
  39#include <linux/mm.h>
  40#include <linux/page-flags.h>
  41#include <linux/kernel-page-flags.h>
  42#include <linux/sched.h>
  43#include <linux/ksm.h>
  44#include <linux/rmap.h>
  45#include <linux/export.h>
  46#include <linux/pagemap.h>
  47#include <linux/swap.h>
  48#include <linux/backing-dev.h>
  49#include <linux/migrate.h>
  50#include <linux/page-isolation.h>
  51#include <linux/suspend.h>
  52#include <linux/slab.h>
  53#include <linux/swapops.h>
  54#include <linux/hugetlb.h>
  55#include <linux/memory_hotplug.h>
  56#include <linux/mm_inline.h>
  57#include <linux/kfifo.h>
  58#include "internal.h"
  59
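/*
 * sysctl_memory_failure_early_kill (vm.memory_failure_early_kill):
 *   0 (default): late kill -- only unmap the poisoned page and kill a
 *                process when it actually touches the corrupted data.
 *   1: early kill -- send SIGBUS to mapping processes as soon as the
 *                corruption is detected (overridable per task with
 *                prctl(PR_MCE_KILL); see task_early_kill() below).
 * sysctl_memory_failure_recovery (vm.memory_failure_recovery):
 *   1 (default): attempt recovery; 0: always panic on a memory failure.
 * num_poisoned_pages: number of base pages currently marked HWPoison,
 *   reported as "HardwareCorrupted" in /proc/meminfo.
 */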
  60int sysctl_memory_failure_early_kill __read_mostly = 0;
  61
  62int sysctl_memory_failure_recovery __read_mostly = 1;
  63
  64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
  65
  66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
  67
  68u32 hwpoison_filter_enable = 0;
  69u32 hwpoison_filter_dev_major = ~0U;
  70u32 hwpoison_filter_dev_minor = ~0U;
  71u64 hwpoison_filter_flags_mask;
  72u64 hwpoison_filter_flags_value;
  73EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
  74EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
  75EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
  76EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
  77EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
  78
  79static int hwpoison_filter_dev(struct page *p)
  80{
  81        struct address_space *mapping;
  82        dev_t dev;
  83
  84        if (hwpoison_filter_dev_major == ~0U &&
  85            hwpoison_filter_dev_minor == ~0U)
  86                return 0;
  87
  88        /*
  89         * page_mapping() does not accept slab pages.
  90         */
  91        if (PageSlab(p))
  92                return -EINVAL;
  93
  94        mapping = page_mapping(p);
  95        if (mapping == NULL || mapping->host == NULL)
  96                return -EINVAL;
  97
  98        dev = mapping->host->i_sb->s_dev;
  99        if (hwpoison_filter_dev_major != ~0U &&
 100            hwpoison_filter_dev_major != MAJOR(dev))
 101                return -EINVAL;
 102        if (hwpoison_filter_dev_minor != ~0U &&
 103            hwpoison_filter_dev_minor != MINOR(dev))
 104                return -EINVAL;
 105
 106        return 0;
 107}
 108
 109static int hwpoison_filter_flags(struct page *p)
 110{
 111        if (!hwpoison_filter_flags_mask)
 112                return 0;
 113
 114        if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
 115                                    hwpoison_filter_flags_value)
 116                return 0;
 117        else
 118                return -EINVAL;
 119}
 120
 121/*
 122 * This allows stress tests to limit test scope to a collection of tasks
 123 * by putting them under some memcg. This prevents killing unrelated/important
 124 * processes such as /sbin/init. Note that the target task may share clean
  125 * pages with init (e.g. libc text), which is harmless. If the target task
  126 * shares _dirty_ pages with another task B, the test scheme must make sure B
  127 * is also included in the memcg. Lastly, due to race conditions this filter
 128 * can only guarantee that the page either belongs to the memcg tasks, or is
 129 * a freed page.
 130 */
 131#ifdef  CONFIG_MEMCG_SWAP
 132u64 hwpoison_filter_memcg;
 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 134static int hwpoison_filter_task(struct page *p)
 135{
 136        struct mem_cgroup *mem;
 137        struct cgroup_subsys_state *css;
 138        unsigned long ino;
 139
 140        if (!hwpoison_filter_memcg)
 141                return 0;
 142
 143        mem = try_get_mem_cgroup_from_page(p);
 144        if (!mem)
 145                return -EINVAL;
 146
 147        css = mem_cgroup_css(mem);
 148        ino = cgroup_ino(css->cgroup);
 149        css_put(css);
 150
 151        if (ino != hwpoison_filter_memcg)
 152                return -EINVAL;
 153
 154        return 0;
 155}
 156#else
 157static int hwpoison_filter_task(struct page *p) { return 0; }
 158#endif
 159
 160int hwpoison_filter(struct page *p)
 161{
 162        if (!hwpoison_filter_enable)
 163                return 0;
 164
 165        if (hwpoison_filter_dev(p))
 166                return -EINVAL;
 167
 168        if (hwpoison_filter_flags(p))
 169                return -EINVAL;
 170
 171        if (hwpoison_filter_task(p))
 172                return -EINVAL;
 173
 174        return 0;
 175}
 176#else
 177int hwpoison_filter(struct page *p)
 178{
 179        return 0;
 180}
 181#endif
 182
 183EXPORT_SYMBOL_GPL(hwpoison_filter);
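/*
 * Illustrative sketch (assumption, not part of the original code): the
 * filter knobs above are exported GPL symbols so that an injection/test
 * module can narrow the blast radius before injecting errors.  This
 * assumes CONFIG_HWPOISON_INJECT so the knobs exist; the device numbers
 * are made up for the example, and tests typically set these through the
 * hwpoison debugfs entries instead.
 */
static void example_restrict_hwpoison_filter(void)
{
        hwpoison_filter_dev_major = 8;  /* only pages backed by major 8 ... */
        hwpoison_filter_dev_minor = 0;  /* ... minor 0 (e.g. /dev/sda) */
        hwpoison_filter_enable = 1;     /* hwpoison_filter() now skips other pages */
}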
 184
 185/*
  186 * Send a signal to all the processes that have the page mapped:
  187 * ``action optional'' if they are not immediately affected by the error,
  188 * ``action required'' if the error happened in the current execution context.
 189 */
 190static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
 191                        unsigned long pfn, struct page *page, int flags)
 192{
 193        struct siginfo si;
 194        int ret;
 195
 196        printk(KERN_ERR
 197                "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
 198                pfn, t->comm, t->pid);
 199        si.si_signo = SIGBUS;
 200        si.si_errno = 0;
 201        si.si_addr = (void *)addr;
 202#ifdef __ARCH_SI_TRAPNO
 203        si.si_trapno = trapno;
 204#endif
 205        si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
 206
 207        if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
 208                si.si_code = BUS_MCEERR_AR;
 209                ret = force_sig_info(SIGBUS, &si, current);
 210        } else {
 211                /*
 212                 * Don't use force here, it's convenient if the signal
 213                 * can be temporarily blocked.
 214                 * This could cause a loop when the user sets SIGBUS
 215                 * to SIG_IGN, but hopefully no one will do that?
 216                 */
 217                si.si_code = BUS_MCEERR_AO;
 218                ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
 219        }
 220        if (ret < 0)
 221                printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
 222                       t->comm, t->pid, ret);
 223        return ret;
 224}
 225
 226/*
  227 * When an unknown page type is encountered, drain as many buffers as possible
  228 * in the hope of turning the page into an LRU or free page, which we can handle.
 229 */
 230void shake_page(struct page *p, int access)
 231{
 232        if (!PageSlab(p)) {
 233                lru_add_drain_all();
 234                if (PageLRU(p))
 235                        return;
 236                drain_all_pages(page_zone(p));
 237                if (PageLRU(p) || is_free_buddy_page(p))
 238                        return;
 239        }
 240
 241        /*
 242         * Only call shrink_node_slabs here (which would also shrink
 243         * other caches) if access is not potentially fatal.
 244         */
 245        if (access) {
 246                int nr;
 247                int nid = page_to_nid(p);
 248                do {
 249                        nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
 250                        if (page_count(p) == 1)
 251                                break;
 252                } while (nr > 10);
 253        }
 254}
 255EXPORT_SYMBOL_GPL(shake_page);
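/*
 * Illustrative sketch (assumption, not part of the original code): the
 * typical caller pattern, as in get_any_page() further down, is to shake
 * and then re-check whether the page has settled into a state we can
 * handle.  "example_page_has_settled" is a hypothetical helper.
 */
static bool example_page_has_settled(struct page *p)
{
        shake_page(p, 1);       /* access expected, so shrink slabs too */
        return PageLRU(p) || is_free_buddy_page(p);
}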
 256
 257/*
 258 * Kill all processes that have a poisoned page mapped and then isolate
 259 * the page.
 260 *
 261 * General strategy:
 262 * Find all processes having the page mapped and kill them.
 263 * But we keep a page reference around so that the page is not
 264 * actually freed yet.
 265 * Then stash the page away
 266 *
 267 * There's no convenient way to get back to mapped processes
 268 * from the VMAs. So do a brute-force search over all
 269 * running processes.
 270 *
 271 * Remember that machine checks are not common (or rather
 272 * if they are common you have other problems), so this shouldn't
 273 * be a performance issue.
 274 *
 275 * Also there are some races possible while we get from the
 276 * error detection to actually handle it.
 277 */
 278
 279struct to_kill {
 280        struct list_head nd;
 281        struct task_struct *tsk;
 282        unsigned long addr;
 283        char addr_valid;
 284};
 285
 286/*
 287 * Failure handling: if we can't find or can't kill a process there's
  288 * not much we can do.  We just print a message and otherwise ignore the error.
 289 */
 290
 291/*
 292 * Schedule a process for later kill.
 293 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 294 * TBD would GFP_NOIO be enough?
 295 */
 296static void add_to_kill(struct task_struct *tsk, struct page *p,
 297                       struct vm_area_struct *vma,
 298                       struct list_head *to_kill,
 299                       struct to_kill **tkc)
 300{
 301        struct to_kill *tk;
 302
 303        if (*tkc) {
 304                tk = *tkc;
 305                *tkc = NULL;
 306        } else {
 307                tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 308                if (!tk) {
 309                        printk(KERN_ERR
 310                "MCE: Out of memory while machine check handling\n");
 311                        return;
 312                }
 313        }
 314        tk->addr = page_address_in_vma(p, vma);
 315        tk->addr_valid = 1;
 316
 317        /*
  318         * In theory we don't have to kill when the page was
  319         * munmapped. But it could also be an mremap. Since that's
  320         * likely very rare, kill anyway just out of paranoia, but use
  321         * SIGKILL because the error is not contained anymore.
 322         */
 323        if (tk->addr == -EFAULT) {
 324                pr_info("MCE: Unable to find user space address %lx in %s\n",
 325                        page_to_pfn(p), tsk->comm);
 326                tk->addr_valid = 0;
 327        }
 328        get_task_struct(tsk);
 329        tk->tsk = tsk;
 330        list_add_tail(&tk->nd, to_kill);
 331}
 332
 333/*
 334 * Kill the processes that have been collected earlier.
 335 *
  336 * Only do anything when FORCEKILL is set, otherwise just free the list
  337 * (this is used for clean pages which do not need killing).
  338 * Also when FAIL is set do a force kill because something went
  339 * wrong earlier.
 340 */
 341static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
 342                          int fail, struct page *page, unsigned long pfn,
 343                          int flags)
 344{
 345        struct to_kill *tk, *next;
 346
 347        list_for_each_entry_safe (tk, next, to_kill, nd) {
 348                if (forcekill) {
 349                        /*
 350                         * In case something went wrong with munmapping
 351                         * make sure the process doesn't catch the
 352                         * signal and then access the memory. Just kill it.
 353                         */
 354                        if (fail || tk->addr_valid == 0) {
 355                                printk(KERN_ERR
 356                "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 357                                        pfn, tk->tsk->comm, tk->tsk->pid);
 358                                force_sig(SIGKILL, tk->tsk);
 359                        }
 360
 361                        /*
 362                         * In theory the process could have mapped
 363                         * something else on the address in-between. We could
 364                         * check for that, but we need to tell the
 365                         * process anyways.
 366                         */
 367                        else if (kill_proc(tk->tsk, tk->addr, trapno,
 368                                              pfn, page, flags) < 0)
 369                                printk(KERN_ERR
 370                "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
 371                                        pfn, tk->tsk->comm, tk->tsk->pid);
 372                }
 373                put_task_struct(tk->tsk);
 374                kfree(tk);
 375        }
 376}
 377
 378/*
 379 * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
 380 * on behalf of the thread group. Return task_struct of the (first found)
 381 * dedicated thread if found, and return NULL otherwise.
 382 *
 383 * We already hold read_lock(&tasklist_lock) in the caller, so we don't
 384 * have to call rcu_read_lock/unlock() in this function.
 385 */
 386static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
 387{
 388        struct task_struct *t;
 389
 390        for_each_thread(tsk, t)
 391                if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
 392                        return t;
 393        return NULL;
 394}
 395
 396/*
  397 * Determine whether a given process is an "early kill" process, which expects
 398 * to be signaled when some page under the process is hwpoisoned.
 399 * Return task_struct of the dedicated thread (main thread unless explicitly
 400 * specified) if the process is "early kill," and otherwise returns NULL.
 401 */
 402static struct task_struct *task_early_kill(struct task_struct *tsk,
 403                                           int force_early)
 404{
 405        struct task_struct *t;
 406        if (!tsk->mm)
 407                return NULL;
 408        if (force_early)
 409                return tsk;
 410        t = find_early_kill_thread(tsk);
 411        if (t)
 412                return t;
 413        if (sysctl_memory_failure_early_kill)
 414                return tsk;
 415        return NULL;
 416}
 417
 418/*
 419 * Collect processes when the error hit an anonymous page.
 420 */
 421static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 422                              struct to_kill **tkc, int force_early)
 423{
 424        struct vm_area_struct *vma;
 425        struct task_struct *tsk;
 426        struct anon_vma *av;
 427        pgoff_t pgoff;
 428
 429        av = page_lock_anon_vma_read(page);
 430        if (av == NULL) /* Not actually mapped anymore */
 431                return;
 432
 433        pgoff = page_to_pgoff(page);
 434        read_lock(&tasklist_lock);
 435        for_each_process (tsk) {
 436                struct anon_vma_chain *vmac;
 437                struct task_struct *t = task_early_kill(tsk, force_early);
 438
 439                if (!t)
 440                        continue;
 441                anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 442                                               pgoff, pgoff) {
 443                        vma = vmac->vma;
 444                        if (!page_mapped_in_vma(page, vma))
 445                                continue;
 446                        if (vma->vm_mm == t->mm)
 447                                add_to_kill(t, page, vma, to_kill, tkc);
 448                }
 449        }
 450        read_unlock(&tasklist_lock);
 451        page_unlock_anon_vma_read(av);
 452}
 453
 454/*
 455 * Collect processes when the error hit a file mapped page.
 456 */
 457static void collect_procs_file(struct page *page, struct list_head *to_kill,
 458                              struct to_kill **tkc, int force_early)
 459{
 460        struct vm_area_struct *vma;
 461        struct task_struct *tsk;
 462        struct address_space *mapping = page->mapping;
 463
 464        i_mmap_lock_read(mapping);
 465        read_lock(&tasklist_lock);
 466        for_each_process(tsk) {
 467                pgoff_t pgoff = page_to_pgoff(page);
 468                struct task_struct *t = task_early_kill(tsk, force_early);
 469
 470                if (!t)
 471                        continue;
 472                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 473                                      pgoff) {
 474                        /*
  475                         * Send an early kill signal to tasks whose vma covers
  476                         * the page, even if the corrupted page is not necessarily
  477                         * mapped in their page tables.
  478                         * Assume applications that requested early kill want
 479                         * to be informed of all such data corruptions.
 480                         */
 481                        if (vma->vm_mm == t->mm)
 482                                add_to_kill(t, page, vma, to_kill, tkc);
 483                }
 484        }
 485        read_unlock(&tasklist_lock);
 486        i_mmap_unlock_read(mapping);
 487}
 488
 489/*
  490 * Collect the processes that have the corrupted page mapped, so they can be killed.
 491 * This is done in two steps for locking reasons.
 492 * First preallocate one tokill structure outside the spin locks,
  493 * so that we can kill at least one process reasonably reliably.
 494 */
 495static void collect_procs(struct page *page, struct list_head *tokill,
 496                                int force_early)
 497{
 498        struct to_kill *tk;
 499
 500        if (!page->mapping)
 501                return;
 502
 503        tk = kmalloc(sizeof(struct to_kill), GFP_NOIO);
 504        if (!tk)
 505                return;
 506        if (PageAnon(page))
 507                collect_procs_anon(page, tokill, &tk, force_early);
 508        else
 509                collect_procs_file(page, tokill, &tk, force_early);
 510        kfree(tk);
 511}
 512
 513/*
 514 * Error handlers for various types of pages.
 515 */
 516
 517enum outcome {
 518        IGNORED,        /* Error: cannot be handled */
 519        FAILED,         /* Error: handling failed */
 520        DELAYED,        /* Will be handled later */
 521        RECOVERED,      /* Successfully recovered */
 522};
 523
 524static const char *action_name[] = {
 525        [IGNORED] = "Ignored",
 526        [FAILED] = "Failed",
 527        [DELAYED] = "Delayed",
 528        [RECOVERED] = "Recovered",
 529};
 530
 531/*
 532 * XXX: It is possible that a page is isolated from LRU cache,
  533 * and then kept in the swap cache or fails to be removed from the page cache.
 534 * The page count will stop it from being freed by unpoison.
 535 * Stress tests should be aware of this memory leak problem.
 536 */
 537static int delete_from_lru_cache(struct page *p)
 538{
 539        if (!isolate_lru_page(p)) {
 540                /*
 541                 * Clear sensible page flags, so that the buddy system won't
 542                 * complain when the page is unpoison-and-freed.
 543                 */
 544                ClearPageActive(p);
 545                ClearPageUnevictable(p);
 546                /*
 547                 * drop the page count elevated by isolate_lru_page()
 548                 */
 549                page_cache_release(p);
 550                return 0;
 551        }
 552        return -EIO;
 553}
 554
 555/*
 556 * Error hit kernel page.
 557 * Do nothing, try to be lucky and not touch this instead. For a few cases we
 558 * could be more sophisticated.
 559 */
 560static int me_kernel(struct page *p, unsigned long pfn)
 561{
 562        return IGNORED;
 563}
 564
 565/*
 566 * Page in unknown state. Do nothing.
 567 */
 568static int me_unknown(struct page *p, unsigned long pfn)
 569{
 570        printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
 571        return FAILED;
 572}
 573
 574/*
 575 * Clean (or cleaned) page cache page.
 576 */
 577static int me_pagecache_clean(struct page *p, unsigned long pfn)
 578{
 579        int err;
 580        int ret = FAILED;
 581        struct address_space *mapping;
 582
 583        delete_from_lru_cache(p);
 584
 585        /*
  586         * For anonymous pages we're done; the only reference left
 587         * should be the one m_f() holds.
 588         */
 589        if (PageAnon(p))
 590                return RECOVERED;
 591
 592        /*
 593         * Now truncate the page in the page cache. This is really
  594         * more like a "temporary hole punch".
 595         * Don't do this for block devices when someone else
 596         * has a reference, because it could be file system metadata
 597         * and that's not safe to truncate.
 598         */
 599        mapping = page_mapping(p);
 600        if (!mapping) {
 601                /*
  602                 * Page has been torn down in the meantime.
 603                 */
 604                return FAILED;
 605        }
 606
 607        /*
 608         * Truncation is a bit tricky. Enable it per file system for now.
 609         *
 610         * Open: to take i_mutex or not for this? Right now we don't.
 611         */
 612        if (mapping->a_ops->error_remove_page) {
 613                err = mapping->a_ops->error_remove_page(mapping, p);
 614                if (err != 0) {
 615                        printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
 616                                        pfn, err);
 617                } else if (page_has_private(p) &&
 618                                !try_to_release_page(p, GFP_NOIO)) {
 619                        pr_info("MCE %#lx: failed to release buffers\n", pfn);
 620                } else {
 621                        ret = RECOVERED;
 622                }
 623        } else {
 624                /*
  625                 * If the file system doesn't support it, just invalidate.
  626                 * This fails on dirty pages or anything with private data.
 627                 */
 628                if (invalidate_inode_page(p))
 629                        ret = RECOVERED;
 630                else
 631                        printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
 632                                pfn);
 633        }
 634        return ret;
 635}
 636
 637/*
 638 * Dirty pagecache page
 639 * Issues: when the error hit a hole page the error is not properly
 640 * propagated.
 641 */
 642static int me_pagecache_dirty(struct page *p, unsigned long pfn)
 643{
 644        struct address_space *mapping = page_mapping(p);
 645
 646        SetPageError(p);
 647        /* TBD: print more information about the file. */
 648        if (mapping) {
 649                /*
 650                 * IO error will be reported by write(), fsync(), etc.
 651                 * who check the mapping.
 652                 * This way the application knows that something went
 653                 * wrong with its dirty file data.
 654                 *
 655                 * There's one open issue:
 656                 *
  657                 * The EIO will only be reported on the next IO
  658                 * operation and is then cleared.
  659                 * Normally Linux has two mechanisms to pass an IO error:
  660                 * first through the AS_EIO flag in the address space
  661                 * and then through the PageError flag in the page.
  662                 * Since we drop pages on memory failure handling the
  663                 * only mechanism open to us is AS_EIO.
 664                 *
 665                 * This has the disadvantage that it gets cleared on
 666                 * the first operation that returns an error, while
 667                 * the PageError bit is more sticky and only cleared
  668                 * when the page is reread or dropped.  If an
  669                 * application assumes it will always get an error on
  670                 * fsync, but does other operations on the fd first,
  671                 * and the page is dropped in between, then the error
  672                 * will not be properly reported.
 673                 *
 674                 * This can already happen even without hwpoisoned
 675                 * pages: first on metadata IO errors (which only
 676                 * report through AS_EIO) or when the page is dropped
 677                 * at the wrong time.
 678                 *
  679                 * So right now we assume that the application does the
  680                 * right thing on the first EIO, but we're no worse than other parts
 681                 * of the kernel.
 682                 */
 683                mapping_set_error(mapping, EIO);
 684        }
 685
 686        return me_pagecache_clean(p, pfn);
 687}
 688
 689/*
 690 * Clean and dirty swap cache.
 691 *
 692 * Dirty swap cache page is tricky to handle. The page could live both in page
  693 * cache and swap cache (i.e. the page was freshly swapped in). So it could be
  694 * referenced concurrently by two types of PTEs:
 695 * normal PTEs and swap PTEs. We try to handle them consistently by calling
 696 * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs,
 697 * and then
 698 *      - clear dirty bit to prevent IO
 699 *      - remove from LRU
 700 *      - but keep in the swap cache, so that when we return to it on
 701 *        a later page fault, we know the application is accessing
 702 *        corrupted data and shall be killed (we installed simple
 703 *        interception code in do_swap_page to catch it).
 704 *
 705 * Clean swap cache pages can be directly isolated. A later page fault will
 706 * bring in the known good data from disk.
 707 */
 708static int me_swapcache_dirty(struct page *p, unsigned long pfn)
 709{
 710        ClearPageDirty(p);
 711        /* Trigger EIO in shmem: */
 712        ClearPageUptodate(p);
 713
 714        if (!delete_from_lru_cache(p))
 715                return DELAYED;
 716        else
 717                return FAILED;
 718}
 719
 720static int me_swapcache_clean(struct page *p, unsigned long pfn)
 721{
 722        delete_from_swap_cache(p);
 723
 724        if (!delete_from_lru_cache(p))
 725                return RECOVERED;
 726        else
 727                return FAILED;
 728}
 729
 730/*
 731 * Huge pages. Needs work.
 732 * Issues:
  733 * - An error on a hugepage is contained in hugepage units (not raw page units).
  734 *   To narrow the kill region down to one page, we would need to break up the pmd.
 735 */
 736static int me_huge_page(struct page *p, unsigned long pfn)
 737{
 738        int res = 0;
 739        struct page *hpage = compound_head(p);
 740        /*
 741         * We can safely recover from error on free or reserved (i.e.
 742         * not in-use) hugepage by dequeuing it from freelist.
 743         * To check whether a hugepage is in-use or not, we can't use
 744         * page->lru because it can be used in other hugepage operations,
 745         * such as __unmap_hugepage_range() and gather_surplus_pages().
 746         * So instead we use page_mapping() and PageAnon().
 747         * We assume that this function is called with page lock held,
 748         * so there is no race between isolation and mapping/unmapping.
 749         */
 750        if (!(page_mapping(hpage) || PageAnon(hpage))) {
 751                res = dequeue_hwpoisoned_huge_page(hpage);
 752                if (!res)
 753                        return RECOVERED;
 754        }
 755        return DELAYED;
 756}
 757
 758/*
 759 * Various page states we can handle.
 760 *
 761 * A page state is defined by its current page->flags bits.
 762 * The table matches them in order and calls the right handler.
 763 *
  764 * This is quite tricky because we can access the page at any time
  765 * in its life cycle, so all accesses have to be extremely careful.
 766 *
 767 * This is not complete. More states could be added.
 768 * For any missing state don't attempt recovery.
 769 */
 770
 771#define dirty           (1UL << PG_dirty)
 772#define sc              (1UL << PG_swapcache)
 773#define unevict         (1UL << PG_unevictable)
 774#define mlock           (1UL << PG_mlocked)
 775#define writeback       (1UL << PG_writeback)
 776#define lru             (1UL << PG_lru)
 777#define swapbacked      (1UL << PG_swapbacked)
 778#define head            (1UL << PG_head)
 779#define tail            (1UL << PG_tail)
 780#define compound        (1UL << PG_compound)
 781#define slab            (1UL << PG_slab)
 782#define reserved        (1UL << PG_reserved)
 783
 784static struct page_state {
 785        unsigned long mask;
 786        unsigned long res;
 787        char *msg;
 788        int (*action)(struct page *p, unsigned long pfn);
 789} error_states[] = {
 790        { reserved,     reserved,       "reserved kernel",      me_kernel },
 791        /*
 792         * free pages are specially detected outside this table:
  793         * PG_buddy pages make up only a small fraction of all free pages.
 794         */
 795
 796        /*
  797         * Could in theory check if the slab page is free or if we can drop
  798         * currently unused objects without touching them. But just
  799         * treat it as a standard kernel page for now.
 800         */
 801        { slab,         slab,           "kernel slab",  me_kernel },
 802
 803#ifdef CONFIG_PAGEFLAGS_EXTENDED
 804        { head,         head,           "huge",         me_huge_page },
 805        { tail,         tail,           "huge",         me_huge_page },
 806#else
 807        { compound,     compound,       "huge",         me_huge_page },
 808#endif
 809
 810        { sc|dirty,     sc|dirty,       "dirty swapcache",      me_swapcache_dirty },
 811        { sc|dirty,     sc,             "clean swapcache",      me_swapcache_clean },
 812
 813        { mlock|dirty,  mlock|dirty,    "dirty mlocked LRU",    me_pagecache_dirty },
 814        { mlock|dirty,  mlock,          "clean mlocked LRU",    me_pagecache_clean },
 815
 816        { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
 817        { unevict|dirty, unevict,       "clean unevictable LRU", me_pagecache_clean },
 818
 819        { lru|dirty,    lru|dirty,      "dirty LRU",    me_pagecache_dirty },
 820        { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
 821
 822        /*
 823         * Catchall entry: must be at end.
 824         */
 825        { 0,            0,              "unknown page state",   me_unknown },
 826};
 827
 828#undef dirty
 829#undef sc
 830#undef unevict
 831#undef mlock
 832#undef writeback
 833#undef lru
 834#undef swapbacked
 835#undef head
 836#undef tail
 837#undef compound
 838#undef slab
 839#undef reserved
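/*
 * Worked example of the matching loop in memory_failure() below
 * ("(flags & ps->mask) == ps->res", first match wins):
 * - PG_lru|PG_dirty set (no swapcache/mlocked/unevictable bits)
 *   matches "dirty LRU"       -> me_pagecache_dirty()
 * - PG_swapcache set, PG_dirty clear
 *   matches "clean swapcache" -> me_swapcache_clean()
 * - nothing matched: the final all-zero entry catches it as
 *   "unknown page state"      -> me_unknown()
 */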
 840
 841/*
 842 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
 843 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
 844 */
 845static void action_result(unsigned long pfn, char *msg, int result)
 846{
 847        pr_err("MCE %#lx: %s page recovery: %s\n",
 848                pfn, msg, action_name[result]);
 849}
 850
 851static int page_action(struct page_state *ps, struct page *p,
 852                        unsigned long pfn)
 853{
 854        int result;
 855        int count;
 856
 857        result = ps->action(p, pfn);
 858
 859        count = page_count(p) - 1;
 860        if (ps->action == me_swapcache_dirty && result == DELAYED)
 861                count--;
 862        if (count != 0) {
 863                printk(KERN_ERR
 864                       "MCE %#lx: %s page still referenced by %d users\n",
 865                       pfn, ps->msg, count);
 866                result = FAILED;
 867        }
 868        action_result(pfn, ps->msg, result);
 869
 870        /* Could do more checks here if page looks ok */
 871        /*
 872         * Could adjust zone counters here to correct for the missing page.
 873         */
 874
 875        return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 876}
 877
 878/*
 879 * Do all that is necessary to remove user space mappings. Unmap
 880 * the pages and send SIGBUS to the processes if the data was dirty.
 881 */
 882static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 883                                  int trapno, int flags, struct page **hpagep)
 884{
 885        enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 886        struct address_space *mapping;
 887        LIST_HEAD(tokill);
 888        int ret;
 889        int kill = 1, forcekill;
 890        struct page *hpage = *hpagep;
 891        struct page *ppage;
 892
 893        /*
 894         * Here we are interested only in user-mapped pages, so skip any
 895         * other types of pages.
 896         */
 897        if (PageReserved(p) || PageSlab(p))
 898                return SWAP_SUCCESS;
 899        if (!(PageLRU(hpage) || PageHuge(p)))
 900                return SWAP_SUCCESS;
 901
 902        /*
 903         * This check implies we don't kill processes if their pages
 904         * are in the swap cache early. Those are always late kills.
 905         */
 906        if (!page_mapped(hpage))
 907                return SWAP_SUCCESS;
 908
 909        if (PageKsm(p)) {
 910                pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
 911                return SWAP_FAIL;
 912        }
 913
 914        if (PageSwapCache(p)) {
 915                printk(KERN_ERR
 916                       "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
 917                ttu |= TTU_IGNORE_HWPOISON;
 918        }
 919
 920        /*
 921         * Propagate the dirty bit from PTEs to struct page first, because we
 922         * need this to decide if we should kill or just drop the page.
 923         * XXX: the dirty test could be racy: set_page_dirty() may not always
 924         * be called inside page lock (it's recommended but not enforced).
 925         */
 926        mapping = page_mapping(hpage);
 927        if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
 928            mapping_cap_writeback_dirty(mapping)) {
 929                if (page_mkclean(hpage)) {
 930                        SetPageDirty(hpage);
 931                } else {
 932                        kill = 0;
 933                        ttu |= TTU_IGNORE_HWPOISON;
 934                        printk(KERN_INFO
 935        "MCE %#lx: corrupted page was clean: dropped without side effects\n",
 936                                pfn);
 937                }
 938        }
 939
 940        /*
 941         * ppage: poisoned page
 942         *   if p is regular page(4k page)
 943         *        ppage == real poisoned page;
 944         *   else p is hugetlb or THP, ppage == head page.
 945         */
 946        ppage = hpage;
 947
 948        if (PageTransHuge(hpage)) {
 949                /*
 950                 * Verify that this isn't a hugetlbfs head page, the check for
 951                 * PageAnon is just for avoid tripping a split_huge_page
 952                 * internal debug check, as split_huge_page refuses to deal with
 953                 * anything that isn't an anon page. PageAnon can't go away fro
 954                 * under us because we hold a refcount on the hpage, without a
 955                 * refcount on the hpage. split_huge_page can't be safely called
 956                 * in the first place, having a refcount on the tail isn't
 957                 * enough * to be safe.
 958                 */
 959                if (!PageHuge(hpage) && PageAnon(hpage)) {
 960                        if (unlikely(split_huge_page(hpage))) {
 961                                /*
  962                                 * FIXME: if splitting the THP failed, it is
 963                                 * better to stop the following operation rather
 964                                 * than causing panic by unmapping. System might
 965                                 * survive if the page is freed later.
 966                                 */
 967                                printk(KERN_INFO
 968                                        "MCE %#lx: failed to split THP\n", pfn);
 969
 970                                BUG_ON(!PageHWPoison(p));
 971                                return SWAP_FAIL;
 972                        }
 973                        /*
 974                         * We pinned the head page for hwpoison handling,
 975                         * now we split the thp and we are interested in
 976                         * the hwpoisoned raw page, so move the refcount
 977                         * to it. Similarly, page lock is shifted.
 978                         */
 979                        if (hpage != p) {
 980                                if (!(flags & MF_COUNT_INCREASED)) {
 981                                        put_page(hpage);
 982                                        get_page(p);
 983                                }
 984                                lock_page(p);
 985                                unlock_page(hpage);
 986                                *hpagep = p;
 987                        }
 988                        /* THP is split, so ppage should be the real poisoned page. */
 989                        ppage = p;
 990                }
 991        }
 992
 993        /*
 994         * First collect all the processes that have the page
 995         * mapped in dirty form.  This has to be done before try_to_unmap,
 996         * because ttu takes the rmap data structures down.
 997         *
 998         * Error handling: We ignore errors here because
 999         * there's nothing that can be done.
1000         */
1001        if (kill)
1002                collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
1003
1004        ret = try_to_unmap(ppage, ttu);
1005        if (ret != SWAP_SUCCESS)
1006                printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
1007                                pfn, page_mapcount(ppage));
1008
1009        /*
1010         * Now that the dirty bit has been propagated to the
1011         * struct page and all unmaps done we can decide if
1012         * killing is needed or not.  Only kill when the page
1013         * was dirty or the process is not restartable,
1014         * otherwise the tokill list is merely
1015         * freed.  When there was a problem unmapping earlier
 1016         * freed.  When there was a problem unmapping earlier,
 1017         * use a more forceful uncatchable kill to prevent
1018         */
1019        forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
1020        kill_procs(&tokill, forcekill, trapno,
1021                      ret != SWAP_SUCCESS, p, pfn, flags);
1022
1023        return ret;
1024}
1025
1026static void set_page_hwpoison_huge_page(struct page *hpage)
1027{
1028        int i;
1029        int nr_pages = 1 << compound_order(hpage);
1030        for (i = 0; i < nr_pages; i++)
1031                SetPageHWPoison(hpage + i);
1032}
1033
1034static void clear_page_hwpoison_huge_page(struct page *hpage)
1035{
1036        int i;
1037        int nr_pages = 1 << compound_order(hpage);
1038        for (i = 0; i < nr_pages; i++)
1039                ClearPageHWPoison(hpage + i);
1040}
1041
1042/**
1043 * memory_failure - Handle memory failure of a page.
1044 * @pfn: Page Number of the corrupted page
1045 * @trapno: Trap number reported in the signal to user space.
1046 * @flags: fine tune action taken
1047 *
1048 * This function is called by the low level machine check code
1049 * of an architecture when it detects hardware memory corruption
1050 * of a page. It tries its best to recover, which includes
1051 * dropping pages, killing processes etc.
1052 *
1053 * The function is primarily of use for corruptions that
1054 * happen outside the current execution context (e.g. when
1055 * detected by a background scrubber)
1056 *
1057 * Must run in process context (e.g. a work queue) with interrupts
 1058 * enabled and no spinlocks held.
1059 */
1060int memory_failure(unsigned long pfn, int trapno, int flags)
1061{
1062        struct page_state *ps;
1063        struct page *p;
1064        struct page *hpage;
1065        int res;
1066        unsigned int nr_pages;
1067        unsigned long page_flags;
1068
1069        if (!sysctl_memory_failure_recovery)
1070                panic("Memory failure from trap %d on page %lx", trapno, pfn);
1071
1072        if (!pfn_valid(pfn)) {
1073                printk(KERN_ERR
1074                       "MCE %#lx: memory outside kernel control\n",
1075                       pfn);
1076                return -ENXIO;
1077        }
1078
1079        p = pfn_to_page(pfn);
1080        hpage = compound_head(p);
1081        if (TestSetPageHWPoison(p)) {
1082                printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1083                return 0;
1084        }
1085
1086        /*
1087         * Currently errors on hugetlbfs pages are measured in hugepage units,
1088         * so nr_pages should be 1 << compound_order.  OTOH when errors are on
1089         * transparent hugepages, they are supposed to be split and error
1090         * measurement is done in normal page units.  So nr_pages should be one
1091         * in this case.
1092         */
1093        if (PageHuge(p))
1094                nr_pages = 1 << compound_order(hpage);
1095        else /* normal page or thp */
1096                nr_pages = 1;
1097        atomic_long_add(nr_pages, &num_poisoned_pages);
1098
1099        /*
1100         * We need/can do nothing about count=0 pages.
 1101         * 1) it's a free page, and therefore in safe hands:
1102         *    prep_new_page() will be the gate keeper.
1103         * 2) it's a free hugepage, which is also safe:
1104         *    an affected hugepage will be dequeued from hugepage freelist,
1105         *    so there's no concern about reusing it ever after.
1106         * 3) it's part of a non-compound high order page.
 1107         *    Implies some kernel user: we cannot stop them from
 1108         *    reading/writing the page; let's pray that the page has been
1109         *    used and will be freed some time later.
1110         * In fact it's dangerous to directly bump up page count from 0,
1111         * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1112         */
1113        if (!(flags & MF_COUNT_INCREASED) &&
1114                !get_page_unless_zero(hpage)) {
1115                if (is_free_buddy_page(p)) {
1116                        action_result(pfn, "free buddy", DELAYED);
1117                        return 0;
1118                } else if (PageHuge(hpage)) {
1119                        /*
1120                         * Check "filter hit" and "race with other subpage."
1121                         */
1122                        lock_page(hpage);
1123                        if (PageHWPoison(hpage)) {
1124                                if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1125                                    || (p != hpage && TestSetPageHWPoison(hpage))) {
1126                                        atomic_long_sub(nr_pages, &num_poisoned_pages);
1127                                        unlock_page(hpage);
1128                                        return 0;
1129                                }
1130                        }
1131                        set_page_hwpoison_huge_page(hpage);
1132                        res = dequeue_hwpoisoned_huge_page(hpage);
1133                        action_result(pfn, "free huge",
1134                                      res ? IGNORED : DELAYED);
1135                        unlock_page(hpage);
1136                        return res;
1137                } else {
1138                        action_result(pfn, "high order kernel", IGNORED);
1139                        return -EBUSY;
1140                }
1141        }
1142
1143        /*
1144         * We ignore non-LRU pages for good reasons.
1145         * - PG_locked is only well defined for LRU pages and a few others
1146         * - to avoid races with __set_page_locked()
1147         * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1148         * The check (unnecessarily) ignores LRU pages being isolated and
1149         * walked by the page reclaim code, however that's not a big loss.
1150         */
1151        if (!PageHuge(p) && !PageTransTail(p)) {
1152                if (!PageLRU(p))
1153                        shake_page(p, 0);
1154                if (!PageLRU(p)) {
1155                        /*
 1156                         * shake_page() could have freed the page.
1157                         */
1158                        if (is_free_buddy_page(p)) {
1159                                if (flags & MF_COUNT_INCREASED)
1160                                        action_result(pfn, "free buddy", DELAYED);
1161                                else
1162                                        action_result(pfn, "free buddy, 2nd try", DELAYED);
1163                                return 0;
1164                        }
1165                }
1166        }
1167
1168        lock_page(hpage);
1169
1170        /*
 1171         * The page's compound structure could have changed while we
 1172         * waited for the page lock. If so, just bail out.
1173         */
1174        if (compound_head(p) != hpage) {
1175                action_result(pfn, "different compound page after locking", IGNORED);
1176                res = -EBUSY;
1177                goto out;
1178        }
1179
1180        /*
1181         * We use page flags to determine what action should be taken, but
1182         * the flags can be modified by the error containment action.  One
1183         * example is an mlocked page, where PG_mlocked is cleared by
1184         * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1185         * correctly, we save a copy of the page flags at this time.
1186         */
1187        page_flags = p->flags;
1188
1189        /*
 1190         * unpoison always clears PG_hwpoison inside the page lock
1191         */
1192        if (!PageHWPoison(p)) {
1193                printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1194                atomic_long_sub(nr_pages, &num_poisoned_pages);
1195                put_page(hpage);
1196                res = 0;
1197                goto out;
1198        }
1199        if (hwpoison_filter(p)) {
1200                if (TestClearPageHWPoison(p))
1201                        atomic_long_sub(nr_pages, &num_poisoned_pages);
1202                unlock_page(hpage);
1203                put_page(hpage);
1204                return 0;
1205        }
1206
1207        if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
1208                goto identify_page_state;
1209
1210        /*
 1211         * For an error on a tail page, we should set PG_hwpoison
1212         * on the head page to show that the hugepage is hwpoisoned
1213         */
1214        if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1215                action_result(pfn, "hugepage already hardware poisoned",
1216                                IGNORED);
1217                unlock_page(hpage);
1218                put_page(hpage);
1219                return 0;
1220        }
1221        /*
1222         * Set PG_hwpoison on all pages in an error hugepage,
1223         * because containment is done in hugepage unit for now.
1224         * Since we have done TestSetPageHWPoison() for the head page with
1225         * page lock held, we can safely set PG_hwpoison bits on tail pages.
1226         */
1227        if (PageHuge(p))
1228                set_page_hwpoison_huge_page(hpage);
1229
1230        /*
1231         * It's very difficult to mess with pages currently under IO
1232         * and in many cases impossible, so we just avoid it here.
1233         */
1234        wait_on_page_writeback(p);
1235
1236        /*
1237         * Now take care of user space mappings.
1238         * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1239         *
1240         * When the raw error page is thp tail page, hpage points to the raw
1241         * page after thp split.
1242         */
1243        if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1244            != SWAP_SUCCESS) {
1245                action_result(pfn, "unmapping failed", IGNORED);
1246                res = -EBUSY;
1247                goto out;
1248        }
1249
1250        /*
1251         * Torn down by someone else?
1252         */
1253        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1254                action_result(pfn, "already truncated LRU", IGNORED);
1255                res = -EBUSY;
1256                goto out;
1257        }
1258
1259identify_page_state:
1260        res = -EBUSY;
1261        /*
1262         * The first check uses the current page flags which may not have any
 1263         * relevant information. The second check with the saved page flags is
1264         * carried out only if the first check can't determine the page status.
1265         */
1266        for (ps = error_states;; ps++)
1267                if ((p->flags & ps->mask) == ps->res)
1268                        break;
1269
1270        page_flags |= (p->flags & (1UL << PG_dirty));
1271
1272        if (!ps->mask)
1273                for (ps = error_states;; ps++)
1274                        if ((page_flags & ps->mask) == ps->res)
1275                                break;
1276        res = page_action(ps, p, pfn);
1277out:
1278        unlock_page(hpage);
1279        return res;
1280}
1281EXPORT_SYMBOL_GPL(memory_failure);
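/*
 * Illustrative sketch (assumption, not part of the original code): how a
 * platform error driver running in process context could report a
 * corrupted page.  "example_report_bad_pfn" is a hypothetical helper, not
 * a kernel API; trapno is 0 because no trap number is available outside a
 * machine check context.
 */
static int example_report_bad_pfn(unsigned long pfn)
{
        return memory_failure(pfn, 0, 0);
}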
1282
1283#define MEMORY_FAILURE_FIFO_ORDER       4
1284#define MEMORY_FAILURE_FIFO_SIZE        (1 << MEMORY_FAILURE_FIFO_ORDER)
1285
1286struct memory_failure_entry {
1287        unsigned long pfn;
1288        int trapno;
1289        int flags;
1290};
1291
1292struct memory_failure_cpu {
1293        DECLARE_KFIFO(fifo, struct memory_failure_entry,
1294                      MEMORY_FAILURE_FIFO_SIZE);
1295        spinlock_t lock;
1296        struct work_struct work;
1297};
1298
1299static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1300
1301/**
1302 * memory_failure_queue - Schedule handling memory failure of a page.
1303 * @pfn: Page Number of the corrupted page
1304 * @trapno: Trap number reported in the signal to user space.
1305 * @flags: Flags for memory failure handling
1306 *
1307 * This function is called by the low level hardware error handler
1308 * when it detects hardware memory corruption of a page. It schedules
1309 * the recovering of error page, including dropping pages, killing
1310 * processes etc.
1311 *
1312 * The function is primarily of use for corruptions that
1313 * happen outside the current execution context (e.g. when
1314 * detected by a background scrubber)
1315 *
1316 * Can run in IRQ context.
1317 */
1318void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1319{
1320        struct memory_failure_cpu *mf_cpu;
1321        unsigned long proc_flags;
1322        struct memory_failure_entry entry = {
1323                .pfn =          pfn,
1324                .trapno =       trapno,
1325                .flags =        flags,
1326        };
1327
1328        mf_cpu = &get_cpu_var(memory_failure_cpu);
1329        spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1330        if (kfifo_put(&mf_cpu->fifo, entry))
1331                schedule_work_on(smp_processor_id(), &mf_cpu->work);
1332        else
1333                pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1334                       pfn);
1335        spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1336        put_cpu_var(memory_failure_cpu);
1337}
1338EXPORT_SYMBOL_GPL(memory_failure_queue);
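/*
 * Illustrative sketch (assumption, not part of the original code):
 * memory_failure_queue() is the entry point that is safe from IRQ
 * context; the real work is deferred to the per-cpu work item below.
 * Passing MF_SOFT_OFFLINE routes the page to soft_offline_page() instead
 * of hard failure handling.  "example_corrected_error_irq" is a
 * hypothetical handler.
 */
static void example_corrected_error_irq(unsigned long bad_pfn)
{
        memory_failure_queue(bad_pfn, 0, MF_SOFT_OFFLINE);
}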
1339
1340static void memory_failure_work_func(struct work_struct *work)
1341{
1342        struct memory_failure_cpu *mf_cpu;
1343        struct memory_failure_entry entry = { 0, };
1344        unsigned long proc_flags;
1345        int gotten;
1346
1347        mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1348        for (;;) {
1349                spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1350                gotten = kfifo_get(&mf_cpu->fifo, &entry);
1351                spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1352                if (!gotten)
1353                        break;
1354                if (entry.flags & MF_SOFT_OFFLINE)
1355                        soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1356                else
1357                        memory_failure(entry.pfn, entry.trapno, entry.flags);
1358        }
1359}
1360
1361static int __init memory_failure_init(void)
1362{
1363        struct memory_failure_cpu *mf_cpu;
1364        int cpu;
1365
1366        for_each_possible_cpu(cpu) {
1367                mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1368                spin_lock_init(&mf_cpu->lock);
1369                INIT_KFIFO(mf_cpu->fifo);
1370                INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1371        }
1372
1373        return 0;
1374}
1375core_initcall(memory_failure_init);
1376
1377/**
1378 * unpoison_memory - Unpoison a previously poisoned page
1379 * @pfn: Page number of the to be unpoisoned page
1380 *
1381 * Software-unpoison a page that has been poisoned by
1382 * memory_failure() earlier.
1383 *
1384 * This is only done on the software-level, so it only works
1385 * for linux injected failures, not real hardware failures
1386 *
1387 * Returns 0 for success, otherwise -errno.
1388 */
1389int unpoison_memory(unsigned long pfn)
1390{
1391        struct page *page;
1392        struct page *p;
1393        int freeit = 0;
1394        unsigned int nr_pages;
1395
1396        if (!pfn_valid(pfn))
1397                return -ENXIO;
1398
1399        p = pfn_to_page(pfn);
1400        page = compound_head(p);
1401
1402        if (!PageHWPoison(p)) {
1403                pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1404                return 0;
1405        }
1406
1407        /*
 1408         * unpoison_memory() can encounter a thp only when the thp is being
 1409         * worked on by memory_failure() and the page lock is not held yet.
 1410         * In that case, we yield to memory_failure() and make unpoison fail.
1411         */
1412        if (!PageHuge(page) && PageTransHuge(page)) {
1413                pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
 1414                return 0;
1415        }
1416
1417        nr_pages = 1 << compound_order(page);
1418
1419        if (!get_page_unless_zero(page)) {
1420                /*
 1421                 * Since a HWPoisoned hugepage should have a non-zero refcount,
 1422                 * a race between memory failure and unpoison seems to have happened.
 1423                 * In that case unpoison fails and memory failure runs
1424                 * to the end.
1425                 */
1426                if (PageHuge(page)) {
1427                        pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1428                        return 0;
1429                }
1430                if (TestClearPageHWPoison(p))
1431                        atomic_long_dec(&num_poisoned_pages);
1432                pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1433                return 0;
1434        }
1435
1436        lock_page(page);
1437        /*
1438         * This test is racy because PG_hwpoison is set outside of the page lock.
1439         * That's acceptable because it won't trigger a kernel panic. Instead,
1440         * the PG_hwpoison page will be caught and isolated on entry to
1441         * the free buddy page pool.
1442         */
1443        if (TestClearPageHWPoison(page)) {
1444                pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1445                atomic_long_sub(nr_pages, &num_poisoned_pages);
1446                freeit = 1;
1447                if (PageHuge(page))
1448                        clear_page_hwpoison_huge_page(page);
1449        }
1450        unlock_page(page);
1451
1452        put_page(page);
1453        if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1454                put_page(page);
1455
1456        return 0;
1457}
1458EXPORT_SYMBOL(unpoison_memory);
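
/*
 * A minimal sketch of the kind of caller unpoison_memory() is meant for,
 * roughly in the spirit of the hwpoison injector (mm/hwpoison-inject.c,
 * CONFIG_HWPOISON_INJECT); unpoison_pfn() here is a hypothetical helper,
 * not an existing symbol.
 */
#if 0	/* illustrative sketch only, not compiled */
static int unpoison_pfn(u64 pfn)
{
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        /* Only clears software-injected poison; real hardware errors remain. */
        return unpoison_memory((unsigned long)pfn);
}
#endif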
1459
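/*
 * Allocation callback handed to migrate_pages() by the soft-offline paths
 * below: allocate the replacement page on the same node as the source page,
 * using a hugepage allocation when the source is a hugepage.
 */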
1460static struct page *new_page(struct page *p, unsigned long private, int **x)
1461{
1462        int nid = page_to_nid(p);
1463        if (PageHuge(p))
1464                return alloc_huge_page_node(page_hstate(compound_head(p)),
1465                                                   nid);
1466        else
1467                return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1468}
1469
1470/*
1471 * Safely get reference count of an arbitrary page.
1472 * Returns 0 for a free page, -EIO for a zero refcount page
1473 * that is not free, and 1 for any other page type.
1474 * For 1 the page is returned with increased page count, otherwise not.
1475 */
1476static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1477{
1478        int ret;
1479
1480        if (flags & MF_COUNT_INCREASED)
1481                return 1;
1482
1483        /*
1484         * When the target page is a free hugepage, just remove it
1485         * from the free hugepage list.
1486         */
1487        if (!get_page_unless_zero(compound_head(p))) {
1488                if (PageHuge(p)) {
1489                        pr_info("%s: %#lx free huge page\n", __func__, pfn);
1490                        ret = 0;
1491                } else if (is_free_buddy_page(p)) {
1492                        pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1493                        ret = 0;
1494                } else {
1495                        pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1496                                __func__, pfn, p->flags);
1497                        ret = -EIO;
1498                }
1499        } else {
1500                /* Not a free page */
1501                ret = 1;
1502        }
1503        return ret;
1504}
1505
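/*
 * Like __get_any_page(), but additionally require that an in-use page be on
 * the LRU: if it is not, drop the reference, try to shake it loose with
 * shake_page(), and look at it again.  Only LRU pages (and hugepages) can be
 * handled by the soft-offline migration code below.
 */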
1506static int get_any_page(struct page *page, unsigned long pfn, int flags)
1507{
1508        int ret = __get_any_page(page, pfn, flags);
1509
1510        if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1511                /*
1512                 * Try to free it.
1513                 */
1514                put_page(page);
1515                shake_page(page, 1);
1516
1517                /*
1518                 * Did it turn free?
1519                 */
1520                ret = __get_any_page(page, pfn, 0);
1521                if (ret == 1 && !PageLRU(page)) {
1522                        put_page(page); /* drop the reference taken by __get_any_page() */
1523                        pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags);
1524                        return -EIO;
1525                }
1526        }
1527        return ret;
1528}
1529
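/*
 * Soft offline a hugepage: migrate its contents to a freshly allocated
 * hugepage on the same node, then mark the drained source hugepage
 * HWPoison and pull it off the free lists so it cannot be reused.
 */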
1530static int soft_offline_huge_page(struct page *page, int flags)
1531{
1532        int ret;
1533        unsigned long pfn = page_to_pfn(page);
1534        struct page *hpage = compound_head(page);
1535        LIST_HEAD(pagelist);
1536
1537        /*
1538         * This double-check of PageHWPoison is to avoid the race with
1539         * memory_failure(). See also comment in __soft_offline_page().
1540         */
1541        lock_page(hpage);
1542        if (PageHWPoison(hpage)) {
1543                unlock_page(hpage);
1544                put_page(hpage);
1545                pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1546                return -EBUSY;
1547        }
1548        unlock_page(hpage);
1549
1550        /* Keep the page count raised to indicate that this hugepage is isolated. */
1551        list_move(&hpage->lru, &pagelist);
1552        ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1553                                MIGRATE_SYNC, MR_MEMORY_FAILURE);
1554        if (ret) {
1555                pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1556                        pfn, ret, page->flags);
1557                /*
1558                 * We know that soft_offline_huge_page() tries to migrate
1559                 * only the one hugepage pointed to by hpage, so there is no
1560                 * need to run through the pagelist here.
1561                 */
1562                putback_active_hugepage(hpage);
1563                if (ret > 0)
1564                        ret = -EIO;
1565        } else {
1566                /* an overcommitted hugetlb page will be freed to the buddy allocator */
1567                if (PageHuge(page)) {
1568                        set_page_hwpoison_huge_page(hpage);
1569                        dequeue_hwpoisoned_huge_page(hpage);
1570                        atomic_long_add(1 << compound_order(hpage),
1571                                        &num_poisoned_pages);
1572                } else {
1573                        SetPageHWPoison(page);
1574                        atomic_long_inc(&num_poisoned_pages);
1575                }
1576        }
1577        return ret;
1578}
1579
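/*
 * Soft offline a base page: first try the cheap invalidate_inode_page()
 * path, which works for clean, unmapped page cache pages; if that fails,
 * fall back to isolating the page and migrating its contents, marking the
 * drained source page HWPoison on success.
 */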
1580static int __soft_offline_page(struct page *page, int flags)
1581{
1582        int ret;
1583        unsigned long pfn = page_to_pfn(page);
1584
1585        /*
1586         * Check PageHWPoison again inside page lock because PageHWPoison
1587         * is set by memory_failure() outside page lock. Note that
1588         * memory_failure() also double-checks PageHWPoison inside page lock,
1589         * so there's no race between soft_offline_page() and memory_failure().
1590         */
1591        lock_page(page);
1592        wait_on_page_writeback(page);
1593        if (PageHWPoison(page)) {
1594                unlock_page(page);
1595                put_page(page);
1596                pr_info("soft offline: %#lx page already poisoned\n", pfn);
1597                return -EBUSY;
1598        }
1599        /*
1600         * Try to invalidate first. This should work for
1601         * non-dirty, unmapped page cache pages.
1602         */
1603        ret = invalidate_inode_page(page);
1604        unlock_page(page);
1605        /*
1606         * RED-PEN: it would be better to keep the page isolated here, but
1607         * we would need to fix the isolation locking first.
1608         */
1609        if (ret == 1) {
1610                put_page(page);
1611                pr_info("soft_offline: %#lx: invalidated\n", pfn);
1612                SetPageHWPoison(page);
1613                atomic_long_inc(&num_poisoned_pages);
1614                return 0;
1615        }
1616
1617        /*
1618         * Simple invalidation didn't work.
1619         * Try to migrate to a new page instead. migrate.c
1620         * handles a large number of cases for us.
1621         */
1622        ret = isolate_lru_page(page);
1623        /*
1624         * Drop the page reference that came from get_any_page();
1625         * a successful isolate_lru_page() already took another one.
1626         */
1627        put_page(page);
1628        if (!ret) {
1629                LIST_HEAD(pagelist);
1630                inc_zone_page_state(page, NR_ISOLATED_ANON +
1631                                        page_is_file_cache(page));
1632                list_add(&page->lru, &pagelist);
1633                ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1634                                        MIGRATE_SYNC, MR_MEMORY_FAILURE);
1635                if (ret) {
1636                        if (!list_empty(&pagelist)) {
1637                                list_del(&page->lru);
1638                                dec_zone_page_state(page, NR_ISOLATED_ANON +
1639                                                page_is_file_cache(page));
1640                                putback_lru_page(page);
1641                        }
1642
1643                        pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1644                                pfn, ret, page->flags);
1645                        if (ret > 0)
1646                                ret = -EIO;
1647                } else {
1648                        /*
1649                         * After page migration succeeds, the source page can
1650                         * be trapped in a pagevec and its actual freeing delayed.
1651                         * The freeing code behaves differently based on PG_hwpoison,
1652                         * so there's a race. We need to make sure that the
1653                         * source page is freed back to the buddy allocator before
1654                         * setting PG_hwpoison.
1655                         */
1656                        if (!is_free_buddy_page(page))
1657                                lru_add_drain_all();
1658                        if (!is_free_buddy_page(page))
1659                                drain_all_pages(page_zone(page));
1660                        SetPageHWPoison(page);
1661                        if (!is_free_buddy_page(page))
1662                                pr_info("soft offline: %#lx: page leaked\n",
1663                                        pfn);
1664                        atomic_long_inc(&num_poisoned_pages);
1665                }
1666        } else {
1667                pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1668                        pfn, ret, page_count(page), page->flags);
1669        }
1670        return ret;
1671}
1672
1673/**
1674 * soft_offline_page - Soft offline a page.
1675 * @page: page to offline
1676 * @flags: flags. Same as memory_failure().
1677 *
1678 * Returns 0 on success, otherwise negated errno.
1679 *
1680 * Soft offline a page, by migration or invalidation,
1681 * without killing anything. This is for the case when
1682 * a page is not corrupted yet (so it's still valid to access),
1683 * but has had a number of corrected errors and is better taken
1684 * out.
1685 *
1686 * The actual policy on when to do that is maintained by
1687 * user space.
1688 *
1689 * This should never impact any application or cause data loss;
1690 * however, it might take some time.
1691 *
1692 * This is not a 100% solution for all memory, but tries to be
1693 * ``good enough'' for the majority of memory.
1694 */
1695int soft_offline_page(struct page *page, int flags)
1696{
1697        int ret;
1698        unsigned long pfn = page_to_pfn(page);
1699        struct page *hpage = compound_head(page);
1700
1701        if (PageHWPoison(page)) {
1702                pr_info("soft offline: %#lx page already poisoned\n", pfn);
1703                return -EBUSY;
1704        }
1705        if (!PageHuge(page) && PageTransHuge(hpage)) {
1706                if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1707                        pr_info("soft offline: %#lx: failed to split THP\n",
1708                                pfn);
1709                        return -EBUSY;
1710                }
1711        }
1712
1713        get_online_mems();
1714
1715        /*
1716         * Isolate the page's pageblock so that the page doesn't get
1717         * reallocated if it was free. The MIGRATE_ISOLATE type should be
1718         * kept until the source page is freed and PG_hwpoison is set on it.
1719         */
1720        if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1721                set_migratetype_isolate(page, true);
1722
1723        ret = get_any_page(page, pfn, flags);
1724        put_online_mems();
1725        if (ret > 0) { /* for in-use pages */
1726                if (PageHuge(page))
1727                        ret = soft_offline_huge_page(page, flags);
1728                else
1729                        ret = __soft_offline_page(page, flags);
1730        } else if (ret == 0) { /* for free pages */
1731                if (PageHuge(page)) {
1732                        set_page_hwpoison_huge_page(hpage);
1733                        dequeue_hwpoisoned_huge_page(hpage);
1734                        atomic_long_add(1 << compound_order(hpage),
1735                                        &num_poisoned_pages);
1736                } else {
1737                        SetPageHWPoison(page);
1738                        atomic_long_inc(&num_poisoned_pages);
1739                }
1740        }
1741        unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1742        return ret;
1743}
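
/*
 * User space typically reaches soft_offline_page() in one of two ways, both
 * assuming a kernel built with CONFIG_MEMORY_FAILURE:
 *
 *  - madvise(addr, length, MADV_SOFT_OFFLINE) on a mapped, page-aligned
 *    range (requires CAP_SYS_ADMIN), which resolves each page and calls
 *    soft_offline_page() on it;
 *  - writing a physical address to the memory sysfs interface, e.g.
 *	echo 0x<paddr> > /sys/devices/system/memory/soft_offline_page
 *
 * The policy of when to offline a page (for example after a number of
 * corrected errors have been reported for it) is, as noted above, left
 * to user space.
 */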
1744