linux/arch/x86/kernel/cpu/mcheck/mce.c
   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/thread_info.h>
  14#include <linux/capability.h>
  15#include <linux/miscdevice.h>
  16#include <linux/ratelimit.h>
  17#include <linux/kallsyms.h>
  18#include <linux/rcupdate.h>
  19#include <linux/kobject.h>
  20#include <linux/uaccess.h>
  21#include <linux/kdebug.h>
  22#include <linux/kernel.h>
  23#include <linux/percpu.h>
  24#include <linux/string.h>
  25#include <linux/device.h>
  26#include <linux/syscore_ops.h>
  27#include <linux/delay.h>
  28#include <linux/ctype.h>
  29#include <linux/sched.h>
  30#include <linux/sysfs.h>
  31#include <linux/types.h>
  32#include <linux/slab.h>
  33#include <linux/init.h>
  34#include <linux/kmod.h>
  35#include <linux/poll.h>
  36#include <linux/nmi.h>
  37#include <linux/cpu.h>
  38#include <linux/smp.h>
  39#include <linux/fs.h>
  40#include <linux/mm.h>
  41#include <linux/debugfs.h>
  42#include <linux/irq_work.h>
  43#include <linux/export.h>
  44
  45#include <asm/intel-family.h>
  46#include <asm/processor.h>
  47#include <asm/traps.h>
  48#include <asm/mce.h>
  49#include <asm/msr.h>
  50#include <asm/reboot.h>
  51#include <asm/cacheflush.h>
  52
  53#include "mce-internal.h"
  54
  55static DEFINE_MUTEX(mce_chrdev_read_mutex);
  56
  57#define mce_log_get_idx_check(p) \
  58({ \
  59        rcu_lockdep_assert(rcu_read_lock_sched_held() || \
  60                           lockdep_is_held(&mce_chrdev_read_mutex), \
  61                           "suspicious mce_log_get_idx_check() usage"); \
  62        smp_load_acquire(&(p)); \
  63})
  64
  65#define CREATE_TRACE_POINTS
  66#include <trace/events/mce.h>
  67
  68#define SPINUNIT                100     /* 100ns */
  69
  70DEFINE_PER_CPU(unsigned, mce_exception_count);
  71
  72struct mce_bank *mce_banks __read_mostly;
  73struct mce_vendor_flags mce_flags __read_mostly;
  74
  75struct mca_config mca_cfg __read_mostly = {
  76        .bootlog  = -1,
  77        /*
  78         * Tolerant levels:
  79         * 0: always panic on uncorrected errors, log corrected errors
  80         * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  81         * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  82         * 3: never panic or SIGBUS, log all errors (for testing only)
  83         */
  84        .tolerant = 1,
  85        .monarch_timeout = -1
  86};
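
/*
 * Illustrative sketch, not part of the original file: how a caller might
 * reason about the tolerant levels documented above.  The helper name is
 * hypothetical; the real policy checks live in mce_severity() and in the
 * "tolerant < 3" tests further down in this file.
 */
static inline bool example_tolerant_allows_panic(int tolerant)
{
        /* Levels 0-2 may still panic on uncorrected errors; level 3 never does. */
        return tolerant < 3;
}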
  87
  88/* User mode helper program triggered by machine check event */
  89static unsigned long            mce_need_notify;
  90static char                     mce_helper[128];
  91static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  92
  93static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  94
  95static DEFINE_PER_CPU(struct mce, mces_seen);
  96static int                      cpu_missing;
  97
  98/*
  99 * MCA banks polled by the periodic polling timer for corrected events.
 100 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
 101 */
 102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 103        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 104};
 105
 106/*
 107 * MCA banks controlled through firmware first for corrected errors.
 108 * This is a global list of banks for which we won't enable CMCI and we
 109 * won't poll. Firmware controls these banks and is responsible for
 110 * reporting corrected errors through GHES. Uncorrected/recoverable
 111 * errors are still notified through a machine check.
 112 */
 113mce_banks_t mce_banks_ce_disabled;
 114
 115static struct work_struct mce_work;
 116static struct irq_work mce_irq_work;
 117
 118static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 119
 120/* filter false positives from panic pr_emerg */
 121static int (*quirk_noprint)(struct mce *m);
 122
 123/*
 124 * CPU/chipset specific EDAC code can register a notifier call here to print
 125 * MCE errors in a human-readable form.
 126 */
 127BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 128
 129/* Do initial initialization of a struct mce */
 130void mce_setup(struct mce *m)
 131{
 132        memset(m, 0, sizeof(struct mce));
 133        m->cpu = m->extcpu = smp_processor_id();
 134        m->tsc = rdtsc();
 135        /* We hope get_seconds stays lockless */
 136        m->time = get_seconds();
 137        m->cpuvendor = boot_cpu_data.x86_vendor;
 138        m->cpuid = cpuid_eax(1);
 139        m->socketid = cpu_data(m->extcpu).phys_proc_id;
 140        m->apicid = cpu_data(m->extcpu).initial_apicid;
 141        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 142
 143        if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 144                rdmsrl(MSR_PPIN, m->ppin);
 145
 146        m->microcode = boot_cpu_data.microcode;
 147}
 148
 149DEFINE_PER_CPU(struct mce, injectm);
 150EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 151
 152/*
 153 * Lockless MCE logging infrastructure.
 154 * This avoids deadlocks on printk locks without having to break locks. It also
 155 * separates MCEs from kernel messages to avoid bogus bug reports.
 156 */
 157
 158static struct mce_log mcelog = {
 159        .signature      = MCE_LOG_SIGNATURE,
 160        .len            = MCE_LOG_LEN,
 161        .recordlen      = sizeof(struct mce),
 162};
 163
 164void mce_log(struct mce *mce)
 165{
 166        unsigned next, entry;
 167
 168        /* Emit the trace record: */
 169        trace_mce_record(mce);
 170
 171        if (!mce_gen_pool_add(mce))
 172                irq_work_queue(&mce_irq_work);
 173
 174        wmb();
 175        for (;;) {
 176                entry = mce_log_get_idx_check(mcelog.next);
 177                for (;;) {
 178
 179                        /*
 180                         * When the buffer fills up discard new entries.
 181                         * Assume that the earlier errors are the more
 182                         * interesting ones:
 183                         */
 184                        if (entry >= MCE_LOG_LEN) {
 185                                set_bit(MCE_OVERFLOW,
 186                                        (unsigned long *)&mcelog.flags);
 187                                return;
 188                        }
 189                        /* Old left over entry. Skip: */
 190                        if (mcelog.entry[entry].finished) {
 191                                entry++;
 192                                continue;
 193                        }
 194                        break;
 195                }
 196                smp_rmb();
 197                next = entry + 1;
 198                if (cmpxchg(&mcelog.next, entry, next) == entry)
 199                        break;
 200        }
 201        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 202        wmb();
 203        mcelog.entry[entry].finished = 1;
 204        wmb();
 205
 206        set_bit(0, &mce_need_notify);
 207}
 208
 209void mce_inject_log(struct mce *m)
 210{
 211        mutex_lock(&mce_chrdev_read_mutex);
 212        mce_log(m);
 213        mutex_unlock(&mce_chrdev_read_mutex);
 214}
 215EXPORT_SYMBOL_GPL(mce_inject_log);
 216
 217static struct notifier_block mce_srao_nb;
 218
 219static atomic_t num_notifiers;
 220
 221void mce_register_decode_chain(struct notifier_block *nb)
 222{
 223        atomic_inc(&num_notifiers);
 224
 225        WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
 226
 227        blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 228}
 229EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 230
 231void mce_unregister_decode_chain(struct notifier_block *nb)
 232{
 233        atomic_dec(&num_notifiers);
 234
 235        blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 236}
 237EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
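
/*
 * Illustrative sketch, not part of the original file: how an EDAC-style
 * decoder could hook into the chain registered above.  The callback and
 * notifier names here are made up; struct notifier_block, the NOTIFY_*
 * return values, MCE_PRIO_EDAC and mce_register_decode_chain() are the
 * real interfaces.
 */
static int example_mce_decode(struct notifier_block *nb, unsigned long val,
                              void *data)
{
        struct mce *m = (struct mce *)data;

        if (!m)
                return NOTIFY_DONE;

        /* A real decoder would translate bank/status into something readable. */
        pr_info("decoded MCE: CPU %d bank %d status 0x%llx\n",
                m->extcpu, m->bank, m->status);

        return NOTIFY_OK;
}

static struct notifier_block example_mce_dec_nb = {
        .notifier_call  = example_mce_decode,
        .priority       = MCE_PRIO_EDAC,
};

/* Typically done from the decoder's module init: */
/*      mce_register_decode_chain(&example_mce_dec_nb); */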
 238
 239static inline u32 ctl_reg(int bank)
 240{
 241        return MSR_IA32_MCx_CTL(bank);
 242}
 243
 244static inline u32 status_reg(int bank)
 245{
 246        return MSR_IA32_MCx_STATUS(bank);
 247}
 248
 249static inline u32 addr_reg(int bank)
 250{
 251        return MSR_IA32_MCx_ADDR(bank);
 252}
 253
 254static inline u32 misc_reg(int bank)
 255{
 256        return MSR_IA32_MCx_MISC(bank);
 257}
 258
 259static inline u32 smca_ctl_reg(int bank)
 260{
 261        return MSR_AMD64_SMCA_MCx_CTL(bank);
 262}
 263
 264static inline u32 smca_status_reg(int bank)
 265{
 266        return MSR_AMD64_SMCA_MCx_STATUS(bank);
 267}
 268
 269static inline u32 smca_addr_reg(int bank)
 270{
 271        return MSR_AMD64_SMCA_MCx_ADDR(bank);
 272}
 273
 274static inline u32 smca_misc_reg(int bank)
 275{
 276        return MSR_AMD64_SMCA_MCx_MISC(bank);
 277}
 278
 279struct mca_msr_regs msr_ops = {
 280        .ctl    = ctl_reg,
 281        .status = status_reg,
 282        .addr   = addr_reg,
 283        .misc   = misc_reg
 284};
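
/*
 * Illustrative sketch, not part of the original file: on AMD Scalable MCA
 * (SMCA) systems the vendor init path is expected to repoint msr_ops at
 * the smca_*_reg() accessors defined above, roughly as below.  The
 * function name is hypothetical; the real switch happens in CPU init code
 * later in this file, outside this excerpt.
 */
static inline void example_use_smca_msr_ops(void)
{
        if (mce_flags.smca) {
                msr_ops.ctl     = smca_ctl_reg;
                msr_ops.status  = smca_status_reg;
                msr_ops.addr    = smca_addr_reg;
                msr_ops.misc    = smca_misc_reg;
        }
}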
 285
 286static void __print_mce(struct mce *m)
 287{
 288        if (quirk_noprint && quirk_noprint(m))
 289                return;
 290
 291        pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 292                 m->extcpu,
 293                 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 294                 m->mcgstatus, m->bank, m->status);
 295
 296        if (m->ip) {
 297                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 298                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 299                        m->cs, m->ip);
 300
 301                if (m->cs == __KERNEL_CS)
 302                        print_symbol("{%s}", m->ip);
 303                pr_cont("\n");
 304        }
 305
 306        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 307        if (m->addr)
 308                pr_cont("ADDR %llx ", m->addr);
 309        if (m->misc)
 310                pr_cont("MISC %llx ", m->misc);
 311
 312        if (mce_flags.smca) {
 313                if (m->synd)
 314                        pr_cont("SYND %llx ", m->synd);
 315                if (m->ipid)
 316                        pr_cont("IPID %llx ", m->ipid);
 317        }
 318
 319        pr_cont("\n");
 320        /*
 321         * Note this output is parsed by external tools and old fields
 322         * should not be changed.
 323         */
 324        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 325                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 326                m->microcode);
 327}
 328
 329static void print_mce(struct mce *m)
 330{
 331        __print_mce(m);
 332        pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 333}
 334
 335#define PANIC_TIMEOUT 5 /* 5 seconds */
 336
 337static atomic_t mce_panicked;
 338
 339static int fake_panic;
 340static atomic_t mce_fake_panicked;
 341
 342/* Panic in progress. Enable interrupts and wait for final IPI */
 343static void wait_for_panic(void)
 344{
 345        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 346
 347        preempt_disable();
 348        local_irq_enable();
 349        while (timeout-- > 0)
 350                udelay(1);
 351        if (panic_timeout == 0)
 352                panic_timeout = mca_cfg.panic_timeout;
 353        panic("Panicking machine check CPU died");
 354}
 355
 356static void mce_panic(const char *msg, struct mce *final, char *exp)
 357{
 358        int apei_err = 0;
 359        struct llist_node *pending;
 360        struct mce_evt_llist *l;
 361
 362        if (!fake_panic) {
 363                /*
 364                 * Make sure only one CPU runs in machine check panic
 365                 */
 366                if (atomic_inc_return(&mce_panicked) > 1)
 367                        wait_for_panic();
 368                barrier();
 369
 370                bust_spinlocks(1);
 371                console_verbose();
 372        } else {
 373                /* Don't log too much for fake panic */
 374                if (atomic_inc_return(&mce_fake_panicked) > 1)
 375                        return;
 376        }
 377        pending = mce_gen_pool_prepare_records();
 378        /* First print corrected ones that are still unlogged */
 379        llist_for_each_entry(l, pending, llnode) {
 380                struct mce *m = &l->mce;
 381                if (!(m->status & MCI_STATUS_UC)) {
 382                        print_mce(m);
 383                        if (!apei_err)
 384                                apei_err = apei_write_mce(m);
 385                }
 386        }
 387        /* Now print uncorrected but with the final one last */
 388        llist_for_each_entry(l, pending, llnode) {
 389                struct mce *m = &l->mce;
 390                if (!(m->status & MCI_STATUS_UC))
 391                        continue;
 392                if (!final || mce_cmp(m, final)) {
 393                        print_mce(m);
 394                        if (!apei_err)
 395                                apei_err = apei_write_mce(m);
 396                }
 397        }
 398        if (final) {
 399                print_mce(final);
 400                if (!apei_err)
 401                        apei_err = apei_write_mce(final);
 402        }
 403        if (cpu_missing)
 404                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 405        if (exp)
 406                pr_emerg(HW_ERR "Machine check: %s\n", exp);
 407        if (!fake_panic) {
 408                if (panic_timeout == 0)
 409                        panic_timeout = mca_cfg.panic_timeout;
 410                panic(msg);
 411        } else
 412                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 413}
 414
 415/* Support code for software error injection */
 416
 417static int msr_to_offset(u32 msr)
 418{
 419        unsigned bank = __this_cpu_read(injectm.bank);
 420
 421        if (msr == mca_cfg.rip_msr)
 422                return offsetof(struct mce, ip);
 423        if (msr == msr_ops.status(bank))
 424                return offsetof(struct mce, status);
 425        if (msr == msr_ops.addr(bank))
 426                return offsetof(struct mce, addr);
 427        if (msr == msr_ops.misc(bank))
 428                return offsetof(struct mce, misc);
 429        if (msr == MSR_IA32_MCG_STATUS)
 430                return offsetof(struct mce, mcgstatus);
 431        return -1;
 432}
 433
 434/* MSR access wrappers used for error injection */
 435static u64 mce_rdmsrl(u32 msr)
 436{
 437        u64 v;
 438
 439        if (__this_cpu_read(injectm.finished)) {
 440                int offset = msr_to_offset(msr);
 441
 442                if (offset < 0)
 443                        return 0;
 444                return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 445        }
 446
 447        if (rdmsrl_safe(msr, &v)) {
 448                WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 449                /*
 450                 * Return zero in case the access faulted. This should
 451                 * not happen normally but can happen if the CPU does
 452                 * something weird, or if the code is buggy.
 453                 */
 454                v = 0;
 455        }
 456
 457        return v;
 458}
 459
 460static void mce_wrmsrl(u32 msr, u64 v)
 461{
 462        if (__this_cpu_read(injectm.finished)) {
 463                int offset = msr_to_offset(msr);
 464
 465                if (offset >= 0)
 466                        *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 467                return;
 468        }
 469        wrmsrl(msr, v);
 470}
 471
 472/*
 473 * Collect all global (w.r.t. this processor) status about this machine
 474 * check into our "mce" struct so that we can use it later to assess
 475 * the severity of the problem as we read per-bank specific details.
 476 */
 477static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 478{
 479        mce_setup(m);
 480
 481        m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 482        if (regs) {
 483                /*
 484                 * Get the address of the instruction at the time of
 485                 * the machine check error.
 486                 */
 487                if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 488                        m->ip = regs->ip;
 489                        m->cs = regs->cs;
 490
 491                        /*
 492                         * When in VM86 mode make the cs look like ring 3
 493                         * always. This is a lie, but it's better than passing
 494                         * the additional vm86 bit around everywhere.
 495                         */
 496                        if (v8086_mode(regs))
 497                                m->cs |= 3;
 498                }
 499                /* Use accurate RIP reporting if available. */
 500                if (mca_cfg.rip_msr)
 501                        m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 502        }
 503}
 504
 505int mce_available(struct cpuinfo_x86 *c)
 506{
 507        if (mca_cfg.disabled)
 508                return 0;
 509        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 510}
 511
 512static void mce_schedule_work(void)
 513{
 514        if (!mce_gen_pool_empty())
 515                schedule_work(&mce_work);
 516}
 517
 518static void mce_irq_work_cb(struct irq_work *entry)
 519{
 520        mce_notify_irq();
 521        mce_schedule_work();
 522}
 523
 524static void mce_report_event(struct pt_regs *regs)
 525{
 526        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 527                mce_notify_irq();
 528                /*
 529                 * Triggering the work queue here is just an insurance
 530                 * policy in case the syscall exit notify handler
 531                 * doesn't run soon enough or ends up running on the
 532                 * wrong CPU (can happen when audit sleeps)
 533                 */
 534                mce_schedule_work();
 535                return;
 536        }
 537
 538        irq_work_queue(&mce_irq_work);
 539}
 540
 541/*
 542 * Check if the address reported by the CPU is in a format we can parse.
 543 * It would be possible to add code for most other cases, but all would
 544 * be somewhat complicated (e.g. segment offset would require an instruction
 545 * parser). So only support physical addresses up to page granularity for now.
 546 */
 547int mce_usable_address(struct mce *m)
 548{
 549        if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 550                return 0;
 551
 552        /* Checks after this one are Intel-specific: */
 553        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 554                return 1;
 555
 556        if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 557                return 0;
 558        if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 559                return 0;
 560        return 1;
 561}
 562EXPORT_SYMBOL_GPL(mce_usable_address);
 563
 564static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 565                                void *data)
 566{
 567        struct mce *mce = (struct mce *)data;
 568        unsigned long pfn;
 569
 570        if (!mce)
 571                return NOTIFY_DONE;
 572
 573        if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 574                pfn = mce->addr >> PAGE_SHIFT;
 575                if (!memory_failure(pfn, MCE_VECTOR, 0))
 576                        set_mce_nospec(pfn);
 577        }
 578
 579        return NOTIFY_OK;
 580}
 581static struct notifier_block mce_srao_nb = {
 582        .notifier_call  = srao_decode_notifier,
 583        .priority       = MCE_PRIO_SRAO,
 584};
 585
 586static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 587                                void *data)
 588{
 589        struct mce *m = (struct mce *)data;
 590
 591        if (!m)
 592                return NOTIFY_DONE;
 593
 594        /*
 595         * Run the default notifier only if the SRAO notifier and this
 596         * default one are the only notifiers registered.
 597         */
 598        if (atomic_read(&num_notifiers) > 2)
 599                return NOTIFY_DONE;
 600
 601        __print_mce(m);
 602
 603        return NOTIFY_DONE;
 604}
 605
 606static struct notifier_block mce_default_nb = {
 607        .notifier_call  = mce_default_notifier,
 608        /* lowest prio, we want it to run last. */
 609        .priority       = MCE_PRIO_LOWEST,
 610};
 611
 612/*
 613 * Read ADDR and MISC registers.
 614 */
 615static void mce_read_aux(struct mce *m, int i)
 616{
 617        if (m->status & MCI_STATUS_MISCV)
 618                m->misc = mce_rdmsrl(msr_ops.misc(i));
 619
 620        if (m->status & MCI_STATUS_ADDRV) {
 621                m->addr = mce_rdmsrl(msr_ops.addr(i));
 622
 623                /*
 624                 * Mask the reported address by the reported granularity.
 625                 */
 626                if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 627                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 628                        m->addr >>= shift;
 629                        m->addr <<= shift;
 630                }
 631
 632                /*
 633                 * Extract [55:<lsb>] where lsb is the least significant
 634                 * *valid* bit of the address bits.
 635                 */
 636                if (mce_flags.smca) {
 637                        u8 lsb = (m->addr >> 56) & 0x3f;
 638
 639                        m->addr &= GENMASK_ULL(55, lsb);
 640                }
 641        }
 642
 643        if (mce_flags.smca) {
 644                m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 645
 646                if (m->status & MCI_STATUS_SYNDV)
 647                        m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 648        }
 649}
 650
 651bool mce_is_memory_error(struct mce *m)
 652{
 653        if (m->cpuvendor == X86_VENDOR_AMD) {
 654                /* ErrCodeExt[20:16] */
 655                u8 xec = (m->status >> 16) & 0x1f;
 656
 657                return (xec == 0x0 || xec == 0x8);
 658        } else if (m->cpuvendor == X86_VENDOR_INTEL) {
 659                /*
 660                 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 661                 *
 662                 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 663                 * indicating a memory error. Bit 8 is used for indicating a
 664                 * cache hierarchy error. The combination of bit 2 and bit 3
 665 * is used for indicating a `generic' cache hierarchy error.
 666 * But we can't just blindly check the above bits, because if
 667 * bit 11 is set, then it is a bus/interconnect error - and
 668 * either way the above bits just give more detail on what
 669                 * bus/interconnect error happened. Note that bit 12 can be
 670                 * ignored, as it's the "filter" bit.
 671                 */
 672                return (m->status & 0xef80) == BIT(7) ||
 673                       (m->status & 0xef00) == BIT(8) ||
 674                       (m->status & 0xeffc) == 0xc;
 675        }
 676
 677        return false;
 678}
 679EXPORT_SYMBOL_GPL(mce_is_memory_error);
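
/*
 * Worked example for the Intel checks above (added for illustration, not
 * part of the original file).  A memory-read MCACOD such as 0x009f gives:
 *
 *      0x009f & 0xef80 == 0x0080 == BIT(7)     -> treated as a memory error
 *
 * while any code with bit 11 set (bus/interconnect class) does not, e.g.:
 *
 *      0x0e0b & 0xef80 == 0x0e00 != BIT(7)     -> not a memory error
 */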
 680
 681bool mce_is_correctable(struct mce *m)
 682{
 683        if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 684                return false;
 685
 686        if (m->status & MCI_STATUS_UC)
 687                return false;
 688
 689        return true;
 690}
 691EXPORT_SYMBOL_GPL(mce_is_correctable);
 692
 693DEFINE_PER_CPU(unsigned, mce_poll_count);
 694
 695/*
 696 * Poll for corrected events or events that happened before reset.
 697 * Those are just logged through /dev/mcelog.
 698 *
 699 * This is executed in standard interrupt context.
 700 *
 701 * Note: the spec recommends panicking for fatal unsignalled
 702 * errors here. However, this would be quite problematic --
 703 * we would need to reimplement the Monarch handling and
 704 * it would mess up the exclusion between the exception handler
 705 * and the poll handler -- so we skip this for now.
 706 * These cases should not happen anyway, or only when the CPU
 707 * is already totally confused. In this case it's likely it will
 708 * not fully execute the machine check handler either.
 709 */
 710bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 711{
 712        bool error_seen = false;
 713        struct mce m;
 714        int severity;
 715        int i;
 716
 717        this_cpu_inc(mce_poll_count);
 718
 719        mce_gather_info(&m, NULL);
 720
 721        for (i = 0; i < mca_cfg.banks; i++) {
 722                if (!mce_banks[i].ctl || !test_bit(i, *b))
 723                        continue;
 724
 725                m.misc = 0;
 726                m.addr = 0;
 727                m.bank = i;
 728                m.tsc = 0;
 729
 730                barrier();
 731                m.status = mce_rdmsrl(msr_ops.status(i));
 732                if (!(m.status & MCI_STATUS_VAL))
 733                        continue;
 734
 735
 736                /*
 737                 * Uncorrected or signalled events are handled by the exception
 738                 * handler when it is enabled, so don't process those here.
 739                 *
 740                 * TBD do the same check for MCI_STATUS_EN here?
 741                 */
 742                if (!(flags & MCP_UC) &&
 743                    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 744                        continue;
 745
 746                error_seen = true;
 747
 748                mce_read_aux(&m, i);
 749
 750                if (!(flags & MCP_TIMESTAMP))
 751                        m.tsc = 0;
 752
 753                severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 754
 755                if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
 756                        if (m.status & MCI_STATUS_ADDRV)
 757                                m.severity = severity;
 758
 759                /*
 760                 * Don't get the IP here because it's unlikely to
 761                 * have anything to do with the actual error location.
 762                 */
 763                if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 764                        mce_log(&m);
 765                else if (mce_usable_address(&m)) {
 766                        /*
 767                         * Although we skipped logging this, we still want
 768                         * to take action. Add to the pool so the registered
 769                         * notifiers will see it.
 770                         */
 771                        if (!mce_gen_pool_add(&m))
 772                                mce_schedule_work();
 773                }
 774
 775                /*
 776                 * Clear state for this bank.
 777                 */
 778                mce_wrmsrl(msr_ops.status(i), 0);
 779        }
 780
 781        /*
 782         * Don't clear MCG_STATUS here because it's only defined for
 783         * exceptions.
 784         */
 785
 786        sync_core();
 787
 788        return error_seen;
 789}
 790EXPORT_SYMBOL_GPL(machine_check_poll);
 791
 792/*
 793 * Do a quick check if any of the events requires a panic.
 794 * This decides if we keep the events around or clear them.
 795 */
 796static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 797                          struct pt_regs *regs)
 798{
 799        int i, ret = 0;
 800        char *tmp;
 801
 802        for (i = 0; i < mca_cfg.banks; i++) {
 803                m->status = mce_rdmsrl(msr_ops.status(i));
 804                if (m->status & MCI_STATUS_VAL) {
 805                        __set_bit(i, validp);
 806                        if (quirk_no_way_out)
 807                                quirk_no_way_out(i, m, regs);
 808                }
 809
 810                if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 811                        *msg = tmp;
 812                        ret = 1;
 813                }
 814        }
 815        return ret;
 816}
 817
 818/*
 819 * Variable to establish order between CPUs while scanning.
 820 * Each CPU spins initially until mce_executing equals its number.
 821 */
 822static atomic_t mce_executing;
 823
 824/*
 825 * Defines order of CPUs on entry. First CPU becomes Monarch.
 826 */
 827static atomic_t mce_callin;
 828
 829/*
 830 * Check if a timeout waiting for other CPUs happened.
 831 */
 832static int mce_timed_out(u64 *t, const char *msg)
 833{
 834        /*
 835         * The others already did panic for some reason.
 836         * Bail out like in a timeout.
 837         * rmb() to tell the compiler that system_state
 838         * might have been modified by someone else.
 839         */
 840        rmb();
 841        if (atomic_read(&mce_panicked))
 842                wait_for_panic();
 843        if (!mca_cfg.monarch_timeout)
 844                goto out;
 845        if ((s64)*t < SPINUNIT) {
 846                if (mca_cfg.tolerant <= 1)
 847                        mce_panic(msg, NULL, NULL);
 848                cpu_missing = 1;
 849                return 1;
 850        }
 851        *t -= SPINUNIT;
 852out:
 853        touch_nmi_watchdog();
 854        return 0;
 855}
 856
 857/*
 858 * The Monarch's reign.  The Monarch is the CPU who entered
 859 * the machine check handler first. It waits for the others to
 860 * raise the exception too and then grades them. If any
 861 * error is fatal, it panics. Only then does it let the others continue.
 862 *
 863 * The other CPUs entering the MCE handler will be controlled by the
 864 * Monarch. They are called Subjects.
 865 *
 866 * This way we prevent any potential data corruption in an unrecoverable case
 867 * and also make sure that all CPUs' errors are always examined.
 868 *
 869 * This also detects the case of a machine check event coming from outer
 870 * space (not detected by any CPU). In this case some external agent wants
 871 * us to shut down, so panic too.
 872 *
 873 * The other CPUs might still decide to panic if the handler happens
 874 * in an unrecoverable place, but in this case the system is in a semi-stable
 875 * state and won't corrupt anything by itself. It's ok to let the others
 876 * continue for a bit first.
 877 *
 878 * All the spin loops have timeouts; when a timeout happens a CPU
 879 * typically elects itself to be Monarch.
 880 */
 881static void mce_reign(void)
 882{
 883        int cpu;
 884        struct mce *m = NULL;
 885        int global_worst = 0;
 886        char *msg = NULL;
 887        char *nmsg = NULL;
 888
 889        /*
 890         * This CPU is the Monarch and the other CPUs have run
 891         * through their handlers.
 892         * Grade the severity of the errors of all the CPUs.
 893         */
 894        for_each_possible_cpu(cpu) {
 895                int severity = mce_severity(&per_cpu(mces_seen, cpu),
 896                                            mca_cfg.tolerant,
 897                                            &nmsg, true);
 898                if (severity > global_worst) {
 899                        msg = nmsg;
 900                        global_worst = severity;
 901                        m = &per_cpu(mces_seen, cpu);
 902                }
 903        }
 904
 905        /*
 906         * Cannot recover? Panic here then.
 907         * This dumps all the mces in the log buffer and stops the
 908         * other CPUs.
 909         */
 910        if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 911                mce_panic("Fatal machine check", m, msg);
 912
 913        /*
 914 * For a UC error somewhere we let the CPU that detects it handle it.
 915 * We must also let the others continue, otherwise the handling
 916         * CPU could deadlock on a lock.
 917         */
 918
 919        /*
 920         * No machine check event found. Must be some external
 921         * source or one CPU is hung. Panic.
 922         */
 923        if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 924                mce_panic("Fatal machine check from unknown source", NULL, NULL);
 925
 926        /*
 927         * Now clear all the mces_seen so that they don't reappear on
 928         * the next mce.
 929         */
 930        for_each_possible_cpu(cpu)
 931                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 932}
 933
 934static atomic_t global_nwo;
 935
 936/*
 937 * Start of Monarch synchronization. This waits until all CPUs have
 938 * entered the exception handler and then determines if any of them
 939 * saw a fatal event that requires panic. Then it executes them
 940 * in the entry order.
 941 * TBD double check parallel CPU hotunplug
 942 */
 943static int mce_start(int *no_way_out)
 944{
 945        int order;
 946        int cpus = num_online_cpus();
 947        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 948
 949        if (!timeout)
 950                return -1;
 951
 952        atomic_add(*no_way_out, &global_nwo);
 953        /*
 954         * global_nwo should be updated before mce_callin
 955         */
 956        smp_wmb();
 957        order = atomic_inc_return(&mce_callin);
 958
 959        /*
 960         * Wait for everyone.
 961         */
 962        while (atomic_read(&mce_callin) != cpus) {
 963                if (mce_timed_out(&timeout,
 964                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
 965                        atomic_set(&global_nwo, 0);
 966                        return -1;
 967                }
 968                ndelay(SPINUNIT);
 969        }
 970
 971        /*
 972         * mce_callin should be read before global_nwo
 973         */
 974        smp_rmb();
 975
 976        if (order == 1) {
 977                /*
 978                 * Monarch: Starts executing now, the others wait.
 979                 */
 980                atomic_set(&mce_executing, 1);
 981        } else {
 982                /*
 983                 * Subject: Now start the scanning loop one by one in
 984                 * the original callin order.
 985                 * This way when there are any shared banks it will be
 986                 * only seen by one CPU before cleared, avoiding duplicates.
 987                 */
 988                while (atomic_read(&mce_executing) < order) {
 989                        if (mce_timed_out(&timeout,
 990                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
 991                                atomic_set(&global_nwo, 0);
 992                                return -1;
 993                        }
 994                        ndelay(SPINUNIT);
 995                }
 996        }
 997
 998        /*
 999         * Cache the global no_way_out state.
1000         */
1001        *no_way_out = atomic_read(&global_nwo);
1002
1003        return order;
1004}
1005
1006/*
1007 * Synchronize between CPUs after main scanning loop.
1008 * This invokes the bulk of the Monarch processing.
1009 */
1010static int mce_end(int order)
1011{
1012        int ret = -1;
1013        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
1014
1015        if (!timeout)
1016                goto reset;
1017        if (order < 0)
1018                goto reset;
1019
1020        /*
1021         * Allow others to run.
1022         */
1023        atomic_inc(&mce_executing);
1024
1025        if (order == 1) {
1026                /* CHECKME: Can this race with a parallel hotplug? */
1027                int cpus = num_online_cpus();
1028
1029                /*
1030                 * Monarch: Wait for everyone to go through their scanning
1031                 * loops.
1032                 */
1033                while (atomic_read(&mce_executing) <= cpus) {
1034                        if (mce_timed_out(&timeout,
1035                                          "Timeout: Monarch CPU unable to finish machine check processing"))
1036                                goto reset;
1037                        ndelay(SPINUNIT);
1038                }
1039
1040                mce_reign();
1041                barrier();
1042                ret = 0;
1043        } else {
1044                /*
1045                 * Subject: Wait for Monarch to finish.
1046                 */
1047                while (atomic_read(&mce_executing) != 0) {
1048                        if (mce_timed_out(&timeout,
1049                                          "Timeout: Monarch CPU did not finish machine check processing"))
1050                                goto reset;
1051                        ndelay(SPINUNIT);
1052                }
1053
1054                /*
1055                 * Don't reset anything. That's done by the Monarch.
1056                 */
1057                return 0;
1058        }
1059
1060        /*
1061         * Reset all global state.
1062         */
1063reset:
1064        atomic_set(&global_nwo, 0);
1065        atomic_set(&mce_callin, 0);
1066        barrier();
1067
1068        /*
1069         * Let others run again.
1070         */
1071        atomic_set(&mce_executing, 0);
1072        return ret;
1073}
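
/*
 * Added illustration, not part of the original file: the rendezvous
 * implemented by mce_start()/mce_end() above, assuming three online CPUs
 * and no timeouts:
 *
 *  - every CPU increments mce_callin and spins until it reaches 3;
 *  - the CPU that drew order == 1 (the Monarch) sets mce_executing to 1
 *    and scans its banks; each Subject spins until mce_executing reaches
 *    its own order before scanning;
 *  - every CPU increments mce_executing in mce_end(), releasing the
 *    Subjects one by one in callin order; once mce_executing exceeds the
 *    number of CPUs, the Monarch runs mce_reign() and resets the counters.
 */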
1074
1075static void mce_clear_state(unsigned long *toclear)
1076{
1077        int i;
1078
1079        for (i = 0; i < mca_cfg.banks; i++) {
1080                if (test_bit(i, toclear))
1081                        mce_wrmsrl(msr_ops.status(i), 0);
1082        }
1083}
1084
1085static int do_memory_failure(struct mce *m)
1086{
1087        int flags = MF_ACTION_REQUIRED;
1088        int ret;
1089
1090        pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1091        if (!(m->mcgstatus & MCG_STATUS_RIPV))
1092                flags |= MF_MUST_KILL;
1093        ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags);
1094        if (ret)
1095                pr_err("Memory error not recovered");
1096        else
1097                set_mce_nospec(m->addr >> PAGE_SHIFT);
1098        return ret;
1099}
1100
1101/*
1102 * The actual machine check handler. This only handles real
1103 * exceptions when something got corrupted coming in through int 18.
1104 *
1105 * This is executed in NMI context not subject to normal locking rules. This
1106 * implies that most kernel services cannot be safely used. Don't even
1107 * think about putting a printk in there!
1108 *
1109 * On Intel systems this is entered on all CPUs in parallel through
1110 * MCE broadcast. However, some CPUs might be broken beyond repair,
1111 * so always be careful when synchronizing with others.
1112 */
1113void do_machine_check(struct pt_regs *regs, long error_code)
1114{
1115        struct mca_config *cfg = &mca_cfg;
1116        struct mce m, *final;
1117        int i;
1118        int worst = 0;
1119        int severity;
1120
1121        /*
1122         * Establish sequential order between the CPUs entering the machine
1123         * check handler.
1124         */
1125        int order = -1;
1126        /*
1127         * If no_way_out gets set, there is no safe way to recover from this
1128         * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1129         */
1130        int no_way_out = 0;
1131        /*
1132         * If kill_it gets set, there might be a way to recover from this
1133         * error.
1134         */
1135        int kill_it = 0;
1136        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1137        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1138        char *msg = "Unknown";
1139
1140        /*
1141         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1142         * on Intel.
1143         */
1144        int lmce = 1;
1145        int cpu = smp_processor_id();
1146
1147        /*
1148         * Cases where we avoid rendezvous handler timeout:
1149         * 1) If this CPU is offline.
1150         *
1151         * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1152         *  skip those CPUs which remain looping in the 1st kernel - see
1153         *  crash_nmi_callback().
1154         *
1155         * Note: there still is a small window between kexec-ing and the new,
1156         * kdump kernel establishing a new #MC handler where a broadcasted MCE
1157         * might not get handled properly.
1158         */
1159        if (cpu_is_offline(cpu) ||
1160            (crashing_cpu != -1 && crashing_cpu != cpu)) {
1161                u64 mcgstatus;
1162
1163                mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1164                if (mcgstatus & MCG_STATUS_RIPV) {
1165                        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1166                        return;
1167                }
1168        }
1169
1170        this_cpu_inc(mce_exception_count);
1171
1172        if (!cfg->banks)
1173                goto out;
1174
1175        mce_gather_info(&m, regs);
1176
1177        final = &__get_cpu_var(mces_seen);
1178        *final = m;
1179
1180        memset(valid_banks, 0, sizeof(valid_banks));
1181        no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1182
1183        barrier();
1184
1185        /*
1186         * When there is no restart IP we might need to kill or panic.
1187         * Assume the worst for now, but if we find the
1188         * severity is MCE_AR_SEVERITY we have other options.
1189         */
1190        if (!(m.mcgstatus & MCG_STATUS_RIPV))
1191                kill_it = 1;
1192
1193        /*
1194         * Check if this MCE is signaled to only this logical processor,
1195         * on Intel only.
1196         */
1197        if (m.cpuvendor == X86_VENDOR_INTEL)
1198                lmce = m.mcgstatus & MCG_STATUS_LMCES;
1199
1200        /*
1201         * Go through all banks in exclusion of the other CPUs. This way we
1202         * don't report duplicated events on shared banks because the first one
1203         * to see it will clear it. If this is a Local MCE, then no need to
1204         * perform rendezvous.
1205         */
1206        if (!lmce)
1207                order = mce_start(&no_way_out);
1208
1209        for (i = 0; i < cfg->banks; i++) {
1210                __clear_bit(i, toclear);
1211                if (!test_bit(i, valid_banks))
1212                        continue;
1213                if (!mce_banks[i].ctl)
1214                        continue;
1215
1216                m.misc = 0;
1217                m.addr = 0;
1218                m.bank = i;
1219
1220                m.status = mce_rdmsrl(msr_ops.status(i));
1221                if ((m.status & MCI_STATUS_VAL) == 0)
1222                        continue;
1223
1224                /*
1225                 * Corrected or non-signaled errors are handled by
1226                 * machine_check_poll(). Leave them alone, unless this panics.
1227                 */
1228                if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1229                        !no_way_out)
1230                        continue;
1231
1232                /*
1233                 * Set taint even when machine check was not enabled.
1234                 */
1235                add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1236
1237                severity = mce_severity(&m, cfg->tolerant, NULL, true);
1238
1239                /*
1240                 * When the machine check was for the corrected/deferred handler, don't
1241                 * touch it, unless we're panicking.
1242                 */
1243                if ((severity == MCE_KEEP_SEVERITY ||
1244                     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1245                        continue;
1246                __set_bit(i, toclear);
1247                if (severity == MCE_NO_SEVERITY) {
1248                        /*
1249                         * Machine check event was not enabled. Clear, but
1250                         * ignore.
1251                         */
1252                        continue;
1253                }
1254
1255                mce_read_aux(&m, i);
1256
1257                /* assuming valid severity level != 0 */
1258                m.severity = severity;
1259
1260                mce_log(&m);
1261
1262                if (severity > worst) {
1263                        *final = m;
1264                        worst = severity;
1265                }
1266        }
1267
1268        /* mce_clear_state will clear *final, save locally for use later */
1269        m = *final;
1270
1271        if (!no_way_out)
1272                mce_clear_state(toclear);
1273
1274        /*
1275         * Do most of the synchronization with other CPUs.
1276         * When there's any problem use only local no_way_out state.
1277         */
1278        if (!lmce) {
1279                if (mce_end(order) < 0)
1280                        no_way_out = worst >= MCE_PANIC_SEVERITY;
1281        } else {
1282                /*
1283                 * Local MCE skipped calling mce_reign().
1284                 * If we found a fatal error, we need to panic here.
1285                 */
1286                 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1287                        mce_panic("Machine check from unknown source",
1288                                NULL, NULL);
1289        }
1290
1291        /*
1292         * If tolerant is at an insane level we drop requests to kill
1293         * processes and continue even when there is no way out.
1294         */
1295        if (cfg->tolerant == 3)
1296                kill_it = 0;
1297        else if (no_way_out)
1298                mce_panic("Fatal machine check on current CPU", &m, msg);
1299
1300        if (worst > 0)
1301                mce_report_event(regs);
1302        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1303out:
1304        sync_core();
1305
1306        if (worst != MCE_AR_SEVERITY && !kill_it)
1307                return;
1308
1309        /* Fault was in user mode and we need to take some action */
1310        if ((m.cs & 3) == 3) {
1311                local_irq_enable();
1312
1313                if (kill_it || do_memory_failure(&m))
1314                        force_sig(SIGBUS, current);
1315                local_irq_disable();
1316        } else {
1317                if (!mc_fixup_exception(regs, X86_TRAP_MC))
1318                        mce_panic("Failed kernel mode recovery", &m, NULL);
1319        }
1320}
1321EXPORT_SYMBOL_GPL(do_machine_check);
1322
1323#ifndef CONFIG_MEMORY_FAILURE
1324int memory_failure(unsigned long pfn, int vector, int flags)
1325{
1326        /* mce_severity() should not hand us an ACTION_REQUIRED error */
1327        BUG_ON(flags & MF_ACTION_REQUIRED);
1328        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1329               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1330               pfn);
1331
1332        return 0;
1333}
1334#endif
1335
1336/*
1337 * Action optional processing happens here (picking up
1338 * from the list of faulting pages that do_machine_check()
1339 * placed into the genpool).
1340 */
1341static void mce_process_work(struct work_struct *dummy)
1342{
1343        mce_gen_pool_process();
1344}
1345
1346/*
1347 * Periodic polling timer for "silent" machine check errors.  If the
1348 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1349 * errors, poll 2x slower (up to check_interval seconds).
1350 */
1351static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1352
1353static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1354static DEFINE_PER_CPU(struct timer_list, mce_timer);
1355
1356static unsigned long mce_adjust_timer_default(unsigned long interval)
1357{
1358        return interval;
1359}
1360
1361static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1362
1363static void __restart_timer(struct timer_list *t, unsigned long interval)
1364{
1365        unsigned long when = jiffies + interval;
1366        unsigned long flags;
1367
1368        local_irq_save(flags);
1369
1370        if (timer_pending(t)) {
1371                if (time_before(when, t->expires))
1372                        mod_timer_pinned(t, when);
1373        } else {
1374                t->expires = round_jiffies(when);
1375                add_timer_on(t, smp_processor_id());
1376        }
1377
1378        local_irq_restore(flags);
1379}
1380
1381static void mce_timer_fn(unsigned long data)
1382{
1383        struct timer_list *t = &__get_cpu_var(mce_timer);
1384        int cpu = smp_processor_id();
1385        unsigned long iv;
1386
1387        WARN_ON(cpu != data);
1388
1389        iv = __this_cpu_read(mce_next_interval);
1390
1391        if (mce_available(__this_cpu_ptr(&cpu_info))) {
1392                machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1393
1394                if (mce_intel_cmci_poll()) {
1395                        iv = mce_adjust_timer(iv);
1396                        goto done;
1397                }
1398        }
1399
1400        /*
1401         * Alert userspace if needed. If we logged an MCE, reduce the polling
1402         * interval, otherwise increase the polling interval.
1403         */
1404        if (mce_notify_irq())
1405                iv = max(iv / 2, (unsigned long) HZ/100);
1406        else
1407                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1408
1409done:
1410        __this_cpu_write(mce_next_interval, iv);
1411        __restart_timer(t, iv);
1412}
1413
1414/*
1415 * Ensure that the timer is firing in @interval from now.
1416 */
1417void mce_timer_kick(unsigned long interval)
1418{
1419        struct timer_list *t = &__get_cpu_var(mce_timer);
1420        unsigned long iv = __this_cpu_read(mce_next_interval);
1421
1422        __restart_timer(t, interval);
1423
1424        if (interval < iv)
1425                __this_cpu_write(mce_next_interval, interval);
1426}
1427
1428/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1429static void mce_timer_delete_all(void)
1430{
1431        int cpu;
1432
1433        for_each_online_cpu(cpu)
1434                del_timer_sync(&per_cpu(mce_timer, cpu));
1435}
1436
1437static void mce_do_trigger(struct work_struct *work)
1438{
1439        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1440}
1441
1442static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
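
/*
 * Note added for clarity, not in the original file: mce_helper above is
 * normally written from user space through the machinecheck "trigger"
 * sysfs attribute defined later in this file (outside this excerpt); the
 * attribute name is stated here from memory, so treat it as an assumption.
 */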
1443
1444/*
1445 * Notify the user(s) about new machine check events.
1446 * Can be called from interrupt context, but not from machine check/NMI
1447 * context.
1448 */
1449int mce_notify_irq(void)
1450{
1451        /* Not more than two messages every minute */
1452        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1453
1454        if (test_and_clear_bit(0, &mce_need_notify)) {
1455                /* wake processes polling /dev/mcelog */
1456                wake_up_interruptible(&mce_chrdev_wait);
1457
1458                if (mce_helper[0])
1459                        schedule_work(&mce_trigger_work);
1460
1461                if (__ratelimit(&ratelimit))
1462                        pr_info(HW_ERR "Machine check events logged\n");
1463
1464                return 1;
1465        }
1466        return 0;
1467}
1468EXPORT_SYMBOL_GPL(mce_notify_irq);
1469
1470static int __mcheck_cpu_mce_banks_init(void)
1471{
1472        int i;
1473
1474        mce_banks = kzalloc(MAX_NR_BANKS * sizeof(struct mce_bank), GFP_KERNEL);
1475        if (!mce_banks)
1476                return -ENOMEM;
1477
1478        for (i = 0; i < MAX_NR_BANKS; i++) {
1479                struct mce_bank *b = &mce_banks[i];
1480
1481                b->ctl = -1ULL;
1482                b->init = 1;
1483        }
1484        return 0;
1485}
1486
1487/*
1488 * Initialize Machine Checks for a CPU.
1489 */
1490static int __mcheck_cpu_cap_init(void)
1491{
1492        u64 cap;
1493        u8 b;
1494
1495        rdmsrl(MSR_IA32_MCG_CAP, cap);
1496
1497        b = cap & MCG_BANKCNT_MASK;
1498        if (WARN_ON_ONCE(b > MAX_NR_BANKS))
1499                b = MAX_NR_BANKS;
1500
1501        mca_cfg.banks = max(mca_cfg.banks, b);
1502
1503        if (!mce_banks) {
1504                int err = __mcheck_cpu_mce_banks_init();
1505                if (err)
1506                        return err;
1507        }
1508
1509        /* Use accurate RIP reporting if available. */
1510        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1511                mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1512
1513        if (cap & MCG_SER_P)
1514                mca_cfg.ser = true;
1515
1516        return 0;
1517}
1518
1519static void __mcheck_cpu_init_generic(void)
1520{
1521        enum mcp_flags m_fl = 0;
1522        mce_banks_t all_banks;
1523        u64 cap;
1524
1525        if (!mca_cfg.bootlog)
1526                m_fl = MCP_DONTLOG;
1527
1528        /*
1529         * Log the machine checks left over from the previous reset.
1530         */
1531        bitmap_fill(all_banks, MAX_NR_BANKS);
1532        machine_check_poll(MCP_UC | m_fl, &all_banks);
1533
1534        set_in_cr4(X86_CR4_MCE);
1535
1536        rdmsrl(MSR_IA32_MCG_CAP, cap);
1537        if (cap & MCG_CTL_P)
1538                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1539}
1540
1541static void __mcheck_cpu_init_clear_banks(void)
1542{
1543        int i;
1544
1545        for (i = 0; i < mca_cfg.banks; i++) {
1546                struct mce_bank *b = &mce_banks[i];
1547
1548                if (!b->init)
1549                        continue;
1550                wrmsrl(msr_ops.ctl(i), b->ctl);
1551                wrmsrl(msr_ops.status(i), 0);
1552        }
1553}
1554
1555/*
1556 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1557 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1558 * Vol 3B Table 15-20). But this confuses both the code that determines
1559 * whether the machine check occurred in kernel or user mode, and also
1560 * the severity assessment code. Pretend that EIPV was set, and take the
1561 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1562 */
1563static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1564{
1565        if (bank != 0)
1566                return;
1567        if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1568                return;
1569        if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1570                          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1571                          MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1572                          MCACOD)) !=
1573                         (MCI_STATUS_UC|MCI_STATUS_EN|
1574                          MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1575                          MCI_STATUS_AR|MCACOD_INSTR))
1576                return;
1577
1578        m->mcgstatus |= MCG_STATUS_EIPV;
1579        m->ip = regs->ip;
1580        m->cs = regs->cs;
1581}
1582
1583static int quirk_haswell_noprint(struct mce *m)
1584{
1585        if (m->bank == 0 &&
1586            (m->status & 0xa0000000ffffffff) == 0x80000000000f0005)
1587                return 1;
1588
1589        return 0;
1590}
1591
1592/* Add per CPU specific workarounds here */
1593static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1594{
1595        struct mca_config *cfg = &mca_cfg;
1596
1597        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1598                pr_info("unknown CPU type - not enabling MCE support\n");
1599                return -EOPNOTSUPP;
1600        }
1601
1602        /* This should be disabled by the BIOS, but isn't always */
1603        if (c->x86_vendor == X86_VENDOR_AMD) {
1604                if (c->x86 == 15 && cfg->banks > 4) {
1605                        /*
1606                         * disable GART TBL walk error reporting, which
1607                         * trips off incorrectly with the IOMMU & 3ware
1608                         * & Cerberus:
1609                         */
1610                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1611                }
1612                if (c->x86 <= 17 && cfg->bootlog < 0) {
1613                        /*
1614                         * Lots of broken BIOSes around that don't clear them
1615                         * by default and leave crap in there. Don't log:
1616                         */
1617                        cfg->bootlog = 0;
1618                }
1619                /*
1620                 * Various K7s with broken bank 0 are around. Always disable
1621                 * it by default.
1622                 */
1623                if (c->x86 == 6 && cfg->banks > 0)
1624                        mce_banks[0].ctl = 0;
1625
1626                /*
1627                 * overflow_recov is supported for F15h Models 00h-0fh
1628                 * even though we don't have a CPUID bit for it.
1629                 */
1630                if (c->x86 == 0x15 && c->x86_model <= 0xf)
1631                        mce_flags.overflow_recov = 1;
1632
1633                /*
1634                 * Turn off MC4_MISC thresholding banks on those models since
1635                 * they're not supported there.
1636                 */
1637                if (c->x86 == 0x15 &&
1638                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1639                        int i;
1640                        u64 hwcr;
1641                        bool need_toggle;
1642                        u32 msrs[] = {
1643                                0x00000413, /* MC4_MISC0 */
1644                                0xc0000408, /* MC4_MISC1 */
1645                        };
1646
1647                        rdmsrl(MSR_K7_HWCR, hwcr);
1648
1649                        /* McStatusWrEn has to be set */
1650                        need_toggle = !(hwcr & BIT(18));
1651
1652                        if (need_toggle)
1653                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1654
1655                        /* Clear CntP bit safely */
1656                        for (i = 0; i < ARRAY_SIZE(msrs); i++)
1657                                msr_clear_bit(msrs[i], 62);
1658
1659                        /* restore old settings */
1660                        if (need_toggle)
1661                                wrmsrl(MSR_K7_HWCR, hwcr);
1662                }
1663        }
1664
1665        if (c->x86_vendor == X86_VENDOR_INTEL) {
1666                /*
1667                 * SDM documents that on family 6 bank 0 should not be written
1668                 * because it aliases to another special BIOS controlled
1669                 * register.
1670                 * But it's not aliased anymore on model 0x1a+
1671                 * Don't ignore bank 0 completely because there could be a
1672                 * valid event later, merely don't write CTL0.
1673                 */
1674
1675                if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1676                        mce_banks[0].init = 0;
1677
1678                /*
1679                 * All newer Intel systems support MCE broadcasting. Enable
1680                 * synchronization with a one second timeout.
1681                 */
1682                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1683                        cfg->monarch_timeout < 0)
1684                        cfg->monarch_timeout = USEC_PER_SEC;
1685
1686                /*
1687                 * There are also broken BIOSes on some Pentium M and
1688                 * earlier systems:
1689                 */
1690                if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1691                        cfg->bootlog = 0;
1692
1693                if (c->x86 == 6 && c->x86_model == 45)
1694                        quirk_no_way_out = quirk_sandybridge_ifu;
1695
1696                if (c->x86 == 6) {
1697                        switch (c->x86_model) {
1698                        case 0x3c: /* HSD131, HSM142, HSW131 */
1699                        case 0x3d: /* BDM48 */
1700                        case 0x45: /* HSM142 */
1701                        case 0x46: /* HSM142 */
1702                                pr_info("Detected Haswell/Broadwell CPU. MCE quirk HSD131, HSM142, HSW131 or BDM48 enabled.\n");
1703                                quirk_noprint = quirk_haswell_noprint;
1704                                break;
1705                        }
1706                }
1707        }
1708        if (cfg->monarch_timeout < 0)
1709                cfg->monarch_timeout = 0;
1710        if (cfg->bootlog != 0)
1711                cfg->panic_timeout = 30;
1712
1713        return 0;
1714}
1715
1716static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1717{
1718        if (c->x86 != 5)
1719                return 0;
1720
1721        switch (c->x86_vendor) {
1722        case X86_VENDOR_INTEL:
1723                intel_p5_mcheck_init(c);
1724                return 1;
1726        case X86_VENDOR_CENTAUR:
1727                winchip_mcheck_init(c);
1728                return 1;
1730        default:
1731                return 0;
1732        }
1733
1734        return 0;
1735}
1736
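/*
 * Vendor-specific initialization: Intel gets CMCI support and timer
 * adjustment; AMD fills in the MCA feature flags and, on Scalable MCA
 * parts, switches msr_ops to the SMCA register layout.
 */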
1737static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1738{
1739        switch (c->x86_vendor) {
1740        case X86_VENDOR_INTEL:
1741                mce_intel_feature_init(c);
1742                mce_adjust_timer = cmci_intel_adjust_timer;
1743                break;
1744
1745        case X86_VENDOR_AMD: {
1746                mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1747                mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1748                mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1749
1750                /*
1751                 * Install proper ops for Scalable MCA enabled processors
1752                 */
1753                if (mce_flags.smca) {
1754                        msr_ops.ctl     = smca_ctl_reg;
1755                        msr_ops.status  = smca_status_reg;
1756                        msr_ops.addr    = smca_addr_reg;
1757                        msr_ops.misc    = smca_misc_reg;
1758                }
1759                mce_amd_feature_init(c);
1760
1761                break;
1762                }
1763
1764        default:
1765                break;
1766        }
1767}
1768
1769static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1770{
1771        switch (c->x86_vendor) {
1772        case X86_VENDOR_INTEL:
1773                mce_intel_feature_clear(c);
1774                break;
1775        default:
1776                break;
1777        }
1778}
1779
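/* Arm the per-CPU polling timer unless ignore_ce is set or the interval is 0. */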
1780static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1781{
1782        unsigned long iv = check_interval * HZ;
1783
1784        if (mca_cfg.ignore_ce || !iv)
1785                return;
1786
1787        per_cpu(mce_next_interval, cpu) = iv;
1788
1789        t->expires = round_jiffies(jiffies + iv);
1790        add_timer_on(t, cpu);
1791}
1792
1793static void __mcheck_cpu_init_timer(void)
1794{
1795        struct timer_list *t = &__get_cpu_var(mce_timer);
1796        unsigned int cpu = smp_processor_id();
1797
1798        setup_timer(t, mce_timer_fn, cpu);
1799        mce_start_timer(cpu, t);
1800}
1801
1802/* Handle unconfigured int18 (should never happen) */
1803static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1804{
1805        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1806               smp_processor_id());
1807}
1808
1809/* Call the installed machine check handler for this CPU setup. */
1810void (*machine_check_vector)(struct pt_regs *, long error_code) =
1811                                                unexpected_machine_check;
1812
1813dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1814{
1815        machine_check_vector(regs, error_code);
1816}
1817
1818/*
1819 * Called for each booted CPU to set up machine checks.
1820 * Must be called with preempt off:
1821 */
1822void mcheck_cpu_init(struct cpuinfo_x86 *c)
1823{
1824        if (mca_cfg.disabled)
1825                return;
1826
1827        if (__mcheck_cpu_ancient_init(c))
1828                return;
1829
1830        if (!mce_available(c))
1831                return;
1832
1833        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1834                mca_cfg.disabled = true;
1835                return;
1836        }
1837
1838        if (mce_gen_pool_init()) {
1839                mca_cfg.disabled = true;
1840                pr_emerg("Couldn't allocate MCE records pool!\n");
1841                return;
1842        }
1843
1844        machine_check_vector = do_machine_check;
1845
1846        __mcheck_cpu_init_generic();
1847        __mcheck_cpu_init_vendor(c);
1848        __mcheck_cpu_init_clear_banks();
1849        __mcheck_cpu_init_timer();
1850}
1851
1852/*
1853 * Called for each booted CPU to clear some machine check opt-ins.
1854 */
1855void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1856{
1857        if (mca_cfg.disabled)
1858                return;
1859
1860        if (!mce_available(c))
1861                return;
1862
1863        /*
1864         * If needed, x86-generic settings could be cleared here:
1865         * __mcheck_cpu_clear_generic(c);
1866         */
1867        __mcheck_cpu_clear_vendor(c);
1868
1869}
1870
1871/*
1872 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1873 */
1874
1875static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1876static int mce_chrdev_open_count;       /* #times opened */
1877static int mce_chrdev_open_exclu;       /* already open exclusive? */
1878
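/*
 * Open /dev/mcelog. An O_EXCL open succeeds only while nobody else has
 * the device open, and then blocks all further opens until release.
 */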
1879static int mce_chrdev_open(struct inode *inode, struct file *file)
1880{
1881        spin_lock(&mce_chrdev_state_lock);
1882
1883        if (mce_chrdev_open_exclu ||
1884            (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1885                spin_unlock(&mce_chrdev_state_lock);
1886
1887                return -EBUSY;
1888        }
1889
1890        if (file->f_flags & O_EXCL)
1891                mce_chrdev_open_exclu = 1;
1892        mce_chrdev_open_count++;
1893
1894        spin_unlock(&mce_chrdev_state_lock);
1895
1896        return nonseekable_open(inode, file);
1897}
1898
1899static int mce_chrdev_release(struct inode *inode, struct file *file)
1900{
1901        spin_lock(&mce_chrdev_state_lock);
1902
1903        mce_chrdev_open_count--;
1904        mce_chrdev_open_exclu = 0;
1905
1906        spin_unlock(&mce_chrdev_state_lock);
1907
1908        return 0;
1909}
1910
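/*
 * Run on every CPU via on_each_cpu(): snapshot the local TSC so
 * mce_chrdev_read() can tell which log entries were written before the
 * synchronization point.
 */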
1911static void collect_tscs(void *data)
1912{
1913        unsigned long *cpu_tsc = (unsigned long *)data;
1914
1915        cpu_tsc[smp_processor_id()] = rdtsc();
1916}
1917
1918static int mce_apei_read_done;
1919
1920/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1921static int __mce_read_apei(char __user **ubuf, size_t usize)
1922{
1923        int rc;
1924        u64 record_id;
1925        struct mce m;
1926
1927        if (usize < sizeof(struct mce))
1928                return -EINVAL;
1929
1930        rc = apei_read_mce(&m, &record_id);
1931        /* Error or no more MCE record */
1932        if (rc <= 0) {
1933                mce_apei_read_done = 1;
1934                /*
1935                 * When ERST is disabled, mce_chrdev_read() should return
1936                 * "no record" instead of "no device."
1937                 */
1938                if (rc == -ENODEV)
1939                        return 0;
1940                return rc;
1941        }
1942        rc = -EFAULT;
1943        if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1944                return rc;
1945        /*
1946         * Ideally we would clear the record only after it has been
1947         * flushed to disk or sent over the network by /sbin/mcelog,
1948         * but there is no interface for that yet, so just clear it
1949         * here to avoid duplication.
1950         */
1951        rc = apei_clear_mce(record_id);
1952        if (rc) {
1953                mce_apei_read_done = 1;
1954                return rc;
1955        }
1956        *ubuf += sizeof(struct mce);
1957
1958        return 0;
1959}
1960
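/*
 * Read handler for /dev/mcelog: drains any APEI/ERST records from a
 * previous boot first, then copies out the whole in-kernel log
 * (MCE_LOG_LEN records); partial reads are rejected with -EINVAL.
 */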
1961static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1962                                size_t usize, loff_t *off)
1963{
1964        char __user *buf = ubuf;
1965        unsigned long *cpu_tsc;
1966        unsigned prev, next;
1967        int i, err;
1968
1969        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1970        if (!cpu_tsc)
1971                return -ENOMEM;
1972
1973        mutex_lock(&mce_chrdev_read_mutex);
1974
1975        if (!mce_apei_read_done) {
1976                err = __mce_read_apei(&buf, usize);
1977                if (err || buf != ubuf)
1978                        goto out;
1979        }
1980
1981        next = mce_log_get_idx_check(mcelog.next);
1982
1983        /* Only supports full reads right now */
1984        err = -EINVAL;
1985        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1986                goto out;
1987
1988        err = 0;
1989        prev = 0;
1990        do {
1991                for (i = prev; i < next; i++) {
1992                        unsigned long start = jiffies;
1993                        struct mce *m = &mcelog.entry[i];
1994
1995                        while (!m->finished) {
1996                                if (time_after_eq(jiffies, start + 2)) {
1997                                        memset(m, 0, sizeof(*m));
1998                                        goto timeout;
1999                                }
2000                                cpu_relax();
2001                        }
2002                        smp_rmb();
2003                        err |= copy_to_user(buf, m, sizeof(*m));
2004                        buf += sizeof(*m);
2005timeout:
2006                        ;
2007                }
2008
2009                memset(mcelog.entry + prev, 0,
2010                       (next - prev) * sizeof(struct mce));
2011                prev = next;
2012                next = cmpxchg(&mcelog.next, prev, 0);
2013        } while (next != prev);
2014
2015        synchronize_sched();
2016
2017        /*
2018         * Collect entries that were still getting written before the
2019         * synchronize.
2020         */
2021        on_each_cpu(collect_tscs, cpu_tsc, 1);
2022
2023        for (i = next; i < MCE_LOG_LEN; i++) {
2024                struct mce *m = &mcelog.entry[i];
2025
2026                if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
2027                        err |= copy_to_user(buf, m, sizeof(*m));
2028                        smp_rmb();
2029                        buf += sizeof(*m);
2030                        memset(m, 0, sizeof(*m));
2031                }
2032        }
2033
2034        if (err)
2035                err = -EFAULT;
2036
2037out:
2038        mutex_unlock(&mce_chrdev_read_mutex);
2039        kfree(cpu_tsc);
2040
2041        return err ? err : buf - ubuf;
2042}
2043
2044static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
2045{
2046        poll_wait(file, &mce_chrdev_wait, wait);
2047        if (READ_ONCE(mcelog.next))
2048                return POLLIN | POLLRDNORM;
2049        if (!mce_apei_read_done && apei_check_mce())
2050                return POLLIN | POLLRDNORM;
2051        return 0;
2052}
2053
2054static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
2055                                unsigned long arg)
2056{
2057        int __user *p = (int __user *)arg;
2058
2059        if (!capable(CAP_SYS_ADMIN))
2060                return -EPERM;
2061
2062        switch (cmd) {
2063        case MCE_GET_RECORD_LEN:
2064                return put_user(sizeof(struct mce), p);
2065        case MCE_GET_LOG_LEN:
2066                return put_user(MCE_LOG_LEN, p);
2067        case MCE_GETCLEAR_FLAGS: {
2068                unsigned flags;
2069
2070                do {
2071                        flags = mcelog.flags;
2072                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
2073
2074                return put_user(flags, p);
2075        }
2076        default:
2077                return -ENOTTY;
2078        }
2079}
2080
2081static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
2082                            size_t usize, loff_t *off);
2083
2084void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
2085                             const char __user *ubuf,
2086                             size_t usize, loff_t *off))
2087{
2088        mce_write = fn;
2089}
2090EXPORT_SYMBOL_GPL(register_mce_write_callback);
2091
2092static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
2093                                size_t usize, loff_t *off)
2094{
2095        if (mce_write)
2096                return mce_write(filp, ubuf, usize, off);
2097        else
2098                return -EINVAL;
2099}
2100
2101static const struct file_operations mce_chrdev_ops = {
2102        .open                   = mce_chrdev_open,
2103        .release                = mce_chrdev_release,
2104        .read                   = mce_chrdev_read,
2105        .write                  = mce_chrdev_write,
2106        .poll                   = mce_chrdev_poll,
2107        .unlocked_ioctl         = mce_chrdev_ioctl,
2108        .llseek                 = no_llseek,
2109};
2110
2111static struct miscdevice mce_chrdev_device = {
2112        .minor = MISC_MCELOG_MINOR,
2113        .name  = "mcelog",
2114        .fops  = &mce_chrdev_ops,
2115};
2116
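/* Runs on each CPU: drop the bank from the local poll set and disable its CMCI. */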
2117static void __mce_disable_bank(void *arg)
2118{
2119        int bank = *((int *)arg);
2120        __clear_bit(bank, __get_cpu_var(mce_poll_banks));
2121        cmci_disable_bank(bank);
2122}
2123
2124void mce_disable_bank(int bank)
2125{
2126        if (bank >= mca_cfg.banks) {
2127                pr_warn(FW_BUG
2128                        "Ignoring request to disable invalid MCA bank %d.\n",
2129                        bank);
2130                return;
2131        }
2132        set_bit(bank, mce_banks_ce_disabled);
2133        on_each_cpu(__mce_disable_bank, &bank, 1);
2134}
2135
2136/*
2137 * mce=off Disables machine check
2138 * mce=no_cmci Disables CMCI
2139 * mce=no_lmce Disables LMCE
2140 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
2141 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
2142 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
2143 *      monarchtimeout is how long to wait for other CPUs on machine
2144 *      check, or 0 to not wait
2145 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
2146 * mce=nobootlog Don't log MCEs from before booting.
2147 * mce=bios_cmci_threshold Don't program the CMCI threshold
2148 * mce=recovery Force-enable memcpy_mcsafe()
2149 */
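/*
 * For example, "mce=2,500000" sets tolerant=2 and a monarch_timeout of
 * 500000 us, while a bare "mce" just enables P5 machine checks.
 */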
2150static int __init mcheck_enable(char *str)
2151{
2152        struct mca_config *cfg = &mca_cfg;
2153
2154        if (*str == 0) {
2155                enable_p5_mce();
2156                return 1;
2157        }
2158        if (*str == '=')
2159                str++;
2160        if (!strcmp(str, "off"))
2161                cfg->disabled = true;
2162        else if (!strcmp(str, "no_cmci"))
2163                cfg->cmci_disabled = true;
2164        else if (!strcmp(str, "no_lmce"))
2165                cfg->lmce_disabled = true;
2166        else if (!strcmp(str, "dont_log_ce"))
2167                cfg->dont_log_ce = true;
2168        else if (!strcmp(str, "ignore_ce"))
2169                cfg->ignore_ce = true;
2170        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2171                cfg->bootlog = (str[0] == 'b');
2172        else if (!strcmp(str, "bios_cmci_threshold"))
2173                cfg->bios_cmci_threshold = true;
2174        else if (!strcmp(str, "recovery"))
2175                cfg->recovery = true;
2176        else if (isdigit(str[0])) {
2177                if (get_option(&str, &cfg->tolerant) == 2)
2178                        get_option(&str, &(cfg->monarch_timeout));
2179        } else {
2180                pr_info("mce argument %s ignored. Please use /sys\n", str);
2181                return 0;
2182        }
2183        return 1;
2184}
2185__setup("mce", mcheck_enable);
2186
2187int __init mcheck_init(void)
2188{
2189        mcheck_intel_therm_init();
2190        mce_register_decode_chain(&mce_srao_nb);
2191        mce_register_decode_chain(&mce_default_nb);
2192        mcheck_vendor_init_severity();
2193
2194        INIT_WORK(&mce_work, mce_process_work);
2195        init_irq_work(&mce_irq_work, mce_irq_work_cb);
2196
2197        return 0;
2198}
2199
2200/*
2201 * mce_syscore: PM support
2202 */
2203
2204/*
2205 * Disable machine checks on suspend and shutdown. We can't really handle
2206 * them later.
2207 */
2208static void mce_disable_error_reporting(void)
2209{
2210        int i;
2211
2212        for (i = 0; i < mca_cfg.banks; i++) {
2213                struct mce_bank *b = &mce_banks[i];
2214
2215                if (b->init)
2216                        wrmsrl(msr_ops.ctl(i), 0);
2217        }
2219}
2220
2221static void vendor_disable_error_reporting(void)
2222{
2223        /*
2224         * Don't clear on Intel CPUs. Some of these MSRs are socket-wide.
2225         * Disabling them for just a single offlined CPU is bad, since it will
2226         * inhibit reporting for all shared resources on the socket like the
2227         * last level cache (LLC), the integrated memory controller (iMC), etc.
2228         */
2229        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2230                return;
2231
2232        mce_disable_error_reporting();
2233}
2234
2235static int mce_syscore_suspend(void)
2236{
2237        vendor_disable_error_reporting();
2238        return 0;
2239}
2240
2241static void mce_syscore_shutdown(void)
2242{
2243        vendor_disable_error_reporting();
2244}
2245
2246/*
2247 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2248 * Only one CPU is active at this time, the others get re-added later using
2249 * CPU hotplug:
2250 */
2251static void mce_syscore_resume(void)
2252{
2253        __mcheck_cpu_init_generic();
2254        __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
2255        __mcheck_cpu_init_clear_banks();
2256}
2257
2258static struct syscore_ops mce_syscore_ops = {
2259        .suspend        = mce_syscore_suspend,
2260        .shutdown       = mce_syscore_shutdown,
2261        .resume         = mce_syscore_resume,
2262};
2263
2264/*
2265 * mce_device: Sysfs support
2266 */
2267
2268static void mce_cpu_restart(void *data)
2269{
2270        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2271                return;
2272        __mcheck_cpu_init_generic();
2273        __mcheck_cpu_init_clear_banks();
2274        __mcheck_cpu_init_timer();
2275}
2276
2277/* Reinit MCEs after user configuration changes */
2278static void mce_restart(void)
2279{
2280        mce_timer_delete_all();
2281        on_each_cpu(mce_cpu_restart, NULL, 1);
2282}
2283
2284/* Toggle features for corrected errors */
2285static void mce_disable_cmci(void *data)
2286{
2287        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2288                return;
2289        cmci_clear();
2290}
2291
2292static void mce_enable_ce(void *all)
2293{
2294        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2295                return;
2296        cmci_reenable();
2297        cmci_recheck();
2298        if (all)
2299                __mcheck_cpu_init_timer();
2300}
2301
2302static struct bus_type mce_subsys = {
2303        .name           = "machinecheck",
2304        .dev_name       = "machinecheck",
2305};
2306
2307DEFINE_PER_CPU(struct device *, mce_device);
2308
2309void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2310
2311static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2312{
2313        return container_of(attr, struct mce_bank, attr);
2314}
2315
2316static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2317                         char *buf)
2318{
2319        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2320}
2321
2322static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2323                        const char *buf, size_t size)
2324{
2325        u64 new;
2326
2327        if (kstrtou64(buf, 0, &new) < 0)
2328                return -EINVAL;
2329
2330        attr_to_bank(attr)->ctl = new;
2331        mce_restart();
2332
2333        return size;
2334}
2335
2336static ssize_t
2337show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2338{
2339        strcpy(buf, mce_helper);
2340        strcat(buf, "\n");
2341        return strlen(mce_helper) + 1;
2342}
2343
2344static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2345                                const char *buf, size_t siz)
2346{
2347        char *p;
2348
2349        strncpy(mce_helper, buf, sizeof(mce_helper));
2350        mce_helper[sizeof(mce_helper)-1] = 0;
2351        p = strchr(mce_helper, '\n');
2352
2353        if (p)
2354                *p = 0;
2355
2356        return strlen(mce_helper) + !!p;
2357}
2358
2359static ssize_t set_ignore_ce(struct device *s,
2360                             struct device_attribute *attr,
2361                             const char *buf, size_t size)
2362{
2363        u64 new;
2364
2365        if (kstrtou64(buf, 0, &new) < 0)
2366                return -EINVAL;
2367
2368        if (mca_cfg.ignore_ce ^ !!new) {
2369                if (new) {
2370                        /* disable ce features */
2371                        mce_timer_delete_all();
2372                        on_each_cpu(mce_disable_cmci, NULL, 1);
2373                        mca_cfg.ignore_ce = true;
2374                } else {
2375                        /* enable ce features */
2376                        mca_cfg.ignore_ce = false;
2377                        on_each_cpu(mce_enable_ce, (void *)1, 1);
2378                }
2379        }
2380        return size;
2381}
2382
2383static ssize_t set_cmci_disabled(struct device *s,
2384                                 struct device_attribute *attr,
2385                                 const char *buf, size_t size)
2386{
2387        u64 new;
2388
2389        if (kstrtou64(buf, 0, &new) < 0)
2390                return -EINVAL;
2391
2392        if (mca_cfg.cmci_disabled ^ !!new) {
2393                if (new) {
2394                        /* disable cmci */
2395                        on_each_cpu(mce_disable_cmci, NULL, 1);
2396                        mca_cfg.cmci_disabled = true;
2397                } else {
2398                        /* enable cmci */
2399                        mca_cfg.cmci_disabled = false;
2400                        on_each_cpu(mce_enable_ce, NULL, 1);
2401                }
2402        }
2403        return size;
2404}
2405
2406static ssize_t store_int_with_restart(struct device *s,
2407                                      struct device_attribute *attr,
2408                                      const char *buf, size_t size)
2409{
2410        ssize_t ret = device_store_int(s, attr, buf, size);
2411        mce_restart();
2412        return ret;
2413}
2414
2415static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2416static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2417static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2418static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2419
2420static struct dev_ext_attribute dev_attr_check_interval = {
2421        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2422        &check_interval
2423};
2424
2425static struct dev_ext_attribute dev_attr_ignore_ce = {
2426        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2427        &mca_cfg.ignore_ce
2428};
2429
2430static struct dev_ext_attribute dev_attr_cmci_disabled = {
2431        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2432        &mca_cfg.cmci_disabled
2433};
2434
2435static struct device_attribute *mce_device_attrs[] = {
2436        &dev_attr_tolerant.attr,
2437        &dev_attr_check_interval.attr,
2438        &dev_attr_trigger,
2439        &dev_attr_monarch_timeout.attr,
2440        &dev_attr_dont_log_ce.attr,
2441        &dev_attr_ignore_ce.attr,
2442        &dev_attr_cmci_disabled.attr,
2443        NULL
2444};
2445
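/* Tracks which CPUs currently have their per-CPU mce device registered. */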
2446static cpumask_var_t mce_device_initialized;
2447
2448static void mce_device_release(struct device *dev)
2449{
2450        kfree(dev);
2451}
2452
2453/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2454static int mce_device_create(unsigned int cpu)
2455{
2456        struct device *dev;
2457        int err;
2458        int i, j;
2459
2460        if (!mce_available(&boot_cpu_data))
2461                return -EIO;
2462
2463        dev = kzalloc(sizeof *dev, GFP_KERNEL);
2464        if (!dev)
2465                return -ENOMEM;
2466        dev->id  = cpu;
2467        dev->bus = &mce_subsys;
2468        dev->release = &mce_device_release;
2469
2470        err = device_register(dev);
2471        if (err) {
2472                put_device(dev);
2473                return err;
2474        }
2475
2476        for (i = 0; mce_device_attrs[i]; i++) {
2477                err = device_create_file(dev, mce_device_attrs[i]);
2478                if (err)
2479                        goto error;
2480        }
2481        for (j = 0; j < mca_cfg.banks; j++) {
2482                err = device_create_file(dev, &mce_banks[j].attr);
2483                if (err)
2484                        goto error2;
2485        }
2486        cpumask_set_cpu(cpu, mce_device_initialized);
2487        per_cpu(mce_device, cpu) = dev;
2488
2489        return 0;
2490error2:
2491        while (--j >= 0)
2492                device_remove_file(dev, &mce_banks[j].attr);
2493error:
2494        while (--i >= 0)
2495                device_remove_file(dev, mce_device_attrs[i]);
2496
2497        device_unregister(dev);
2498
2499        return err;
2500}
2501
2502static void mce_device_remove(unsigned int cpu)
2503{
2504        struct device *dev = per_cpu(mce_device, cpu);
2505        int i;
2506
2507        if (!cpumask_test_cpu(cpu, mce_device_initialized))
2508                return;
2509
2510        for (i = 0; mce_device_attrs[i]; i++)
2511                device_remove_file(dev, mce_device_attrs[i]);
2512
2513        for (i = 0; i < mca_cfg.banks; i++)
2514                device_remove_file(dev, &mce_banks[i].attr);
2515
2516        device_unregister(dev);
2517        cpumask_clear_cpu(cpu, mce_device_initialized);
2518        per_cpu(mce_device, cpu) = NULL;
2519}
2520
2521/* Make sure there are no machine checks on offlined CPUs. */
2522static void mce_disable_cpu(void *h)
2523{
2524        unsigned long action = *(unsigned long *)h;
2525
2526        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2527                return;
2528
2529        if (!(action & CPU_TASKS_FROZEN))
2530                cmci_clear();
2531
2532        vendor_disable_error_reporting();
2533}
2534
2535static void mce_reenable_cpu(void *h)
2536{
2537        unsigned long action = *(unsigned long *)h;
2538        int i;
2539
2540        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2541                return;
2542
2543        if (!(action & CPU_TASKS_FROZEN))
2544                cmci_reenable();
2545        for (i = 0; i < mca_cfg.banks; i++) {
2546                struct mce_bank *b = &mce_banks[i];
2547
2548                if (b->init)
2549                        wrmsrl(msr_ops.ctl(i), b->ctl);
2550        }
2551}
2552
2553/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2554static int
2555mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2556{
2557        unsigned int cpu = (unsigned long)hcpu;
2558        struct timer_list *t = &per_cpu(mce_timer, cpu);
2559
2560        switch (action & ~CPU_TASKS_FROZEN) {
2561        case CPU_ONLINE:
2562                mce_device_create(cpu);
2563                if (threshold_cpu_callback)
2564                        threshold_cpu_callback(action, cpu);
2565                break;
2566        case CPU_DEAD:
2567                if (threshold_cpu_callback)
2568                        threshold_cpu_callback(action, cpu);
2569                mce_device_remove(cpu);
2570                mce_intel_hcpu_update(cpu);
2571                break;
2572        case CPU_DOWN_PREPARE:
2573                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2574                del_timer_sync(t);
2575                break;
2576        case CPU_DOWN_FAILED:
2577                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2578                mce_start_timer(cpu, t);
2579                break;
2580        }
2581
2582        if (action == CPU_POST_DEAD) {
2583                /* intentionally ignoring frozen here */
2584                cmci_rediscover();
2585        }
2586
2587        return NOTIFY_OK;
2588}
2589
2590static struct notifier_block mce_cpu_notifier = {
2591        .notifier_call = mce_cpu_callback,
2592};
2593
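/*
 * Set up the sysfs attribute ("bankN") for each MCA bank so its control
 * value can be read and written via show_bank()/set_bank().
 */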
2594static __init void mce_init_banks(void)
2595{
2596        int i;
2597
2598        for (i = 0; i < mca_cfg.banks; i++) {
2599                struct mce_bank *b = &mce_banks[i];
2600                struct device_attribute *a = &b->attr;
2601
2602                sysfs_attr_init(&a->attr);
2603                a->attr.name    = b->attrname;
2604                snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2605
2606                a->attr.mode    = 0644;
2607                a->show         = show_bank;
2608                a->store        = set_bank;
2609        }
2610}
2611
2612static __init int mcheck_init_device(void)
2613{
2614        int err;
2615        int i = 0;
2616
2617        if (!mce_available(&boot_cpu_data)) {
2618                err = -EIO;
2619                goto err_out;
2620        }
2621
2622        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2623                err = -ENOMEM;
2624                goto err_out;
2625        }
2626
2627        mce_init_banks();
2628
2629        err = subsys_system_register(&mce_subsys, NULL);
2630        if (err)
2631                goto err_out_mem;
2632
2633        cpu_notifier_register_begin();
2634        for_each_online_cpu(i) {
2635                err = mce_device_create(i);
2636                if (err) {
2637                        /*
2638                         * Register notifier anyway (and do not unreg it) so
2639                         * that we don't leave undeleted timers, see notifier
2640                         * callback above.
2641                         */
2642                        __register_hotcpu_notifier(&mce_cpu_notifier);
2643                        cpu_notifier_register_done();
2644                        goto err_device_create;
2645                }
2646        }
2647
2648        __register_hotcpu_notifier(&mce_cpu_notifier);
2649        cpu_notifier_register_done();
2650
2651        register_syscore_ops(&mce_syscore_ops);
2652
2653        /* register character device /dev/mcelog */
2654        err = misc_register(&mce_chrdev_device);
2655        if (err)
2656                goto err_register;
2657
2658        return 0;
2659
2660err_register:
2661        unregister_syscore_ops(&mce_syscore_ops);
2662
2663err_device_create:
2664        /*
2665         * We didn't keep track of which devices were created above, but
2666         * even if we had, the set of online cpus might have changed.
2667         * Play safe and remove for every possible cpu, since
2668         * mce_device_remove() will do the right thing.
2669         */
2670        for_each_possible_cpu(i)
2671                mce_device_remove(i);
2672
2673err_out_mem:
2674        free_cpumask_var(mce_device_initialized);
2675
2676err_out:
2677        pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2678
2679        return err;
2680}
2681device_initcall_sync(mcheck_init_device);
2682
2683/*
2684 * Old style boot options parsing. Only for compatibility.
2685 */
2686static int __init mcheck_disable(char *str)
2687{
2688        mca_cfg.disabled = true;
2689        return 1;
2690}
2691__setup("nomce", mcheck_disable);
2692
2693#ifdef CONFIG_DEBUG_FS
2694struct dentry *mce_get_debugfs_dir(void)
2695{
2696        static struct dentry *dmce;
2697
2698        if (!dmce)
2699                dmce = debugfs_create_dir("mce", NULL);
2700
2701        return dmce;
2702}
2703
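/* Reset the MCE rendezvous state so a fake panic test starts from scratch. */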
2704static void mce_reset(void)
2705{
2706        cpu_missing = 0;
2707        atomic_set(&mce_fake_panicked, 0);
2708        atomic_set(&mce_executing, 0);
2709        atomic_set(&mce_callin, 0);
2710        atomic_set(&global_nwo, 0);
2711}
2712
2713static int fake_panic_get(void *data, u64 *val)
2714{
2715        *val = fake_panic;
2716        return 0;
2717}
2718
2719static int fake_panic_set(void *data, u64 val)
2720{
2721        mce_reset();
2722        fake_panic = val;
2723        return 0;
2724}
2725
2726DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2727                        fake_panic_set, "%llu\n");
2728
2729static int __init mcheck_debugfs_init(void)
2730{
2731        struct dentry *dmce, *ffake_panic;
2732
2733        dmce = mce_get_debugfs_dir();
2734        if (!dmce)
2735                return -ENOMEM;
2736        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2737                                          &fake_panic_fops);
2738        if (!ffake_panic)
2739                return -ENOMEM;
2740
2741        return 0;
2742}
2743#else
2744static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2745#endif
2746
2747struct static_key mcsafe_key = STATIC_KEY_INIT_FALSE;
2748EXPORT_SYMBOL_GPL(mcsafe_key);
2749
2750static int __init mcheck_late_init(void)
2751{
2752        pr_info("Using %d MCE banks\n", mca_cfg.banks);
2753
2754        if (mca_cfg.recovery)
2755                static_key_slow_inc(&mcsafe_key);
2756
2757        mcheck_debugfs_init();
2758
2759        /*
2760         * Flush out everything that has been logged during early boot, now that
2761         * everything has been initialized (workqueues, decoders, ...).
2762         */
2763        mce_schedule_work();
2764
2765        return 0;
2766}
2767late_initcall(mcheck_late_init);
2768