LXR linux/arch/x86/kernel/cpu/mcheck/mce.c

   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/thread_info.h>
  14#include <linux/capability.h>
  15#include <linux/miscdevice.h>
  16#include <linux/ratelimit.h>
  17#include <linux/rcupdate.h>
  18#include <linux/kobject.h>
  19#include <linux/uaccess.h>
  20#include <linux/kdebug.h>
  21#include <linux/kernel.h>
  22#include <linux/percpu.h>
  23#include <linux/string.h>
  24#include <linux/device.h>
  25#include <linux/syscore_ops.h>
  26#include <linux/delay.h>
  27#include <linux/ctype.h>
  28#include <linux/sched.h>
  29#include <linux/sysfs.h>
  30#include <linux/types.h>
  31#include <linux/slab.h>
  32#include <linux/init.h>
  33#include <linux/kmod.h>
  34#include <linux/poll.h>
  35#include <linux/nmi.h>
  36#include <linux/cpu.h>
  37#include <linux/ras.h>
  38#include <linux/smp.h>
  39#include <linux/fs.h>
  40#include <linux/mm.h>
  41#include <linux/debugfs.h>
  42#include <linux/irq_work.h>
  43#include <linux/export.h>
  44#include <linux/jump_label.h>
  45
  46#include <asm/intel-family.h>
  47#include <asm/processor.h>
  48#include <asm/traps.h>
  49#include <asm/tlbflush.h>
  50#include <asm/mce.h>
  51#include <asm/msr.h>
  52#include <asm/reboot.h>
  53#include <asm/set_memory.h>
  54
  55#include "mce-internal.h"
  56
  57static DEFINE_MUTEX(mce_log_mutex);	/* held by mce_inject_log() around its mce_log() call */
  58
  59/* sysfs synchronization */
  60static DEFINE_MUTEX(mce_sysfs_mutex);
  61
  62#define CREATE_TRACE_POINTS
  63#include <trace/events/mce.h>
  64
  65#define SPINUNIT                100     /* 100ns: ndelay() step and timeout decrement used by the CPU synchronization spin loops */
  66/* NOTE(review): no user in this part of the file; presumably incremented by the #MC exception handler - confirm */
  67DEFINE_PER_CPU(unsigned, mce_exception_count);
  68
  69struct mce_bank *mce_banks __read_mostly;	/* indexed by bank number; a zero .ctl makes machine_check_poll() skip that bank */
  70struct mce_vendor_flags mce_flags __read_mostly;	/* .smca gates the SYND/IPID handling in __print_mce() and mce_read_aux() */
  71
  72struct mca_config mca_cfg __read_mostly = {	/* global machine-check configuration with its built-in defaults */
  73        .bootlog  = -1,	/* NOTE(review): -1 presumably means "not chosen yet" - confirm against the init code */
  74        /*
  75         * Tolerant levels:
  76         * 0: always panic on uncorrected errors, log corrected errors
  77         * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  78         * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  79         * 3: never panic or SIGBUS, log all errors (for testing only)
  80         */
  81        .tolerant = 1,
  82        .monarch_timeout = -1	/* mce_start()/mce_end() scale this by NSEC_PER_USEC; 0 disables the rendezvous. NOTE(review): -1 is presumably replaced by a default during init - confirm */
  83};
  84
  85static DEFINE_PER_CPU(struct mce, mces_seen);	/* per-CPU record graded and then zeroed by the Monarch in mce_reign() */
  86static unsigned long mce_need_notify;	/* bit 0 is set by mce_first_notifier() before it calls mce_notify_irq() */
  87static int cpu_missing;	/* set by mce_timed_out() on a rendezvous timeout; reported by mce_panic() */
  88
  89/*
  90 * MCA banks polled by the periodic polling timer for corrected events.
  91 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  92 * The initializer sets every bit, i.e. all banks start out as polled.
  93 */
  93DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  94        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  95};
  96
  97/*
  98 * MCA banks controlled through firmware first for corrected errors.
  99 * This is a global list of banks for which we won't enable CMCI and we
 100 * won't poll. Firmware controls these banks and is responsible for
 101 * reporting corrected errors through GHES. Uncorrected/recoverable
 102 * errors are still notified through a machine check.
 103 */
 104mce_banks_t mce_banks_ce_disabled;
 105
 106static struct work_struct mce_work;	/* scheduled by mce_schedule_work() while the gen pool is non-empty */
 107static struct irq_work mce_irq_work;	/* queued by mce_log() and mce_report_event(); NOTE(review): presumably bound to mce_irq_work_cb() at init - confirm */
 108
 109static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);	/* optional hook; mce_no_way_out() calls it for each valid bank when non-NULL */
 110
 111#ifndef mce_unmap_kpfn
 112static void mce_unmap_kpfn(unsigned long pfn);	/* forward declaration; called by srao_decode_notifier() */
 113#endif
 114
 115/*
 116 * CPU/chipset specific EDAC code can register a notifier call here to print
 117 * MCE errors in a human-readable form.
 118 */
 119BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 120
 121/* Do initial initialization of a struct mce */
 122void mce_setup(struct mce *m)
 123{
 124        memset(m, 0, sizeof(struct mce));
 125        m->cpu = m->extcpu = smp_processor_id();
 126        /* We hope get_seconds stays lockless */
 127        m->time = get_seconds();
 128        m->cpuvendor = boot_cpu_data.x86_vendor;
 129        m->cpuid = cpuid_eax(1);
 130        m->socketid = cpu_data(m->extcpu).phys_proc_id;
 131        m->apicid = cpu_data(m->extcpu).initial_apicid;
 132        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 133
 134        if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 135                rdmsrl(MSR_PPIN, m->ppin);
 136
 137        m->microcode = boot_cpu_data.microcode;
 138}
 139
 140DEFINE_PER_CPU(struct mce, injectm);	/* per-CPU injected record: while injectm.finished is set, mce_rdmsrl()/mce_wrmsrl() use it instead of the MSRs */
 141EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 142
 143void mce_log(struct mce *m)
 144{
 145        if (!mce_gen_pool_add(m))
 146                irq_work_queue(&mce_irq_work);
 147}
 148
 149void mce_inject_log(struct mce *m)
 150{
 151        mutex_lock(&mce_log_mutex);
 152        mce_log(m);
 153        mutex_unlock(&mce_log_mutex);
 154}
 155EXPORT_SYMBOL_GPL(mce_inject_log);
 156
 157static struct notifier_block mce_srao_nb;	/* forward declaration; the initialized definition follows srao_decode_notifier() */
 158
 159/*
 160 * We run the default notifier if we have only the SRAO, the first and the
 161 * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
 162 * notifiers registered on the chain.
 163 */
 164#define NUM_DEFAULT_NOTIFIERS   3
 165static atomic_t num_notifiers;	/* bumped in mce_register_decode_chain(), dropped in mce_unregister_decode_chain(), read by mce_default_notifier() */
 166
 167void mce_register_decode_chain(struct notifier_block *nb)
 168{
 169        if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
 170                return;
 171
 172        atomic_inc(&num_notifiers);
 173
 174        blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 175}
 176EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 177
 178void mce_unregister_decode_chain(struct notifier_block *nb)
 179{
 180        atomic_dec(&num_notifiers);
 181
 182        blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 183}
 184EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 185
 186static inline u32 ctl_reg(int bank)
 187{
 188        return MSR_IA32_MCx_CTL(bank);
 189}
 190
 191static inline u32 status_reg(int bank)
 192{
 193        return MSR_IA32_MCx_STATUS(bank);
 194}
 195
 196static inline u32 addr_reg(int bank)
 197{
 198        return MSR_IA32_MCx_ADDR(bank);
 199}
 200
 201static inline u32 misc_reg(int bank)
 202{
 203        return MSR_IA32_MCx_MISC(bank);
 204}
 205
 206static inline u32 smca_ctl_reg(int bank)
 207{
 208        return MSR_AMD64_SMCA_MCx_CTL(bank);
 209}
 210
 211static inline u32 smca_status_reg(int bank)
 212{
 213        return MSR_AMD64_SMCA_MCx_STATUS(bank);
 214}
 215
 216static inline u32 smca_addr_reg(int bank)
 217{
 218        return MSR_AMD64_SMCA_MCx_ADDR(bank);
 219}
 220
 221static inline u32 smca_misc_reg(int bank)
 222{
 223        return MSR_AMD64_SMCA_MCx_MISC(bank);
 224}
 225
 226struct mca_msr_regs msr_ops = {	/* bank -> MSR number lookups; defaults to the legacy IA32 accessors. NOTE(review): the smca_*_reg() variants are presumably installed elsewhere on SMCA parts - confirm */
 227        .ctl    = ctl_reg,
 228        .status = status_reg,
 229        .addr   = addr_reg,
 230        .misc   = misc_reg
 231};
 232
 233static void __print_mce(struct mce *m)
 234{
 235        pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 236                 m->extcpu,
 237                 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 238                 m->mcgstatus, m->bank, m->status);
 239
 240        if (m->ip) {
 241                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 242                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 243                        m->cs, m->ip);
 244
 245                if (m->cs == __KERNEL_CS)
 246                        pr_cont("{%pS}", (void *)(unsigned long)m->ip);
 247                pr_cont("\n");
 248        }
 249
 250        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 251        if (m->addr)
 252                pr_cont("ADDR %llx ", m->addr);
 253        if (m->misc)
 254                pr_cont("MISC %llx ", m->misc);
 255
 256        if (mce_flags.smca) {
 257                if (m->synd)
 258                        pr_cont("SYND %llx ", m->synd);
 259                if (m->ipid)
 260                        pr_cont("IPID %llx ", m->ipid);
 261        }
 262
 263        pr_cont("\n");
 264        /*
 265         * Note this output is parsed by external tools and old fields
 266         * should not be changed.
 267         */
 268        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 269                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 270                m->microcode);
 271}
 272
 273static void print_mce(struct mce *m)
 274{
 275        __print_mce(m);
 276        pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 277}
 278
 279#define PANIC_TIMEOUT 5 /* 5 seconds: how long wait_for_panic() spins before panicking on its own */
 280
 281static atomic_t mce_panicked;	/* incremented by every CPU entering a real mce_panic(); only the first one proceeds */
 282
 283static int fake_panic;	/* non-zero: mce_panic() prints the records but only logs "Fake kernel panic" */
 284static atomic_t mce_fake_panicked;	/* lets only the first fake panic print its records */
 285
 286/* Panic in progress. Enable interrupts and wait for final IPI */
 287static void wait_for_panic(void)
 288{
 289        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 290
 291        preempt_disable();
 292        local_irq_enable();
 293        while (timeout-- > 0)
 294                udelay(1);
 295        if (panic_timeout == 0)
 296                panic_timeout = mca_cfg.panic_timeout;
 297        panic("Panicing machine check CPU died");
 298}
 299
 300static void mce_panic(const char *msg, struct mce *final, char *exp)
 301{
 302        int apei_err = 0;
 303        struct llist_node *pending;
 304        struct mce_evt_llist *l;
 305
 306        if (!fake_panic) {
 307                /*
 308                 * Make sure only one CPU runs in machine check panic
 309                 */
 310                if (atomic_inc_return(&mce_panicked) > 1)
 311                        wait_for_panic();
 312                barrier();
 313
 314                bust_spinlocks(1);
 315                console_verbose();
 316        } else {
 317                /* Don't log too much for fake panic */
 318                if (atomic_inc_return(&mce_fake_panicked) > 1)
 319                        return;
 320        }
 321        pending = mce_gen_pool_prepare_records();
 322        /* First print corrected ones that are still unlogged */
 323        llist_for_each_entry(l, pending, llnode) {
 324                struct mce *m = &l->mce;
 325                if (!(m->status & MCI_STATUS_UC)) {
 326                        print_mce(m);
 327                        if (!apei_err)
 328                                apei_err = apei_write_mce(m);
 329                }
 330        }
 331        /* Now print uncorrected but with the final one last */
 332        llist_for_each_entry(l, pending, llnode) {
 333                struct mce *m = &l->mce;
 334                if (!(m->status & MCI_STATUS_UC))
 335                        continue;
 336                if (!final || mce_cmp(m, final)) {
 337                        print_mce(m);
 338                        if (!apei_err)
 339                                apei_err = apei_write_mce(m);
 340                }
 341        }
 342        if (final) {
 343                print_mce(final);
 344                if (!apei_err)
 345                        apei_err = apei_write_mce(final);
 346        }
 347        if (cpu_missing)
 348                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 349        if (exp)
 350                pr_emerg(HW_ERR "Machine check: %s\n", exp);
 351        if (!fake_panic) {
 352                if (panic_timeout == 0)
 353                        panic_timeout = mca_cfg.panic_timeout;
 354                panic(msg);
 355        } else
 356                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 357}
 358
 359/* Support code for software error injection */
 360
 361static int msr_to_offset(u32 msr)
 362{
 363        unsigned bank = __this_cpu_read(injectm.bank);
 364
 365        if (msr == mca_cfg.rip_msr)
 366                return offsetof(struct mce, ip);
 367        if (msr == msr_ops.status(bank))
 368                return offsetof(struct mce, status);
 369        if (msr == msr_ops.addr(bank))
 370                return offsetof(struct mce, addr);
 371        if (msr == msr_ops.misc(bank))
 372                return offsetof(struct mce, misc);
 373        if (msr == MSR_IA32_MCG_STATUS)
 374                return offsetof(struct mce, mcgstatus);
 375        return -1;
 376}
 377
 378/* MSR access wrappers used for error injection */
 379static u64 mce_rdmsrl(u32 msr)
 380{
 381        u64 v;
 382
 383        if (__this_cpu_read(injectm.finished)) {
 384                int offset = msr_to_offset(msr);
 385
 386                if (offset < 0)
 387                        return 0;
 388                return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 389        }
 390
 391        if (rdmsrl_safe(msr, &v)) {
 392                WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 393                /*
 394                 * Return zero in case the access faulted. This should
 395                 * not happen normally but can happen if the CPU does
 396                 * something weird, or if the code is buggy.
 397                 */
 398                v = 0;
 399        }
 400
 401        return v;
 402}
 403
 404static void mce_wrmsrl(u32 msr, u64 v)
 405{
 406        if (__this_cpu_read(injectm.finished)) {
 407                int offset = msr_to_offset(msr);
 408
 409                if (offset >= 0)
 410                        *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 411                return;
 412        }
 413        wrmsrl(msr, v);
 414}
 415
 416/*
 417 * Collect all global (w.r.t. this processor) status about this machine
 418 * check into our "mce" struct so that we can use it later to assess
 419 * the severity of the problem as we read per-bank specific details.
 420 */
 421static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 422{
 423        mce_setup(m);
 424
 425        m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 426        if (regs) {
 427                /*
 428                 * Get the address of the instruction at the time of
 429                 * the machine check error.
 430                 */
 431                if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 432                        m->ip = regs->ip;
 433                        m->cs = regs->cs;
 434
 435                        /*
 436                         * When in VM86 mode make the cs look like ring 3
 437                         * always. This is a lie, but it's better than passing
 438                         * the additional vm86 bit around everywhere.
 439                         */
 440                        if (v8086_mode(regs))
 441                                m->cs |= 3;
 442                }
 443                /* Use accurate RIP reporting if available. */
 444                if (mca_cfg.rip_msr)
 445                        m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 446        }
 447}
 448
 449int mce_available(struct cpuinfo_x86 *c)
 450{
 451        if (mca_cfg.disabled)
 452                return 0;
 453        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 454}
 455
 456static void mce_schedule_work(void)
 457{
 458        if (!mce_gen_pool_empty())
 459                schedule_work(&mce_work);
 460}
 461
 462static void mce_irq_work_cb(struct irq_work *entry)
 463{
 464        mce_schedule_work();
 465}
 466
 467static void mce_report_event(struct pt_regs *regs)
 468{
 469        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 470                mce_notify_irq();
 471                /*
 472                 * Triggering the work queue here is just an insurance
 473                 * policy in case the syscall exit notify handler
 474                 * doesn't run soon enough or ends up running on the
 475                 * wrong CPU (can happen when audit sleeps)
 476                 */
 477                mce_schedule_work();
 478                return;
 479        }
 480
 481        irq_work_queue(&mce_irq_work);
 482}
 483
 484/*
 485 * Check if the address reported by the CPU is in a format we can parse.
 486 * It would be possible to add code for most other cases, but all would
 487 * be somewhat complicated (e.g. segment offset would require an instruction
 488 * parser). So only support physical addresses up to page granuality for now.
 489 */
 490static int mce_usable_address(struct mce *m)
 491{
 492        if (!(m->status & MCI_STATUS_ADDRV))
 493                return 0;
 494
 495        /* Checks after this one are Intel-specific: */
 496        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 497                return 1;
 498
 499        if (!(m->status & MCI_STATUS_MISCV))
 500                return 0;
 501
 502        if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 503                return 0;
 504
 505        if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 506                return 0;
 507
 508        return 1;
 509}
 510
 511bool mce_is_memory_error(struct mce *m)
 512{
 513        if (m->cpuvendor == X86_VENDOR_AMD) {
 514                return amd_mce_is_memory_error(m);
 515
 516        } else if (m->cpuvendor == X86_VENDOR_INTEL) {
 517                /*
 518                 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 519                 *
 520                 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 521                 * indicating a memory error. Bit 8 is used for indicating a
 522                 * cache hierarchy error. The combination of bit 2 and bit 3
 523                 * is used for indicating a `generic' cache hierarchy error
 524                 * But we can't just blindly check the above bits, because if
 525                 * bit 11 is set, then it is a bus/interconnect error - and
 526                 * either way the above bits just gives more detail on what
 527                 * bus/interconnect error happened. Note that bit 12 can be
 528                 * ignored, as it's the "filter" bit.
 529                 */
 530                return (m->status & 0xef80) == BIT(7) ||
 531                       (m->status & 0xef00) == BIT(8) ||
 532                       (m->status & 0xeffc) == 0xc;
 533        }
 534
 535        return false;
 536}
 537EXPORT_SYMBOL_GPL(mce_is_memory_error);
 538
 539static bool mce_is_correctable(struct mce *m)
 540{
 541        if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 542                return false;
 543
 544        if (m->status & MCI_STATUS_UC)
 545                return false;
 546
 547        return true;
 548}
 549
 550static bool cec_add_mce(struct mce *m)
 551{
 552        if (!m)
 553                return false;
 554
 555        /* We eat only correctable DRAM errors with usable addresses. */
 556        if (mce_is_memory_error(m) &&
 557            mce_is_correctable(m)  &&
 558            mce_usable_address(m))
 559                if (!cec_add_elem(m->addr >> PAGE_SHIFT))
 560                        return true;
 561
 562        return false;
 563}
 564
 565static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
 566                              void *data)
 567{
 568        struct mce *m = (struct mce *)data;
 569
 570        if (!m)
 571                return NOTIFY_DONE;
 572
 573        if (cec_add_mce(m))
 574                return NOTIFY_STOP;
 575
 576        /* Emit the trace record: */
 577        trace_mce_record(m);
 578
 579        set_bit(0, &mce_need_notify);
 580
 581        mce_notify_irq();
 582
 583        return NOTIFY_DONE;
 584}
 585
 586static struct notifier_block first_nb = {
 587        .notifier_call  = mce_first_notifier,
 588        .priority       = MCE_PRIO_FIRST,
 589};
 590
 591static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 592                                void *data)
 593{
 594        struct mce *mce = (struct mce *)data;
 595        unsigned long pfn;
 596
 597        if (!mce)
 598                return NOTIFY_DONE;
 599
 600        if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 601                pfn = mce->addr >> PAGE_SHIFT;
 602                if (!memory_failure(pfn, 0))
 603                        mce_unmap_kpfn(pfn);
 604        }
 605
 606        return NOTIFY_OK;
 607}
 608static struct notifier_block mce_srao_nb = {
 609        .notifier_call  = srao_decode_notifier,
 610        .priority       = MCE_PRIO_SRAO,
 611};
 612
 613static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 614                                void *data)
 615{
 616        struct mce *m = (struct mce *)data;
 617
 618        if (!m)
 619                return NOTIFY_DONE;
 620
 621        if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 622                return NOTIFY_DONE;
 623
 624        __print_mce(m);
 625
 626        return NOTIFY_DONE;
 627}
 628
 629static struct notifier_block mce_default_nb = {
 630        .notifier_call  = mce_default_notifier,
 631        /* lowest prio, we want it to run last. */
 632        .priority       = MCE_PRIO_LOWEST,
 633};
 634
 635/*
 636 * Read ADDR and MISC registers.
 637 */
 638static void mce_read_aux(struct mce *m, int i)
 639{
 640        if (m->status & MCI_STATUS_MISCV)
 641                m->misc = mce_rdmsrl(msr_ops.misc(i));
 642
 643        if (m->status & MCI_STATUS_ADDRV) {
 644                m->addr = mce_rdmsrl(msr_ops.addr(i));
 645
 646                /*
 647                 * Mask the reported address by the reported granularity.
 648                 */
 649                if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 650                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 651                        m->addr >>= shift;
 652                        m->addr <<= shift;
 653                }
 654
 655                /*
 656                 * Extract [55:<lsb>] where lsb is the least significant
 657                 * *valid* bit of the address bits.
 658                 */
 659                if (mce_flags.smca) {
 660                        u8 lsb = (m->addr >> 56) & 0x3f;
 661
 662                        m->addr &= GENMASK_ULL(55, lsb);
 663                }
 664        }
 665
 666        if (mce_flags.smca) {
 667                m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 668
 669                if (m->status & MCI_STATUS_SYNDV)
 670                        m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 671        }
 672}
 673
 674DEFINE_PER_CPU(unsigned, mce_poll_count);
 675
 676/*
 677 * Poll for corrected events or events that happened before reset.
 678 * Those are just logged through /dev/mcelog.
 679 *
 680 * This is executed in standard interrupt context.
 681 *
 682 * Note: spec recommends to panic for fatal unsignalled
 683 * errors here. However this would be quite problematic --
 684 * we would need to reimplement the Monarch handling and
 685 * it would mess up the exclusion between exception handler
 686 * and poll hander -- * so we skip this for now.
 687 * These cases should not happen anyways, or only when the CPU
 688 * is already totally * confused. In this case it's likely it will
 689 * not fully execute the machine check handler either.
 690 */
 691bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 692{
 693        bool error_seen = false;
 694        struct mce m;
 695        int i;
 696
 697        this_cpu_inc(mce_poll_count);
 698
 699        mce_gather_info(&m, NULL);
 700
 701        if (flags & MCP_TIMESTAMP)
 702                m.tsc = rdtsc();
 703
 704        for (i = 0; i < mca_cfg.banks; i++) {
 705                if (!mce_banks[i].ctl || !test_bit(i, *b))
 706                        continue;
 707
 708                m.misc = 0;
 709                m.addr = 0;
 710                m.bank = i;
 711
 712                barrier();
 713                m.status = mce_rdmsrl(msr_ops.status(i));
 714                if (!(m.status & MCI_STATUS_VAL))
 715                        continue;
 716
 717                /*
 718                 * Uncorrected or signalled events are handled by the exception
 719                 * handler when it is enabled, so don't process those here.
 720                 *
 721                 * TBD do the same check for MCI_STATUS_EN here?
 722                 */
 723                if (!(flags & MCP_UC) &&
 724                    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 725                        continue;
 726
 727                error_seen = true;
 728
 729                mce_read_aux(&m, i);
 730
 731                m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 732
 733                /*
 734                 * Don't get the IP here because it's unlikely to
 735                 * have anything to do with the actual error location.
 736                 */
 737                if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 738                        mce_log(&m);
 739                else if (mce_usable_address(&m)) {
 740                        /*
 741                         * Although we skipped logging this, we still want
 742                         * to take action. Add to the pool so the registered
 743                         * notifiers will see it.
 744                         */
 745                        if (!mce_gen_pool_add(&m))
 746                                mce_schedule_work();
 747                }
 748
 749                /*
 750                 * Clear state for this bank.
 751                 */
 752                mce_wrmsrl(msr_ops.status(i), 0);
 753        }
 754
 755        /*
 756         * Don't clear MCG_STATUS here because it's only defined for
 757         * exceptions.
 758         */
 759
 760        sync_core();
 761
 762        return error_seen;
 763}
 764EXPORT_SYMBOL_GPL(machine_check_poll);
 765
 766/*
 767 * Do a quick check if any of the events requires a panic.
 768 * This decides if we keep the events around or clear them.
 769 */
 770static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 771                          struct pt_regs *regs)
 772{
 773        int i, ret = 0;
 774        char *tmp;
 775
 776        for (i = 0; i < mca_cfg.banks; i++) {
 777                m->status = mce_rdmsrl(msr_ops.status(i));
 778                if (m->status & MCI_STATUS_VAL) {
 779                        __set_bit(i, validp);
 780                        if (quirk_no_way_out)
 781                                quirk_no_way_out(i, m, regs);
 782                }
 783
 784                if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 785                        *msg = tmp;
 786                        ret = 1;
 787                }
 788        }
 789        return ret;
 790}
 791
 792/*
 793 * Variable to establish order between CPUs while scanning.
 794 * Each Subject spins in mce_start() until mce_executing reaches its callin order.
 795 */
 796static atomic_t mce_executing;	/* set to 1 by the Monarch in mce_start() */
 797
 798/*
 799 * Defines order of CPUs on entry. First CPU becomes Monarch.
 800 */
 801static atomic_t mce_callin;	/* incremented once per CPU in mce_start(); the returned value is that CPU's order */
 802
 803/*
 804 * Check if a timeout waiting for other CPUs happened.
 805 */
 806static int mce_timed_out(u64 *t, const char *msg)
 807{
 808        /*
 809         * The others already did panic for some reason.
 810         * Bail out like in a timeout.
 811         * rmb() to tell the compiler that system_state
 812         * might have been modified by someone else.
 813         */
 814        rmb();
 815        if (atomic_read(&mce_panicked))
 816                wait_for_panic();
 817        if (!mca_cfg.monarch_timeout)
 818                goto out;
 819        if ((s64)*t < SPINUNIT) {
 820                if (mca_cfg.tolerant <= 1)
 821                        mce_panic(msg, NULL, NULL);
 822                cpu_missing = 1;
 823                return 1;
 824        }
 825        *t -= SPINUNIT;
 826out:
 827        touch_nmi_watchdog();
 828        return 0;
 829}
 830
 831/*
 832 * The Monarch's reign.  The Monarch is the CPU who entered
 833 * the machine check handler first. It waits for the others to
 834 * raise the exception too and then grades them. When any
 835 * error is fatal panic. Only then let the others continue.
 836 *
 837 * The other CPUs entering the MCE handler will be controlled by the
 838 * Monarch. They are called Subjects.
 839 *
 840 * This way we prevent any potential data corruption in a unrecoverable case
 841 * and also makes sure always all CPU's errors are examined.
 842 *
 843 * Also this detects the case of a machine check event coming from outer
 844 * space (not detected by any CPUs) In this case some external agent wants
 845 * us to shut down, so panic too.
 846 *
 847 * The other CPUs might still decide to panic if the handler happens
 848 * in a unrecoverable place, but in this case the system is in a semi-stable
 849 * state and won't corrupt anything by itself. It's ok to let the others
 850 * continue for a bit first.
 851 *
 852 * All the spin loops have timeouts; when a timeout happens a CPU
 853 * typically elects itself to be Monarch.
 854 */
 855static void mce_reign(void)
 856{
 857        int cpu;
 858        struct mce *m = NULL;
 859        int global_worst = 0;
 860        char *msg = NULL;
 861        char *nmsg = NULL;
 862
 863        /*
 864         * This CPU is the Monarch and the other CPUs have run
 865         * through their handlers.
 866         * Grade the severity of the errors of all the CPUs.
 867         */
 868        for_each_possible_cpu(cpu) {
 869                int severity = mce_severity(&per_cpu(mces_seen, cpu),
 870                                            mca_cfg.tolerant,
 871                                            &nmsg, true);
 872                if (severity > global_worst) {
 873                        msg = nmsg;
 874                        global_worst = severity;
 875                        m = &per_cpu(mces_seen, cpu);
 876                }
 877        }
 878
 879        /*
 880         * Cannot recover? Panic here then.
 881         * This dumps all the mces in the log buffer and stops the
 882         * other CPUs.
 883         */
 884        if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 885                mce_panic("Fatal machine check", m, msg);
 886
 887        /*
 888         * For UC somewhere we let the CPU who detects it handle it.
 889         * Also must let continue the others, otherwise the handling
 890         * CPU could deadlock on a lock.
 891         */
 892
 893        /*
 894         * No machine check event found. Must be some external
 895         * source or one CPU is hung. Panic.
 896         */
 897        if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 898                mce_panic("Fatal machine check from unknown source", NULL, NULL);
 899
 900        /*
 901         * Now clear all the mces_seen so that they don't reappear on
 902         * the next mce.
 903         */
 904        for_each_possible_cpu(cpu)
 905                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 906}
 907
 /*
  * Running sum of the no_way_out values added by each CPU in mce_start().
  * Zeroed when a rendezvous wait times out and on the reset path of mce_end().
  */
 908static atomic_t global_nwo;
 909
 910/*
 911 * Start of Monarch synchronization. This waits until all CPUs have
 912 * entered the exception handler and then determines if any of them
 913 * saw a fatal event that requires panic. Then it executes them
 914 * in the entry order.
 915 * TBD double check parallel CPU hotunplug
     *
     * @no_way_out: on entry this CPU's local value, which is added to
     *              global_nwo; on a successful return it is overwritten with
     *              the global_nwo total read after the rendezvous.
     *
     * Returns this CPU's callin order (1 is the Monarch), or -1 when
     * mca_cfg.monarch_timeout is 0 or one of the waits timed out. On -1
     * *no_way_out is not written.
 916 */
 917static int mce_start(int *no_way_out)
 918{
 919        int order;
 920        int cpus = num_online_cpus();
 921        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 922
            /* A zero timeout means: do not rendezvous at all. */
 923        if (!timeout)
 924                return -1;
 925
 926        atomic_add(*no_way_out, &global_nwo);
 927        /*
 928         * Rely on the implied barrier below, such that global_nwo
 929         * is updated before mce_callin.
 930         */
 931        order = atomic_inc_return(&mce_callin);
 932
 933        /*
 934         * Wait for everyone.
 935         */
 936        while (atomic_read(&mce_callin) != cpus) {
 937                if (mce_timed_out(&timeout,
 938                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
                            /* Give up: discard the shared verdict. */
 939                        atomic_set(&global_nwo, 0);
 940                        return -1;
 941                }
 942                ndelay(SPINUNIT);
 943        }
 944
 945        /*
 946         * mce_callin should be read before global_nwo
 947         */
 948        smp_rmb();
 949
 950        if (order == 1) {
 951                /*
 952                 * Monarch: Starts executing now, the others wait.
 953                 */
 954                atomic_set(&mce_executing, 1);
 955        } else {
 956                /*
 957                 * Subject: Now start the scanning loop one by one in
 958                 * the original callin order.
 959                 * This way when there are any shared banks it will be
 960                 * only seen by one CPU before cleared, avoiding duplicates.
 961                 */
 962                while (atomic_read(&mce_executing) < order) {
 963                        if (mce_timed_out(&timeout,
 964                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
                                    /* Same bail-out as above. */
 965                                atomic_set(&global_nwo, 0);
 966                                return -1;
 967                        }
 968                        ndelay(SPINUNIT);
 969                }
 970        }
 971
 972        /*
 973         * Cache the global no_way_out state.
 974         */
 975        *no_way_out = atomic_read(&global_nwo);
 976
 977        return order;
 978}
 979
 980/*
 981 * Synchronize between CPUs after main scanning loop.
 982 * This invokes the bulk of the Monarch processing.
     *
     * @order: the value mce_start() returned for this CPU; a negative value
     *         means the rendezvous did not happen.
     *
     * Returns 0 when the rendezvous completed: for the Monarch after
     * mce_reign() and the global reset, for a Subject once mce_executing
     * reads 0. Returns -1 when mca_cfg.monarch_timeout is 0, @order is
     * negative or a wait timed out; every -1 path goes through the global
     * reset below.
 983 */
 984static int mce_end(int order)
 985{
 986        int ret = -1;
 987        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 988
            /* No rendezvous configured or none took place: just reset. */
 989        if (!timeout)
 990                goto reset;
 991        if (order < 0)
 992                goto reset;
 993
 994        /*
 995         * Allow others to run.
 996         */
 997        atomic_inc(&mce_executing);
 998
 999        if (order == 1) {
1000                /* CHECKME: Can this race with a parallel hotplug? */

1001                int cpus = num_online_cpus();
1002
1003                /*
1004                 * Monarch: Wait for everyone to go through their scanning
1005                 * loops.
1006                 */
1007                while (atomic_read(&mce_executing) <= cpus) {
1008                        if (mce_timed_out(&timeout,
1009                                          "Timeout: Monarch CPU unable to finish machine check processing"))
1010                                goto reset;
1011                        ndelay(SPINUNIT);
1012                }
1013
1014                mce_reign();
1015                barrier();
1016                ret = 0;
1017        } else {
1018                /*
1019                 * Subject: Wait for Monarch to finish.
1020                 */
1021                while (atomic_read(&mce_executing) != 0) {
1022                        if (mce_timed_out(&timeout,
1023                                          "Timeout: Monarch CPU did not finish machine check processing"))
1024                                goto reset;
1025                        ndelay(SPINUNIT);
1026                }
1027
1028                /*
1029                 * Don't reset anything. That's done by the Monarch.
1030                 */
1031                return 0;
1032        }
1033
1034        /*
1035         * Reset all global state.
1036         */
1037reset:
1038        atomic_set(&global_nwo, 0);
1039        atomic_set(&mce_callin, 0);
1040        barrier();
1041
1042        /*
1043         * Let others run again.
1044         */
1045        atomic_set(&mce_executing, 0);
1046        return ret;
1047}
1048
1049static void mce_clear_state(unsigned long *toclear)
1050{
1051        int i;
1052
1053        for (i = 0; i < mca_cfg.banks; i++) {
1054                if (test_bit(i, toclear))
1055                        mce_wrmsrl(msr_ops.status(i), 0);
1056        }
1057}
1058
1059static int do_memory_failure(struct mce *m)
1060{
1061        int flags = MF_ACTION_REQUIRED;
1062        int ret;
1063
1064        pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
1065        if (!(m->mcgstatus & MCG_STATUS_RIPV))
1066                flags |= MF_MUST_KILL;
1067        ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1068        if (ret)
1069                pr_err("Memory error not recovered");
1070        else
1071                mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
1072        return ret;
1073}
1074
1075#ifndef mce_unmap_kpfn
1076static void mce_unmap_kpfn(unsigned long pfn)
1077{
1078        unsigned long decoy_addr;
1079
1080        /*
1081         * Unmap this page from the kernel 1:1 mappings to make sure
1082         * we don't log more errors because of speculative access to
1083         * the page.
1084         * We would like to just call:
1085         *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
1086         * but doing that would radically increase the odds of a
1087         * speculative access to the poison page because we'd have
1088         * the virtual address of the kernel 1:1 mapping sitting
1089         * around in registers.
1090         * Instead we get tricky.  We create a non-canonical address
1091         * that looks just like the one we want, but has bit 63 flipped.
1092         * This relies on set_memory_np() not checking whether we passed
1093         * a legal address.
1094         */
1095
1096/*
1097 * Build time check to see if we have a spare virtual bit. Don't want
1098 * to leave this until run time because most developers don't have a
1099 * system that can exercise this code path. This will only become a
1100 * problem if/when we move beyond 5-level page tables.
1101 *
1102 * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
1103 */
1104#if PGDIR_SHIFT + 9 < 63
1105        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1106#else
1107#error "no unused virtual bit available"
1108#endif
1109
1110        if (set_memory_np(decoy_addr, 1))
1111                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
1112}
1113#endif
1114
1115/*
1116 * The actual machine check handler. This only handles real
1117 * exceptions when something got corrupted coming in through int 18.
1118 *
1119 * This is executed in NMI context not subject to normal locking rules. This
1120 * implies that most kernel services cannot be safely used. Don't even
1121 * think about putting a printk in there!
1122 *
1123 * On Intel systems this is entered on all CPUs in parallel through
1124 * MCE broadcast. However some CPUs might be broken beyond repair,
1125 * so be always careful when synchronizing with others.
1126 */
1127void do_machine_check(struct pt_regs *regs, long error_code)
1128{
1129        struct mca_config *cfg = &mca_cfg;
1130        struct mce m, *final;
1131        int i;
1132        int worst = 0;
1133        int severity;
1134
1135        /*
1136         * Establish sequential order between the CPUs entering the machine
1137         * check handler.
1138         */
1139        int order = -1;
1140        /*
1141         * If no_way_out gets set, there is no safe way to recover from this
1142         * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1143         */
1144        int no_way_out = 0;
1145        /*
1146         * If kill_it gets set, there might be a way to recover from this
1147         * error.
1148         */
1149        int kill_it = 0;
1150        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1151        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1152        char *msg = "Unknown";
1153
1154        /*
1155         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1156         * on Intel.
1157         */
1158        int lmce = 1;
1159        int cpu = smp_processor_id();
1160
1161        /*
1162         * Cases where we avoid rendezvous handler timeout:
1163         * 1) If this CPU is offline.
1164         *
1165         * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1166         *  skip those CPUs which remain looping in the 1st kernel - see
1167         *  crash_nmi_callback().
1168         *
1169         * Note: there still is a small window between kexec-ing and the new,
1170         * kdump kernel establishing a new #MC handler where a broadcasted MCE
1171         * might not get handled properly.
1172         */
1173        if (cpu_is_offline(cpu) ||
1174            (crashing_cpu != -1 && crashing_cpu != cpu)) {
1175                u64 mcgstatus;
1176
1177                mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1178                if (mcgstatus & MCG_STATUS_RIPV) {
1179                        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1180                        return;
1181                }
1182        }
1183
1184        ist_enter(regs);
1185
1186        this_cpu_inc(mce_exception_count);
1187
1188        if (!cfg->banks)
1189                goto out;
1190
1191        mce_gather_info(&m, regs);
1192        m.tsc = rdtsc();
1193
1194        final = this_cpu_ptr(&mces_seen);
1195        *final = m;
1196
1197        memset(valid_banks, 0, sizeof(valid_banks));
1198        no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1199
1200        barrier();
1201
1202        /*
1203         * When no restart IP might need to kill or panic.
1204         * Assume the worst for now, but if we find the
1205         * severity is MCE_AR_SEVERITY we have other options.
1206         */
1207        if (!(m.mcgstatus & MCG_STATUS_RIPV))
1208                kill_it = 1;
1209
1210        /*
1211         * Check if this MCE is signaled to only this logical processor,
1212         * on Intel only.
1213         */
1214        if (m.cpuvendor == X86_VENDOR_INTEL)
1215                lmce = m.mcgstatus & MCG_STATUS_LMCES;
1216
1217        /*
1218         * Go through all banks in exclusion of the other CPUs. This way we
1219         * don't report duplicated events on shared banks because the first one
1220         * to see it will clear it. If this is a Local MCE, then no need to
1221         * perform rendezvous.
1222         */
1223        if (!lmce)
1224                order = mce_start(&no_way_out);
1225
1226        for (i = 0; i < cfg->banks; i++) {
1227                __clear_bit(i, toclear);
1228                if (!test_bit(i, valid_banks))
1229                        continue;
1230                if (!mce_banks[i].ctl)
1231                        continue;
1232
1233                m.misc = 0;
1234                m.addr = 0;
1235                m.bank = i;
1236
1237                m.status = mce_rdmsrl(msr_ops.status(i));
1238                if ((m.status & MCI_STATUS_VAL) == 0)
1239                        continue;
1240
1241                /*
1242                 * Non uncorrected or non signaled errors are handled by
1243                 * machine_check_poll. Leave them alone, unless this panics.
1244                 */
1245                if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1246                        !no_way_out)
1247                        continue;
1248
1249                /*
1250                 * Set taint even when machine check was not enabled.
1251                 */
1252                add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1253
1254                severity = mce_severity(&m, cfg->tolerant, NULL, true);
1255
1256                /*
1257                 * When machine check was for corrected/deferred handler don't
1258                 * touch, unless we're panicing.
1259                 */
1260                if ((severity == MCE_KEEP_SEVERITY ||
1261                     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1262                        continue;
1263                __set_bit(i, toclear);
1264                if (severity == MCE_NO_SEVERITY) {
1265                        /*
1266                         * Machine check event was not enabled. Clear, but
1267                         * ignore.
1268                         */
1269                        continue;
1270                }
1271
1272                mce_read_aux(&m, i);
1273
1274                /* assuming valid severity level != 0 */
1275                m.severity = severity;
1276
1277                mce_log(&m);
1278
1279                if (severity > worst) {
1280                        *final = m;
1281                        worst = severity;
1282                }
1283        }
1284
1285        /* mce_clear_state will clear *final, save locally for use later */
1286        m = *final;
1287
1288        if (!no_way_out)
1289                mce_clear_state(toclear);
1290
1291        /*
1292         * Do most of the synchronization with other CPUs.
1293         * When there's any problem use only local no_way_out state.
1294         */
1295        if (!lmce) {
1296                if (mce_end(order) < 0)
1297                        no_way_out = worst >= MCE_PANIC_SEVERITY;
1298        } else {
1299                /*
1300                 * Local MCE skipped calling mce_reign()
1301                 * If we found a fatal error, we need to panic here.
1302                 */
1303                 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1304                        mce_panic("Machine check from unknown source",
1305                                NULL, NULL);
1306        }
1307
1308        /*
1309         * If tolerant is at an insane level we drop requests to kill
1310         * processes and continue even when there is no way out.
1311         */
1312        if (cfg->tolerant == 3)
1313                kill_it = 0;
1314        else if (no_way_out)
1315                mce_panic("Fatal machine check on current CPU", &m, msg);
1316
1317        if (worst > 0)
1318                mce_report_event(regs);
1319        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1320out:
1321        sync_core();
1322
1323        if (worst != MCE_AR_SEVERITY && !kill_it)
1324                goto out_ist;
1325
1326        /* Fault was in user mode and we need to take some action */
1327        if ((m.cs & 3) == 3) {
1328                ist_begin_non_atomic(regs);
1329                local_irq_enable();
1330
1331                if (kill_it || do_memory_failure(&m))
1332                        force_sig(SIGBUS, current);
1333                local_irq_disable();
1334                ist_end_non_atomic();
1335        } else {
1336                if (!fixup_exception(regs, X86_TRAP_MC))
1337                        mce_panic("Failed kernel mode recovery", &m, NULL);
1338        }
1339
1340out_ist:
1341        ist_exit(regs);
1342}
1343EXPORT_SYMBOL_GPL(do_machine_check);
1344
1345#ifndef CONFIG_MEMORY_FAILURE
1346int memory_failure(unsigned long pfn, int flags)
1347{
1348        /* mce_severity() should not hand us an ACTION_REQUIRED error */
1349        BUG_ON(flags & MF_ACTION_REQUIRED);
1350        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1351               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1352               pfn);
1353
1354        return 0;
1355}
1356#endif
1357
1358/*
1359 * Periodic polling timer for "silent" machine check errors.  If the
1360 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1361 * errors, poll 2x slower (up to check_interval seconds).
1362 */
1363static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1364
1365static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1366static DEFINE_PER_CPU(struct timer_list, mce_timer);
1367
1368static unsigned long mce_adjust_timer_default(unsigned long interval)
1369{
1370        return interval;
1371}
1372
1373static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1374
1375static void __start_timer(struct timer_list *t, unsigned long interval)
1376{
1377        unsigned long when = jiffies + interval;
1378        unsigned long flags;
1379
1380        local_irq_save(flags);
1381
1382        if (!timer_pending(t) || time_before(when, t->expires))
1383                mod_timer(t, round_jiffies(when));
1384
1385        local_irq_restore(flags);
1386}
1387
1388static void mce_timer_fn(struct timer_list *t)
1389{
1390        struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1391        unsigned long iv;
1392
1393        WARN_ON(cpu_t != t);
1394
1395        iv = __this_cpu_read(mce_next_interval);
1396
1397        if (mce_available(this_cpu_ptr(&cpu_info))) {
1398                machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1399
1400                if (mce_intel_cmci_poll()) {
1401                        iv = mce_adjust_timer(iv);
1402                        goto done;
1403                }
1404        }
1405
1406        /*
1407         * Alert userspace if needed. If we logged an MCE, reduce the polling
1408         * interval, otherwise increase the polling interval.
1409         */
1410        if (mce_notify_irq())
1411                iv = max(iv / 2, (unsigned long) HZ/100);
1412        else
1413                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1414
1415done:
1416        __this_cpu_write(mce_next_interval, iv);
1417        __start_timer(t, iv);
1418}
1419
1420/*
1421 * Ensure that the timer is firing in @interval from now.
1422 */
1423void mce_timer_kick(unsigned long interval)
1424{
1425        struct timer_list *t = this_cpu_ptr(&mce_timer);
1426        unsigned long iv = __this_cpu_read(mce_next_interval);
1427
1428        __start_timer(t, interval);
1429
1430        if (interval < iv)
1431                __this_cpu_write(mce_next_interval, interval);
1432}
1433
1434/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1435static void mce_timer_delete_all(void)
1436{
1437        int cpu;
1438
1439        for_each_online_cpu(cpu)
1440                del_timer_sync(&per_cpu(mce_timer, cpu));
1441}
1442
1443/*
1444 * Notify the user(s) about new machine check events.
1445 * Can be called from interrupt context, but not from machine check/NMI
1446 * context.
1447 */
1448int mce_notify_irq(void)
1449{
1450        /* Not more than two messages every minute */
1451        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1452
1453        if (test_and_clear_bit(0, &mce_need_notify)) {
1454                mce_work_trigger();
1455
1456                if (__ratelimit(&ratelimit))
1457                        pr_info(HW_ERR "Machine check events logged\n");
1458
1459                return 1;
1460        }
1461        return 0;
1462}
1463EXPORT_SYMBOL_GPL(mce_notify_irq);
1464
1465static int __mcheck_cpu_mce_banks_init(void)
1466{
1467        int i;
1468        u8 num_banks = mca_cfg.banks;
1469
1470        mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1471        if (!mce_banks)
1472                return -ENOMEM;
1473
1474        for (i = 0; i < num_banks; i++) {
1475                struct mce_bank *b = &mce_banks[i];
1476
1477                b->ctl = -1ULL;
1478                b->init = 1;
1479        }
1480        return 0;
1481}
1482
1483/*
1484 * Initialize Machine Checks for a CPU.
1485 */
1486static int __mcheck_cpu_cap_init(void)
1487{
1488        unsigned b;
1489        u64 cap;
1490
1491        rdmsrl(MSR_IA32_MCG_CAP, cap);
1492
1493        b = cap & MCG_BANKCNT_MASK;
1494        if (!mca_cfg.banks)
1495                pr_info("CPU supports %d MCE banks\n", b);
1496
1497        if (b > MAX_NR_BANKS) {
1498                pr_warn("Using only %u machine check banks out of %u\n",
1499                        MAX_NR_BANKS, b);
1500                b = MAX_NR_BANKS;
1501        }
1502
1503        /* Don't support asymmetric configurations today */
1504        WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1505        mca_cfg.banks = b;
1506
1507        if (!mce_banks) {
1508                int err = __mcheck_cpu_mce_banks_init();
1509
1510                if (err)
1511                        return err;
1512        }
1513
1514        /* Use accurate RIP reporting if available. */
1515        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1516                mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1517
1518        if (cap & MCG_SER_P)
1519                mca_cfg.ser = true;
1520
1521        return 0;
1522}
1523
1524static void __mcheck_cpu_init_generic(void)
1525{
1526        enum mcp_flags m_fl = 0;
1527        mce_banks_t all_banks;
1528        u64 cap;
1529
1530        if (!mca_cfg.bootlog)
1531                m_fl = MCP_DONTLOG;
1532
1533        /*
1534         * Log the machine checks left over from the previous reset.
1535         */
1536        bitmap_fill(all_banks, MAX_NR_BANKS);
1537        machine_check_poll(MCP_UC | m_fl, &all_banks);
1538
1539        cr4_set_bits(X86_CR4_MCE);
1540
1541        rdmsrl(MSR_IA32_MCG_CAP, cap);
1542        if (cap & MCG_CTL_P)
1543                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1544}
1545
1546static void __mcheck_cpu_init_clear_banks(void)
1547{
1548        int i;
1549
1550        for (i = 0; i < mca_cfg.banks; i++) {
1551                struct mce_bank *b = &mce_banks[i];
1552
1553                if (!b->init)
1554                        continue;
1555                wrmsrl(msr_ops.ctl(i), b->ctl);
1556                wrmsrl(msr_ops.status(i), 0);
1557        }
1558}
1559
1560/*
1561 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1562 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1563 * Vol 3B Table 15-20). But this confuses both the code that determines
1564 * whether the machine check occurred in kernel or user mode, and also
1565 * the severity assessment code. Pretend that EIPV was set, and take the
1566 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1567 */
1568static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1569{
1570        if (bank != 0)
1571                return;
1572        if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1573                return;
1574        if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1575                          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1576                          MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1577                          MCACOD)) !=
1578                         (MCI_STATUS_UC|MCI_STATUS_EN|
1579                          MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1580                          MCI_STATUS_AR|MCACOD_INSTR))
1581                return;
1582
1583        m->mcgstatus |= MCG_STATUS_EIPV;
1584        m->ip = regs->ip;
1585        m->cs = regs->cs;
1586}
1587
1588/*
     * Add per CPU specific workarounds here.
     *
     * Adjusts mce_banks[], mce_flags and mca_cfg for the vendor, family and
     * model found in @c, then normalizes mca_cfg.monarch_timeout (negative
     * becomes 0) and sets mca_cfg.panic_timeout to 30 when bootlog is
     * non-zero.
     *
     * Returns 0 on success, -EOPNOTSUPP for X86_VENDOR_UNKNOWN.
     */
1589static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1590{
1591        struct mca_config *cfg = &mca_cfg;
1592
1593        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1594                pr_info("unknown CPU type - not enabling MCE support\n");
1595                return -EOPNOTSUPP;
1596        }
1597
1598        /* This should be disabled by the BIOS, but isn't always */
1599        if (c->x86_vendor == X86_VENDOR_AMD) {
1600                if (c->x86 == 15 && cfg->banks > 4) {
1601                        /*
1602                         * disable GART TBL walk error reporting, which
1603                         * trips off incorrectly with the IOMMU & 3ware
1604                         * & Cerberus:
1605                         */
1606                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1607                }
                    /* A negative bootlog is resolved to "off" here. */
1608                if (c->x86 < 0x11 && cfg->bootlog < 0) {
1609                        /*
1610                         * Lots of broken BIOS around that don't clear them
1611                         * by default and leave crap in there. Don't log:
1612                         */
1613                        cfg->bootlog = 0;
1614                }
1615                /*
1616                 * Various K7s with broken bank 0 around. Always disable
1617                 * by default.
1618                 */
1619                if (c->x86 == 6 && cfg->banks > 0)
1620                        mce_banks[0].ctl = 0;
1621
1622                /*
1623                 * overflow_recov is supported for F15h Models 00h-0fh
1624                 * even though we don't have a CPUID bit for it.
1625                 */
1626                if (c->x86 == 0x15 && c->x86_model <= 0xf)
1627                        mce_flags.overflow_recov = 1;
1628
1629                /*
1630                 * Turn off MC4_MISC thresholding banks on those models since
1631                 * they're not supported there.
1632                 */
1633                if (c->x86 == 0x15 &&
1634                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1635                        int i;
1636                        u64 hwcr;
1637                        bool need_toggle;
1638                        u32 msrs[] = {
1639                                0x00000413, /* MC4_MISC0 */
1640                                0xc0000408, /* MC4_MISC1 */
1641                        };
1642
1643                        rdmsrl(MSR_K7_HWCR, hwcr);
1644
1645                        /* McStatusWrEn has to be set */
1646                        need_toggle = !(hwcr & BIT(18));
1647
                            /* Set it temporarily; the old value is written back below. */
1648                        if (need_toggle)
1649                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1650
1651                        /* Clear CntP bit safely */
1652                        for (i = 0; i < ARRAY_SIZE(msrs); i++)
1653                                msr_clear_bit(msrs[i], 62);
1654
1655                        /* restore old settings */
1656                        if (need_toggle)
1657                                wrmsrl(MSR_K7_HWCR, hwcr);
1658                }
1659        }
1660
1661        if (c->x86_vendor == X86_VENDOR_INTEL) {
1662                /*
1663                 * SDM documents that on family 6 bank 0 should not be written
1664                 * because it aliases to another special BIOS controlled
1665                 * register.
1666                 * But it's not aliased anymore on model 0x1a+
1667                 * Don't ignore bank 0 completely because there could be a
1668                 * valid event later, merely don't write CTL0.
1669                 */
1670
1671                if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1672                        mce_banks[0].init = 0;
1673
1674                /*
1675                 * All newer Intel systems support MCE broadcasting. Enable
1676                 * synchronization with a one second timeout.
1677                 */
1678                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1679                        cfg->monarch_timeout < 0)
1680                        cfg->monarch_timeout = USEC_PER_SEC;
1681
1682                /*
1683                 * There are also broken BIOSes on some Pentium M and
1684                 * earlier systems:
1685                 */
1686                if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1687                        cfg->bootlog = 0;
1688
                    /* Family 6 model 45: install the Sandy Bridge IFU quirk. */
1689                if (c->x86 == 6 && c->x86_model == 45)
1690                        quirk_no_way_out = quirk_sandybridge_ifu;
1691        }
            /* Still negative: no quirk picked a timeout, so use 0. */
1692        if (cfg->monarch_timeout < 0)
1693                cfg->monarch_timeout = 0;
1694        if (cfg->bootlog != 0)
1695                cfg->panic_timeout = 30;
1696
1697        return 0;
1698}
1699
1700static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1701{
1702        if (c->x86 != 5)
1703                return 0;
1704
1705        switch (c->x86_vendor) {
1706        case X86_VENDOR_INTEL:
1707                intel_p5_mcheck_init(c);
1708                return 1;
1709                break;
1710        case X86_VENDOR_CENTAUR:
1711                winchip_mcheck_init(c);
1712                return 1;
1713                break;
1714        default:
1715                return 0;
1716        }
1717
1718        return 0;
1719}
1720
1721/*
1722 * Init basic CPU features needed for early decoding of MCEs.
1723 */
1724static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1725{
1726        if (c->x86_vendor == X86_VENDOR_AMD) {
1727                mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1728                mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1729                mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1730
1731                if (mce_flags.smca) {
1732                        msr_ops.ctl     = smca_ctl_reg;
1733                        msr_ops.status  = smca_status_reg;
1734                        msr_ops.addr    = smca_addr_reg;
1735                        msr_ops.misc    = smca_misc_reg;
1736                }
1737        }
1738}
1739
1740static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1741{
1742        switch (c->x86_vendor) {
1743        case X86_VENDOR_INTEL:
1744                mce_intel_feature_init(c);
1745                mce_adjust_timer = cmci_intel_adjust_timer;
1746                break;
1747
1748        case X86_VENDOR_AMD: {
1749                mce_amd_feature_init(c);
1750                break;
1751                }
1752
1753        default:
1754                break;
1755        }
1756}
1757
1758static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1759{
1760        switch (c->x86_vendor) {
1761        case X86_VENDOR_INTEL:
1762                mce_intel_feature_clear(c);
1763                break;
1764        default:
1765                break;
1766        }
1767}
1768
1769static void mce_start_timer(struct timer_list *t)
1770{
1771        unsigned long iv = check_interval * HZ;
1772
1773        if (mca_cfg.ignore_ce || !iv)
1774                return;
1775
1776        this_cpu_write(mce_next_interval, iv);
1777        __start_timer(t, iv);
1778}
1779
1780static void __mcheck_cpu_setup_timer(void)
1781{
1782        struct timer_list *t = this_cpu_ptr(&mce_timer);
1783
1784        timer_setup(t, mce_timer_fn, TIMER_PINNED);
1785}
1786
1787static void __mcheck_cpu_init_timer(void)
1788{
1789        struct timer_list *t = this_cpu_ptr(&mce_timer);
1790
1791        timer_setup(t, mce_timer_fn, TIMER_PINNED);
1792        mce_start_timer(t);
1793}
1794
/*
 * Handle unconfigured int18 (should never happen): only logs the CPU
 * number. @regs and @error_code are not used.
 */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
	       smp_processor_id());
}

/*
 * Call the installed machine check handler for this CPU setup.
 * Starts out pointing at the "unexpected" handler above.
 */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
						unexpected_machine_check;
1805
1806dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1807{
1808        machine_check_vector(regs, error_code);
1809}
1810
1811/*
1812 * Called for each booted CPU to set up machine checks.
1813 * Must be called with preempt off:
1814 */
1815void mcheck_cpu_init(struct cpuinfo_x86 *c)
1816{
1817        if (mca_cfg.disabled)
1818                return;
1819
1820        if (__mcheck_cpu_ancient_init(c))
1821                return;
1822
1823        if (!mce_available(c))
1824                return;
1825
1826        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1827                mca_cfg.disabled = true;
1828                return;
1829        }
1830
1831        if (mce_gen_pool_init()) {
1832                mca_cfg.disabled = true;
1833                pr_emerg("Couldn't allocate MCE records pool!\n");
1834                return;
1835        }
1836
1837        machine_check_vector = do_machine_check;
1838
1839        __mcheck_cpu_init_early(c);
1840        __mcheck_cpu_init_generic();
1841        __mcheck_cpu_init_vendor(c);
1842        __mcheck_cpu_init_clear_banks();
1843        __mcheck_cpu_setup_timer();
1844}
1845
1846/*
1847 * Called for each booted CPU to clear some machine checks opt-ins
1848 */
1849void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1850{
1851        if (mca_cfg.disabled)
1852                return;
1853
1854        if (!mce_available(c))
1855                return;
1856
1857        /*
1858         * Possibly to clear general settings generic to x86
1859         * __mcheck_cpu_clear_generic(c);
1860         */
1861        __mcheck_cpu_clear_vendor(c);
1862
1863}
1864
1865static void __mce_disable_bank(void *arg)
1866{
1867        int bank = *((int *)arg);
1868        __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1869        cmci_disable_bank(bank);
1870}
1871
1872void mce_disable_bank(int bank)
1873{
1874        if (bank >= mca_cfg.banks) {
1875                pr_warn(FW_BUG
1876                        "Ignoring request to disable invalid MCA bank %d.\n",
1877                        bank);
1878                return;
1879        }
1880        set_bit(bank, mce_banks_ce_disabled);
1881        on_each_cpu(__mce_disable_bank, &bank, 1);
1882}
1883
1884/*
1885 * mce=off Disables machine check
1886 * mce=no_cmci Disables CMCI
1887 * mce=no_lmce Disables LMCE
1888 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1889 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1890 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1891 *      monarchtimeout is how long to wait for other CPUs on machine
1892 *      check, or 0 to not wait
1893 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1894        and older.
1895 * mce=nobootlog Don't log MCEs from before booting.
1896 * mce=bios_cmci_threshold Don't program the CMCI threshold
1897 * mce=recovery force enable memcpy_mcsafe()
1898 */
1899static int __init mcheck_enable(char *str)
1900{
1901        struct mca_config *cfg = &mca_cfg;
1902
1903        if (*str == 0) {
1904                enable_p5_mce();
1905                return 1;
1906        }
1907        if (*str == '=')
1908                str++;
1909        if (!strcmp(str, "off"))
1910                cfg->disabled = true;
1911        else if (!strcmp(str, "no_cmci"))
1912                cfg->cmci_disabled = true;
1913        else if (!strcmp(str, "no_lmce"))
1914                cfg->lmce_disabled = true;
1915        else if (!strcmp(str, "dont_log_ce"))
1916                cfg->dont_log_ce = true;
1917        else if (!strcmp(str, "ignore_ce"))
1918                cfg->ignore_ce = true;
1919        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1920                cfg->bootlog = (str[0] == 'b');
1921        else if (!strcmp(str, "bios_cmci_threshold"))
1922                cfg->bios_cmci_threshold = true;
1923        else if (!strcmp(str, "recovery"))
1924                cfg->recovery = true;
1925        else if (isdigit(str[0])) {
1926                if (get_option(&str, &cfg->tolerant) == 2)
1927                        get_option(&str, &(cfg->monarch_timeout));
1928        } else {
1929                pr_info("mce argument %s ignored. Please use /sys\n", str);
1930                return 0;
1931        }
1932        return 1;
1933}
1934__setup("mce", mcheck_enable);
1935
1936int __init mcheck_init(void)
1937{
1938        mcheck_intel_therm_init();
1939        mce_register_decode_chain(&first_nb);
1940        mce_register_decode_chain(&mce_srao_nb);
1941        mce_register_decode_chain(&mce_default_nb);
1942        mcheck_vendor_init_severity();
1943
1944        INIT_WORK(&mce_work, mce_gen_pool_process);
1945        init_irq_work(&mce_irq_work, mce_irq_work_cb);
1946
1947        return 0;
1948}
1949
1950/*
1951 * mce_syscore: PM support
1952 */
1953
1954/*
1955 * Disable machine checks on suspend and shutdown. We can't really handle
1956 * them later.
1957 */
1958static void mce_disable_error_reporting(void)
1959{
1960        int i;
1961
1962        for (i = 0; i < mca_cfg.banks; i++) {
1963                struct mce_bank *b = &mce_banks[i];
1964
1965                if (b->init)
1966                        wrmsrl(msr_ops.ctl(i), 0);
1967        }
1968        return;
1969}
1970
1971static void vendor_disable_error_reporting(void)
1972{
1973        /*
1974         * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
1975         * Disabling them for just a single offlined CPU is bad, since it will
1976         * inhibit reporting for all shared resources on the socket like the
1977         * last level cache (LLC), the integrated memory controller (iMC), etc.
1978         */
1979        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
1980            boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1981                return;
1982
1983        mce_disable_error_reporting();
1984}
1985
/*
 * syscore suspend callback: apply the vendor-dependent disabling of error
 * reporting. Always returns 0.
 */
static int mce_syscore_suspend(void)
{
        vendor_disable_error_reporting();

        return 0;
}
1991
/*
 * syscore shutdown callback: apply the vendor-dependent disabling of error
 * reporting, exactly as the suspend callback does.
 */
static void mce_syscore_shutdown(void)
{
        vendor_disable_error_reporting();
}
1996
1997/*
1998 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1999 * Only one CPU is active at this time, the others get re-added later using
2000 * CPU hotplug:

2001 */
2002static void mce_syscore_resume(void)
2003{
2004        __mcheck_cpu_init_generic();
2005        __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2006        __mcheck_cpu_init_clear_banks();
2007}
2008
2009static struct syscore_ops mce_syscore_ops = {
2010        .suspend        = mce_syscore_suspend,
2011        .shutdown       = mce_syscore_shutdown,
2012        .resume         = mce_syscore_resume,
2013};
2014
2015/*
2016 * mce_device: Sysfs support
2017 */
2018
2019static void mce_cpu_restart(void *data)
2020{
2021        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2022                return;
2023        __mcheck_cpu_init_generic();
2024        __mcheck_cpu_init_clear_banks();
2025        __mcheck_cpu_init_timer();
2026}
2027
2028/* Reinit MCEs after user configuration changes */
2029static void mce_restart(void)
2030{
2031        mce_timer_delete_all();
2032        on_each_cpu(mce_cpu_restart, NULL, 1);
2033}
2034
2035/* Toggle features for corrected errors */
2036static void mce_disable_cmci(void *data)
2037{
2038        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2039                return;
2040        cmci_clear();
2041}
2042
2043static void mce_enable_ce(void *all)
2044{
2045        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2046                return;
2047        cmci_reenable();
2048        cmci_recheck();
2049        if (all)
2050                __mcheck_cpu_init_timer();
2051}
2052
2053static struct bus_type mce_subsys = {
2054        .name           = "machinecheck",
2055        .dev_name       = "machinecheck",
2056};
2057
2058DEFINE_PER_CPU(struct device *, mce_device);
2059
2060static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2061{
2062        return container_of(attr, struct mce_bank, attr);
2063}
2064
2065static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2066                         char *buf)
2067{
2068        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2069}
2070
2071static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2072                        const char *buf, size_t size)
2073{
2074        u64 new;
2075
2076        if (kstrtou64(buf, 0, &new) < 0)
2077                return -EINVAL;
2078
2079        attr_to_bank(attr)->ctl = new;
2080        mce_restart();
2081
2082        return size;
2083}
2084
2085static ssize_t set_ignore_ce(struct device *s,
2086                             struct device_attribute *attr,
2087                             const char *buf, size_t size)
2088{
2089        u64 new;
2090
2091        if (kstrtou64(buf, 0, &new) < 0)
2092                return -EINVAL;
2093
2094        mutex_lock(&mce_sysfs_mutex);
2095        if (mca_cfg.ignore_ce ^ !!new) {
2096                if (new) {
2097                        /* disable ce features */
2098                        mce_timer_delete_all();
2099                        on_each_cpu(mce_disable_cmci, NULL, 1);
2100                        mca_cfg.ignore_ce = true;
2101                } else {
2102                        /* enable ce features */
2103                        mca_cfg.ignore_ce = false;
2104                        on_each_cpu(mce_enable_ce, (void *)1, 1);
2105                }
2106        }
2107        mutex_unlock(&mce_sysfs_mutex);
2108
2109        return size;
2110}
2111
2112static ssize_t set_cmci_disabled(struct device *s,
2113                                 struct device_attribute *attr,
2114                                 const char *buf, size_t size)
2115{
2116        u64 new;
2117
2118        if (kstrtou64(buf, 0, &new) < 0)
2119                return -EINVAL;
2120
2121        mutex_lock(&mce_sysfs_mutex);
2122        if (mca_cfg.cmci_disabled ^ !!new) {
2123                if (new) {
2124                        /* disable cmci */
2125                        on_each_cpu(mce_disable_cmci, NULL, 1);
2126                        mca_cfg.cmci_disabled = true;
2127                } else {
2128                        /* enable cmci */
2129                        mca_cfg.cmci_disabled = false;
2130                        on_each_cpu(mce_enable_ce, NULL, 1);
2131                }
2132        }
2133        mutex_unlock(&mce_sysfs_mutex);
2134
2135        return size;
2136}
2137
2138static ssize_t store_int_with_restart(struct device *s,
2139                                      struct device_attribute *attr,
2140                                      const char *buf, size_t size)
2141{
2142        unsigned long old_check_interval = check_interval;
2143        ssize_t ret = device_store_ulong(s, attr, buf, size);
2144
2145        if (check_interval == old_check_interval)
2146                return ret;
2147
2148        if (check_interval < 1)
2149                check_interval = 1;
2150
2151        mutex_lock(&mce_sysfs_mutex);
2152        mce_restart();
2153        mutex_unlock(&mce_sysfs_mutex);
2154
2155        return ret;
2156}
2157
2158static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2159static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2160static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2161
2162static struct dev_ext_attribute dev_attr_check_interval = {
2163        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2164        &check_interval
2165};
2166
2167static struct dev_ext_attribute dev_attr_ignore_ce = {
2168        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2169        &mca_cfg.ignore_ce
2170};
2171
2172static struct dev_ext_attribute dev_attr_cmci_disabled = {
2173        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2174        &mca_cfg.cmci_disabled
2175};
2176
2177static struct device_attribute *mce_device_attrs[] = {
2178        &dev_attr_tolerant.attr,
2179        &dev_attr_check_interval.attr,
2180#ifdef CONFIG_X86_MCELOG_LEGACY
2181        &dev_attr_trigger,
2182#endif
2183        &dev_attr_monarch_timeout.attr,
2184        &dev_attr_dont_log_ce.attr,
2185        &dev_attr_ignore_ce.attr,
2186        &dev_attr_cmci_disabled.attr,
2187        NULL
2188};
2189
2190static cpumask_var_t mce_device_initialized;
2191
/*
 * Release callback installed on every MCE device by mce_device_create():
 * frees the struct device that was allocated there with kzalloc().
 */
static void mce_device_release(struct device *dev)
{
        kfree(dev);
}
2196
2197/* Per cpu device init. All of the cpus still share the same ctrl bank: */
2198static int mce_device_create(unsigned int cpu)
2199{
2200        struct device *dev;
2201        int err;
2202        int i, j;
2203
2204        if (!mce_available(&boot_cpu_data))
2205                return -EIO;
2206
2207        dev = per_cpu(mce_device, cpu);
2208        if (dev)
2209                return 0;
2210
2211        dev = kzalloc(sizeof *dev, GFP_KERNEL);
2212        if (!dev)
2213                return -ENOMEM;
2214        dev->id  = cpu;
2215        dev->bus = &mce_subsys;
2216        dev->release = &mce_device_release;
2217
2218        err = device_register(dev);
2219        if (err) {
2220                put_device(dev);
2221                return err;
2222        }
2223
2224        for (i = 0; mce_device_attrs[i]; i++) {
2225                err = device_create_file(dev, mce_device_attrs[i]);
2226                if (err)
2227                        goto error;
2228        }
2229        for (j = 0; j < mca_cfg.banks; j++) {
2230                err = device_create_file(dev, &mce_banks[j].attr);
2231                if (err)
2232                        goto error2;
2233        }
2234        cpumask_set_cpu(cpu, mce_device_initialized);
2235        per_cpu(mce_device, cpu) = dev;
2236
2237        return 0;
2238error2:
2239        while (--j >= 0)
2240                device_remove_file(dev, &mce_banks[j].attr);
2241error:
2242        while (--i >= 0)
2243                device_remove_file(dev, mce_device_attrs[i]);
2244
2245        device_unregister(dev);
2246
2247        return err;
2248}
2249
2250static void mce_device_remove(unsigned int cpu)
2251{
2252        struct device *dev = per_cpu(mce_device, cpu);
2253        int i;
2254
2255        if (!cpumask_test_cpu(cpu, mce_device_initialized))
2256                return;
2257
2258        for (i = 0; mce_device_attrs[i]; i++)
2259                device_remove_file(dev, mce_device_attrs[i]);
2260
2261        for (i = 0; i < mca_cfg.banks; i++)
2262                device_remove_file(dev, &mce_banks[i].attr);
2263
2264        device_unregister(dev);
2265        cpumask_clear_cpu(cpu, mce_device_initialized);
2266        per_cpu(mce_device, cpu) = NULL;
2267}
2268
2269/* Make sure there are no machine checks on offlined CPUs. */
2270static void mce_disable_cpu(void)
2271{
2272        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2273                return;
2274
2275        if (!cpuhp_tasks_frozen)
2276                cmci_clear();
2277
2278        vendor_disable_error_reporting();
2279}
2280
2281static void mce_reenable_cpu(void)
2282{
2283        int i;
2284
2285        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2286                return;
2287
2288        if (!cpuhp_tasks_frozen)
2289                cmci_reenable();
2290        for (i = 0; i < mca_cfg.banks; i++) {
2291                struct mce_bank *b = &mce_banks[i];
2292
2293                if (b->init)
2294                        wrmsrl(msr_ops.ctl(i), b->ctl);
2295        }
2296}
2297
2298static int mce_cpu_dead(unsigned int cpu)
2299{
2300        mce_intel_hcpu_update(cpu);
2301
2302        /* intentionally ignoring frozen here */
2303        if (!cpuhp_tasks_frozen)
2304                cmci_rediscover();
2305        return 0;
2306}
2307
2308static int mce_cpu_online(unsigned int cpu)
2309{
2310        struct timer_list *t = this_cpu_ptr(&mce_timer);
2311        int ret;
2312
2313        mce_device_create(cpu);
2314
2315        ret = mce_threshold_create_device(cpu);
2316        if (ret) {
2317                mce_device_remove(cpu);
2318                return ret;
2319        }
2320        mce_reenable_cpu();
2321        mce_start_timer(t);
2322        return 0;
2323}
2324
2325static int mce_cpu_pre_down(unsigned int cpu)
2326{
2327        struct timer_list *t = this_cpu_ptr(&mce_timer);
2328
2329        mce_disable_cpu();
2330        del_timer_sync(t);
2331        mce_threshold_remove_device(cpu);
2332        mce_device_remove(cpu);
2333        return 0;
2334}
2335
2336static __init void mce_init_banks(void)
2337{
2338        int i;
2339
2340        for (i = 0; i < mca_cfg.banks; i++) {
2341                struct mce_bank *b = &mce_banks[i];
2342                struct device_attribute *a = &b->attr;
2343
2344                sysfs_attr_init(&a->attr);
2345                a->attr.name    = b->attrname;
2346                snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2347
2348                a->attr.mode    = 0644;
2349                a->show         = show_bank;
2350                a->store        = set_bank;
2351        }
2352}
2353
2354static __init int mcheck_init_device(void)
2355{
2356        int err;
2357
2358        if (!mce_available(&boot_cpu_data)) {
2359                err = -EIO;
2360                goto err_out;
2361        }
2362
2363        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2364                err = -ENOMEM;
2365                goto err_out;
2366        }
2367
2368        mce_init_banks();
2369
2370        err = subsys_system_register(&mce_subsys, NULL);
2371        if (err)
2372                goto err_out_mem;
2373
2374        err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2375                                mce_cpu_dead);
2376        if (err)
2377                goto err_out_mem;
2378
2379        err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2380                                mce_cpu_online, mce_cpu_pre_down);
2381        if (err < 0)
2382                goto err_out_online;
2383
2384        register_syscore_ops(&mce_syscore_ops);
2385
2386        return 0;
2387
2388err_out_online:
2389        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2390
2391err_out_mem:
2392        free_cpumask_var(mce_device_initialized);
2393
2394err_out:
2395        pr_err("Unable to init MCE device (rc: %d)\n", err);
2396
2397        return err;
2398}
2399device_initcall_sync(mcheck_init_device);
2400
2401/*
2402 * Old style boot options parsing. Only for compatibility.
2403 */
2404static int __init mcheck_disable(char *str)
2405{
2406        mca_cfg.disabled = true;
2407        return 1;
2408}
2409__setup("nomce", mcheck_disable);
2410
2411#ifdef CONFIG_DEBUG_FS
2412struct dentry *mce_get_debugfs_dir(void)
2413{
2414        static struct dentry *dmce;
2415
2416        if (!dmce)
2417                dmce = debugfs_create_dir("mce", NULL);
2418
2419        return dmce;
2420}
2421
2422static void mce_reset(void)
2423{
2424        cpu_missing = 0;
2425        atomic_set(&mce_fake_panicked, 0);
2426        atomic_set(&mce_executing, 0);
2427        atomic_set(&mce_callin, 0);
2428        atomic_set(&global_nwo, 0);
2429}
2430
2431static int fake_panic_get(void *data, u64 *val)
2432{
2433        *val = fake_panic;
2434        return 0;
2435}
2436
2437static int fake_panic_set(void *data, u64 val)
2438{
2439        mce_reset();
2440        fake_panic = val;
2441        return 0;
2442}
2443
2444DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2445                        fake_panic_set, "%llu\n");
2446
2447static int __init mcheck_debugfs_init(void)
2448{
2449        struct dentry *dmce, *ffake_panic;
2450
2451        dmce = mce_get_debugfs_dir();
2452        if (!dmce)
2453                return -ENOMEM;
2454        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2455                                          &fake_panic_fops);
2456        if (!ffake_panic)
2457                return -ENOMEM;
2458
2459        return 0;
2460}
2461#else
2462static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2463#endif
2464
/*
 * Static key, initially false and exported to GPL modules. It is
 * incremented by mcheck_late_init() when mca_cfg.recovery is set
 * (boot option "mce=recovery").
 */
DEFINE_STATIC_KEY_FALSE(mcsafe_key);
EXPORT_SYMBOL_GPL(mcsafe_key);
2467
2468static int __init mcheck_late_init(void)
2469{
2470        if (mca_cfg.recovery)
2471                static_branch_inc(&mcsafe_key);
2472
2473        mcheck_debugfs_init();
2474        cec_init();
2475
2476        /*
2477         * Flush out everything that has been logged during early boot, now that
2478         * everything has been initialized (workqueues, decoders, ...).
2479         */
2480        mce_schedule_work();
2481
2482        return 0;
2483}
2484late_initcall(mcheck_late_init);
2485