linux/arch/x86/kernel/cpu/mcheck/mce.c
   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/thread_info.h>
  14#include <linux/capability.h>
  15#include <linux/miscdevice.h>
  16#include <linux/ratelimit.h>
  17#include <linux/rcupdate.h>
  18#include <linux/kobject.h>
  19#include <linux/uaccess.h>
  20#include <linux/kdebug.h>
  21#include <linux/kernel.h>
  22#include <linux/percpu.h>
  23#include <linux/string.h>
  24#include <linux/device.h>
  25#include <linux/syscore_ops.h>
  26#include <linux/delay.h>
  27#include <linux/ctype.h>
  28#include <linux/sched.h>
  29#include <linux/sysfs.h>
  30#include <linux/types.h>
  31#include <linux/slab.h>
  32#include <linux/init.h>
  33#include <linux/kmod.h>
  34#include <linux/poll.h>
  35#include <linux/nmi.h>
  36#include <linux/cpu.h>
  37#include <linux/ras.h>
  38#include <linux/smp.h>
  39#include <linux/fs.h>
  40#include <linux/mm.h>
  41#include <linux/debugfs.h>
  42#include <linux/irq_work.h>
  43#include <linux/export.h>
  44#include <linux/jump_label.h>
  45
  46#include <asm/intel-family.h>
  47#include <asm/processor.h>
  48#include <asm/traps.h>
  49#include <asm/tlbflush.h>
  50#include <asm/mce.h>
  51#include <asm/msr.h>
  52#include <asm/reboot.h>
  53#include <asm/set_memory.h>
  54
  55#include "mce-internal.h"
  56
  57static DEFINE_MUTEX(mce_log_mutex);
  58
  59/* sysfs synchronization */
  60static DEFINE_MUTEX(mce_sysfs_mutex);
  61
  62#define CREATE_TRACE_POINTS
  63#include <trace/events/mce.h>
  64
  65#define SPINUNIT                100     /* 100ns */
  66
  67DEFINE_PER_CPU(unsigned, mce_exception_count);
  68
  69struct mce_bank *mce_banks __read_mostly;
  70struct mce_vendor_flags mce_flags __read_mostly;
  71
  72struct mca_config mca_cfg __read_mostly = {
  73        .bootlog  = -1,
  74        /*
  75         * Tolerant levels:
  76         * 0: always panic on uncorrected errors, log corrected errors
  77         * 1: panic or SIGBUS on uncorrected errors, log corrected errors
  78         * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
  79         * 3: never panic or SIGBUS, log all errors (for testing only)
  80         */
  81        .tolerant = 1,
  82        .monarch_timeout = -1
  83};
  84
  85static DEFINE_PER_CPU(struct mce, mces_seen);
  86static unsigned long mce_need_notify;
  87static int cpu_missing;
  88
  89/*
   90 * MCA banks polled by the periodic polling timer for corrected events.
  91 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
  92 */
  93DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
  94        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
  95};
  96
  97/*
  98 * MCA banks controlled through firmware first for corrected errors.
  99 * This is a global list of banks for which we won't enable CMCI and we
 100 * won't poll. Firmware controls these banks and is responsible for
 101 * reporting corrected errors through GHES. Uncorrected/recoverable
 102 * errors are still notified through a machine check.
 103 */
 104mce_banks_t mce_banks_ce_disabled;
 105
 106static struct work_struct mce_work;
 107static struct irq_work mce_irq_work;
 108
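     /*
      * Optional per-CPU-model quirk, invoked from mce_no_way_out() for each
      * bank with a valid status. Currently only set to quirk_sandybridge_ifu()
      * further down in this file.
      */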
 109static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
 110
 111#ifndef mce_unmap_kpfn
 112static void mce_unmap_kpfn(unsigned long pfn);
 113#endif
 114
 115/*
 116 * CPU/chipset specific EDAC code can register a notifier call here to print
 117 * MCE errors in a human-readable form.
 118 */
 119BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 120
 121/* Do initial initialization of a struct mce */
 122void mce_setup(struct mce *m)
 123{
 124        memset(m, 0, sizeof(struct mce));
 125        m->cpu = m->extcpu = smp_processor_id();
 126        /* We hope get_seconds stays lockless */
 127        m->time = get_seconds();
 128        m->cpuvendor = boot_cpu_data.x86_vendor;
 129        m->cpuid = cpuid_eax(1);
 130        m->socketid = cpu_data(m->extcpu).phys_proc_id;
 131        m->apicid = cpu_data(m->extcpu).initial_apicid;
 132        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 133
 134        if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
 135                rdmsrl(MSR_PPIN, m->ppin);
 136
 137        m->microcode = boot_cpu_data.microcode;
 138}
 139
 140DEFINE_PER_CPU(struct mce, injectm);
 141EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 142
 143void mce_log(struct mce *m)
 144{
 145        if (!mce_gen_pool_add(m))
 146                irq_work_queue(&mce_irq_work);
 147}
 148
 149void mce_inject_log(struct mce *m)
 150{
 151        mutex_lock(&mce_log_mutex);
 152        mce_log(m);
 153        mutex_unlock(&mce_log_mutex);
 154}
 155EXPORT_SYMBOL_GPL(mce_inject_log);
 156
 157static struct notifier_block mce_srao_nb;
 158
 159/*
  160 * We run the default notifier only if the SRAO, the first and the default
  161 * notifier are the only ones registered, i.e. only the mandatory
  162 * NUM_DEFAULT_NOTIFIERS notifiers are on the chain.
 163 */
 164#define NUM_DEFAULT_NOTIFIERS   3
 165static atomic_t num_notifiers;
 166
 167void mce_register_decode_chain(struct notifier_block *nb)
 168{
 169        if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
 170                return;
 171
 172        atomic_inc(&num_notifiers);
 173
 174        blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 175}
 176EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 177
 178void mce_unregister_decode_chain(struct notifier_block *nb)
 179{
 180        atomic_dec(&num_notifiers);
 181
 182        blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 183}
 184EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
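     /*
      * Illustrative sketch only (my_decode and my_nb are made-up names): a
      * decoder such as an EDAC driver would typically hook in like this:
      *
      *    static int my_decode(struct notifier_block *nb, unsigned long val,
      *                         void *data)
      *    {
      *            struct mce *m = data;
      *
      *            if (!m)
      *                    return NOTIFY_DONE;
      *            ... decode and report m->status, m->addr, etc. ...
      *            return NOTIFY_OK;
      *    }
      *
      *    static struct notifier_block my_nb = {
      *            .notifier_call  = my_decode,
      *            .priority       = MCE_PRIO_EDAC,
      *    };
      *
      *    mce_register_decode_chain(&my_nb);
      */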
 185
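     /*
      * Per-bank MSR accessors. msr_ops below defaults to the legacy MCA
      * register layout; on SMCA-capable AMD CPUs the smca_*_reg() variants
      * are swapped in during early CPU init.
      */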
 186static inline u32 ctl_reg(int bank)
 187{
 188        return MSR_IA32_MCx_CTL(bank);
 189}
 190
 191static inline u32 status_reg(int bank)
 192{
 193        return MSR_IA32_MCx_STATUS(bank);
 194}
 195
 196static inline u32 addr_reg(int bank)
 197{
 198        return MSR_IA32_MCx_ADDR(bank);
 199}
 200
 201static inline u32 misc_reg(int bank)
 202{
 203        return MSR_IA32_MCx_MISC(bank);
 204}
 205
 206static inline u32 smca_ctl_reg(int bank)
 207{
 208        return MSR_AMD64_SMCA_MCx_CTL(bank);
 209}
 210
 211static inline u32 smca_status_reg(int bank)
 212{
 213        return MSR_AMD64_SMCA_MCx_STATUS(bank);
 214}
 215
 216static inline u32 smca_addr_reg(int bank)
 217{
 218        return MSR_AMD64_SMCA_MCx_ADDR(bank);
 219}
 220
 221static inline u32 smca_misc_reg(int bank)
 222{
 223        return MSR_AMD64_SMCA_MCx_MISC(bank);
 224}
 225
 226struct mca_msr_regs msr_ops = {
 227        .ctl    = ctl_reg,
 228        .status = status_reg,
 229        .addr   = addr_reg,
 230        .misc   = misc_reg
 231};
 232
 233static void __print_mce(struct mce *m)
 234{
 235        pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
 236                 m->extcpu,
 237                 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
 238                 m->mcgstatus, m->bank, m->status);
 239
 240        if (m->ip) {
 241                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 242                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 243                        m->cs, m->ip);
 244
 245                if (m->cs == __KERNEL_CS)
 246                        pr_cont("{%pS}", (void *)(unsigned long)m->ip);
 247                pr_cont("\n");
 248        }
 249
 250        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 251        if (m->addr)
 252                pr_cont("ADDR %llx ", m->addr);
 253        if (m->misc)
 254                pr_cont("MISC %llx ", m->misc);
 255
 256        if (mce_flags.smca) {
 257                if (m->synd)
 258                        pr_cont("SYND %llx ", m->synd);
 259                if (m->ipid)
 260                        pr_cont("IPID %llx ", m->ipid);
 261        }
 262
 263        pr_cont("\n");
 264        /*
 265         * Note this output is parsed by external tools and old fields
 266         * should not be changed.
 267         */
 268        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 269                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 270                m->microcode);
 271}
 272
 273static void print_mce(struct mce *m)
 274{
 275        __print_mce(m);
 276
 277        if (m->cpuvendor != X86_VENDOR_AMD)
 278                pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 279}
 280
 281#define PANIC_TIMEOUT 5 /* 5 seconds */
 282
 283static atomic_t mce_panicked;
 284
 285static int fake_panic;
 286static atomic_t mce_fake_panicked;
 287
 288/* Panic in progress. Enable interrupts and wait for final IPI */
 289static void wait_for_panic(void)
 290{
 291        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 292
 293        preempt_disable();
 294        local_irq_enable();
 295        while (timeout-- > 0)
 296                udelay(1);
 297        if (panic_timeout == 0)
 298                panic_timeout = mca_cfg.panic_timeout;
  299        panic("Panicking machine check CPU died");
 300}
 301
 302static void mce_panic(const char *msg, struct mce *final, char *exp)
 303{
 304        int apei_err = 0;
 305        struct llist_node *pending;
 306        struct mce_evt_llist *l;
 307
 308        if (!fake_panic) {
 309                /*
 310                 * Make sure only one CPU runs in machine check panic
 311                 */
 312                if (atomic_inc_return(&mce_panicked) > 1)
 313                        wait_for_panic();
 314                barrier();
 315
 316                bust_spinlocks(1);
 317                console_verbose();
 318        } else {
 319                /* Don't log too much for fake panic */
 320                if (atomic_inc_return(&mce_fake_panicked) > 1)
 321                        return;
 322        }
 323        pending = mce_gen_pool_prepare_records();
 324        /* First print corrected ones that are still unlogged */
 325        llist_for_each_entry(l, pending, llnode) {
 326                struct mce *m = &l->mce;
 327                if (!(m->status & MCI_STATUS_UC)) {
 328                        print_mce(m);
 329                        if (!apei_err)
 330                                apei_err = apei_write_mce(m);
 331                }
 332        }
 333        /* Now print uncorrected but with the final one last */
 334        llist_for_each_entry(l, pending, llnode) {
 335                struct mce *m = &l->mce;
 336                if (!(m->status & MCI_STATUS_UC))
 337                        continue;
 338                if (!final || mce_cmp(m, final)) {
 339                        print_mce(m);
 340                        if (!apei_err)
 341                                apei_err = apei_write_mce(m);
 342                }
 343        }
 344        if (final) {
 345                print_mce(final);
 346                if (!apei_err)
 347                        apei_err = apei_write_mce(final);
 348        }
 349        if (cpu_missing)
 350                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 351        if (exp)
 352                pr_emerg(HW_ERR "Machine check: %s\n", exp);
 353        if (!fake_panic) {
 354                if (panic_timeout == 0)
 355                        panic_timeout = mca_cfg.panic_timeout;
 356                panic(msg);
 357        } else
 358                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 359}
 360
 361/* Support code for software error injection */
 362
 363static int msr_to_offset(u32 msr)
 364{
 365        unsigned bank = __this_cpu_read(injectm.bank);
 366
 367        if (msr == mca_cfg.rip_msr)
 368                return offsetof(struct mce, ip);
 369        if (msr == msr_ops.status(bank))
 370                return offsetof(struct mce, status);
 371        if (msr == msr_ops.addr(bank))
 372                return offsetof(struct mce, addr);
 373        if (msr == msr_ops.misc(bank))
 374                return offsetof(struct mce, misc);
 375        if (msr == MSR_IA32_MCG_STATUS)
 376                return offsetof(struct mce, mcgstatus);
 377        return -1;
 378}
 379
 380/* MSR access wrappers used for error injection */
 381static u64 mce_rdmsrl(u32 msr)
 382{
 383        u64 v;
 384
 385        if (__this_cpu_read(injectm.finished)) {
 386                int offset = msr_to_offset(msr);
 387
 388                if (offset < 0)
 389                        return 0;
 390                return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
 391        }
 392
 393        if (rdmsrl_safe(msr, &v)) {
 394                WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
 395                /*
 396                 * Return zero in case the access faulted. This should
 397                 * not happen normally but can happen if the CPU does
 398                 * something weird, or if the code is buggy.
 399                 */
 400                v = 0;
 401        }
 402
 403        return v;
 404}
 405
 406static void mce_wrmsrl(u32 msr, u64 v)
 407{
 408        if (__this_cpu_read(injectm.finished)) {
 409                int offset = msr_to_offset(msr);
 410
 411                if (offset >= 0)
 412                        *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
 413                return;
 414        }
 415        wrmsrl(msr, v);
 416}
 417
 418/*
 419 * Collect all global (w.r.t. this processor) status about this machine
 420 * check into our "mce" struct so that we can use it later to assess
 421 * the severity of the problem as we read per-bank specific details.
 422 */
 423static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 424{
 425        mce_setup(m);
 426
 427        m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 428        if (regs) {
 429                /*
 430                 * Get the address of the instruction at the time of
 431                 * the machine check error.
 432                 */
 433                if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 434                        m->ip = regs->ip;
 435                        m->cs = regs->cs;
 436
 437                        /*
 438                         * When in VM86 mode make the cs look like ring 3
 439                         * always. This is a lie, but it's better than passing
 440                         * the additional vm86 bit around everywhere.
 441                         */
 442                        if (v8086_mode(regs))
 443                                m->cs |= 3;
 444                }
 445                /* Use accurate RIP reporting if available. */
 446                if (mca_cfg.rip_msr)
 447                        m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 448        }
 449}
 450
 451int mce_available(struct cpuinfo_x86 *c)
 452{
 453        if (mca_cfg.disabled)
 454                return 0;
 455        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 456}
 457
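     /* If there are queued MCE records, defer their processing to process context. */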
 458static void mce_schedule_work(void)
 459{
 460        if (!mce_gen_pool_empty())
 461                schedule_work(&mce_work);
 462}
 463
 464static void mce_irq_work_cb(struct irq_work *entry)
 465{
 466        mce_schedule_work();
 467}
 468
 469static void mce_report_event(struct pt_regs *regs)
 470{
 471        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 472                mce_notify_irq();
 473                /*
 474                 * Triggering the work queue here is just an insurance
 475                 * policy in case the syscall exit notify handler
 476                 * doesn't run soon enough or ends up running on the
 477                 * wrong CPU (can happen when audit sleeps)
 478                 */
 479                mce_schedule_work();
 480                return;
 481        }
 482
 483        irq_work_queue(&mce_irq_work);
 484}
 485
 486/*
 487 * Check if the address reported by the CPU is in a format we can parse.
 488 * It would be possible to add code for most other cases, but all would
 489 * be somewhat complicated (e.g. segment offset would require an instruction
  490 * parser). So only support physical addresses up to page granularity for now.
 491 */
 492static int mce_usable_address(struct mce *m)
 493{
 494        if (!(m->status & MCI_STATUS_ADDRV))
 495                return 0;
 496
 497        /* Checks after this one are Intel-specific: */
 498        if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
 499                return 1;
 500
 501        if (!(m->status & MCI_STATUS_MISCV))
 502                return 0;
 503
 504        if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 505                return 0;
 506
 507        if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 508                return 0;
 509
 510        return 1;
 511}
 512
 513bool mce_is_memory_error(struct mce *m)
 514{
 515        if (m->cpuvendor == X86_VENDOR_AMD) {
 516                return amd_mce_is_memory_error(m);
 517
 518        } else if (m->cpuvendor == X86_VENDOR_INTEL) {
 519                /*
 520                 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
 521                 *
 522                 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
 523                 * indicating a memory error. Bit 8 is used for indicating a
 524                 * cache hierarchy error. The combination of bit 2 and bit 3
 525                 * is used for indicating a `generic' cache hierarchy error
 526                 * But we can't just blindly check the above bits, because if
 527                 * bit 11 is set, then it is a bus/interconnect error - and
 528                 * either way the above bits just gives more detail on what
 529                 * bus/interconnect error happened. Note that bit 12 can be
 530                 * ignored, as it's the "filter" bit.
 531                 */
 532                return (m->status & 0xef80) == BIT(7) ||
 533                       (m->status & 0xef00) == BIT(8) ||
 534                       (m->status & 0xeffc) == 0xc;
 535        }
 536
 537        return false;
 538}
 539EXPORT_SYMBOL_GPL(mce_is_memory_error);
 540
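     /*
      * An error is correctable if it is neither marked uncorrected (UC) nor
      * an AMD deferred error.
      */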
 541static bool mce_is_correctable(struct mce *m)
 542{
 543        if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
 544                return false;
 545
 546        if (m->status & MCI_STATUS_UC)
 547                return false;
 548
 549        return true;
 550}
 551
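     /*
      * Feed correctable DRAM errors into the Correctable Errors Collector.
      * Returns true if the CEC consumed the record, in which case it is not
      * passed on to the rest of the notifier chain.
      */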
 552static bool cec_add_mce(struct mce *m)
 553{
 554        if (!m)
 555                return false;
 556
 557        /* We eat only correctable DRAM errors with usable addresses. */
 558        if (mce_is_memory_error(m) &&
 559            mce_is_correctable(m)  &&
 560            mce_usable_address(m))
 561                if (!cec_add_elem(m->addr >> PAGE_SHIFT))
 562                        return true;
 563
 564        return false;
 565}
 566
 567static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
 568                              void *data)
 569{
 570        struct mce *m = (struct mce *)data;
 571
 572        if (!m)
 573                return NOTIFY_DONE;
 574
 575        if (cec_add_mce(m))
 576                return NOTIFY_STOP;
 577
 578        /* Emit the trace record: */
 579        trace_mce_record(m);
 580
 581        set_bit(0, &mce_need_notify);
 582
 583        mce_notify_irq();
 584
 585        return NOTIFY_DONE;
 586}
 587
 588static struct notifier_block first_nb = {
 589        .notifier_call  = mce_first_notifier,
 590        .priority       = MCE_PRIO_FIRST,
 591};
 592
 593static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 594                                void *data)
 595{
 596        struct mce *mce = (struct mce *)data;
 597        unsigned long pfn;
 598
 599        if (!mce)
 600                return NOTIFY_DONE;
 601
 602        if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
 603                pfn = mce->addr >> PAGE_SHIFT;
 604                if (!memory_failure(pfn, 0))
 605                        mce_unmap_kpfn(pfn);
 606        }
 607
 608        return NOTIFY_OK;
 609}
 610static struct notifier_block mce_srao_nb = {
 611        .notifier_call  = srao_decode_notifier,
 612        .priority       = MCE_PRIO_SRAO,
 613};
 614
 615static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 616                                void *data)
 617{
 618        struct mce *m = (struct mce *)data;
 619
 620        if (!m)
 621                return NOTIFY_DONE;
 622
 623        if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 624                return NOTIFY_DONE;
 625
 626        __print_mce(m);
 627
 628        return NOTIFY_DONE;
 629}
 630
 631static struct notifier_block mce_default_nb = {
 632        .notifier_call  = mce_default_notifier,
 633        /* lowest prio, we want it to run last. */
 634        .priority       = MCE_PRIO_LOWEST,
 635};
 636
 637/*
 638 * Read ADDR and MISC registers.
 639 */
 640static void mce_read_aux(struct mce *m, int i)
 641{
 642        if (m->status & MCI_STATUS_MISCV)
 643                m->misc = mce_rdmsrl(msr_ops.misc(i));
 644
 645        if (m->status & MCI_STATUS_ADDRV) {
 646                m->addr = mce_rdmsrl(msr_ops.addr(i));
 647
 648                /*
 649                 * Mask the reported address by the reported granularity.
 650                 */
 651                if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
 652                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
 653                        m->addr >>= shift;
 654                        m->addr <<= shift;
 655                }
 656
 657                /*
 658                 * Extract [55:<lsb>] where lsb is the least significant
 659                 * *valid* bit of the address bits.
 660                 */
 661                if (mce_flags.smca) {
 662                        u8 lsb = (m->addr >> 56) & 0x3f;
 663
 664                        m->addr &= GENMASK_ULL(55, lsb);
 665                }
 666        }
 667
 668        if (mce_flags.smca) {
 669                m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
 670
 671                if (m->status & MCI_STATUS_SYNDV)
 672                        m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
 673        }
 674}
 675
 676DEFINE_PER_CPU(unsigned, mce_poll_count);
 677
 678/*
 679 * Poll for corrected events or events that happened before reset.
 680 * Those are just logged through /dev/mcelog.
 681 *
 682 * This is executed in standard interrupt context.
 683 *
  684 * Note: the spec recommends panicking for fatal unsignalled
  685 * errors here. However this would be quite problematic --
  686 * we would need to reimplement the Monarch handling and
  687 * it would mess up the exclusion between the exception handler
  688 * and the poll handler -- so we skip this for now.
  689 * These cases should not happen anyway, or only when the CPU
  690 * is already totally confused. In this case it's likely it will
  691 * not fully execute the machine check handler either.
 692 */
 693bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 694{
 695        bool error_seen = false;
 696        struct mce m;
 697        int i;
 698
 699        this_cpu_inc(mce_poll_count);
 700
 701        mce_gather_info(&m, NULL);
 702
 703        if (flags & MCP_TIMESTAMP)
 704                m.tsc = rdtsc();
 705
 706        for (i = 0; i < mca_cfg.banks; i++) {
 707                if (!mce_banks[i].ctl || !test_bit(i, *b))
 708                        continue;
 709
 710                m.misc = 0;
 711                m.addr = 0;
 712                m.bank = i;
 713
 714                barrier();
 715                m.status = mce_rdmsrl(msr_ops.status(i));
 716                if (!(m.status & MCI_STATUS_VAL))
 717                        continue;
 718
 719                /*
 720                 * Uncorrected or signalled events are handled by the exception
 721                 * handler when it is enabled, so don't process those here.
 722                 *
 723                 * TBD do the same check for MCI_STATUS_EN here?
 724                 */
 725                if (!(flags & MCP_UC) &&
 726                    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 727                        continue;
 728
 729                error_seen = true;
 730
 731                mce_read_aux(&m, i);
 732
 733                m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 734
 735                /*
 736                 * Don't get the IP here because it's unlikely to
 737                 * have anything to do with the actual error location.
 738                 */
 739                if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 740                        mce_log(&m);
 741                else if (mce_usable_address(&m)) {
 742                        /*
 743                         * Although we skipped logging this, we still want
 744                         * to take action. Add to the pool so the registered
 745                         * notifiers will see it.
 746                         */
 747                        if (!mce_gen_pool_add(&m))
 748                                mce_schedule_work();
 749                }
 750
 751                /*
 752                 * Clear state for this bank.
 753                 */
 754                mce_wrmsrl(msr_ops.status(i), 0);
 755        }
 756
 757        /*
 758         * Don't clear MCG_STATUS here because it's only defined for
 759         * exceptions.
 760         */
 761
 762        sync_core();
 763
 764        return error_seen;
 765}
 766EXPORT_SYMBOL_GPL(machine_check_poll);
 767
 768/*
 769 * Do a quick check if any of the events requires a panic.
 770 * This decides if we keep the events around or clear them.
 771 */
 772static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 773                          struct pt_regs *regs)
 774{
 775        int i, ret = 0;
 776        char *tmp;
 777
 778        for (i = 0; i < mca_cfg.banks; i++) {
 779                m->status = mce_rdmsrl(msr_ops.status(i));
 780                if (m->status & MCI_STATUS_VAL) {
 781                        __set_bit(i, validp);
 782                        if (quirk_no_way_out)
 783                                quirk_no_way_out(i, m, regs);
 784                }
 785
 786                if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
 787                        *msg = tmp;
 788                        ret = 1;
 789                }
 790        }
 791        return ret;
 792}
 793
 794/*
 795 * Variable to establish order between CPUs while scanning.
  796 * Each CPU spins initially until mce_executing equals its number.
 797 */
 798static atomic_t mce_executing;
 799
 800/*
 801 * Defines order of CPUs on entry. First CPU becomes Monarch.
 802 */
 803static atomic_t mce_callin;
 804
 805/*
 806 * Check if a timeout waiting for other CPUs happened.
 807 */
 808static int mce_timed_out(u64 *t, const char *msg)
 809{
 810        /*
 811         * The others already did panic for some reason.
 812         * Bail out like in a timeout.
 813         * rmb() to tell the compiler that system_state
 814         * might have been modified by someone else.
 815         */
 816        rmb();
 817        if (atomic_read(&mce_panicked))
 818                wait_for_panic();
 819        if (!mca_cfg.monarch_timeout)
 820                goto out;
 821        if ((s64)*t < SPINUNIT) {
 822                if (mca_cfg.tolerant <= 1)
 823                        mce_panic(msg, NULL, NULL);
 824                cpu_missing = 1;
 825                return 1;
 826        }
 827        *t -= SPINUNIT;
 828out:
 829        touch_nmi_watchdog();
 830        return 0;
 831}
 832
 833/*
 834 * The Monarch's reign.  The Monarch is the CPU who entered
 835 * the machine check handler first. It waits for the others to
 836 * raise the exception too and then grades them. When any
 837 * error is fatal panic. Only then let the others continue.
 838 *
 839 * The other CPUs entering the MCE handler will be controlled by the
 840 * Monarch. They are called Subjects.
 841 *
  842 * This way we prevent any potential data corruption in an unrecoverable case
  843 * and also make sure that all CPUs' errors are always examined.
 844 *
  845 * Also this detects the case of a machine check event coming from outer
  846 * space (not detected by any CPU). In this case some external agent wants
 847 * us to shut down, so panic too.
 848 *
 849 * The other CPUs might still decide to panic if the handler happens
  850 * in an unrecoverable place, but in this case the system is in a semi-stable
 851 * state and won't corrupt anything by itself. It's ok to let the others
 852 * continue for a bit first.
 853 *
 854 * All the spin loops have timeouts; when a timeout happens a CPU
 855 * typically elects itself to be Monarch.
 856 */
 857static void mce_reign(void)
 858{
 859        int cpu;
 860        struct mce *m = NULL;
 861        int global_worst = 0;
 862        char *msg = NULL;
 863        char *nmsg = NULL;
 864
 865        /*
 866         * This CPU is the Monarch and the other CPUs have run
 867         * through their handlers.
 868         * Grade the severity of the errors of all the CPUs.
 869         */
 870        for_each_possible_cpu(cpu) {
 871                int severity = mce_severity(&per_cpu(mces_seen, cpu),
 872                                            mca_cfg.tolerant,
 873                                            &nmsg, true);
 874                if (severity > global_worst) {
 875                        msg = nmsg;
 876                        global_worst = severity;
 877                        m = &per_cpu(mces_seen, cpu);
 878                }
 879        }
 880
 881        /*
 882         * Cannot recover? Panic here then.
 883         * This dumps all the mces in the log buffer and stops the
 884         * other CPUs.
 885         */
 886        if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 887                mce_panic("Fatal machine check", m, msg);
 888
 889        /*
  890 * For a UC error somewhere, we let the CPU that detects it handle it.
  891 * We must also let the others continue, otherwise the handling
  892 * CPU could deadlock on a lock.
 893         */
 894
 895        /*
 896         * No machine check event found. Must be some external
 897         * source or one CPU is hung. Panic.
 898         */
 899        if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
 900                mce_panic("Fatal machine check from unknown source", NULL, NULL);
 901
 902        /*
 903         * Now clear all the mces_seen so that they don't reappear on
 904         * the next mce.
 905         */
 906        for_each_possible_cpu(cpu)
 907                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 908}
 909
 910static atomic_t global_nwo;
 911
 912/*
 913 * Start of Monarch synchronization. This waits until all CPUs have
 914 * entered the exception handler and then determines if any of them
  915 * saw a fatal event that requires a panic. Then the CPUs run their
  916 * scanning loops in the entry order.
 917 * TBD double check parallel CPU hotunplug
 918 */
 919static int mce_start(int *no_way_out)
 920{
 921        int order;
 922        int cpus = num_online_cpus();
 923        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 924
 925        if (!timeout)
 926                return -1;
 927
 928        atomic_add(*no_way_out, &global_nwo);
 929        /*
 930         * Rely on the implied barrier below, such that global_nwo
 931         * is updated before mce_callin.
 932         */
 933        order = atomic_inc_return(&mce_callin);
 934
 935        /*
 936         * Wait for everyone.
 937         */
 938        while (atomic_read(&mce_callin) != cpus) {
 939                if (mce_timed_out(&timeout,
 940                                  "Timeout: Not all CPUs entered broadcast exception handler")) {
 941                        atomic_set(&global_nwo, 0);
 942                        return -1;
 943                }
 944                ndelay(SPINUNIT);
 945        }
 946
 947        /*
 948         * mce_callin should be read before global_nwo
 949         */
 950        smp_rmb();
 951
 952        if (order == 1) {
 953                /*
 954                 * Monarch: Starts executing now, the others wait.
 955                 */
 956                atomic_set(&mce_executing, 1);
 957        } else {
 958                /*
 959                 * Subject: Now start the scanning loop one by one in
 960                 * the original callin order.
  961                 * This way any shared banks will only be seen by one CPU
  962                 * before being cleared, avoiding duplicates.
 963                 */
 964                while (atomic_read(&mce_executing) < order) {
 965                        if (mce_timed_out(&timeout,
 966                                          "Timeout: Subject CPUs unable to finish machine check processing")) {
 967                                atomic_set(&global_nwo, 0);
 968                                return -1;
 969                        }
 970                        ndelay(SPINUNIT);
 971                }
 972        }
 973
 974        /*
 975         * Cache the global no_way_out state.
 976         */
 977        *no_way_out = atomic_read(&global_nwo);
 978
 979        return order;
 980}
 981
 982/*
 983 * Synchronize between CPUs after main scanning loop.
 984 * This invokes the bulk of the Monarch processing.
 985 */
 986static int mce_end(int order)
 987{
 988        int ret = -1;
 989        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 990
 991        if (!timeout)
 992                goto reset;
 993        if (order < 0)
 994                goto reset;
 995
 996        /*
 997         * Allow others to run.
 998         */
 999        atomic_inc(&mce_executing);
1000
1001        if (order == 1) {
1002                /* CHECKME: Can this race with a parallel hotplug? */
1003                int cpus = num_online_cpus();
1004
1005                /*
1006                 * Monarch: Wait for everyone to go through their scanning
1007                 * loops.
1008                 */
1009                while (atomic_read(&mce_executing) <= cpus) {
1010                        if (mce_timed_out(&timeout,
1011                                          "Timeout: Monarch CPU unable to finish machine check processing"))
1012                                goto reset;
1013                        ndelay(SPINUNIT);
1014                }
1015
1016                mce_reign();
1017                barrier();
1018                ret = 0;
1019        } else {
1020                /*
1021                 * Subject: Wait for Monarch to finish.
1022                 */
1023                while (atomic_read(&mce_executing) != 0) {
1024                        if (mce_timed_out(&timeout,
1025                                          "Timeout: Monarch CPU did not finish machine check processing"))
1026                                goto reset;
1027                        ndelay(SPINUNIT);
1028                }
1029
1030                /*
1031                 * Don't reset anything. That's done by the Monarch.
1032                 */
1033                return 0;
1034        }
1035
1036        /*
1037         * Reset all global state.
1038         */
1039reset:
1040        atomic_set(&global_nwo, 0);
1041        atomic_set(&mce_callin, 0);
1042        barrier();
1043
1044        /*
1045         * Let others run again.
1046         */
1047        atomic_set(&mce_executing, 0);
1048        return ret;
1049}
1050
1051static void mce_clear_state(unsigned long *toclear)
1052{
1053        int i;
1054
1055        for (i = 0; i < mca_cfg.banks; i++) {
1056                if (test_bit(i, toclear))
1057                        mce_wrmsrl(msr_ops.status(i), 0);
1058        }
1059}
1060
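     /*
      * Try to recover from an uncorrected memory error hit during a user
      * access by offlining the page; on success also drop the kernel 1:1
      * mapping of the poisoned pfn. Returns nonzero if recovery failed.
      */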
1061static int do_memory_failure(struct mce *m)
1062{
1063        int flags = MF_ACTION_REQUIRED;
1064        int ret;
1065
 1066        pr_err("Uncorrected hardware memory error in user-access at %llx\n", m->addr);
1067        if (!(m->mcgstatus & MCG_STATUS_RIPV))
1068                flags |= MF_MUST_KILL;
1069        ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
1070        if (ret)
 1071                pr_err("Memory error not recovered\n");
1072        else
1073                mce_unmap_kpfn(m->addr >> PAGE_SHIFT);
1074        return ret;
1075}
1076
1077#ifndef mce_unmap_kpfn
1078static void mce_unmap_kpfn(unsigned long pfn)
1079{
1080        unsigned long decoy_addr;
1081
1082        /*
1083         * Unmap this page from the kernel 1:1 mappings to make sure
1084         * we don't log more errors because of speculative access to
1085         * the page.
1086         * We would like to just call:
1087         *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
1088         * but doing that would radically increase the odds of a
1089         * speculative access to the poison page because we'd have
1090         * the virtual address of the kernel 1:1 mapping sitting
1091         * around in registers.
1092         * Instead we get tricky.  We create a non-canonical address
1093         * that looks just like the one we want, but has bit 63 flipped.
1094         * This relies on set_memory_np() not checking whether we passed
1095         * a legal address.
1096         */
1097
1098        decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
1099
1100        if (set_memory_np(decoy_addr, 1))
1101                pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
1102}
1103#endif
1104
1105/*
1106 * The actual machine check handler. This only handles real
1107 * exceptions when something got corrupted coming in through int 18.
1108 *
1109 * This is executed in NMI context not subject to normal locking rules. This
1110 * implies that most kernel services cannot be safely used. Don't even
1111 * think about putting a printk in there!
1112 *
1113 * On Intel systems this is entered on all CPUs in parallel through
1114 * MCE broadcast. However some CPUs might be broken beyond repair,
 1115 * so always be careful when synchronizing with others.
1116 */
1117void do_machine_check(struct pt_regs *regs, long error_code)
1118{
1119        struct mca_config *cfg = &mca_cfg;
1120        struct mce m, *final;
1121        int i;
1122        int worst = 0;
1123        int severity;
1124
1125        /*
1126         * Establish sequential order between the CPUs entering the machine
1127         * check handler.
1128         */
1129        int order = -1;
1130        /*
1131         * If no_way_out gets set, there is no safe way to recover from this
1132         * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1133         */
1134        int no_way_out = 0;
1135        /*
 1136         * If kill_it gets set, we cannot safely return to the interrupted
 1137         * context; if that context was user mode, the task gets SIGBUS.
1138         */
1139        int kill_it = 0;
1140        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1141        DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1142        char *msg = "Unknown";
1143
1144        /*
1145         * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
1146         * on Intel.
1147         */
1148        int lmce = 1;
1149        int cpu = smp_processor_id();
1150
1151        /*
1152         * Cases where we avoid rendezvous handler timeout:
1153         * 1) If this CPU is offline.
1154         *
1155         * 2) If crashing_cpu was set, e.g. we're entering kdump and we need to
1156         *  skip those CPUs which remain looping in the 1st kernel - see
1157         *  crash_nmi_callback().
1158         *
1159         * Note: there still is a small window between kexec-ing and the new,
1160         * kdump kernel establishing a new #MC handler where a broadcasted MCE
1161         * might not get handled properly.
1162         */
1163        if (cpu_is_offline(cpu) ||
1164            (crashing_cpu != -1 && crashing_cpu != cpu)) {
1165                u64 mcgstatus;
1166
1167                mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
1168                if (mcgstatus & MCG_STATUS_RIPV) {
1169                        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1170                        return;
1171                }
1172        }
1173
1174        ist_enter(regs);
1175
1176        this_cpu_inc(mce_exception_count);
1177
1178        if (!cfg->banks)
1179                goto out;
1180
1181        mce_gather_info(&m, regs);
1182        m.tsc = rdtsc();
1183
1184        final = this_cpu_ptr(&mces_seen);
1185        *final = m;
1186
1187        memset(valid_banks, 0, sizeof(valid_banks));
1188        no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1189
1190        barrier();
1191
1192        /*
 1193         * If there is no restart IP, we might need to kill or panic.
1194         * Assume the worst for now, but if we find the
1195         * severity is MCE_AR_SEVERITY we have other options.
1196         */
1197        if (!(m.mcgstatus & MCG_STATUS_RIPV))
1198                kill_it = 1;
1199
1200        /*
1201         * Check if this MCE is signaled to only this logical processor,
1202         * on Intel only.
1203         */
1204        if (m.cpuvendor == X86_VENDOR_INTEL)
1205                lmce = m.mcgstatus & MCG_STATUS_LMCES;
1206
1207        /*
1208         * Go through all banks in exclusion of the other CPUs. This way we
1209         * don't report duplicated events on shared banks because the first one
1210         * to see it will clear it. If this is a Local MCE, then no need to
1211         * perform rendezvous.
1212         */
1213        if (!lmce)
1214                order = mce_start(&no_way_out);
1215
1216        for (i = 0; i < cfg->banks; i++) {
1217                __clear_bit(i, toclear);
1218                if (!test_bit(i, valid_banks))
1219                        continue;
1220                if (!mce_banks[i].ctl)
1221                        continue;
1222
1223                m.misc = 0;
1224                m.addr = 0;
1225                m.bank = i;
1226
1227                m.status = mce_rdmsrl(msr_ops.status(i));
1228                if ((m.status & MCI_STATUS_VAL) == 0)
1229                        continue;
1230
1231                /*
 1232                 * Corrected or non-signalled errors are handled by
 1233                 * machine_check_poll(). Leave them alone, unless this panics.
1234                 */
1235                if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1236                        !no_way_out)
1237                        continue;
1238
1239                /*
1240                 * Set taint even when machine check was not enabled.
1241                 */
1242                add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1243
1244                severity = mce_severity(&m, cfg->tolerant, NULL, true);
1245
1246                /*
 1247                 * When the machine check was for the corrected/deferred
 1248                 * handler, don't touch it unless we're panicking.
1249                 */
1250                if ((severity == MCE_KEEP_SEVERITY ||
1251                     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1252                        continue;
1253                __set_bit(i, toclear);
1254                if (severity == MCE_NO_SEVERITY) {
1255                        /*
1256                         * Machine check event was not enabled. Clear, but
1257                         * ignore.
1258                         */
1259                        continue;
1260                }
1261
1262                mce_read_aux(&m, i);
1263
1264                /* assuming valid severity level != 0 */
1265                m.severity = severity;
1266
1267                mce_log(&m);
1268
1269                if (severity > worst) {
1270                        *final = m;
1271                        worst = severity;
1272                }
1273        }
1274
1275        /* mce_clear_state will clear *final, save locally for use later */
1276        m = *final;
1277
1278        if (!no_way_out)
1279                mce_clear_state(toclear);
1280
1281        /*
1282         * Do most of the synchronization with other CPUs.
1283         * When there's any problem use only local no_way_out state.
1284         */
1285        if (!lmce) {
1286                if (mce_end(order) < 0)
1287                        no_way_out = worst >= MCE_PANIC_SEVERITY;
1288        } else {
1289                /*
 1290                 * A local MCE skipped calling mce_reign().
1291                 * If we found a fatal error, we need to panic here.
1292                 */
1293                 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
1294                        mce_panic("Machine check from unknown source",
1295                                NULL, NULL);
1296        }
1297
1298        /*
1299         * If tolerant is at an insane level we drop requests to kill
1300         * processes and continue even when there is no way out.
1301         */
1302        if (cfg->tolerant == 3)
1303                kill_it = 0;
1304        else if (no_way_out)
1305                mce_panic("Fatal machine check on current CPU", &m, msg);
1306
1307        if (worst > 0)
1308                mce_report_event(regs);
1309        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1310out:
1311        sync_core();
1312
1313        if (worst != MCE_AR_SEVERITY && !kill_it)
1314                goto out_ist;
1315
1316        /* Fault was in user mode and we need to take some action */
1317        if ((m.cs & 3) == 3) {
1318                ist_begin_non_atomic(regs);
1319                local_irq_enable();
1320
1321                if (kill_it || do_memory_failure(&m))
1322                        force_sig(SIGBUS, current);
1323                local_irq_disable();
1324                ist_end_non_atomic();
1325        } else {
1326                if (!fixup_exception(regs, X86_TRAP_MC))
1327                        mce_panic("Failed kernel mode recovery", &m, NULL);
1328        }
1329
1330out_ist:
1331        ist_exit(regs);
1332}
1333EXPORT_SYMBOL_GPL(do_machine_check);
1334
1335#ifndef CONFIG_MEMORY_FAILURE
1336int memory_failure(unsigned long pfn, int flags)
1337{
1338        /* mce_severity() should not hand us an ACTION_REQUIRED error */
1339        BUG_ON(flags & MF_ACTION_REQUIRED);
1340        pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1341               "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1342               pfn);
1343
1344        return 0;
1345}
1346#endif
1347
1348/*
1349 * Periodic polling timer for "silent" machine check errors.  If the
1350 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1351 * errors, poll 2x slower (up to check_interval seconds).
1352 */
1353static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1354
1355static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1356static DEFINE_PER_CPU(struct timer_list, mce_timer);
1357
1358static unsigned long mce_adjust_timer_default(unsigned long interval)
1359{
1360        return interval;
1361}
1362
1363static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1364
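     /*
      * (Re)arm this CPU's polling timer so that it fires in roughly @interval
      * jiffies from now, unless an earlier expiry is already pending.
      */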
1365static void __start_timer(struct timer_list *t, unsigned long interval)
1366{
1367        unsigned long when = jiffies + interval;
1368        unsigned long flags;
1369
1370        local_irq_save(flags);
1371
1372        if (!timer_pending(t) || time_before(when, t->expires))
1373                mod_timer(t, round_jiffies(when));
1374
1375        local_irq_restore(flags);
1376}
1377
1378static void mce_timer_fn(struct timer_list *t)
1379{
1380        struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
1381        unsigned long iv;
1382
1383        WARN_ON(cpu_t != t);
1384
1385        iv = __this_cpu_read(mce_next_interval);
1386
1387        if (mce_available(this_cpu_ptr(&cpu_info))) {
1388                machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
1389
1390                if (mce_intel_cmci_poll()) {
1391                        iv = mce_adjust_timer(iv);
1392                        goto done;
1393                }
1394        }
1395
1396        /*
1397         * Alert userspace if needed. If we logged an MCE, reduce the polling
1398         * interval, otherwise increase the polling interval.
1399         */
1400        if (mce_notify_irq())
1401                iv = max(iv / 2, (unsigned long) HZ/100);
1402        else
1403                iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1404
1405done:
1406        __this_cpu_write(mce_next_interval, iv);
1407        __start_timer(t, iv);
1408}
1409
1410/*
1411 * Ensure that the timer is firing in @interval from now.
1412 */
1413void mce_timer_kick(unsigned long interval)
1414{
1415        struct timer_list *t = this_cpu_ptr(&mce_timer);
1416        unsigned long iv = __this_cpu_read(mce_next_interval);
1417
1418        __start_timer(t, interval);
1419
1420        if (interval < iv)
1421                __this_cpu_write(mce_next_interval, interval);
1422}
1423
1424/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1425static void mce_timer_delete_all(void)
1426{
1427        int cpu;
1428
1429        for_each_online_cpu(cpu)
1430                del_timer_sync(&per_cpu(mce_timer, cpu));
1431}
1432
1433/*
1434 * Notify the user(s) about new machine check events.
1435 * Can be called from interrupt context, but not from machine check/NMI
1436 * context.
1437 */
1438int mce_notify_irq(void)
1439{
1440        /* Not more than two messages every minute */
1441        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1442
1443        if (test_and_clear_bit(0, &mce_need_notify)) {
1444                mce_work_trigger();
1445
1446                if (__ratelimit(&ratelimit))
1447                        pr_info(HW_ERR "Machine check events logged\n");
1448
1449                return 1;
1450        }
1451        return 0;
1452}
1453EXPORT_SYMBOL_GPL(mce_notify_irq);
1454
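     /*
      * Allocate the global mce_banks[] array (done once) and default every
      * bank to fully enabled.
      */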
1455static int __mcheck_cpu_mce_banks_init(void)
1456{
1457        int i;
1458        u8 num_banks = mca_cfg.banks;
1459
1460        mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1461        if (!mce_banks)
1462                return -ENOMEM;
1463
1464        for (i = 0; i < num_banks; i++) {
1465                struct mce_bank *b = &mce_banks[i];
1466
1467                b->ctl = -1ULL;
1468                b->init = 1;
1469        }
1470        return 0;
1471}
1472
1473/*
1474 * Initialize Machine Checks for a CPU.
1475 */
1476static int __mcheck_cpu_cap_init(void)
1477{
1478        unsigned b;
1479        u64 cap;
1480
1481        rdmsrl(MSR_IA32_MCG_CAP, cap);
1482
1483        b = cap & MCG_BANKCNT_MASK;
1484        if (!mca_cfg.banks)
1485                pr_info("CPU supports %d MCE banks\n", b);
1486
1487        if (b > MAX_NR_BANKS) {
1488                pr_warn("Using only %u machine check banks out of %u\n",
1489                        MAX_NR_BANKS, b);
1490                b = MAX_NR_BANKS;
1491        }
1492
1493        /* Don't support asymmetric configurations today */
1494        WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1495        mca_cfg.banks = b;
1496
1497        if (!mce_banks) {
1498                int err = __mcheck_cpu_mce_banks_init();
1499
1500                if (err)
1501                        return err;
1502        }
1503
1504        /* Use accurate RIP reporting if available. */
1505        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1506                mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1507
1508        if (cap & MCG_SER_P)
1509                mca_cfg.ser = 1;
1510
1511        return 0;
1512}
1513
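     /*
      * Vendor-independent per-CPU MCA setup: log whatever was left over from
      * before this kernel took over, then enable machine checking via
      * CR4.MCE and, if present, MCG_CTL.
      */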
1514static void __mcheck_cpu_init_generic(void)
1515{
1516        enum mcp_flags m_fl = 0;
1517        mce_banks_t all_banks;
1518        u64 cap;
1519
1520        if (!mca_cfg.bootlog)
1521                m_fl = MCP_DONTLOG;
1522
1523        /*
1524         * Log the machine checks left over from the previous reset.
1525         */
1526        bitmap_fill(all_banks, MAX_NR_BANKS);
1527        machine_check_poll(MCP_UC | m_fl, &all_banks);
1528
1529        cr4_set_bits(X86_CR4_MCE);
1530
1531        rdmsrl(MSR_IA32_MCG_CAP, cap);
1532        if (cap & MCG_CTL_P)
1533                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1534}
1535
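     /*
      * Program each bank's CTL register with its configured mask and clear
      * stale status, skipping banks marked as not to be initialized.
      */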
1536static void __mcheck_cpu_init_clear_banks(void)
1537{
1538        int i;
1539
1540        for (i = 0; i < mca_cfg.banks; i++) {
1541                struct mce_bank *b = &mce_banks[i];
1542
1543                if (!b->init)
1544                        continue;
1545                wrmsrl(msr_ops.ctl(i), b->ctl);
1546                wrmsrl(msr_ops.status(i), 0);
1547        }
1548}
1549
1550/*
1551 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1552 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1553 * Vol 3B Table 15-20). But this confuses both the code that determines
1554 * whether the machine check occurred in kernel or user mode, and also
1555 * the severity assessment code. Pretend that EIPV was set, and take the
1556 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1557 */
1558static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1559{
1560        if (bank != 0)
1561                return;
1562        if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1563                return;
1564        if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1565                          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1566                          MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1567                          MCACOD)) !=
1568                         (MCI_STATUS_UC|MCI_STATUS_EN|
1569                          MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1570                          MCI_STATUS_AR|MCACOD_INSTR))
1571                return;
1572
1573        m->mcgstatus |= MCG_STATUS_EIPV;
1574        m->ip = regs->ip;
1575        m->cs = regs->cs;
1576}
1577
1578/* Add per CPU specific workarounds here */
1579static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1580{
1581        struct mca_config *cfg = &mca_cfg;
1582
1583        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1584                pr_info("unknown CPU type - not enabling MCE support\n");
1585                return -EOPNOTSUPP;
1586        }
1587
1588        /* This should be disabled by the BIOS, but isn't always */
1589        if (c->x86_vendor == X86_VENDOR_AMD) {
1590                if (c->x86 == 15 && cfg->banks > 4) {
1591                        /*
1592                         * disable GART TBL walk error reporting, which
1593                         * trips off incorrectly with the IOMMU & 3ware
1594                         * & Cerberus:
1595                         */
1596                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1597                }
1598                if (c->x86 < 0x11 && cfg->bootlog < 0) {
1599                        /*
 1600                         * Lots of broken BIOSes around that don't clear them
1601                         * by default and leave crap in there. Don't log:
1602                         */
1603                        cfg->bootlog = 0;
1604                }
1605                /*
1606                 * Various K7s with broken bank 0 around. Always disable
1607                 * by default.
1608                 */
1609                if (c->x86 == 6 && cfg->banks > 0)
1610                        mce_banks[0].ctl = 0;
1611
1612                /*
1613                 * overflow_recov is supported for F15h Models 00h-0fh
1614                 * even though we don't have a CPUID bit for it.
1615                 */
1616                if (c->x86 == 0x15 && c->x86_model <= 0xf)
1617                        mce_flags.overflow_recov = 1;
1618
1619                /*
1620                 * Turn off MC4_MISC thresholding banks on those models since
1621                 * they're not supported there.
1622                 */
1623                if (c->x86 == 0x15 &&
1624                    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1625                        int i;
1626                        u64 hwcr;
1627                        bool need_toggle;
1628                        u32 msrs[] = {
1629                                0x00000413, /* MC4_MISC0 */
1630                                0xc0000408, /* MC4_MISC1 */
1631                        };
1632
1633                        rdmsrl(MSR_K7_HWCR, hwcr);
1634
1635                        /* McStatusWrEn has to be set */
1636                        need_toggle = !(hwcr & BIT(18));
1637
1638                        if (need_toggle)
1639                                wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1640
1641                        /* Clear CntP bit safely */
1642                        for (i = 0; i < ARRAY_SIZE(msrs); i++)
1643                                msr_clear_bit(msrs[i], 62);
1644
1645                        /* restore old settings */
1646                        if (need_toggle)
1647                                wrmsrl(MSR_K7_HWCR, hwcr);
1648                }
1649        }
1650
1651        if (c->x86_vendor == X86_VENDOR_INTEL) {
1652                 * The SDM documents that on family 6, bank 0 should not be
1653                 * written because it aliases to another special BIOS-controlled
1654                 * register.
1655                 * It is no longer aliased on model 0x1a and later, though.
1656                 * Don't ignore bank 0 completely because there could be a
1657                 * valid event later; merely don't write CTL0.
1658                 * valid event later, merely don't write CTL0.
1659                 */
1660
1661                if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1662                        mce_banks[0].init = 0;
1663
1664                /*
1665                 * All newer Intel systems support MCE broadcasting. Enable
1666                 * synchronization with a one-second timeout.
1667                 */
1668                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1669                        cfg->monarch_timeout < 0)
1670                        cfg->monarch_timeout = USEC_PER_SEC;
1671
1672                /*
1673                 * There are also broken BIOSes on some Pentium M and
1674                 * earlier systems:
1675                 */
1676                if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1677                        cfg->bootlog = 0;
1678
1679                if (c->x86 == 6 && c->x86_model == 45)
1680                        quirk_no_way_out = quirk_sandybridge_ifu;
1681        }
1682        if (cfg->monarch_timeout < 0)
1683                cfg->monarch_timeout = 0;
1684        if (cfg->bootlog != 0)
1685                cfg->panic_timeout = 30;
1686
1687        return 0;
1688}
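
/*
 * Worked example of the quirk logic above (illustrative only): on a
 * Sandy Bridge-EP system (family 6, model 45 == 0x2d) booted with no
 * mce= option, bank 0 keeps its init flag (model >= 0x1a), monarch_timeout
 * goes from its -1 default to USEC_PER_SEC, bootlog stays at -1 so
 * panic_timeout becomes 30, and quirk_sandybridge_ifu is installed as
 * quirk_no_way_out.
 */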
1689
1690static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1691{
1692        if (c->x86 != 5)
1693                return 0;
1694
1695        switch (c->x86_vendor) {
1696        case X86_VENDOR_INTEL:
1697                intel_p5_mcheck_init(c);
1698                return 1;
1700        case X86_VENDOR_CENTAUR:
1701                winchip_mcheck_init(c);
1702                return 1;
1704        default:
1705                return 0;
1706        }
1707
1708        return 0;
1709}
1710
1711/*
1712 * Init basic CPU features needed for early decoding of MCEs.
1713 */
1714static void __mcheck_cpu_init_early(struct cpuinfo_x86 *c)
1715{
1716        if (c->x86_vendor == X86_VENDOR_AMD) {
1717                mce_flags.overflow_recov = !!cpu_has(c, X86_FEATURE_OVERFLOW_RECOV);
1718                mce_flags.succor         = !!cpu_has(c, X86_FEATURE_SUCCOR);
1719                mce_flags.smca           = !!cpu_has(c, X86_FEATURE_SMCA);
1720
1721                if (mce_flags.smca) {
1722                        msr_ops.ctl     = smca_ctl_reg;
1723                        msr_ops.status  = smca_status_reg;
1724                        msr_ops.addr    = smca_addr_reg;
1725                        msr_ops.misc    = smca_misc_reg;
1726                }
1727        }
1728}
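
/*
 * Sketch (not new code, just how the msr_ops hooks set above are meant to
 * be consumed elsewhere in this file): bank accesses go through the
 * indirection, e.g.
 *
 *      u64 status;
 *
 *      rdmsrl(msr_ops.status(bank), status);
 *
 * so SMCA-capable parts use the SMCA register addresses while everything
 * else keeps the legacy MSR_IA32_MCx_* layout, without the callers having
 * to care which one is in effect.
 */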
1729
1730static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1731{
1732        switch (c->x86_vendor) {
1733        case X86_VENDOR_INTEL:
1734                mce_intel_feature_init(c);
1735                mce_adjust_timer = cmci_intel_adjust_timer;
1736                break;
1737
1738        case X86_VENDOR_AMD: {
1739                mce_amd_feature_init(c);
1740                break;
1741                }
1742
1743        default:
1744                break;
1745        }
1746}
1747
1748static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
1749{
1750        switch (c->x86_vendor) {
1751        case X86_VENDOR_INTEL:
1752                mce_intel_feature_clear(c);
1753                break;
1754        default:
1755                break;
1756        }
1757}
1758
1759static void mce_start_timer(struct timer_list *t)
1760{
1761        unsigned long iv = check_interval * HZ;
1762
1763        if (mca_cfg.ignore_ce || !iv)
1764                return;
1765
1766        this_cpu_write(mce_next_interval, iv);
1767        __start_timer(t, iv);
1768}
1769
1770static void __mcheck_cpu_setup_timer(void)
1771{
1772        struct timer_list *t = this_cpu_ptr(&mce_timer);
1773
1774        timer_setup(t, mce_timer_fn, TIMER_PINNED);
1775}
1776
1777static void __mcheck_cpu_init_timer(void)
1778{
1779        struct timer_list *t = this_cpu_ptr(&mce_timer);
1780
1781        timer_setup(t, mce_timer_fn, TIMER_PINNED);
1782        mce_start_timer(t);
1783}
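
/*
 * Polling cadence example (assuming the compiled-in default check_interval
 * of 5 minutes defined earlier in this file): mce_start_timer() arms the
 * per-CPU timer with iv = 300 * HZ jiffies, so each CPU scans its poll
 * banks roughly every five minutes. Booting with mce=ignore_ce, or a
 * check_interval of 0, makes mce_start_timer() return early and disables
 * the periodic poll entirely.
 */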
1784
1785/* Handle unconfigured int18 (should never happen) */
1786static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1787{
1788        pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1789               smp_processor_id());
1790}
1791
1792/* Call the installed machine check handler for this CPU setup. */
1793void (*machine_check_vector)(struct pt_regs *, long error_code) =
1794                                                unexpected_machine_check;
1795
1796dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
1797{
1798        machine_check_vector(regs, error_code);
1799}
1800
1801/*
1802 * Called for each booted CPU to set up machine checks.
1803 * Must be called with preempt off:
1804 */
1805void mcheck_cpu_init(struct cpuinfo_x86 *c)
1806{
1807        if (mca_cfg.disabled)
1808                return;
1809
1810        if (__mcheck_cpu_ancient_init(c))
1811                return;
1812
1813        if (!mce_available(c))
1814                return;
1815
1816        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1817                mca_cfg.disabled = 1;
1818                return;
1819        }
1820
1821        if (mce_gen_pool_init()) {
1822                mca_cfg.disabled = 1;
1823                pr_emerg("Couldn't allocate MCE records pool!\n");
1824                return;
1825        }
1826
1827        machine_check_vector = do_machine_check;
1828
1829        __mcheck_cpu_init_early(c);
1830        __mcheck_cpu_init_generic();
1831        __mcheck_cpu_init_vendor(c);
1832        __mcheck_cpu_init_clear_banks();
1833        __mcheck_cpu_setup_timer();
1834}
1835
1836/*
1837 * Called for each booted CPU to clear some machine check opt-ins.
1838 */
1839void mcheck_cpu_clear(struct cpuinfo_x86 *c)
1840{
1841        if (mca_cfg.disabled)
1842                return;
1843
1844        if (!mce_available(c))
1845                return;
1846
1847        /*
1848         * Possibly clear settings generic to x86 here as well, e.g.:
1849         * __mcheck_cpu_clear_generic(c);
1850         */
1851        __mcheck_cpu_clear_vendor(c);
1852
1853}
1854
1855static void __mce_disable_bank(void *arg)
1856{
1857        int bank = *((int *)arg);
1858        __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1859        cmci_disable_bank(bank);
1860}
1861
1862void mce_disable_bank(int bank)
1863{
1864        if (bank >= mca_cfg.banks) {
1865                pr_warn(FW_BUG
1866                        "Ignoring request to disable invalid MCA bank %d.\n",
1867                        bank);
1868                return;
1869        }
1870        set_bit(bank, mce_banks_ce_disabled);
1871        on_each_cpu(__mce_disable_bank, &bank, 1);
1872}
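
/*
 * Usage sketch (hypothetical caller): when firmware claims an MCA bank as
 * "firmware first" via GHES, e.g. bank 3, calling mce_disable_bank(3)
 * records it in mce_banks_ce_disabled and runs __mce_disable_bank() on
 * every online CPU, so neither CMCI nor the poll timer will touch that
 * bank afterwards. Out-of-range bank numbers are rejected with a FW_BUG
 * warning instead.
 */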
1873
1874/*
1875 * mce=off Disables machine check
1876 * mce=no_cmci Disables CMCI
1877 * mce=no_lmce Disables LMCE
1878 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1879 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1880 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1881 *      monarchtimeout is how long to wait for other CPUs on machine
1882 *      check, or 0 to not wait
1883 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD Fam10h
1884 *      and older.
1885 * mce=nobootlog Don't log MCEs from before booting.
1886 * mce=bios_cmci_threshold Don't program the CMCI threshold
1887 * mce=recovery Force-enables memcpy_mcsafe()
1888 */
1889static int __init mcheck_enable(char *str)
1890{
1891        struct mca_config *cfg = &mca_cfg;
1892
1893        if (*str == 0) {
1894                enable_p5_mce();
1895                return 1;
1896        }
1897        if (*str == '=')
1898                str++;
1899        if (!strcmp(str, "off"))
1900                cfg->disabled = 1;
1901        else if (!strcmp(str, "no_cmci"))
1902                cfg->cmci_disabled = true;
1903        else if (!strcmp(str, "no_lmce"))
1904                cfg->lmce_disabled = 1;
1905        else if (!strcmp(str, "dont_log_ce"))
1906                cfg->dont_log_ce = true;
1907        else if (!strcmp(str, "ignore_ce"))
1908                cfg->ignore_ce = true;
1909        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1910                cfg->bootlog = (str[0] == 'b');
1911        else if (!strcmp(str, "bios_cmci_threshold"))
1912                cfg->bios_cmci_threshold = 1;
1913        else if (!strcmp(str, "recovery"))
1914                cfg->recovery = 1;
1915        else if (isdigit(str[0])) {
1916                if (get_option(&str, &cfg->tolerant) == 2)
1917                        get_option(&str, &(cfg->monarch_timeout));
1918        } else {
1919                pr_info("mce argument %s ignored. Please use /sys\n", str);
1920                return 0;
1921        }
1922        return 1;
1923}
1924__setup("mce", mcheck_enable);
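
/*
 * Boot-line examples for the parser above (illustrative, not exhaustive):
 *
 *      mce=off                 disable machine check handling completely
 *      mce=no_cmci             keep MCE, but never use CMCI
 *      mce=2,500000            tolerant=2, monarch_timeout=500000
 *
 * The numeric form runs get_option() twice: first for tolerant, then, if a
 * comma follows, for monarch_timeout, which the rest of this file treats
 * as microseconds (the quirk code above defaults it to USEC_PER_SEC).
 */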
1925
1926int __init mcheck_init(void)
1927{
1928        mcheck_intel_therm_init();
1929        mce_register_decode_chain(&first_nb);
1930        mce_register_decode_chain(&mce_srao_nb);
1931        mce_register_decode_chain(&mce_default_nb);
1932        mcheck_vendor_init_severity();
1933
1934        INIT_WORK(&mce_work, mce_gen_pool_process);
1935        init_irq_work(&mce_irq_work, mce_irq_work_cb);
1936
1937        return 0;
1938}
1939
1940/*
1941 * mce_syscore: PM support
1942 */
1943
1944/*
1945 * Disable machine checks on suspend and shutdown. We can't really handle
1946 * them later.
1947 */
1948static void mce_disable_error_reporting(void)
1949{
1950        int i;
1951
1952        for (i = 0; i < mca_cfg.banks; i++) {
1953                struct mce_bank *b = &mce_banks[i];
1954
1955                if (b->init)
1956                        wrmsrl(msr_ops.ctl(i), 0);
1957        }
1959}
1960
1961static void vendor_disable_error_reporting(void)
1962{
1963        /*
1964         * Don't clear on Intel or AMD CPUs. Some of these MSRs are socket-wide.
1965         * Disabling them for just a single offlined CPU is bad, since it will
1966         * inhibit reporting for all shared resources on the socket like the
1967         * last level cache (LLC), the integrated memory controller (iMC), etc.
1968         */
1969        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ||
1970            boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1971                return;
1972
1973        mce_disable_error_reporting();
1974}
1975
1976static int mce_syscore_suspend(void)
1977{
1978        vendor_disable_error_reporting();
1979        return 0;
1980}
1981
1982static void mce_syscore_shutdown(void)
1983{
1984        vendor_disable_error_reporting();
1985}
1986
1987/*
1988 * On resume, clear all MCE state. We don't want to see leftovers from the BIOS.
1989 * Only one CPU is active at this time, the others get re-added later using
1990 * CPU hotplug:
1991 */
1992static void mce_syscore_resume(void)
1993{
1994        __mcheck_cpu_init_generic();
1995        __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
1996        __mcheck_cpu_init_clear_banks();
1997}
1998
1999static struct syscore_ops mce_syscore_ops = {
2000        .suspend        = mce_syscore_suspend,
2001        .shutdown       = mce_syscore_shutdown,
2002        .resume         = mce_syscore_resume,
2003};
2004
2005/*
2006 * mce_device: Sysfs support
2007 */
2008
2009static void mce_cpu_restart(void *data)
2010{
2011        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2012                return;
2013        __mcheck_cpu_init_generic();
2014        __mcheck_cpu_init_clear_banks();
2015        __mcheck_cpu_init_timer();
2016}
2017
2018/* Reinit MCEs after user configuration changes */
2019static void mce_restart(void)
2020{
2021        mce_timer_delete_all();
2022        on_each_cpu(mce_cpu_restart, NULL, 1);
2023}
2024
2025/* Toggle features for corrected errors */
2026static void mce_disable_cmci(void *data)
2027{
2028        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2029                return;
2030        cmci_clear();
2031}
2032
2033static void mce_enable_ce(void *all)
2034{
2035        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2036                return;
2037        cmci_reenable();
2038        cmci_recheck();
2039        if (all)
2040                __mcheck_cpu_init_timer();
2041}
2042
2043static struct bus_type mce_subsys = {
2044        .name           = "machinecheck",
2045        .dev_name       = "machinecheck",
2046};
2047
2048DEFINE_PER_CPU(struct device *, mce_device);
2049
2050static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2051{
2052        return container_of(attr, struct mce_bank, attr);
2053}
2054
2055static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2056                         char *buf)
2057{
2058        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2059}
2060
2061static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2062                        const char *buf, size_t size)
2063{
2064        u64 new;
2065
2066        if (kstrtou64(buf, 0, &new) < 0)
2067                return -EINVAL;
2068
2069        attr_to_bank(attr)->ctl = new;
2070        mce_restart();
2071
2072        return size;
2073}
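
/*
 * Sysfs view of the two handlers above (paths assume the standard sysfs
 * mount; the bankN names come from mce_init_banks() further down):
 *
 *      # cat /sys/devices/system/machinecheck/machinecheck0/bank4
 *      ffffffffffffffff
 *      # echo 0 > /sys/devices/system/machinecheck/machinecheck0/bank4
 *
 * (The value read is typically all ones unless a quirk cleared bits.)
 * Writing updates the shared mce_banks[].ctl entry and mce_restart() then
 * reprograms every online CPU, so the change is not limited to the
 * machinecheck0 device it was written through.
 */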
2074
2075static ssize_t set_ignore_ce(struct device *s,
2076                             struct device_attribute *attr,
2077                             const char *buf, size_t size)
2078{
2079        u64 new;
2080
2081        if (kstrtou64(buf, 0, &new) < 0)
2082                return -EINVAL;
2083
2084        mutex_lock(&mce_sysfs_mutex);
2085        if (mca_cfg.ignore_ce ^ !!new) {
2086                if (new) {
2087                        /* disable ce features */
2088                        mce_timer_delete_all();
2089                        on_each_cpu(mce_disable_cmci, NULL, 1);
2090                        mca_cfg.ignore_ce = true;
2091                } else {
2092                        /* enable ce features */
2093                        mca_cfg.ignore_ce = false;
2094                        on_each_cpu(mce_enable_ce, (void *)1, 1);
2095                }
2096        }
2097        mutex_unlock(&mce_sysfs_mutex);
2098
2099        return size;
2100}
2101
2102static ssize_t set_cmci_disabled(struct device *s,
2103                                 struct device_attribute *attr,
2104                                 const char *buf, size_t size)
2105{
2106        u64 new;
2107
2108        if (kstrtou64(buf, 0, &new) < 0)
2109                return -EINVAL;
2110
2111        mutex_lock(&mce_sysfs_mutex);
2112        if (mca_cfg.cmci_disabled ^ !!new) {
2113                if (new) {
2114                        /* disable cmci */
2115                        on_each_cpu(mce_disable_cmci, NULL, 1);
2116                        mca_cfg.cmci_disabled = true;
2117                } else {
2118                        /* enable cmci */
2119                        mca_cfg.cmci_disabled = false;
2120                        on_each_cpu(mce_enable_ce, NULL, 1);
2121                }
2122        }
2123        mutex_unlock(&mce_sysfs_mutex);
2124
2125        return size;
2126}
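
/*
 * The ignore_ce and cmci_disabled handlers above back sysfs files of the
 * same names, e.g. (illustrative):
 *
 *      # echo 1 > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *
 * stops both the poll timer and CMCI on every CPU; writing 0 re-enables
 * them (mce_enable_ce() gets a non-NULL "all" argument, so the per-CPU
 * timers are restarted too). cmci_disabled only toggles CMCI and leaves
 * the polling timer alone.
 */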
2127
2128static ssize_t store_int_with_restart(struct device *s,
2129                                      struct device_attribute *attr,
2130                                      const char *buf, size_t size)
2131{
2132        unsigned long old_check_interval = check_interval;
2133        ssize_t ret = device_store_ulong(s, attr, buf, size);
2134
2135        if (check_interval == old_check_interval)
2136                return ret;
2137
2138        if (check_interval < 1)
2139                check_interval = 1;
2140
2141        mutex_lock(&mce_sysfs_mutex);
2142        mce_restart();
2143        mutex_unlock(&mce_sysfs_mutex);
2144
2145        return ret;
2146}
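
/*
 * check_interval is exposed the same way, e.g. (illustrative):
 *
 *      # echo 60 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *
 * moves the base poll period to 60 seconds; values below 1 are clamped to
 * 1, and any change goes through mce_restart() so every CPU picks up the
 * new interval.
 */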
2147
2148static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2149static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2150static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2151
2152static struct dev_ext_attribute dev_attr_check_interval = {
2153        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2154        &check_interval
2155};
2156
2157static struct dev_ext_attribute dev_attr_ignore_ce = {
2158        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2159        &mca_cfg.ignore_ce
2160};
2161
2162static struct dev_ext_attribute dev_attr_cmci_disabled = {
2163        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2164        &mca_cfg.cmci_disabled
2165};
2166
2167static struct device_attribute *mce_device_attrs[] = {
2168        &dev_attr_tolerant.attr,
2169        &dev_attr_check_interval.attr,
2170#ifdef CONFIG_X86_MCELOG_LEGACY
2171        &dev_attr_trigger,
2172#endif
2173        &dev_attr_monarch_timeout.attr,
2174        &dev_attr_dont_log_ce.attr,
2175        &dev_attr_ignore_ce.attr,
2176        &dev_attr_cmci_disabled.attr,
2177        NULL
2178};
2179
2180static cpumask_var_t mce_device_initialized;
2181
2182static void mce_device_release(struct device *dev)
2183{
2184        kfree(dev);
2185}
2186
2187/* Per-CPU device init. All of the CPUs still share the same bank ctl values: */
2188static int mce_device_create(unsigned int cpu)
2189{
2190        struct device *dev;
2191        int err;
2192        int i, j;
2193
2194        if (!mce_available(&boot_cpu_data))
2195                return -EIO;
2196
2197        dev = per_cpu(mce_device, cpu);
2198        if (dev)
2199                return 0;
2200
2201        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2202        if (!dev)
2203                return -ENOMEM;
2204        dev->id  = cpu;
2205        dev->bus = &mce_subsys;
2206        dev->release = &mce_device_release;
2207
2208        err = device_register(dev);
2209        if (err) {
2210                put_device(dev);
2211                return err;
2212        }
2213
2214        for (i = 0; mce_device_attrs[i]; i++) {
2215                err = device_create_file(dev, mce_device_attrs[i]);
2216                if (err)
2217                        goto error;
2218        }
2219        for (j = 0; j < mca_cfg.banks; j++) {
2220                err = device_create_file(dev, &mce_banks[j].attr);
2221                if (err)
2222                        goto error2;
2223        }
2224        cpumask_set_cpu(cpu, mce_device_initialized);
2225        per_cpu(mce_device, cpu) = dev;
2226
2227        return 0;
2228error2:
2229        while (--j >= 0)
2230                device_remove_file(dev, &mce_banks[j].attr);
2231error:
2232        while (--i >= 0)
2233                device_remove_file(dev, mce_device_attrs[i]);
2234
2235        device_unregister(dev);
2236
2237        return err;
2238}
2239
2240static void mce_device_remove(unsigned int cpu)
2241{
2242        struct device *dev = per_cpu(mce_device, cpu);
2243        int i;
2244
2245        if (!cpumask_test_cpu(cpu, mce_device_initialized))
2246                return;
2247
2248        for (i = 0; mce_device_attrs[i]; i++)
2249                device_remove_file(dev, mce_device_attrs[i]);
2250
2251        for (i = 0; i < mca_cfg.banks; i++)
2252                device_remove_file(dev, &mce_banks[i].attr);
2253
2254        device_unregister(dev);
2255        cpumask_clear_cpu(cpu, mce_device_initialized);
2256        per_cpu(mce_device, cpu) = NULL;
2257}
2258
2259/* Make sure there are no machine checks on offlined CPUs. */
2260static void mce_disable_cpu(void)
2261{
2262        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2263                return;
2264
2265        if (!cpuhp_tasks_frozen)
2266                cmci_clear();
2267
2268        vendor_disable_error_reporting();
2269}
2270
2271static void mce_reenable_cpu(void)
2272{
2273        int i;
2274
2275        if (!mce_available(raw_cpu_ptr(&cpu_info)))
2276                return;
2277
2278        if (!cpuhp_tasks_frozen)
2279                cmci_reenable();
2280        for (i = 0; i < mca_cfg.banks; i++) {
2281                struct mce_bank *b = &mce_banks[i];
2282
2283                if (b->init)
2284                        wrmsrl(msr_ops.ctl(i), b->ctl);
2285        }
2286}
2287
2288static int mce_cpu_dead(unsigned int cpu)
2289{
2290        mce_intel_hcpu_update(cpu);
2291
2292        /* intentionally ignoring frozen here */
2293        if (!cpuhp_tasks_frozen)
2294                cmci_rediscover();
2295        return 0;
2296}
2297
2298static int mce_cpu_online(unsigned int cpu)
2299{
2300        struct timer_list *t = this_cpu_ptr(&mce_timer);
2301        int ret;
2302
2303        mce_device_create(cpu);
2304
2305        ret = mce_threshold_create_device(cpu);
2306        if (ret) {
2307                mce_device_remove(cpu);
2308                return ret;
2309        }
2310        mce_reenable_cpu();
2311        mce_start_timer(t);
2312        return 0;
2313}
2314
2315static int mce_cpu_pre_down(unsigned int cpu)
2316{
2317        struct timer_list *t = this_cpu_ptr(&mce_timer);
2318
2319        mce_disable_cpu();
2320        del_timer_sync(t);
2321        mce_threshold_remove_device(cpu);
2322        mce_device_remove(cpu);
2323        return 0;
2324}
2325
2326static __init void mce_init_banks(void)
2327{
2328        int i;
2329
2330        for (i = 0; i < mca_cfg.banks; i++) {
2331                struct mce_bank *b = &mce_banks[i];
2332                struct device_attribute *a = &b->attr;
2333
2334                sysfs_attr_init(&a->attr);
2335                a->attr.name    = b->attrname;
2336                snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2337
2338                a->attr.mode    = 0644;
2339                a->show         = show_bank;
2340                a->store        = set_bank;
2341        }
2342}
2343
2344static __init int mcheck_init_device(void)
2345{
2346        int err;
2347
2348        /*
2349         * Check if we have a spare virtual bit. This will only become
2350         * a problem if/when we move beyond 5-level page tables.
2351         */
2352        MAYBE_BUILD_BUG_ON(__VIRTUAL_MASK_SHIFT >= 63);
2353
2354        if (!mce_available(&boot_cpu_data)) {
2355                err = -EIO;
2356                goto err_out;
2357        }
2358
2359        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2360                err = -ENOMEM;
2361                goto err_out;
2362        }
2363
2364        mce_init_banks();
2365
2366        err = subsys_system_register(&mce_subsys, NULL);
2367        if (err)
2368                goto err_out_mem;
2369
2370        err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
2371                                mce_cpu_dead);
2372        if (err)
2373                goto err_out_mem;
2374
2375        err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
2376                                mce_cpu_online, mce_cpu_pre_down);
2377        if (err < 0)
2378                goto err_out_online;
2379
2380        register_syscore_ops(&mce_syscore_ops);
2381
2382        return 0;
2383
2384err_out_online:
2385        cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
2386
2387err_out_mem:
2388        free_cpumask_var(mce_device_initialized);
2389
2390err_out:
2391        pr_err("Unable to init MCE device (rc: %d)\n", err);
2392
2393        return err;
2394}
2395device_initcall_sync(mcheck_init_device);
2396
2397/*
2398 * Old-style boot option parsing. Only for compatibility.
2399 */
2400static int __init mcheck_disable(char *str)
2401{
2402        mca_cfg.disabled = 1;
2403        return 1;
2404}
2405__setup("nomce", mcheck_disable);
2406
2407#ifdef CONFIG_DEBUG_FS
2408struct dentry *mce_get_debugfs_dir(void)
2409{
2410        static struct dentry *dmce;
2411
2412        if (!dmce)
2413                dmce = debugfs_create_dir("mce", NULL);
2414
2415        return dmce;
2416}
2417
2418static void mce_reset(void)
2419{
2420        cpu_missing = 0;
2421        atomic_set(&mce_fake_panicked, 0);
2422        atomic_set(&mce_executing, 0);
2423        atomic_set(&mce_callin, 0);
2424        atomic_set(&global_nwo, 0);
2425}
2426
2427static int fake_panic_get(void *data, u64 *val)
2428{
2429        *val = fake_panic;
2430        return 0;
2431}
2432
2433static int fake_panic_set(void *data, u64 val)
2434{
2435        mce_reset();
2436        fake_panic = val;
2437        return 0;
2438}
2439
2440DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2441                        fake_panic_set, "%llu\n");
2442
2443static int __init mcheck_debugfs_init(void)
2444{
2445        struct dentry *dmce, *ffake_panic;
2446
2447        dmce = mce_get_debugfs_dir();
2448        if (!dmce)
2449                return -ENOMEM;
2450        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2451                                          &fake_panic_fops);
2452        if (!ffake_panic)
2453                return -ENOMEM;
2454
2455        return 0;
2456}
2457#else
2458static int __init mcheck_debugfs_init(void) { return -EINVAL; }
2459#endif
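
/*
 * Debugfs usage sketch: with CONFIG_DEBUG_FS enabled and debugfs mounted
 * at /sys/kernel/debug, the code above creates /sys/kernel/debug/mce/fake_panic.
 * Setting it to a non-zero value resets the rendezvous state via mce_reset()
 * and then makes mce_panic() (earlier in this file) only log the panic
 * message instead of actually panicking; it is intended purely for testing
 * the machine check paths.
 */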
2460
2461DEFINE_STATIC_KEY_FALSE(mcsafe_key);
2462EXPORT_SYMBOL_GPL(mcsafe_key);
2463
2464static int __init mcheck_late_init(void)
2465{
2466        if (mca_cfg.recovery)
2467                static_branch_inc(&mcsafe_key);
2468
2469        mcheck_debugfs_init();
2470        cec_init();
2471
2472        /*
2473         * Flush out everything that has been logged during early boot, now that
2474         * everything has been initialized (workqueues, decoders, ...).
2475         */
2476        mce_schedule_work();
2477
2478        return 0;
2479}
2480late_initcall(mcheck_late_init);
2481