linux/arch/x86/kernel/cpu/mcheck/mce.c
   1/*
   2 * Machine check handler.
   3 *
   4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
   5 * Rest from unknown author(s).
   6 * 2004 Andi Kleen. Rewrote most of it.
   7 * Copyright 2008 Intel Corporation
   8 * Author: Andi Kleen
   9 */
  10#include <linux/thread_info.h>
  11#include <linux/capability.h>
  12#include <linux/miscdevice.h>
  13#include <linux/interrupt.h>
  14#include <linux/ratelimit.h>
  15#include <linux/kallsyms.h>
  16#include <linux/rcupdate.h>
  17#include <linux/kobject.h>
  18#include <linux/uaccess.h>
  19#include <linux/kdebug.h>
  20#include <linux/kernel.h>
  21#include <linux/percpu.h>
  22#include <linux/string.h>
  23#include <linux/sysdev.h>
  24#include <linux/delay.h>
  25#include <linux/ctype.h>
  26#include <linux/sched.h>
  27#include <linux/sysfs.h>
  28#include <linux/types.h>
  29#include <linux/slab.h>
  30#include <linux/init.h>
  31#include <linux/kmod.h>
  32#include <linux/poll.h>
  33#include <linux/nmi.h>
  34#include <linux/cpu.h>
  35#include <linux/smp.h>
  36#include <linux/fs.h>
  37#include <linux/mm.h>
  38#include <linux/debugfs.h>
  39#include <linux/edac_mce.h>
  40
  41#include <asm/processor.h>
  42#include <asm/hw_irq.h>
  43#include <asm/apic.h>
  44#include <asm/idle.h>
  45#include <asm/ipi.h>
  46#include <asm/mce.h>
  47#include <asm/msr.h>
  48
  49#include "mce-internal.h"
  50
  51static DEFINE_MUTEX(mce_read_mutex);
  52
  53#define rcu_dereference_check_mce(p) \
  54        rcu_dereference_index_check((p), \
  55                              rcu_read_lock_sched_held() || \
  56                              lockdep_is_held(&mce_read_mutex))
  57
  58#define CREATE_TRACE_POINTS
  59#include <trace/events/mce.h>
  60
  61int mce_disabled __read_mostly;
  62
  63#define MISC_MCELOG_MINOR       227
  64
  65#define SPINUNIT 100    /* 100ns */
  66
  67atomic_t mce_entry;
  68
  69DEFINE_PER_CPU(unsigned, mce_exception_count);
  70
  71/*
  72 * Tolerant levels:
  73 *   0: always panic on uncorrected errors, log corrected errors
  74 *   1: panic or SIGBUS on uncorrected errors, log corrected errors
  75 *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
  76 *   3: never panic or SIGBUS, log all errors (for testing only)
  77 */
  78static int                      tolerant                __read_mostly = 1;
  79static int                      banks                   __read_mostly;
  80static int                      rip_msr                 __read_mostly;
  81static int                      mce_bootlog             __read_mostly = -1;
  82static int                      monarch_timeout         __read_mostly = -1;
  83static int                      mce_panic_timeout       __read_mostly;
  84static int                      mce_dont_log_ce         __read_mostly;
  85int                             mce_cmci_disabled       __read_mostly;
  86int                             mce_ignore_ce           __read_mostly;
  87int                             mce_ser                 __read_mostly;
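/*
 * Most of the knobs above can also be changed at runtime: besides the
 * "mce=" boot options parsed by mcheck_enable() near the end of this file,
 * kernels of this era typically expose them as per-CPU sysfs attributes,
 * e.g. /sys/devices/system/machinecheck/machinecheck0/tolerant (the sysfs
 * glue lives further down in mce.c).
 */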
  88
  89struct mce_bank                *mce_banks               __read_mostly;
  90
  91/* User mode helper program triggered by machine check event */
  92static unsigned long            mce_need_notify;
  93static char                     mce_helper[128];
  94static char                     *mce_helper_argv[2] = { mce_helper, NULL };
  95
  96static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
  97static DEFINE_PER_CPU(struct mce, mces_seen);
  98static int                      cpu_missing;
  99
 100/*
 101 * CPU/chipset specific EDAC code can register a notifier call here to print
 102 * MCE errors in a human-readable form.
 103 */
 104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 106
 107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
 108                               void *data)
 109{
 110        pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
 111        pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
 112
 113        return NOTIFY_STOP;
 114}
 115
 116static struct notifier_block mce_dec_nb = {
 117        .notifier_call = default_decode_mce,
 118        .priority      = -1,
 119};
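/*
 * The default decoder above registers at the lowest priority (-1), so a
 * CPU/chipset specific decoder that hooks this chain with a higher priority
 * sees the MCE first and can suppress the generic message by returning
 * NOTIFY_STOP.  A (hypothetical) EDAC decoder would plug in roughly like
 * this; the names below are illustrative only:
 *
 *     static int my_decode_mce(struct notifier_block *nb,
 *                              unsigned long val, void *data)
 *     {
 *             struct mce *m = data;
 *
 *             pr_emerg("Decoded MCE: bank %d status %llx\n",
 *                      m->bank, m->status);
 *             return NOTIFY_STOP;
 *     }
 *
 *     static struct notifier_block my_mce_dec = {
 *             .notifier_call = my_decode_mce,
 *     };
 *
 *     atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec);
 */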
 120
 121/* MCA banks polled by the period polling timer for corrected events */
 122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 123        [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 124};
 125
 126static DEFINE_PER_CPU(struct work_struct, mce_work);
 127
 128/* Do initial initialization of a struct mce */
 129void mce_setup(struct mce *m)
 130{
 131        memset(m, 0, sizeof(struct mce));
 132        m->cpu = m->extcpu = smp_processor_id();
 133        rdtscll(m->tsc);
 134        /* We hope get_seconds stays lockless */
 135        m->time = get_seconds();
 136        m->cpuvendor = boot_cpu_data.x86_vendor;
 137        m->cpuid = cpuid_eax(1);
 138#ifdef CONFIG_SMP
 139        m->socketid = cpu_data(m->extcpu).phys_proc_id;
 140#endif
 141        m->apicid = cpu_data(m->extcpu).initial_apicid;
 142        rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
 143}
 144
 145DEFINE_PER_CPU(struct mce, injectm);
 146EXPORT_PER_CPU_SYMBOL_GPL(injectm);
 147
 148/*
 149 * Lockless MCE logging infrastructure.
  150 * This avoids deadlocks on printk locks without having to break locks. It also
  151 * keeps MCEs separate from kernel messages to avoid bogus bug reports.
 152 */
 153
 154static struct mce_log mcelog = {
 155        .signature      = MCE_LOG_SIGNATURE,
 156        .len            = MCE_LOG_LEN,
 157        .recordlen      = sizeof(struct mce),
 158};
 159
 160void mce_log(struct mce *mce)
 161{
 162        unsigned next, entry;
 163
 164        /* Emit the trace record: */
 165        trace_mce_record(mce);
 166
 167        mce->finished = 0;
 168        wmb();
 169        for (;;) {
 170                entry = rcu_dereference_check_mce(mcelog.next);
 171                for (;;) {
 172                        /*
 173                         * If edac_mce is enabled, it will check the error type
 174                         * and will process it, if it is a known error.
 175                         * Otherwise, the error will be sent through mcelog
 176                         * interface
 177                         */
 178                        if (edac_mce_parse(mce))
 179                                return;
 180
 181                        /*
 182                         * When the buffer fills up discard new entries.
 183                         * Assume that the earlier errors are the more
 184                         * interesting ones:
 185                         */
 186                        if (entry >= MCE_LOG_LEN) {
 187                                set_bit(MCE_OVERFLOW,
 188                                        (unsigned long *)&mcelog.flags);
 189                                return;
 190                        }
 191                        /* Old left over entry. Skip: */
 192                        if (mcelog.entry[entry].finished) {
 193                                entry++;
 194                                continue;
 195                        }
 196                        break;
 197                }
 198                smp_rmb();
 199                next = entry + 1;
 200                if (cmpxchg(&mcelog.next, entry, next) == entry)
 201                        break;
 202        }
 203        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
 204        wmb();
 205        mcelog.entry[entry].finished = 1;
 206        wmb();
 207
 208        mce->finished = 1;
 209        set_bit(0, &mce_need_notify);
 210}
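/*
 * How the lockless log above works: each writer uses cmpxchg() on
 * mcelog.next to reserve a free slot (skipping old entries whose
 * 'finished' flag is still set from a previous, unread record), copies the
 * record in and only then sets that entry's 'finished' flag behind a
 * wmb().  Readers (mce_read() below) treat an entry as valid only once
 * 'finished' is set, so a half-written record is never exposed.
 */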
 211
 212static void print_mce(struct mce *m)
 213{
 214        pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 215               m->extcpu, m->mcgstatus, m->bank, m->status);
 216
 217        if (m->ip) {
 218                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 219                        !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
 220                                m->cs, m->ip);
 221
 222                if (m->cs == __KERNEL_CS)
 223                        print_symbol("{%s}", m->ip);
 224                pr_cont("\n");
 225        }
 226
 227        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
 228        if (m->addr)
 229                pr_cont("ADDR %llx ", m->addr);
 230        if (m->misc)
 231                pr_cont("MISC %llx ", m->misc);
 232
 233        pr_cont("\n");
 234        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
 235                m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
 236
 237        /*
  238         * Print out human-readable details about the MCE error
  239         * (if the CPU has an implementation for that).
 240         */
 241        atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 242}
 243
 244#define PANIC_TIMEOUT 5 /* 5 seconds */
 245
 246static atomic_t mce_paniced;
 247
 248static int fake_panic;
 249static atomic_t mce_fake_paniced;
 250
 251/* Panic in progress. Enable interrupts and wait for final IPI */
 252static void wait_for_panic(void)
 253{
 254        long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
 255
 256        preempt_disable();
 257        local_irq_enable();
 258        while (timeout-- > 0)
 259                udelay(1);
 260        if (panic_timeout == 0)
 261                panic_timeout = mce_panic_timeout;
  262        panic("Panicking machine check CPU died");
 263}
 264
 265static void mce_panic(char *msg, struct mce *final, char *exp)
 266{
 267        int i, apei_err = 0;
 268
 269        if (!fake_panic) {
 270                /*
 271                 * Make sure only one CPU runs in machine check panic
 272                 */
 273                if (atomic_inc_return(&mce_paniced) > 1)
 274                        wait_for_panic();
 275                barrier();
 276
 277                bust_spinlocks(1);
 278                console_verbose();
 279        } else {
 280                /* Don't log too much for fake panic */
 281                if (atomic_inc_return(&mce_fake_paniced) > 1)
 282                        return;
 283        }
 284        /* First print corrected ones that are still unlogged */
 285        for (i = 0; i < MCE_LOG_LEN; i++) {
 286                struct mce *m = &mcelog.entry[i];
 287                if (!(m->status & MCI_STATUS_VAL))
 288                        continue;
 289                if (!(m->status & MCI_STATUS_UC)) {
 290                        print_mce(m);
 291                        if (!apei_err)
 292                                apei_err = apei_write_mce(m);
 293                }
 294        }
 295        /* Now print uncorrected but with the final one last */
 296        for (i = 0; i < MCE_LOG_LEN; i++) {
 297                struct mce *m = &mcelog.entry[i];
 298                if (!(m->status & MCI_STATUS_VAL))
 299                        continue;
 300                if (!(m->status & MCI_STATUS_UC))
 301                        continue;
 302                if (!final || memcmp(m, final, sizeof(struct mce))) {
 303                        print_mce(m);
 304                        if (!apei_err)
 305                                apei_err = apei_write_mce(m);
 306                }
 307        }
 308        if (final) {
 309                print_mce(final);
 310                if (!apei_err)
 311                        apei_err = apei_write_mce(final);
 312        }
 313        if (cpu_missing)
 314                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 315        if (exp)
 316                pr_emerg(HW_ERR "Machine check: %s\n", exp);
 317        if (!fake_panic) {
 318                if (panic_timeout == 0)
 319                        panic_timeout = mce_panic_timeout;
 320                panic(msg);
 321        } else
 322                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 323}
 324
 325/* Support code for software error injection */
 326
 327static int msr_to_offset(u32 msr)
 328{
 329        unsigned bank = __this_cpu_read(injectm.bank);
 330
 331        if (msr == rip_msr)
 332                return offsetof(struct mce, ip);
 333        if (msr == MSR_IA32_MCx_STATUS(bank))
 334                return offsetof(struct mce, status);
 335        if (msr == MSR_IA32_MCx_ADDR(bank))
 336                return offsetof(struct mce, addr);
 337        if (msr == MSR_IA32_MCx_MISC(bank))
 338                return offsetof(struct mce, misc);
 339        if (msr == MSR_IA32_MCG_STATUS)
 340                return offsetof(struct mce, mcgstatus);
 341        return -1;
 342}
 343
 344/* MSR access wrappers used for error injection */
 345static u64 mce_rdmsrl(u32 msr)
 346{
 347        u64 v;
 348
 349        if (__this_cpu_read(injectm.finished)) {
 350                int offset = msr_to_offset(msr);
 351
 352                if (offset < 0)
 353                        return 0;
 354                return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
 355        }
 356
 357        if (rdmsrl_safe(msr, &v)) {
 358                WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
 359                /*
 360                 * Return zero in case the access faulted. This should
 361                 * not happen normally but can happen if the CPU does
 362                 * something weird, or if the code is buggy.
 363                 */
 364                v = 0;
 365        }
 366
 367        return v;
 368}
 369
 370static void mce_wrmsrl(u32 msr, u64 v)
 371{
 372        if (__this_cpu_read(injectm.finished)) {
 373                int offset = msr_to_offset(msr);
 374
 375                if (offset >= 0)
 376                        *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
 377                return;
 378        }
 379        wrmsrl(msr, v);
 380}
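/*
 * Summary of the injection hooks above: when mce-inject has marked this
 * CPU's 'injectm' record as finished, mce_rdmsrl()/mce_wrmsrl() redirect
 * the access to the corresponding field of that record (via
 * msr_to_offset()) instead of touching the real MSR.  That lets simulated
 * errors flow through the normal handler paths without involving the
 * hardware MCA registers at all.
 */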
 381
 382/*
  383 * Simple lockless ring to communicate PFNs from the exception handler to the
 384 * process context work function. This is vastly simplified because there's
 385 * only a single reader and a single writer.
 386 */
 387#define MCE_RING_SIZE 16        /* we use one entry less */
 388
 389struct mce_ring {
 390        unsigned short start;
 391        unsigned short end;
 392        unsigned long ring[MCE_RING_SIZE];
 393};
 394static DEFINE_PER_CPU(struct mce_ring, mce_ring);
 395
 396/* Runs with CPU affinity in workqueue */
 397static int mce_ring_empty(void)
 398{
 399        struct mce_ring *r = &__get_cpu_var(mce_ring);
 400
 401        return r->start == r->end;
 402}
 403
 404static int mce_ring_get(unsigned long *pfn)
 405{
 406        struct mce_ring *r;
 407        int ret = 0;
 408
 409        *pfn = 0;
 410        get_cpu();
 411        r = &__get_cpu_var(mce_ring);
 412        if (r->start == r->end)
 413                goto out;
 414        *pfn = r->ring[r->start];
 415        r->start = (r->start + 1) % MCE_RING_SIZE;
 416        ret = 1;
 417out:
 418        put_cpu();
 419        return ret;
 420}
 421
 422/* Always runs in MCE context with preempt off */
 423static int mce_ring_add(unsigned long pfn)
 424{
 425        struct mce_ring *r = &__get_cpu_var(mce_ring);
 426        unsigned next;
 427
 428        next = (r->end + 1) % MCE_RING_SIZE;
 429        if (next == r->start)
 430                return -1;
 431        r->ring[r->end] = pfn;
 432        wmb();
 433        r->end = next;
 434        return 0;
 435}
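/*
 * The ring above is strictly single-producer (machine check context
 * calling mce_ring_add()) and single-consumer (the workqueue calling
 * mce_ring_get()), which is why plain loads/stores plus a single wmb()
 * are sufficient.  One slot is always left unused so that start == end
 * unambiguously means "empty" rather than "full".
 */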
 436
 437int mce_available(struct cpuinfo_x86 *c)
 438{
 439        if (mce_disabled)
 440                return 0;
 441        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 442}
 443
 444static void mce_schedule_work(void)
 445{
 446        if (!mce_ring_empty()) {
 447                struct work_struct *work = &__get_cpu_var(mce_work);
 448                if (!work_pending(work))
 449                        schedule_work(work);
 450        }
 451}
 452
 453/*
 454 * Get the address of the instruction at the time of the machine check
 455 * error.
 456 */
 457static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 458{
 459
 460        if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
 461                m->ip = regs->ip;
 462                m->cs = regs->cs;
 463        } else {
 464                m->ip = 0;
 465                m->cs = 0;
 466        }
 467        if (rip_msr)
 468                m->ip = mce_rdmsrl(rip_msr);
 469}
 470
 471#ifdef CONFIG_X86_LOCAL_APIC
 472/*
  473 * Called after interrupts have been reenabled again,
  474 * when an MCE happened during an interrupts-off region
 475 * in the kernel.
 476 */
 477asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
 478{
 479        ack_APIC_irq();
 480        exit_idle();
 481        irq_enter();
 482        mce_notify_irq();
 483        mce_schedule_work();
 484        irq_exit();
 485}
 486#endif
 487
 488static void mce_report_event(struct pt_regs *regs)
 489{
 490        if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
 491                mce_notify_irq();
 492                /*
 493                 * Triggering the work queue here is just an insurance
 494                 * policy in case the syscall exit notify handler
 495                 * doesn't run soon enough or ends up running on the
 496                 * wrong CPU (can happen when audit sleeps)
 497                 */
 498                mce_schedule_work();
 499                return;
 500        }
 501
 502#ifdef CONFIG_X86_LOCAL_APIC
 503        /*
 504         * Without APIC do not notify. The event will be picked
 505         * up eventually.
 506         */
 507        if (!cpu_has_apic)
 508                return;
 509
 510        /*
 511         * When interrupts are disabled we cannot use
  512         * kernel services safely. Trigger a self interrupt
 513         * through the APIC to instead do the notification
 514         * after interrupts are reenabled again.
 515         */
 516        apic->send_IPI_self(MCE_SELF_VECTOR);
 517
 518        /*
  519         * Wait for idle afterwards so that we don't leave the
  520         * APIC in a non-idle state because the normal APIC writes
 521         * cannot exclude us.
 522         */
 523        apic_wait_icr_idle();
 524#endif
 525}
 526
 527DEFINE_PER_CPU(unsigned, mce_poll_count);
 528
 529/*
 530 * Poll for corrected events or events that happened before reset.
 531 * Those are just logged through /dev/mcelog.
 532 *
 533 * This is executed in standard interrupt context.
 534 *
  535 * Note: the spec recommends panicking for fatal unsignalled
  536 * errors here. However this would be quite problematic --
  537 * we would need to reimplement the Monarch handling and
  538 * it would mess up the exclusion between the exception handler
  539 * and the poll handler -- so we skip this for now.
  540 * These cases should not happen anyway, or only when the CPU
  541 * is already totally confused. In this case it's likely it will
  542 * not fully execute the machine check handler either.
 543 */
 544void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 545{
 546        struct mce m;
 547        int i;
 548
 549        percpu_inc(mce_poll_count);
 550
 551        mce_setup(&m);
 552
 553        m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 554        for (i = 0; i < banks; i++) {
 555                if (!mce_banks[i].ctl || !test_bit(i, *b))
 556                        continue;
 557
 558                m.misc = 0;
 559                m.addr = 0;
 560                m.bank = i;
 561                m.tsc = 0;
 562
 563                barrier();
 564                m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 565                if (!(m.status & MCI_STATUS_VAL))
 566                        continue;
 567
 568                /*
 569                 * Uncorrected or signalled events are handled by the exception
 570                 * handler when it is enabled, so don't process those here.
 571                 *
 572                 * TBD do the same check for MCI_STATUS_EN here?
 573                 */
 574                if (!(flags & MCP_UC) &&
 575                    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 576                        continue;
 577
 578                if (m.status & MCI_STATUS_MISCV)
 579                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
 580                if (m.status & MCI_STATUS_ADDRV)
 581                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
 582
 583                if (!(flags & MCP_TIMESTAMP))
 584                        m.tsc = 0;
 585                /*
 586                 * Don't get the IP here because it's unlikely to
 587                 * have anything to do with the actual error location.
 588                 */
 589                if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
 590                        mce_log(&m);
 591                        atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
 592                        add_taint(TAINT_MACHINE_CHECK);
 593                }
 594
 595                /*
 596                 * Clear state for this bank.
 597                 */
 598                mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 599        }
 600
 601        /*
 602         * Don't clear MCG_STATUS here because it's only defined for
 603         * exceptions.
 604         */
 605
 606        sync_core();
 607}
 608EXPORT_SYMBOL_GPL(machine_check_poll);
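/*
 * machine_check_poll() is driven from three places in this configuration:
 * the periodic timer (mce_start_timer() below, with MCP_TIMESTAMP), the
 * boot-time scan of leftover errors in __mcheck_cpu_init_generic() (with
 * MCP_UC, plus MCP_DONTLOG when bootlog is disabled), and the Intel CMCI
 * interrupt handler in the vendor-specific code.
 */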
 609
 610/*
 611 * Do a quick check if any of the events requires a panic.
 612 * This decides if we keep the events around or clear them.
 613 */
 614static int mce_no_way_out(struct mce *m, char **msg)
 615{
 616        int i;
 617
 618        for (i = 0; i < banks; i++) {
 619                m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 620                if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
 621                        return 1;
 622        }
 623        return 0;
 624}
 625
 626/*
 627 * Variable to establish order between CPUs while scanning.
 628 * Each CPU spins initially until executing is equal its number.
 629 */
 630static atomic_t mce_executing;
 631
 632/*
 633 * Defines order of CPUs on entry. First CPU becomes Monarch.
 634 */
 635static atomic_t mce_callin;
 636
 637/*
 638 * Check if a timeout waiting for other CPUs happened.
 639 */
 640static int mce_timed_out(u64 *t)
 641{
 642        /*
 643         * The others already did panic for some reason.
 644         * Bail out like in a timeout.
 645         * rmb() to tell the compiler that system_state
 646         * might have been modified by someone else.
 647         */
 648        rmb();
 649        if (atomic_read(&mce_paniced))
 650                wait_for_panic();
 651        if (!monarch_timeout)
 652                goto out;
 653        if ((s64)*t < SPINUNIT) {
 654                /* CHECKME: Make panic default for 1 too? */
 655                if (tolerant < 1)
 656                        mce_panic("Timeout synchronizing machine check over CPUs",
 657                                  NULL, NULL);
 658                cpu_missing = 1;
 659                return 1;
 660        }
 661        *t -= SPINUNIT;
 662out:
 663        touch_nmi_watchdog();
 664        return 0;
 665}
 666
 667/*
 668 * The Monarch's reign.  The Monarch is the CPU who entered
 669 * the machine check handler first. It waits for the others to
  670 * raise the exception too and then grades them. If any
  671 * error is fatal, it panics. Only then does it let the others continue.
  672 *
  673 * The other CPUs entering the MCE handler will be controlled by the
  674 * Monarch. They are called Subjects.
  675 *
  676 * This way we prevent any potential data corruption in an unrecoverable case
  677 * and also make sure that all CPUs' errors are always examined.
  678 *
  679 * This also detects the case of a machine check event coming from outer
  680 * space (not detected by any CPU). In this case some external agent wants
  681 * us to shut down, so panic too.
  682 *
  683 * The other CPUs might still decide to panic if the handler happens
  684 * in an unrecoverable place, but in this case the system is in a semi-stable
 685 * state and won't corrupt anything by itself. It's ok to let the others
 686 * continue for a bit first.
 687 *
 688 * All the spin loops have timeouts; when a timeout happens a CPU
 689 * typically elects itself to be Monarch.
 690 */
 691static void mce_reign(void)
 692{
 693        int cpu;
 694        struct mce *m = NULL;
 695        int global_worst = 0;
 696        char *msg = NULL;
 697        char *nmsg = NULL;
 698
 699        /*
 700         * This CPU is the Monarch and the other CPUs have run
 701         * through their handlers.
 702         * Grade the severity of the errors of all the CPUs.
 703         */
 704        for_each_possible_cpu(cpu) {
 705                int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
 706                                            &nmsg);
 707                if (severity > global_worst) {
 708                        msg = nmsg;
 709                        global_worst = severity;
 710                        m = &per_cpu(mces_seen, cpu);
 711                }
 712        }
 713
 714        /*
 715         * Cannot recover? Panic here then.
 716         * This dumps all the mces in the log buffer and stops the
 717         * other CPUs.
 718         */
 719        if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
 720                mce_panic("Fatal Machine check", m, msg);
 721
 722        /*
 723         * For UC somewhere we let the CPU who detects it handle it.
 724         * Also must let continue the others, otherwise the handling
 725         * CPU could deadlock on a lock.
 726         */
 727
 728        /*
 729         * No machine check event found. Must be some external
 730         * source or one CPU is hung. Panic.
 731         */
 732        if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
 733                mce_panic("Machine check from unknown source", NULL, NULL);
 734
 735        /*
 736         * Now clear all the mces_seen so that they don't reappear on
 737         * the next mce.
 738         */
 739        for_each_possible_cpu(cpu)
 740                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
 741}
 742
 743static atomic_t global_nwo;
 744
 745/*
 746 * Start of Monarch synchronization. This waits until all CPUs have
 747 * entered the exception handler and then determines if any of them
  748 * saw a fatal event that requires panic. Then it lets the CPUs execute
  749 * their scanning loops in the original entry order.
 750 * TBD double check parallel CPU hotunplug
 751 */
 752static int mce_start(int *no_way_out)
 753{
 754        int order;
 755        int cpus = num_online_cpus();
 756        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 757
 758        if (!timeout)
 759                return -1;
 760
 761        atomic_add(*no_way_out, &global_nwo);
 762        /*
 763         * global_nwo should be updated before mce_callin
 764         */
 765        smp_wmb();
 766        order = atomic_inc_return(&mce_callin);
 767
 768        /*
 769         * Wait for everyone.
 770         */
 771        while (atomic_read(&mce_callin) != cpus) {
 772                if (mce_timed_out(&timeout)) {
 773                        atomic_set(&global_nwo, 0);
 774                        return -1;
 775                }
 776                ndelay(SPINUNIT);
 777        }
 778
 779        /*
 780         * mce_callin should be read before global_nwo
 781         */
 782        smp_rmb();
 783
 784        if (order == 1) {
 785                /*
 786                 * Monarch: Starts executing now, the others wait.
 787                 */
 788                atomic_set(&mce_executing, 1);
 789        } else {
 790                /*
 791                 * Subject: Now start the scanning loop one by one in
 792                 * the original callin order.
 793                 * This way when there are any shared banks it will be
 794                 * only seen by one CPU before cleared, avoiding duplicates.
 795                 */
 796                while (atomic_read(&mce_executing) < order) {
 797                        if (mce_timed_out(&timeout)) {
 798                                atomic_set(&global_nwo, 0);
 799                                return -1;
 800                        }
 801                        ndelay(SPINUNIT);
 802                }
 803        }
 804
 805        /*
 806         * Cache the global no_way_out state.
 807         */
 808        *no_way_out = atomic_read(&global_nwo);
 809
 810        return order;
 811}
 812
 813/*
 814 * Synchronize between CPUs after main scanning loop.
 815 * This invokes the bulk of the Monarch processing.
 816 */
 817static int mce_end(int order)
 818{
 819        int ret = -1;
 820        u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
 821
 822        if (!timeout)
 823                goto reset;
 824        if (order < 0)
 825                goto reset;
 826
 827        /*
 828         * Allow others to run.
 829         */
 830        atomic_inc(&mce_executing);
 831
 832        if (order == 1) {
 833                /* CHECKME: Can this race with a parallel hotplug? */
 834                int cpus = num_online_cpus();
 835
 836                /*
 837                 * Monarch: Wait for everyone to go through their scanning
 838                 * loops.
 839                 */
 840                while (atomic_read(&mce_executing) <= cpus) {
 841                        if (mce_timed_out(&timeout))
 842                                goto reset;
 843                        ndelay(SPINUNIT);
 844                }
 845
 846                mce_reign();
 847                barrier();
 848                ret = 0;
 849        } else {
 850                /*
 851                 * Subject: Wait for Monarch to finish.
 852                 */
 853                while (atomic_read(&mce_executing) != 0) {
 854                        if (mce_timed_out(&timeout))
 855                                goto reset;
 856                        ndelay(SPINUNIT);
 857                }
 858
 859                /*
 860                 * Don't reset anything. That's done by the Monarch.
 861                 */
 862                return 0;
 863        }
 864
 865        /*
 866         * Reset all global state.
 867         */
 868reset:
 869        atomic_set(&global_nwo, 0);
 870        atomic_set(&mce_callin, 0);
 871        barrier();
 872
 873        /*
 874         * Let others run again.
 875         */
 876        atomic_set(&mce_executing, 0);
 877        return ret;
 878}
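/*
 * Putting mce_start()/mce_end() together: every CPU entering the exception
 * handler increments mce_callin, and the first one (order == 1) becomes
 * the Monarch.  The CPUs then take turns scanning their banks, serialized
 * by mce_executing, and finally the Monarch runs mce_reign() to grade all
 * mces_seen and decide whether to panic.  global_nwo accumulates the
 * per-CPU no_way_out verdicts so that every CPU sees the worst-case
 * decision, and all the waits are bounded by monarch_timeout.
 */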
 879
 880/*
 881 * Check if the address reported by the CPU is in a format we can parse.
 882 * It would be possible to add code for most other cases, but all would
 883 * be somewhat complicated (e.g. segment offset would require an instruction
  884 * parser). So only support physical addresses up to page granularity for now.
 885 */
 886static int mce_usable_address(struct mce *m)
 887{
 888        if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 889                return 0;
 890        if ((m->misc & 0x3f) > PAGE_SHIFT)
 891                return 0;
 892        if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
 893                return 0;
 894        return 1;
 895}
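/*
 * The checks above follow the architectural MCi_MISC layout: bits 5:0
 * hold the least significant valid bit of the address (the recoverable
 * granularity, which must not exceed a page here) and bits 8:6 hold the
 * address mode, where MCM_ADDR_PHYS means a physical address that can be
 * turned into a PFN.
 */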
 896
 897static void mce_clear_state(unsigned long *toclear)
 898{
 899        int i;
 900
 901        for (i = 0; i < banks; i++) {
 902                if (test_bit(i, toclear))
 903                        mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 904        }
 905}
 906
 907/*
 908 * The actual machine check handler. This only handles real
 909 * exceptions when something got corrupted coming in through int 18.
 910 *
 911 * This is executed in NMI context not subject to normal locking rules. This
 912 * implies that most kernel services cannot be safely used. Don't even
 913 * think about putting a printk in there!
 914 *
 915 * On Intel systems this is entered on all CPUs in parallel through
 916 * MCE broadcast. However some CPUs might be broken beyond repair,
  917 * so always be careful when synchronizing with others.
 918 */
 919void do_machine_check(struct pt_regs *regs, long error_code)
 920{
 921        struct mce m, *final;
 922        int i;
 923        int worst = 0;
 924        int severity;
 925        /*
 926         * Establish sequential order between the CPUs entering the machine
 927         * check handler.
 928         */
 929        int order;
 930        /*
 931         * If no_way_out gets set, there is no safe way to recover from this
 932         * MCE.  If tolerant is cranked up, we'll try anyway.
 933         */
 934        int no_way_out = 0;
 935        /*
  936         * If kill_it gets set, we may still be able to recover from this
  937         * error by killing the affected task instead of panicking.
 938         */
 939        int kill_it = 0;
 940        DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 941        char *msg = "Unknown";
 942
 943        atomic_inc(&mce_entry);
 944
 945        percpu_inc(mce_exception_count);
 946
 947        if (notify_die(DIE_NMI, "machine check", regs, error_code,
 948                           18, SIGKILL) == NOTIFY_STOP)
 949                goto out;
 950        if (!banks)
 951                goto out;
 952
 953        mce_setup(&m);
 954
 955        m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 956        final = &__get_cpu_var(mces_seen);
 957        *final = m;
 958
 959        no_way_out = mce_no_way_out(&m, &msg);
 960
 961        barrier();
 962
 963        /*
  964         * When there is no restart IP we must always kill or panic.
 965         */
 966        if (!(m.mcgstatus & MCG_STATUS_RIPV))
 967                kill_it = 1;
 968
 969        /*
 970         * Go through all the banks in exclusion of the other CPUs.
 971         * This way we don't report duplicated events on shared banks
 972         * because the first one to see it will clear it.
 973         */
 974        order = mce_start(&no_way_out);
 975        for (i = 0; i < banks; i++) {
 976                __clear_bit(i, toclear);
 977                if (!mce_banks[i].ctl)
 978                        continue;
 979
 980                m.misc = 0;
 981                m.addr = 0;
 982                m.bank = i;
 983
 984                m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
 985                if ((m.status & MCI_STATUS_VAL) == 0)
 986                        continue;
 987
 988                /*
  989                 * Corrected or non-signaled errors are handled by
  990                 * machine_check_poll(). Leave them alone, unless this panics.
 991                 */
 992                if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 993                        !no_way_out)
 994                        continue;
 995
 996                /*
 997                 * Set taint even when machine check was not enabled.
 998                 */
 999                add_taint(TAINT_MACHINE_CHECK);
1000
1001                severity = mce_severity(&m, tolerant, NULL);
1002
1003                /*
 1004                 * If the machine check was meant for the corrected-error handler
 1005                 * don't touch it here, unless we're panicking.
1006                 */
1007                if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1008                        continue;
1009                __set_bit(i, toclear);
1010                if (severity == MCE_NO_SEVERITY) {
1011                        /*
1012                         * Machine check event was not enabled. Clear, but
1013                         * ignore.
1014                         */
1015                        continue;
1016                }
1017
1018                /*
1019                 * Kill on action required.
1020                 */
1021                if (severity == MCE_AR_SEVERITY)
1022                        kill_it = 1;
1023
1024                if (m.status & MCI_STATUS_MISCV)
1025                        m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1026                if (m.status & MCI_STATUS_ADDRV)
1027                        m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1028
1029                /*
1030                 * Action optional error. Queue address for later processing.
1031                 * When the ring overflows we just ignore the AO error.
1032                 * RED-PEN add some logging mechanism when
1033                 * usable_address or mce_add_ring fails.
1034                 * RED-PEN don't ignore overflow for tolerant == 0
1035                 */
1036                if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1037                        mce_ring_add(m.addr >> PAGE_SHIFT);
1038
1039                mce_get_rip(&m, regs);
1040                mce_log(&m);
1041
1042                if (severity > worst) {
1043                        *final = m;
1044                        worst = severity;
1045                }
1046        }
1047
1048        if (!no_way_out)
1049                mce_clear_state(toclear);
1050
1051        /*
1052         * Do most of the synchronization with other CPUs.
1053         * When there's any problem use only local no_way_out state.
1054         */
1055        if (mce_end(order) < 0)
1056                no_way_out = worst >= MCE_PANIC_SEVERITY;
1057
1058        /*
1059         * If we have decided that we just CAN'T continue, and the user
1060         * has not set tolerant to an insane level, give up and die.
1061         *
1062         * This is mainly used in the case when the system doesn't
1063         * support MCE broadcasting or it has been disabled.
1064         */
1065        if (no_way_out && tolerant < 3)
1066                mce_panic("Fatal machine check on current CPU", final, msg);
1067
1068        /*
1069         * If the error seems to be unrecoverable, something should be
1070         * done.  Try to kill as little as possible.  If we can kill just
1071         * one task, do that.  If the user has set the tolerance very
1072         * high, don't try to do anything at all.
1073         */
1074
1075        if (kill_it && tolerant < 3)
1076                force_sig(SIGBUS, current);
1077
1078        /* notify userspace ASAP */
1079        set_thread_flag(TIF_MCE_NOTIFY);
1080
1081        if (worst > 0)
1082                mce_report_event(regs);
1083        mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1084out:
1085        atomic_dec(&mce_entry);
1086        sync_core();
1087}
1088EXPORT_SYMBOL_GPL(do_machine_check);
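/*
 * Rough summary of how do_machine_check() reacts to each bank's graded
 * severity: MCE_KEEP entries are left alone, MCE_NO entries are cleared
 * but not logged, MCE_AO (action optional) addresses are queued on the
 * PFN ring for the process-context memory_failure() handling below,
 * MCE_AR (action required) forces a SIGBUS to the current task, and a
 * panic-grade or no_way_out situation ends in mce_panic() unless tolerant
 * is set to 3 or higher.
 */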
1089
1090/* dummy to break dependency. actual code is in mm/memory-failure.c */
1091void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1092{
1093        printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1094}
1095
1096/*
1097 * Called after mce notification in process context. This code
1098 * is allowed to sleep. Call the high level VM handler to process
1099 * any corrupted pages.
1100 * Assume that the work queue code only calls this one at a time
1101 * per CPU.
1102 * Note we don't disable preemption, so this code might run on the wrong
1103 * CPU. In this case the event is picked up by the scheduled work queue.
1104 * This is merely a fast path to expedite processing in some common
1105 * cases.
1106 */
1107void mce_notify_process(void)
1108{
1109        unsigned long pfn;
1110        mce_notify_irq();
1111        while (mce_ring_get(&pfn))
1112                memory_failure(pfn, MCE_VECTOR);
1113}
1114
1115static void mce_process_work(struct work_struct *dummy)
1116{
1117        mce_notify_process();
1118}
1119
1120#ifdef CONFIG_X86_MCE_INTEL
1121/***
1122 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1123 * @cpu: The CPU on which the event occurred.
1124 * @status: Event status information
1125 *
1126 * This function should be called by the thermal interrupt after the
1127 * event has been processed and the decision was made to log the event
1128 * further.
1129 *
1130 * The status parameter will be saved to the 'status' field of 'struct mce'
1131 * and historically has been the register value of the
1132 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1133 */
1134void mce_log_therm_throt_event(__u64 status)
1135{
1136        struct mce m;
1137
1138        mce_setup(&m);
1139        m.bank = MCE_THERMAL_BANK;
1140        m.status = status;
1141        mce_log(&m);
1142}
1143#endif /* CONFIG_X86_MCE_INTEL */
1144
1145/*
1146 * Periodic polling timer for "silent" machine check errors.  If the
1147 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1148 * errors, poll 2x slower (up to check_interval seconds).
1149 */
1150static int check_interval = 5 * 60; /* 5 minutes */
1151
1152static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1153static DEFINE_PER_CPU(struct timer_list, mce_timer);
1154
1155static void mce_start_timer(unsigned long data)
1156{
1157        struct timer_list *t = &per_cpu(mce_timer, data);
1158        int *n;
1159
1160        WARN_ON(smp_processor_id() != data);
1161
1162        if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163                machine_check_poll(MCP_TIMESTAMP,
1164                                &__get_cpu_var(mce_poll_banks));
1165        }
1166
1167        /*
1168         * Alert userspace if needed.  If we logged an MCE, reduce the
1169         * polling interval, otherwise increase the polling interval.
1170         */
1171        n = &__get_cpu_var(mce_next_interval);
1172        if (mce_notify_irq())
1173                *n = max(*n/2, HZ/100);
1174        else
1175                *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1176
1177        t->expires = jiffies + *n;
1178        add_timer_on(t, smp_processor_id());
1179}
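/*
 * Example of the adaptive interval above, assuming HZ=1000 and the
 * default check_interval of 5 minutes: while mce_notify_irq() keeps
 * reporting new events the delay is halved on every run, bottoming out at
 * HZ/100 (10ms); once the logs stay quiet it doubles again until it is
 * back at the full 5 minute period.
 */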
1180
1181static void mce_do_trigger(struct work_struct *work)
1182{
1183        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1184}
1185
1186static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1187
1188/*
1189 * Notify the user(s) about new machine check events.
1190 * Can be called from interrupt context, but not from machine check/NMI
1191 * context.
1192 */
1193int mce_notify_irq(void)
1194{
1195        /* Not more than two messages every minute */
1196        static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1197
1198        clear_thread_flag(TIF_MCE_NOTIFY);
1199
1200        if (test_and_clear_bit(0, &mce_need_notify)) {
1201                wake_up_interruptible(&mce_wait);
1202
1203                /*
1204                 * There is no risk of missing notifications because
1205                 * work_pending is always cleared before the function is
1206                 * executed.
1207                 */
1208                if (mce_helper[0] && !work_pending(&mce_trigger_work))
1209                        schedule_work(&mce_trigger_work);
1210
1211                if (__ratelimit(&ratelimit))
1212                        pr_info(HW_ERR "Machine check events logged\n");
1213
1214                return 1;
1215        }
1216        return 0;
1217}
1218EXPORT_SYMBOL_GPL(mce_notify_irq);
1219
1220static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1221{
1222        int i;
1223
1224        mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1225        if (!mce_banks)
1226                return -ENOMEM;
1227        for (i = 0; i < banks; i++) {
1228                struct mce_bank *b = &mce_banks[i];
1229
1230                b->ctl = -1ULL;
1231                b->init = 1;
1232        }
1233        return 0;
1234}
1235
1236/*
1237 * Initialize Machine Checks for a CPU.
1238 */
1239static int __cpuinit __mcheck_cpu_cap_init(void)
1240{
1241        unsigned b;
1242        u64 cap;
1243
1244        rdmsrl(MSR_IA32_MCG_CAP, cap);
1245
1246        b = cap & MCG_BANKCNT_MASK;
1247        if (!banks)
1248                printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1249
1250        if (b > MAX_NR_BANKS) {
1251                printk(KERN_WARNING
1252                       "MCE: Using only %u machine check banks out of %u\n",
1253                        MAX_NR_BANKS, b);
1254                b = MAX_NR_BANKS;
1255        }
1256
1257        /* Don't support asymmetric configurations today */
1258        WARN_ON(banks != 0 && b != banks);
1259        banks = b;
1260        if (!mce_banks) {
1261                int err = __mcheck_cpu_mce_banks_init();
1262
1263                if (err)
1264                        return err;
1265        }
1266
1267        /* Use accurate RIP reporting if available. */
1268        if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1269                rip_msr = MSR_IA32_MCG_EIP;
1270
1271        if (cap & MCG_SER_P)
1272                mce_ser = 1;
1273
1274        return 0;
1275}
1276
1277static void __mcheck_cpu_init_generic(void)
1278{
1279        mce_banks_t all_banks;
1280        u64 cap;
1281        int i;
1282
1283        /*
1284         * Log the machine checks left over from the previous reset.
1285         */
1286        bitmap_fill(all_banks, MAX_NR_BANKS);
1287        machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1288
1289        set_in_cr4(X86_CR4_MCE);
1290
1291        rdmsrl(MSR_IA32_MCG_CAP, cap);
1292        if (cap & MCG_CTL_P)
1293                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1294
1295        for (i = 0; i < banks; i++) {
1296                struct mce_bank *b = &mce_banks[i];
1297
1298                if (!b->init)
1299                        continue;
1300                wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1301                wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1302        }
1303}
1304
1305/* Add per CPU specific workarounds here */
1306static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1307{
1308        if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1309                pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1310                return -EOPNOTSUPP;
1311        }
1312
1313        /* This should be disabled by the BIOS, but isn't always */
1314        if (c->x86_vendor == X86_VENDOR_AMD) {
1315                if (c->x86 == 15 && banks > 4) {
1316                        /*
1317                         * disable GART TBL walk error reporting, which
1318                         * trips off incorrectly with the IOMMU & 3ware
1319                         * & Cerberus:
1320                         */
1321                        clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1322                }
1323                if (c->x86 <= 17 && mce_bootlog < 0) {
1324                        /*
 1325                         * Lots of broken BIOSes around that don't clear them
1326                         * by default and leave crap in there. Don't log:
1327                         */
1328                        mce_bootlog = 0;
1329                }
1330                /*
1331                 * Various K7s with broken bank 0 around. Always disable
1332                 * by default.
1333                 */
1334                 if (c->x86 == 6 && banks > 0)
1335                        mce_banks[0].ctl = 0;
1336        }
1337
1338        if (c->x86_vendor == X86_VENDOR_INTEL) {
1339                /*
1340                 * SDM documents that on family 6 bank 0 should not be written
1341                 * because it aliases to another special BIOS controlled
1342                 * register.
1343                 * But it's not aliased anymore on model 0x1a+
1344                 * Don't ignore bank 0 completely because there could be a
1345                 * valid event later, merely don't write CTL0.
1346                 */
1347
1348                if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1349                        mce_banks[0].init = 0;
1350
1351                /*
1352                 * All newer Intel systems support MCE broadcasting. Enable
1353                 * synchronization with a one second timeout.
1354                 */
1355                if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1356                        monarch_timeout < 0)
1357                        monarch_timeout = USEC_PER_SEC;
1358
1359                /*
1360                 * There are also broken BIOSes on some Pentium M and
1361                 * earlier systems:
1362                 */
1363                if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1364                        mce_bootlog = 0;
1365        }
1366        if (monarch_timeout < 0)
1367                monarch_timeout = 0;
1368        if (mce_bootlog != 0)
1369                mce_panic_timeout = 30;
1370
1371        return 0;
1372}
1373
1374static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1375{
1376        if (c->x86 != 5)
1377                return;
1378        switch (c->x86_vendor) {
1379        case X86_VENDOR_INTEL:
1380                intel_p5_mcheck_init(c);
1381                break;
1382        case X86_VENDOR_CENTAUR:
1383                winchip_mcheck_init(c);
1384                break;
1385        }
1386}
1387
1388static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1389{
1390        switch (c->x86_vendor) {
1391        case X86_VENDOR_INTEL:
1392                mce_intel_feature_init(c);
1393                break;
1394        case X86_VENDOR_AMD:
1395                mce_amd_feature_init(c);
1396                break;
1397        default:
1398                break;
1399        }
1400}
1401
1402static void __mcheck_cpu_init_timer(void)
1403{
1404        struct timer_list *t = &__get_cpu_var(mce_timer);
1405        int *n = &__get_cpu_var(mce_next_interval);
1406
1407        setup_timer(t, mce_start_timer, smp_processor_id());
1408
1409        if (mce_ignore_ce)
1410                return;
1411
1412        *n = check_interval * HZ;
1413        if (!*n)
1414                return;
1415        t->expires = round_jiffies(jiffies + *n);
1416        add_timer_on(t, smp_processor_id());
1417}
1418
1419/* Handle unconfigured int18 (should never happen) */
1420static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1421{
1422        printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1423               smp_processor_id());
1424}
1425
1426/* Call the installed machine check handler for this CPU setup. */
1427void (*machine_check_vector)(struct pt_regs *, long error_code) =
1428                                                unexpected_machine_check;
1429
1430/*
1431 * Called for each booted CPU to set up machine checks.
1432 * Must be called with preempt off:
1433 */
1434void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1435{
1436        if (mce_disabled)
1437                return;
1438
1439        __mcheck_cpu_ancient_init(c);
1440
1441        if (!mce_available(c))
1442                return;
1443
1444        if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1445                mce_disabled = 1;
1446                return;
1447        }
1448
1449        machine_check_vector = do_machine_check;
1450
1451        __mcheck_cpu_init_generic();
1452        __mcheck_cpu_init_vendor(c);
1453        __mcheck_cpu_init_timer();
1454        INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1455
1456}
1457
1458/*
1459 * Character device to read and clear the MCE log.
1460 */
1461
1462static DEFINE_SPINLOCK(mce_state_lock);
1463static int              open_count;             /* #times opened */
1464static int              open_exclu;             /* already open exclusive? */
1465
1466static int mce_open(struct inode *inode, struct file *file)
1467{
1468        spin_lock(&mce_state_lock);
1469
1470        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
1471                spin_unlock(&mce_state_lock);
1472
1473                return -EBUSY;
1474        }
1475
1476        if (file->f_flags & O_EXCL)
1477                open_exclu = 1;
1478        open_count++;
1479
1480        spin_unlock(&mce_state_lock);
1481
1482        return nonseekable_open(inode, file);
1483}
1484
1485static int mce_release(struct inode *inode, struct file *file)
1486{
1487        spin_lock(&mce_state_lock);
1488
1489        open_count--;
1490        open_exclu = 0;
1491
1492        spin_unlock(&mce_state_lock);
1493
1494        return 0;
1495}
1496
1497static void collect_tscs(void *data)
1498{
1499        unsigned long *cpu_tsc = (unsigned long *)data;
1500
1501        rdtscll(cpu_tsc[smp_processor_id()]);
1502}
1503
1504static int mce_apei_read_done;
1505
1506/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1507static int __mce_read_apei(char __user **ubuf, size_t usize)
1508{
1509        int rc;
1510        u64 record_id;
1511        struct mce m;
1512
1513        if (usize < sizeof(struct mce))
1514                return -EINVAL;
1515
1516        rc = apei_read_mce(&m, &record_id);
1517        /* Error or no more MCE record */
1518        if (rc <= 0) {
1519                mce_apei_read_done = 1;
1520                return rc;
1521        }
1522        rc = -EFAULT;
1523        if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1524                return rc;
1525        /*
1526         * In fact, we should have cleared the record after that has
1527         * been flushed to the disk or sent to network in
1528         * /sbin/mcelog, but we have no interface to support that now,
1529         * so just clear it to avoid duplication.
1530         */
1531        rc = apei_clear_mce(record_id);
1532        if (rc) {
1533                mce_apei_read_done = 1;
1534                return rc;
1535        }
1536        *ubuf += sizeof(struct mce);
1537
1538        return 0;
1539}
1540
1541static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1542                        loff_t *off)
1543{
1544        char __user *buf = ubuf;
1545        unsigned long *cpu_tsc;
1546        unsigned prev, next;
1547        int i, err;
1548
1549        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1550        if (!cpu_tsc)
1551                return -ENOMEM;
1552
1553        mutex_lock(&mce_read_mutex);
1554
1555        if (!mce_apei_read_done) {
1556                err = __mce_read_apei(&buf, usize);
1557                if (err || buf != ubuf)
1558                        goto out;
1559        }
1560
1561        next = rcu_dereference_check_mce(mcelog.next);
1562
1563        /* Only supports full reads right now */
1564        err = -EINVAL;
1565        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1566                goto out;
1567
1568        err = 0;
1569        prev = 0;
1570        do {
1571                for (i = prev; i < next; i++) {
1572                        unsigned long start = jiffies;
1573
1574                        while (!mcelog.entry[i].finished) {
1575                                if (time_after_eq(jiffies, start + 2)) {
1576                                        memset(mcelog.entry + i, 0,
1577                                               sizeof(struct mce));
1578                                        goto timeout;
1579                                }
1580                                cpu_relax();
1581                        }
1582                        smp_rmb();
1583                        err |= copy_to_user(buf, mcelog.entry + i,
1584                                            sizeof(struct mce));
1585                        buf += sizeof(struct mce);
1586timeout:
1587                        ;
1588                }
1589
1590                memset(mcelog.entry + prev, 0,
1591                       (next - prev) * sizeof(struct mce));
1592                prev = next;
1593                next = cmpxchg(&mcelog.next, prev, 0);
1594        } while (next != prev);
1595
1596        synchronize_sched();
1597
1598        /*
1599         * Collect entries that were still getting written before the
1600         * synchronize.
1601         */
1602        on_each_cpu(collect_tscs, cpu_tsc, 1);
1603
1604        for (i = next; i < MCE_LOG_LEN; i++) {
1605                if (mcelog.entry[i].finished &&
1606                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
1607                        err |= copy_to_user(buf, mcelog.entry+i,
1608                                            sizeof(struct mce));
1609                        smp_rmb();
1610                        buf += sizeof(struct mce);
1611                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
1612                }
1613        }
1614
1615        if (err)
1616                err = -EFAULT;
1617
1618out:
1619        mutex_unlock(&mce_read_mutex);
1620        kfree(cpu_tsc);
1621
1622        return err ? err : buf - ubuf;
1623}
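/*
 * The read path above works in two passes: it first copies out every
 * finished entry up to mcelog.next (waiting a couple of jiffies for
 * records still being written and discarding them if they never finish),
 * resets mcelog.next with cmpxchg(), and then, after synchronize_sched()
 * and a TSC snapshot from every CPU, sweeps the rest of the buffer for
 * records that were logged concurrently with the reset.
 */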
1624
1625static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{
1627        poll_wait(file, &mce_wait, wait);
1628        if (rcu_dereference_check_mce(mcelog.next))
1629                return POLLIN | POLLRDNORM;
1630        if (!mce_apei_read_done && apei_check_mce())
1631                return POLLIN | POLLRDNORM;
1632        return 0;
1633}
1634
1635static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
1636{
1637        int __user *p = (int __user *)arg;
1638
1639        if (!capable(CAP_SYS_ADMIN))
1640                return -EPERM;
1641
1642        switch (cmd) {
1643        case MCE_GET_RECORD_LEN:
1644                return put_user(sizeof(struct mce), p);
1645        case MCE_GET_LOG_LEN:
1646                return put_user(MCE_LOG_LEN, p);
1647        case MCE_GETCLEAR_FLAGS: {
1648                unsigned flags;
1649
1650                do {
1651                        flags = mcelog.flags;
1652                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1653
1654                return put_user(flags, p);
1655        }
1656        default:
1657                return -ENOTTY;
1658        }
1659}
1660
1661/* Modified in mce-inject.c, so not static or const */
1662struct file_operations mce_chrdev_ops = {
1663        .open                   = mce_open,
1664        .release                = mce_release,
1665        .read                   = mce_read,
1666        .poll                   = mce_poll,
1667        .unlocked_ioctl         = mce_ioctl,
1668        .llseek                 = no_llseek,
1669};
1670EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1671
1672static struct miscdevice mce_log_device = {
1673        MISC_MCELOG_MINOR,
1674        "mcelog",
1675        &mce_chrdev_ops,
1676};
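
/*
 * Usage sketch (editorial, not part of this file): a minimal consumer of the
 * mcelog misc device defined above, typically visible as /dev/mcelog once it
 * is registered in mcheck_init_device(). mce_read() only accepts reads large
 * enough for the whole log, and the ioctls require CAP_SYS_ADMIN (error
 * handling and includes omitted):
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recordlen = 0, loglen = 0;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recordlen);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *records = malloc(recordlen * loglen);
 *	ssize_t n = read(fd, records, recordlen * loglen);
 *	// n / recordlen finished records were returned; poll() can be
 *	// used to wait for more.
 */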
1677
1678/*
1679 * mce=off Disables machine check
1680 * mce=no_cmci Disables CMCI
1681 * mce=dont_log_ce Clears corrected events silently; no log is created for CEs.
1682 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1683 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1684 *      monarchtimeout is how long to wait for other CPUs on machine
1685 *      check, or 0 to not wait
1686 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1687 * mce=nobootlog Don't log MCEs from before booting.
1688 */
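/*
 * Example (editorial sketch): booting with "mce=2,500" sets tolerant to 2 and
 * monarch_timeout to 500, while "mce=no_cmci" only turns CMCI off. Most of
 * these settings can also be changed later through the sysfs attributes
 * registered below.
 */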
1689static int __init mcheck_enable(char *str)
1690{
1691        if (*str == 0) {
1692                enable_p5_mce();
1693                return 1;
1694        }
1695        if (*str == '=')
1696                str++;
1697        if (!strcmp(str, "off"))
1698                mce_disabled = 1;
1699        else if (!strcmp(str, "no_cmci"))
1700                mce_cmci_disabled = 1;
1701        else if (!strcmp(str, "dont_log_ce"))
1702                mce_dont_log_ce = 1;
1703        else if (!strcmp(str, "ignore_ce"))
1704                mce_ignore_ce = 1;
1705        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1706                mce_bootlog = (str[0] == 'b');
1707        else if (isdigit(str[0])) {
1708                get_option(&str, &tolerant);
1709                if (*str == ',') {
1710                        ++str;
1711                        get_option(&str, &monarch_timeout);
1712                }
1713        } else {
1714                printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1715                       str);
1716                return 0;
1717        }
1718        return 1;
1719}
1720__setup("mce", mcheck_enable);
1721
1722int __init mcheck_init(void)
1723{
1724        atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1725
1726        mcheck_intel_therm_init();
1727
1728        return 0;
1729}
1730
1731/*
1732 * Sysfs support
1733 */
1734
1735/*
1736 * Disable machine checks on suspend and shutdown. We can't really handle
1737 * them later.
1738 */
1739static int mce_disable_error_reporting(void)
1740{
1741        int i;
1742
1743        for (i = 0; i < banks; i++) {
1744                struct mce_bank *b = &mce_banks[i];
1745
1746                if (b->init)
1747                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1748        }
1749        return 0;
1750}
1751
1752static int mce_suspend(struct sys_device *dev, pm_message_t state)
1753{
1754        return mce_disable_error_reporting();
1755}
1756
1757static int mce_shutdown(struct sys_device *dev)
1758{
1759        return mce_disable_error_reporting();
1760}
1761
1762/*
1763 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1764 * Only one CPU is active at this time, the others get re-added later using
1765 * CPU hotplug:
1766 */
1767static int mce_resume(struct sys_device *dev)
1768{
1769        __mcheck_cpu_init_generic();
1770        __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1771
1772        return 0;
1773}
1774
1775static void mce_cpu_restart(void *data)
1776{
1777        del_timer_sync(&__get_cpu_var(mce_timer));
1778        if (!mce_available(__this_cpu_ptr(&cpu_info)))
1779                return;
1780        __mcheck_cpu_init_generic();
1781        __mcheck_cpu_init_timer();
1782}
1783
1784/* Reinit MCEs after user configuration changes */
1785static void mce_restart(void)
1786{
1787        on_each_cpu(mce_cpu_restart, NULL, 1);
1788}
1789
1790/* Toggle CE (corrected error) features: CMCI and, if 'all', the polling timer */
1791static void mce_disable_ce(void *all)
1792{
1793        if (!mce_available(__this_cpu_ptr(&cpu_info)))
1794                return;
1795        if (all)
1796                del_timer_sync(&__get_cpu_var(mce_timer));
1797        cmci_clear();
1798}
1799
1800static void mce_enable_ce(void *all)
1801{
1802        if (!mce_available(__this_cpu_ptr(&cpu_info)))
1803                return;
1804        cmci_reenable();
1805        cmci_recheck();
1806        if (all)
1807                __mcheck_cpu_init_timer();
1808}
1809
1810static struct sysdev_class mce_sysclass = {
1811        .suspend        = mce_suspend,
1812        .shutdown       = mce_shutdown,
1813        .resume         = mce_resume,
1814        .name           = "machinecheck",
1815};
1816
1817DEFINE_PER_CPU(struct sys_device, mce_dev);
1818
1819__cpuinitdata
1820void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1821
1822static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1823{
1824        return container_of(attr, struct mce_bank, attr);
1825}
1826
1827static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1828                         char *buf)
1829{
1830        return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1831}
1832
1833static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1834                        const char *buf, size_t size)
1835{
1836        u64 new;
1837
1838        if (strict_strtoull(buf, 0, &new) < 0)
1839                return -EINVAL;
1840
1841        attr_to_bank(attr)->ctl = new;
1842        mce_restart();
1843
1844        return size;
1845}
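
/*
 * Example (editorial sketch, exact path depends on the sysdev layout):
 *	echo 0xffffffffffffffff > /sys/devices/system/machinecheck/machinecheck0/bank2
 * updates the shared mce_banks[2].ctl and re-arms every CPU via mce_restart().
 */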
1846
1847static ssize_t
1848show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
1849{
1850        strcpy(buf, mce_helper);
1851        strcat(buf, "\n");
1852        return strlen(mce_helper) + 1;
1853}
1854
1855static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1856                                const char *buf, size_t siz)
1857{
1858        char *p;
1859
1860        strncpy(mce_helper, buf, sizeof(mce_helper));
1861        mce_helper[sizeof(mce_helper)-1] = 0;
1862        p = strchr(mce_helper, '\n');
1863
1864        if (p)
1865                *p = 0;
1866
1867        return strlen(mce_helper) + !!p;
1868}
1869
1870static ssize_t set_ignore_ce(struct sys_device *s,
1871                             struct sysdev_attribute *attr,
1872                             const char *buf, size_t size)
1873{
1874        u64 new;
1875
1876        if (strict_strtoull(buf, 0, &new) < 0)
1877                return -EINVAL;
1878
1879        if (mce_ignore_ce ^ !!new) {
1880                if (new) {
1881                        /* disable ce features */
1882                        on_each_cpu(mce_disable_ce, (void *)1, 1);
1883                        mce_ignore_ce = 1;
1884                } else {
1885                        /* enable ce features */
1886                        mce_ignore_ce = 0;
1887                        on_each_cpu(mce_enable_ce, (void *)1, 1);
1888                }
1889        }
1890        return size;
1891}
1892
1893static ssize_t set_cmci_disabled(struct sys_device *s,
1894                                 struct sysdev_attribute *attr,
1895                                 const char *buf, size_t size)
1896{
1897        u64 new;
1898
1899        if (strict_strtoull(buf, 0, &new) < 0)
1900                return -EINVAL;
1901
1902        if (mce_cmci_disabled ^ !!new) {
1903                if (new) {
1904                        /* disable cmci */
1905                        on_each_cpu(mce_disable_ce, NULL, 1);
1906                        mce_cmci_disabled = 1;
1907                } else {
1908                        /* enable cmci */
1909                        mce_cmci_disabled = 0;
1910                        on_each_cpu(mce_enable_ce, NULL, 1);
1911                }
1912        }
1913        return size;
1914}
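
/*
 * Example (editorial sketch): writing 1 to the ignore_ce attribute stops both
 * the polling timer and CMCI on all CPUs, while cmci_disabled=1 turns off
 * CMCI only; writing 0 reverses the respective setting.
 */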
1915
1916static ssize_t store_int_with_restart(struct sys_device *s,
1917                                      struct sysdev_attribute *attr,
1918                                      const char *buf, size_t size)
1919{
1920        ssize_t ret = sysdev_store_int(s, attr, buf, size);
1921        mce_restart();
1922        return ret;
1923}
1924
1925static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
1926static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
1927static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1928static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1929
1930static struct sysdev_ext_attribute attr_check_interval = {
1931        _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1932                     store_int_with_restart),
1933        &check_interval
1934};
1935
1936static struct sysdev_ext_attribute attr_ignore_ce = {
1937        _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
1938        &mce_ignore_ce
1939};
1940
1941static struct sysdev_ext_attribute attr_cmci_disabled = {
1942        _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
1943        &mce_cmci_disabled
1944};
1945
1946static struct sysdev_attribute *mce_attrs[] = {
1947        &attr_tolerant.attr,
1948        &attr_check_interval.attr,
1949        &attr_trigger,
1950        &attr_monarch_timeout.attr,
1951        &attr_dont_log_ce.attr,
1952        &attr_ignore_ce.attr,
1953        &attr_cmci_disabled.attr,
1954        NULL
1955};
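
/*
 * Together with one bankN file per MC bank (see mce_init_banks() below),
 * these attributes end up roughly as (editorial sketch, assuming the usual
 * sysdev layout under /sys/devices/system):
 *
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/
 *		bank0 .. bankN  check_interval  cmci_disabled  dont_log_ce
 *		ignore_ce  monarch_timeout  tolerant  trigger
 */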
1956
1957static cpumask_var_t mce_dev_initialized;
1958
1959/* Per-CPU sysdev init. All of the CPUs still share the same bank ctl values. */
1960static __cpuinit int mce_create_device(unsigned int cpu)
1961{
1962        int err;
1963        int i, j;
1964
1965        if (!mce_available(&boot_cpu_data))
1966                return -EIO;
1967
1968        memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
1969        per_cpu(mce_dev, cpu).id        = cpu;
1970        per_cpu(mce_dev, cpu).cls       = &mce_sysclass;
1971
1972        err = sysdev_register(&per_cpu(mce_dev, cpu));
1973        if (err)
1974                return err;
1975
1976        for (i = 0; mce_attrs[i]; i++) {
1977                err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1978                if (err)
1979                        goto error;
1980        }
1981        for (j = 0; j < banks; j++) {
1982                err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1983                                        &mce_banks[j].attr);
1984                if (err)
1985                        goto error2;
1986        }
1987        cpumask_set_cpu(cpu, mce_dev_initialized);
1988
1989        return 0;
1990error2:
1991        while (--j >= 0)
1992                sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1993error:
1994        while (--i >= 0)
1995                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1996
1997        sysdev_unregister(&per_cpu(mce_dev, cpu));
1998
1999        return err;
2000}
2001
2002static __cpuinit void mce_remove_device(unsigned int cpu)
2003{
2004        int i;
2005
2006        if (!cpumask_test_cpu(cpu, mce_dev_initialized))
2007                return;
2008
2009        for (i = 0; mce_attrs[i]; i++)
2010                sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
2011
2012        for (i = 0; i < banks; i++)
2013                sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
2014
2015        sysdev_unregister(&per_cpu(mce_dev, cpu));
2016        cpumask_clear_cpu(cpu, mce_dev_initialized);
2017}
2018
2019/* Make sure there are no machine checks on offlined CPUs. */
2020static void __cpuinit mce_disable_cpu(void *h)
2021{
2022        unsigned long action = *(unsigned long *)h;
2023        int i;
2024
2025        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2026                return;
2027
2028        if (!(action & CPU_TASKS_FROZEN))
2029                cmci_clear();
2030        for (i = 0; i < banks; i++) {
2031                struct mce_bank *b = &mce_banks[i];
2032
2033                if (b->init)
2034                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2035        }
2036}
2037
2038static void __cpuinit mce_reenable_cpu(void *h)
2039{
2040        unsigned long action = *(unsigned long *)h;
2041        int i;
2042
2043        if (!mce_available(__this_cpu_ptr(&cpu_info)))
2044                return;
2045
2046        if (!(action & CPU_TASKS_FROZEN))
2047                cmci_reenable();
2048        for (i = 0; i < banks; i++) {
2049                struct mce_bank *b = &mce_banks[i];
2050
2051                if (b->init)
2052                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2053        }
2054}
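
/*
 * Editorial note: in both helpers above the CPU_TASKS_FROZEN check skips the
 * CMCI clear/reenable on the suspend and resume path, so CMCI bank ownership
 * is only redistributed for real hotplug events.
 */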
2055
2056/* Get notified when a cpu comes on/off. Be hotplug friendly. */
2057static int __cpuinit
2058mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2059{
2060        unsigned int cpu = (unsigned long)hcpu;
2061        struct timer_list *t = &per_cpu(mce_timer, cpu);
2062
2063        switch (action) {
2064        case CPU_ONLINE:
2065        case CPU_ONLINE_FROZEN:
2066                mce_create_device(cpu);
2067                if (threshold_cpu_callback)
2068                        threshold_cpu_callback(action, cpu);
2069                break;
2070        case CPU_DEAD:
2071        case CPU_DEAD_FROZEN:
2072                if (threshold_cpu_callback)
2073                        threshold_cpu_callback(action, cpu);
2074                mce_remove_device(cpu);
2075                break;
2076        case CPU_DOWN_PREPARE:
2077        case CPU_DOWN_PREPARE_FROZEN:
2078                del_timer_sync(t);
2079                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2080                break;
2081        case CPU_DOWN_FAILED:
2082        case CPU_DOWN_FAILED_FROZEN:
2083                if (!mce_ignore_ce && check_interval) {
2084                        t->expires = round_jiffies(jiffies +
2085                                           __get_cpu_var(mce_next_interval));
2086                        add_timer_on(t, cpu);
2087                }
2088                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2089                break;
2090        case CPU_POST_DEAD:
2091                /* intentionally ignoring frozen here */
2092                cmci_rediscover(cpu);
2093                break;
2094        }
2095        return NOTIFY_OK;
2096}
2097
2098static struct notifier_block mce_cpu_notifier __cpuinitdata = {
2099        .notifier_call = mce_cpu_callback,
2100};
2101
2102static __init void mce_init_banks(void)
2103{
2104        int i;
2105
2106        for (i = 0; i < banks; i++) {
2107                struct mce_bank *b = &mce_banks[i];
2108                struct sysdev_attribute *a = &b->attr;
2109
2110                sysfs_attr_init(&a->attr);
2111                a->attr.name    = b->attrname;
2112                snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2113
2114                a->attr.mode    = 0644;
2115                a->show         = show_bank;
2116                a->store        = set_bank;
2117        }
2118}
2119
2120static __init int mcheck_init_device(void)
2121{
2122        int err;
2123        int i = 0;
2124
2125        if (!mce_available(&boot_cpu_data))
2126                return -EIO;
2127
2128        zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
2129
2130        mce_init_banks();
2131
2132        err = sysdev_class_register(&mce_sysclass);
2133        if (err)
2134                return err;
2135
2136        for_each_online_cpu(i) {
2137                err = mce_create_device(i);
2138                if (err)
2139                        return err;
2140        }
2141
2142        register_hotcpu_notifier(&mce_cpu_notifier);
2143        misc_register(&mce_log_device);
2144
2145        return err;
2146}
2147
2148device_initcall(mcheck_init_device);
2149
2150/*
2151 * Old style boot options parsing. Only for compatibility.
2152 */
2153static int __init mcheck_disable(char *str)
2154{
2155        mce_disabled = 1;
2156        return 1;
2157}
2158__setup("nomce", mcheck_disable);
2159
2160#ifdef CONFIG_DEBUG_FS
2161struct dentry *mce_get_debugfs_dir(void)
2162{
2163        static struct dentry *dmce;
2164
2165        if (!dmce)
2166                dmce = debugfs_create_dir("mce", NULL);
2167
2168        return dmce;
2169}
2170
2171static void mce_reset(void)
2172{
2173        cpu_missing = 0;
2174        atomic_set(&mce_fake_paniced, 0);
2175        atomic_set(&mce_executing, 0);
2176        atomic_set(&mce_callin, 0);
2177        atomic_set(&global_nwo, 0);
2178}
2179
2180static int fake_panic_get(void *data, u64 *val)
2181{
2182        *val = fake_panic;
2183        return 0;
2184}
2185
2186static int fake_panic_set(void *data, u64 val)
2187{
2188        mce_reset();
2189        fake_panic = val;
2190        return 0;
2191}
2192
2193DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2194                        fake_panic_set, "%llu\n");
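
/*
 * Editorial note: with debugfs mounted at its usual place this attribute is
 * visible as /sys/kernel/debug/mce/fake_panic; fake_panic_set() resets the
 * MCE synchronization counters via mce_reset() before installing the value.
 */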
2195
2196static int __init mcheck_debugfs_init(void)
2197{
2198        struct dentry *dmce, *ffake_panic;
2199
2200        dmce = mce_get_debugfs_dir();
2201        if (!dmce)
2202                return -ENOMEM;
2203        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2204                                          &fake_panic_fops);
2205        if (!ffake_panic)
2206                return -ENOMEM;
2207
2208        return 0;
2209}
2210late_initcall(mcheck_debugfs_init);
2211#endif
2212