/* arch/x86/kernel/cpu/mcheck/dev-mcelog.c */
/*
 * /dev/mcelog driver
 *
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 * Copyright 2008 Intel Corporation
 * Author: Andi Kleen
 */
  10
  11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  12
  13#include <linux/miscdevice.h>
  14#include <linux/slab.h>
  15#include <linux/kmod.h>
  16#include <linux/poll.h>
  17
  18#include "mce-internal.h"
  19
  20static DEFINE_MUTEX(mce_chrdev_read_mutex);
  21
  22static char mce_helper[128];
  23static char *mce_helper_argv[2] = { mce_helper, NULL };
  24
  25#define mce_log_get_idx_check(p) \
  26({ \
  27        RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
  28                         !lockdep_is_held(&mce_chrdev_read_mutex), \
  29                         "suspicious mce_log_get_idx_check() usage"); \
  30        smp_load_acquire(&(p)); \
  31})
  32
  33/*
  34 * Lockless MCE logging infrastructure.
  35 * This avoids deadlocks on printk locks without having to break locks. Also
  36 * separate MCEs from kernel messages to avoid bogus bug reports.
  37 */
  38
  39static struct mce_log_buffer mcelog = {
  40        .signature      = MCE_LOG_SIGNATURE,
  41        .len            = MCE_LOG_LEN,
  42        .recordlen      = sizeof(struct mce),
  43};
  44
  45static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  46
  47/* User mode helper program triggered by machine check event */
  48extern char                     mce_helper[128];
  49
  50static int dev_mce_log(struct notifier_block *nb, unsigned long val,
  51                                void *data)
  52{
  53        struct mce *mce = (struct mce *)data;
  54        unsigned int next, entry;
  55
  56        wmb();
  57        for (;;) {
  58                entry = mce_log_get_idx_check(mcelog.next);
  59                for (;;) {
  60
  61                        /*
  62                         * When the buffer fills up discard new entries.
  63                         * Assume that the earlier errors are the more
  64                         * interesting ones:
  65                         */
  66                        if (entry >= MCE_LOG_LEN) {
  67                                set_bit(MCE_OVERFLOW,
  68                                        (unsigned long *)&mcelog.flags);
  69                                return NOTIFY_OK;
  70                        }
  71                        /* Old left over entry. Skip: */
  72                        if (mcelog.entry[entry].finished) {
  73                                entry++;
  74                                continue;
  75                        }
  76                        break;
  77                }
  78                smp_rmb();
  79                next = entry + 1;
  80                if (cmpxchg(&mcelog.next, entry, next) == entry)
  81                        break;
  82        }
  83        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
  84        wmb();
  85        mcelog.entry[entry].finished = 1;
  86        wmb();
  87
  88        /* wake processes polling /dev/mcelog */
  89        wake_up_interruptible(&mce_chrdev_wait);
  90
  91        return NOTIFY_OK;
  92}
  93
  94static struct notifier_block dev_mcelog_nb = {
  95        .notifier_call  = dev_mce_log,
  96        .priority       = MCE_PRIO_MCELOG,
  97};
  98
  99static void mce_do_trigger(struct work_struct *work)
 100{
 101        call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
 102}
 103
 104static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
 105
 106
 107void mce_work_trigger(void)
 108{
 109        if (mce_helper[0])
 110                schedule_work(&mce_trigger_work);
 111}
 112
 113static ssize_t
 114show_trigger(struct device *s, struct device_attribute *attr, char *buf)
 115{
 116        strcpy(buf, mce_helper);
 117        strcat(buf, "\n");
 118        return strlen(mce_helper) + 1;
 119}
 120
 121static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
 122                                const char *buf, size_t siz)
 123{
 124        char *p;
 125
 126        strncpy(mce_helper, buf, sizeof(mce_helper));
 127        mce_helper[sizeof(mce_helper)-1] = 0;
 128        p = strchr(mce_helper, '\n');
 129
 130        if (p)
 131                *p = 0;
 132
 133        return strlen(mce_helper) + !!p;
 134}
 135
 136DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
 137
 138/*
 139 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 140 */
 141
 142static DEFINE_SPINLOCK(mce_chrdev_state_lock);
 143static int mce_chrdev_open_count;       /* #times opened */
 144static int mce_chrdev_open_exclu;       /* already open exclusive? */
 145
 146static int mce_chrdev_open(struct inode *inode, struct file *file)
 147{
 148        spin_lock(&mce_chrdev_state_lock);
 149
 150        if (mce_chrdev_open_exclu ||
 151            (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
 152                spin_unlock(&mce_chrdev_state_lock);
 153
 154                return -EBUSY;
 155        }
 156
 157        if (file->f_flags & O_EXCL)
 158                mce_chrdev_open_exclu = 1;
 159        mce_chrdev_open_count++;
 160
 161        spin_unlock(&mce_chrdev_state_lock);
 162
 163        return nonseekable_open(inode, file);
 164}
 165
 166static int mce_chrdev_release(struct inode *inode, struct file *file)
 167{
 168        spin_lock(&mce_chrdev_state_lock);
 169
 170        mce_chrdev_open_count--;
 171        mce_chrdev_open_exclu = 0;
 172
 173        spin_unlock(&mce_chrdev_state_lock);
 174
 175        return 0;
 176}
 177
 178static void collect_tscs(void *data)
 179{
 180        unsigned long *cpu_tsc = (unsigned long *)data;
 181
 182        cpu_tsc[smp_processor_id()] = rdtsc();
 183}
 184
 185static int mce_apei_read_done;
 186
 187/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
 188static int __mce_read_apei(char __user **ubuf, size_t usize)
 189{
 190        int rc;
 191        u64 record_id;
 192        struct mce m;
 193
 194        if (usize < sizeof(struct mce))
 195                return -EINVAL;
 196
 197        rc = apei_read_mce(&m, &record_id);
 198        /* Error or no more MCE record */
 199        if (rc <= 0) {
 200                mce_apei_read_done = 1;
 201                /*
 202                 * When ERST is disabled, mce_chrdev_read() should return
 203                 * "no record" instead of "no device."
 204                 */
 205                if (rc == -ENODEV)
 206                        return 0;
 207                return rc;
 208        }
 209        rc = -EFAULT;
 210        if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
 211                return rc;
 212        /*
 213         * In fact, we should have cleared the record after that has
 214         * been flushed to the disk or sent to network in
 215         * /sbin/mcelog, but we have no interface to support that now,
 216         * so just clear it to avoid duplication.
 217         */
 218        rc = apei_clear_mce(record_id);
 219        if (rc) {
 220                mce_apei_read_done = 1;
 221                return rc;
 222        }
 223        *ubuf += sizeof(struct mce);
 224
 225        return 0;
 226}
 227
 228static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
 229                                size_t usize, loff_t *off)
 230{
 231        char __user *buf = ubuf;
 232        unsigned long *cpu_tsc;
 233        unsigned prev, next;
 234        int i, err;
 235
 236        cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
 237        if (!cpu_tsc)
 238                return -ENOMEM;
 239
 240        mutex_lock(&mce_chrdev_read_mutex);
 241
 242        if (!mce_apei_read_done) {
 243                err = __mce_read_apei(&buf, usize);
 244                if (err || buf != ubuf)
 245                        goto out;
 246        }
 247
 248        next = mce_log_get_idx_check(mcelog.next);
 249
 250        /* Only supports full reads right now */
 251        err = -EINVAL;
 252        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
 253                goto out;
 254
 255        err = 0;
 256        prev = 0;
 257        do {
 258                for (i = prev; i < next; i++) {
 259                        unsigned long start = jiffies;
 260                        struct mce *m = &mcelog.entry[i];
 261
 262                        while (!m->finished) {
 263                                if (time_after_eq(jiffies, start + 2)) {
 264                                        memset(m, 0, sizeof(*m));
 265                                        goto timeout;
 266                                }
 267                                cpu_relax();
 268                        }
 269                        smp_rmb();
 270                        err |= copy_to_user(buf, m, sizeof(*m));
 271                        buf += sizeof(*m);
 272timeout:
 273                        ;
 274                }
 275
 276                memset(mcelog.entry + prev, 0,
 277                       (next - prev) * sizeof(struct mce));
 278                prev = next;
 279                next = cmpxchg(&mcelog.next, prev, 0);
 280        } while (next != prev);
 281
 282        synchronize_sched();
 283
 284        /*
 285         * Collect entries that were still getting written before the
 286         * synchronize.
 287         */
 288        on_each_cpu(collect_tscs, cpu_tsc, 1);
 289
 290        for (i = next; i < MCE_LOG_LEN; i++) {
 291                struct mce *m = &mcelog.entry[i];
 292
 293                if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
 294                        err |= copy_to_user(buf, m, sizeof(*m));
 295                        smp_rmb();
 296                        buf += sizeof(*m);
 297                        memset(m, 0, sizeof(*m));
 298                }
 299        }
 300
 301        if (err)
 302                err = -EFAULT;
 303
 304out:
 305        mutex_unlock(&mce_chrdev_read_mutex);
 306        kfree(cpu_tsc);
 307
 308        return err ? err : buf - ubuf;
 309}
 310
 311static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
 312{
 313        poll_wait(file, &mce_chrdev_wait, wait);
 314        if (READ_ONCE(mcelog.next))
 315                return POLLIN | POLLRDNORM;
 316        if (!mce_apei_read_done && apei_check_mce())
 317                return POLLIN | POLLRDNORM;
 318        return 0;
 319}
 320
 321static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
 322                                unsigned long arg)
 323{
 324        int __user *p = (int __user *)arg;
 325
 326        if (!capable(CAP_SYS_ADMIN))
 327                return -EPERM;
 328
 329        switch (cmd) {
 330        case MCE_GET_RECORD_LEN:
 331                return put_user(sizeof(struct mce), p);
 332        case MCE_GET_LOG_LEN:
 333                return put_user(MCE_LOG_LEN, p);
 334        case MCE_GETCLEAR_FLAGS: {
 335                unsigned flags;
 336
 337                do {
 338                        flags = mcelog.flags;
 339                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
 340
 341                return put_user(flags, p);
 342        }
 343        default:
 344                return -ENOTTY;
 345        }
 346}
 347
 348static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
 349                            size_t usize, loff_t *off);
 350
 351void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
 352                             const char __user *ubuf,
 353                             size_t usize, loff_t *off))
 354{
 355        mce_write = fn;
 356}
 357EXPORT_SYMBOL_GPL(register_mce_write_callback);
 358
 359static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
 360                                size_t usize, loff_t *off)
 361{
 362        if (mce_write)
 363                return mce_write(filp, ubuf, usize, off);
 364        else
 365                return -EINVAL;
 366}
 367
 368static const struct file_operations mce_chrdev_ops = {
 369        .open                   = mce_chrdev_open,
 370        .release                = mce_chrdev_release,
 371        .read                   = mce_chrdev_read,
 372        .write                  = mce_chrdev_write,
 373        .poll                   = mce_chrdev_poll,
 374        .unlocked_ioctl         = mce_chrdev_ioctl,
 375        .llseek                 = no_llseek,
 376};
 377
 378static struct miscdevice mce_chrdev_device = {
 379        MISC_MCELOG_MINOR,
 380        "mcelog",
 381        &mce_chrdev_ops,
 382};
 383
 384static __init int dev_mcelog_init_device(void)
 385{
 386        int err;
 387
 388        /* register character device /dev/mcelog */
 389        err = misc_register(&mce_chrdev_device);
 390        if (err) {
 391                pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
 392                return err;
 393        }
 394        mce_register_decode_chain(&dev_mcelog_nb);
 395        return 0;
 396}
 397device_initcall_sync(dev_mcelog_init_device);
 398