linux/arch/powerpc/platforms/powernv/opal.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * PowerNV OPAL high level interfaces
   4 *
   5 * Copyright 2011 IBM Corp.
   6 */
   7
   8#define pr_fmt(fmt)     "opal: " fmt
   9
  10#include <linux/printk.h>
  11#include <linux/types.h>
  12#include <linux/of.h>
  13#include <linux/of_fdt.h>
  14#include <linux/of_platform.h>
  15#include <linux/of_address.h>
  16#include <linux/interrupt.h>
  17#include <linux/notifier.h>
  18#include <linux/slab.h>
  19#include <linux/sched.h>
  20#include <linux/kobject.h>
  21#include <linux/delay.h>
  22#include <linux/memblock.h>
  23#include <linux/kthread.h>
  24#include <linux/freezer.h>
  25#include <linux/kmsg_dump.h>
  26#include <linux/console.h>
  27#include <linux/sched/debug.h>
  28
  29#include <asm/machdep.h>
  30#include <asm/opal.h>
  31#include <asm/firmware.h>
  32#include <asm/mce.h>
  33#include <asm/imc-pmu.h>
  34#include <asm/bug.h>
  35
  36#include "powernv.h"
  37
  38#define OPAL_MSG_QUEUE_MAX 16
  39
  40struct opal_msg_node {
  41        struct list_head        list;
  42        struct opal_msg         msg;
  43};
  44
  45static DEFINE_SPINLOCK(msg_list_lock);
  46static LIST_HEAD(msg_list);
  47
  48/* /sys/firmware/opal */
  49struct kobject *opal_kobj;
  50
  51struct opal {
  52        u64 base;
  53        u64 entry;
  54        u64 size;
  55} opal;
  56
  57struct mcheck_recoverable_range {
  58        u64 start_addr;
  59        u64 end_addr;
  60        u64 recover_addr;
  61};
  62
  63static int msg_list_size;
  64
  65static struct mcheck_recoverable_range *mc_recoverable_range;
  66static int mc_recoverable_range_len;
  67
  68struct device_node *opal_node;
  69static DEFINE_SPINLOCK(opal_write_lock);
  70static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
  71static uint32_t opal_heartbeat;
  72static struct task_struct *kopald_tsk;
  73static struct opal_msg *opal_msg;
  74static u32 opal_msg_size __ro_after_init;
  75
  76void opal_configure_cores(void)
  77{
  78        u64 reinit_flags = 0;
  79
  80        /* Do the actual re-init, This will clobber all FPRs, VRs, etc...
  81         *
  82         * It will preserve non volatile GPRs and HSPRG0/1. It will
  83         * also restore HIDs and other SPRs to their original value
  84         * but it might clobber a bunch.
  85         */
  86#ifdef __BIG_ENDIAN__
  87        reinit_flags |= OPAL_REINIT_CPUS_HILE_BE;
  88#else
  89        reinit_flags |= OPAL_REINIT_CPUS_HILE_LE;
  90#endif
  91
  92        /*
  93         * POWER9 always support running hash:
  94         *  ie. Host hash  supports  hash guests
  95         *      Host radix supports  hash/radix guests
  96         */
  97        if (early_cpu_has_feature(CPU_FTR_ARCH_300)) {
  98                reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH;
  99                if (early_radix_enabled())
 100                        reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX;
 101        }
 102
 103        opal_reinit_cpus(reinit_flags);
 104
 105        /* Restore some bits */
 106        if (cur_cpu_spec->cpu_restore)
 107                cur_cpu_spec->cpu_restore();
 108}
 109
 110int __init early_init_dt_scan_opal(unsigned long node,
 111                                   const char *uname, int depth, void *data)
 112{
 113        const void *basep, *entryp, *sizep;
 114        int basesz, entrysz, runtimesz;
 115
 116        if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
 117                return 0;
 118
 119        basep  = of_get_flat_dt_prop(node, "opal-base-address", &basesz);
 120        entryp = of_get_flat_dt_prop(node, "opal-entry-address", &entrysz);
 121        sizep = of_get_flat_dt_prop(node, "opal-runtime-size", &runtimesz);
 122
 123        if (!basep || !entryp || !sizep)
 124                return 1;
 125
 126        opal.base = of_read_number(basep, basesz/4);
 127        opal.entry = of_read_number(entryp, entrysz/4);
 128        opal.size = of_read_number(sizep, runtimesz/4);
 129
 130        pr_debug("OPAL Base  = 0x%llx (basep=%p basesz=%d)\n",
 131                 opal.base, basep, basesz);
 132        pr_debug("OPAL Entry = 0x%llx (entryp=%p basesz=%d)\n",
 133                 opal.entry, entryp, entrysz);
 134        pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n",
 135                 opal.size, sizep, runtimesz);
 136
 137        if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
 138                powerpc_firmware_features |= FW_FEATURE_OPAL;
 139                pr_debug("OPAL detected !\n");
 140        } else {
 141                panic("OPAL != V3 detected, no longer supported.\n");
 142        }
 143
 144        return 1;
 145}
 146
 147int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
 148                                   const char *uname, int depth, void *data)
 149{
 150        int i, psize, size;
 151        const __be32 *prop;
 152
 153        if (depth != 1 || strcmp(uname, "ibm,opal") != 0)
 154                return 0;
 155
 156        prop = of_get_flat_dt_prop(node, "mcheck-recoverable-ranges", &psize);
 157
 158        if (!prop)
 159                return 1;
 160
 161        pr_debug("Found machine check recoverable ranges.\n");
 162
 163        /*
 164         * Calculate number of available entries.
 165         *
 166         * Each recoverable address range entry is (start address, len,
 167         * recovery address), 2 cells each for start and recovery address,
 168         * 1 cell for len, totalling 5 cells per entry.
 169         */
 170        mc_recoverable_range_len = psize / (sizeof(*prop) * 5);
 171
 172        /* Sanity check */
 173        if (!mc_recoverable_range_len)
 174                return 1;
 175
 176        /* Size required to hold all the entries. */
 177        size = mc_recoverable_range_len *
 178                        sizeof(struct mcheck_recoverable_range);
 179
 180        /*
 181         * Allocate a buffer to hold the MC recoverable ranges.
 182         */
 183        mc_recoverable_range = memblock_alloc(size, __alignof__(u64));
 184        if (!mc_recoverable_range)
 185                panic("%s: Failed to allocate %u bytes align=0x%lx\n",
 186                      __func__, size, __alignof__(u64));
 187
 188        for (i = 0; i < mc_recoverable_range_len; i++) {
 189                mc_recoverable_range[i].start_addr =
 190                                        of_read_number(prop + (i * 5) + 0, 2);
 191                mc_recoverable_range[i].end_addr =
 192                                        mc_recoverable_range[i].start_addr +
 193                                        of_read_number(prop + (i * 5) + 2, 1);
 194                mc_recoverable_range[i].recover_addr =
 195                                        of_read_number(prop + (i * 5) + 3, 2);
 196
 197                pr_debug("Machine check recoverable range: %llx..%llx: %llx\n",
 198                                mc_recoverable_range[i].start_addr,
 199                                mc_recoverable_range[i].end_addr,
 200                                mc_recoverable_range[i].recover_addr);
 201        }
 202        return 1;
 203}
 204
 205static int __init opal_register_exception_handlers(void)
 206{
 207#ifdef __BIG_ENDIAN__
 208        u64 glue;
 209
 210        if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
 211                return -ENODEV;
 212
 213        /* Hookup some exception handlers except machine check. We use the
 214         * fwnmi area at 0x7000 to provide the glue space to OPAL
 215         */
 216        glue = 0x7000;
 217
 218        /*
 219         * Only ancient OPAL firmware requires this.
 220         * Specifically, firmware from FW810.00 (released June 2014)
 221         * through FW810.20 (Released October 2014).
 222         *
 223         * Check if we are running on newer (post Oct 2014) firmware that
 224         * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
 225         * patch the HMI interrupt and we catch it directly in Linux.
 226         *
 227         * For older firmware (i.e < FW810.20), we fallback to old behavior and
 228         * let OPAL patch the HMI vector and handle it inside OPAL firmware.
 229         *
 230         * For newer firmware we catch/handle the HMI directly in Linux.
 231         */
 232        if (!opal_check_token(OPAL_HANDLE_HMI)) {
 233                pr_info("Old firmware detected, OPAL handles HMIs.\n");
 234                opal_register_exception_handler(
 235                                OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
 236                                0, glue);
 237                glue += 128;
 238        }
 239
 240        /*
 241         * Only applicable to ancient firmware, all modern
 242         * (post March 2015/skiboot 5.0) firmware will just return
 243         * OPAL_UNSUPPORTED.
 244         */
 245        opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
 246#endif
 247
 248        return 0;
 249}
 250machine_early_initcall(powernv, opal_register_exception_handlers);
 251
 252static void queue_replay_msg(void *msg)
 253{
 254        struct opal_msg_node *msg_node;
 255
 256        if (msg_list_size < OPAL_MSG_QUEUE_MAX) {
 257                msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
 258                if (msg_node) {
 259                        INIT_LIST_HEAD(&msg_node->list);
 260                        memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
 261                        list_add_tail(&msg_node->list, &msg_list);
 262                        msg_list_size++;
 263                } else
 264                        pr_warn_once("message queue no memory\n");
 265
 266                if (msg_list_size >= OPAL_MSG_QUEUE_MAX)
 267                        pr_warn_once("message queue full\n");
 268        }
 269}
 270
 271static void dequeue_replay_msg(enum opal_msg_type msg_type)
 272{
 273        struct opal_msg_node *msg_node, *tmp;
 274
 275        list_for_each_entry_safe(msg_node, tmp, &msg_list, list) {
 276                if (be32_to_cpu(msg_node->msg.msg_type) != msg_type)
 277                        continue;
 278
 279                atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
 280                                        msg_type,
 281                                        &msg_node->msg);
 282
 283                list_del(&msg_node->list);
 284                kfree(msg_node);
 285                msg_list_size--;
 286        }
 287}
 288
 289/*
 290 * Opal message notifier based on message type. Allow subscribers to get
 291 * notified for specific messgae type.
 292 */
 293int opal_message_notifier_register(enum opal_msg_type msg_type,
 294                                        struct notifier_block *nb)
 295{
 296        int ret;
 297        unsigned long flags;
 298
 299        if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
 300                pr_warn("%s: Invalid arguments, msg_type:%d\n",
 301                        __func__, msg_type);
 302                return -EINVAL;
 303        }
 304
 305        spin_lock_irqsave(&msg_list_lock, flags);
 306        ret = atomic_notifier_chain_register(
 307                &opal_msg_notifier_head[msg_type], nb);
 308
 309        /*
 310         * If the registration succeeded, replay any queued messages that came
 311         * in prior to the notifier chain registration. msg_list_lock held here
 312         * to ensure they're delivered prior to any subsequent messages.
 313         */
 314        if (ret == 0)
 315                dequeue_replay_msg(msg_type);
 316
 317        spin_unlock_irqrestore(&msg_list_lock, flags);
 318
 319        return ret;
 320}
 321EXPORT_SYMBOL_GPL(opal_message_notifier_register);
 322
 323int opal_message_notifier_unregister(enum opal_msg_type msg_type,
 324                                     struct notifier_block *nb)
 325{
 326        return atomic_notifier_chain_unregister(
 327                        &opal_msg_notifier_head[msg_type], nb);
 328}
 329EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
 330
 331static void opal_message_do_notify(uint32_t msg_type, void *msg)
 332{
 333        unsigned long flags;
 334        bool queued = false;
 335
 336        spin_lock_irqsave(&msg_list_lock, flags);
 337        if (opal_msg_notifier_head[msg_type].head == NULL) {
 338                /*
 339                 * Queue up the msg since no notifiers have registered
 340                 * yet for this msg_type.
 341                 */
 342                queue_replay_msg(msg);
 343                queued = true;
 344        }
 345        spin_unlock_irqrestore(&msg_list_lock, flags);
 346
 347        if (queued)
 348                return;
 349
 350        /* notify subscribers */
 351        atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
 352                                        msg_type, msg);
 353}
 354
 355static void opal_handle_message(void)
 356{
 357        s64 ret;
 358        u32 type;
 359
 360        ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
 361        /* No opal message pending. */
 362        if (ret == OPAL_RESOURCE)
 363                return;
 364
 365        /* check for errors. */
 366        if (ret) {
 367                pr_warn("%s: Failed to retrieve opal message, err=%lld\n",
 368                        __func__, ret);
 369                return;
 370        }
 371
 372        type = be32_to_cpu(opal_msg->msg_type);
 373
 374        /* Sanity check */
 375        if (type >= OPAL_MSG_TYPE_MAX) {
 376                pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
 377                return;
 378        }
 379        opal_message_do_notify(type, (void *)opal_msg);
 380}
 381
 382static irqreturn_t opal_message_notify(int irq, void *data)
 383{
 384        opal_handle_message();
 385        return IRQ_HANDLED;
 386}
 387
 388static int __init opal_message_init(struct device_node *opal_node)
 389{
 390        int ret, i, irq;
 391
 392        ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
 393        if (ret) {
 394                pr_notice("Failed to read opal-msg-size property\n");
 395                opal_msg_size = sizeof(struct opal_msg);
 396        }
 397
 398        opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
 399        if (!opal_msg) {
 400                opal_msg_size = sizeof(struct opal_msg);
 401                /* Try to allocate fixed message size */
 402                opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
 403                BUG_ON(opal_msg == NULL);
 404        }
 405
 406        for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
 407                ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
 408
 409        irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
 410        if (!irq) {
 411                pr_err("%s: Can't register OPAL event irq (%d)\n",
 412                       __func__, irq);
 413                return irq;
 414        }
 415
 416        ret = request_irq(irq, opal_message_notify,
 417                        IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
 418        if (ret) {
 419                pr_err("%s: Can't request OPAL event irq (%d)\n",
 420                       __func__, ret);
 421                return ret;
 422        }
 423
 424        return 0;
 425}
 426
 427int opal_get_chars(uint32_t vtermno, char *buf, int count)
 428{
 429        s64 rc;
 430        __be64 evt, len;
 431
 432        if (!opal.entry)
 433                return -ENODEV;
 434        opal_poll_events(&evt);
 435        if ((be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_INPUT) == 0)
 436                return 0;
 437        len = cpu_to_be64(count);
 438        rc = opal_console_read(vtermno, &len, buf);
 439        if (rc == OPAL_SUCCESS)
 440                return be64_to_cpu(len);
 441        return 0;
 442}
 443
 444static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
 445{
 446        unsigned long flags = 0 /* shut up gcc */;
 447        int written;
 448        __be64 olen;
 449        s64 rc;
 450
 451        if (!opal.entry)
 452                return -ENODEV;
 453
 454        if (atomic)
 455                spin_lock_irqsave(&opal_write_lock, flags);
 456        rc = opal_console_write_buffer_space(vtermno, &olen);
 457        if (rc || be64_to_cpu(olen) < total_len) {
 458                /* Closed -> drop characters */
 459                if (rc)
 460                        written = total_len;
 461                else
 462                        written = -EAGAIN;
 463                goto out;
 464        }
 465
 466        /* Should not get a partial write here because space is available. */
 467        olen = cpu_to_be64(total_len);
 468        rc = opal_console_write(vtermno, &olen, data);
 469        if (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
 470                if (rc == OPAL_BUSY_EVENT)
 471                        opal_poll_events(NULL);
 472                written = -EAGAIN;
 473                goto out;
 474        }
 475
 476        /* Closed or other error drop */
 477        if (rc != OPAL_SUCCESS) {
 478                written = opal_error_code(rc);
 479                goto out;
 480        }
 481
 482        written = be64_to_cpu(olen);
 483        if (written < total_len) {
 484                if (atomic) {
 485                        /* Should not happen */
 486                        pr_warn("atomic console write returned partial "
 487                                "len=%d written=%d\n", total_len, written);
 488                }
 489                if (!written)
 490                        written = -EAGAIN;
 491        }
 492
 493out:
 494        if (atomic)
 495                spin_unlock_irqrestore(&opal_write_lock, flags);
 496
 497        return written;
 498}
 499
 500int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 501{
 502        return __opal_put_chars(vtermno, data, total_len, false);
 503}
 504
 505/*
 506 * opal_put_chars_atomic will not perform partial-writes. Data will be
 507 * atomically written to the terminal or not at all. This is not strictly
 508 * true at the moment because console space can race with OPAL's console
 509 * writes.
 510 */
 511int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
 512{
 513        return __opal_put_chars(vtermno, data, total_len, true);
 514}
 515
 516static s64 __opal_flush_console(uint32_t vtermno)
 517{
 518        s64 rc;
 519
 520        if (!opal_check_token(OPAL_CONSOLE_FLUSH)) {
 521                __be64 evt;
 522
 523                /*
 524                 * If OPAL_CONSOLE_FLUSH is not implemented in the firmware,
 525                 * the console can still be flushed by calling the polling
 526                 * function while it has OPAL_EVENT_CONSOLE_OUTPUT events.
 527                 */
 528                WARN_ONCE(1, "opal: OPAL_CONSOLE_FLUSH missing.\n");
 529
 530                opal_poll_events(&evt);
 531                if (!(be64_to_cpu(evt) & OPAL_EVENT_CONSOLE_OUTPUT))
 532                        return OPAL_SUCCESS;
 533                return OPAL_BUSY;
 534
 535        } else {
 536                rc = opal_console_flush(vtermno);
 537                if (rc == OPAL_BUSY_EVENT) {
 538                        opal_poll_events(NULL);
 539                        rc = OPAL_BUSY;
 540                }
 541                return rc;
 542        }
 543
 544}
 545
 546/*
 547 * opal_flush_console spins until the console is flushed
 548 */
 549int opal_flush_console(uint32_t vtermno)
 550{
 551        for (;;) {
 552                s64 rc = __opal_flush_console(vtermno);
 553
 554                if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
 555                        mdelay(1);
 556                        continue;
 557                }
 558
 559                return opal_error_code(rc);
 560        }
 561}
 562
 563/*
 564 * opal_flush_chars is an hvc interface that sleeps until the console is
 565 * flushed if wait, otherwise it will return -EBUSY if the console has data,
 566 * -EAGAIN if it has data and some of it was flushed.
 567 */
 568int opal_flush_chars(uint32_t vtermno, bool wait)
 569{
 570        for (;;) {
 571                s64 rc = __opal_flush_console(vtermno);
 572
 573                if (rc == OPAL_BUSY || rc == OPAL_PARTIAL) {
 574                        if (wait) {
 575                                msleep(OPAL_BUSY_DELAY_MS);
 576                                continue;
 577                        }
 578                        if (rc == OPAL_PARTIAL)
 579                                return -EAGAIN;
 580                }
 581
 582                return opal_error_code(rc);
 583        }
 584}
 585
 586static int opal_recover_mce(struct pt_regs *regs,
 587                                        struct machine_check_event *evt)
 588{
 589        int recovered = 0;
 590
 591        if (regs_is_unrecoverable(regs)) {
 592                /* If MSR_RI isn't set, we cannot recover */
 593                pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
 594                recovered = 0;
 595        } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
 596                /* Platform corrected itself */
 597                recovered = 1;
 598        } else if (evt->severity == MCE_SEV_FATAL) {
 599                /* Fatal machine check */
 600                pr_err("Machine check interrupt is fatal\n");
 601                recovered = 0;
 602        }
 603
 604        if (!recovered && evt->sync_error) {
 605                /*
 606                 * Try to kill processes if we get a synchronous machine check
 607                 * (e.g., one caused by execution of this instruction). This
 608                 * will devolve into a panic if we try to kill init or are in
 609                 * an interrupt etc.
 610                 *
 611                 * TODO: Queue up this address for hwpoisioning later.
 612                 * TODO: This is not quite right for d-side machine
 613                 *       checks ->nip is not necessarily the important
 614                 *       address.
 615                 */
 616                if ((user_mode(regs))) {
 617                        _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
 618                        recovered = 1;
 619                } else if (die_will_crash()) {
 620                        /*
 621                         * die() would kill the kernel, so better to go via
 622                         * the platform reboot code that will log the
 623                         * machine check.
 624                         */
 625                        recovered = 0;
 626                } else {
 627                        die_mce("Machine check", regs, SIGBUS);
 628                        recovered = 1;
 629                }
 630        }
 631
 632        return recovered;
 633}
 634
 635void __noreturn pnv_platform_error_reboot(struct pt_regs *regs, const char *msg)
 636{
 637        panic_flush_kmsg_start();
 638
 639        pr_emerg("Hardware platform error: %s\n", msg);
 640        if (regs)
 641                show_regs(regs);
 642        smp_send_stop();
 643
 644        panic_flush_kmsg_end();
 645
 646        /*
 647         * Don't bother to shut things down because this will
 648         * xstop the system.
 649         */
 650        if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg)
 651                                                == OPAL_UNSUPPORTED) {
 652                pr_emerg("Reboot type %d not supported for %s\n",
 653                                OPAL_REBOOT_PLATFORM_ERROR, msg);
 654        }
 655
 656        /*
 657         * We reached here. There can be three possibilities:
 658         * 1. We are running on a firmware level that do not support
 659         *    opal_cec_reboot2()
 660         * 2. We are running on a firmware level that do not support
 661         *    OPAL_REBOOT_PLATFORM_ERROR reboot type.
 662         * 3. We are running on FSP based system that does not need
 663         *    opal to trigger checkstop explicitly for error analysis.
 664         *    The FSP PRD component would have already got notified
 665         *    about this error through other channels.
 666         * 4. We are running on a newer skiboot that by default does
 667         *    not cause a checkstop, drops us back to the kernel to
 668         *    extract context and state at the time of the error.
 669         */
 670
 671        panic(msg);
 672}
 673
 674int opal_machine_check(struct pt_regs *regs)
 675{
 676        struct machine_check_event evt;
 677
 678        if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 679                return 0;
 680
 681        /* Print things out */
 682        if (evt.version != MCE_V1) {
 683                pr_err("Machine Check Exception, Unknown event version %d !\n",
 684                       evt.version);
 685                return 0;
 686        }
 687        machine_check_print_event_info(&evt, user_mode(regs), false);
 688
 689        if (opal_recover_mce(regs, &evt))
 690                return 1;
 691
 692        pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception");
 693}
 694
 695/* Early hmi handler called in real mode. */
 696int opal_hmi_exception_early(struct pt_regs *regs)
 697{
 698        s64 rc;
 699
 700        /*
 701         * call opal hmi handler. Pass paca address as token.
 702         * The return value OPAL_SUCCESS is an indication that there is
 703         * an HMI event generated waiting to pull by Linux.
 704         */
 705        rc = opal_handle_hmi();
 706        if (rc == OPAL_SUCCESS) {
 707                local_paca->hmi_event_available = 1;
 708                return 1;
 709        }
 710        return 0;
 711}
 712
 713int opal_hmi_exception_early2(struct pt_regs *regs)
 714{
 715        s64 rc;
 716        __be64 out_flags;
 717
 718        /*
 719         * call opal hmi handler.
 720         * Check 64-bit flag mask to find out if an event was generated,
 721         * and whether TB is still valid or not etc.
 722         */
 723        rc = opal_handle_hmi2(&out_flags);
 724        if (rc != OPAL_SUCCESS)
 725                return 0;
 726
 727        if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT)
 728                local_paca->hmi_event_available = 1;
 729        if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL)
 730                tb_invalid = true;
 731        return 1;
 732}
 733
 734/* HMI exception handler called in virtual mode when irqs are next enabled. */
 735int opal_handle_hmi_exception(struct pt_regs *regs)
 736{
 737        /*
 738         * Check if HMI event is available.
 739         * if Yes, then wake kopald to process them.
 740         */
 741        if (!local_paca->hmi_event_available)
 742                return 0;
 743
 744        local_paca->hmi_event_available = 0;
 745        opal_wake_poller();
 746
 747        return 1;
 748}
 749
 750static uint64_t find_recovery_address(uint64_t nip)
 751{
 752        int i;
 753
 754        for (i = 0; i < mc_recoverable_range_len; i++)
 755                if ((nip >= mc_recoverable_range[i].start_addr) &&
 756                    (nip < mc_recoverable_range[i].end_addr))
 757                    return mc_recoverable_range[i].recover_addr;
 758        return 0;
 759}
 760
 761bool opal_mce_check_early_recovery(struct pt_regs *regs)
 762{
 763        uint64_t recover_addr = 0;
 764
 765        if (!opal.base || !opal.size)
 766                goto out;
 767
 768        if ((regs->nip >= opal.base) &&
 769                        (regs->nip < (opal.base + opal.size)))
 770                recover_addr = find_recovery_address(regs->nip);
 771
 772        /*
 773         * Setup regs->nip to rfi into fixup address.
 774         */
 775        if (recover_addr)
 776                regs_set_return_ip(regs, recover_addr);
 777
 778out:
 779        return !!recover_addr;
 780}
 781
 782static int opal_sysfs_init(void)
 783{
 784        opal_kobj = kobject_create_and_add("opal", firmware_kobj);
 785        if (!opal_kobj) {
 786                pr_warn("kobject_create_and_add opal failed\n");
 787                return -ENOMEM;
 788        }
 789
 790        return 0;
 791}
 792
 793static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
 794                                struct bin_attribute *bin_attr, char *buf,
 795                                loff_t off, size_t count)
 796{
 797        return memory_read_from_buffer(buf, count, &off, bin_attr->private,
 798                                       bin_attr->size);
 799}
 800
 801static int opal_add_one_export(struct kobject *parent, const char *export_name,
 802                               struct device_node *np, const char *prop_name)
 803{
 804        struct bin_attribute *attr = NULL;
 805        const char *name = NULL;
 806        u64 vals[2];
 807        int rc;
 808
 809        rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
 810        if (rc)
 811                goto out;
 812
 813        attr = kzalloc(sizeof(*attr), GFP_KERNEL);
 814        if (!attr) {
 815                rc = -ENOMEM;
 816                goto out;
 817        }
 818        name = kstrdup(export_name, GFP_KERNEL);
 819        if (!name) {
 820                rc = -ENOMEM;
 821                goto out;
 822        }
 823
 824        sysfs_bin_attr_init(attr);
 825        attr->attr.name = name;
 826        attr->attr.mode = 0400;
 827        attr->read = export_attr_read;
 828        attr->private = __va(vals[0]);
 829        attr->size = vals[1];
 830
 831        rc = sysfs_create_bin_file(parent, attr);
 832out:
 833        if (rc) {
 834                kfree(name);
 835                kfree(attr);
 836        }
 837
 838        return rc;
 839}
 840
 841static void opal_add_exported_attrs(struct device_node *np,
 842                                    struct kobject *kobj)
 843{
 844        struct device_node *child;
 845        struct property *prop;
 846
 847        for_each_property_of_node(np, prop) {
 848                int rc;
 849
 850                if (!strcmp(prop->name, "name") ||
 851                    !strcmp(prop->name, "phandle"))
 852                        continue;
 853
 854                rc = opal_add_one_export(kobj, prop->name, np, prop->name);
 855                if (rc) {
 856                        pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
 857                                np, prop->name, rc);
 858                }
 859        }
 860
 861        for_each_child_of_node(np, child) {
 862                struct kobject *child_kobj;
 863
 864                child_kobj = kobject_create_and_add(child->name, kobj);
 865                if (!child_kobj) {
 866                        pr_err("Unable to create export dir for %pOF\n", child);
 867                        continue;
 868                }
 869
 870                opal_add_exported_attrs(child, child_kobj);
 871        }
 872}
 873
 874/*
 875 * opal_export_attrs: creates a sysfs node for each property listed in
 876 * the device-tree under /ibm,opal/firmware/exports/
 877 * All new sysfs nodes are created under /opal/exports/.
 878 * This allows for reserved memory regions (e.g. HDAT) to be read.
 879 * The new sysfs nodes are only readable by root.
 880 */
 881static void opal_export_attrs(void)
 882{
 883        struct device_node *np;
 884        struct kobject *kobj;
 885        int rc;
 886
 887        np = of_find_node_by_path("/ibm,opal/firmware/exports");
 888        if (!np)
 889                return;
 890
 891        /* Create new 'exports' directory - /sys/firmware/opal/exports */
 892        kobj = kobject_create_and_add("exports", opal_kobj);
 893        if (!kobj) {
 894                pr_warn("kobject_create_and_add() of exports failed\n");
 895                return;
 896        }
 897
 898        opal_add_exported_attrs(np, kobj);
 899
 900        /*
 901         * NB: symbol_map existed before the generic export interface so it
 902         * lives under the top level opal_kobj.
 903         */
 904        rc = opal_add_one_export(opal_kobj, "symbol_map",
 905                                 np->parent, "symbol-map");
 906        if (rc)
 907                pr_warn("Error %d creating OPAL symbols file\n", rc);
 908
 909        of_node_put(np);
 910}
 911
 912static void __init opal_dump_region_init(void)
 913{
 914        void *addr;
 915        uint64_t size;
 916        int rc;
 917
 918        if (!opal_check_token(OPAL_REGISTER_DUMP_REGION))
 919                return;
 920
 921        /* Register kernel log buffer */
 922        addr = log_buf_addr_get();
 923        if (addr == NULL)
 924                return;
 925
 926        size = log_buf_len_get();
 927        if (size == 0)
 928                return;
 929
 930        rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
 931                                       __pa(addr), size);
 932        /* Don't warn if this is just an older OPAL that doesn't
 933         * know about that call
 934         */
 935        if (rc && rc != OPAL_UNSUPPORTED)
 936                pr_warn("DUMP: Failed to register kernel log buffer. "
 937                        "rc = %d\n", rc);
 938}
 939
 940static void opal_pdev_init(const char *compatible)
 941{
 942        struct device_node *np;
 943
 944        for_each_compatible_node(np, NULL, compatible)
 945                of_platform_device_create(np, NULL, NULL);
 946}
 947
 948static void __init opal_imc_init_dev(void)
 949{
 950        struct device_node *np;
 951
 952        np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
 953        if (np)
 954                of_platform_device_create(np, NULL, NULL);
 955}
 956
 957static int kopald(void *unused)
 958{
 959        unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1;
 960
 961        set_freezable();
 962        do {
 963                try_to_freeze();
 964
 965                opal_handle_events();
 966
 967                set_current_state(TASK_INTERRUPTIBLE);
 968                if (opal_have_pending_events())
 969                        __set_current_state(TASK_RUNNING);
 970                else
 971                        schedule_timeout(timeout);
 972
 973        } while (!kthread_should_stop());
 974
 975        return 0;
 976}
 977
 978void opal_wake_poller(void)
 979{
 980        if (kopald_tsk)
 981                wake_up_process(kopald_tsk);
 982}
 983
 984static void opal_init_heartbeat(void)
 985{
 986        /* Old firwmware, we assume the HVC heartbeat is sufficient */
 987        if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
 988                                 &opal_heartbeat) != 0)
 989                opal_heartbeat = 0;
 990
 991        if (opal_heartbeat)
 992                kopald_tsk = kthread_run(kopald, NULL, "kopald");
 993}
 994
/*
 * opal_init: bring up the OPAL-backed kernel interfaces.
 *
 * Looks up the /ibm,opal device-tree node (kept in the global opal_node),
 * then initialises the OPAL sub-interfaces and creates platform devices for
 * the OPAL-provided device-tree nodes, in the order written below.  The
 * sysfs-backed interfaces are only set up when opal_sysfs_init() succeeds.
 *
 * Returns -ENODEV if the /ibm,opal node does not exist, 0 otherwise.
 * Registered as a subsys initcall for the powernv machine.
 */
static int __init opal_init(void)
{
        struct device_node *np, *consoles, *leds;
        int rc;

        opal_node = of_find_node_by_path("/ibm,opal");
        if (!opal_node) {
                pr_warn("Device node not found\n");
                return -ENODEV;
        }

        /* Register OPAL consoles if any ports */
        consoles = of_find_node_by_path("/ibm,opal/consoles");
        if (consoles) {
                for_each_child_of_node(consoles, np) {
                        /* Only children named "serial" become devices */
                        if (!of_node_name_eq(np, "serial"))
                                continue;
                        of_platform_device_create(np, NULL, NULL);
                }
                of_node_put(consoles);
        }

        /* Initialise OPAL messaging system */
        opal_message_init(opal_node);

        /* Initialise OPAL asynchronous completion interface */
        opal_async_comp_init();

        /* Initialise OPAL sensor interface */
        opal_sensor_init();

        /* Initialise OPAL hypervisor maintenance interrupt handling */
        opal_hmi_handler_init();

        /* Create i2c platform devices */
        opal_pdev_init("ibm,opal-i2c");

        /* Handle non-volatile memory devices */
        opal_pdev_init("pmem-region");

        /* Setup a heartbeat thread if requested by OPAL */
        opal_init_heartbeat();

        /* Detect In-Memory Collection counters and create devices */
        opal_imc_init_dev();

        /* Create leds platform devices */
        leds = of_find_node_by_path("/ibm,opal/leds");
        if (leds) {
                of_platform_device_create(leds, "opal_leds", NULL);
                of_node_put(leds);
        }

        /* Initialise OPAL message log interface */
        opal_msglog_init();

        /* Create "opal" kobject under /sys/firmware */
        rc = opal_sysfs_init();
        if (rc == 0) {
                /* Setup dump region interface */
                opal_dump_region_init();
                /*
                 * Setup error log interface.
                 * NOTE(review): the value stored in rc here is never read
                 * afterwards; the function returns 0 regardless.
                 */
                rc = opal_elog_init();
                /* Setup code update interface */
                opal_flash_update_init();
                /* Setup platform dump extract interface */
                opal_platform_dump_init();
                /* Setup system parameters interface */
                opal_sys_param_init();
                /* Setup message log sysfs interface. */
                opal_msglog_sysfs_init();
                /* Add all export properties */
                opal_export_attrs();
        }

        /* Initialize platform devices: IPMI backend, PRD & flash interface */
        opal_pdev_init("ibm,opal-ipmi");
        opal_pdev_init("ibm,opal-flash");
        opal_pdev_init("ibm,opal-prd");

        /* Initialise platform device: oppanel interface */
        opal_pdev_init("ibm,opal-oppanel");

        /* Initialise OPAL kmsg dumper for flushing console on panic */
        opal_kmsg_init();

        /* Initialise OPAL powercap interface */
        opal_powercap_init();

        /* Initialise OPAL Power-Shifting-Ratio interface */
        opal_psr_init();

        /* Initialise OPAL sensor groups */
        opal_sensor_groups_init();

        /* Initialise OPAL Power control interface */
        opal_power_control_init();

        /* Initialize OPAL secure variables */
        opal_pdev_init("ibm,secvar-backend");

        return 0;
}
machine_subsys_initcall(powernv, opal_init);
1099
1100void opal_shutdown(void)
1101{
1102        long rc = OPAL_BUSY;
1103
1104        opal_event_shutdown();
1105
1106        /*
1107         * Then sync with OPAL which ensure anything that can
1108         * potentially write to our memory has completed such
1109         * as an ongoing dump retrieval
1110         */
1111        while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
1112                rc = opal_sync_host_reboot();
1113                if (rc == OPAL_BUSY)
1114                        opal_poll_events(NULL);
1115                else
1116                        mdelay(10);
1117        }
1118
1119        /* Unregister memory dump region */
1120        if (opal_check_token(OPAL_UNREGISTER_DUMP_REGION))
1121                opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
1122}
1123
/* Export these (GPL-only) so that test modules can use them */
EXPORT_SYMBOL_GPL(opal_invalid_call);
EXPORT_SYMBOL_GPL(opal_xscom_read);
EXPORT_SYMBOL_GPL(opal_xscom_write);
EXPORT_SYMBOL_GPL(opal_ipmi_send);
EXPORT_SYMBOL_GPL(opal_ipmi_recv);
EXPORT_SYMBOL_GPL(opal_flash_read);
EXPORT_SYMBOL_GPL(opal_flash_write);
EXPORT_SYMBOL_GPL(opal_flash_erase);
EXPORT_SYMBOL_GPL(opal_prd_msg);
EXPORT_SYMBOL_GPL(opal_check_token);
1135
1136/* Convert a region of vmalloc memory to an opal sg list */
1137struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
1138                                             unsigned long vmalloc_size)
1139{
1140        struct opal_sg_list *sg, *first = NULL;
1141        unsigned long i = 0;
1142
1143        sg = kzalloc(PAGE_SIZE, GFP_KERNEL);
1144        if (!sg)
1145                goto nomem;
1146
1147        first = sg;
1148
1149        while (vmalloc_size > 0) {
1150                uint64_t data = vmalloc_to_pfn(vmalloc_addr) << PAGE_SHIFT;
1151                uint64_t length = min(vmalloc_size, PAGE_SIZE);
1152
1153                sg->entry[i].data = cpu_to_be64(data);
1154                sg->entry[i].length = cpu_to_be64(length);
1155                i++;
1156
1157                if (i >= SG_ENTRIES_PER_NODE) {
1158                        struct opal_sg_list *next;
1159
1160                        next = kzalloc(PAGE_SIZE, GFP_KERNEL);
1161                        if (!next)
1162                                goto nomem;
1163
1164                        sg->length = cpu_to_be64(
1165                                        i * sizeof(struct opal_sg_entry) + 16);
1166                        i = 0;
1167                        sg->next = cpu_to_be64(__pa(next));
1168                        sg = next;
1169                }
1170
1171                vmalloc_addr += length;
1172                vmalloc_size -= length;
1173        }
1174
1175        sg->length = cpu_to_be64(i * sizeof(struct opal_sg_entry) + 16);
1176
1177        return first;
1178
1179nomem:
1180        pr_err("%s : Failed to allocate memory\n", __func__);
1181        opal_free_sg_list(first);
1182        return NULL;
1183}
1184
1185void opal_free_sg_list(struct opal_sg_list *sg)
1186{
1187        while (sg) {
1188                uint64_t next = be64_to_cpu(sg->next);
1189
1190                kfree(sg);
1191
1192                if (next)
1193                        sg = __va(next);
1194                else
1195                        sg = NULL;
1196        }
1197}
1198
1199int opal_error_code(int rc)
1200{
1201        switch (rc) {
1202        case OPAL_SUCCESS:              return 0;
1203
1204        case OPAL_PARAMETER:            return -EINVAL;
1205        case OPAL_ASYNC_COMPLETION:     return -EINPROGRESS;
1206        case OPAL_BUSY:
1207        case OPAL_BUSY_EVENT:           return -EBUSY;
1208        case OPAL_NO_MEM:               return -ENOMEM;
1209        case OPAL_PERMISSION:           return -EPERM;
1210
1211        case OPAL_UNSUPPORTED:          return -EIO;
1212        case OPAL_HARDWARE:             return -EIO;
1213        case OPAL_INTERNAL_ERROR:       return -EIO;
1214        case OPAL_TIMEOUT:              return -ETIMEDOUT;
1215        default:
1216                pr_err("%s: unexpected OPAL error %d\n", __func__, rc);
1217                return -EIO;
1218        }
1219}
1220
1221void powernv_set_nmmu_ptcr(unsigned long ptcr)
1222{
1223        int rc;
1224
1225        if (firmware_has_feature(FW_FEATURE_OPAL)) {
1226                rc = opal_nmmu_set_ptcr(-1UL, ptcr);
1227                if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
1228                        pr_warn("%s: Unable to set nest mmu ptcr\n", __func__);
1229        }
1230}
1231
EXPORT_SYMBOL_GPL(opal_poll_events);
EXPORT_SYMBOL_GPL(opal_rtc_read);
EXPORT_SYMBOL_GPL(opal_rtc_write);
EXPORT_SYMBOL_GPL(opal_tpo_read);
EXPORT_SYMBOL_GPL(opal_tpo_write);
EXPORT_SYMBOL_GPL(opal_i2c_request);
/* Export these symbols for PowerNV LED class driver */
EXPORT_SYMBOL_GPL(opal_leds_get_ind);
EXPORT_SYMBOL_GPL(opal_leds_set_ind);
/* Export this symbol for PowerNV Operator Panel class driver */
EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
/* Export these for KVM */
EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
EXPORT_SYMBOL_GPL(opal_int_eoi);
EXPORT_SYMBOL_GPL(opal_error_code);
/*
 * Export the below symbol for NX compression.
 * Note: this one uses plain EXPORT_SYMBOL, unlike the GPL-only exports above.
 */
EXPORT_SYMBOL(opal_nx_coproc_init);
1249