linux/arch/powerpc/platforms/powernv/opal-hmi.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
   4 *
   5 * Copyright 2014 IBM Corporation
   6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
   7 */
   8
   9#undef DEBUG
  10
  11#include <linux/kernel.h>
  12#include <linux/init.h>
  13#include <linux/of.h>
  14#include <linux/mm.h>
  15#include <linux/slab.h>
  16
  17#include <asm/opal.h>
  18#include <asm/cputable.h>
  19#include <asm/machdep.h>
  20
  21#include "powernv.h"
  22
/* Set to 1 by opal_hmi_handler_init() once the notifier is registered. */
static int opal_hmi_handler_nb_init;
/* Queue node holding a private copy of one HMI event awaiting logging. */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* event copied out of the OPAL message */
};
  28
/* One row of a checkstop decode table. */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask ANDed with the event's reason word */
	const char *unit_failed;	/* unit name printed in the "[Unit: ...]" field */
	const char *description;	/* human-readable text printed after the unit */
};
  34
/* HMI events queued by the notifier callback and drained by the work handler. */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken with interrupts disabled around every access to opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
  37
  38static void print_core_checkstop_reason(const char *level,
  39                                        struct OpalHMIEvent *hmi_evt)
  40{
  41        int i;
  42        static const struct xstop_reason xstop_reason[] = {
  43                { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
  44                                "RegFile core check stop" },
  45                { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
  46                { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
  47                                "Core checkstop during recovery" },
  48                { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
  49                                "RegFile core check stop (mapper error)" },
  50                { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
  51                { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
  52                { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
  53                { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
  54                                "Recovery in maintenance mode" },
  55                { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
  56                                "RegFile core check stop" },
  57                { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
  58                                "Forward Progress Error" },
  59                { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
  60                { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
  61                { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
  62                                "Hypervisor Resource error - core check stop" },
  63                { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
  64                                "Hang Recovery Failed (core check stop)" },
  65                { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
  66                                "Ambiguous Hang Detected (unknown source)" },
  67                { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
  68                                "Debug Trigger Error inject" },
  69                { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
  70                                "Hypervisor check stop via SPRC/SPRD" },
  71        };
  72
  73        /* Validity check */
  74        if (!hmi_evt->u.xstop_error.xstop_reason) {
  75                printk("%s      Unknown Core check stop.\n", level);
  76                return;
  77        }
  78
  79        printk("%s      CPU PIR: %08x\n", level,
  80                        be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
  81        for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
  82                if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
  83                                        xstop_reason[i].xstop_reason)
  84                        printk("%s      [Unit: %-3s] %s\n", level,
  85                                        xstop_reason[i].unit_failed,
  86                                        xstop_reason[i].description);
  87}
  88
  89static void print_nx_checkstop_reason(const char *level,
  90                                        struct OpalHMIEvent *hmi_evt)
  91{
  92        int i;
  93        static const struct xstop_reason xstop_reason[] = {
  94                { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
  95                                        "SHM invalid state error" },
  96                { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
  97                                        "DMA invalid state error bit 15" },
  98                { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
  99                                        "DMA invalid state error bit 16" },
 100                { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
 101                                        "Channel 0 invalid state error" },
 102                { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
 103                                        "Channel 1 invalid state error" },
 104                { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
 105                                        "Channel 2 invalid state error" },
 106                { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
 107                                        "Channel 3 invalid state error" },
 108                { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
 109                                        "Channel 4 invalid state error" },
 110                { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
 111                                        "Channel 5 invalid state error" },
 112                { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
 113                                        "Channel 6 invalid state error" },
 114                { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
 115                                        "Channel 7 invalid state error" },
 116                { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
 117                                        "UE error on CRB(CSB address, CCB)" },
 118                { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
 119                                        "SUE error on CRB(CSB address, CCB)" },
 120                { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
 121                "CRB Kill ISN received while holding ISN with UE error" },
 122        };
 123
 124        /* Validity check */
 125        if (!hmi_evt->u.xstop_error.xstop_reason) {
 126                printk("%s      Unknown NX check stop.\n", level);
 127                return;
 128        }
 129
 130        printk("%s      NX checkstop on CHIP ID: %x\n", level,
 131                        be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
 132        for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
 133                if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
 134                                        xstop_reason[i].xstop_reason)
 135                        printk("%s      [Unit: %-3s] %s\n", level,
 136                                        xstop_reason[i].unit_failed,
 137                                        xstop_reason[i].description);
 138}
 139
 140static void print_npu_checkstop_reason(const char *level,
 141                                        struct OpalHMIEvent *hmi_evt)
 142{
 143        uint8_t reason, reason_count, i;
 144
 145        /*
 146         * We may not have a checkstop reason on some combination of
 147         * hardware and/or skiboot version
 148         */
 149        if (!hmi_evt->u.xstop_error.xstop_reason) {
 150                printk("%s      NPU checkstop on chip %x\n", level,
 151                        be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
 152                return;
 153        }
 154
 155        /*
 156         * NPU2 has 3 FIRs. Reason encoded on a byte as:
 157         *   2 bits for the FIR number
 158         *   6 bits for the bit number
 159         * It may be possible to find several reasons.
 160         *
 161         * We don't display a specific message per FIR bit as there
 162         * are too many and most are meaningless without the workbook
 163         * and/or hw team help anyway.
 164         */
 165        reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
 166                sizeof(reason);
 167        for (i = 0; i < reason_count; i++) {
 168                reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
 169                if (reason)
 170                        printk("%s      NPU checkstop on chip %x: FIR%d bit %d is set\n",
 171                                level,
 172                                be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
 173                                reason >> 6, reason & 0x3F);
 174        }
 175}
 176
 177static void print_checkstop_reason(const char *level,
 178                                        struct OpalHMIEvent *hmi_evt)
 179{
 180        uint8_t type = hmi_evt->u.xstop_error.xstop_type;
 181        switch (type) {
 182        case CHECKSTOP_TYPE_CORE:
 183                print_core_checkstop_reason(level, hmi_evt);
 184                break;
 185        case CHECKSTOP_TYPE_NX:
 186                print_nx_checkstop_reason(level, hmi_evt);
 187                break;
 188        case CHECKSTOP_TYPE_NPU:
 189                print_npu_checkstop_reason(level, hmi_evt);
 190                break;
 191        default:
 192                printk("%s      Unknown Malfunction Alert of type %d\n",
 193                       level, type);
 194                break;
 195        }
 196}
 197
 198static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
 199{
 200        const char *level, *sevstr, *error_info;
 201        static const char *hmi_error_types[] = {
 202                "Malfunction Alert",
 203                "Processor Recovery done",
 204                "Processor recovery occurred again",
 205                "Processor recovery occurred for masked error",
 206                "Timer facility experienced an error",
 207                "TFMR SPR is corrupted",
 208                "UPS (Uninterrupted Power System) Overflow indication",
 209                "An XSCOM operation failure",
 210                "An XSCOM operation completed",
 211                "SCOM has set a reserved FIR bit to cause recovery",
 212                "Debug trigger has set a reserved FIR bit to cause recovery",
 213                "A hypervisor resource error occurred",
 214                "CAPP recovery process is in progress",
 215        };
 216        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
 217                                      DEFAULT_RATELIMIT_BURST);
 218
 219        /* Print things out */
 220        if (hmi_evt->version < OpalHMIEvt_V1) {
 221                pr_err("HMI Interrupt, Unknown event version %d !\n",
 222                        hmi_evt->version);
 223                return;
 224        }
 225        switch (hmi_evt->severity) {
 226        case OpalHMI_SEV_NO_ERROR:
 227                level = KERN_INFO;
 228                sevstr = "Harmless";
 229                break;
 230        case OpalHMI_SEV_WARNING:
 231                level = KERN_WARNING;
 232                sevstr = "";
 233                break;
 234        case OpalHMI_SEV_ERROR_SYNC:
 235                level = KERN_ERR;
 236                sevstr = "Severe";
 237                break;
 238        case OpalHMI_SEV_FATAL:
 239        default:
 240                level = KERN_ERR;
 241                sevstr = "Fatal";
 242                break;
 243        }
 244
 245        if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
 246                printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
 247                        level, sevstr,
 248                        hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
 249                        "Recovered" : "Not recovered");
 250                error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
 251                                hmi_error_types[hmi_evt->type]
 252                                : "Unknown";
 253                printk("%s Error detail: %s\n", level, error_info);
 254                printk("%s      HMER: %016llx\n", level,
 255                                        be64_to_cpu(hmi_evt->hmer));
 256                if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
 257                        (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
 258                        printk("%s      TFMR: %016llx\n", level,
 259                                                be64_to_cpu(hmi_evt->tfmr));
 260        }
 261
 262        if (hmi_evt->version < OpalHMIEvt_V2)
 263                return;
 264
 265        /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
 266        if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
 267                print_checkstop_reason(level, hmi_evt);
 268}
 269
 270static void hmi_event_handler(struct work_struct *work)
 271{
 272        unsigned long flags;
 273        struct OpalHMIEvent *hmi_evt;
 274        struct OpalHmiEvtNode *msg_node;
 275        uint8_t disposition;
 276        struct opal_msg msg;
 277        int unrecoverable = 0;
 278
 279        spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 280        while (!list_empty(&opal_hmi_evt_list)) {
 281                msg_node = list_entry(opal_hmi_evt_list.next,
 282                                           struct OpalHmiEvtNode, list);
 283                list_del(&msg_node->list);
 284                spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 285
 286                hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
 287                print_hmi_event_info(hmi_evt);
 288                disposition = hmi_evt->disposition;
 289                kfree(msg_node);
 290
 291                /*
 292                 * Check if HMI event has been recovered or not. If not
 293                 * then kernel can't continue, we need to panic.
 294                 * But before we do that, display all the HMI event
 295                 * available on the list and set unrecoverable flag to 1.
 296                 */
 297                if (disposition != OpalHMI_DISPOSITION_RECOVERED)
 298                        unrecoverable = 1;
 299
 300                spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 301        }
 302        spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 303
 304        if (unrecoverable) {
 305                /* Pull all HMI events from OPAL before we panic. */
 306                while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
 307                        u32 type;
 308
 309                        type = be32_to_cpu(msg.msg_type);
 310
 311                        /* skip if not HMI event */
 312                        if (type != OPAL_MSG_HMI_EVT)
 313                                continue;
 314
 315                        /* HMI event info starts from param[0] */
 316                        hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
 317                        print_hmi_event_info(hmi_evt);
 318                }
 319
 320                pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
 321        }
 322}
 323
/* Work item that runs hmi_event_handler(); scheduled by opal_handle_hmi_event(). */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
 325/*
 326 * opal_handle_hmi_event - notifier handler that queues up HMI events
 327 * to be preocessed later.
 328 */
 329static int opal_handle_hmi_event(struct notifier_block *nb,
 330                          unsigned long msg_type, void *msg)
 331{
 332        unsigned long flags;
 333        struct OpalHMIEvent *hmi_evt;
 334        struct opal_msg *hmi_msg = msg;
 335        struct OpalHmiEvtNode *msg_node;
 336
 337        /* Sanity Checks */
 338        if (msg_type != OPAL_MSG_HMI_EVT)
 339                return 0;
 340
 341        /* HMI event info starts from param[0] */
 342        hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
 343
 344        /* Delay the logging of HMI events to workqueue. */
 345        msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
 346        if (!msg_node) {
 347                pr_err("HMI: out of memory, Opal message event not handled\n");
 348                return -ENOMEM;
 349        }
 350        memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
 351
 352        spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 353        list_add(&msg_node->list, &opal_hmi_evt_list);
 354        spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 355
 356        schedule_work(&hmi_event_work);
 357        return 0;
 358}
 359
/* Notifier block registered for OPAL_MSG_HMI_EVT by opal_hmi_handler_init(). */
static struct notifier_block opal_hmi_handler_nb = {
	.notifier_call	= opal_handle_hmi_event,
	.next		= NULL,
	.priority	= 0,
};
 365
 366int __init opal_hmi_handler_init(void)
 367{
 368        int ret;
 369
 370        if (!opal_hmi_handler_nb_init) {
 371                ret = opal_message_notifier_register(
 372                                OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
 373                if (ret) {
 374                        pr_err("%s: Can't register OPAL event notifier (%d)\n",
 375                               __func__, ret);
 376                        return ret;
 377                }
 378                opal_hmi_handler_nb_init = 1;
 379        }
 380        return 0;
 381}
 382