linux/arch/powerpc/platforms/powernv/opal-hmi.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
   4 *
   5 * Copyright 2014 IBM Corporation
   6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
   7 */
   8
   9#undef DEBUG
  10
  11#include <linux/kernel.h>
  12#include <linux/init.h>
  13#include <linux/of.h>
  14#include <linux/mm.h>
  15#include <linux/slab.h>
  16
  17#include <asm/opal.h>
  18#include <asm/cputable.h>
  19#include <asm/machdep.h>
  20
  21#include "powernv.h"
  22
  23static int opal_hmi_handler_nb_init;
  24struct OpalHmiEvtNode {
  25        struct list_head list;
  26        struct OpalHMIEvent hmi_evt;
  27};
  28
  29struct xstop_reason {
  30        uint32_t xstop_reason;
  31        const char *unit_failed;
  32        const char *description;
  33};
  34
  35static LIST_HEAD(opal_hmi_evt_list);
  36static DEFINE_SPINLOCK(opal_hmi_evt_lock);
  37
  38static void print_core_checkstop_reason(const char *level,
  39                                        struct OpalHMIEvent *hmi_evt)
  40{
  41        int i;
  42        static const struct xstop_reason xstop_reason[] = {
  43                { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
  44                                "RegFile core check stop" },
  45                { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
  46                { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
  47                                "Core checkstop during recovery" },
  48                { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
  49                                "RegFile core check stop (mapper error)" },
  50                { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
  51                { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
  52                { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
  53                { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
  54                                "Recovery in maintenance mode" },
  55                { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
  56                                "RegFile core check stop" },
  57                { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
  58                                "Forward Progress Error" },
  59                { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
  60                { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
  61                { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
  62                                "Hypervisor Resource error - core check stop" },
  63                { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
  64                                "Hang Recovery Failed (core check stop)" },
  65                { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
  66                                "Ambiguous Hang Detected (unknown source)" },
  67                { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
  68                                "Debug Trigger Error inject" },
  69                { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
  70                                "Hypervisor check stop via SPRC/SPRD" },
  71        };
  72
  73        /* Validity check */
  74        if (!hmi_evt->u.xstop_error.xstop_reason) {
  75                printk("%s      Unknown Core check stop.\n", level);
  76                return;
  77        }
  78
  79        printk("%s      CPU PIR: %08x\n", level,
  80                        be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
  81        for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
  82                if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
  83                                        xstop_reason[i].xstop_reason)
  84                        printk("%s      [Unit: %-3s] %s\n", level,
  85                                        xstop_reason[i].unit_failed,
  86                                        xstop_reason[i].description);
  87}
  88
  89static void print_nx_checkstop_reason(const char *level,
  90                                        struct OpalHMIEvent *hmi_evt)
  91{
  92        int i;
  93        static const struct xstop_reason xstop_reason[] = {
  94                { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
  95                                        "SHM invalid state error" },
  96                { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
  97                                        "DMA invalid state error bit 15" },
  98                { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
  99                                        "DMA invalid state error bit 16" },
 100                { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
 101                                        "Channel 0 invalid state error" },
 102                { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
 103                                        "Channel 1 invalid state error" },
 104                { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
 105                                        "Channel 2 invalid state error" },
 106                { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
 107                                        "Channel 3 invalid state error" },
 108                { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
 109                                        "Channel 4 invalid state error" },
 110                { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
 111                                        "Channel 5 invalid state error" },
 112                { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
 113                                        "Channel 6 invalid state error" },
 114                { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
 115                                        "Channel 7 invalid state error" },
 116                { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
 117                                        "UE error on CRB(CSB address, CCB)" },
 118                { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
 119                                        "SUE error on CRB(CSB address, CCB)" },
 120                { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
 121                "CRB Kill ISN received while holding ISN with UE error" },
 122        };
 123
 124        /* Validity check */
 125        if (!hmi_evt->u.xstop_error.xstop_reason) {
 126                printk("%s      Unknown NX check stop.\n", level);
 127                return;
 128        }
 129
 130        printk("%s      NX checkstop on CHIP ID: %x\n", level,
 131                        be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
 132        for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
 133                if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
 134                                        xstop_reason[i].xstop_reason)
 135                        printk("%s      [Unit: %-3s] %s\n", level,
 136                                        xstop_reason[i].unit_failed,
 137                                        xstop_reason[i].description);
 138}
 139
 140static void print_checkstop_reason(const char *level,
 141                                        struct OpalHMIEvent *hmi_evt)
 142{
 143        uint8_t type = hmi_evt->u.xstop_error.xstop_type;
 144        switch (type) {
 145        case CHECKSTOP_TYPE_CORE:
 146                print_core_checkstop_reason(level, hmi_evt);
 147                break;
 148        case CHECKSTOP_TYPE_NX:
 149                print_nx_checkstop_reason(level, hmi_evt);
 150                break;
 151        default:
 152                printk("%s      Unknown Malfunction Alert of type %d\n",
 153                       level, type);
 154                break;
 155        }
 156}
 157
 158static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
 159{
 160        const char *level, *sevstr, *error_info;
 161        static const char *hmi_error_types[] = {
 162                "Malfunction Alert",
 163                "Processor Recovery done",
 164                "Processor recovery occurred again",
 165                "Processor recovery occurred for masked error",
 166                "Timer facility experienced an error",
 167                "TFMR SPR is corrupted",
 168                "UPS (Uninterrupted Power System) Overflow indication",
 169                "An XSCOM operation failure",
 170                "An XSCOM operation completed",
 171                "SCOM has set a reserved FIR bit to cause recovery",
 172                "Debug trigger has set a reserved FIR bit to cause recovery",
 173                "A hypervisor resource error occurred",
 174                "CAPP recovery process is in progress",
 175        };
 176
 177        /* Print things out */
 178        if (hmi_evt->version < OpalHMIEvt_V1) {
 179                pr_err("HMI Interrupt, Unknown event version %d !\n",
 180                        hmi_evt->version);
 181                return;
 182        }
 183        switch (hmi_evt->severity) {
 184        case OpalHMI_SEV_NO_ERROR:
 185                level = KERN_INFO;
 186                sevstr = "Harmless";
 187                break;
 188        case OpalHMI_SEV_WARNING:
 189                level = KERN_WARNING;
 190                sevstr = "";
 191                break;
 192        case OpalHMI_SEV_ERROR_SYNC:
 193                level = KERN_ERR;
 194                sevstr = "Severe";
 195                break;
 196        case OpalHMI_SEV_FATAL:
 197        default:
 198                level = KERN_ERR;
 199                sevstr = "Fatal";
 200                break;
 201        }
 202
 203        printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
 204                level, sevstr,
 205                hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
 206                "Recovered" : "Not recovered");
 207        error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
 208                        hmi_error_types[hmi_evt->type]
 209                        : "Unknown";
 210        printk("%s Error detail: %s\n", level, error_info);
 211        printk("%s      HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
 212        if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
 213                (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
 214                printk("%s      TFMR: %016llx\n", level,
 215                                                be64_to_cpu(hmi_evt->tfmr));
 216
 217        if (hmi_evt->version < OpalHMIEvt_V2)
 218                return;
 219
 220        /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
 221        if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
 222                print_checkstop_reason(level, hmi_evt);
 223}
 224
 225static void hmi_event_handler(struct work_struct *work)
 226{
 227        unsigned long flags;
 228        struct OpalHMIEvent *hmi_evt;
 229        struct OpalHmiEvtNode *msg_node;
 230        uint8_t disposition;
 231        struct opal_msg msg;
 232        int unrecoverable = 0;
 233
 234        spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 235        while (!list_empty(&opal_hmi_evt_list)) {
 236                msg_node = list_entry(opal_hmi_evt_list.next,
 237                                           struct OpalHmiEvtNode, list);
 238                list_del(&msg_node->list);
 239                spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 240
 241                hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
 242                print_hmi_event_info(hmi_evt);
 243                disposition = hmi_evt->disposition;
 244                kfree(msg_node);
 245
 246                /*
 247                 * Check if HMI event has been recovered or not. If not
 248                 * then kernel can't continue, we need to panic.
 249                 * But before we do that, display all the HMI event
 250                 * available on the list and set unrecoverable flag to 1.
 251                 */
 252                if (disposition != OpalHMI_DISPOSITION_RECOVERED)
 253                        unrecoverable = 1;
 254
 255                spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 256        }
 257        spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 258
 259        if (unrecoverable) {
 260                /* Pull all HMI events from OPAL before we panic. */
 261                while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
 262                        u32 type;
 263
 264                        type = be32_to_cpu(msg.msg_type);
 265
 266                        /* skip if not HMI event */
 267                        if (type != OPAL_MSG_HMI_EVT)
 268                                continue;
 269
 270                        /* HMI event info starts from param[0] */
 271                        hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
 272                        print_hmi_event_info(hmi_evt);
 273                }
 274
 275                pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
 276        }
 277}
 278
 279static DECLARE_WORK(hmi_event_work, hmi_event_handler);
 280/*
 281 * opal_handle_hmi_event - notifier handler that queues up HMI events
 282 * to be preocessed later.
 283 */
 284static int opal_handle_hmi_event(struct notifier_block *nb,
 285                          unsigned long msg_type, void *msg)
 286{
 287        unsigned long flags;
 288        struct OpalHMIEvent *hmi_evt;
 289        struct opal_msg *hmi_msg = msg;
 290        struct OpalHmiEvtNode *msg_node;
 291
 292        /* Sanity Checks */
 293        if (msg_type != OPAL_MSG_HMI_EVT)
 294                return 0;
 295
 296        /* HMI event info starts from param[0] */
 297        hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
 298
 299        /* Delay the logging of HMI events to workqueue. */
 300        msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
 301        if (!msg_node) {
 302                pr_err("HMI: out of memory, Opal message event not handled\n");
 303                return -ENOMEM;
 304        }
 305        memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
 306
 307        spin_lock_irqsave(&opal_hmi_evt_lock, flags);
 308        list_add(&msg_node->list, &opal_hmi_evt_list);
 309        spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
 310
 311        schedule_work(&hmi_event_work);
 312        return 0;
 313}
 314
 315static struct notifier_block opal_hmi_handler_nb = {
 316        .notifier_call  = opal_handle_hmi_event,
 317        .next           = NULL,
 318        .priority       = 0,
 319};
 320
 321int __init opal_hmi_handler_init(void)
 322{
 323        int ret;
 324
 325        if (!opal_hmi_handler_nb_init) {
 326                ret = opal_message_notifier_register(
 327                                OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
 328                if (ret) {
 329                        pr_err("%s: Can't register OPAL event notifier (%d)\n",
 330                               __func__, ret);
 331                        return ret;
 332                }
 333                opal_hmi_handler_nb_init = 1;
 334        }
 335        return 0;
 336}
 337