linux/drivers/edac/ghes_edac.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * GHES/EDAC Linux driver
   4 *
   5 * Copyright (c) 2013 by Mauro Carvalho Chehab
   6 *
   7 * Red Hat Inc. http://www.redhat.com
   8 */
   9
  10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  11
  12#include <acpi/ghes.h>
  13#include <linux/edac.h>
  14#include <linux/dmi.h>
  15#include "edac_module.h"
  16#include <ras/ras_event.h>
  17
  18struct ghes_edac_pvt {
  19        struct list_head list;
  20        struct ghes *ghes;
  21        struct mem_ctl_info *mci;
  22
  23        /* Buffers for the error handling routine */
  24        char detail_location[240];
  25        char other_detail[160];
  26        char msg[80];
  27};
  28
  29static atomic_t ghes_init = ATOMIC_INIT(0);
  30static struct ghes_edac_pvt *ghes_pvt;
  31
  32/*
  33 * Sync with other, potentially concurrent callers of
  34 * ghes_edac_report_mem_error(). We don't know what the
  35 * "inventive" firmware would do.
  36 */
  37static DEFINE_SPINLOCK(ghes_lock);
  38
  39/* "ghes_edac.force_load=1" skips the platform check */
  40static bool __read_mostly force_load;
  41module_param(force_load, bool, 0);
  42
  43/* Memory Device - Type 17 of SMBIOS spec */
  44struct memdev_dmi_entry {
  45        u8 type;
  46        u8 length;
  47        u16 handle;
  48        u16 phys_mem_array_handle;
  49        u16 mem_err_info_handle;
  50        u16 total_width;
  51        u16 data_width;
  52        u16 size;
  53        u8 form_factor;
  54        u8 device_set;
  55        u8 device_locator;
  56        u8 bank_locator;
  57        u8 memory_type;
  58        u16 type_detail;
  59        u16 speed;
  60        u8 manufacturer;
  61        u8 serial_number;
  62        u8 asset_tag;
  63        u8 part_number;
  64        u8 attributes;
  65        u32 extended_size;
  66        u16 conf_mem_clk_speed;
  67} __attribute__((__packed__));
  68
  69struct ghes_edac_dimm_fill {
  70        struct mem_ctl_info *mci;
  71        unsigned count;
  72};
  73
  74static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
  75{
  76        int *num_dimm = arg;
  77
  78        if (dh->type == DMI_ENTRY_MEM_DEVICE)
  79                (*num_dimm)++;
  80}
  81
  82static int get_dimm_smbios_index(u16 handle)
  83{
  84        struct mem_ctl_info *mci = ghes_pvt->mci;
  85        int i;
  86
  87        for (i = 0; i < mci->tot_dimms; i++) {
  88                if (mci->dimms[i]->smbios_handle == handle)
  89                        return i;
  90        }
  91        return -1;
  92}
  93
  94static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
  95{
  96        struct ghes_edac_dimm_fill *dimm_fill = arg;
  97        struct mem_ctl_info *mci = dimm_fill->mci;
  98
  99        if (dh->type == DMI_ENTRY_MEM_DEVICE) {
 100                struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
 101                struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 102                                                       mci->n_layers,
 103                                                       dimm_fill->count, 0, 0);
 104                u16 rdr_mask = BIT(7) | BIT(13);
 105
 106                if (entry->size == 0xffff) {
 107                        pr_info("Can't get DIMM%i size\n",
 108                                dimm_fill->count);
 109                        dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
 110                } else if (entry->size == 0x7fff) {
 111                        dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
 112                } else {
 113                        if (entry->size & BIT(15))
 114                                dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
 115                        else
 116                                dimm->nr_pages = MiB_TO_PAGES(entry->size);
 117                }
 118
 119                switch (entry->memory_type) {
 120                case 0x12:
 121                        if (entry->type_detail & BIT(13))
 122                                dimm->mtype = MEM_RDDR;
 123                        else
 124                                dimm->mtype = MEM_DDR;
 125                        break;
 126                case 0x13:
 127                        if (entry->type_detail & BIT(13))
 128                                dimm->mtype = MEM_RDDR2;
 129                        else
 130                                dimm->mtype = MEM_DDR2;
 131                        break;
 132                case 0x14:
 133                        dimm->mtype = MEM_FB_DDR2;
 134                        break;
 135                case 0x18:
 136                        if (entry->type_detail & BIT(12))
 137                                dimm->mtype = MEM_NVDIMM;
 138                        else if (entry->type_detail & BIT(13))
 139                                dimm->mtype = MEM_RDDR3;
 140                        else
 141                                dimm->mtype = MEM_DDR3;
 142                        break;
 143                case 0x1a:
 144                        if (entry->type_detail & BIT(12))
 145                                dimm->mtype = MEM_NVDIMM;
 146                        else if (entry->type_detail & BIT(13))
 147                                dimm->mtype = MEM_RDDR4;
 148                        else
 149                                dimm->mtype = MEM_DDR4;
 150                        break;
 151                default:
 152                        if (entry->type_detail & BIT(6))
 153                                dimm->mtype = MEM_RMBS;
 154                        else if ((entry->type_detail & rdr_mask) == rdr_mask)
 155                                dimm->mtype = MEM_RDR;
 156                        else if (entry->type_detail & BIT(7))
 157                                dimm->mtype = MEM_SDR;
 158                        else if (entry->type_detail & BIT(9))
 159                                dimm->mtype = MEM_EDO;
 160                        else
 161                                dimm->mtype = MEM_UNKNOWN;
 162                }
 163
 164                /*
 165                 * Actually, we can only detect if the memory has bits for
 166                 * checksum or not
 167                 */
 168                if (entry->total_width == entry->data_width)
 169                        dimm->edac_mode = EDAC_NONE;
 170                else
 171                        dimm->edac_mode = EDAC_SECDED;
 172
 173                dimm->dtype = DEV_UNKNOWN;
 174                dimm->grain = 128;              /* Likely, worse case */
 175
 176                /*
 177                 * FIXME: It shouldn't be hard to also fill the DIMM labels
 178                 */
 179
 180                if (dimm->nr_pages) {
 181                        edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
 182                                dimm_fill->count, edac_mem_types[dimm->mtype],
 183                                PAGES_TO_MiB(dimm->nr_pages),
 184                                (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
 185                        edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
 186                                entry->memory_type, entry->type_detail,
 187                                entry->total_width, entry->data_width);
 188                }
 189
 190                dimm->smbios_handle = entry->handle;
 191
 192                dimm_fill->count++;
 193        }
 194}
 195
 196void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 197{
 198        enum hw_event_mc_err_type type;
 199        struct edac_raw_error_desc *e;
 200        struct mem_ctl_info *mci;
 201        struct ghes_edac_pvt *pvt = ghes_pvt;
 202        unsigned long flags;
 203        char *p;
 204        u8 grain_bits;
 205
 206        if (!pvt)
 207                return;
 208
 209        /*
 210         * We can do the locking below because GHES defers error processing
 211         * from NMI to IRQ context. Whenever that changes, we'd at least
 212         * know.
 213         */
 214        if (WARN_ON_ONCE(in_nmi()))
 215                return;
 216
 217        spin_lock_irqsave(&ghes_lock, flags);
 218
 219        mci = pvt->mci;
 220        e = &mci->error_desc;
 221
 222        /* Cleans the error report buffer */
 223        memset(e, 0, sizeof (*e));
 224        e->error_count = 1;
 225        strcpy(e->label, "unknown label");
 226        e->msg = pvt->msg;
 227        e->other_detail = pvt->other_detail;
 228        e->top_layer = -1;
 229        e->mid_layer = -1;
 230        e->low_layer = -1;
 231        *pvt->other_detail = '\0';
 232        *pvt->msg = '\0';
 233
 234        switch (sev) {
 235        case GHES_SEV_CORRECTED:
 236                type = HW_EVENT_ERR_CORRECTED;
 237                break;
 238        case GHES_SEV_RECOVERABLE:
 239                type = HW_EVENT_ERR_UNCORRECTED;
 240                break;
 241        case GHES_SEV_PANIC:
 242                type = HW_EVENT_ERR_FATAL;
 243                break;
 244        default:
 245        case GHES_SEV_NO:
 246                type = HW_EVENT_ERR_INFO;
 247        }
 248
 249        edac_dbg(1, "error validation_bits: 0x%08llx\n",
 250                 (long long)mem_err->validation_bits);
 251
 252        /* Error type, mapped on e->msg */
 253        if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 254                p = pvt->msg;
 255                switch (mem_err->error_type) {
 256                case 0:
 257                        p += sprintf(p, "Unknown");
 258                        break;
 259                case 1:
 260                        p += sprintf(p, "No error");
 261                        break;
 262                case 2:
 263                        p += sprintf(p, "Single-bit ECC");
 264                        break;
 265                case 3:
 266                        p += sprintf(p, "Multi-bit ECC");
 267                        break;
 268                case 4:
 269                        p += sprintf(p, "Single-symbol ChipKill ECC");
 270                        break;
 271                case 5:
 272                        p += sprintf(p, "Multi-symbol ChipKill ECC");
 273                        break;
 274                case 6:
 275                        p += sprintf(p, "Master abort");
 276                        break;
 277                case 7:
 278                        p += sprintf(p, "Target abort");
 279                        break;
 280                case 8:
 281                        p += sprintf(p, "Parity Error");
 282                        break;
 283                case 9:
 284                        p += sprintf(p, "Watchdog timeout");
 285                        break;
 286                case 10:
 287                        p += sprintf(p, "Invalid address");
 288                        break;
 289                case 11:
 290                        p += sprintf(p, "Mirror Broken");
 291                        break;
 292                case 12:
 293                        p += sprintf(p, "Memory Sparing");
 294                        break;
 295                case 13:
 296                        p += sprintf(p, "Scrub corrected error");
 297                        break;
 298                case 14:
 299                        p += sprintf(p, "Scrub uncorrected error");
 300                        break;
 301                case 15:
 302                        p += sprintf(p, "Physical Memory Map-out event");
 303                        break;
 304                default:
 305                        p += sprintf(p, "reserved error (%d)",
 306                                     mem_err->error_type);
 307                }
 308        } else {
 309                strcpy(pvt->msg, "unknown error");
 310        }
 311
 312        /* Error address */
 313        if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
 314                e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
 315                e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
 316        }
 317
 318        /* Error grain */
 319        if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
 320                e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
 321
 322        /* Memory error location, mapped on e->location */
 323        p = e->location;
 324        if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
 325                p += sprintf(p, "node:%d ", mem_err->node);
 326        if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
 327                p += sprintf(p, "card:%d ", mem_err->card);
 328        if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
 329                p += sprintf(p, "module:%d ", mem_err->module);
 330        if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
 331                p += sprintf(p, "rank:%d ", mem_err->rank);
 332        if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
 333                p += sprintf(p, "bank:%d ", mem_err->bank);
 334        if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
 335                p += sprintf(p, "row:%d ", mem_err->row);
 336        if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
 337                p += sprintf(p, "col:%d ", mem_err->column);
 338        if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 339                p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 340        if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
 341                const char *bank = NULL, *device = NULL;
 342                int index = -1;
 343
 344                dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
 345                if (bank != NULL && device != NULL)
 346                        p += sprintf(p, "DIMM location:%s %s ", bank, device);
 347                else
 348                        p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 349                                     mem_err->mem_dev_handle);
 350
 351                index = get_dimm_smbios_index(mem_err->mem_dev_handle);
 352                if (index >= 0) {
 353                        e->top_layer = index;
 354                        e->enable_per_layer_report = true;
 355                }
 356
 357        }
 358        if (p > e->location)
 359                *(p - 1) = '\0';
 360
 361        /* All other fields are mapped on e->other_detail */
 362        p = pvt->other_detail;
 363        if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
 364                u64 status = mem_err->error_status;
 365
 366                p += sprintf(p, "status(0x%016llx): ", (long long)status);
 367                switch ((status >> 8) & 0xff) {
 368                case 1:
 369                        p += sprintf(p, "Error detected internal to the component ");
 370                        break;
 371                case 16:
 372                        p += sprintf(p, "Error detected in the bus ");
 373                        break;
 374                case 4:
 375                        p += sprintf(p, "Storage error in DRAM memory ");
 376                        break;
 377                case 5:
 378                        p += sprintf(p, "Storage error in TLB ");
 379                        break;
 380                case 6:
 381                        p += sprintf(p, "Storage error in cache ");
 382                        break;
 383                case 7:
 384                        p += sprintf(p, "Error in one or more functional units ");
 385                        break;
 386                case 8:
 387                        p += sprintf(p, "component failed self test ");
 388                        break;
 389                case 9:
 390                        p += sprintf(p, "Overflow or undervalue of internal queue ");
 391                        break;
 392                case 17:
 393                        p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
 394                        break;
 395                case 18:
 396                        p += sprintf(p, "Improper access error ");
 397                        break;
 398                case 19:
 399                        p += sprintf(p, "Access to a memory address which is not mapped to any component ");
 400                        break;
 401                case 20:
 402                        p += sprintf(p, "Loss of Lockstep ");
 403                        break;
 404                case 21:
 405                        p += sprintf(p, "Response not associated with a request ");
 406                        break;
 407                case 22:
 408                        p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
 409                        break;
 410                case 23:
 411                        p += sprintf(p, "Detection of a PATH_ERROR ");
 412                        break;
 413                case 25:
 414                        p += sprintf(p, "Bus operation timeout ");
 415                        break;
 416                case 26:
 417                        p += sprintf(p, "A read was issued to data that has been poisoned ");
 418                        break;
 419                default:
 420                        p += sprintf(p, "reserved ");
 421                        break;
 422                }
 423        }
 424        if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
 425                p += sprintf(p, "requestorID: 0x%016llx ",
 426                             (long long)mem_err->requestor_id);
 427        if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
 428                p += sprintf(p, "responderID: 0x%016llx ",
 429                             (long long)mem_err->responder_id);
 430        if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
 431                p += sprintf(p, "targetID: 0x%016llx ",
 432                             (long long)mem_err->responder_id);
 433        if (p > pvt->other_detail)
 434                *(p - 1) = '\0';
 435
 436        /* Generate the trace event */
 437        grain_bits = fls_long(e->grain);
 438        snprintf(pvt->detail_location, sizeof(pvt->detail_location),
 439                 "APEI location: %s %s", e->location, e->other_detail);
 440        trace_mc_event(type, e->msg, e->label, e->error_count,
 441                       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
 442                       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
 443                       grain_bits, e->syndrome, pvt->detail_location);
 444
 445        edac_raw_mc_handle_error(type, mci, e);
 446        spin_unlock_irqrestore(&ghes_lock, flags);
 447}
 448
 449/*
 450 * Known systems that are safe to enable this module.
 451 */
 452static struct acpi_platform_list plat_list[] = {
 453        {"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions},
 454        { } /* End */
 455};
 456
 457int ghes_edac_register(struct ghes *ghes, struct device *dev)
 458{
 459        bool fake = false;
 460        int rc, num_dimm = 0;
 461        struct mem_ctl_info *mci;
 462        struct edac_mc_layer layers[1];
 463        struct ghes_edac_dimm_fill dimm_fill;
 464        int idx = -1;
 465
 466        if (IS_ENABLED(CONFIG_X86)) {
 467                /* Check if safe to enable on this system */
 468                idx = acpi_match_platform_list(plat_list);
 469                if (!force_load && idx < 0)
 470                        return -ENODEV;
 471        } else {
 472                idx = 0;
 473        }
 474
 475        /*
 476         * We have only one logical memory controller to which all DIMMs belong.
 477         */
 478        if (atomic_inc_return(&ghes_init) > 1)
 479                return 0;
 480
 481        /* Get the number of DIMMs */
 482        dmi_walk(ghes_edac_count_dimms, &num_dimm);
 483
 484        /* Check if we've got a bogus BIOS */
 485        if (num_dimm == 0) {
 486                fake = true;
 487                num_dimm = 1;
 488        }
 489
 490        layers[0].type = EDAC_MC_LAYER_ALL_MEM;
 491        layers[0].size = num_dimm;
 492        layers[0].is_virt_csrow = true;
 493
 494        mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt));
 495        if (!mci) {
 496                pr_info("Can't allocate memory for EDAC data\n");
 497                return -ENOMEM;
 498        }
 499
 500        ghes_pvt        = mci->pvt_info;
 501        ghes_pvt->ghes  = ghes;
 502        ghes_pvt->mci   = mci;
 503
 504        mci->pdev = dev;
 505        mci->mtype_cap = MEM_FLAG_EMPTY;
 506        mci->edac_ctl_cap = EDAC_FLAG_NONE;
 507        mci->edac_cap = EDAC_FLAG_NONE;
 508        mci->mod_name = "ghes_edac.c";
 509        mci->ctl_name = "ghes_edac";
 510        mci->dev_name = "ghes";
 511
 512        if (fake) {
 513                pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
 514                pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
 515                pr_info("work on such system. Use this driver with caution\n");
 516        } else if (idx < 0) {
 517                pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
 518                pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
 519                pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
 520                pr_info("If you find incorrect reports, please contact your hardware vendor\n");
 521                pr_info("to correct its BIOS.\n");
 522                pr_info("This system has %d DIMM sockets.\n", num_dimm);
 523        }
 524
 525        if (!fake) {
 526                dimm_fill.count = 0;
 527                dimm_fill.mci = mci;
 528                dmi_walk(ghes_edac_dmidecode, &dimm_fill);
 529        } else {
 530                struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 531                                                       mci->n_layers, 0, 0, 0);
 532
 533                dimm->nr_pages = 1;
 534                dimm->grain = 128;
 535                dimm->mtype = MEM_UNKNOWN;
 536                dimm->dtype = DEV_UNKNOWN;
 537                dimm->edac_mode = EDAC_SECDED;
 538        }
 539
 540        rc = edac_mc_add_mc(mci);
 541        if (rc < 0) {
 542                pr_info("Can't register at EDAC core\n");
 543                edac_mc_free(mci);
 544                return -ENODEV;
 545        }
 546        return 0;
 547}
 548
 549void ghes_edac_unregister(struct ghes *ghes)
 550{
 551        struct mem_ctl_info *mci;
 552
 553        if (!ghes_pvt)
 554                return;
 555
 556        mci = ghes_pvt->mci;
 557        edac_mc_del_mc(mci->pdev);
 558        edac_mc_free(mci);
 559}
 560