linux/drivers/edac/ghes_edac.c
<<
>>
Prefs
   1/*
   2 * GHES/EDAC Linux driver
   3 *
   4 * This file may be distributed under the terms of the GNU General Public
   5 * License version 2.
   6 *
   7 * Copyright (c) 2013 by Mauro Carvalho Chehab
   8 *
   9 * Red Hat Inc. http://www.redhat.com
  10 */
  11
  12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  13
  14#include <acpi/ghes.h>
  15#include <linux/edac.h>
  16#include <linux/dmi.h>
  17#include "edac_core.h"
  18#include <ras/ras_event.h>
  19
  20#define GHES_EDAC_REVISION " Ver: 1.0.0"
  21
  22struct ghes_edac_pvt {
  23        struct list_head list;
  24        struct ghes *ghes;
  25        struct mem_ctl_info *mci;
  26
  27        /* Buffers for the error handling routine */
  28        char detail_location[240];
  29        char other_detail[160];
  30        char msg[80];
  31};
  32
  33static LIST_HEAD(ghes_reglist);
  34static DEFINE_MUTEX(ghes_edac_lock);
  35static int ghes_edac_mc_num;
  36
  37
  38/* Memory Device - Type 17 of SMBIOS spec */
  39struct memdev_dmi_entry {
  40        u8 type;
  41        u8 length;
  42        u16 handle;
  43        u16 phys_mem_array_handle;
  44        u16 mem_err_info_handle;
  45        u16 total_width;
  46        u16 data_width;
  47        u16 size;
  48        u8 form_factor;
  49        u8 device_set;
  50        u8 device_locator;
  51        u8 bank_locator;
  52        u8 memory_type;
  53        u16 type_detail;
  54        u16 speed;
  55        u8 manufacturer;
  56        u8 serial_number;
  57        u8 asset_tag;
  58        u8 part_number;
  59        u8 attributes;
  60        u32 extended_size;
  61        u16 conf_mem_clk_speed;
  62} __attribute__((__packed__));
  63
  64struct ghes_edac_dimm_fill {
  65        struct mem_ctl_info *mci;
  66        unsigned count;
  67};
  68
  69static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
  70{
  71        int *num_dimm = arg;
  72
  73        if (dh->type == DMI_ENTRY_MEM_DEVICE)
  74                (*num_dimm)++;
  75}
  76
  77static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
  78{
  79        struct ghes_edac_dimm_fill *dimm_fill = arg;
  80        struct mem_ctl_info *mci = dimm_fill->mci;
  81
  82        if (dh->type == DMI_ENTRY_MEM_DEVICE) {
  83                struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
  84                struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
  85                                                       mci->n_layers,
  86                                                       dimm_fill->count, 0, 0);
  87
  88                if (entry->size == 0xffff) {
  89                        pr_info("Can't get DIMM%i size\n",
  90                                dimm_fill->count);
  91                        dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
  92                } else if (entry->size == 0x7fff) {
  93                        dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
  94                } else {
  95                        if (entry->size & 1 << 15)
  96                                dimm->nr_pages = MiB_TO_PAGES((entry->size &
  97                                                               0x7fff) << 10);
  98                        else
  99                                dimm->nr_pages = MiB_TO_PAGES(entry->size);
 100                }
 101
 102                switch (entry->memory_type) {
 103                case 0x12:
 104                        if (entry->type_detail & 1 << 13)
 105                                dimm->mtype = MEM_RDDR;
 106                        else
 107                                dimm->mtype = MEM_DDR;
 108                        break;
 109                case 0x13:
 110                        if (entry->type_detail & 1 << 13)
 111                                dimm->mtype = MEM_RDDR2;
 112                        else
 113                                dimm->mtype = MEM_DDR2;
 114                        break;
 115                case 0x14:
 116                        dimm->mtype = MEM_FB_DDR2;
 117                        break;
 118                case 0x18:
 119                        if (entry->type_detail & 1 << 13)
 120                                dimm->mtype = MEM_RDDR3;
 121                        else
 122                                dimm->mtype = MEM_DDR3;
 123                        break;
 124                default:
 125                        if (entry->type_detail & 1 << 6)
 126                                dimm->mtype = MEM_RMBS;
 127                        else if ((entry->type_detail & ((1 << 7) | (1 << 13)))
 128                                 == ((1 << 7) | (1 << 13)))
 129                                dimm->mtype = MEM_RDR;
 130                        else if (entry->type_detail & 1 << 7)
 131                                dimm->mtype = MEM_SDR;
 132                        else if (entry->type_detail & 1 << 9)
 133                                dimm->mtype = MEM_EDO;
 134                        else
 135                                dimm->mtype = MEM_UNKNOWN;
 136                }
 137
 138                /*
 139                 * Actually, we can only detect if the memory has bits for
 140                 * checksum or not
 141                 */
 142                if (entry->total_width == entry->data_width)
 143                        dimm->edac_mode = EDAC_NONE;
 144                else
 145                        dimm->edac_mode = EDAC_SECDED;
 146
 147                dimm->dtype = DEV_UNKNOWN;
 148                dimm->grain = 128;              /* Likely, worse case */
 149
 150                /*
 151                 * FIXME: It shouldn't be hard to also fill the DIMM labels
 152                 */
 153
 154                if (dimm->nr_pages) {
 155                        edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
 156                                dimm_fill->count, edac_mem_types[dimm->mtype],
 157                                PAGES_TO_MiB(dimm->nr_pages),
 158                                (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
 159                        edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
 160                                entry->memory_type, entry->type_detail,
 161                                entry->total_width, entry->data_width);
 162                }
 163
 164                dimm_fill->count++;
 165        }
 166}
 167
 168void ghes_edac_report_mem_error(struct ghes *ghes, int sev,
 169                                struct cper_sec_mem_err *mem_err)
 170{
 171        enum hw_event_mc_err_type type;
 172        struct edac_raw_error_desc *e;
 173        struct mem_ctl_info *mci;
 174        struct ghes_edac_pvt *pvt = NULL;
 175        char *p;
 176        u8 grain_bits;
 177
 178        list_for_each_entry(pvt, &ghes_reglist, list) {
 179                if (ghes == pvt->ghes)
 180                        break;
 181        }
 182        if (!pvt) {
 183                pr_err("Internal error: Can't find EDAC structure\n");
 184                return;
 185        }
 186        mci = pvt->mci;
 187        e = &mci->error_desc;
 188
 189        /* Cleans the error report buffer */
 190        memset(e, 0, sizeof (*e));
 191        e->error_count = 1;
 192        strcpy(e->label, "unknown label");
 193        e->msg = pvt->msg;
 194        e->other_detail = pvt->other_detail;
 195        e->top_layer = -1;
 196        e->mid_layer = -1;
 197        e->low_layer = -1;
 198        *pvt->other_detail = '\0';
 199        *pvt->msg = '\0';
 200
 201        switch (sev) {
 202        case GHES_SEV_CORRECTED:
 203                type = HW_EVENT_ERR_CORRECTED;
 204                break;
 205        case GHES_SEV_RECOVERABLE:
 206                type = HW_EVENT_ERR_UNCORRECTED;
 207                break;
 208        case GHES_SEV_PANIC:
 209                type = HW_EVENT_ERR_FATAL;
 210                break;
 211        default:
 212        case GHES_SEV_NO:
 213                type = HW_EVENT_ERR_INFO;
 214        }
 215
 216        edac_dbg(1, "error validation_bits: 0x%08llx\n",
 217                 (long long)mem_err->validation_bits);
 218
 219        /* Error type, mapped on e->msg */
 220        if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 221                p = pvt->msg;
 222                switch (mem_err->error_type) {
 223                case 0:
 224                        p += sprintf(p, "Unknown");
 225                        break;
 226                case 1:
 227                        p += sprintf(p, "No error");
 228                        break;
 229                case 2:
 230                        p += sprintf(p, "Single-bit ECC");
 231                        break;
 232                case 3:
 233                        p += sprintf(p, "Multi-bit ECC");
 234                        break;
 235                case 4:
 236                        p += sprintf(p, "Single-symbol ChipKill ECC");
 237                        break;
 238                case 5:
 239                        p += sprintf(p, "Multi-symbol ChipKill ECC");
 240                        break;
 241                case 6:
 242                        p += sprintf(p, "Master abort");
 243                        break;
 244                case 7:
 245                        p += sprintf(p, "Target abort");
 246                        break;
 247                case 8:
 248                        p += sprintf(p, "Parity Error");
 249                        break;
 250                case 9:
 251                        p += sprintf(p, "Watchdog timeout");
 252                        break;
 253                case 10:
 254                        p += sprintf(p, "Invalid address");
 255                        break;
 256                case 11:
 257                        p += sprintf(p, "Mirror Broken");
 258                        break;
 259                case 12:
 260                        p += sprintf(p, "Memory Sparing");
 261                        break;
 262                case 13:
 263                        p += sprintf(p, "Scrub corrected error");
 264                        break;
 265                case 14:
 266                        p += sprintf(p, "Scrub uncorrected error");
 267                        break;
 268                case 15:
 269                        p += sprintf(p, "Physical Memory Map-out event");
 270                        break;
 271                default:
 272                        p += sprintf(p, "reserved error (%d)",
 273                                     mem_err->error_type);
 274                }
 275        } else {
 276                strcpy(pvt->msg, "unknown error");
 277        }
 278
 279        /* Error address */
 280        if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
 281                e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
 282                e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
 283        }
 284
 285        /* Error grain */
 286        if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
 287                e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK);
 288
 289        /* Memory error location, mapped on e->location */
 290        p = e->location;
 291        if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
 292                p += sprintf(p, "node:%d ", mem_err->node);
 293        if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
 294                p += sprintf(p, "card:%d ", mem_err->card);
 295        if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
 296                p += sprintf(p, "module:%d ", mem_err->module);
 297        if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
 298                p += sprintf(p, "rank:%d ", mem_err->rank);
 299        if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
 300                p += sprintf(p, "bank:%d ", mem_err->bank);
 301        if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
 302                p += sprintf(p, "row:%d ", mem_err->row);
 303        if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
 304                p += sprintf(p, "col:%d ", mem_err->column);
 305        if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 306                p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 307        if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
 308                const char *bank = NULL, *device = NULL;
 309                dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
 310                if (bank != NULL && device != NULL)
 311                        p += sprintf(p, "DIMM location:%s %s ", bank, device);
 312                else
 313                        p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 314                                     mem_err->mem_dev_handle);
 315        }
 316        if (p > e->location)
 317                *(p - 1) = '\0';
 318
 319        /* All other fields are mapped on e->other_detail */
 320        p = pvt->other_detail;
 321        if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
 322                u64 status = mem_err->error_status;
 323
 324                p += sprintf(p, "status(0x%016llx): ", (long long)status);
 325                switch ((status >> 8) & 0xff) {
 326                case 1:
 327                        p += sprintf(p, "Error detected internal to the component ");
 328                        break;
 329                case 16:
 330                        p += sprintf(p, "Error detected in the bus ");
 331                        break;
 332                case 4:
 333                        p += sprintf(p, "Storage error in DRAM memory ");
 334                        break;
 335                case 5:
 336                        p += sprintf(p, "Storage error in TLB ");
 337                        break;
 338                case 6:
 339                        p += sprintf(p, "Storage error in cache ");
 340                        break;
 341                case 7:
 342                        p += sprintf(p, "Error in one or more functional units ");
 343                        break;
 344                case 8:
 345                        p += sprintf(p, "component failed self test ");
 346                        break;
 347                case 9:
 348                        p += sprintf(p, "Overflow or undervalue of internal queue ");
 349                        break;
 350                case 17:
 351                        p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
 352                        break;
 353                case 18:
 354                        p += sprintf(p, "Improper access error ");
 355                        break;
 356                case 19:
 357                        p += sprintf(p, "Access to a memory address which is not mapped to any component ");
 358                        break;
 359                case 20:
 360                        p += sprintf(p, "Loss of Lockstep ");
 361                        break;
 362                case 21:
 363                        p += sprintf(p, "Response not associated with a request ");
 364                        break;
 365                case 22:
 366                        p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
 367                        break;
 368                case 23:
 369                        p += sprintf(p, "Detection of a PATH_ERROR ");
 370                        break;
 371                case 25:
 372                        p += sprintf(p, "Bus operation timeout ");
 373                        break;
 374                case 26:
 375                        p += sprintf(p, "A read was issued to data that has been poisoned ");
 376                        break;
 377                default:
 378                        p += sprintf(p, "reserved ");
 379                        break;
 380                }
 381        }
 382        if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
 383                p += sprintf(p, "requestorID: 0x%016llx ",
 384                             (long long)mem_err->requestor_id);
 385        if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
 386                p += sprintf(p, "responderID: 0x%016llx ",
 387                             (long long)mem_err->responder_id);
 388        if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
 389                p += sprintf(p, "targetID: 0x%016llx ",
 390                             (long long)mem_err->responder_id);
 391        if (p > pvt->other_detail)
 392                *(p - 1) = '\0';
 393
 394        /* Generate the trace event */
 395        grain_bits = fls_long(e->grain);
 396        snprintf(pvt->detail_location, sizeof(pvt->detail_location),
 397                 "APEI location: %s %s", e->location, e->other_detail);
 398        trace_mc_event(type, e->msg, e->label, e->error_count,
 399                       mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
 400                       (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
 401                       grain_bits, e->syndrome, pvt->detail_location);
 402
 403        /* Report the error via EDAC API */
 404        edac_raw_mc_handle_error(type, mci, e);
 405}
 406EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error);
 407
 408int ghes_edac_register(struct ghes *ghes, struct device *dev)
 409{
 410        bool fake = false;
 411        int rc, num_dimm = 0;
 412        struct mem_ctl_info *mci;
 413        struct edac_mc_layer layers[1];
 414        struct ghes_edac_pvt *pvt;
 415        struct ghes_edac_dimm_fill dimm_fill;
 416
 417        /* Get the number of DIMMs */
 418        dmi_walk(ghes_edac_count_dimms, &num_dimm);
 419
 420        /* Check if we've got a bogus BIOS */
 421        if (num_dimm == 0) {
 422                fake = true;
 423                num_dimm = 1;
 424        }
 425
 426        layers[0].type = EDAC_MC_LAYER_ALL_MEM;
 427        layers[0].size = num_dimm;
 428        layers[0].is_virt_csrow = true;
 429
 430        /*
 431         * We need to serialize edac_mc_alloc() and edac_mc_add_mc(),
 432         * to avoid duplicated memory controller numbers
 433         */
 434        mutex_lock(&ghes_edac_lock);
 435        mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers,
 436                            sizeof(*pvt));
 437        if (!mci) {
 438                pr_info("Can't allocate memory for EDAC data\n");
 439                mutex_unlock(&ghes_edac_lock);
 440                return -ENOMEM;
 441        }
 442
 443        pvt = mci->pvt_info;
 444        memset(pvt, 0, sizeof(*pvt));
 445        list_add_tail(&pvt->list, &ghes_reglist);
 446        pvt->ghes = ghes;
 447        pvt->mci  = mci;
 448        mci->pdev = dev;
 449
 450        mci->mtype_cap = MEM_FLAG_EMPTY;
 451        mci->edac_ctl_cap = EDAC_FLAG_NONE;
 452        mci->edac_cap = EDAC_FLAG_NONE;
 453        mci->mod_name = "ghes_edac.c";
 454        mci->mod_ver = GHES_EDAC_REVISION;
 455        mci->ctl_name = "ghes_edac";
 456        mci->dev_name = "ghes";
 457
 458        if (!ghes_edac_mc_num) {
 459                if (!fake) {
 460                        pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
 461                        pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
 462                        pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
 463                        pr_info("If you find incorrect reports, please contact your hardware vendor\n");
 464                        pr_info("to correct its BIOS.\n");
 465                        pr_info("This system has %d DIMM sockets.\n",
 466                                num_dimm);
 467                } else {
 468                        pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
 469                        pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
 470                        pr_info("work on such system. Use this driver with caution\n");
 471                }
 472        }
 473
 474        if (!fake) {
 475                /*
 476                 * Fill DIMM info from DMI for the memory controller #0
 477                 *
 478                 * Keep it in blank for the other memory controllers, as
 479                 * there's no reliable way to properly credit each DIMM to
 480                 * the memory controller, as different BIOSes fill the
 481                 * DMI bank location fields on different ways
 482                 */
 483                if (!ghes_edac_mc_num) {
 484                        dimm_fill.count = 0;
 485                        dimm_fill.mci = mci;
 486                        dmi_walk(ghes_edac_dmidecode, &dimm_fill);
 487                }
 488        } else {
 489                struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 490                                                       mci->n_layers, 0, 0, 0);
 491
 492                dimm->nr_pages = 1;
 493                dimm->grain = 128;
 494                dimm->mtype = MEM_UNKNOWN;
 495                dimm->dtype = DEV_UNKNOWN;
 496                dimm->edac_mode = EDAC_SECDED;
 497        }
 498
 499        rc = edac_mc_add_mc(mci);
 500        if (rc < 0) {
 501                pr_info("Can't register at EDAC core\n");
 502                edac_mc_free(mci);
 503                mutex_unlock(&ghes_edac_lock);
 504                return -ENODEV;
 505        }
 506
 507        ghes_edac_mc_num++;
 508        mutex_unlock(&ghes_edac_lock);
 509        return 0;
 510}
 511EXPORT_SYMBOL_GPL(ghes_edac_register);
 512
 513void ghes_edac_unregister(struct ghes *ghes)
 514{
 515        struct mem_ctl_info *mci;
 516        struct ghes_edac_pvt *pvt, *tmp;
 517
 518        list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) {
 519                if (ghes == pvt->ghes) {
 520                        mci = pvt->mci;
 521                        edac_mc_del_mc(mci->pdev);
 522                        edac_mc_free(mci);
 523                        list_del(&pvt->list);
 524                }
 525        }
 526}
 527EXPORT_SYMBOL_GPL(ghes_edac_unregister);
 528