linux/drivers/edac/mce_amd.c
<<
>>
Prefs
   1#include <linux/module.h>
   2#include <linux/slab.h>
   3
   4#include "mce_amd.h"
   5
/*
 * Family-specific decoder callbacks. The decode_mc{0,1,2}_mce() helpers below
 * dereference this as fam_ops->mc0_mce / mc1_mce / mc2_mce.
 */
static struct amd_decoder_ops *fam_ops;

/* Mask passed to XEC() when extracting the extended error code from a status word. */
static u8 xec_mask       = 0xf;

/*
 * While false (the default for a static bool), amd_filter_mce() drops
 * bank 4 events whose extended error code is 0x5.
 */
static bool report_gart_errors;
/*
 * Optional DRAM ECC decoder hook, called as decode_dram_ecc(node_id, m).
 * NULL until amd_register_ecc_decoder() installs one.
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
  12
/*
 * amd_report_gart_errors - turn reporting of filtered GART errors on or off.
 * @v: new value for the report_gart_errors flag consulted by amd_filter_mce().
 */
void amd_report_gart_errors(bool v)
{
        report_gart_errors = v;
}
  17EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  18
/*
 * amd_register_ecc_decoder - install the DRAM ECC decoder callback.
 * @f: function later invoked as f(node_id, m) for DRAM ECC errors.
 *
 * Any previously registered callback is overwritten without a check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
        decode_dram_ecc = f;
}
  23EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  24
  25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  26{
  27        if (decode_dram_ecc) {
  28                WARN_ON(decode_dram_ecc != f);
  29
  30                decode_dram_ecc = NULL;
  31        }
  32}
  33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  34
  35/*
  36 * string representation for the different MCA reported error types, see F3x48
  37 * or MSR0000_0411.
  38 */
  39
  40/* transaction type */
  41static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  42
  43/* cache level */
  44static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  45
  46/* memory transaction type */
  47static const char * const rrrr_msgs[] = {
  48       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
  49};
  50
  51/* participating processor */
  52const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  53EXPORT_SYMBOL_GPL(pp_msgs);
  54
  55/* request timeout */
  56static const char * const to_msgs[] = { "no timeout", "timed out" };
  57
  58/* memory or i/o */
  59static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  60
  61/* internal error type */
  62static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  63
  64static const char * const f15h_mc1_mce_desc[] = {
  65        "UC during a demand linefill from L2",
  66        "Parity error during data load from IC",
  67        "Parity error for IC valid bit",
  68        "Main tag parity error",
  69        "Parity error in prediction queue",
  70        "PFB data/address parity error",
  71        "Parity error in the branch status reg",
  72        "PFB promotion address error",
  73        "Tag error during probe/victimization",
  74        "Parity error for IC probe tag valid bit",
  75        "PFB non-cacheable bit parity error",
  76        "PFB valid bit parity error",                   /* xec = 0xd */
  77        "Microcode Patch Buffer",                       /* xec = 010 */
  78        "uop queue",
  79        "insn buffer",
  80        "predecode buffer",
  81        "fetch address FIFO",
  82        "dispatch uop queue"
  83};
  84
  85static const char * const f15h_mc2_mce_desc[] = {
  86        "Fill ECC error on data fills",                 /* xec = 0x4 */
  87        "Fill parity error on insn fills",
  88        "Prefetcher request FIFO parity error",
  89        "PRQ address parity error",
  90        "PRQ data parity error",
  91        "WCC Tag ECC error",
  92        "WCC Data ECC error",
  93        "WCB Data parity error",
  94        "VB Data ECC or parity error",
  95        "L2 Tag ECC error",                             /* xec = 0x10 */
  96        "Hard L2 Tag ECC error",
  97        "Multiple hits on L2 tag",
  98        "XAB parity error",
  99        "PRB address parity error"
 100};
 101
 102static const char * const mc4_mce_desc[] = {
 103        "DRAM ECC error detected on the NB",
 104        "CRC error detected on HT link",
 105        "Link-defined sync error packets detected on HT link",
 106        "HT Master abort",
 107        "HT Target abort",
 108        "Invalid GART PTE entry during GART table walk",
 109        "Unsupported atomic RMW received from an IO link",
 110        "Watchdog timeout due to lack of progress",
 111        "DRAM ECC error detected on the NB",
 112        "SVM DMA Exclusion Vector error",
 113        "HT data error detected on link",
 114        "Protocol error (link, L3, probe filter)",
 115        "NB internal arrays parity error",
 116        "DRAM addr/ctl signals parity error",
 117        "IO link transmission error",
 118        "L3 data cache ECC error",                      /* xec = 0x1c */
 119        "L3 cache tag error",
 120        "L3 LRU parity bits error",
 121        "ECC Error in the Probe Filter directory"
 122};
 123
 124static const char * const mc5_mce_desc[] = {
 125        "CPU Watchdog timer expire",
 126        "Wakeup array dest tag",
 127        "AG payload array",
 128        "EX payload array",
 129        "IDRF array",
 130        "Retire dispatch queue",
 131        "Mapper checkpoint array",
 132        "Physical register file EX0 port",
 133        "Physical register file EX1 port",
 134        "Physical register file AG0 port",
 135        "Physical register file AG1 port",
 136        "Flag register file",
 137        "DE error occurred",
 138        "Retire status queue"
 139};
 140
 141static const char * const mc6_mce_desc[] = {
 142        "Hardware Assertion",
 143        "Free List",
 144        "Physical Register File",
 145        "Retire Queue",
 146        "Scheduler table",
 147        "Status Register File",
 148};
 149
 150/* Scalable MCA error strings */
 151static const char * const smca_ls_mce_desc[] = {
 152        "Load queue parity",
 153        "Store queue parity",
 154        "Miss address buffer payload parity",
 155        "L1 TLB parity",
 156        "Reserved",
 157        "DC tag error type 6",
 158        "DC tag error type 1",
 159        "Internal error type 1",
 160        "Internal error type 2",
 161        "Sys Read data error thread 0",
 162        "Sys read data error thread 1",
 163        "DC tag error type 2",
 164        "DC data error type 1 (poison comsumption)",
 165        "DC data error type 2",
 166        "DC data error type 3",
 167        "DC tag error type 4",
 168        "L2 TLB parity",
 169        "PDC parity error",
 170        "DC tag error type 3",
 171        "DC tag error type 5",
 172        "L2 fill data error",
 173};
 174
 175static const char * const smca_if_mce_desc[] = {
 176        "microtag probe port parity error",
 177        "IC microtag or full tag multi-hit error",
 178        "IC full tag parity",
 179        "IC data array parity",
 180        "Decoupling queue phys addr parity error",
 181        "L0 ITLB parity error",
 182        "L1 ITLB parity error",
 183        "L2 ITLB parity error",
 184        "BPQ snoop parity on Thread 0",
 185        "BPQ snoop parity on Thread 1",
 186        "L1 BTB multi-match error",
 187        "L2 BTB multi-match error",
 188        "L2 Cache Response Poison error",
 189        "System Read Data error",
 190};
 191
 192static const char * const smca_l2_mce_desc[] = {
 193        "L2M tag multi-way-hit error",
 194        "L2M tag ECC error",
 195        "L2M data ECC error",
 196        "HW assert",
 197};
 198
 199static const char * const smca_de_mce_desc[] = {
 200        "uop cache tag parity error",
 201        "uop cache data parity error",
 202        "Insn buffer parity error",
 203        "uop queue parity error",
 204        "Insn dispatch queue parity error",
 205        "Fetch address FIFO parity",
 206        "Patch RAM data parity",
 207        "Patch RAM sequencer parity",
 208        "uop buffer parity"
 209};
 210
 211static const char * const smca_ex_mce_desc[] = {
 212        "Watchdog timeout error",
 213        "Phy register file parity",
 214        "Flag register file parity",
 215        "Immediate displacement register file parity",
 216        "Address generator payload parity",
 217        "EX payload parity",
 218        "Checkpoint queue parity",
 219        "Retire dispatch queue parity",
 220        "Retire status queue parity error",
 221        "Scheduling queue parity error",
 222        "Branch buffer queue parity error",
 223};
 224
 225static const char * const smca_fp_mce_desc[] = {
 226        "Physical register file parity",
 227        "Freelist parity error",
 228        "Schedule queue parity",
 229        "NSQ parity error",
 230        "Retire queue parity",
 231        "Status register file parity",
 232        "Hardware assertion",
 233};
 234
 235static const char * const smca_l3_mce_desc[] = {
 236        "Shadow tag macro ECC error",
 237        "Shadow tag macro multi-way-hit error",
 238        "L3M tag ECC error",
 239        "L3M tag multi-way-hit error",
 240        "L3M data ECC error",
 241        "XI parity, L3 fill done channel error",
 242        "L3 victim queue parity",
 243        "L3 HW assert",
 244};
 245
 246static const char * const smca_cs_mce_desc[] = {
 247        "Illegal request from transport layer",
 248        "Address violation",
 249        "Security violation",
 250        "Illegal response from transport layer",
 251        "Unexpected response",
 252        "Parity error on incoming request or probe response data",
 253        "Parity error on incoming read response data",
 254        "Atomic request parity",
 255        "ECC error on probe filter access",
 256};
 257
 258static const char * const smca_pie_mce_desc[] = {
 259        "HW assert",
 260        "Internal PIE register security violation",
 261        "Error on GMI link",
 262        "Poison data written to internal PIE register",
 263};
 264
 265static const char * const smca_umc_mce_desc[] = {
 266        "DRAM ECC error",
 267        "Data poison error on DRAM",
 268        "SDP parity error",
 269        "Advanced peripheral bus error",
 270        "Command/address parity error",
 271        "Write data CRC error",
 272};
 273
 274static const char * const smca_pb_mce_desc[] = {
 275        "Parameter Block RAM ECC error",
 276};
 277
 278static const char * const smca_psp_mce_desc[] = {
 279        "PSP RAM ECC or parity error",
 280};
 281
 282static const char * const smca_smu_mce_desc[] = {
 283        "SMU RAM ECC or parity error",
 284};
 285
/*
 * One row of smca_mce_descs[]: a description table plus its length, so that
 * decode_smca_errors() can range-check the extended error code before
 * indexing @descs.
 */
struct smca_mce_desc {
        /* Array of description strings, indexed by extended error code. */
        const char * const *descs;
        /* Number of entries in @descs. */
        unsigned int num_descs;
};
 290
 291static struct smca_mce_desc smca_mce_descs[] = {
 292        [SMCA_LS]       = { smca_ls_mce_desc,   ARRAY_SIZE(smca_ls_mce_desc)    },
 293        [SMCA_IF]       = { smca_if_mce_desc,   ARRAY_SIZE(smca_if_mce_desc)    },
 294        [SMCA_L2_CACHE] = { smca_l2_mce_desc,   ARRAY_SIZE(smca_l2_mce_desc)    },
 295        [SMCA_DE]       = { smca_de_mce_desc,   ARRAY_SIZE(smca_de_mce_desc)    },
 296        [SMCA_EX]       = { smca_ex_mce_desc,   ARRAY_SIZE(smca_ex_mce_desc)    },
 297        [SMCA_FP]       = { smca_fp_mce_desc,   ARRAY_SIZE(smca_fp_mce_desc)    },
 298        [SMCA_L3_CACHE] = { smca_l3_mce_desc,   ARRAY_SIZE(smca_l3_mce_desc)    },
 299        [SMCA_CS]       = { smca_cs_mce_desc,   ARRAY_SIZE(smca_cs_mce_desc)    },
 300        [SMCA_PIE]      = { smca_pie_mce_desc,  ARRAY_SIZE(smca_pie_mce_desc)   },
 301        [SMCA_UMC]      = { smca_umc_mce_desc,  ARRAY_SIZE(smca_umc_mce_desc)   },
 302        [SMCA_PB]       = { smca_pb_mce_desc,   ARRAY_SIZE(smca_pb_mce_desc)    },
 303        [SMCA_PSP]      = { smca_psp_mce_desc,  ARRAY_SIZE(smca_psp_mce_desc)   },
 304        [SMCA_SMU]      = { smca_smu_mce_desc,  ARRAY_SIZE(smca_smu_mce_desc)   },
 305};
 306
 307static bool f12h_mc0_mce(u16 ec, u8 xec)
 308{
 309        bool ret = false;
 310
 311        if (MEM_ERROR(ec)) {
 312                u8 ll = LL(ec);
 313                ret = true;
 314
 315                if (ll == LL_L2)
 316                        pr_cont("during L1 linefill from L2.\n");
 317                else if (ll == LL_L1)
 318                        pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 319                else
 320                        ret = false;
 321        }
 322        return ret;
 323}
 324
 325static bool f10h_mc0_mce(u16 ec, u8 xec)
 326{
 327        if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 328                pr_cont("during data scrub.\n");
 329                return true;
 330        }
 331        return f12h_mc0_mce(ec, xec);
 332}
 333
 334static bool k8_mc0_mce(u16 ec, u8 xec)
 335{
 336        if (BUS_ERROR(ec)) {
 337                pr_cont("during system linefill.\n");
 338                return true;
 339        }
 340
 341        return f10h_mc0_mce(ec, xec);
 342}
 343
 344static bool cat_mc0_mce(u16 ec, u8 xec)
 345{
 346        u8 r4    = R4(ec);
 347        bool ret = true;
 348
 349        if (MEM_ERROR(ec)) {
 350
 351                if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 352                        return false;
 353
 354                switch (r4) {
 355                case R4_DRD:
 356                case R4_DWR:
 357                        pr_cont("Data/Tag parity error due to %s.\n",
 358                                (r4 == R4_DRD ? "load/hw prf" : "store"));
 359                        break;
 360                case R4_EVICT:
 361                        pr_cont("Copyback parity error on a tag miss.\n");
 362                        break;
 363                case R4_SNOOP:
 364                        pr_cont("Tag parity error during snoop.\n");
 365                        break;
 366                default:
 367                        ret = false;
 368                }
 369        } else if (BUS_ERROR(ec)) {
 370
 371                if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 372                        return false;
 373
 374                pr_cont("System read data error on a ");
 375
 376                switch (r4) {
 377                case R4_RD:
 378                        pr_cont("TLB reload.\n");
 379                        break;
 380                case R4_DWR:
 381                        pr_cont("store.\n");
 382                        break;
 383                case R4_DRD:
 384                        pr_cont("load.\n");
 385                        break;
 386                default:
 387                        ret = false;
 388                }
 389        } else {
 390                ret = false;
 391        }
 392
 393        return ret;
 394}
 395
 396static bool f15h_mc0_mce(u16 ec, u8 xec)
 397{
 398        bool ret = true;
 399
 400        if (MEM_ERROR(ec)) {
 401
 402                switch (xec) {
 403                case 0x0:
 404                        pr_cont("Data Array access error.\n");
 405                        break;
 406
 407                case 0x1:
 408                        pr_cont("UC error during a linefill from L2/NB.\n");
 409                        break;
 410
 411                case 0x2:
 412                case 0x11:
 413                        pr_cont("STQ access error.\n");
 414                        break;
 415
 416                case 0x3:
 417                        pr_cont("SCB access error.\n");
 418                        break;
 419
 420                case 0x10:
 421                        pr_cont("Tag error.\n");
 422                        break;
 423
 424                case 0x12:
 425                        pr_cont("LDQ access error.\n");
 426                        break;
 427
 428                default:
 429                        ret = false;
 430                }
 431        } else if (BUS_ERROR(ec)) {
 432
 433                if (!xec)
 434                        pr_cont("System Read Data Error.\n");
 435                else
 436                        pr_cont(" Internal error condition type %d.\n", xec);
 437        } else if (INT_ERROR(ec)) {
 438                if (xec <= 0x1f)
 439                        pr_cont("Hardware Assert.\n");
 440                else
 441                        ret = false;
 442
 443        } else
 444                ret = false;
 445
 446        return ret;
 447}
 448
 449static void decode_mc0_mce(struct mce *m)
 450{
 451        u16 ec = EC(m->status);
 452        u8 xec = XEC(m->status, xec_mask);
 453
 454        pr_emerg(HW_ERR "MC0 Error: ");
 455
 456        /* TLB error signatures are the same across families */
 457        if (TLB_ERROR(ec)) {
 458                if (TT(ec) == TT_DATA) {
 459                        pr_cont("%s TLB %s.\n", LL_MSG(ec),
 460                                ((xec == 2) ? "locked miss"
 461                                            : (xec ? "multimatch" : "parity")));
 462                        return;
 463                }
 464        } else if (fam_ops->mc0_mce(ec, xec))
 465                ;
 466        else
 467                pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 468}
 469
 470static bool k8_mc1_mce(u16 ec, u8 xec)
 471{
 472        u8 ll    = LL(ec);
 473        bool ret = true;
 474
 475        if (!MEM_ERROR(ec))
 476                return false;
 477
 478        if (ll == 0x2)
 479                pr_cont("during a linefill from L2.\n");
 480        else if (ll == 0x1) {
 481                switch (R4(ec)) {
 482                case R4_IRD:
 483                        pr_cont("Parity error during data load.\n");
 484                        break;
 485
 486                case R4_EVICT:
 487                        pr_cont("Copyback Parity/Victim error.\n");
 488                        break;
 489
 490                case R4_SNOOP:
 491                        pr_cont("Tag Snoop error.\n");
 492                        break;
 493
 494                default:
 495                        ret = false;
 496                        break;
 497                }
 498        } else
 499                ret = false;
 500
 501        return ret;
 502}
 503
 504static bool cat_mc1_mce(u16 ec, u8 xec)
 505{
 506        u8 r4    = R4(ec);
 507        bool ret = true;
 508
 509        if (!MEM_ERROR(ec))
 510                return false;
 511
 512        if (TT(ec) != TT_INSTR)
 513                return false;
 514
 515        if (r4 == R4_IRD)
 516                pr_cont("Data/tag array parity error for a tag hit.\n");
 517        else if (r4 == R4_SNOOP)
 518                pr_cont("Tag error during snoop/victimization.\n");
 519        else if (xec == 0x0)
 520                pr_cont("Tag parity error from victim castout.\n");
 521        else if (xec == 0x2)
 522                pr_cont("Microcode patch RAM parity error.\n");
 523        else
 524                ret = false;
 525
 526        return ret;
 527}
 528
 529static bool f15h_mc1_mce(u16 ec, u8 xec)
 530{
 531        bool ret = true;
 532
 533        if (!MEM_ERROR(ec))
 534                return false;
 535
 536        switch (xec) {
 537        case 0x0 ... 0xa:
 538                pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 539                break;
 540
 541        case 0xd:
 542                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 543                break;
 544
 545        case 0x10:
 546                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 547                break;
 548
 549        case 0x11 ... 0x15:
 550                pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 551                break;
 552
 553        default:
 554                ret = false;
 555        }
 556        return ret;
 557}
 558
 559static void decode_mc1_mce(struct mce *m)
 560{
 561        u16 ec = EC(m->status);
 562        u8 xec = XEC(m->status, xec_mask);
 563
 564        pr_emerg(HW_ERR "MC1 Error: ");
 565
 566        if (TLB_ERROR(ec))
 567                pr_cont("%s TLB %s.\n", LL_MSG(ec),
 568                        (xec ? "multimatch" : "parity error"));
 569        else if (BUS_ERROR(ec)) {
 570                bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 571
 572                pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 573        } else if (INT_ERROR(ec)) {
 574                if (xec <= 0x3f)
 575                        pr_cont("Hardware Assert.\n");
 576                else
 577                        goto wrong_mc1_mce;
 578        } else if (fam_ops->mc1_mce(ec, xec))
 579                ;
 580        else
 581                goto wrong_mc1_mce;
 582
 583        return;
 584
 585wrong_mc1_mce:
 586        pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 587}
 588
 589static bool k8_mc2_mce(u16 ec, u8 xec)
 590{
 591        bool ret = true;
 592
 593        if (xec == 0x1)
 594                pr_cont(" in the write data buffers.\n");
 595        else if (xec == 0x3)
 596                pr_cont(" in the victim data buffers.\n");
 597        else if (xec == 0x2 && MEM_ERROR(ec))
 598                pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 599        else if (xec == 0x0) {
 600                if (TLB_ERROR(ec))
 601                        pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 602                                TT_MSG(ec));
 603                else if (BUS_ERROR(ec))
 604                        pr_cont(": %s/ECC error in data read from NB: %s.\n",
 605                                R4_MSG(ec), PP_MSG(ec));
 606                else if (MEM_ERROR(ec)) {
 607                        u8 r4 = R4(ec);
 608
 609                        if (r4 >= 0x7)
 610                                pr_cont(": %s error during data copyback.\n",
 611                                        R4_MSG(ec));
 612                        else if (r4 <= 0x1)
 613                                pr_cont(": %s parity/ECC error during data "
 614                                        "access from L2.\n", R4_MSG(ec));
 615                        else
 616                                ret = false;
 617                } else
 618                        ret = false;
 619        } else
 620                ret = false;
 621
 622        return ret;
 623}
 624
 625static bool f15h_mc2_mce(u16 ec, u8 xec)
 626{
 627        bool ret = true;
 628
 629        if (TLB_ERROR(ec)) {
 630                if (xec == 0x0)
 631                        pr_cont("Data parity TLB read error.\n");
 632                else if (xec == 0x1)
 633                        pr_cont("Poison data provided for TLB fill.\n");
 634                else
 635                        ret = false;
 636        } else if (BUS_ERROR(ec)) {
 637                if (xec > 2)
 638                        ret = false;
 639
 640                pr_cont("Error during attempted NB data read.\n");
 641        } else if (MEM_ERROR(ec)) {
 642                switch (xec) {
 643                case 0x4 ... 0xc:
 644                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 645                        break;
 646
 647                case 0x10 ... 0x14:
 648                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 649                        break;
 650
 651                default:
 652                        ret = false;
 653                }
 654        } else if (INT_ERROR(ec)) {
 655                if (xec <= 0x3f)
 656                        pr_cont("Hardware Assert.\n");
 657                else
 658                        ret = false;
 659        }
 660
 661        return ret;
 662}
 663
 664static bool f16h_mc2_mce(u16 ec, u8 xec)
 665{
 666        u8 r4 = R4(ec);
 667
 668        if (!MEM_ERROR(ec))
 669                return false;
 670
 671        switch (xec) {
 672        case 0x04 ... 0x05:
 673                pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 674                break;
 675
 676        case 0x09 ... 0x0b:
 677        case 0x0d ... 0x0f:
 678                pr_cont("ECC error in L2 tag (%s).\n",
 679                        ((r4 == R4_GEN)   ? "BankReq" :
 680                        ((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 681                break;
 682
 683        case 0x10 ... 0x19:
 684        case 0x1b:
 685                pr_cont("ECC error in L2 data array (%s).\n",
 686                        (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 687                        ((r4 == R4_GEN)   ? "Attr" :
 688                        ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 689                break;
 690
 691        case 0x1c ... 0x1d:
 692        case 0x1f:
 693                pr_cont("Parity error in L2 attribute bits (%s).\n",
 694                        ((r4 == R4_RD)  ? "Hit"  :
 695                        ((r4 == R4_GEN) ? "Attr" : "Fill")));
 696                break;
 697
 698        default:
 699                return false;
 700        }
 701
 702        return true;
 703}
 704
 705static void decode_mc2_mce(struct mce *m)
 706{
 707        u16 ec = EC(m->status);
 708        u8 xec = XEC(m->status, xec_mask);
 709
 710        pr_emerg(HW_ERR "MC2 Error: ");
 711
 712        if (!fam_ops->mc2_mce(ec, xec))
 713                pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 714}
 715
 716static void decode_mc3_mce(struct mce *m)
 717{
 718        u16 ec = EC(m->status);
 719        u8 xec = XEC(m->status, xec_mask);
 720
 721        if (boot_cpu_data.x86 >= 0x14) {
 722                pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 723                         " please report on LKML.\n");
 724                return;
 725        }
 726
 727        pr_emerg(HW_ERR "MC3 Error");
 728
 729        if (xec == 0x0) {
 730                u8 r4 = R4(ec);
 731
 732                if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 733                        goto wrong_mc3_mce;
 734
 735                pr_cont(" during %s.\n", R4_MSG(ec));
 736        } else
 737                goto wrong_mc3_mce;
 738
 739        return;
 740
 741 wrong_mc3_mce:
 742        pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 743}
 744
 745static void decode_mc4_mce(struct mce *m)
 746{
 747        struct cpuinfo_x86 *c = &boot_cpu_data;
 748        int node_id = amd_get_nb_id(m->extcpu);
 749        u16 ec = EC(m->status);
 750        u8 xec = XEC(m->status, 0x1f);
 751        u8 offset = 0;
 752
 753        pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 754
 755        switch (xec) {
 756        case 0x0 ... 0xe:
 757
 758                /* special handling for DRAM ECCs */
 759                if (xec == 0x0 || xec == 0x8) {
 760                        /* no ECCs on F11h */
 761                        if (c->x86 == 0x11)
 762                                goto wrong_mc4_mce;
 763
 764                        pr_cont("%s.\n", mc4_mce_desc[xec]);
 765
 766                        if (decode_dram_ecc)
 767                                decode_dram_ecc(node_id, m);
 768                        return;
 769                }
 770                break;
 771
 772        case 0xf:
 773                if (TLB_ERROR(ec))
 774                        pr_cont("GART Table Walk data error.\n");
 775                else if (BUS_ERROR(ec))
 776                        pr_cont("DMA Exclusion Vector Table Walk error.\n");
 777                else
 778                        goto wrong_mc4_mce;
 779                return;
 780
 781        case 0x19:
 782                if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
 783                        pr_cont("Compute Unit Data Error.\n");
 784                else
 785                        goto wrong_mc4_mce;
 786                return;
 787
 788        case 0x1c ... 0x1f:
 789                offset = 13;
 790                break;
 791
 792        default:
 793                goto wrong_mc4_mce;
 794        }
 795
 796        pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 797        return;
 798
 799 wrong_mc4_mce:
 800        pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 801}
 802
 803static void decode_mc5_mce(struct mce *m)
 804{
 805        struct cpuinfo_x86 *c = &boot_cpu_data;
 806        u16 ec = EC(m->status);
 807        u8 xec = XEC(m->status, xec_mask);
 808
 809        if (c->x86 == 0xf || c->x86 == 0x11)
 810                goto wrong_mc5_mce;
 811
 812        pr_emerg(HW_ERR "MC5 Error: ");
 813
 814        if (INT_ERROR(ec)) {
 815                if (xec <= 0x1f) {
 816                        pr_cont("Hardware Assert.\n");
 817                        return;
 818                } else
 819                        goto wrong_mc5_mce;
 820        }
 821
 822        if (xec == 0x0 || xec == 0xc)
 823                pr_cont("%s.\n", mc5_mce_desc[xec]);
 824        else if (xec <= 0xd)
 825                pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 826        else
 827                goto wrong_mc5_mce;
 828
 829        return;
 830
 831 wrong_mc5_mce:
 832        pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 833}
 834
 835static void decode_mc6_mce(struct mce *m)
 836{
 837        u8 xec = XEC(m->status, xec_mask);
 838
 839        pr_emerg(HW_ERR "MC6 Error: ");
 840
 841        if (xec > 0x5)
 842                goto wrong_mc6_mce;
 843
 844        pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 845        return;
 846
 847 wrong_mc6_mce:
 848        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 849}
 850
 851/* Decode errors according to Scalable MCA specification */
 852static void decode_smca_errors(struct mce *m)
 853{
 854        struct smca_hwid *hwid;
 855        unsigned int bank_type;
 856        const char *ip_name;
 857        u8 xec = XEC(m->status, xec_mask);
 858
 859        if (m->bank >= ARRAY_SIZE(smca_banks))
 860                return;
 861
 862        if (boot_cpu_data.x86 >= 0x17 && m->bank == 4)
 863                pr_emerg(HW_ERR "Bank 4 is reserved on Fam17h.\n");
 864
 865        hwid = smca_banks[m->bank].hwid;
 866        if (!hwid)
 867                return;
 868
 869        bank_type = hwid->bank_type;
 870        ip_name = smca_get_long_name(bank_type);
 871
 872        pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
 873
 874        /* Only print the decode of valid error codes */
 875        if (xec < smca_mce_descs[bank_type].num_descs &&
 876                        (hwid->xec_bitmap & BIT_ULL(xec))) {
 877                pr_emerg(HW_ERR "%s Error: ", ip_name);
 878                pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
 879        }
 880
 881        /*
 882         * amd_get_nb_id() returns the last level cache id.
 883         * The last level cache on Fam17h is 1 level below the node.
 884         */
 885        if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
 886                decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
 887}
 888
 889static inline void amd_decode_err_code(u16 ec)
 890{
 891        if (INT_ERROR(ec)) {
 892                pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
 893                return;
 894        }
 895
 896        pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 897
 898        if (BUS_ERROR(ec))
 899                pr_cont(", mem/io: %s", II_MSG(ec));
 900        else
 901                pr_cont(", tx: %s", TT_MSG(ec));
 902
 903        if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
 904                pr_cont(", mem-tx: %s", R4_MSG(ec));
 905
 906                if (BUS_ERROR(ec))
 907                        pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
 908        }
 909
 910        pr_cont("\n");
 911}
 912
 913/*
 914 * Filter out unwanted MCE signatures here.
 915 */
 916static bool amd_filter_mce(struct mce *m)
 917{
 918        u8 xec = (m->status >> 16) & 0x1f;
 919
 920        /*
 921         * NB GART TLB error reporting is disabled by default.
 922         */
 923        if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
 924                return true;
 925
 926        return false;
 927}
 928
 929static const char *decode_error_status(struct mce *m)
 930{
 931        if (m->status & MCI_STATUS_UC) {
 932                if (m->status & MCI_STATUS_PCC)
 933                        return "System Fatal error.";
 934                if (m->mcgstatus & MCG_STATUS_RIPV)
 935                        return "Uncorrected, software restartable error.";
 936                return "Uncorrected, software containable error.";
 937        }
 938
 939        if (m->status & MCI_STATUS_DEFERRED)
 940                return "Deferred error, no action required.";
 941
 942        return "Corrected error, no action required.";
 943}
 944
 945static int
 946amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 947{
 948        struct mce *m = (struct mce *)data;
 949        struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
 950        int ecc;
 951
 952        if (amd_filter_mce(m))
 953                return NOTIFY_STOP;
 954
 955        pr_emerg(HW_ERR "%s\n", decode_error_status(m));
 956
 957        pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
 958                m->extcpu,
 959                c->x86, c->x86_model, c->x86_mask,
 960                m->bank,
 961                ((m->status & MCI_STATUS_OVER)  ? "Over"  : "-"),
 962                ((m->status & MCI_STATUS_UC)    ? "UE"    :
 963                 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
 964                ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
 965                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
 966                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 967
 968        if (c->x86 >= 0x15) {
 969                pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
 970
 971                /* F15h, bank4, bit 43 is part of McaStatSubCache. */
 972                if (c->x86 != 0x15 || m->bank != 4)
 973                        pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
 974        }
 975
 976        if (boot_cpu_has(X86_FEATURE_SMCA)) {
 977                u32 low, high;
 978                u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
 979
 980                pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
 981
 982                if (!rdmsr_safe(addr, &low, &high) &&
 983                    (low & MCI_CONFIG_MCAX))
 984                        pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 985        }
 986
 987        /* do the two bits[14:13] together */
 988        ecc = (m->status >> 45) & 0x3;
 989        if (ecc)
 990                pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
 991
 992        pr_cont("]: 0x%016llx\n", m->status);
 993
 994        if (m->status & MCI_STATUS_ADDRV)
 995                pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
 996
 997        if (boot_cpu_has(X86_FEATURE_SMCA)) {
 998                pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
 999
1000                if (m->status & MCI_STATUS_SYNDV)
1001                        pr_cont(", Syndrome: 0x%016llx", m->synd);
1002
1003                pr_cont("\n");
1004
1005                decode_smca_errors(m);
1006                goto err_code;
1007        }
1008
1009        if (m->tsc)
1010                pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1011
1012        if (!fam_ops)
1013                goto err_code;
1014
1015        switch (m->bank) {
1016        case 0:
1017                decode_mc0_mce(m);
1018                break;
1019
1020        case 1:
1021                decode_mc1_mce(m);
1022                break;
1023
1024        case 2:
1025                decode_mc2_mce(m);
1026                break;
1027
1028        case 3:
1029                decode_mc3_mce(m);
1030                break;
1031
1032        case 4:
1033                decode_mc4_mce(m);
1034                break;
1035
1036        case 5:
1037                decode_mc5_mce(m);
1038                break;
1039
1040        case 6:
1041                decode_mc6_mce(m);
1042                break;
1043
1044        default:
1045                break;
1046        }
1047
1048 err_code:
1049        amd_decode_err_code(m->status & 0xffff);
1050
1051        return NOTIFY_STOP;
1052}
1053
1054static struct notifier_block amd_mce_dec_nb = {
1055        .notifier_call  = amd_decode_mce,
1056        .priority       = MCE_PRIO_EDAC,
1057};
1058
1059static int __init mce_amd_init(void)
1060{
1061        struct cpuinfo_x86 *c = &boot_cpu_data;
1062
1063        if (c->x86_vendor != X86_VENDOR_AMD)
1064                return -ENODEV;
1065
1066        fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1067        if (!fam_ops)
1068                return -ENOMEM;
1069
1070        switch (c->x86) {
1071        case 0xf:
1072                fam_ops->mc0_mce = k8_mc0_mce;
1073                fam_ops->mc1_mce = k8_mc1_mce;
1074                fam_ops->mc2_mce = k8_mc2_mce;
1075                break;
1076
1077        case 0x10:
1078                fam_ops->mc0_mce = f10h_mc0_mce;
1079                fam_ops->mc1_mce = k8_mc1_mce;
1080                fam_ops->mc2_mce = k8_mc2_mce;
1081                break;
1082
1083        case 0x11:
1084                fam_ops->mc0_mce = k8_mc0_mce;
1085                fam_ops->mc1_mce = k8_mc1_mce;
1086                fam_ops->mc2_mce = k8_mc2_mce;
1087                break;
1088
1089        case 0x12:
1090                fam_ops->mc0_mce = f12h_mc0_mce;
1091                fam_ops->mc1_mce = k8_mc1_mce;
1092                fam_ops->mc2_mce = k8_mc2_mce;
1093                break;
1094
1095        case 0x14:
1096                fam_ops->mc0_mce = cat_mc0_mce;
1097                fam_ops->mc1_mce = cat_mc1_mce;
1098                fam_ops->mc2_mce = k8_mc2_mce;
1099                break;
1100
1101        case 0x15:
1102                xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1103
1104                fam_ops->mc0_mce = f15h_mc0_mce;
1105                fam_ops->mc1_mce = f15h_mc1_mce;
1106                fam_ops->mc2_mce = f15h_mc2_mce;
1107                break;
1108
1109        case 0x16:
1110                xec_mask = 0x1f;
1111                fam_ops->mc0_mce = cat_mc0_mce;
1112                fam_ops->mc1_mce = cat_mc1_mce;
1113                fam_ops->mc2_mce = f16h_mc2_mce;
1114                break;
1115
1116        case 0x17:
1117                xec_mask = 0x3f;
1118                if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1119                        printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1120                        goto err_out;
1121                }
1122                break;
1123
1124        default:
1125                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1126                goto err_out;
1127        }
1128
1129        pr_info("MCE: In-kernel MCE decoding enabled.\n");
1130
1131        mce_register_decode_chain(&amd_mce_dec_nb);
1132
1133        return 0;
1134
1135err_out:
1136        kfree(fam_ops);
1137        fam_ops = NULL;
1138        return -EINVAL;
1139}
1140early_initcall(mce_amd_init);
1141
#ifdef MODULE
/*
 * Module teardown: take the decoder off the MCE decode chain before the
 * per-family ops table it relies on is freed.
 */
static void __exit mce_amd_exit(void)
{
        mce_unregister_decode_chain(&amd_mce_dec_nb);
        kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1154