linux/drivers/edac/mce_amd.c
<<
>>
Prefs
   1#include <linux/module.h>
   2#include <linux/slab.h>
   3
   4#include "mce_amd.h"
   5
/*
 * Family-specific decoder callbacks used by the decode_mcN_mce() helpers.
 * Not assigned anywhere in the code visible here.
 */
static struct amd_decoder_ops *fam_ops;

/* Mask handed to XEC() when extracting the extended error code. */
static u8 xec_mask       = 0xf;

/* When false, amd_filter_mce() drops bank 4 records with xec 0x5. */
static bool report_gart_errors;

/* Optional DRAM ECC decoder hook; NULL while no decoder is registered. */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
  12
/*
 * Set whether the bank 4 / xec 0x5 (GART) records that amd_filter_mce()
 * checks for are reported (@v true) or filtered out (@v false).
 */
void amd_report_gart_errors(bool v)
{
	report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  18
/*
 * Install @f as the DRAM ECC decoder hook.  Any previously installed hook is
 * overwritten without notice.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  24
  25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  26{
  27        if (decode_dram_ecc) {
  28                WARN_ON(decode_dram_ecc != f);
  29
  30                decode_dram_ecc = NULL;
  31        }
  32}
  33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  34
  35/*
  36 * string representation for the different MCA reported error types, see F3x48
  37 * or MSR0000_0411.
  38 */
  39
  40/* transaction type */
  41static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  42
  43/* cache level */
  44static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  45
  46/* memory transaction type */
  47static const char * const rrrr_msgs[] = {
  48       "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
  49};
  50
  51/* participating processor */
  52const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  53EXPORT_SYMBOL_GPL(pp_msgs);
  54
  55/* request timeout */
  56static const char * const to_msgs[] = { "no timeout", "timed out" };
  57
  58/* memory or i/o */
  59static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  60
  61/* internal error type */
  62static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  63
  64static const char * const f15h_mc1_mce_desc[] = {
  65        "UC during a demand linefill from L2",
  66        "Parity error during data load from IC",
  67        "Parity error for IC valid bit",
  68        "Main tag parity error",
  69        "Parity error in prediction queue",
  70        "PFB data/address parity error",
  71        "Parity error in the branch status reg",
  72        "PFB promotion address error",
  73        "Tag error during probe/victimization",
  74        "Parity error for IC probe tag valid bit",
  75        "PFB non-cacheable bit parity error",
  76        "PFB valid bit parity error",                   /* xec = 0xd */
  77        "Microcode Patch Buffer",                       /* xec = 010 */
  78        "uop queue",
  79        "insn buffer",
  80        "predecode buffer",
  81        "fetch address FIFO",
  82        "dispatch uop queue"
  83};
  84
  85static const char * const f15h_mc2_mce_desc[] = {
  86        "Fill ECC error on data fills",                 /* xec = 0x4 */
  87        "Fill parity error on insn fills",
  88        "Prefetcher request FIFO parity error",
  89        "PRQ address parity error",
  90        "PRQ data parity error",
  91        "WCC Tag ECC error",
  92        "WCC Data ECC error",
  93        "WCB Data parity error",
  94        "VB Data ECC or parity error",
  95        "L2 Tag ECC error",                             /* xec = 0x10 */
  96        "Hard L2 Tag ECC error",
  97        "Multiple hits on L2 tag",
  98        "XAB parity error",
  99        "PRB address parity error"
 100};
 101
 102static const char * const mc4_mce_desc[] = {
 103        "DRAM ECC error detected on the NB",
 104        "CRC error detected on HT link",
 105        "Link-defined sync error packets detected on HT link",
 106        "HT Master abort",
 107        "HT Target abort",
 108        "Invalid GART PTE entry during GART table walk",
 109        "Unsupported atomic RMW received from an IO link",
 110        "Watchdog timeout due to lack of progress",
 111        "DRAM ECC error detected on the NB",
 112        "SVM DMA Exclusion Vector error",
 113        "HT data error detected on link",
 114        "Protocol error (link, L3, probe filter)",
 115        "NB internal arrays parity error",
 116        "DRAM addr/ctl signals parity error",
 117        "IO link transmission error",
 118        "L3 data cache ECC error",                      /* xec = 0x1c */
 119        "L3 cache tag error",
 120        "L3 LRU parity bits error",
 121        "ECC Error in the Probe Filter directory"
 122};
 123
 124static const char * const mc5_mce_desc[] = {
 125        "CPU Watchdog timer expire",
 126        "Wakeup array dest tag",
 127        "AG payload array",
 128        "EX payload array",
 129        "IDRF array",
 130        "Retire dispatch queue",
 131        "Mapper checkpoint array",
 132        "Physical register file EX0 port",
 133        "Physical register file EX1 port",
 134        "Physical register file AG0 port",
 135        "Physical register file AG1 port",
 136        "Flag register file",
 137        "DE error occurred",
 138        "Retire status queue"
 139};
 140
 141/* Scalable MCA error strings */
 142static const char * const smca_ls_mce_desc[] = {
 143        "Load queue parity",
 144        "Store queue parity",
 145        "Miss address buffer payload parity",
 146        "L1 TLB parity",
 147        "Reserved",
 148        "DC tag error type 6",
 149        "DC tag error type 1",
 150        "Internal error type 1",
 151        "Internal error type 2",
 152        "Sys Read data error thread 0",
 153        "Sys read data error thread 1",
 154        "DC tag error type 2",
 155        "DC data error type 1 (poison comsumption)",
 156        "DC data error type 2",
 157        "DC data error type 3",
 158        "DC tag error type 4",
 159        "L2 TLB parity",
 160        "PDC parity error",
 161        "DC tag error type 3",
 162        "DC tag error type 5",
 163        "L2 fill data error",
 164};
 165
 166static const char * const smca_if_mce_desc[] = {
 167        "microtag probe port parity error",
 168        "IC microtag or full tag multi-hit error",
 169        "IC full tag parity",
 170        "IC data array parity",
 171        "Decoupling queue phys addr parity error",
 172        "L0 ITLB parity error",
 173        "L1 ITLB parity error",
 174        "L2 ITLB parity error",
 175        "BPQ snoop parity on Thread 0",
 176        "BPQ snoop parity on Thread 1",
 177        "L1 BTB multi-match error",
 178        "L2 BTB multi-match error",
 179        "L2 Cache Response Poison error",
 180        "System Read Data error",
 181};
 182
 183static const char * const smca_l2_mce_desc[] = {
 184        "L2M tag multi-way-hit error",
 185        "L2M tag ECC error",
 186        "L2M data ECC error",
 187        "HW assert",
 188};
 189
 190static const char * const smca_de_mce_desc[] = {
 191        "uop cache tag parity error",
 192        "uop cache data parity error",
 193        "Insn buffer parity error",
 194        "uop queue parity error",
 195        "Insn dispatch queue parity error",
 196        "Fetch address FIFO parity",
 197        "Patch RAM data parity",
 198        "Patch RAM sequencer parity",
 199        "uop buffer parity"
 200};
 201
 202static const char * const smca_ex_mce_desc[] = {
 203        "Watchdog timeout error",
 204        "Phy register file parity",
 205        "Flag register file parity",
 206        "Immediate displacement register file parity",
 207        "Address generator payload parity",
 208        "EX payload parity",
 209        "Checkpoint queue parity",
 210        "Retire dispatch queue parity",
 211        "Retire status queue parity error",
 212        "Scheduling queue parity error",
 213        "Branch buffer queue parity error",
 214};
 215
 216static const char * const smca_fp_mce_desc[] = {
 217        "Physical register file parity",
 218        "Freelist parity error",
 219        "Schedule queue parity",
 220        "NSQ parity error",
 221        "Retire queue parity",
 222        "Status register file parity",
 223        "Hardware assertion",
 224};
 225
 226static const char * const smca_l3_mce_desc[] = {
 227        "Shadow tag macro ECC error",
 228        "Shadow tag macro multi-way-hit error",
 229        "L3M tag ECC error",
 230        "L3M tag multi-way-hit error",
 231        "L3M data ECC error",
 232        "XI parity, L3 fill done channel error",
 233        "L3 victim queue parity",
 234        "L3 HW assert",
 235};
 236
 237static const char * const smca_cs_mce_desc[] = {
 238        "Illegal request from transport layer",
 239        "Address violation",
 240        "Security violation",
 241        "Illegal response from transport layer",
 242        "Unexpected response",
 243        "Parity error on incoming request or probe response data",
 244        "Parity error on incoming read response data",
 245        "Atomic request parity",
 246        "ECC error on probe filter access",
 247};
 248
 249static const char * const smca_pie_mce_desc[] = {
 250        "HW assert",
 251        "Internal PIE register security violation",
 252        "Error on GMI link",
 253        "Poison data written to internal PIE register",
 254};
 255
 256static const char * const smca_umc_mce_desc[] = {
 257        "DRAM ECC error",
 258        "Data poison error on DRAM",
 259        "SDP parity error",
 260        "Advanced peripheral bus error",
 261        "Command/address parity error",
 262        "Write data CRC error",
 263};
 264
 265static const char * const smca_pb_mce_desc[] = {
 266        "Parameter Block RAM ECC error",
 267};
 268
 269static const char * const smca_psp_mce_desc[] = {
 270        "PSP RAM ECC or parity error",
 271};
 272
 273static const char * const smca_smu_mce_desc[] = {
 274        "SMU RAM ECC or parity error",
 275};
 276
/* One SMCA bank type's description table together with its length. */
struct smca_mce_desc {
	/* Description strings; decode_smca_errors() indexes them by xec. */
	const char * const *descs;
	/* Number of entries in @descs. */
	unsigned int num_descs;
};
 281
 282static struct smca_mce_desc smca_mce_descs[] = {
 283        [SMCA_LS]       = { smca_ls_mce_desc,   ARRAY_SIZE(smca_ls_mce_desc)    },
 284        [SMCA_IF]       = { smca_if_mce_desc,   ARRAY_SIZE(smca_if_mce_desc)    },
 285        [SMCA_L2_CACHE] = { smca_l2_mce_desc,   ARRAY_SIZE(smca_l2_mce_desc)    },
 286        [SMCA_DE]       = { smca_de_mce_desc,   ARRAY_SIZE(smca_de_mce_desc)    },
 287        [SMCA_EX]       = { smca_ex_mce_desc,   ARRAY_SIZE(smca_ex_mce_desc)    },
 288        [SMCA_FP]       = { smca_fp_mce_desc,   ARRAY_SIZE(smca_fp_mce_desc)    },
 289        [SMCA_L3_CACHE] = { smca_l3_mce_desc,   ARRAY_SIZE(smca_l3_mce_desc)    },
 290        [SMCA_CS]       = { smca_cs_mce_desc,   ARRAY_SIZE(smca_cs_mce_desc)    },
 291        [SMCA_PIE]      = { smca_pie_mce_desc,  ARRAY_SIZE(smca_pie_mce_desc)   },
 292        [SMCA_UMC]      = { smca_umc_mce_desc,  ARRAY_SIZE(smca_umc_mce_desc)   },
 293        [SMCA_PB]       = { smca_pb_mce_desc,   ARRAY_SIZE(smca_pb_mce_desc)    },
 294        [SMCA_PSP]      = { smca_psp_mce_desc,  ARRAY_SIZE(smca_psp_mce_desc)   },
 295        [SMCA_SMU]      = { smca_smu_mce_desc,  ARRAY_SIZE(smca_smu_mce_desc)   },
 296};
 297
 298static bool f12h_mc0_mce(u16 ec, u8 xec)
 299{
 300        bool ret = false;
 301
 302        if (MEM_ERROR(ec)) {
 303                u8 ll = LL(ec);
 304                ret = true;
 305
 306                if (ll == LL_L2)
 307                        pr_cont("during L1 linefill from L2.\n");
 308                else if (ll == LL_L1)
 309                        pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 310                else
 311                        ret = false;
 312        }
 313        return ret;
 314}
 315
 316static bool f10h_mc0_mce(u16 ec, u8 xec)
 317{
 318        if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 319                pr_cont("during data scrub.\n");
 320                return true;
 321        }
 322        return f12h_mc0_mce(ec, xec);
 323}
 324
 325static bool k8_mc0_mce(u16 ec, u8 xec)
 326{
 327        if (BUS_ERROR(ec)) {
 328                pr_cont("during system linefill.\n");
 329                return true;
 330        }
 331
 332        return f10h_mc0_mce(ec, xec);
 333}
 334
 335static bool cat_mc0_mce(u16 ec, u8 xec)
 336{
 337        u8 r4    = R4(ec);
 338        bool ret = true;
 339
 340        if (MEM_ERROR(ec)) {
 341
 342                if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 343                        return false;
 344
 345                switch (r4) {
 346                case R4_DRD:
 347                case R4_DWR:
 348                        pr_cont("Data/Tag parity error due to %s.\n",
 349                                (r4 == R4_DRD ? "load/hw prf" : "store"));
 350                        break;
 351                case R4_EVICT:
 352                        pr_cont("Copyback parity error on a tag miss.\n");
 353                        break;
 354                case R4_SNOOP:
 355                        pr_cont("Tag parity error during snoop.\n");
 356                        break;
 357                default:
 358                        ret = false;
 359                }
 360        } else if (BUS_ERROR(ec)) {
 361
 362                if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 363                        return false;
 364
 365                pr_cont("System read data error on a ");
 366
 367                switch (r4) {
 368                case R4_RD:
 369                        pr_cont("TLB reload.\n");
 370                        break;
 371                case R4_DWR:
 372                        pr_cont("store.\n");
 373                        break;
 374                case R4_DRD:
 375                        pr_cont("load.\n");
 376                        break;
 377                default:
 378                        ret = false;
 379                }
 380        } else {
 381                ret = false;
 382        }
 383
 384        return ret;
 385}
 386
 387static bool f15h_mc0_mce(u16 ec, u8 xec)
 388{
 389        bool ret = true;
 390
 391        if (MEM_ERROR(ec)) {
 392
 393                switch (xec) {
 394                case 0x0:
 395                        pr_cont("Data Array access error.\n");
 396                        break;
 397
 398                case 0x1:
 399                        pr_cont("UC error during a linefill from L2/NB.\n");
 400                        break;
 401
 402                case 0x2:
 403                case 0x11:
 404                        pr_cont("STQ access error.\n");
 405                        break;
 406
 407                case 0x3:
 408                        pr_cont("SCB access error.\n");
 409                        break;
 410
 411                case 0x10:
 412                        pr_cont("Tag error.\n");
 413                        break;
 414
 415                case 0x12:
 416                        pr_cont("LDQ access error.\n");
 417                        break;
 418
 419                default:
 420                        ret = false;
 421                }
 422        } else if (BUS_ERROR(ec)) {
 423
 424                if (!xec)
 425                        pr_cont("System Read Data Error.\n");
 426                else
 427                        pr_cont(" Internal error condition type %d.\n", xec);
 428        } else if (INT_ERROR(ec)) {
 429                if (xec <= 0x1f)
 430                        pr_cont("Hardware Assert.\n");
 431                else
 432                        ret = false;
 433
 434        } else
 435                ret = false;
 436
 437        return ret;
 438}
 439
 440static void decode_mc0_mce(struct mce *m)
 441{
 442        u16 ec = EC(m->status);
 443        u8 xec = XEC(m->status, xec_mask);
 444
 445        pr_emerg(HW_ERR "MC0 Error: ");
 446
 447        /* TLB error signatures are the same across families */
 448        if (TLB_ERROR(ec)) {
 449                if (TT(ec) == TT_DATA) {
 450                        pr_cont("%s TLB %s.\n", LL_MSG(ec),
 451                                ((xec == 2) ? "locked miss"
 452                                            : (xec ? "multimatch" : "parity")));
 453                        return;
 454                }
 455        } else if (fam_ops->mc0_mce(ec, xec))
 456                ;
 457        else
 458                pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 459}
 460
 461static bool k8_mc1_mce(u16 ec, u8 xec)
 462{
 463        u8 ll    = LL(ec);
 464        bool ret = true;
 465
 466        if (!MEM_ERROR(ec))
 467                return false;
 468
 469        if (ll == 0x2)
 470                pr_cont("during a linefill from L2.\n");
 471        else if (ll == 0x1) {
 472                switch (R4(ec)) {
 473                case R4_IRD:
 474                        pr_cont("Parity error during data load.\n");
 475                        break;
 476
 477                case R4_EVICT:
 478                        pr_cont("Copyback Parity/Victim error.\n");
 479                        break;
 480
 481                case R4_SNOOP:
 482                        pr_cont("Tag Snoop error.\n");
 483                        break;
 484
 485                default:
 486                        ret = false;
 487                        break;
 488                }
 489        } else
 490                ret = false;
 491
 492        return ret;
 493}
 494
 495static bool cat_mc1_mce(u16 ec, u8 xec)
 496{
 497        u8 r4    = R4(ec);
 498        bool ret = true;
 499
 500        if (!MEM_ERROR(ec))
 501                return false;
 502
 503        if (TT(ec) != TT_INSTR)
 504                return false;
 505
 506        if (r4 == R4_IRD)
 507                pr_cont("Data/tag array parity error for a tag hit.\n");
 508        else if (r4 == R4_SNOOP)
 509                pr_cont("Tag error during snoop/victimization.\n");
 510        else if (xec == 0x0)
 511                pr_cont("Tag parity error from victim castout.\n");
 512        else if (xec == 0x2)
 513                pr_cont("Microcode patch RAM parity error.\n");
 514        else
 515                ret = false;
 516
 517        return ret;
 518}
 519
 520static bool f15h_mc1_mce(u16 ec, u8 xec)
 521{
 522        bool ret = true;
 523
 524        if (!MEM_ERROR(ec))
 525                return false;
 526
 527        switch (xec) {
 528        case 0x0 ... 0xa:
 529                pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 530                break;
 531
 532        case 0xd:
 533                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 534                break;
 535
 536        case 0x10:
 537                pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 538                break;
 539
 540        case 0x11 ... 0x15:
 541                pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 542                break;
 543
 544        default:
 545                ret = false;
 546        }
 547        return ret;
 548}
 549
 550static void decode_mc1_mce(struct mce *m)
 551{
 552        u16 ec = EC(m->status);
 553        u8 xec = XEC(m->status, xec_mask);
 554
 555        pr_emerg(HW_ERR "MC1 Error: ");
 556
 557        if (TLB_ERROR(ec))
 558                pr_cont("%s TLB %s.\n", LL_MSG(ec),
 559                        (xec ? "multimatch" : "parity error"));
 560        else if (BUS_ERROR(ec)) {
 561                bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 562
 563                pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 564        } else if (INT_ERROR(ec)) {
 565                if (xec <= 0x3f)
 566                        pr_cont("Hardware Assert.\n");
 567                else
 568                        goto wrong_mc1_mce;
 569        } else if (fam_ops->mc1_mce(ec, xec))
 570                ;
 571        else
 572                goto wrong_mc1_mce;
 573
 574        return;
 575
 576wrong_mc1_mce:
 577        pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 578}
 579
 580static bool k8_mc2_mce(u16 ec, u8 xec)
 581{
 582        bool ret = true;
 583
 584        if (xec == 0x1)
 585                pr_cont(" in the write data buffers.\n");
 586        else if (xec == 0x3)
 587                pr_cont(" in the victim data buffers.\n");
 588        else if (xec == 0x2 && MEM_ERROR(ec))
 589                pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 590        else if (xec == 0x0) {
 591                if (TLB_ERROR(ec))
 592                        pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 593                                TT_MSG(ec));
 594                else if (BUS_ERROR(ec))
 595                        pr_cont(": %s/ECC error in data read from NB: %s.\n",
 596                                R4_MSG(ec), PP_MSG(ec));
 597                else if (MEM_ERROR(ec)) {
 598                        u8 r4 = R4(ec);
 599
 600                        if (r4 >= 0x7)
 601                                pr_cont(": %s error during data copyback.\n",
 602                                        R4_MSG(ec));
 603                        else if (r4 <= 0x1)
 604                                pr_cont(": %s parity/ECC error during data "
 605                                        "access from L2.\n", R4_MSG(ec));
 606                        else
 607                                ret = false;
 608                } else
 609                        ret = false;
 610        } else
 611                ret = false;
 612
 613        return ret;
 614}
 615
 616static bool f15h_mc2_mce(u16 ec, u8 xec)
 617{
 618        bool ret = true;
 619
 620        if (TLB_ERROR(ec)) {
 621                if (xec == 0x0)
 622                        pr_cont("Data parity TLB read error.\n");
 623                else if (xec == 0x1)
 624                        pr_cont("Poison data provided for TLB fill.\n");
 625                else
 626                        ret = false;
 627        } else if (BUS_ERROR(ec)) {
 628                if (xec > 2)
 629                        ret = false;
 630
 631                pr_cont("Error during attempted NB data read.\n");
 632        } else if (MEM_ERROR(ec)) {
 633                switch (xec) {
 634                case 0x4 ... 0xc:
 635                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 636                        break;
 637
 638                case 0x10 ... 0x14:
 639                        pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 640                        break;
 641
 642                default:
 643                        ret = false;
 644                }
 645        } else if (INT_ERROR(ec)) {
 646                if (xec <= 0x3f)
 647                        pr_cont("Hardware Assert.\n");
 648                else
 649                        ret = false;
 650        }
 651
 652        return ret;
 653}
 654
 655static bool f16h_mc2_mce(u16 ec, u8 xec)
 656{
 657        u8 r4 = R4(ec);
 658
 659        if (!MEM_ERROR(ec))
 660                return false;
 661
 662        switch (xec) {
 663        case 0x04 ... 0x05:
 664                pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 665                break;
 666
 667        case 0x09 ... 0x0b:
 668        case 0x0d ... 0x0f:
 669                pr_cont("ECC error in L2 tag (%s).\n",
 670                        ((r4 == R4_GEN)   ? "BankReq" :
 671                        ((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 672                break;
 673
 674        case 0x10 ... 0x19:
 675        case 0x1b:
 676                pr_cont("ECC error in L2 data array (%s).\n",
 677                        (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 678                        ((r4 == R4_GEN)   ? "Attr" :
 679                        ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 680                break;
 681
 682        case 0x1c ... 0x1d:
 683        case 0x1f:
 684                pr_cont("Parity error in L2 attribute bits (%s).\n",
 685                        ((r4 == R4_RD)  ? "Hit"  :
 686                        ((r4 == R4_GEN) ? "Attr" : "Fill")));
 687                break;
 688
 689        default:
 690                return false;
 691        }
 692
 693        return true;
 694}
 695
 696static void decode_mc2_mce(struct mce *m)
 697{
 698        u16 ec = EC(m->status);
 699        u8 xec = XEC(m->status, xec_mask);
 700
 701        pr_emerg(HW_ERR "MC2 Error: ");
 702
 703        if (!fam_ops->mc2_mce(ec, xec))
 704                pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 705}
 706
 707static void decode_mc3_mce(struct mce *m)
 708{
 709        u16 ec = EC(m->status);
 710        u8 xec = XEC(m->status, xec_mask);
 711
 712        if (boot_cpu_data.x86 >= 0x14) {
 713                pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 714                         " please report on LKML.\n");
 715                return;
 716        }
 717
 718        pr_emerg(HW_ERR "MC3 Error");
 719
 720        if (xec == 0x0) {
 721                u8 r4 = R4(ec);
 722
 723                if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 724                        goto wrong_mc3_mce;
 725
 726                pr_cont(" during %s.\n", R4_MSG(ec));
 727        } else
 728                goto wrong_mc3_mce;
 729
 730        return;
 731
 732 wrong_mc3_mce:
 733        pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 734}
 735
 736static void decode_mc4_mce(struct mce *m)
 737{
 738        struct cpuinfo_x86 *c = &boot_cpu_data;
 739        int node_id = amd_get_nb_id(m->extcpu);
 740        u16 ec = EC(m->status);
 741        u8 xec = XEC(m->status, 0x1f);
 742        u8 offset = 0;
 743
 744        pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 745
 746        switch (xec) {
 747        case 0x0 ... 0xe:
 748
 749                /* special handling for DRAM ECCs */
 750                if (xec == 0x0 || xec == 0x8) {
 751                        /* no ECCs on F11h */
 752                        if (c->x86 == 0x11)
 753                                goto wrong_mc4_mce;
 754
 755                        pr_cont("%s.\n", mc4_mce_desc[xec]);
 756
 757                        if (decode_dram_ecc)
 758                                decode_dram_ecc(node_id, m);
 759                        return;
 760                }
 761                break;
 762
 763        case 0xf:
 764                if (TLB_ERROR(ec))
 765                        pr_cont("GART Table Walk data error.\n");
 766                else if (BUS_ERROR(ec))
 767                        pr_cont("DMA Exclusion Vector Table Walk error.\n");
 768                else
 769                        goto wrong_mc4_mce;
 770                return;
 771
 772        case 0x19:
 773                if (boot_cpu_data.x86 == 0x15 || boot_cpu_data.x86 == 0x16)
 774                        pr_cont("Compute Unit Data Error.\n");
 775                else
 776                        goto wrong_mc4_mce;
 777                return;
 778
 779        case 0x1c ... 0x1f:
 780                offset = 13;
 781                break;
 782
 783        default:
 784                goto wrong_mc4_mce;
 785        }
 786
 787        pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 788        return;
 789
 790 wrong_mc4_mce:
 791        pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 792}
 793
 794static void decode_mc5_mce(struct mce *m)
 795{
 796        struct cpuinfo_x86 *c = &boot_cpu_data;
 797        u16 ec = EC(m->status);
 798        u8 xec = XEC(m->status, xec_mask);
 799
 800        if (c->x86 == 0xf || c->x86 == 0x11)
 801                goto wrong_mc5_mce;
 802
 803        pr_emerg(HW_ERR "MC5 Error: ");
 804
 805        if (INT_ERROR(ec)) {
 806                if (xec <= 0x1f) {
 807                        pr_cont("Hardware Assert.\n");
 808                        return;
 809                } else
 810                        goto wrong_mc5_mce;
 811        }
 812
 813        if (xec == 0x0 || xec == 0xc)
 814                pr_cont("%s.\n", mc5_mce_desc[xec]);
 815        else if (xec <= 0xd)
 816                pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 817        else
 818                goto wrong_mc5_mce;
 819
 820        return;
 821
 822 wrong_mc5_mce:
 823        pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 824}
 825
 826static void decode_mc6_mce(struct mce *m)
 827{
 828        u8 xec = XEC(m->status, xec_mask);
 829
 830        pr_emerg(HW_ERR "MC6 Error: ");
 831
 832        switch (xec) {
 833        case 0x0:
 834                pr_cont("Hardware Assertion");
 835                break;
 836
 837        case 0x1:
 838                pr_cont("Free List");
 839                break;
 840
 841        case 0x2:
 842                pr_cont("Physical Register File");
 843                break;
 844
 845        case 0x3:
 846                pr_cont("Retire Queue");
 847                break;
 848
 849        case 0x4:
 850                pr_cont("Scheduler table");
 851                break;
 852
 853        case 0x5:
 854                pr_cont("Status Register File");
 855                break;
 856
 857        default:
 858                goto wrong_mc6_mce;
 859                break;
 860        }
 861
 862        pr_cont(" parity error.\n");
 863
 864        return;
 865
 866 wrong_mc6_mce:
 867        pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 868}
 869
 870/* Decode errors according to Scalable MCA specification */
 871static void decode_smca_errors(struct mce *m)
 872{
 873        struct smca_hwid *hwid;
 874        unsigned int bank_type;
 875        const char *ip_name;
 876        u8 xec = XEC(m->status, xec_mask);
 877
 878        if (m->bank >= ARRAY_SIZE(smca_banks))
 879                return;
 880
 881        hwid = smca_banks[m->bank].hwid;
 882        if (!hwid)
 883                return;
 884
 885        bank_type = hwid->bank_type;
 886        ip_name = smca_names[bank_type].long_name;
 887
 888        pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec);
 889
 890        /* Only print the decode of valid error codes */
 891        if (xec < smca_mce_descs[bank_type].num_descs &&
 892                        (hwid->xec_bitmap & BIT_ULL(xec))) {
 893                pr_emerg(HW_ERR "%s Error: ", ip_name);
 894                pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]);
 895        }
 896
 897        /*
 898         * amd_get_nb_id() returns the last level cache id.
 899         * The last level cache on Fam17h is 1 level below the node.
 900         */
 901        if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
 902                decode_dram_ecc(amd_get_nb_id(m->extcpu) >> 1, m);
 903}
 904
 905static inline void amd_decode_err_code(u16 ec)
 906{
 907        if (INT_ERROR(ec)) {
 908                pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
 909                return;
 910        }
 911
 912        pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 913
 914        if (BUS_ERROR(ec))
 915                pr_cont(", mem/io: %s", II_MSG(ec));
 916        else
 917                pr_cont(", tx: %s", TT_MSG(ec));
 918
 919        if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
 920                pr_cont(", mem-tx: %s", R4_MSG(ec));
 921
 922                if (BUS_ERROR(ec))
 923                        pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
 924        }
 925
 926        pr_cont("\n");
 927}
 928
 929/*
 930 * Filter out unwanted MCE signatures here.
 931 */
 932static bool amd_filter_mce(struct mce *m)
 933{
 934        u8 xec = (m->status >> 16) & 0x1f;
 935
 936        /*
 937         * NB GART TLB error reporting is disabled by default.
 938         */
 939        if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
 940                return true;
 941
 942        return false;
 943}
 944
 945static const char *decode_error_status(struct mce *m)
 946{
 947        if (m->status & MCI_STATUS_UC) {
 948                if (m->status & MCI_STATUS_PCC)
 949                        return "System Fatal error.";
 950                if (m->mcgstatus & MCG_STATUS_RIPV)
 951                        return "Uncorrected, software restartable error.";
 952                return "Uncorrected, software containable error.";
 953        }
 954
 955        if (m->status & MCI_STATUS_DEFERRED)
 956                return "Deferred error.";
 957
 958        return "Corrected error, no action required.";
 959}
 960
 961int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 962{
 963        struct mce *m = (struct mce *)data;
 964        struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
 965        int ecc;
 966
 967        if (amd_filter_mce(m))
 968                return NOTIFY_STOP;
 969
 970        pr_emerg(HW_ERR "%s\n", decode_error_status(m));
 971
 972        pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
 973                m->extcpu,
 974                c->x86, c->x86_model, c->x86_mask,
 975                m->bank,
 976                ((m->status & MCI_STATUS_OVER)  ? "Over"  : "-"),
 977                ((m->status & MCI_STATUS_UC)    ? "UE"    :
 978                 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
 979                ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
 980                ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"),
 981                ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));
 982
 983        if (c->x86 >= 0x15)
 984                pr_cont("|%s|%s",
 985                        ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
 986                        ((m->status & MCI_STATUS_POISON)   ? "Poison"   : "-"));
 987
 988        if (boot_cpu_has(X86_FEATURE_SMCA)) {
 989                u32 low, high;
 990                u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
 991
 992                pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
 993
 994                if (!rdmsr_safe(addr, &low, &high) &&
 995                    (low & MCI_CONFIG_MCAX))
 996                        pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 997        }
 998
 999        /* do the two bits[14:13] together */
1000        ecc = (m->status >> 45) & 0x3;
1001        if (ecc)
1002                pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1003
1004        pr_cont("]: 0x%016llx\n", m->status);
1005
1006        if (m->status & MCI_STATUS_ADDRV)
1007                pr_emerg(HW_ERR "Error Addr: 0x%016llx", m->addr);
1008
1009        if (boot_cpu_has(X86_FEATURE_SMCA)) {
1010                if (m->status & MCI_STATUS_SYNDV)
1011                        pr_cont(", Syndrome: 0x%016llx", m->synd);
1012
1013                pr_cont(", IPID: 0x%016llx", m->ipid);
1014
1015                pr_cont("\n");
1016
1017                decode_smca_errors(m);
1018                goto err_code;
1019        } else
1020                pr_cont("\n");
1021
1022        if (!fam_ops)
1023                goto err_code;
1024
1025        switch (m->bank) {
1026        case 0:
1027                decode_mc0_mce(m);
1028                break;
1029
1030        case 1:
1031                decode_mc1_mce(m);
1032                break;
1033
1034        case 2:
1035                decode_mc2_mce(m);
1036                break;
1037
1038        case 3:
1039                decode_mc3_mce(m);
1040                break;
1041
1042        case 4:
1043                decode_mc4_mce(m);
1044                break;
1045
1046        case 5:
1047                decode_mc5_mce(m);
1048                break;
1049
1050        case 6:
1051                decode_mc6_mce(m);
1052                break;
1053
1054        default:
1055                break;
1056        }
1057
1058 err_code:
1059        amd_decode_err_code(m->status & 0xffff);
1060
1061        return NOTIFY_STOP;
1062}
1063EXPORT_SYMBOL_GPL(amd_decode_mce);
1064
1065static struct notifier_block amd_mce_dec_nb = {
1066        .notifier_call  = amd_decode_mce,
1067        .priority       = MCE_PRIO_EDAC,
1068};
1069
1070static int __init mce_amd_init(void)
1071{
1072        struct cpuinfo_x86 *c = &boot_cpu_data;
1073
1074        if (c->x86_vendor != X86_VENDOR_AMD)
1075                return -ENODEV;
1076
1077        fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1078        if (!fam_ops)
1079                return -ENOMEM;
1080
1081        switch (c->x86) {
1082        case 0xf:
1083                fam_ops->mc0_mce = k8_mc0_mce;
1084                fam_ops->mc1_mce = k8_mc1_mce;
1085                fam_ops->mc2_mce = k8_mc2_mce;
1086                break;
1087
1088        case 0x10:
1089                fam_ops->mc0_mce = f10h_mc0_mce;
1090                fam_ops->mc1_mce = k8_mc1_mce;
1091                fam_ops->mc2_mce = k8_mc2_mce;
1092                break;
1093
1094        case 0x11:
1095                fam_ops->mc0_mce = k8_mc0_mce;
1096                fam_ops->mc1_mce = k8_mc1_mce;
1097                fam_ops->mc2_mce = k8_mc2_mce;
1098                break;
1099
1100        case 0x12:
1101                fam_ops->mc0_mce = f12h_mc0_mce;
1102                fam_ops->mc1_mce = k8_mc1_mce;
1103                fam_ops->mc2_mce = k8_mc2_mce;
1104                break;
1105
1106        case 0x14:
1107                fam_ops->mc0_mce = cat_mc0_mce;
1108                fam_ops->mc1_mce = cat_mc1_mce;
1109                fam_ops->mc2_mce = k8_mc2_mce;
1110                break;
1111
1112        case 0x15:
1113                xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1114
1115                fam_ops->mc0_mce = f15h_mc0_mce;
1116                fam_ops->mc1_mce = f15h_mc1_mce;
1117                fam_ops->mc2_mce = f15h_mc2_mce;
1118                break;
1119
1120        case 0x16:
1121                xec_mask = 0x1f;
1122                fam_ops->mc0_mce = cat_mc0_mce;
1123                fam_ops->mc1_mce = cat_mc1_mce;
1124                fam_ops->mc2_mce = f16h_mc2_mce;
1125                break;
1126
1127        case 0x17:
1128                xec_mask = 0x3f;
1129                if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1130                        printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1131                        goto err_out;
1132                }
1133                break;
1134
1135        default:
1136                printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1137                goto err_out;
1138        }
1139
1140        pr_info("MCE: In-kernel MCE decoding enabled.\n");
1141
1142        mce_register_decode_chain(&amd_mce_dec_nb);
1143
1144        return 0;
1145
1146err_out:
1147        kfree(fam_ops);
1148        fam_ops = NULL;
1149        return -EINVAL;
1150}
1151early_initcall(mce_amd_init);
1152
#ifdef MODULE
/*
 * Module teardown: take the decoder off the MCE decode chain first, then
 * release the family ops table allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
        mce_unregister_decode_chain(&amd_mce_dec_nb);
        kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
#endif
1165