linux/drivers/edac/bluefield_edac.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Bluefield-specific EDAC driver.
   4 *
   5 * Copyright (c) 2019 Mellanox Technologies.
   6 */
   7
   8#include <linux/acpi.h>
   9#include <linux/arm-smccc.h>
  10#include <linux/bitfield.h>
  11#include <linux/edac.h>
  12#include <linux/io.h>
  13#include <linux/module.h>
  14#include <linux/platform_device.h>
  15
  16#include "edac_module.h"
  17
/* Name used for the platform driver and reported as the EDAC module name. */
#define DRIVER_NAME             "bluefield-edac"

/*
 * Mellanox BlueField EMI (External Memory Interface) register definitions.
 *
 * All offsets are relative to the mapped EMI register base.
 */

/* ECC error counters: single-error count and double-error count fields. */
#define MLXBF_ECC_CNT 0x340
#define MLXBF_ECC_CNT__SERR_CNT GENMASK(15, 0)
#define MLXBF_ECC_CNT__DERR_CNT GENMASK(31, 16)

/* Written by the poll routine to clear the reported single/double errors. */
#define MLXBF_ECC_ERR 0x348
#define MLXBF_ECC_ERR__SECC BIT(0)
#define MLXBF_ECC_ERR__DECC BIT(16)

/*
 * Writing START asks the EMI to populate the error detail registers with
 * information about the last ECC error occurrence.
 */
#define MLXBF_ECC_LATCH_SEL 0x354
#define MLXBF_ECC_LATCH_SEL__START BIT(24)

/* Bits 31:0 of the reported error address. */
#define MLXBF_ERR_ADDR_0 0x358

/* Bits 63:32 of the reported error address. */
#define MLXBF_ERR_ADDR_1 0x37c

/* Type flags (single/double) and syndrome value of the reported error. */
#define MLXBF_SYNDROM 0x35c
#define MLXBF_SYNDROM__DERR BIT(0)
#define MLXBF_SYNDROM__SERR BIT(1)
#define MLXBF_SYNDROM__SYN GENMASK(25, 16)

/*
 * Additional error information. ERR_PRANK is used to decide which DIMM slot
 * an error is attributed to (presumably the physical rank -- confirm against
 * the hardware documentation).
 */
#define MLXBF_ADD_INFO 0x364
#define MLXBF_ADD_INFO__ERR_PRANK GENMASK(9, 8)

/* Upper bound accepted for the "dimm_per_mc" device property. */
#define MLXBF_EDAC_MAX_DIMM_PER_MC      2
/* Value assigned to the grain of every populated DIMM. */
#define MLXBF_EDAC_ERROR_GRAIN          8

/*
 * Request MLNX_SIP_GET_DIMM_INFO
 *
 * Retrieve information about DIMM on a certain slot.
 *
 * Call register usage:
 * a0: MLNX_SIP_GET_DIMM_INFO
 * a1: (Memory controller index) << 16 | (Dimm index in memory controller)
 * a2-7: not used.
 *
 * Return status:
 * a0: MLXBF_DIMM_INFO defined below describing the DIMM.
 * a1-3: not used.
 */
#define MLNX_SIP_GET_DIMM_INFO          0x82000008

/*
 * Format for the SMC response about the memory information.
 * A SIZE_GB of zero is treated as an empty slot.
 */
#define MLXBF_DIMM_INFO__SIZE_GB GENMASK_ULL(15, 0)
#define MLXBF_DIMM_INFO__IS_RDIMM BIT(16)
#define MLXBF_DIMM_INFO__IS_LRDIMM BIT(17)
#define MLXBF_DIMM_INFO__IS_NVDIMM BIT(18)
#define MLXBF_DIMM_INFO__RANKS GENMASK_ULL(23, 21)
#define MLXBF_DIMM_INFO__PACKAGE_X GENMASK_ULL(31, 24)
  73
/* Driver-private state kept in mci->pvt_info for one memory controller. */
struct bluefield_edac_priv {
        /* RANKS field of the SMC DIMM info, recorded per populated slot. */
        int dimm_ranks[MLXBF_EDAC_MAX_DIMM_PER_MC];
        /* Mapped base address of the EMI register block. */
        void __iomem *emi_base;
        /* Number of DIMM slots, taken from the "dimm_per_mc" property. */
        int dimm_per_mc;
};
  79
  80static u64 smc_call1(u64 smc_op, u64 smc_arg)
  81{
  82        struct arm_smccc_res res;
  83
  84        arm_smccc_smc(smc_op, smc_arg, 0, 0, 0, 0, 0, 0, &res);
  85
  86        return res.a0;
  87}
  88
  89/*
  90 * Gather the ECC information from the External Memory Interface registers
  91 * and report it to the edac handler.
  92 */
  93static void bluefield_gather_report_ecc(struct mem_ctl_info *mci,
  94                                        int error_cnt,
  95                                        int is_single_ecc)
  96{
  97        struct bluefield_edac_priv *priv = mci->pvt_info;
  98        u32 dram_additional_info, err_prank, edea0, edea1;
  99        u32 ecc_latch_select, dram_syndrom, serr, derr, syndrom;
 100        enum hw_event_mc_err_type ecc_type;
 101        u64 ecc_dimm_addr;
 102        int ecc_dimm;
 103
 104        ecc_type = is_single_ecc ? HW_EVENT_ERR_CORRECTED :
 105                                   HW_EVENT_ERR_UNCORRECTED;
 106
 107        /*
 108         * Tell the External Memory Interface to populate the relevant
 109         * registers with information about the last ECC error occurrence.
 110         */
 111        ecc_latch_select = MLXBF_ECC_LATCH_SEL__START;
 112        writel(ecc_latch_select, priv->emi_base + MLXBF_ECC_LATCH_SEL);
 113
 114        /*
 115         * Verify that the ECC reported info in the registers is of the
 116         * same type as the one asked to report. If not, just report the
 117         * error without the detailed information.
 118         */
 119        dram_syndrom = readl(priv->emi_base + MLXBF_SYNDROM);
 120        serr = FIELD_GET(MLXBF_SYNDROM__SERR, dram_syndrom);
 121        derr = FIELD_GET(MLXBF_SYNDROM__DERR, dram_syndrom);
 122        syndrom = FIELD_GET(MLXBF_SYNDROM__SYN, dram_syndrom);
 123
 124        if ((is_single_ecc && !serr) || (!is_single_ecc && !derr)) {
 125                edac_mc_handle_error(ecc_type, mci, error_cnt, 0, 0, 0,
 126                                     0, 0, -1, mci->ctl_name, "");
 127                return;
 128        }
 129
 130        dram_additional_info = readl(priv->emi_base + MLXBF_ADD_INFO);
 131        err_prank = FIELD_GET(MLXBF_ADD_INFO__ERR_PRANK, dram_additional_info);
 132
 133        ecc_dimm = (err_prank >= 2 && priv->dimm_ranks[0] <= 2) ? 1 : 0;
 134
 135        edea0 = readl(priv->emi_base + MLXBF_ERR_ADDR_0);
 136        edea1 = readl(priv->emi_base + MLXBF_ERR_ADDR_1);
 137
 138        ecc_dimm_addr = ((u64)edea1 << 32) | edea0;
 139
 140        edac_mc_handle_error(ecc_type, mci, error_cnt,
 141                             PFN_DOWN(ecc_dimm_addr),
 142                             offset_in_page(ecc_dimm_addr),
 143                             syndrom, ecc_dimm, 0, 0, mci->ctl_name, "");
 144}
 145
 146static void bluefield_edac_check(struct mem_ctl_info *mci)
 147{
 148        struct bluefield_edac_priv *priv = mci->pvt_info;
 149        u32 ecc_count, single_error_count, double_error_count, ecc_error = 0;
 150
 151        /*
 152         * The memory controller might not be initialized by the firmware
 153         * when there isn't memory, which may lead to bad register readings.
 154         */
 155        if (mci->edac_cap == EDAC_FLAG_NONE)
 156                return;
 157
 158        ecc_count = readl(priv->emi_base + MLXBF_ECC_CNT);
 159        single_error_count = FIELD_GET(MLXBF_ECC_CNT__SERR_CNT, ecc_count);
 160        double_error_count = FIELD_GET(MLXBF_ECC_CNT__DERR_CNT, ecc_count);
 161
 162        if (single_error_count) {
 163                ecc_error |= MLXBF_ECC_ERR__SECC;
 164
 165                bluefield_gather_report_ecc(mci, single_error_count, 1);
 166        }
 167
 168        if (double_error_count) {
 169                ecc_error |= MLXBF_ECC_ERR__DECC;
 170
 171                bluefield_gather_report_ecc(mci, double_error_count, 0);
 172        }
 173
 174        /* Write to clear reported errors. */
 175        if (ecc_count)
 176                writel(ecc_error, priv->emi_base + MLXBF_ECC_ERR);
 177}
 178
 179/* Initialize the DIMMs information for the given memory controller. */
 180static void bluefield_edac_init_dimms(struct mem_ctl_info *mci)
 181{
 182        struct bluefield_edac_priv *priv = mci->pvt_info;
 183        int mem_ctrl_idx = mci->mc_idx;
 184        struct dimm_info *dimm;
 185        u64 smc_info, smc_arg;
 186        int is_empty = 1, i;
 187
 188        for (i = 0; i < priv->dimm_per_mc; i++) {
 189                dimm = mci->dimms[i];
 190
 191                smc_arg = mem_ctrl_idx << 16 | i;
 192                smc_info = smc_call1(MLNX_SIP_GET_DIMM_INFO, smc_arg);
 193
 194                if (!FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info)) {
 195                        dimm->mtype = MEM_EMPTY;
 196                        continue;
 197                }
 198
 199                is_empty = 0;
 200
 201                dimm->edac_mode = EDAC_SECDED;
 202
 203                if (FIELD_GET(MLXBF_DIMM_INFO__IS_NVDIMM, smc_info))
 204                        dimm->mtype = MEM_NVDIMM;
 205                else if (FIELD_GET(MLXBF_DIMM_INFO__IS_LRDIMM, smc_info))
 206                        dimm->mtype = MEM_LRDDR4;
 207                else if (FIELD_GET(MLXBF_DIMM_INFO__IS_RDIMM, smc_info))
 208                        dimm->mtype = MEM_RDDR4;
 209                else
 210                        dimm->mtype = MEM_DDR4;
 211
 212                dimm->nr_pages =
 213                        FIELD_GET(MLXBF_DIMM_INFO__SIZE_GB, smc_info) *
 214                        (SZ_1G / PAGE_SIZE);
 215                dimm->grain = MLXBF_EDAC_ERROR_GRAIN;
 216
 217                /* Mem controller for BlueField only supports x4, x8 and x16 */
 218                switch (FIELD_GET(MLXBF_DIMM_INFO__PACKAGE_X, smc_info)) {
 219                case 4:
 220                        dimm->dtype = DEV_X4;
 221                        break;
 222                case 8:
 223                        dimm->dtype = DEV_X8;
 224                        break;
 225                case 16:
 226                        dimm->dtype = DEV_X16;
 227                        break;
 228                default:
 229                        dimm->dtype = DEV_UNKNOWN;
 230                }
 231
 232                priv->dimm_ranks[i] =
 233                        FIELD_GET(MLXBF_DIMM_INFO__RANKS, smc_info);
 234        }
 235
 236        if (is_empty)
 237                mci->edac_cap = EDAC_FLAG_NONE;
 238        else
 239                mci->edac_cap = EDAC_FLAG_SECDED;
 240}
 241
 242static int bluefield_edac_mc_probe(struct platform_device *pdev)
 243{
 244        struct bluefield_edac_priv *priv;
 245        struct device *dev = &pdev->dev;
 246        struct edac_mc_layer layers[1];
 247        struct mem_ctl_info *mci;
 248        struct resource *emi_res;
 249        unsigned int mc_idx, dimm_count;
 250        int rc, ret;
 251
 252        /* Read the MSS (Memory SubSystem) index from ACPI table. */
 253        if (device_property_read_u32(dev, "mss_number", &mc_idx)) {
 254                dev_warn(dev, "bf_edac: MSS number unknown\n");
 255                return -EINVAL;
 256        }
 257
 258        /* Read the DIMMs per MC from ACPI table. */
 259        if (device_property_read_u32(dev, "dimm_per_mc", &dimm_count)) {
 260                dev_warn(dev, "bf_edac: DIMMs per MC unknown\n");
 261                return -EINVAL;
 262        }
 263
 264        if (dimm_count > MLXBF_EDAC_MAX_DIMM_PER_MC) {
 265                dev_warn(dev, "bf_edac: DIMMs per MC not valid\n");
 266                return -EINVAL;
 267        }
 268
 269        emi_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 270        if (!emi_res)
 271                return -EINVAL;
 272
 273        layers[0].type = EDAC_MC_LAYER_SLOT;
 274        layers[0].size = dimm_count;
 275        layers[0].is_virt_csrow = true;
 276
 277        mci = edac_mc_alloc(mc_idx, ARRAY_SIZE(layers), layers, sizeof(*priv));
 278        if (!mci)
 279                return -ENOMEM;
 280
 281        priv = mci->pvt_info;
 282
 283        priv->dimm_per_mc = dimm_count;
 284        priv->emi_base = devm_ioremap_resource(dev, emi_res);
 285        if (IS_ERR(priv->emi_base)) {
 286                dev_err(dev, "failed to map EMI IO resource\n");
 287                ret = PTR_ERR(priv->emi_base);
 288                goto err;
 289        }
 290
 291        mci->pdev = dev;
 292        mci->mtype_cap = MEM_FLAG_DDR4 | MEM_FLAG_RDDR4 |
 293                         MEM_FLAG_LRDDR4 | MEM_FLAG_NVDIMM;
 294        mci->edac_ctl_cap = EDAC_FLAG_SECDED;
 295
 296        mci->mod_name = DRIVER_NAME;
 297        mci->ctl_name = "BlueField_Memory_Controller";
 298        mci->dev_name = dev_name(dev);
 299        mci->edac_check = bluefield_edac_check;
 300
 301        /* Initialize mci with the actual populated DIMM information. */
 302        bluefield_edac_init_dimms(mci);
 303
 304        platform_set_drvdata(pdev, mci);
 305
 306        /* Register with EDAC core */
 307        rc = edac_mc_add_mc(mci);
 308        if (rc) {
 309                dev_err(dev, "failed to register with EDAC core\n");
 310                ret = rc;
 311                goto err;
 312        }
 313
 314        /* Only POLL mode supported so far. */
 315        edac_op_state = EDAC_OPSTATE_POLL;
 316
 317        return 0;
 318
 319err:
 320        edac_mc_free(mci);
 321
 322        return ret;
 323
 324}
 325
 326static int bluefield_edac_mc_remove(struct platform_device *pdev)
 327{
 328        struct mem_ctl_info *mci = platform_get_drvdata(pdev);
 329
 330        edac_mc_del_mc(&pdev->dev);
 331        edac_mc_free(mci);
 332
 333        return 0;
 334}
 335
/* ACPI IDs this driver matches; the empty entry terminates the table. */
static const struct acpi_device_id bluefield_mc_acpi_ids[] = {
        {"MLNXBF08", 0},
        {}
};

MODULE_DEVICE_TABLE(acpi, bluefield_mc_acpi_ids);
 342
/* Platform driver description: matched by ACPI ID, one device per MC. */
static struct platform_driver bluefield_edac_mc_driver = {
        .driver = {
                .name = DRIVER_NAME,
                .acpi_match_table = bluefield_mc_acpi_ids,
        },
        .probe = bluefield_edac_mc_probe,
        .remove = bluefield_edac_mc_remove,
};

module_platform_driver(bluefield_edac_mc_driver);

MODULE_DESCRIPTION("Mellanox BlueField memory edac driver");
MODULE_AUTHOR("Mellanox Technologies");
MODULE_LICENSE("GPL v2");
 357