/* drivers/gpu/drm/amd/amdgpu/umc_v8_7.c */
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
  23#include "umc_v8_7.h"
  24#include "amdgpu_ras.h"
  25#include "amdgpu.h"
  26
  27#include "rsmu/rsmu_0_0_2_offset.h"
  28#include "rsmu/rsmu_0_0_2_sh_mask.h"
  29#include "umc/umc_8_7_0_offset.h"
  30#include "umc/umc_8_7_0_sh_mask.h"
  31
  32#define UMC_8_INST_DIST                 0x40000
  33
  34const uint32_t
  35        umc_v8_7_channel_idx_tbl[UMC_V8_7_UMC_INSTANCE_NUM][UMC_V8_7_CHANNEL_INSTANCE_NUM] = {
  36                {2, 11},  {4, 13},
  37                {1, 8},   {7, 14},
  38                {10, 3},  {12, 5},
  39                {9, 0},   {15, 6}
  40};
  41
  42static inline uint32_t get_umc_8_reg_offset(struct amdgpu_device *adev,
  43                                            uint32_t umc_inst,
  44                                            uint32_t ch_inst)
  45{
  46        return adev->umc.channel_offs*ch_inst + UMC_8_INST_DIST*umc_inst;
  47}
  48
  49static void umc_v8_7_clear_error_count_per_channel(struct amdgpu_device *adev,
  50                                        uint32_t umc_reg_offset)
  51{
  52        uint32_t ecc_err_cnt_addr;
  53        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
  54
  55        ecc_err_cnt_sel_addr =
  56                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
  57        ecc_err_cnt_addr =
  58                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
  59
  60        /* select the lower chip */
  61        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
  62                                        umc_reg_offset) * 4);
  63        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
  64                                        UMCCH0_0_GeccErrCntSel,
  65                                        GeccErrCntCsSel, 0);
  66        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
  67                        ecc_err_cnt_sel);
  68
  69        /* clear lower chip error count */
  70        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
  71                        UMC_V8_7_CE_CNT_INIT);
  72
  73        /* select the higher chip */
  74        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr +
  75                                        umc_reg_offset) * 4);
  76        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel,
  77                                        UMCCH0_0_GeccErrCntSel,
  78                                        GeccErrCntCsSel, 1);
  79        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4,
  80                        ecc_err_cnt_sel);
  81
  82        /* clear higher chip error count */
  83        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
  84                        UMC_V8_7_CE_CNT_INIT);
  85}
  86
  87static void umc_v8_7_clear_error_count(struct amdgpu_device *adev)
  88{
  89        uint32_t umc_inst        = 0;
  90        uint32_t ch_inst         = 0;
  91        uint32_t umc_reg_offset  = 0;
  92
  93        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
  94                umc_reg_offset = get_umc_8_reg_offset(adev,
  95                                                umc_inst,
  96                                                ch_inst);
  97
  98                umc_v8_7_clear_error_count_per_channel(adev,
  99                                                umc_reg_offset);
 100        }
 101}
 102
 103static void umc_v8_7_query_correctable_error_count(struct amdgpu_device *adev,
 104                                                   uint32_t umc_reg_offset,
 105                                                   unsigned long *error_count)
 106{
 107        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
 108        uint32_t ecc_err_cnt, ecc_err_cnt_addr;
 109        uint64_t mc_umc_status;
 110        uint32_t mc_umc_status_addr;
 111
 112        /* UMC 8_7_2 registers */
 113        ecc_err_cnt_sel_addr =
 114                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
 115        ecc_err_cnt_addr =
 116                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
 117        mc_umc_status_addr =
 118                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
 119
 120        /* select the lower chip and check the error count */
 121        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
 122        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
 123                                        GeccErrCntCsSel, 0);
 124        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
 125
 126        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
 127        *error_count +=
 128                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
 129                 UMC_V8_7_CE_CNT_INIT);
 130
 131        /* select the higher chip and check the err counter */
 132        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
 133                                        GeccErrCntCsSel, 1);
 134        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
 135
 136        ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
 137        *error_count +=
 138                (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
 139                 UMC_V8_7_CE_CNT_INIT);
 140
 141        /* check for SRAM correctable error
 142          MCUMC_STATUS is a 64 bit register */
 143        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
 144        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, ErrorCodeExt) == 6 &&
 145            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
 146            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
 147                *error_count += 1;
 148}
 149
 150static void umc_v8_7_querry_uncorrectable_error_count(struct amdgpu_device *adev,
 151                                                      uint32_t umc_reg_offset,
 152                                                      unsigned long *error_count)
 153{
 154        uint64_t mc_umc_status;
 155        uint32_t mc_umc_status_addr;
 156
 157        mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
 158
 159        /* check the MCUMC_STATUS */
 160        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
 161        if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
 162            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
 163            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
 164            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
 165            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
 166            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
 167                *error_count += 1;
 168}
 169
 170static void umc_v8_7_query_ras_error_count(struct amdgpu_device *adev,
 171                                           void *ras_error_status)
 172{
 173        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 174
 175        uint32_t umc_inst        = 0;
 176        uint32_t ch_inst         = 0;
 177        uint32_t umc_reg_offset  = 0;
 178
 179        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
 180                umc_reg_offset = get_umc_8_reg_offset(adev,
 181                                                      umc_inst,
 182                                                      ch_inst);
 183
 184                umc_v8_7_query_correctable_error_count(adev,
 185                                                       umc_reg_offset,
 186                                                       &(err_data->ce_count));
 187                umc_v8_7_querry_uncorrectable_error_count(adev,
 188                                                          umc_reg_offset,
 189                                                          &(err_data->ue_count));
 190        }
 191
 192        umc_v8_7_clear_error_count(adev);
 193}
 194
 195static void umc_v8_7_query_error_address(struct amdgpu_device *adev,
 196                                         struct ras_err_data *err_data,
 197                                         uint32_t umc_reg_offset,
 198                                         uint32_t ch_inst,
 199                                         uint32_t umc_inst)
 200{
 201        uint32_t lsb, mc_umc_status_addr;
 202        uint64_t mc_umc_status, err_addr, retired_page, mc_umc_addrt0;
 203        struct eeprom_table_record *err_rec;
 204        uint32_t channel_index = adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num + ch_inst];
 205
 206        mc_umc_status_addr =
 207                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
 208        mc_umc_addrt0 =
 209                SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_ADDRT0);
 210
 211        mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
 212
 213        if (mc_umc_status == 0)
 214                return;
 215
 216        if (!err_data->err_addr) {
 217                /* clear umc status */
 218                WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
 219                return;
 220        }
 221
 222        err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 223
 224        /* calculate error address if ue/ce error is detected */
 225        if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
 226            (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
 227            REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)) {
 228
 229                err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
 230                /* the lowest lsb bits should be ignored */
 231                lsb = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, LSB);
 232                err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
 233                err_addr &= ~((0x1ULL << lsb) - 1);
 234
 235                /* translate umc channel address to soc pa, 3 parts are included */
 236                retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
 237                                ADDR_OF_256B_BLOCK(channel_index) |
 238                                OFFSET_IN_256B_BLOCK(err_addr);
 239
 240                /* we only save ue error information currently, ce is skipped */
 241                if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
 242                                == 1) {
 243                        err_rec->address = err_addr;
 244                        /* page frame address is saved */
 245                        err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
 246                        err_rec->ts = (uint64_t)ktime_get_real_seconds();
 247                        err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
 248                        err_rec->cu = 0;
 249                        err_rec->mem_channel = channel_index;
 250                        err_rec->mcumc_id = umc_inst;
 251
 252                        err_data->err_addr_cnt++;
 253                }
 254        }
 255
 256        /* clear umc status */
 257        WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL);
 258}
 259
 260static void umc_v8_7_query_ras_error_address(struct amdgpu_device *adev,
 261                                             void *ras_error_status)
 262{
 263        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 264
 265        uint32_t umc_inst        = 0;
 266        uint32_t ch_inst         = 0;
 267        uint32_t umc_reg_offset  = 0;
 268
 269        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
 270                umc_reg_offset = get_umc_8_reg_offset(adev,
 271                                                      umc_inst,
 272                                                      ch_inst);
 273
 274                umc_v8_7_query_error_address(adev,
 275                                             err_data,
 276                                             umc_reg_offset,
 277                                             ch_inst,
 278                                             umc_inst);
 279        }
 280}
 281
 282static void umc_v8_7_err_cnt_init_per_channel(struct amdgpu_device *adev,
 283                                              uint32_t umc_reg_offset)
 284{
 285        uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr;
 286        uint32_t ecc_err_cnt_addr;
 287
 288        ecc_err_cnt_sel_addr =
 289                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCntSel);
 290        ecc_err_cnt_addr =
 291                SOC15_REG_OFFSET(UMC, 0, mmUMCCH0_0_GeccErrCnt);
 292
 293        /* select the lower chip and check the error count */
 294        ecc_err_cnt_sel = RREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4);
 295        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
 296                                        GeccErrCntCsSel, 0);
 297        /* set ce error interrupt type to APIC based interrupt */
 298        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
 299                                        GeccErrInt, 0x1);
 300        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
 301        /* set error count to initial value */
 302        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
 303
 304        /* select the higher chip and check the err counter */
 305        ecc_err_cnt_sel = REG_SET_FIELD(ecc_err_cnt_sel, UMCCH0_0_GeccErrCntSel,
 306                                        GeccErrCntCsSel, 1);
 307        WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, ecc_err_cnt_sel);
 308        WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V8_7_CE_CNT_INIT);
 309}
 310
 311static void umc_v8_7_err_cnt_init(struct amdgpu_device *adev)
 312{
 313        uint32_t umc_inst        = 0;
 314        uint32_t ch_inst         = 0;
 315        uint32_t umc_reg_offset  = 0;
 316
 317        LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
 318                umc_reg_offset = get_umc_8_reg_offset(adev,
 319                                                      umc_inst,
 320                                                      ch_inst);
 321
 322                umc_v8_7_err_cnt_init_per_channel(adev, umc_reg_offset);
 323        }
 324}
 325
 326const struct amdgpu_umc_funcs umc_v8_7_funcs = {
 327        .err_cnt_init = umc_v8_7_err_cnt_init,
 328        .ras_late_init = amdgpu_umc_ras_late_init,
 329        .query_ras_error_count = umc_v8_7_query_ras_error_count,
 330        .query_ras_error_address = umc_v8_7_query_ras_error_address,
 331};
 332