/* linux/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c */
   1/*
   2 * Copyright 2019 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 *
  22 */
  23
  24#include "amdgpu_ras.h"
  25
  26int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
  27{
  28        int r;
  29        struct ras_fs_if fs_info = {
  30                .sysfs_name = "umc_err_count",
  31        };
  32        struct ras_ih_if ih_info = {
  33                .cb = amdgpu_umc_process_ras_data_cb,
  34        };
  35
  36        if (!adev->umc.ras_if) {
  37                adev->umc.ras_if =
  38                        kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
  39                if (!adev->umc.ras_if)
  40                        return -ENOMEM;
  41                adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
  42                adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
  43                adev->umc.ras_if->sub_block_index = 0;
  44        }
  45        ih_info.head = fs_info.head = *adev->umc.ras_if;
  46
  47        r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
  48                                 &fs_info, &ih_info);
  49        if (r)
  50                goto free;
  51
  52        if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
  53                r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
  54                if (r)
  55                        goto late_fini;
  56        } else {
  57                r = 0;
  58                goto free;
  59        }
  60
  61        /* ras init of specific umc version */
  62        if (adev->umc.ras_funcs &&
  63            adev->umc.ras_funcs->err_cnt_init)
  64                adev->umc.ras_funcs->err_cnt_init(adev);
  65
  66        return 0;
  67
  68late_fini:
  69        amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
  70free:
  71        kfree(adev->umc.ras_if);
  72        adev->umc.ras_if = NULL;
  73        return r;
  74}
  75
  76void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
  77{
  78        if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
  79                        adev->umc.ras_if) {
  80                struct ras_common_if *ras_if = adev->umc.ras_if;
  81                struct ras_ih_if ih_info = {
  82                        .head = *ras_if,
  83                        .cb = amdgpu_umc_process_ras_data_cb,
  84                };
  85
  86                amdgpu_ras_late_fini(adev, ras_if, &ih_info);
  87                kfree(ras_if);
  88        }
  89}
  90
/**
 * amdgpu_umc_process_ras_data_cb - RAS interrupt callback for UMC errors
 * @adev: amdgpu device pointer
 * @ras_error_status: opaque pointer, actually a struct ras_err_data *
 *                    owned by the RAS core
 * @entry: interrupt vector entry that triggered this callback (unused here)
 *
 * Queries the IP-specific UMC hooks for error counts and faulting
 * addresses, records uncorrectable-error pages in the bad-page store,
 * and triggers a GPU reset when uncorrectable errors were seen.
 *
 * Return: AMDGPU_RAS_SUCCESS.
 */
int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
		void *ras_error_status,
		struct amdgpu_iv_entry *entry)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/* notify KFD so compute users learn about the SRAM ECC event */
	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
	if (adev->umc.ras_funcs &&
	    adev->umc.ras_funcs->query_ras_error_count)
	    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status);

	if (adev->umc.ras_funcs &&
	    adev->umc.ras_funcs->query_ras_error_address &&
	    adev->umc.max_ras_err_cnt_per_query) {
		/* one record per potential error this query can report */
		err_data->err_addr =
			kcalloc(adev->umc.max_ras_err_cnt_per_query,
				sizeof(struct eeprom_table_record), GFP_KERNEL);

		/* still call query_ras_error_address to clear error status
		 * even NOMEM error is encountered
		 */
		if(!err_data->err_addr)
			dev_warn(adev->dev, "Failed to alloc memory for "
					"umc error address record!\n");

		/* umc query_ras_error_address is also responsible for clearing
		 * error status
		 */
		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status);
	}

	/* only uncorrectable error needs gpu reset */
	if (err_data->ue_count) {
		dev_info(adev->dev, "%ld uncorrectable hardware errors "
				"detected in UMC block\n",
				err_data->ue_count);

		/* persist the faulting pages so they are reserved on the
		 * next boot; a threshold of 0 disables bad-page retirement
		 */
		if ((amdgpu_bad_page_threshold != 0) &&
			err_data->err_addr_cnt) {
			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
						err_data->err_addr_cnt);
			amdgpu_ras_save_bad_pages(adev);

			/* let the SMU know the total retired-page count */
			if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
				adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
		}

		amdgpu_ras_reset_gpu(adev);
	}

	/* err_addr was allocated above (or is NULL/untouched); free here,
	 * this callback owns the buffer for the duration of one query
	 */
	kfree(err_data->err_addr);
	return AMDGPU_RAS_SUCCESS;
}
 145
 146int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 147                struct amdgpu_irq_src *source,
 148                struct amdgpu_iv_entry *entry)
 149{
 150        struct ras_common_if *ras_if = adev->umc.ras_if;
 151        struct ras_dispatch_if ih_data = {
 152                .entry = entry,
 153        };
 154
 155        if (!ras_if)
 156                return 0;
 157
 158        ih_data.head = *ras_if;
 159
 160        amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 161        return 0;
 162}
 163