linux/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
<<
>>
Prefs
   1/*
   2 * Copyright 2019 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 *
  22 */
  23
  24#include "amdgpu_ras.h"
  25
  26int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
  27{
  28        int r;
  29        struct ras_fs_if fs_info = {
  30                .sysfs_name = "umc_err_count",
  31        };
  32        struct ras_ih_if ih_info = {
  33                .cb = amdgpu_umc_process_ras_data_cb,
  34        };
  35
  36        if (!adev->umc.ras_if) {
  37                adev->umc.ras_if =
  38                        kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
  39                if (!adev->umc.ras_if)
  40                        return -ENOMEM;
  41                adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
  42                adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
  43                adev->umc.ras_if->sub_block_index = 0;
  44                strcpy(adev->umc.ras_if->name, "umc");
  45        }
  46        ih_info.head = fs_info.head = *adev->umc.ras_if;
  47
  48        r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
  49                                 &fs_info, &ih_info);
  50        if (r)
  51                goto free;
  52
  53        if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
  54                r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
  55                if (r)
  56                        goto late_fini;
  57        } else {
  58                r = 0;
  59                goto free;
  60        }
  61
  62        /* ras init of specific umc version */
  63        if (adev->umc.funcs && adev->umc.funcs->err_cnt_init)
  64                adev->umc.funcs->err_cnt_init(adev);
  65
  66        return 0;
  67
  68late_fini:
  69        amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
  70free:
  71        kfree(adev->umc.ras_if);
  72        adev->umc.ras_if = NULL;
  73        return r;
  74}
  75
  76void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
  77{
  78        if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
  79                        adev->umc.ras_if) {
  80                struct ras_common_if *ras_if = adev->umc.ras_if;
  81                struct ras_ih_if ih_info = {
  82                        .head = *ras_if,
  83                        .cb = amdgpu_umc_process_ras_data_cb,
  84                };
  85
  86                amdgpu_ras_late_fini(adev, ras_if, &ih_info);
  87                kfree(ras_if);
  88        }
  89}
  90
  91int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
  92                void *ras_error_status,
  93                struct amdgpu_iv_entry *entry)
  94{
  95        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
  96
  97        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
  98        if (adev->umc.funcs &&
  99            adev->umc.funcs->query_ras_error_count)
 100            adev->umc.funcs->query_ras_error_count(adev, ras_error_status);
 101
 102        if (adev->umc.funcs &&
 103            adev->umc.funcs->query_ras_error_address &&
 104            adev->umc.max_ras_err_cnt_per_query) {
 105                err_data->err_addr =
 106                        kcalloc(adev->umc.max_ras_err_cnt_per_query,
 107                                sizeof(struct eeprom_table_record), GFP_KERNEL);
 108
 109                /* still call query_ras_error_address to clear error status
 110                 * even NOMEM error is encountered
 111                 */
 112                if(!err_data->err_addr)
 113                        dev_warn(adev->dev, "Failed to alloc memory for "
 114                                        "umc error address record!\n");
 115
 116                /* umc query_ras_error_address is also responsible for clearing
 117                 * error status
 118                 */
 119                adev->umc.funcs->query_ras_error_address(adev, ras_error_status);
 120        }
 121
 122        /* only uncorrectable error needs gpu reset */
 123        if (err_data->ue_count) {
 124                dev_info(adev->dev, "%ld uncorrectable hardware errors "
 125                                "detected in UMC block\n",
 126                                err_data->ue_count);
 127
 128                if ((amdgpu_bad_page_threshold != 0) &&
 129                        err_data->err_addr_cnt) {
 130                        amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
 131                                                err_data->err_addr_cnt);
 132                        amdgpu_ras_save_bad_pages(adev);
 133                }
 134
 135                amdgpu_ras_reset_gpu(adev);
 136        }
 137
 138        kfree(err_data->err_addr);
 139        return AMDGPU_RAS_SUCCESS;
 140}
 141
 142int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
 143                struct amdgpu_irq_src *source,
 144                struct amdgpu_iv_entry *entry)
 145{
 146        struct ras_common_if *ras_if = adev->umc.ras_if;
 147        struct ras_dispatch_if ih_data = {
 148                .entry = entry,
 149        };
 150
 151        if (!ras_if)
 152                return 0;
 153
 154        ih_data.head = *ras_if;
 155
 156        amdgpu_ras_interrupt_dispatch(adev, &ih_data);
 157        return 0;
 158}
 159