24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include <linux/uaccess.h>
28#include <linux/reboot.h>
29#include <linux/syscalls.h>
30
31#include "amdgpu.h"
32#include "amdgpu_ras.h"
33#include "amdgpu_atomfirmware.h"
34#include "amdgpu_xgmi.h"
35#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
36
37const char *ras_error_string[] = {
38 "none",
39 "parity",
40 "single_correctable",
41 "multi_uncorrectable",
42 "poison",
43};
44
45const char *ras_block_string[] = {
46 "umc",
47 "sdma",
48 "gfx",
49 "mmhub",
50 "athub",
51 "pcie_bif",
52 "hdp",
53 "xgmi_wafl",
54 "df",
55 "smn",
56 "sem",
57 "mp0",
58 "mp1",
59 "fuse",
60};
61
62#define ras_err_str(i) (ras_error_string[ffs(i)])
63#define ras_block_str(i) (ras_block_string[i])
64
65#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
66#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
67#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
68
69
70#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
71
72enum amdgpu_ras_retire_page_reservation {
73 AMDGPU_RAS_RETIRE_PAGE_RESERVED,
74 AMDGPU_RAS_RETIRE_PAGE_PENDING,
75 AMDGPU_RAS_RETIRE_PAGE_FAULT,
76};
77
78atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
79
80static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
81 uint64_t addr);
82
83static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
84 size_t size, loff_t *pos)
85{
86 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
87 struct ras_query_if info = {
88 .head = obj->head,
89 };
90 ssize_t s;
91 char val[128];
92
93 if (amdgpu_ras_error_query(obj->adev, &info))
94 return -EINVAL;
95
96 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
97 "ue", info.ue_count,
98 "ce", info.ce_count);
99 if (*pos >= s)
100 return 0;
101
102 s -= *pos;
103 s = min_t(u64, s, size);
104
105
106 if (copy_to_user(buf, &val[*pos], s))
107 return -EINVAL;
108
109 *pos += s;
110
111 return s;
112}
113
114static const struct file_operations amdgpu_ras_debugfs_ops = {
115 .owner = THIS_MODULE,
116 .read = amdgpu_ras_debugfs_read,
117 .write = NULL,
118 .llseek = default_llseek
119};
120
121static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
122{
123 int i;
124
125 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
126 *block_id = i;
127 if (strcmp(name, ras_block_str(i)) == 0)
128 return 0;
129 }
130 return -EINVAL;
131}
132
133static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
134 const char __user *buf, size_t size,
135 loff_t *pos, struct ras_debug_if *data)
136{
137 ssize_t s = min_t(u64, 64, size);
138 char str[65];
139 char block_name[33];
140 char err[9] = "ue";
141 int op = -1;
142 int block_id;
143 uint32_t sub_block;
144 u64 address, value;
145
146 if (*pos)
147 return -EINVAL;
148 *pos = size;
149
150 memset(str, 0, sizeof(str));
151 memset(data, 0, sizeof(*data));
152
153 if (copy_from_user(str, buf, s))
154 return -EINVAL;
155
156 if (sscanf(str, "disable %32s", block_name) == 1)
157 op = 0;
158 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
159 op = 1;
160 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
161 op = 2;
162 else if (str[0] && str[1] && str[2] && str[3])
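		/* ASCII input, but no command matched */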
		return -EINVAL;
165
166 if (op != -1) {
167 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
168 return -EINVAL;
169
170 data->head.block = block_id;
171
172 if (!memcmp("ue", err, 2))
173 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
174 else if (!memcmp("ce", err, 2))
175 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
176 else
177 return -EINVAL;
178
179 data->op = op;
180
181 if (op == 2) {
182 if (sscanf(str, "%*s %*s %*s %u %llu %llu",
183 &sub_block, &address, &value) != 3)
184 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
185 &sub_block, &address, &value) != 3)
186 return -EINVAL;
187 data->head.sub_block_index = sub_block;
188 data->inject.address = address;
189 data->inject.value = value;
190 }
191 } else {
192 if (size < sizeof(*data))
193 return -EINVAL;
194
195 if (copy_from_user(data, buf, sizeof(*data)))
196 return -EINVAL;
197 }
198
199 return 0;
200}
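
/*
 * AMDGPU RAS debugfs control interface (ras_ctrl)
 *
 * The node accepts either a struct ras_debug_if written as binary, or a text
 * command in one of the forms parsed above:
 *
 *	disable <block>
 *	enable  <block> <error_type>
 *	inject  <block> <error_type> <sub_block> <address> <value>
 *
 * <block> is one of the names in ras_block_string (umc, sdma, gfx, ...),
 * <error_type> is "ue" or "ce", and the inject operands may be given in
 * decimal or hex ("0x...").
 *
 * Illustrative usage, assuming DRM minor 0 (the actual debugfs path depends
 * on the system):
 *
 *	echo "enable umc ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0x0 0x0 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 */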
277static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
278 size_t size, loff_t *pos)
279{
280 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
281 struct ras_debug_if data;
282 int ret = 0;
283
284 if (amdgpu_ras_intr_triggered()) {
285 DRM_WARN("RAS WARN: error injection currently inaccessible\n");
286 return size;
287 }
288
289 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
290 if (ret)
291 return -EINVAL;
292
293 if (!amdgpu_ras_is_supported(adev, data.head.block))
294 return -EINVAL;
295
296 switch (data.op) {
297 case 0:
298 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
299 break;
300 case 1:
301 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
302 break;
303 case 2:
304 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
305 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
306 ret = -EINVAL;
307 break;
308 }
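
		/*
		 * Skip injection into a UMC page that has already been
		 * retired as bad; only warn in that case.
		 */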
311 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
312 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
313 DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
314 data.inject.address);
315 break;
316 }
317
318
319 ret = amdgpu_ras_error_inject(adev, &data.inject);
320 break;
321 default:
322 ret = -EINVAL;
323 break;
324 }
325
326 if (ret)
327 return -EINVAL;
328
329 return size;
330}
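
/*
 * ras_eeprom_reset: writing anything to this node calls
 * amdgpu_ras_eeprom_reset_table() and thus clears the bad-page table kept in
 * EEPROM.  Illustrative usage, assuming DRM minor 0:
 *
 *	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset
 */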
348static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
349 size_t size, loff_t *pos)
350{
351 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
352 int ret;
353
354 ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);
355
356 return ret == 1 ? size : -EIO;
357}
358
359static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
360 .owner = THIS_MODULE,
361 .read = NULL,
362 .write = amdgpu_ras_debugfs_ctrl_write,
363 .llseek = default_llseek
364};
365
366static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
367 .owner = THIS_MODULE,
368 .read = NULL,
369 .write = amdgpu_ras_debugfs_eeprom_write,
370 .llseek = default_llseek
371};
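
/*
 * RAS sysfs error count interface: each registered block exposes a read-only
 * attribute under the device's "ras" group whose name is supplied by the IP
 * block through ras_fs_if (for UMC this is typically a node such as
 * "umc_err_count"; the exact name is set by the caller).  Reading it returns
 * the accumulated counts in the form:
 *
 *	ue: <count>
 *	ce: <count>
 *
 * Illustrative usage:
 *
 *	cat /sys/class/drm/card0/device/ras/umc_err_count
 */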
394static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
395 struct device_attribute *attr, char *buf)
396{
397 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
398 struct ras_query_if info = {
399 .head = obj->head,
400 };
401
402 if (amdgpu_ras_intr_triggered())
403 return snprintf(buf, PAGE_SIZE,
404 "Query currently inaccessible\n");
405
406 if (amdgpu_ras_error_query(obj->adev, &info))
407 return -EINVAL;
408
409 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
410 "ue", info.ue_count,
411 "ce", info.ce_count);
412}
413
414
415
416#define get_obj(obj) do { (obj)->use++; } while (0)
417#define alive_obj(obj) ((obj)->use)
418
419static inline void put_obj(struct ras_manager *obj)
420{
421 if (obj && --obj->use == 0)
422 list_del(&obj->node);
423 if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
425 }
426}
427
428
429static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
430 struct ras_common_if *head)
431{
432 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
433 struct ras_manager *obj;
434
435 if (!con)
436 return NULL;
437
438 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
439 return NULL;
440
441 obj = &con->objs[head->block];
442
443 if (alive_obj(obj))
444 return NULL;
445
446 obj->head = *head;
447 obj->adev = adev;
448 list_add(&obj->node, &con->head);
449 get_obj(obj);
450
451 return obj;
452}
453
454
455struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
456 struct ras_common_if *head)
457{
458 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
459 struct ras_manager *obj;
460 int i;
461
462 if (!con)
463 return NULL;
464
465 if (head) {
466 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
467 return NULL;
468
469 obj = &con->objs[head->block];
470
471 if (alive_obj(obj)) {
472 WARN_ON(head->block != obj->head.block);
473 return obj;
474 }
475 } else {
476 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
477 obj = &con->objs[i];
478 if (alive_obj(obj)) {
479 WARN_ON(i != obj->head.block);
480 return obj;
481 }
482 }
483 }
484
485 return NULL;
486}
487
488
489
490static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
491 struct ras_common_if *head)
492{
493 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
494
495 return con->hw_supported & BIT(head->block);
496}
497
498static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
499 struct ras_common_if *head)
500{
501 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
502
503 return con->features & BIT(head->block);
504}
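
/*
 * Update only the driver-side state for one RAS block: create or release the
 * ras_manager object and toggle the bit in con->features.  The PSP RAS TA is
 * not contacted here; amdgpu_ras_feature_enable() is the variant that also
 * sends the enable/disable command to the TA.
 */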
510static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
511 struct ras_common_if *head, int enable)
512{
513 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
514 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
515
516
517
518
519
520
521
522 if (!amdgpu_ras_is_feature_allowed(adev, head))
523 return 0;
524 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
525 return 0;
526
527 if (enable) {
528 if (!obj) {
529 obj = amdgpu_ras_create_obj(adev, head);
530 if (!obj)
531 return -EINVAL;
532 } else {
533
534 get_obj(obj);
535 }
536 con->features |= BIT(head->block);
537 } else {
538 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
539 con->features &= ~BIT(head->block);
540 put_obj(obj);
541 }
542 }
543
544 return 0;
545}
546
547
548int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
549 struct ras_common_if *head, bool enable)
550{
551 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
552 union ta_ras_cmd_input info;
553 int ret;
554
555 if (!con)
556 return -EINVAL;
557
558 if (!enable) {
559 info.disable_features = (struct ta_ras_disable_features_input) {
560 .block_id = amdgpu_ras_block_to_ta(head->block),
561 .error_type = amdgpu_ras_error_to_ta(head->type),
562 };
563 } else {
564 info.enable_features = (struct ta_ras_enable_features_input) {
565 .block_id = amdgpu_ras_block_to_ta(head->block),
566 .error_type = amdgpu_ras_error_to_ta(head->type),
567 };
568 }
569
570
571 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
572
573 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
574 return 0;
575
576 if (!amdgpu_ras_intr_triggered()) {
577 ret = psp_ras_enable_features(&adev->psp, &info, enable);
578 if (ret) {
579 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
580 enable ? "enable":"disable",
581 ras_block_str(head->block),
582 ret);
583 if (ret == TA_RAS_STATUS__RESET_NEEDED)
584 return -EAGAIN;
585 return -EINVAL;
586 }
587 }
588
589
590 __amdgpu_ras_feature_enable(adev, head, enable);
591
592 return 0;
593}
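
/*
 * Only used during driver init/resume: when the vbios has already enabled
 * RAS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS), the TA features are assumed to be on,
 * so enabling here mainly needs to set up the driver-side object; a TA
 * failure with -EINVAL is tolerated for that reason.  Disabling first creates
 * the object, then asks the TA to turn the feature off.
 */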
596int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
597 struct ras_common_if *head, bool enable)
598{
599 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
600 int ret;
601
602 if (!con)
603 return -EINVAL;
604
605 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
606 if (enable) {
607
608
609
610
611
612
613 ret = amdgpu_ras_feature_enable(adev, head, 1);
614
615
616
617
618 if (ret == -EINVAL) {
619 ret = __amdgpu_ras_feature_enable(adev, head, 1);
620 if (!ret)
621 DRM_INFO("RAS INFO: %s setup object\n",
622 ras_block_str(head->block));
623 }
624 } else {
625
626 ret = __amdgpu_ras_feature_enable(adev, head, 1);
627 if (ret)
628 return ret;
629
630 ret = amdgpu_ras_feature_enable(adev, head, 0);
631 }
632 } else
633 ret = amdgpu_ras_feature_enable(adev, head, enable);
634
635 return ret;
636}
637
638static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
639 bool bypass)
640{
641 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
642 struct ras_manager *obj, *tmp;
643
644 list_for_each_entry_safe(obj, tmp, &con->head, node) {
645
646
647
648 if (bypass) {
649 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
650 break;
651 } else {
652 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
653 break;
654 }
655 }
656
657 return con->features;
658}
659
660static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
661 bool bypass)
662{
663 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
664 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
665 int i;
666 const enum amdgpu_ras_error_type default_ras_type =
667 AMDGPU_RAS_ERROR__NONE;
668
669 for (i = 0; i < ras_block_count; i++) {
670 struct ras_common_if head = {
671 .block = i,
672 .type = default_ras_type,
673 .sub_block_index = 0,
674 };
675 strcpy(head.name, ras_block_str(i));
676 if (bypass) {
677
678
679
680
681 if (__amdgpu_ras_feature_enable(adev, &head, 1))
682 break;
683 } else {
684 if (amdgpu_ras_feature_enable(adev, &head, 1))
685 break;
686 }
687 }
688
689 return con->features;
690}
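
/*
 * Query the error counters of one RAS block.  The per-IP query callbacks fill
 * a local ras_err_data; the new counts are accumulated into the ras_manager
 * and the running totals are returned through @info.
 */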
694int amdgpu_ras_error_query(struct amdgpu_device *adev,
695 struct ras_query_if *info)
696{
697 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
698 struct ras_err_data err_data = {0, 0, 0, NULL};
699 int i;
700
701 if (!obj)
702 return -EINVAL;
703
704 switch (info->head.block) {
705 case AMDGPU_RAS_BLOCK__UMC:
706 if (adev->umc.funcs->query_ras_error_count)
707 adev->umc.funcs->query_ras_error_count(adev, &err_data);
708
709
710
711 if (adev->umc.funcs->query_ras_error_address)
712 adev->umc.funcs->query_ras_error_address(adev, &err_data);
713 break;
714 case AMDGPU_RAS_BLOCK__SDMA:
715 if (adev->sdma.funcs->query_ras_error_count) {
716 for (i = 0; i < adev->sdma.num_instances; i++)
717 adev->sdma.funcs->query_ras_error_count(adev, i,
718 &err_data);
719 }
720 break;
721 case AMDGPU_RAS_BLOCK__GFX:
722 if (adev->gfx.funcs->query_ras_error_count)
723 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
724 break;
725 case AMDGPU_RAS_BLOCK__MMHUB:
726 if (adev->mmhub.funcs->query_ras_error_count)
727 adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
728 break;
729 case AMDGPU_RAS_BLOCK__PCIE_BIF:
730 if (adev->nbio.funcs->query_ras_error_count)
731 adev->nbio.funcs->query_ras_error_count(adev, &err_data);
732 break;
733 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
734 amdgpu_xgmi_query_ras_error_count(adev, &err_data);
735 break;
736 default:
737 break;
738 }
739
740 obj->err_data.ue_count += err_data.ue_count;
741 obj->err_data.ce_count += err_data.ce_count;
742
743 info->ue_count = obj->err_data.ue_count;
744 info->ce_count = obj->err_data.ce_count;
745
746 if (err_data.ce_count) {
747 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
748 obj->err_data.ce_count, ras_block_str(info->head.block));
749 }
750 if (err_data.ue_count) {
751 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
752 obj->err_data.ue_count, ras_block_str(info->head.block));
753 }
754
755 return 0;
756}
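
/*
 * Inject an error into one RAS block.  GFX injection goes through the GFX IP
 * callback; UMC, MMHUB, XGMI_WAFL and PCIE_BIF requests are forwarded to the
 * RAS TA.  On multi-node XGMI setups the address is first converted to the
 * device-relative physical address.
 */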
759int amdgpu_ras_error_inject(struct amdgpu_device *adev,
760 struct ras_inject_if *info)
761{
762 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
763 struct ta_ras_trigger_error_input block_info = {
764 .block_id = amdgpu_ras_block_to_ta(info->head.block),
765 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
766 .sub_block_index = info->head.sub_block_index,
767 .address = info->address,
768 .value = info->value,
769 };
770 int ret = 0;
771
772 if (!obj)
773 return -EINVAL;
774
775
776 if (adev->gmc.xgmi.num_physical_nodes > 1) {
777 block_info.address =
778 amdgpu_xgmi_get_relative_phy_addr(adev,
779 block_info.address);
780 }
781
782 switch (info->head.block) {
783 case AMDGPU_RAS_BLOCK__GFX:
784 if (adev->gfx.funcs->ras_error_inject)
785 ret = adev->gfx.funcs->ras_error_inject(adev, info);
786 else
787 ret = -EINVAL;
788 break;
789 case AMDGPU_RAS_BLOCK__UMC:
790 case AMDGPU_RAS_BLOCK__MMHUB:
791 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
792 case AMDGPU_RAS_BLOCK__PCIE_BIF:
793 ret = psp_ras_trigger_error(&adev->psp, &block_info);
794 break;
795 default:
796 DRM_INFO("%s error injection is not supported yet\n",
797 ras_block_str(info->head.block));
798 ret = -EINVAL;
799 }
800
801 if (ret)
802 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
803 ras_block_str(info->head.block),
804 ret);
805
806 return ret;
807}
808
809int amdgpu_ras_error_cure(struct amdgpu_device *adev,
810 struct ras_cure_if *info)
811{
812
813 return 0;
814}
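
/*
 * Return the total correctable (is_ce == true) or uncorrectable error count
 * summed over all blocks that currently have a RAS object.
 */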
817unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
818 bool is_ce)
819{
820 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
821 struct ras_manager *obj;
822 struct ras_err_data data = {0, 0};
823
824 if (!con)
825 return 0;
826
827 list_for_each_entry(obj, &con->head, node) {
828 struct ras_query_if info = {
829 .head = obj->head,
830 };
831
832 if (amdgpu_ras_error_query(adev, &info))
833 return 0;
834
835 data.ce_count += info.ce_count;
836 data.ue_count += info.ue_count;
837 }
838
839 return is_ce ? data.ce_count : data.ue_count;
840}
841
842
843
844
845
846static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
847 struct ras_badpage **bps, unsigned int *count);
848
849static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
850{
851 switch (flags) {
852 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
853 return "R";
854 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
855 return "P";
856 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
857 default:
858 return "F";
	}
}
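
/*
 * gpu_vram_bad_pages sysfs format: one line per retired page,
 *
 *	0x<page frame> : 0x<size> : <flag>
 *
 * where <flag> is "R" (reserved, not accessible), "P" (reservation pending)
 * or "F" (reservation failed).  An illustrative line:
 *
 *	0x00000001 : 0x00001000 : R
 */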
892static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
893 struct kobject *kobj, struct bin_attribute *attr,
894 char *buf, loff_t ppos, size_t count)
895{
896 struct amdgpu_ras *con =
897 container_of(attr, struct amdgpu_ras, badpages_attr);
898 struct amdgpu_device *adev = con->adev;
899 const unsigned int element_size =
900 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
901 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
902 unsigned int end = div64_ul(ppos + count - 1, element_size);
903 ssize_t s = 0;
904 struct ras_badpage *bps = NULL;
905 unsigned int bps_count = 0;
906
907 memset(buf, 0, count);
908
909 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
910 return 0;
911
912 for (; start < end && start < bps_count; start++)
913 s += scnprintf(&buf[s], element_size + 1,
914 "0x%08x : 0x%08x : %1s\n",
915 bps[start].bp,
916 bps[start].size,
917 amdgpu_ras_badpage_flags_str(bps[start].flags));
918
919 kfree(bps);
920
921 return s;
922}
923
924static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
925 struct device_attribute *attr, char *buf)
926{
927 struct amdgpu_ras *con =
928 container_of(attr, struct amdgpu_ras, features_attr);
929
930 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
931}
932
933static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
934{
935 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
936 struct attribute *attrs[] = {
937 &con->features_attr.attr,
938 NULL
939 };
940 struct bin_attribute *bin_attrs[] = {
941 &con->badpages_attr,
942 NULL
943 };
944 struct attribute_group group = {
945 .name = "ras",
946 .attrs = attrs,
947 .bin_attrs = bin_attrs,
948 };
949
950 con->features_attr = (struct device_attribute) {
951 .attr = {
952 .name = "features",
953 .mode = S_IRUGO,
954 },
955 .show = amdgpu_ras_sysfs_features_read,
956 };
957
958 con->badpages_attr = (struct bin_attribute) {
959 .attr = {
960 .name = "gpu_vram_bad_pages",
961 .mode = S_IRUGO,
962 },
963 .size = 0,
964 .private = NULL,
965 .read = amdgpu_ras_sysfs_badpages_read,
966 };
967
968 sysfs_attr_init(attrs[0]);
969 sysfs_bin_attr_init(bin_attrs[0]);
970
971 return sysfs_create_group(&adev->dev->kobj, &group);
972}
973
974static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
975{
976 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
977 struct attribute *attrs[] = {
978 &con->features_attr.attr,
979 NULL
980 };
981 struct bin_attribute *bin_attrs[] = {
982 &con->badpages_attr,
983 NULL
984 };
985 struct attribute_group group = {
986 .name = "ras",
987 .attrs = attrs,
988 .bin_attrs = bin_attrs,
989 };
990
991 sysfs_remove_group(&adev->dev->kobj, &group);
992
993 return 0;
994}
995
996int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
997 struct ras_fs_if *head)
998{
999 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1000
1001 if (!obj || obj->attr_inuse)
1002 return -EINVAL;
1003
1004 get_obj(obj);
1005
1006 memcpy(obj->fs_data.sysfs_name,
1007 head->sysfs_name,
1008 sizeof(obj->fs_data.sysfs_name));
1009
1010 obj->sysfs_attr = (struct device_attribute){
1011 .attr = {
1012 .name = obj->fs_data.sysfs_name,
1013 .mode = S_IRUGO,
1014 },
1015 .show = amdgpu_ras_sysfs_read,
1016 };
1017 sysfs_attr_init(&obj->sysfs_attr.attr);
1018
1019 if (sysfs_add_file_to_group(&adev->dev->kobj,
1020 &obj->sysfs_attr.attr,
1021 "ras")) {
1022 put_obj(obj);
1023 return -EINVAL;
1024 }
1025
1026 obj->attr_inuse = 1;
1027
1028 return 0;
1029}
1030
1031int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1032 struct ras_common_if *head)
1033{
1034 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1035
1036 if (!obj || !obj->attr_inuse)
1037 return -EINVAL;
1038
1039 sysfs_remove_file_from_group(&adev->dev->kobj,
1040 &obj->sysfs_attr.attr,
1041 "ras");
1042 obj->attr_inuse = 0;
1043 put_obj(obj);
1044
1045 return 0;
1046}
1047
1048static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1049{
1050 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1051 struct ras_manager *obj, *tmp;
1052
1053 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1054 amdgpu_ras_sysfs_remove(adev, &obj->head);
1055 }
1056
1057 amdgpu_ras_sysfs_remove_feature_node(adev);
1058
1059 return 0;
1060}
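
/* The debugfs nodes below are created under <debugfs>/dri/<minor>/ras/. */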
1082static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1083{
1084 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1085 struct drm_minor *minor = adev->ddev->primary;
1086
1087 con->dir = debugfs_create_dir("ras", minor->debugfs_root);
1088 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
1089 adev, &amdgpu_ras_debugfs_ctrl_ops);
1090 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
1091 adev, &amdgpu_ras_debugfs_eeprom_ops);
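
	/*
	 * auto_reboot: when set, the fatal-error interrupt path is expected to
	 * reboot the machine directly instead of scheduling the normal GPU
	 * recovery.  The flag is only created here; it is consumed elsewhere.
	 */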
1101 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
1102 &con->reboot);
1103}
1104
1105void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1106 struct ras_fs_if *head)
1107{
1108 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1109 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1110
1111 if (!obj || obj->ent)
1112 return;
1113
1114 get_obj(obj);
1115
1116 memcpy(obj->fs_data.debugfs_name,
1117 head->debugfs_name,
1118 sizeof(obj->fs_data.debugfs_name));
1119
1120 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
1121 S_IWUGO | S_IRUGO, con->dir, obj,
1122 &amdgpu_ras_debugfs_ops);
1123}
1124
1125void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1126{
1127 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1128 struct ras_manager *obj;
1129 struct ras_fs_if fs_info;
1130
1131
1132
1133
1134
1135 if (!con)
1136 return;
1137
1138 amdgpu_ras_debugfs_create_ctrl_node(adev);
1139
1140 list_for_each_entry(obj, &con->head, node) {
1141 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1142 (obj->attr_inuse == 1)) {
1143 sprintf(fs_info.debugfs_name, "%s_err_inject",
1144 ras_block_str(obj->head.block));
1145 fs_info.head = obj->head;
1146 amdgpu_ras_debugfs_create(adev, &fs_info);
1147 }
1148 }
1149}
1150
1151void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
1152 struct ras_common_if *head)
1153{
1154 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1155
1156 if (!obj || !obj->ent)
1157 return;
1158
1159 debugfs_remove(obj->ent);
1160 obj->ent = NULL;
1161 put_obj(obj);
1162}
1163
1164static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
1165{
1166 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1167 struct ras_manager *obj, *tmp;
1168
1169 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1170 amdgpu_ras_debugfs_remove(adev, &obj->head);
1171 }
1172
1173 debugfs_remove_recursive(con->dir);
1174 con->dir = NULL;
1175}
1176
1177
1178
1179
1180static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1181{
1182 amdgpu_ras_sysfs_create_feature_node(adev);
1183
1184 return 0;
1185}
1186
1187static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1188{
1189 amdgpu_ras_debugfs_remove_all(adev);
1190 amdgpu_ras_sysfs_remove_all(adev);
1191 return 0;
1192}
1193
1194
1195
1196static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1197{
1198 struct ras_ih_data *data = &obj->ih_data;
1199 struct amdgpu_iv_entry entry;
1200 int ret;
1201 struct ras_err_data err_data = {0, 0, 0, NULL};
1202
1203 while (data->rptr != data->wptr) {
1204 rmb();
1205 memcpy(&entry, &data->ring[data->rptr],
1206 data->element_size);
1207
1208 wmb();
1209 data->rptr = (data->aligned_element_size +
1210 data->rptr) % data->ring_size;
1211
1212
1213
1214
1215 if (data->cb) {
1216 ret = data->cb(obj->adev, &err_data, &entry);
1217
1218
1219
1220
1221
1222 if (ret == AMDGPU_RAS_SUCCESS) {
1223
1224
1225
1226 obj->err_data.ue_count += err_data.ue_count;
1227 obj->err_data.ce_count += err_data.ce_count;
1228 }
1229 }
1230 }
1231}
1232
1233static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1234{
1235 struct ras_ih_data *data =
1236 container_of(work, struct ras_ih_data, ih_work);
1237 struct ras_manager *obj =
1238 container_of(data, struct ras_manager, ih_data);
1239
1240 amdgpu_ras_interrupt_handler(obj);
1241}
1242
1243int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1244 struct ras_dispatch_if *info)
1245{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
1251
1252 if (data->inuse == 0)
1253 return 0;
1254
1255
1256 memcpy(&data->ring[data->wptr], info->entry,
1257 data->element_size);
1258
1259 wmb();
1260 data->wptr = (data->aligned_element_size +
1261 data->wptr) % data->ring_size;
1262
1263 schedule_work(&data->ih_work);
1264
1265 return 0;
1266}
1267
1268int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1269 struct ras_ih_if *info)
1270{
1271 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1272 struct ras_ih_data *data;
1273
1274 if (!obj)
1275 return -EINVAL;
1276
1277 data = &obj->ih_data;
1278 if (data->inuse == 0)
1279 return 0;
1280
1281 cancel_work_sync(&data->ih_work);
1282
1283 kfree(data->ring);
1284 memset(data, 0, sizeof(*data));
1285 put_obj(obj);
1286
1287 return 0;
1288}
1289
1290int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1291 struct ras_ih_if *info)
1292{
1293 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1294 struct ras_ih_data *data;
1295
1296 if (!obj) {
1297
1298 obj = amdgpu_ras_create_obj(adev, &info->head);
1299 if (!obj)
1300 return -EINVAL;
1301 } else
1302 get_obj(obj);
1303
1304 data = &obj->ih_data;
1305
1306 *data = (struct ras_ih_data) {
1307 .inuse = 0,
1308 .cb = info->cb,
1309 .element_size = sizeof(struct amdgpu_iv_entry),
1310 .rptr = 0,
1311 .wptr = 0,
1312 };
1313
1314 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1315
1316 data->aligned_element_size = ALIGN(data->element_size, 8);
1317
1318 data->ring_size = 64 * data->aligned_element_size;
1319 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1320 if (!data->ring) {
1321 put_obj(obj);
1322 return -ENOMEM;
1323 }
1324
1325
1326 data->inuse = 1;
1327
1328 return 0;
1329}
1330
1331static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1332{
1333 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1334 struct ras_manager *obj, *tmp;
1335
1336 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1337 struct ras_ih_if info = {
1338 .head = obj->head,
1339 };
1340 amdgpu_ras_interrupt_remove_handler(adev, &info);
1341 }
1342
1343 return 0;
1344}
1345
1346
1347
1348static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1349{
1350 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1351 struct ras_manager *obj;
1352
1353 if (!con)
1354 return;
1355
1356 list_for_each_entry(obj, &con->head, node) {
1357 struct ras_query_if info = {
1358 .head = obj->head,
1359 };
1360
1361
1362
1363
1364
1365
1366
1367 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1368 continue;
1369
1370 amdgpu_ras_error_query(adev, &info);
1371 }
1372}
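
/*
 * Build an array describing every retired page for the sysfs reader above.
 * On success the caller owns *bps and must kfree() it; each entry carries the
 * page frame, its size and a reservation-status flag.
 */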
1379static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1380 struct ras_badpage **bps, unsigned int *count)
1381{
1382 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1383 struct ras_err_handler_data *data;
1384 int i = 0;
1385 int ret = 0;
1386
1387 if (!con || !con->eh_data || !bps || !count)
1388 return -EINVAL;
1389
1390 mutex_lock(&con->recovery_lock);
1391 data = con->eh_data;
1392 if (!data || data->count == 0) {
1393 *bps = NULL;
1394 ret = -EINVAL;
1395 goto out;
1396 }
1397
1398 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1399 if (!*bps) {
1400 ret = -ENOMEM;
1401 goto out;
1402 }
1403
1404 for (; i < data->count; i++) {
1405 (*bps)[i] = (struct ras_badpage){
1406 .bp = data->bps[i].retired_page,
1407 .size = AMDGPU_GPU_PAGE_SIZE,
1408 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1409 };
1410
1411 if (data->last_reserved <= i)
1412 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1413 else if (data->bps_bo[i] == NULL)
1414 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1415 }
1416
1417 *count = data->count;
1418out:
1419 mutex_unlock(&con->recovery_lock);
1420 return ret;
1421}
1422
1423static void amdgpu_ras_do_recovery(struct work_struct *work)
1424{
1425 struct amdgpu_ras *ras =
1426 container_of(work, struct amdgpu_ras, recovery_work);
1427 struct amdgpu_device *remote_adev = NULL;
1428 struct amdgpu_device *adev = ras->adev;
1429 struct list_head device_list, *device_list_handle = NULL;
1430 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
1431
1432
	/* Build the list of devices whose error counters are logged below. */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
		device_list_handle = &hive->device_list;
	} else {
		INIT_LIST_HEAD(&device_list);
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}
1439
1440 list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
1441 amdgpu_ras_log_on_err_counter(remote_adev);
1442 }
1443
	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, NULL);
1446 atomic_set(&ras->in_recovery, 0);
1447}
1448
1449
1450static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1451 struct ras_err_handler_data *data, int pages)
1452{
1453 unsigned int old_space = data->count + data->space_left;
1454 unsigned int new_space = old_space + pages;
1455 unsigned int align_space = ALIGN(new_space, 512);
1456 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1457 struct amdgpu_bo **bps_bo =
1458 kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
1459
1460 if (!bps || !bps_bo) {
1461 kfree(bps);
1462 kfree(bps_bo);
1463 return -ENOMEM;
1464 }
1465
1466 if (data->bps) {
1467 memcpy(bps, data->bps,
1468 data->count * sizeof(*data->bps));
1469 kfree(data->bps);
1470 }
1471 if (data->bps_bo) {
1472 memcpy(bps_bo, data->bps_bo,
1473 data->count * sizeof(*data->bps_bo));
1474 kfree(data->bps_bo);
1475 }
1476
1477 data->bps = bps;
1478 data->bps_bo = bps_bo;
1479 data->space_left += align_space - old_space;
1480 return 0;
1481}
1482
1483
1484int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1485 struct eeprom_table_record *bps, int pages)
1486{
1487 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1488 struct ras_err_handler_data *data;
1489 int ret = 0;
1490
1491 if (!con || !con->eh_data || !bps || pages <= 0)
1492 return 0;
1493
1494 mutex_lock(&con->recovery_lock);
1495 data = con->eh_data;
1496 if (!data)
1497 goto out;
1498
1499 if (data->space_left <= pages)
1500 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1501 ret = -ENOMEM;
1502 goto out;
1503 }
1504
1505 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
1506 data->count += pages;
1507 data->space_left -= pages;
1508
1509out:
1510 mutex_unlock(&con->recovery_lock);
1511
1512 return ret;
1513}
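
/*
 * Write the not-yet-saved bad-page records to the EEPROM table.  Meant to be
 * called with con->recovery_lock held (see amdgpu_ras_reserve_bad_pages()).
 */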
1519static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1520{
1521 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1522 struct ras_err_handler_data *data;
1523 struct amdgpu_ras_eeprom_control *control;
1524 int save_count;
1525
1526 if (!con || !con->eh_data)
1527 return 0;
1528
1529 control = &con->eeprom_control;
1530 data = con->eh_data;
1531 save_count = data->count - control->num_recs;
1532
1533 if (save_count > 0)
1534 if (amdgpu_ras_eeprom_process_recods(control,
1535 &data->bps[control->num_recs],
1536 true,
1537 save_count)) {
1538 DRM_ERROR("Failed to save EEPROM table data!");
1539 return -EIO;
1540 }
1541
1542 return 0;
1543}
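
/*
 * Read the bad-page records already stored in EEPROM and append them to the
 * driver's list via amdgpu_ras_add_bad_pages().
 */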
1549static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1550{
1551 struct amdgpu_ras_eeprom_control *control =
1552 &adev->psp.ras.ras->eeprom_control;
1553 struct eeprom_table_record *bps = NULL;
1554 int ret = 0;
1555
1556
1557 if (!control->num_recs)
1558 return ret;
1559
1560 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
1561 if (!bps)
1562 return -ENOMEM;
1563
1564 if (amdgpu_ras_eeprom_process_recods(control, bps, false,
1565 control->num_recs)) {
1566 DRM_ERROR("Failed to load EEPROM table records!");
1567 ret = -EIO;
1568 goto out;
1569 }
1570
1571 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
1572
1573out:
1574 kfree(bps);
1575 return ret;
1576}
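
/*
 * Check whether an address falls on a page that has already been retired.
 * Used, for example, to refuse error injection into known-bad UMC pages.
 */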
1583static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1584 uint64_t addr)
1585{
1586 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1587 struct ras_err_handler_data *data;
1588 int i;
1589 bool ret = false;
1590
1591 if (!con || !con->eh_data)
1592 return ret;
1593
1594 mutex_lock(&con->recovery_lock);
1595 data = con->eh_data;
1596 if (!data)
1597 goto out;
1598
1599 addr >>= AMDGPU_GPU_PAGE_SHIFT;
1600 for (i = 0; i < data->count; i++)
1601 if (addr == data->bps[i].retired_page) {
1602 ret = true;
1603 goto out;
1604 }
1605
1606out:
1607 mutex_unlock(&con->recovery_lock);
1608 return ret;
1609}
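
/*
 * Reserve the VRAM backing every not-yet-reserved bad page, then persist the
 * new records to EEPROM via amdgpu_ras_save_bad_pages().
 */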
1612int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1613{
1614 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1615 struct ras_err_handler_data *data;
1616 uint64_t bp;
1617 struct amdgpu_bo *bo = NULL;
1618 int i, ret = 0;
1619
1620 if (!con || !con->eh_data)
1621 return 0;
1622
1623 mutex_lock(&con->recovery_lock);
1624 data = con->eh_data;
1625 if (!data)
1626 goto out;
1627
1628 for (i = data->last_reserved; i < data->count; i++) {
1629 bp = data->bps[i].retired_page;
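
		/*
		 * Reservation may fail; only warn in that case.  bps_bo[i] is
		 * then left NULL and the page shows up as "F" (faulted) in
		 * the gpu_vram_bad_pages listing.
		 */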
1636 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
1637 AMDGPU_GPU_PAGE_SIZE,
1638 AMDGPU_GEM_DOMAIN_VRAM,
1639 &bo, NULL))
1640 DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);
1641
1642 data->bps_bo[i] = bo;
1643 data->last_reserved = i + 1;
1644 bo = NULL;
1645 }
1646
1647
1648 ret = amdgpu_ras_save_bad_pages(adev);
1649out:
1650 mutex_unlock(&con->recovery_lock);
1651 return ret;
1652}
1653
1654
1655static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1656{
1657 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1658 struct ras_err_handler_data *data;
1659 struct amdgpu_bo *bo;
1660 int i;
1661
1662 if (!con || !con->eh_data)
1663 return 0;
1664
1665 mutex_lock(&con->recovery_lock);
1666 data = con->eh_data;
1667 if (!data)
1668 goto out;
1669
1670 for (i = data->last_reserved - 1; i >= 0; i--) {
1671 bo = data->bps_bo[i];
1672
1673 amdgpu_bo_free_kernel(&bo, NULL, NULL);
1674
1675 data->bps_bo[i] = bo;
1676 data->last_reserved = i;
1677 }
1678out:
1679 mutex_unlock(&con->recovery_lock);
1680 return 0;
1681}
1682
1683int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1684{
1685 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1686 struct ras_err_handler_data **data;
1687 int ret;
1688
1689 if (con)
1690 data = &con->eh_data;
1691 else
1692 return 0;
1693
1694 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
1695 if (!*data) {
1696 ret = -ENOMEM;
1697 goto out;
1698 }
1699
1700 mutex_init(&con->recovery_lock);
1701 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1702 atomic_set(&con->in_recovery, 0);
1703 con->adev = adev;
1704
1705 ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
1706 if (ret)
1707 goto free;
1708
1709 if (con->eeprom_control.num_recs) {
1710 ret = amdgpu_ras_load_bad_pages(adev);
1711 if (ret)
1712 goto free;
1713 ret = amdgpu_ras_reserve_bad_pages(adev);
1714 if (ret)
1715 goto release;
1716 }
1717
1718 return 0;
1719
1720release:
1721 amdgpu_ras_release_bad_pages(adev);
1722free:
1723 kfree((*data)->bps);
1724 kfree((*data)->bps_bo);
1725 kfree(*data);
1726 con->eh_data = NULL;
1727out:
1728 DRM_WARN("Failed to initialize ras recovery!\n");
1729
1730 return ret;
1731}
1732
1733static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1734{
1735 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1736 struct ras_err_handler_data *data = con->eh_data;
1737
1738
1739 if (!data)
1740 return 0;
1741
1742 cancel_work_sync(&con->recovery_work);
1743 amdgpu_ras_release_bad_pages(adev);
1744
1745 mutex_lock(&con->recovery_lock);
1746 con->eh_data = NULL;
1747 kfree(data->bps);
1748 kfree(data->bps_bo);
1749 kfree(data);
1750 mutex_unlock(&con->recovery_lock);
1751
1752 return 0;
1753}
1754
1755
1756
1757int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1758 unsigned int block)
1759{
1760 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1761
1762 if (!ras)
1763 return -EINVAL;
1764
1765 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1766 return 0;
1767}
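
/*
 * Probe what the hardware/firmware reports as RAS capable (*hw_supported) and
 * then apply the amdgpu_ras_enable and amdgpu_ras_mask module parameters on
 * top of it to form the effective *supported mask.
 */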
1778static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1779 uint32_t *hw_supported, uint32_t *supported)
1780{
1781 *hw_supported = 0;
1782 *supported = 0;
1783
1784 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
1785 (adev->asic_type != CHIP_VEGA20 &&
1786 adev->asic_type != CHIP_ARCTURUS))
1787 return;
1788
1789 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
1790 DRM_INFO("HBM ECC is active.\n");
1791 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
1792 1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("HBM ECC is not present.\n");
1795
1796 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
1797 DRM_INFO("SRAM ECC is active.\n");
1798 *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1799 1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("SRAM ECC is not present.\n");
1802
1803
1804 *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
1805
1806 *supported = amdgpu_ras_enable == 0 ?
1807 0 : *hw_supported & amdgpu_ras_mask;
1808}
1809
1810int amdgpu_ras_init(struct amdgpu_device *adev)
1811{
1812 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1813 int r;
1814
1815 if (con)
1816 return 0;
1817
1818 con = kmalloc(sizeof(struct amdgpu_ras) +
1819 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1820 GFP_KERNEL|__GFP_ZERO);
1821 if (!con)
1822 return -ENOMEM;
1823
1824 con->objs = (struct ras_manager *)(con + 1);
1825
1826 amdgpu_ras_set_context(adev, con);
1827
1828 amdgpu_ras_check_supported(adev, &con->hw_supported,
1829 &con->supported);
1830 if (!con->hw_supported) {
1831 amdgpu_ras_set_context(adev, NULL);
1832 kfree(con);
1833 return 0;
1834 }
1835
1836 con->features = 0;
1837 INIT_LIST_HEAD(&con->head);
1838
1839 con->flags = RAS_DEFAULT_FLAGS;
1840
1841 if (adev->nbio.funcs->init_ras_controller_interrupt) {
1842 r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
1843 if (r)
1844 return r;
1845 }
1846
1847 if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
1848 r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
1849 if (r)
1850 return r;
1851 }
1852
1853 amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1854
1855 if (amdgpu_ras_fs_init(adev))
1856 goto fs_out;
1857
1858 DRM_INFO("RAS INFO: ras initialized successfully, "
1859 "hardware ability[%x] ras_mask[%x]\n",
1860 con->hw_supported, con->supported);
1861 return 0;
1862fs_out:
1863 amdgpu_ras_set_context(adev, NULL);
1864 kfree(con);
1865
1866 return -EINVAL;
1867}
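
/*
 * Common helper for IP-block late init: enable the RAS feature, register the
 * optional interrupt handler and create the sysfs node.  -EAGAIN from the TA
 * is translated into a reset request on boot rather than a failure, and an
 * unsupported block is simply disabled.
 */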
1870int amdgpu_ras_late_init(struct amdgpu_device *adev,
1871 struct ras_common_if *ras_block,
1872 struct ras_fs_if *fs_info,
1873 struct ras_ih_if *ih_info)
1874{
1875 int r;
1876
1877
1878 if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
1879 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
1880 return 0;
1881 }
1882
1883 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
1884 if (r) {
1885 if (r == -EAGAIN) {
1886
1887 amdgpu_ras_request_reset_on_boot(adev,
1888 ras_block->block);
1889 return 0;
1890 } else if (adev->in_suspend || adev->in_gpu_reset) {
1891
1892
1893 goto cleanup;
1894 } else
1895 return r;
1896 }
1897
1898
1899 if (adev->in_suspend || adev->in_gpu_reset)
1900 return 0;
1901
1902 if (ih_info->cb) {
1903 r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
1904 if (r)
1905 goto interrupt;
1906 }
1907
1908 r = amdgpu_ras_sysfs_create(adev, fs_info);
1909 if (r)
1910 goto sysfs;
1911
1912 return 0;
1913cleanup:
1914 amdgpu_ras_sysfs_remove(adev, ras_block);
1915sysfs:
1916 if (ih_info->cb)
1917 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
1918interrupt:
1919 amdgpu_ras_feature_enable(adev, ras_block, 0);
1920 return r;
1921}
1922
1923
1924void amdgpu_ras_late_fini(struct amdgpu_device *adev,
1925 struct ras_common_if *ras_block,
1926 struct ras_ih_if *ih_info)
1927{
1928 if (!ras_block || !ih_info)
1929 return;
1930
1931 amdgpu_ras_sysfs_remove(adev, ras_block);
1932 if (ih_info->cb)
1933 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
1934 amdgpu_ras_feature_enable(adev, ras_block, 0);
1935}
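
/*
 * Called on boot, resume and after GPU reset.  With vbios-enabled RAS all
 * features are first marked enabled on the driver side, then the blocks the
 * driver does not support are turned back off; a pending INIT_NEED_RESET
 * request triggers a GPU reset here.
 */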
1940void amdgpu_ras_resume(struct amdgpu_device *adev)
1941{
1942 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1943 struct ras_manager *obj, *tmp;
1944
1945 if (!con)
1946 return;
1947
1948 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
1949
1950
1951
1952
1953
1954 amdgpu_ras_enable_all_features(adev, 1);
1955
1956
1957
1958
1959
1960 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1961 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1962 amdgpu_ras_feature_enable(adev, &obj->head, 0);
1963
1964 WARN_ON(alive_obj(obj));
1965 }
1966 }
1967 }
1968
1969 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1970 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1971
1972
1973
1974
1975
1976
1977
1978 amdgpu_ras_disable_all_features(adev, 1);
1979 amdgpu_ras_reset_gpu(adev);
1980 }
1981}
1982
1983void amdgpu_ras_suspend(struct amdgpu_device *adev)
1984{
1985 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1986
1987 if (!con)
1988 return;
1989
1990 amdgpu_ras_disable_all_features(adev, 0);
1991
1992 if (con->features)
1993 amdgpu_ras_disable_all_features(adev, 1);
1994}
1995
1996
1997int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1998{
1999 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2000
2001 if (!con)
2002 return 0;
2003
2004
2005 amdgpu_ras_disable_all_features(adev, 0);
2006 amdgpu_ras_recovery_fini(adev);
2007 return 0;
2008}
2009
2010int amdgpu_ras_fini(struct amdgpu_device *adev)
2011{
2012 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2013
2014 if (!con)
2015 return 0;
2016
2017 amdgpu_ras_fs_fini(adev);
2018 amdgpu_ras_interrupt_remove_all(adev);
2019
2020 WARN(con->features, "Feature mask is not cleared");
2021
2022 if (con->features)
2023 amdgpu_ras_disable_all_features(adev, 1);
2024
2025 amdgpu_ras_set_context(adev, NULL);
2026 kfree(con);
2027
2028 return 0;
2029}
2030
2031void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
2032{
2033 uint32_t hw_supported, supported;
2034
2035 amdgpu_ras_check_supported(adev, &hw_supported, &supported);
2036 if (!hw_supported)
2037 return;
2038
2039 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
2040 DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
2041
2042 amdgpu_ras_reset_gpu(adev);
2043 }
2044}
2045