linux/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

struct ras_ih_data {
        /* interrupt bottom half */
        struct work_struct ih_work;
        int inuse;
        /* IP callback */
        ras_ih_cb cb;
        /* full of entries */
        unsigned char *ring;
        unsigned int ring_size;
        unsigned int element_size;
        unsigned int aligned_element_size;
        unsigned int rptr;
        unsigned int wptr;
};

struct ras_fs_data {
        char sysfs_name[32];
        char debugfs_name[32];
};

struct ras_err_data {
        unsigned long ue_count;
        unsigned long ce_count;
};

struct ras_err_handler_data {
        /* points to the bad page array */
        struct {
                unsigned long bp;
                struct amdgpu_bo *bo;
        } *bps;
        /* the count of entries */
        int count;
        /* the space left for placing new entries */
        int space_left;
        /* last reserved entry's index + 1 */
        int last_reserved;
};

struct ras_manager {
        struct ras_common_if head;
        /* reference count */
        int use;
        /* ras block link */
        struct list_head node;
        /* the device */
        struct amdgpu_device *adev;
        /* debugfs */
        struct dentry *ent;
        /* sysfs */
        struct device_attribute sysfs_attr;
        int attr_inuse;

        /* fs node name */
        struct ras_fs_data fs_data;

        /* IH data */
        struct ras_ih_data ih_data;

        struct ras_err_data err_data;
};

const char *ras_error_string[] = {
        "none",
        "parity",
        "single_correctable",
        "multi_uncorrectable",
        "poison",
};

const char *ras_block_string[] = {
        "umc",
        "sdma",
        "gfx",
        "mmhub",
        "athub",
        "pcie_bif",
        "hdp",
        "xgmi_wafl",
        "df",
        "smn",
        "sem",
        "mp0",
        "mp1",
        "fuse",
};

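/*
 * Note: the error type (AMDGPU_RAS_ERROR__*) is a bit mask, so ffs() maps it
 * to an index into ras_error_string[] above; ffs(0) == 0 selects "none".
 */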
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
        /* TODO */
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
                                        size_t size, loff_t *pos)
{
        struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
        struct ras_query_if info = {
                .head = obj->head,
        };
        ssize_t s;
        char val[128];

        if (amdgpu_ras_error_query(obj->adev, &info))
                return -EINVAL;

        s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
                        "ue", info.ue_count,
                        "ce", info.ce_count);
        if (*pos >= s)
                return 0;

        s -= *pos;
        s = min_t(u64, s, size);

        if (copy_to_user(buf, &val[*pos], s))
                return -EINVAL;

        *pos += s;

        return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
        .owner = THIS_MODULE,
        .read = amdgpu_ras_debugfs_read,
        .write = NULL,
        .llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
                *block_id = i;
                if (strcmp(name, ras_block_str(i)) == 0)
                        return 0;
        }
        return -EINVAL;
}

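/*
 * Parse the control node input. Two formats are accepted: an ASCII command
 * ("disable <block>", "enable <block> <error>" or
 * "inject <block> <error> <address> <value>"), or a raw binary
 * struct ras_debug_if as written by a program.
 */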
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                const char __user *buf, size_t size,
                loff_t *pos, struct ras_debug_if *data)
{
        ssize_t s = min_t(u64, 64, size);
        char str[65];
        char block_name[33];
        char err[9] = "ue";
        int op = -1;
        int block_id;
        u64 address, value;

        if (*pos)
                return -EINVAL;
        *pos = size;

        memset(str, 0, sizeof(str));
        memset(data, 0, sizeof(*data));

        if (copy_from_user(str, buf, s))
                return -EINVAL;

        if (sscanf(str, "disable %32s", block_name) == 1)
                op = 0;
        else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
                op = 1;
        else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
                op = 2;
        else if (str[0] && str[1] && str[2] && str[3])
                /* ASCII string, but the command did not match. */
                return -EINVAL;

        if (op != -1) {
                if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
                        return -EINVAL;

                data->head.block = block_id;
                data->head.type = memcmp("ue", err, 2) == 0 ?
                        AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
                        AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
                data->op = op;

                if (op == 2) {
                        if (sscanf(str, "%*s %*s %*s %llu %llu",
                                                &address, &value) != 2)
                                if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
                                                        &address, &value) != 2)
                                        return -EINVAL;
                        data->inject.address = address;
                        data->inject.value = value;
                }
        } else {
                if (size < sizeof(*data))
                        return -EINVAL;

                if (copy_from_user(data, buf, sizeof(*data)))
                        return -EINVAL;
        }

        return 0;
}
/*
 * DOC: ras debugfs control interface
 *
 * It accepts a struct ras_debug_if which has two members.
 *
 * First member: ras_debug_if::head or ras_debug_if::inject.
 *
 * head is used to indicate which IP block will be under control.
 *
 * head has four members, they are block, type, sub_block_index, name.
 * block: which IP will be under control.
 * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents. say, GFX, SDMA.
 * name: the name of the IP.
 *
 * inject has two more members than head, they are address, value.
 * As their names indicate, inject operation will write the
 * value to the address.
 *
 * Second member: struct ras_debug_if::op.
 * It has three kinds of operations.
 *  0: disable RAS on the block. Take ::head as its data.
 *  1: enable RAS on the block. Take ::head as its data.
 *  2: inject errors on the block. Take ::inject as its data.
 *
 * How to use the interface?
 * programs:
 * copy the struct ras_debug_if in your code and initialize it.
 * write the struct to the control node.
 *
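 * A minimal userspace sketch (illustrative only: the debugfs path may
 * differ per card, and struct ras_debug_if must be copied verbatim from
 * amdgpu_ras.h so the layout matches the kernel's):
 *
 *      struct ras_debug_if data = { 0 };
 *      int fd;
 *
 *      data.op = 1;                       // 1 == enable
 *      data.head.block = AMDGPU_RAS_BLOCK__UMC;
 *      data.head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
 *      fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);
 *      if (fd >= 0) {
 *              write(fd, &data, sizeof(data));
 *              close(fd);
 *      }
 *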
 * bash:
 * echo op block [error [address value]] > .../ras/ras_ctrl
 *      op: disable, enable, inject
 *              disable: only block is needed
 *              enable: block and error are needed
 *              inject: block, error, address and value are needed
 *      block: umc, sdma, gfx, .........
 *              see ras_block_string[] for details
 *      error: ue, ce
 *              ue: multi_uncorrectable
 *              ce: single_correctable
 *
 * here are some examples for bash commands,
 *      echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *      echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *      echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * How to check the result?
 *
 * For disable/enable, please check ras features at
 * /sys/class/drm/card[0/1/2...]/device/ras/features
 *
 * For inject, please check the corresponding err count at
 * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 *
 * NOTE: operation is only allowed on blocks which are supported.
 * Please check the ras mask at /sys/module/amdgpu/parameters/ras_mask
 */
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
                size_t size, loff_t *pos)
{
        struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
        struct ras_debug_if data;
        int ret = 0;

        ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
        if (ret)
                return -EINVAL;

        if (!amdgpu_ras_is_supported(adev, data.head.block))
                return -EINVAL;

        switch (data.op) {
        case 0:
                ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
                break;
        case 1:
                ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
                break;
        case 2:
                ret = amdgpu_ras_error_inject(adev, &data.inject);
                break;
        default:
                ret = -EINVAL;
                break;
        }

        if (ret)
                return -EINVAL;

        return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
        .owner = THIS_MODULE,
        .read = NULL,
        .write = amdgpu_ras_debugfs_ctrl_write,
        .llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
        struct ras_query_if info = {
                .head = obj->head,
        };

        if (amdgpu_ras_error_query(obj->adev, &info))
                return -EINVAL;

        return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
                        "ue", info.ue_count,
                        "ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
        if (obj && --obj->use == 0)
                list_del(&obj->node);
        if (obj && obj->use < 0) {
                DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
        }
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;

        if (!con)
                return NULL;

        if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                return NULL;

        obj = &con->objs[head->block];
        /* already exists. return obj? */
        if (alive_obj(obj))
                return NULL;

        obj->head = *head;
        obj->adev = adev;
        list_add(&obj->node, &con->head);
        get_obj(obj);

        return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
        int i;

        if (!con)
                return NULL;

        if (head) {
                if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
                        return NULL;

                obj = &con->objs[head->block];

                if (alive_obj(obj)) {
                        WARN_ON(head->block != obj->head.block);
                        return obj;
                }
        } else {
                for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
                        obj = &con->objs[i];
                        if (alive_obj(obj)) {
                                WARN_ON(i != obj->head.block);
                                return obj;
                        }
                }
        }

        return NULL;
}
/* obj end */

/* feature ctl begin */
static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        return con->features & BIT(head->block);
}

/*
 * If the obj is not created yet, create one.
 * Set the feature enable flag.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
                struct ras_common_if *head, int enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

        /* If the hardware does not support ras, then do not create the obj.
         * But if the hardware does support ras, we can create the obj.
         * The ras framework checks con->hw_supported to see if it needs to do
         * the corresponding initialization.
         * IPs check con->support to see if they need to disable ras.
         */
        if (!amdgpu_ras_is_feature_allowed(adev, head))
                return 0;
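        /* Nothing to do if we are already in the requested state. */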
        if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
                return 0;

        if (enable) {
                if (!obj) {
                        obj = amdgpu_ras_create_obj(adev, head);
                        if (!obj)
                                return -EINVAL;
                } else {
                        /* In case we created the obj somewhere else */
                        get_obj(obj);
                }
                con->features |= BIT(head->block);
        } else {
                if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
                        con->features &= ~BIT(head->block);
                        put_obj(obj);
                }
        }

        return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
                struct ras_common_if *head, bool enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        union ta_ras_cmd_input info;
        int ret;

        if (!con)
                return -EINVAL;

        if (!enable) {
                info.disable_features = (struct ta_ras_disable_features_input) {
                        .block_id = amdgpu_ras_block_to_ta(head->block),
                        .error_type = amdgpu_ras_error_to_ta(head->type),
                };
        } else {
                info.enable_features = (struct ta_ras_enable_features_input) {
                        .block_id = amdgpu_ras_block_to_ta(head->block),
                        .error_type = amdgpu_ras_error_to_ta(head->type),
                };
        }

        /* Do not enable if it is not allowed. */
        WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
        /* Are we already in the state we are going to set? */
        if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
                return 0;

        ret = psp_ras_enable_features(&adev->psp, &info, enable);
        if (ret) {
                DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
                                enable ? "enable":"disable",
                                ras_block_str(head->block),
                                ret);
                return -EINVAL;
        }

        /* set up the obj */
        __amdgpu_ras_feature_enable(adev, head, enable);

        return 0;
}

/* Only used in the device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
                struct ras_common_if *head, bool enable)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int ret;

        if (!con)
                return -EINVAL;

        if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
                /* If ras is enabled by the vbios, we set up the ras object
                 * first in both cases. For enable, that is all we need to do.
                 * For disable, we need to perform a ras TA disable cmd after
                 * that.
                 */
                ret = __amdgpu_ras_feature_enable(adev, head, 1);
                if (ret)
                        return ret;

                if (!enable)
                        ret = amdgpu_ras_feature_enable(adev, head, 0);
        } else
                ret = amdgpu_ras_feature_enable(adev, head, enable);

        return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
                bool bypass)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        list_for_each_entry_safe(obj, tmp, &con->head, node) {
                /* bypass psp.
                 * aka just release the obj and corresponding flags
                 */
                if (bypass) {
                        if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
                                break;
                } else {
                        if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
                                break;
                }
        }

        return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
                bool bypass)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
        int i;
        const enum amdgpu_ras_error_type default_ras_type =
                AMDGPU_RAS_ERROR__NONE;

        for (i = 0; i < ras_block_count; i++) {
                struct ras_common_if head = {
                        .block = i,
                        .type = default_ras_type,
                        .sub_block_index = 0,
                };
                strcpy(head.name, ras_block_str(i));
                if (bypass) {
                        /*
                         * bypass psp. the vbios enabled ras for us,
                         * so just create the obj
                         */
                        if (__amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                } else {
                        if (amdgpu_ras_feature_enable(adev, &head, 1))
                                break;
                }
        }

        return con->features;
}
/* feature ctl end */

/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
                struct ras_query_if *info)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

        if (!obj)
                return -EINVAL;
        /* TODO: might read a register to get the count */

        info->ue_count = obj->err_data.ue_count;
        info->ce_count = obj->err_data.ce_count;

        return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
                struct ras_inject_if *info)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ta_ras_trigger_error_input block_info = {
                .block_id = amdgpu_ras_block_to_ta(info->head.block),
                .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
                .sub_block_index = info->head.sub_block_index,
                .address = info->address,
                .value = info->value,
        };
        int ret = 0;

        if (!obj)
                return -EINVAL;

        ret = psp_ras_trigger_error(&adev->psp, &block_info);
        if (ret)
                DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
                                ras_block_str(info->head.block),
                                ret);

        return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
                struct ras_cure_if *info)
{
        /* psp fw has no cure interface for now. */
        return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
                bool is_ce)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj;
        struct ras_err_data data = {0, 0};

        if (!con)
                return -EINVAL;

        list_for_each_entry(obj, &con->head, node) {
                struct ras_query_if info = {
                        .head = obj->head,
                };

                if (amdgpu_ras_error_query(adev, &info))
                        return -EINVAL;

                data.ce_count += info.ce_count;
                data.ue_count += info.ue_count;
        }

        return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */

/* sysfs begin */

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct amdgpu_ras *con =
                container_of(attr, struct amdgpu_ras, features_attr);
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = ddev->dev_private;
        struct ras_common_if head;
        int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
        int i;
        ssize_t s;
        struct ras_manager *obj;

        s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

        for (i = 0; i < ras_block_count; i++) {
                head.block = i;

                if (amdgpu_ras_is_feature_enabled(adev, &head)) {
                        obj = amdgpu_ras_find_obj(adev, &head);
                        s += scnprintf(&buf[s], PAGE_SIZE - s,
                                        "%s: %s\n",
                                        ras_block_str(i),
                                        ras_err_str(obj->head.type));
                } else
                        s += scnprintf(&buf[s], PAGE_SIZE - s,
                                        "%s: disabled\n",
                                        ras_block_str(i));
        }

        return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct attribute *attrs[] = {
                &con->features_attr.attr,
                NULL
        };
        struct attribute_group group = {
                .name = "ras",
                .attrs = attrs,
        };

        con->features_attr = (struct device_attribute) {
                .attr = {
                        .name = "features",
                        .mode = S_IRUGO,
                },
                .show = amdgpu_ras_sysfs_features_read,
        };
        sysfs_attr_init(attrs[0]);

        return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct attribute *attrs[] = {
                &con->features_attr.attr,
                NULL
        };
        struct attribute_group group = {
                .name = "ras",
                .attrs = attrs,
        };

        sysfs_remove_group(&adev->dev->kobj, &group);

        return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
                struct ras_fs_if *head)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

        if (!obj || obj->attr_inuse)
                return -EINVAL;

        get_obj(obj);

        memcpy(obj->fs_data.sysfs_name,
                        head->sysfs_name,
                        sizeof(obj->fs_data.sysfs_name));

        obj->sysfs_attr = (struct device_attribute){
                .attr = {
                        .name = obj->fs_data.sysfs_name,
                        .mode = S_IRUGO,
                },
                .show = amdgpu_ras_sysfs_read,
        };
        sysfs_attr_init(&obj->sysfs_attr.attr);

        if (sysfs_add_file_to_group(&adev->dev->kobj,
                                &obj->sysfs_attr.attr,
                                "ras")) {
                put_obj(obj);
                return -EINVAL;
        }

        obj->attr_inuse = 1;

        return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

        if (!obj || !obj->attr_inuse)
                return -EINVAL;

        sysfs_remove_file_from_group(&adev->dev->kobj,
                                &obj->sysfs_attr.attr,
                                "ras");
        obj->attr_inuse = 0;
        put_obj(obj);

        return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        list_for_each_entry_safe(obj, tmp, &con->head, node) {
                amdgpu_ras_sysfs_remove(adev, &obj->head);
        }

        amdgpu_ras_sysfs_remove_feature_node(adev);

        return 0;
}
/* sysfs end */

/* debugfs begin */
static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct drm_minor *minor = adev->ddev->primary;
        struct dentry *root = minor->debugfs_root, *dir;
        struct dentry *ent;

        dir = debugfs_create_dir("ras", root);
        if (IS_ERR(dir))
                return -EINVAL;

        con->dir = dir;

        ent = debugfs_create_file("ras_ctrl",
                        S_IWUGO | S_IRUGO, con->dir,
                        adev, &amdgpu_ras_debugfs_ctrl_ops);
        if (IS_ERR(ent)) {
                debugfs_remove(con->dir);
                return -EINVAL;
        }

        con->ent = ent;
        return 0;
}

int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
                struct ras_fs_if *head)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
        struct dentry *ent;

        if (!obj || obj->ent)
                return -EINVAL;

        get_obj(obj);

        memcpy(obj->fs_data.debugfs_name,
                        head->debugfs_name,
                        sizeof(obj->fs_data.debugfs_name));

        ent = debugfs_create_file(obj->fs_data.debugfs_name,
                        S_IWUGO | S_IRUGO, con->dir,
                        obj, &amdgpu_ras_debugfs_ops);

        if (IS_ERR(ent))
                return -EINVAL;

        obj->ent = ent;

        return 0;
}

int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
                struct ras_common_if *head)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

        if (!obj || !obj->ent)
                return 0;

        debugfs_remove(obj->ent);
        obj->ent = NULL;
        put_obj(obj);

        return 0;
}

static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        list_for_each_entry_safe(obj, tmp, &con->head, node) {
                amdgpu_ras_debugfs_remove(adev, &obj->head);
        }

        debugfs_remove(con->ent);
        debugfs_remove(con->dir);
        con->dir = NULL;
        con->ent = NULL;

        return 0;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
        amdgpu_ras_sysfs_create_feature_node(adev);
        amdgpu_ras_debugfs_create_ctrl_node(adev);

        return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
        amdgpu_ras_debugfs_remove_all(adev);
        amdgpu_ras_sysfs_remove_all(adev);
        return 0;
}
/* ras fs end */

/* ih begin */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
        struct ras_ih_data *data = &obj->ih_data;
        struct amdgpu_iv_entry entry;
        int ret;

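        /* Consume entries queued by amdgpu_ras_interrupt_dispatch(). The
         * rmb() pairs with the producer's wmb(): wptr must be observed
         * before the entry data behind it is read.
         */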
        while (data->rptr != data->wptr) {
                rmb();
                memcpy(&entry, &data->ring[data->rptr],
                                data->element_size);

                wmb();
                data->rptr = (data->aligned_element_size +
                                data->rptr) % data->ring_size;

                /* Let the IP handle its data; maybe we need to get the output
                 * from the callback to update the error type/count, etc.
                 */
                if (data->cb) {
                        ret = data->cb(obj->adev, &entry);
                        /* ue will trigger an interrupt, and in that case
                         * we need to do a reset to recover the whole system.
                         * But leave that recovery to the IP; here we just
                         * dispatch the error.
                         */
                        if (ret == AMDGPU_RAS_UE) {
                                obj->err_data.ue_count++;
                        }
                        /* Might need to get the ce count from a register, but
                         * not all IPs save a ce count; some IPs just use one
                         * bit or two to indicate that a ce happened.
                         */
                }
        }
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
        struct ras_ih_data *data =
                container_of(work, struct ras_ih_data, ih_work);
        struct ras_manager *obj =
                container_of(data, struct ras_manager, ih_data);

        amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
                struct ras_dispatch_if *info)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_ih_data *data;

        if (!obj)
                return -EINVAL;

        data = &obj->ih_data;
        if (data->inuse == 0)
                return 0;

        /* Might overflow: the producer never checks rptr, so unconsumed
         * entries can be overwritten.
         */
        memcpy(&data->ring[data->wptr], info->entry,
                        data->element_size);

        wmb();
        data->wptr = (data->aligned_element_size +
                        data->wptr) % data->ring_size;

        schedule_work(&data->ih_work);

        return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
                struct ras_ih_if *info)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_ih_data *data;

        if (!obj)
                return -EINVAL;

        data = &obj->ih_data;
        if (data->inuse == 0)
                return 0;

        cancel_work_sync(&data->ih_work);

        kfree(data->ring);
        memset(data, 0, sizeof(*data));
        put_obj(obj);

        return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
                struct ras_ih_if *info)
{
        struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
        struct ras_ih_data *data;

        if (!obj) {
                /* in case we register the IH before enabling the ras feature */
                obj = amdgpu_ras_create_obj(adev, &info->head);
                if (!obj)
                        return -EINVAL;
        } else
                get_obj(obj);

        data = &obj->ih_data;
        /* add the callback, etc. */
        *data = (struct ras_ih_data) {
                .inuse = 0,
                .cb = info->cb,
                .element_size = sizeof(struct amdgpu_iv_entry),
                .rptr = 0,
                .wptr = 0,
        };

        INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

        data->aligned_element_size = ALIGN(data->element_size, 8);
        /* the ring can store 64 iv entries. */
        data->ring_size = 64 * data->aligned_element_size;
        data->ring = kmalloc(data->ring_size, GFP_KERNEL);
        if (!data->ring) {
                put_obj(obj);
                return -ENOMEM;
        }

        /* IH is ready */
        data->inuse = 1;

        return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        list_for_each_entry_safe(obj, tmp, &con->head, node) {
                struct ras_ih_if info = {
                        .head = obj->head,
                };
                amdgpu_ras_interrupt_remove_handler(adev, &info);
        }

        return 0;
}
/* ih end */

/* recovery begin */
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
        struct amdgpu_ras *ras =
                container_of(work, struct amdgpu_ras, recovery_work);

        amdgpu_device_gpu_recover(ras->adev, 0);
        atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
                struct amdgpu_bo **bo_ptr)
{
        /* no need to free it actually. */
        amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
        return 0;
}

/* reserve vram with size@offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
                uint64_t offset, uint64_t size,
                struct amdgpu_bo **bo_ptr)
{
        struct ttm_operation_ctx ctx = { false, false };
        struct amdgpu_bo_param bp;
        int r = 0;
        int i;
        struct amdgpu_bo *bo;

        if (bo_ptr)
                *bo_ptr = NULL;
        memset(&bp, 0, sizeof(bp));
        bp.size = size;
        bp.byte_align = PAGE_SIZE;
        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
        bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
                AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
        bp.type = ttm_bo_type_kernel;
        bp.resv = NULL;

        r = amdgpu_bo_create(adev, &bp, &bo);
        if (r)
                return -EINVAL;

        r = amdgpu_bo_reserve(bo, false);
        if (r)
                goto error_reserve;

        offset = ALIGN(offset, PAGE_SIZE);
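        /* Clamp every placement to the exact page range, so TTM can only
         * satisfy this BO with the vram covering the requested offset.
         */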
        for (i = 0; i < bo->placement.num_placement; ++i) {
                bo->placements[i].fpfn = offset >> PAGE_SHIFT;
                bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
        }

        ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
        r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
        if (r)
                goto error_pin;

        r = amdgpu_bo_pin_restricted(bo,
                        AMDGPU_GEM_DOMAIN_VRAM,
                        offset,
                        offset + size);
        if (r)
                goto error_pin;

        if (bo_ptr)
                *bo_ptr = bo;

        amdgpu_bo_unreserve(bo);
        return r;

error_pin:
        amdgpu_bo_unreserve(bo);
error_reserve:
        amdgpu_bo_unref(&bo);
        return r;
}

/* alloc/realloc bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
                struct ras_err_handler_data *data, int pages)
{
        unsigned int old_space = data->count + data->space_left;
        unsigned int new_space = old_space + pages;
        unsigned int align_space = ALIGN(new_space, 1024);
        void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

        if (!tmp)
                return -ENOMEM;

        if (data->bps) {
                memcpy(tmp, data->bps,
                                data->count * sizeof(*data->bps));
                kfree(data->bps);
        }

        data->bps = tmp;
        data->space_left += align_space - old_space;
        return 0;
}

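/*
 * Bad page handling scheme: amdgpu_ras_add_bad_pages() records the page
 * frame number of each faulty vram page in eh_data->bps, and
 * amdgpu_ras_reserve_bad_pages() later pins a dummy BO over each recorded
 * page so the vram allocator never hands it out again.
 */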
/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
                unsigned long *bps, int pages)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        int i = pages;
        int ret = 0;

        if (!con || !con->eh_data || !bps || pages <= 0)
                return 0;

        mutex_lock(&con->recovery_lock);
        data = con->eh_data;
        if (!data)
                goto out;

        if (data->space_left <= pages)
                if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
                        ret = -ENOMEM;
                        goto out;
                }

        while (i--)
                data->bps[data->count++].bp = bps[i];

        data->space_left -= pages;
out:
        mutex_unlock(&con->recovery_lock);

        return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        uint64_t bp;
        struct amdgpu_bo *bo;
        int i;

        if (!con || !con->eh_data)
                return 0;

        mutex_lock(&con->recovery_lock);
        data = con->eh_data;
        if (!data)
                goto out;
        /* reserve vram at driver post stage. */
        for (i = data->last_reserved; i < data->count; i++) {
                bp = data->bps[i].bp;

                if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
                                        PAGE_SIZE, &bo))
                        DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

                data->bps[i].bo = bo;
                data->last_reserved = i + 1;
        }
out:
        mutex_unlock(&con->recovery_lock);
        return 0;
}

/* called when the driver unloads */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data;
        struct amdgpu_bo *bo;
        int i;

        if (!con || !con->eh_data)
                return 0;

        mutex_lock(&con->recovery_lock);
        data = con->eh_data;
        if (!data)
                goto out;

        for (i = data->last_reserved - 1; i >= 0; i--) {
                bo = data->bps[i].bo;

                amdgpu_ras_release_vram(adev, &bo);

                data->bps[i].bo = bo;
                data->last_reserved = i;
        }
out:
        mutex_unlock(&con->recovery_lock);
        return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
        /* TODO
         * write the array to eeprom when SMU is disabled.
         */
        return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
        /* TODO
         * read the array from eeprom when SMU is disabled.
         */
        return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data = &con->eh_data;

        *data = kmalloc(sizeof(**data),
                        GFP_KERNEL|__GFP_ZERO);
        if (!*data)
                return -ENOMEM;

        mutex_init(&con->recovery_lock);
        INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
        atomic_set(&con->in_recovery, 0);
        con->adev = adev;

        amdgpu_ras_load_bad_pages(adev);
        amdgpu_ras_reserve_bad_pages(adev);

        return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data *data = con->eh_data;

        cancel_work_sync(&con->recovery_work);
        amdgpu_ras_save_bad_pages(adev);
        amdgpu_ras_release_bad_pages(adev);

        mutex_lock(&con->recovery_lock);
        con->eh_data = NULL;
        kfree(data->bps);
        kfree(data);
        mutex_unlock(&con->recovery_lock);

        return 0;
}
/* recovery end */

/*
 * Check the hardware's ras ability, which is saved in hw_supported.
 * If the hardware does not support ras, we can skip some ras initialization
 * and forbid ras operations from IPs.
 * If software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IPs to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but check whether the operation
 * is allowed or not in each function.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
                uint32_t *hw_supported, uint32_t *supported)
{
        *hw_supported = 0;
        *supported = 0;

        if (amdgpu_sriov_vf(adev) ||
                        adev->asic_type != CHIP_VEGA20)
                return;

        if (adev->is_atom_fw &&
                        (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
                         amdgpu_atomfirmware_sram_ecc_supported(adev)))
                *hw_supported = AMDGPU_RAS_BLOCK_MASK;

        *supported = amdgpu_ras_enable == 0 ?
                                0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        if (con)
                return 0;

        con = kmalloc(sizeof(struct amdgpu_ras) +
                        sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
                        GFP_KERNEL|__GFP_ZERO);
        if (!con)
                return -ENOMEM;

        con->objs = (struct ras_manager *)(con + 1);

        amdgpu_ras_set_context(adev, con);

        amdgpu_ras_check_supported(adev, &con->hw_supported,
                        &con->supported);
        con->features = 0;
        INIT_LIST_HEAD(&con->head);
        /* Might need to get this flag from the vbios. */
        con->flags = RAS_DEFAULT_FLAGS;

        if (amdgpu_ras_recovery_init(adev))
                goto recovery_out;

        amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

        if (amdgpu_ras_fs_init(adev))
                goto fs_out;

        amdgpu_ras_self_test(adev);

        DRM_INFO("RAS INFO: ras initialized successfully, "
                        "hardware ability[%x] ras_mask[%x]\n",
                        con->hw_supported, con->supported);
        return 0;
fs_out:
        amdgpu_ras_recovery_fini(adev);
recovery_out:
        amdgpu_ras_set_context(adev, NULL);
        kfree(con);

        return -EINVAL;
}

/* do some init work after IP late init, as it is a dependence */
void amdgpu_ras_post_init(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_manager *obj, *tmp;

        if (!con)
                return;

        if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
                /* Set up all other IPs which are not implemented. The tricky
                 * thing is that an IP's actual ras error type should be
                 * MULTI_UNCORRECTABLE, but since the driver does not handle
                 * it, ERROR_NONE makes sense anyway.
                 */
                amdgpu_ras_enable_all_features(adev, 1);

                /* We enable ras on all hw_supported blocks, but the boot
                 * parameter might disable some of them and one or more IPs
                 * may not be implemented yet. So we disable them on their
                 * behalf.
                 */
                list_for_each_entry_safe(obj, tmp, &con->head, node) {
                        if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
                                amdgpu_ras_feature_enable(adev, &obj->head, 0);
                                /* there should not be any reference left. */
                                WARN_ON(alive_obj(obj));
                        }
                }
        }
}

/* do some fini work before IP fini as a dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        if (!con)
                return 0;

        /* Need to disable ras on all IPs here before IP [hw/sw]fini */
        amdgpu_ras_disable_all_features(adev, 0);
        amdgpu_ras_recovery_fini(adev);
        return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

        if (!con)
                return 0;

        amdgpu_ras_fs_fini(adev);
        amdgpu_ras_interrupt_remove_all(adev);

        WARN(con->features, "Feature mask is not cleared");

        if (con->features)
                amdgpu_ras_disable_all_features(adev, 1);

        amdgpu_ras_set_context(adev, NULL);
        kfree(con);

        return 0;
}