linux/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 *
 */
#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE         4

static LIST_HEAD(xgmi_hive_list);

static const int xgmi_pcs_err_status_reg_vg20[] = {
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
        smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
        smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
        smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* same as vg20 */
static const int wafl_pcs_err_status_reg_arct[] = {
        smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
        smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
        {"XGMI PCS DataLossErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
        {"XGMI PCS TrainingErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
        {"XGMI PCS CRCErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
        {"XGMI PCS BERExceededErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
        {"XGMI PCS TxMetaDataErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
        {"XGMI PCS ReplayBufParityErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
        {"XGMI PCS DataParityErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
        {"XGMI PCS ReplayFifoOverflowErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
        {"XGMI PCS ReplayFifoUnderflowErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
        {"XGMI PCS ElasticFifoOverflowErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
        {"XGMI PCS DeskewErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
        {"XGMI PCS DataStartupLimitErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
        {"XGMI PCS FCInitTimeoutErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
        {"XGMI PCS RecoveryTimeoutErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
        {"XGMI PCS ReadySerialTimeoutErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
        {"XGMI PCS ReadySerialAttemptErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
        {"XGMI PCS RecoveryAttemptErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
        {"XGMI PCS RecoveryRelockAttemptErr",
         SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
        {"WAFL PCS DataLossErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
        {"WAFL PCS TrainingErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
        {"WAFL PCS CRCErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
        {"WAFL PCS BERExceededErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
        {"WAFL PCS TxMetaDataErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
        {"WAFL PCS ReplayBufParityErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
        {"WAFL PCS DataParityErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
        {"WAFL PCS ReplayFifoOverflowErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
        {"WAFL PCS ReplayFifoUnderflowErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
        {"WAFL PCS ElasticFifoOverflowErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
        {"WAFL PCS DeskewErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
        {"WAFL PCS DataStartupLimitErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
        {"WAFL PCS FCInitTimeoutErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
        {"WAFL PCS RecoveryTimeoutErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
        {"WAFL PCS ReadySerialTimeoutErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
        {"WAFL PCS ReadySerialAttemptErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
        {"WAFL PCS RecoveryAttemptErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
        {"WAFL PCS RecoveryRelockAttemptErr",
         SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

/**
 * DOC: AMDGPU XGMI Support
 *
 * XGMI is a high speed interconnect that joins multiple GPU cards
 * into a homogeneous memory space that is organized by a collective
 * hive ID and individual node IDs, both of which are 64-bit numbers.
 *
 * The file xgmi_device_id contains the unique per GPU device ID and
 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
 *
 * Inside the device directory a sub-directory 'xgmi_hive_info' is
 * created which contains the hive ID and the list of nodes.
 *
 * The hive ID is stored in:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
 *
 * The node information is stored in numbered directories:
 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
 *
 * Each device has its own xgmi_hive_info directory with a mirror
 * set of node sub-directories.
 *
 * The XGMI memory space is built by contiguously adding the power-of-two
 * padded VRAM space of each node to the others.
 *
 */
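
/*
 * For example, on a system where card0 is part of a hive, the IDs can be read
 * back from user space with something like (card and node numbers are purely
 * illustrative):
 *
 *   cat /sys/class/drm/card0/device/xgmi_hive_info/xgmi_hive_id
 *   cat /sys/class/drm/card0/device/xgmi_hive_info/node1/xgmi_device_id
 */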

static struct attribute amdgpu_xgmi_hive_id = {
        .name = "xgmi_hive_id",
        .mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
        &amdgpu_xgmi_hive_id,
        NULL
};

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
        struct attribute *attr, char *buf)
{
        struct amdgpu_hive_info *hive = container_of(
                kobj, struct amdgpu_hive_info, kobj);

        if (attr == &amdgpu_xgmi_hive_id)
                return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

        return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
        struct amdgpu_hive_info *hive = container_of(
                kobj, struct amdgpu_hive_info, kobj);

        mutex_destroy(&hive->hive_lock);
        kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
        .show = amdgpu_xgmi_show_attrs,
};

struct kobj_type amdgpu_xgmi_hive_type = {
        .release = amdgpu_xgmi_hive_release,
        .sysfs_ops = &amdgpu_xgmi_hive_ops,
        .default_attrs = amdgpu_xgmi_hive_attrs,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);

        return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);

}

#define AMDGPU_XGMI_SET_FICAA(o)        ((o) | 0x456801)
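/*
 * Report the xGMI error count.  The counter is read through the data
 * fabric's get_fica()/set_fica() helpers and cleared after reading; if the
 * FICAA control register does not read back as 0x1f, the counters are not
 * enabled.
 */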
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
                                      struct device_attribute *attr,
                                      char *buf)
{
        struct drm_device *ddev = dev_get_drvdata(dev);
        struct amdgpu_device *adev = drm_to_adev(ddev);
        uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
        uint64_t fica_out;
        unsigned int error_count = 0;

        ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
        ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

        fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
        if (fica_out != 0x1f)
                pr_err("xGMI error counters not enabled!\n");

        fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

        if ((fica_out & 0xffff) == 2)
                error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

        adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

        return sysfs_emit(buf, "%u\n", error_count);
}


static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
                                          struct amdgpu_hive_info *hive)
{
        int ret = 0;
        char node[10] = { 0 };

        /* Create xgmi device id file */
        ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
        if (ret) {
                dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
                return ret;
        }

        /* Create xgmi error file */
        ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
        if (ret)
                pr_err("failed to create xgmi_error\n");


        /* Create sysfs link to the hive info folder on devices other than the hive owner */
        if (hive->kobj.parent != (&adev->dev->kobj)) {
                ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
                                        "xgmi_hive_info");
                if (ret) {
                        dev_err(adev->dev, "XGMI: Failed to create link to hive info");
                        goto remove_file;
                }
        }

        sprintf(node, "node%d", atomic_read(&hive->number_devices));
        /* Create sysfs link from the hive folder to this device */
        ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
        if (ret) {
                dev_err(adev->dev, "XGMI: Failed to create link from hive info");
                goto remove_link;
        }

        goto success;


remove_link:
        sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
        device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
        return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
                                           struct amdgpu_hive_info *hive)
{
        char node[10];
        memset(node, 0, sizeof(node));

        device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
        device_remove_file(adev->dev, &dev_attr_xgmi_error);

        if (hive->kobj.parent != (&adev->dev->kobj))
                sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

        sprintf(node, "node%d", atomic_read(&hive->number_devices));
        sysfs_remove_link(&hive->kobj, node);

}

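/*
 * Look up the hive this device belongs to by hive_id, creating and
 * registering a new hive if none exists yet.  The hive is returned with an
 * extra kobject reference held, which callers drop again with
 * amdgpu_put_xgmi_hive().  Returns NULL if the device has no hive_id or the
 * hive could not be allocated.
 */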
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
        struct amdgpu_hive_info *hive = NULL;
        int ret;

        if (!adev->gmc.xgmi.hive_id)
                return NULL;

        if (adev->hive) {
                kobject_get(&adev->hive->kobj);
                return adev->hive;
        }

        mutex_lock(&xgmi_mutex);

        list_for_each_entry(hive, &xgmi_hive_list, node) {
                if (hive->hive_id == adev->gmc.xgmi.hive_id)
                        goto pro_end;
        }

        hive = kzalloc(sizeof(*hive), GFP_KERNEL);
        if (!hive) {
                dev_err(adev->dev, "XGMI: allocation failed\n");
                hive = NULL;
                goto pro_end;
        }

        /* initialize a new hive if one does not exist yet */
        ret = kobject_init_and_add(&hive->kobj,
                        &amdgpu_xgmi_hive_type,
                        &adev->dev->kobj,
                        "%s", "xgmi_hive_info");
        if (ret) {
                dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
                kobject_put(&hive->kobj);
                kfree(hive);
                hive = NULL;
                goto pro_end;
        }

        hive->hive_id = adev->gmc.xgmi.hive_id;
        INIT_LIST_HEAD(&hive->device_list);
        INIT_LIST_HEAD(&hive->node);
        mutex_init(&hive->hive_lock);
        atomic_set(&hive->in_reset, 0);
        atomic_set(&hive->number_devices, 0);
        task_barrier_init(&hive->tb);
        hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
        hive->hi_req_gpu = NULL;
        /*
         * The hive pstate on boot is high in vega20, so we have to go to
         * low pstate after boot.
         */
        hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
        list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
        if (hive)
                kobject_get(&hive->kobj);
        mutex_unlock(&xgmi_mutex);
        return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
        if (hive)
                kobject_put(&hive->kobj);
}

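/*
 * Request an XGMI pstate for the hive this device belongs to.  A single peer
 * requesting high is enough to raise the hive, while all peers must request
 * low before it is dropped again.  Note that everything below the early
 * "return 0" is currently dead code: pstate switching is disabled because of
 * a firmware bug.
 */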
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
        int ret = 0;
        struct amdgpu_hive_info *hive;
        struct amdgpu_device *request_adev;
        bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
        bool init_low;

        hive = amdgpu_get_xgmi_hive(adev);
        if (!hive)
                return 0;

        request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
        init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
        amdgpu_put_xgmi_hive(hive);
        /* temporarily disable pstate switching due to a fw bug */
        return 0;

        if (!hive || adev->asic_type != CHIP_VEGA20)
                return 0;

        mutex_lock(&hive->hive_lock);

        if (is_hi_req)
                hive->hi_req_count++;
        else
                hive->hi_req_count--;

        /*
         * Vega20 only needs a single peer to request pstate high for the hive
         * to go high, but all peers must request pstate low for the hive to
         * go low.
         */
        if (hive->pstate == pstate ||
                        (!is_hi_req && hive->hi_req_count && !init_low))
                goto out;

        dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

        ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
        if (ret) {
                dev_err(request_adev->dev,
                        "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
                        request_adev->gmc.xgmi.node_id,
                        request_adev->gmc.xgmi.hive_id, ret);
                goto out;
        }

        if (init_low)
                hive->pstate = hive->hi_req_count ?
                                        hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
        else {
                hive->pstate = pstate;
                hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
                                                        adev : NULL;
        }
out:
        mutex_unlock(&hive->hive_lock);
        return ret;
}

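/*
 * Push the current node list of the hive to this device's PSP so that its
 * view of the topology matches the latest hive membership.
 */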
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
        int ret;

        /* Each psp needs to set the latest topology */
        ret = psp_xgmi_set_topology_info(&adev->psp,
                                         atomic_read(&hive->number_devices),
                                         &adev->psp.xgmi_context.top_info);
        if (ret)
                dev_err(adev->dev,
                        "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
                        adev->gmc.xgmi.node_id,
                        adev->gmc.xgmi.hive_id, ret);

        return ret;
}


/*
 * NOTE psp_xgmi_node_info.num_hops layout is as follows:
 * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved)
 * num_hops[5:3] = reserved
 * num_hops[2:0] = number of hops
 */
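/*
 * For example, num_hops = 0x41 (0b01000001) encodes link type 1 (xGMI3) and
 * a hop count of 1; the helper below masks with 0x7 and therefore returns 1.
 */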
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
                struct amdgpu_device *peer_adev)
{
        struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
        uint8_t num_hops_mask = 0x7;
        int i;

        for (i = 0; i < top->num_nodes; ++i)
                if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
                        return top->nodes[i].num_hops & num_hops_mask;
        return -EINVAL;
}

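/*
 * Register this device with its XGMI hive: query the hive and node IDs from
 * the PSP (or use fixed dummy IDs when a reset is pending or no PSP block is
 * present), add the device to the hive's list, refresh the topology on every
 * node and create the sysfs entries.
 */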
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
        struct psp_xgmi_topology_info *top_info;
        struct amdgpu_hive_info *hive;
        struct amdgpu_xgmi      *entry;
        struct amdgpu_device *tmp_adev = NULL;

        int count = 0, ret = 0;

        if (!adev->gmc.xgmi.supported)
                return 0;

        if (!adev->gmc.xgmi.pending_reset &&
            amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
                ret = psp_xgmi_initialize(&adev->psp);
                if (ret) {
                        dev_err(adev->dev,
                                "XGMI: Failed to initialize xgmi session\n");
                        return ret;
                }

                ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
                if (ret) {
                        dev_err(adev->dev,
                                "XGMI: Failed to get hive id\n");
                        return ret;
                }

                ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
                if (ret) {
                        dev_err(adev->dev,
                                "XGMI: Failed to get node id\n");
                        return ret;
                }
        } else {
                adev->gmc.xgmi.hive_id = 16;
                adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
        }

        hive = amdgpu_get_xgmi_hive(adev);
        if (!hive) {
                ret = -EINVAL;
                dev_err(adev->dev,
                        "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
                        adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
                goto exit;
        }
        mutex_lock(&hive->hive_lock);

        top_info = &adev->psp.xgmi_context.top_info;

        list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
        list_for_each_entry(entry, &hive->device_list, head)
                top_info->nodes[count++].node_id = entry->node_id;
        top_info->num_nodes = count;
        atomic_set(&hive->number_devices, count);

        task_barrier_add_task(&hive->tb);

        if (!adev->gmc.xgmi.pending_reset &&
            amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
                list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
                        /* update the node list for the other devices in the hive */
                        if (tmp_adev != adev) {
                                top_info = &tmp_adev->psp.xgmi_context.top_info;
                                top_info->nodes[count - 1].node_id =
                                        adev->gmc.xgmi.node_id;
                                top_info->num_nodes = count;
                        }
                        ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
                        if (ret)
                                goto exit_unlock;
                }

                /* get the latest topology info for each device from psp */
                list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
                        ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
                                        &tmp_adev->psp.xgmi_context.top_info);
                        if (ret) {
                                dev_err(tmp_adev->dev,
                                        "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
                                        tmp_adev->gmc.xgmi.node_id,
                                        tmp_adev->gmc.xgmi.hive_id, ret);
                                /* TODO: continue with some nodes failed or disable the whole hive? */
                                goto exit_unlock;
                        }
                }
        }

        if (!ret && !adev->gmc.xgmi.pending_reset)
                ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
        mutex_unlock(&hive->hive_lock);
exit:
        if (!ret) {
                adev->hive = hive;
                dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
                         adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
        } else {
                amdgpu_put_xgmi_hive(hive);
                dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
                        adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
                        ret);
        }

        return ret;
}

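/*
 * Tear down this device's hive membership: remove its sysfs entries, drop it
 * from the hive's device list and release the hive reference taken in
 * amdgpu_xgmi_add_device().  Once the last device leaves, the hive is removed
 * from the global list and freed.
 */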
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
        struct amdgpu_hive_info *hive = adev->hive;

        if (!adev->gmc.xgmi.supported)
                return -EINVAL;

        if (!hive)
                return -EINVAL;

        mutex_lock(&hive->hive_lock);
        task_barrier_rem_task(&hive->tb);
        amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
        if (hive->hi_req_gpu == adev)
                hive->hi_req_gpu = NULL;
        list_del(&adev->gmc.xgmi.head);
        mutex_unlock(&hive->hive_lock);

        amdgpu_put_xgmi_hive(hive);
        adev->hive = NULL;

        if (atomic_dec_return(&hive->number_devices) == 0) {
                /* Remove the hive from global hive list */
                mutex_lock(&xgmi_mutex);
                list_del(&hive->node);
                mutex_unlock(&xgmi_mutex);

                amdgpu_put_xgmi_hive(hive);
        }

        return psp_xgmi_terminate(&adev->psp);
}

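/*
 * Late RAS init for the XGMI/WAFL block: clear the PCS error counters and
 * register the block (the "xgmi_wafl_err_count" sysfs node) with the RAS
 * framework.
 */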
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
        int r;
        struct ras_ih_if ih_info = {
                .cb = NULL,
        };
        struct ras_fs_if fs_info = {
                .sysfs_name = "xgmi_wafl_err_count",
        };

        if (!adev->gmc.xgmi.supported ||
            adev->gmc.xgmi.num_physical_nodes == 0)
                return 0;

        adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

        if (!adev->gmc.xgmi.ras_if) {
                adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
                if (!adev->gmc.xgmi.ras_if)
                        return -ENOMEM;
                adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
                adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
                adev->gmc.xgmi.ras_if->sub_block_index = 0;
                strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
        }
        ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
        r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
                                 &fs_info, &ih_info);
        if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
                kfree(adev->gmc.xgmi.ras_if);
                adev->gmc.xgmi.ras_if = NULL;
        }

        return r;
}

static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
        if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
                        adev->gmc.xgmi.ras_if) {
                struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
                struct ras_ih_if ih_info = {
                        .cb = NULL,
                };

                amdgpu_ras_late_fini(adev, ras_if, &ih_info);
                kfree(ras_if);
        }
}

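/*
 * Translate a node-local physical address into the hive-wide address space
 * by adding this node's segment offset (physical_node_id * node_segment_size).
 */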
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
                                           uint64_t addr)
{
        struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
        return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
        WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
        WREG32_PCIE(pcs_status_reg, 0);
}

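/*
 * Clear the xGMI PCS error status registers for the link layout of the
 * current ASIC (Arcturus or Vega20) by writing all ones and then zero to
 * each register.
 */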
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
        uint32_t i;

        switch (adev->asic_type) {
        case CHIP_ARCTURUS:
                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
                        pcs_clear_status(adev,
                                         xgmi_pcs_err_status_reg_arct[i]);
                break;
        case CHIP_VEGA20:
                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
                        pcs_clear_status(adev,
                                         xgmi_pcs_err_status_reg_vg20[i]);
                break;
        default:
                break;
        }
}

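/*
 * Decode a raw PCS error status value into an uncorrectable error count by
 * walking the per-field masks of the xGMI or WAFL register layout; these
 * registers only report uncorrectable errors.
 */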
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
                                              uint32_t value,
                                              uint32_t *ue_count,
                                              uint32_t *ce_count,
                                              bool is_xgmi_pcs)
{
        int i;
        int ue_cnt;

        if (is_xgmi_pcs) {
                /* query xgmi pcs error status,
                 * only ue is supported */
                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
                        ue_cnt = (value &
                                  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
                                  xgmi_pcs_ras_fields[i].pcs_err_shift;
                        if (ue_cnt) {
                                dev_info(adev->dev, "%s detected\n",
                                         xgmi_pcs_ras_fields[i].err_name);
                                *ue_count += ue_cnt;
                        }
                }
        } else {
                /* query wafl pcs error status,
                 * only ue is supported */
                for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
                        ue_cnt = (value &
                                  wafl_pcs_ras_fields[i].pcs_err_mask) >>
                                  wafl_pcs_ras_fields[i].pcs_err_shift;
                        if (ue_cnt) {
                                dev_info(adev->dev, "%s detected\n",
                                         wafl_pcs_ras_fields[i].err_name);
                                *ue_count += ue_cnt;
                        }
                }
        }

        return 0;
}

static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
                                             void *ras_error_status)
{
        struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
        int i;
        uint32_t data;
        uint32_t ue_cnt = 0, ce_cnt = 0;

        if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
                return -EINVAL;

        err_data->ue_count = 0;
        err_data->ce_count = 0;

        switch (adev->asic_type) {
        case CHIP_ARCTURUS:
                /* check xgmi pcs error */
                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
                        data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
                        if (data)
                                amdgpu_xgmi_query_pcs_error_status(adev,
                                                data, &ue_cnt, &ce_cnt, true);
                }
                /* check wafl pcs error */
                for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
                        data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
                        if (data)
                                amdgpu_xgmi_query_pcs_error_status(adev,
                                                data, &ue_cnt, &ce_cnt, false);
                }
                break;
        case CHIP_VEGA20:
        default:
                /* check xgmi pcs error */
                for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
                        data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
                        if (data)
                                amdgpu_xgmi_query_pcs_error_status(adev,
                                                data, &ue_cnt, &ce_cnt, true);
                }
                /* check wafl pcs error */
                for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
                        data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
                        if (data)
                                amdgpu_xgmi_query_pcs_error_status(adev,
                                                data, &ue_cnt, &ce_cnt, false);
                }
                break;
        }

        adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

        err_data->ue_count += ue_cnt;
        err_data->ce_count += ce_cnt;

        return 0;
}

const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = {
        .ras_late_init = amdgpu_xgmi_ras_late_init,
        .ras_fini = amdgpu_xgmi_ras_fini,
        .query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
        .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
};