linux/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
<<
>>
Prefs
   1/*
   2 * Copyright 2018 Advanced Micro Devices, Inc.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice shall be included in
  12 * all copies or substantial portions of the Software.
  13 *
  14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20 * OTHER DEALINGS IN THE SOFTWARE.
  21 *
  22 *
  23 */
  24#include <linux/list.h>
  25#include "amdgpu.h"
  26#include "amdgpu_xgmi.h"
  27#include "amdgpu_smu.h"
  28
  29
  30static DEFINE_MUTEX(xgmi_mutex);
  31
  32#define AMDGPU_MAX_XGMI_HIVE                    8
  33#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE         4
  34
  35static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
  36static unsigned hive_count = 0;
  37
  38void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
  39{
  40        return &hive->device_list;
  41}
  42
  43/**
  44 * DOC: AMDGPU XGMI Support
  45 *
  46 * XGMI is a high speed interconnect that joins multiple GPU cards
  47 * into a homogeneous memory space that is organized by a collective
  48 * hive ID and individual node IDs, both of which are 64-bit numbers.
  49 *
  50 * The file xgmi_device_id contains the unique per GPU device ID and
  51 * is stored in the /sys/class/drm/card${cardno}/device/ directory.
  52 *
  53 * Inside the device directory a sub-directory 'xgmi_hive_info' is
  54 * created which contains the hive ID and the list of nodes.
  55 *
  56 * The hive ID is stored in:
  57 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id
  58 *
  59 * The node information is stored in numbered directories:
  60 *   /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id
  61 *
  62 * Each device has its own xgmi_hive_info directory with a mirror
  63 * set of node sub-directories.
  64 *
  65 * The XGMI memory space is built by contiguously adding the power of
  66 * two padded VRAM space from each node to each other.
  67 *
  68 */
  69
  70
  71static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
  72                struct device_attribute *attr, char *buf)
  73{
  74        struct amdgpu_hive_info *hive =
  75                        container_of(attr, struct amdgpu_hive_info, dev_attr);
  76
  77        return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
  78}
  79
  80static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
  81                                    struct amdgpu_hive_info *hive)
  82{
  83        int ret = 0;
  84
  85        if (WARN_ON(hive->kobj))
  86                return -EINVAL;
  87
  88        hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
  89        if (!hive->kobj) {
  90                dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
  91                return -EINVAL;
  92        }
  93
  94        hive->dev_attr = (struct device_attribute) {
  95                .attr = {
  96                        .name = "xgmi_hive_id",
  97                        .mode = S_IRUGO,
  98
  99                },
 100                .show = amdgpu_xgmi_show_hive_id,
 101        };
 102
 103        ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
 104        if (ret) {
 105                dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
 106                kobject_del(hive->kobj);
 107                kobject_put(hive->kobj);
 108                hive->kobj = NULL;
 109        }
 110
 111        return ret;
 112}
 113
 114static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
 115                                    struct amdgpu_hive_info *hive)
 116{
 117        sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
 118        kobject_del(hive->kobj);
 119        kobject_put(hive->kobj);
 120        hive->kobj = NULL;
 121}
 122
 123static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
 124                                     struct device_attribute *attr,
 125                                     char *buf)
 126{
 127        struct drm_device *ddev = dev_get_drvdata(dev);
 128        struct amdgpu_device *adev = ddev->dev_private;
 129
 130        return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
 131
 132}
 133
 134
 135static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
 136
 137
 138static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
 139                                         struct amdgpu_hive_info *hive)
 140{
 141        int ret = 0;
 142        char node[10] = { 0 };
 143
 144        /* Create xgmi device id file */
 145        ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
 146        if (ret) {
 147                dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
 148                return ret;
 149        }
 150
 151        /* Create sysfs link to hive info folder on the first device */
 152        if (adev != hive->adev) {
 153                ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
 154                                        "xgmi_hive_info");
 155                if (ret) {
 156                        dev_err(adev->dev, "XGMI: Failed to create link to hive info");
 157                        goto remove_file;
 158                }
 159        }
 160
 161        sprintf(node, "node%d", hive->number_devices);
 162        /* Create sysfs link form the hive folder to yourself */
 163        ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
 164        if (ret) {
 165                dev_err(adev->dev, "XGMI: Failed to create link from hive info");
 166                goto remove_link;
 167        }
 168
 169        goto success;
 170
 171
 172remove_link:
 173        sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
 174
 175remove_file:
 176        device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
 177
 178success:
 179        return ret;
 180}
 181
 182static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
 183                                          struct amdgpu_hive_info *hive)
 184{
 185        device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
 186        sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
 187        sysfs_remove_link(hive->kobj, adev->ddev->unique);
 188}
 189
 190
 191
 192struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
 193{
 194        int i;
 195        struct amdgpu_hive_info *tmp;
 196
 197        if (!adev->gmc.xgmi.hive_id)
 198                return NULL;
 199
 200        mutex_lock(&xgmi_mutex);
 201
 202        for (i = 0 ; i < hive_count; ++i) {
 203                tmp = &xgmi_hives[i];
 204                if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
 205                        if (lock)
 206                                mutex_lock(&tmp->hive_lock);
 207                        mutex_unlock(&xgmi_mutex);
 208                        return tmp;
 209                }
 210        }
 211        if (i >= AMDGPU_MAX_XGMI_HIVE) {
 212                mutex_unlock(&xgmi_mutex);
 213                return NULL;
 214        }
 215
 216        /* initialize new hive if not exist */
 217        tmp = &xgmi_hives[hive_count++];
 218
 219        if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
 220                mutex_unlock(&xgmi_mutex);
 221                return NULL;
 222        }
 223
 224        tmp->adev = adev;
 225        tmp->hive_id = adev->gmc.xgmi.hive_id;
 226        INIT_LIST_HEAD(&tmp->device_list);
 227        mutex_init(&tmp->hive_lock);
 228        mutex_init(&tmp->reset_lock);
 229
 230        if (lock)
 231                mutex_lock(&tmp->hive_lock);
 232        tmp->pstate = -1;
 233        mutex_unlock(&xgmi_mutex);
 234
 235        return tmp;
 236}
 237
 238int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 239{
 240        int ret = 0;
 241        struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
 242
 243        if (!hive)
 244                return 0;
 245
 246        if (hive->pstate == pstate)
 247                return 0;
 248
 249        dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
 250
 251        if (is_support_sw_smu(adev))
 252                ret = smu_set_xgmi_pstate(&adev->smu, pstate);
 253        if (ret)
 254                dev_err(adev->dev,
 255                        "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
 256                        adev->gmc.xgmi.node_id,
 257                        adev->gmc.xgmi.hive_id, ret);
 258
 259        return ret;
 260}
 261
 262int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
 263{
 264        int ret = -EINVAL;
 265
 266        /* Each psp need to set the latest topology */
 267        ret = psp_xgmi_set_topology_info(&adev->psp,
 268                                         hive->number_devices,
 269                                         &adev->psp.xgmi_context.top_info);
 270        if (ret)
 271                dev_err(adev->dev,
 272                        "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
 273                        adev->gmc.xgmi.node_id,
 274                        adev->gmc.xgmi.hive_id, ret);
 275
 276        return ret;
 277}
 278
 279
 280int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
 281                struct amdgpu_device *peer_adev)
 282{
 283        struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
 284        int i;
 285
 286        for (i = 0 ; i < top->num_nodes; ++i)
 287                if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
 288                        return top->nodes[i].num_hops;
 289        return  -EINVAL;
 290}
 291
 292int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 293{
 294        struct psp_xgmi_topology_info *top_info;
 295        struct amdgpu_hive_info *hive;
 296        struct amdgpu_xgmi      *entry;
 297        struct amdgpu_device *tmp_adev = NULL;
 298
 299        int count = 0, ret = -EINVAL;
 300
 301        if (!adev->gmc.xgmi.supported)
 302                return 0;
 303
 304        ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
 305        if (ret) {
 306                dev_err(adev->dev,
 307                        "XGMI: Failed to get node id\n");
 308                return ret;
 309        }
 310
 311        ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
 312        if (ret) {
 313                dev_err(adev->dev,
 314                        "XGMI: Failed to get hive id\n");
 315                return ret;
 316        }
 317
 318        hive = amdgpu_get_xgmi_hive(adev, 1);
 319        if (!hive) {
 320                ret = -EINVAL;
 321                dev_err(adev->dev,
 322                        "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
 323                        adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
 324                goto exit;
 325        }
 326
 327        top_info = &adev->psp.xgmi_context.top_info;
 328
 329        list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
 330        list_for_each_entry(entry, &hive->device_list, head)
 331                top_info->nodes[count++].node_id = entry->node_id;
 332        top_info->num_nodes = count;
 333        hive->number_devices = count;
 334
 335        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 336                /* update node list for other device in the hive */
 337                if (tmp_adev != adev) {
 338                        top_info = &tmp_adev->psp.xgmi_context.top_info;
 339                        top_info->nodes[count - 1].node_id = adev->gmc.xgmi.node_id;
 340                        top_info->num_nodes = count;
 341                }
 342                ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
 343                if (ret)
 344                        goto exit;
 345        }
 346
 347        /* get latest topology info for each device from psp */
 348        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 349                ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
 350                                &tmp_adev->psp.xgmi_context.top_info);
 351                if (ret) {
 352                        dev_err(tmp_adev->dev,
 353                                "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
 354                                tmp_adev->gmc.xgmi.node_id,
 355                                tmp_adev->gmc.xgmi.hive_id, ret);
 356                        /* To do : continue with some node failed or disable the whole hive */
 357                        goto exit;
 358                }
 359        }
 360
 361        if (!ret)
 362                ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 363
 364
 365        mutex_unlock(&hive->hive_lock);
 366exit:
 367        if (!ret)
 368                dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
 369                         adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
 370        else
 371                dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
 372                        adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
 373                        ret);
 374
 375        return ret;
 376}
 377
 378void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 379{
 380        struct amdgpu_hive_info *hive;
 381
 382        if (!adev->gmc.xgmi.supported)
 383                return;
 384
 385        hive = amdgpu_get_xgmi_hive(adev, 1);
 386        if (!hive)
 387                return;
 388
 389        if (!(hive->number_devices--)) {
 390                amdgpu_xgmi_sysfs_destroy(adev, hive);
 391                mutex_destroy(&hive->hive_lock);
 392                mutex_destroy(&hive->reset_lock);
 393        } else {
 394                amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 395                mutex_unlock(&hive->hive_lock);
 396        }
 397}
 398