#include <linux/list.h>
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "soc15.h"
#include "df/df_3_6_offset.h"
#include "xgmi/xgmi_4_0_0_smn.h"
#include "xgmi/xgmi_4_0_0_sh_mask.h"
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"

static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE		4

static LIST_HEAD(xgmi_hive_list);

static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

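/*
 * Devices connected over XGMI are grouped into a "hive": every member
 * shares the same 64-bit hive_id and carries its own node_id.  A hive is
 * represented by a reference-counted kobject and is exposed through sysfs
 * as an xgmi_hive_info directory (with an xgmi_hive_id attribute) plus one
 * "node%d" link per member device.  The global list of hives below is
 * protected by xgmi_mutex.
 */
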
static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};

static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
				      struct attribute *attr, char *buf)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	if (attr == &amdgpu_xgmi_hive_id)
		return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);

	return 0;
}

static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	mutex_destroy(&hive->hive_lock);
	kfree(hive);
}

static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_attrs = amdgpu_xgmi_hive_attrs,
};

static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
}

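/*
 * The xGMI link error counters live in the data fabric and are reached
 * through the DF indirect access helpers.  AMDGPU_XGMI_SET_FICAA() folds a
 * register offset into the indirect-access address consumed by
 * adev->df.funcs->get_fica()/set_fica() below.
 */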
#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	/* clear the error counters once they have been reported */
	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}

static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);

static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
					  struct amdgpu_hive_info *hive)
{
	int ret = 0;
	char node[10] = { 0 };

	/* Create xgmi device id file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
		return ret;
	}

	/* Create xgmi error file */
	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
	if (ret)
		pr_err("failed to create xgmi_error\n");

	/* Create sysfs link to hive info folder on the first device */
	if (hive->kobj.parent != (&adev->dev->kobj)) {
		ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
					"xgmi_hive_info");
		if (ret) {
			dev_err(adev->dev, "XGMI: Failed to create link to hive info");
			goto remove_file;
		}
	}

	/* Create sysfs link from the hive folder to this device */
	sprintf(node, "node%d", atomic_read(&hive->number_devices));

	ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
	if (ret) {
		dev_err(adev->dev, "XGMI: Failed to create link from hive info");
		goto remove_link;
	}

	goto success;

remove_link:
	sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);

success:
	return ret;
}

static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
					   struct amdgpu_hive_info *hive)
{
	char node[10];
	memset(node, 0, sizeof(node));

	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
	device_remove_file(adev->dev, &dev_attr_xgmi_error);

	if (hive->kobj.parent != (&adev->dev->kobj))
		sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

	sprintf(node, "node%d", atomic_read(&hive->number_devices));
	sysfs_remove_link(&hive->kobj, node);
}

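/*
 * Look up (or create) the hive this device belongs to.  The returned hive
 * carries an extra kobject reference taken on behalf of the caller, which
 * must be dropped with amdgpu_put_xgmi_hive() when it is no longer needed.
 * Returns NULL if the device has no hive_id or if the hive could not be
 * allocated.
 */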
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = NULL;
	int ret;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	if (adev->hive) {
		kobject_get(&adev->hive->kobj);
		return adev->hive;
	}

	mutex_lock(&xgmi_mutex);

	list_for_each_entry(hive, &xgmi_hive_list, node) {
		if (hive->hive_id == adev->gmc.xgmi.hive_id)
			goto pro_end;
	}

	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
	if (!hive) {
		dev_err(adev->dev, "XGMI: allocation failed\n");
		hive = NULL;
		goto pro_end;
	}

	/* initialize a new hive since none matched in the list */
	ret = kobject_init_and_add(&hive->kobj,
			&amdgpu_xgmi_hive_type,
			&adev->dev->kobj,
			"%s", "xgmi_hive_info");
	if (ret) {
		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
		/*
		 * kobject_put() drops the last reference and frees the hive
		 * through amdgpu_xgmi_hive_release(); do not kfree() it again.
		 */
		kobject_put(&hive->kobj);
		hive = NULL;
		goto pro_end;
	}

	hive->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&hive->device_list);
	INIT_LIST_HEAD(&hive->node);
	mutex_init(&hive->hive_lock);
	atomic_set(&hive->in_reset, 0);
	atomic_set(&hive->number_devices, 0);
	task_barrier_init(&hive->tb);
	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	hive->hi_req_gpu = NULL;
	/*
	 * The hive pstate is high at boot on Vega20, so start with the high
	 * request count saturated; it is decremented as devices request low
	 * pstate, allowing the sclk switch to work as expected.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}

void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
{
	if (hive)
		kobject_put(&hive->kobj);
}

int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* fw bug so temporarily disable pstate switching */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Vega20 only needs a single peer to request pstate high for the hive
	 * to go high, but all peers must request pstate low for the hive to
	 * go low.
	 */
	if (hive->pstate == pstate ||
			(!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}

int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	/* Each PSP needs to be given the latest topology */
	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}

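/*
 * psp_xgmi_node_info.num_hops packs more than the hop count: only the low
 * three bits (masked with 0x7 below) are the number of hops to the peer;
 * the remaining bits are used by the PSP for link-type/reserved fields.
 */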
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
			       struct amdgpu_device *peer_adev)
{
	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
	uint8_t num_hops_mask = 0x7;
	int i;

	for (i = 0; i < top->num_nodes; ++i)
		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
			return top->nodes[i].num_hops & num_hops_mask;
	return -EINVAL;
}

int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		ret = psp_xgmi_initialize(&adev->psp);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* update the node list for the other devices in the hive */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		/* get the latest topology info for each device from psp */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				goto exit_unlock;
			}
		}
	}

	if (!ret && !adev->gmc.xgmi.pending_reset)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}

int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Remove the hive from the global hive list */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return psp_xgmi_terminate(&adev->psp);
}

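/*
 * Register the XGMI/WAFL block with the RAS framework: allocate the
 * ras_common_if on first use, hook up the "xgmi_wafl_err_count" sysfs node,
 * and clear any stale PCS error status before reporting begins.
 */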
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
		strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}

static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
			adev->gmc.xgmi.ras_if) {
		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
		struct ras_ih_if ih_info = {
			.cb = NULL,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}

uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr)
{
	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
}

static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}

static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
{
	uint32_t i;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_arct[i]);
		break;
	case CHIP_VEGA20:
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
			pcs_clear_status(adev,
					 xgmi_pcs_err_status_reg_vg20[i]);
		break;
	default:
		break;
	}
}

static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs)
{
	int i;
	int ue_cnt;

	if (is_xgmi_pcs) {
		/* query xgmi pcs error status,
		 * only ue is supported */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
				  xgmi_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 xgmi_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	} else {
		/* query wafl pcs error status,
		 * only ue is supported */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  wafl_pcs_ras_fields[i].pcs_err_mask) >>
				  wafl_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 wafl_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	}

	return 0;
}

static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;
	uint32_t data;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return -EINVAL;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_VEGA20:
	default:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	}

	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;

	return 0;
}

const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = {
	.ras_late_init = amdgpu_xgmi_ras_late_init,
	.ras_fini = amdgpu_xgmi_ras_fini,
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
};