1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/list.h>
25#include "amdgpu.h"
26#include "amdgpu_xgmi.h"
27#include "amdgpu_ras.h"
28#include "soc15.h"
29#include "df/df_3_6_offset.h"
30#include "xgmi/xgmi_4_0_0_smn.h"
31#include "xgmi/xgmi_4_0_0_sh_mask.h"
32#include "wafl/wafl2_4_0_0_smn.h"
33#include "wafl/wafl2_4_0_0_sh_mask.h"
34
/* SMN offsets for PCS error status banks not provided by the headers above */
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210

/* Serializes all access to xgmi_hive_list */
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

/* Global list of all known XGMI hives; protected by xgmi_mutex */
static LIST_HEAD(xgmi_hive_list);
44
/*
 * Per-link PCS error status register banks.  Successive link instances
 * sit at a fixed 0x100000 SMN stride from the base register.
 */

/* Vega20: two XGMI link instances */
static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

/* Vega20: two WAFL link instances */
static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/*
 * Arcturus XGMI links.  Note the gap between +0x100000 and +0x500000 —
 * presumably not all instance slots are populated on this ASIC.
 */
static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* Arcturus: two WAFL link instances */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/* Aldebaran: eight XGMI 2/3 link instances */
static const int xgmi23_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI23_PCS_ERROR_STATUS,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000
};

/* Aldebaran: eight XGMI 3x16 link instances */
static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
	smnPCS_XGMI3X16_PCS_ERROR_STATUS,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
	smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
};

/* Aldebaran: two WAFL link instances ("walf" spelling kept for callers) */
static const int walf_pcs_err_status_reg_aldebaran[] = {
	smnPCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
};
96
/*
 * Field descriptors (name + mask/shift) used to decode an XGMI
 * PCS_ERROR_STATUS register value into named error counters.
 */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
135
/*
 * Field descriptors (name + mask/shift) used to decode a WAFL
 * PCS_ERROR_STATUS register value into named error counters.
 */
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* Read-only sysfs attribute exposing the hive id under the hive kobject */
static struct attribute amdgpu_xgmi_hive_id = {
	.name = "xgmi_hive_id",
	.mode = S_IRUGO
};

/* NULL-terminated default attribute list for the hive kobj_type */
static struct attribute *amdgpu_xgmi_hive_attrs[] = {
	&amdgpu_xgmi_hive_id,
	NULL
};
211
212static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj,
213 struct attribute *attr, char *buf)
214{
215 struct amdgpu_hive_info *hive = container_of(
216 kobj, struct amdgpu_hive_info, kobj);
217
218 if (attr == &amdgpu_xgmi_hive_id)
219 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
220
221 return 0;
222}
223
/*
 * kobject release callback, invoked when the last reference to a hive
 * kobject is dropped: tears down the hive lock and frees the hive.
 */
static void amdgpu_xgmi_hive_release(struct kobject *kobj)
{
	struct amdgpu_hive_info *hive = container_of(
		kobj, struct amdgpu_hive_info, kobj);

	mutex_destroy(&hive->hive_lock);
	kfree(hive);
}
232
/* sysfs ops for hive kobjects; attributes are read-only (no .store) */
static const struct sysfs_ops amdgpu_xgmi_hive_ops = {
	.show = amdgpu_xgmi_show_attrs,
};

/* kobj_type tying together release, show ops and the attribute list */
struct kobj_type amdgpu_xgmi_hive_type = {
	.release = amdgpu_xgmi_hive_release,
	.sysfs_ops = &amdgpu_xgmi_hive_ops,
	.default_attrs = amdgpu_xgmi_hive_attrs,
};
242
243static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
244 struct device_attribute *attr,
245 char *buf)
246{
247 struct drm_device *ddev = dev_get_drvdata(dev);
248 struct amdgpu_device *adev = drm_to_adev(ddev);
249
250 return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
251
252}
253
/* Build a DF FICA indirect-access address for offset @o */
#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
/*
 * sysfs show handler for the per-device "xgmi_error" file.
 *
 * Reads the xGMI error state through the DF FICA indirect interface and
 * prints an error count; the hardware counters are cleared afterwards,
 * so each read is destructive.
 */
static ssize_t amdgpu_xgmi_show_error(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
	uint64_t fica_out;
	unsigned int error_count = 0;

	/* FICA addresses of the PIE control and status registers */
	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);

	/* 0x1f in the control register indicates counters are enabled —
	 * NOTE(review): inferred from the check below; confirm in DF docs */
	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
	if (fica_out != 0x1f)
		pr_err("xGMI error counters not enabled!\n");

	fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);

	/* Error flags live in bits 63:62, valid when status[15:0] == 2 */
	if ((fica_out & 0xffff) == 2)
		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);

	/* Reset the counters for the next read */
	adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);

	return sysfs_emit(buf, "%u\n", error_count);
}
281
282
/* Per-device read-only sysfs files backed by the show handlers above */
static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
285
286static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
287 struct amdgpu_hive_info *hive)
288{
289 int ret = 0;
290 char node[10] = { 0 };
291
292
293 ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
294 if (ret) {
295 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
296 return ret;
297 }
298
299
300 ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
301 if (ret)
302 pr_err("failed to create xgmi_error\n");
303
304
305
306 if (hive->kobj.parent != (&adev->dev->kobj)) {
307 ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
308 "xgmi_hive_info");
309 if (ret) {
310 dev_err(adev->dev, "XGMI: Failed to create link to hive info");
311 goto remove_file;
312 }
313 }
314
315 sprintf(node, "node%d", atomic_read(&hive->number_devices));
316
317 ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
318 if (ret) {
319 dev_err(adev->dev, "XGMI: Failed to create link from hive info");
320 goto remove_link;
321 }
322
323 goto success;
324
325
326remove_link:
327 sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);
328
329remove_file:
330 device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
331
332success:
333 return ret;
334}
335
336static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
337 struct amdgpu_hive_info *hive)
338{
339 char node[10];
340 memset(node, 0, sizeof(node));
341
342 device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
343 device_remove_file(adev->dev, &dev_attr_xgmi_error);
344
345 if (hive->kobj.parent != (&adev->dev->kobj))
346 sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info");
347
348 sprintf(node, "node%d", atomic_read(&hive->number_devices));
349 sysfs_remove_link(&hive->kobj, node);
350
351}
352
353
354
/*
 * Look up (or create) the hive this device belongs to.
 *
 * On success the returned hive carries an extra kobject reference which
 * the caller must drop with amdgpu_put_xgmi_hive().  Returns NULL when
 * the device has no hive id or when allocation/kobject setup fails.
 */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = NULL;
	int ret;

	if (!adev->gmc.xgmi.hive_id)
		return NULL;

	/* Fast path: hive already cached on the device */
	if (adev->hive) {
		kobject_get(&adev->hive->kobj);
		return adev->hive;
	}

	mutex_lock(&xgmi_mutex);

	/* Search the global list for an existing hive with this id */
	list_for_each_entry(hive, &xgmi_hive_list, node) {
		if (hive->hive_id == adev->gmc.xgmi.hive_id)
			goto pro_end;
	}

	/* Not found: create a new hive entry */
	hive = kzalloc(sizeof(*hive), GFP_KERNEL);
	if (!hive) {
		dev_err(adev->dev, "XGMI: allocation failed\n");
		hive = NULL;
		goto pro_end;
	}

	/* The hive kobject is parented to the first device creating it */
	ret = kobject_init_and_add(&hive->kobj,
				   &amdgpu_xgmi_hive_type,
				   &adev->dev->kobj,
				   "%s", "xgmi_hive_info");
	if (ret) {
		dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
		kfree(hive);
		hive = NULL;
		goto pro_end;
	}

	hive->hive_id = adev->gmc.xgmi.hive_id;
	INIT_LIST_HEAD(&hive->device_list);
	INIT_LIST_HEAD(&hive->node);
	mutex_init(&hive->hive_lock);
	atomic_set(&hive->in_reset, 0);
	atomic_set(&hive->number_devices, 0);
	task_barrier_init(&hive->tb);
	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
	hive->hi_req_gpu = NULL;

	/*
	 * Start with a non-zero high-pstate request count —
	 * NOTE(review): presumably so the hive stays high until every
	 * device has requested low; confirm against the pstate logic.
	 */
	hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
	list_add_tail(&hive->node, &xgmi_hive_list);

pro_end:
	/* Take the reference handed back to the caller (both paths) */
	if (hive)
		kobject_get(&hive->kobj);
	mutex_unlock(&xgmi_mutex);
	return hive;
}
416
417void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive)
418{
419 if (hive)
420 kobject_put(&hive->kobj);
421}
422
/*
 * Request an XGMI pstate change for the hive this device belongs to.
 *
 * NOTE(review): the function returns unconditionally right after
 * resolving the hive, so everything past that point is deliberately
 * dead code — presumably pstate switching is temporarily disabled.
 * Confirm the reason before re-enabling the logic below.
 */
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
{
	int ret = 0;
	struct amdgpu_hive_info *hive;
	struct amdgpu_device *request_adev;
	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
	bool init_low;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return 0;

	/* The device that first requested high pstate owns the request */
	request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev;
	init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
	amdgpu_put_xgmi_hive(hive);
	/* Pstate switching disabled: nothing below is reachable */
	return 0;

	if (!hive || adev->asic_type != CHIP_VEGA20)
		return 0;

	mutex_lock(&hive->hive_lock);

	/* Track how many devices currently want the high pstate */
	if (is_hi_req)
		hive->hi_req_count++;
	else
		hive->hi_req_count--;

	/*
	 * Skip the switch when already at the target pstate, or when a
	 * low request arrives while other devices still want high
	 * (except during the initial low transition).
	 */
	if (hive->pstate == pstate ||
	    (!is_hi_req && hive->hi_req_count && !init_low))
		goto out;

	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
	if (ret) {
		dev_err(request_adev->dev,
			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
			request_adev->gmc.xgmi.node_id,
			request_adev->gmc.xgmi.hive_id, ret);
		goto out;
	}

	/* Record the new hive pstate and the requesting device */
	if (init_low)
		hive->pstate = hive->hi_req_count ?
					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
	else {
		hive->pstate = pstate;
		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
							adev : NULL;
	}
out:
	mutex_unlock(&hive->hive_lock);
	return ret;
}
482
/*
 * Push this device's cached topology to the PSP firmware for the whole
 * hive.  Returns the PSP call's status; failures are logged.
 */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
{
	int ret;

	ret = psp_xgmi_set_topology_info(&adev->psp,
					 atomic_read(&hive->number_devices),
					 &adev->psp.xgmi_context.top_info);
	if (ret)
		dev_err(adev->dev,
			"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
			adev->gmc.xgmi.node_id,
			adev->gmc.xgmi.hive_id, ret);

	return ret;
}
499
500
501
502
503
504
505
506
507int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
508 struct amdgpu_device *peer_adev)
509{
510 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
511 uint8_t num_hops_mask = 0x7;
512 int i;
513
514 for (i = 0 ; i < top->num_nodes; ++i)
515 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
516 return top->nodes[i].num_hops & num_hops_mask;
517 return -EINVAL;
518}
519
520int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
521 struct amdgpu_device *peer_adev)
522{
523 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
524 int i;
525
526 for (i = 0 ; i < top->num_nodes; ++i)
527 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
528 return top->nodes[i].num_links;
529 return -EINVAL;
530}
531
532
533
534
535
536
537
/*
 * Re-initialize the PSP xgmi session on every device in the hive for
 * the requested data partition (@set_extended_data selects the
 * extended-data session).  Stops and returns on the first failure.
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							  bool set_extended_data)
{
	struct amdgpu_device *tmp_adev;
	int ret;

	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
		ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
		if (ret) {
			dev_err(tmp_adev->dev,
				"XGMI: Failed to initialize xgmi session for data partition %i\n",
				set_extended_data);
			return ret;
		}

	}

	return 0;
}
557
/*
 * Register @adev with its XGMI hive: query hive/node ids from PSP,
 * attach the device to the hive's device list, exchange topology with
 * every peer, and create the sysfs entries.
 *
 * Returns 0 on success or a negative error code; on failure the hive
 * reference taken here is dropped again.
 */
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
	struct psp_xgmi_topology_info *top_info;
	struct amdgpu_hive_info *hive;
	struct amdgpu_xgmi *entry;
	struct amdgpu_device *tmp_adev = NULL;

	int count = 0, ret = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		/* Normal path: ids come from the PSP firmware */
		ret = psp_xgmi_initialize(&adev->psp, false, true);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to initialize xgmi session\n");
			return ret;
		}

		ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get hive id\n");
			return ret;
		}

		ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
		if (ret) {
			dev_err(adev->dev,
				"XGMI: Failed to get node id\n");
			return ret;
		}
	} else {
		/* PSP unavailable (e.g. pending reset): use fixed dummy
		 * ids derived from the physical node number */
		adev->gmc.xgmi.hive_id = 16;
		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
	}

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive) {
		ret = -EINVAL;
		dev_err(adev->dev,
			"XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
			adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
		goto exit;
	}
	mutex_lock(&hive->hive_lock);

	top_info = &adev->psp.xgmi_context.top_info;

	/* Add ourselves and rebuild the local node-id list */
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
		top_info->nodes[count++].node_id = entry->node_id;
	top_info->num_nodes = count;
	atomic_set(&hive->number_devices, count);

	task_barrier_add_task(&hive->tb);

	if (!adev->gmc.xgmi.pending_reset &&
	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			/* Append the new node to each peer's cached
			 * topology before pushing it to PSP */
			if (tmp_adev != adev) {
				top_info = &tmp_adev->psp.xgmi_context.top_info;
				top_info->nodes[count - 1].node_id =
					adev->gmc.xgmi.node_id;
				top_info->num_nodes = count;
			}
			ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
			if (ret)
				goto exit_unlock;
		}

		/* Refresh every device's topology from the firmware */
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
			ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
					&tmp_adev->psp.xgmi_context.top_info, false);
			if (ret) {
				dev_err(tmp_adev->dev,
					"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
					tmp_adev->gmc.xgmi.node_id,
					tmp_adev->gmc.xgmi.hive_id, ret);
				/* To do: continue with some node failed or disable the whole hive? */
				goto exit_unlock;
			}
		}

		/* Extended data round-trip, when the firmware supports it */
		if (adev->psp.xgmi_context.supports_extended_data) {

			/* Switch every session to the extended-data partition */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
			if (ret)
				goto exit_unlock;

			/* Fetch the extended topology */
			list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
				ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
						&tmp_adev->psp.xgmi_context.top_info, true);
				if (ret) {
					dev_err(tmp_adev->dev,
						"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
						tmp_adev->gmc.xgmi.node_id,
						tmp_adev->gmc.xgmi.hive_id, ret);
					goto exit_unlock;
				}
			}

			/* Switch the sessions back to the base partition */
			ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
			if (ret)
				goto exit_unlock;

		}
	}

	if (!ret && !adev->gmc.xgmi.pending_reset)
		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);

exit_unlock:
	mutex_unlock(&hive->hive_lock);
exit:
	if (!ret) {
		/* Cache the hive (keeps the reference from get above) */
		adev->hive = hive;
		dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
			 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
	} else {
		amdgpu_put_xgmi_hive(hive);
		dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
			adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
			ret);
	}

	return ret;
}
694
/*
 * Detach @adev from its hive: tear down sysfs, leave the task barrier
 * and device list, drop the device's hive reference, and — when this
 * was the last device — remove the hive from the global list and drop
 * the list's reference as well.
 */
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive = adev->hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	if (!hive)
		return -EINVAL;

	mutex_lock(&hive->hive_lock);
	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	if (hive->hi_req_gpu == adev)
		hive->hi_req_gpu = NULL;
	list_del(&adev->gmc.xgmi.head);
	mutex_unlock(&hive->hive_lock);

	/* Drop this device's reference; NOTE(review): accessing hive
	 * below presumably relies on the hive-list reference still
	 * pinning the kobject — verify against amdgpu_get_xgmi_hive() */
	amdgpu_put_xgmi_hive(hive);
	adev->hive = NULL;

	if (atomic_dec_return(&hive->number_devices) == 0) {
		/* Last device: unlink the hive and drop the list's ref */
		mutex_lock(&xgmi_mutex);
		list_del(&hive->node);
		mutex_unlock(&xgmi_mutex);

		amdgpu_put_xgmi_hive(hive);
	}

	return psp_xgmi_terminate(&adev->psp);
}
727
/*
 * Late RAS init for the XGMI/WAFL block: clear stale PCS error status,
 * allocate/register the RAS interface object, and create the
 * "xgmi_wafl_err_count" sysfs node.  No-op on non-XGMI configurations.
 */
static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
{
	int r;
	struct ras_ih_if ih_info = {
		.cb = NULL,
	};
	struct ras_fs_if fs_info = {
		.sysfs_name = "xgmi_wafl_err_count",
	};

	if (!adev->gmc.xgmi.supported ||
	    adev->gmc.xgmi.num_physical_nodes == 0)
		return 0;

	/* Start from a clean error state */
	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

	if (!adev->gmc.xgmi.ras_if) {
		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
		if (!adev->gmc.xgmi.ras_if)
			return -ENOMEM;
		adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
		adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		adev->gmc.xgmi.ras_if->sub_block_index = 0;
	}
	ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
	r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
				 &fs_info, &ih_info);
	/* Free the interface if init failed or the block isn't supported */
	if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
		kfree(adev->gmc.xgmi.ras_if);
		adev->gmc.xgmi.ras_if = NULL;
	}

	return r;
}
762
/*
 * Tear down the XGMI/WAFL RAS interface created by
 * amdgpu_xgmi_ras_late_init(), if it exists and the block is supported.
 */
static void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
{
	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
	    adev->gmc.xgmi.ras_if) {
		struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
		struct ras_ih_if ih_info = {
			.cb = NULL,
		};

		amdgpu_ras_late_fini(adev, ras_if, &ih_info);
		kfree(ras_if);
	}
}
776
777uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
778 uint64_t addr)
779{
780 struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
781 return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
782}
783
/* Clear a PCS error status register: write all-ones, then zero */
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}
789
790static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
791{
792 uint32_t i;
793
794 switch (adev->asic_type) {
795 case CHIP_ARCTURUS:
796 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
797 pcs_clear_status(adev,
798 xgmi_pcs_err_status_reg_arct[i]);
799 break;
800 case CHIP_VEGA20:
801 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
802 pcs_clear_status(adev,
803 xgmi_pcs_err_status_reg_vg20[i]);
804 break;
805 case CHIP_ALDEBARAN:
806 for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
807 pcs_clear_status(adev,
808 xgmi23_pcs_err_status_reg_aldebaran[i]);
809 for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
810 pcs_clear_status(adev,
811 xgmi23_pcs_err_status_reg_aldebaran[i]);
812 for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
813 pcs_clear_status(adev,
814 walf_pcs_err_status_reg_aldebaran[i]);
815 break;
816 default:
817 break;
818 }
819}
820
/*
 * Decode one PCS_ERROR_STATUS register @value using the XGMI or WAFL
 * field table (selected by @is_xgmi_pcs), log each error found and
 * accumulate the total into *@ue_count.
 *
 * NOTE(review): @ce_count is accepted but never written — all decoded
 * errors are treated as uncorrectable; confirm that is intended.
 * Always returns 0.
 */
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
					      uint32_t value,
					      uint32_t *ue_count,
					      uint32_t *ce_count,
					      bool is_xgmi_pcs)
{
	int i;
	int ue_cnt;

	if (is_xgmi_pcs) {
		/* Walk the XGMI field table */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
			ue_cnt = (value &
				  xgmi_pcs_ras_fields[i].pcs_err_mask) >>
				  xgmi_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 xgmi_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	} else {
		/* Walk the WAFL field table */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
			ue_cnt = (value &
				  wafl_pcs_ras_fields[i].pcs_err_mask) >>
				  wafl_pcs_ras_fields[i].pcs_err_shift;
			if (ue_cnt) {
				dev_info(adev->dev, "%s detected\n",
					 wafl_pcs_ras_fields[i].err_name);
				*ue_count += ue_cnt;
			}
		}
	}

	return 0;
}
860
/*
 * RAS query callback: read every XGMI/WAFL PCS error status register
 * for the current ASIC, decode the errors into @ras_error_status
 * (a struct ras_err_data), and clear the hardware counters afterwards.
 *
 * Returns 0 on success, -EINVAL if the XGMI_WAFL RAS block isn't
 * supported on this device.
 */
static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
					     void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	int i;
	uint32_t data;
	uint32_t ue_cnt = 0, ce_cnt = 0;

	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
		return -EINVAL;

	err_data->ue_count = 0;
	err_data->ce_count = 0;

	switch (adev->asic_type) {
	case CHIP_ARCTURUS:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_VEGA20:
		/* check xgmi pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
			data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	case CHIP_ALDEBARAN:
		/* check xgmi23 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check xgmi3x16 pcs error */
		for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, true);
		}
		/* check wafl pcs error */
		for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
			data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
			if (data)
				amdgpu_xgmi_query_pcs_error_status(adev,
						data, &ue_cnt, &ce_cnt, false);
		}
		break;
	default:
		dev_warn(adev->dev, "XGMI RAS error query not supported");
		break;
	}

	/* Counters are read-destructive: clear after every query */
	adev->gmc.xgmi.ras_funcs->reset_ras_error_count(adev);

	err_data->ue_count += ue_cnt;
	err_data->ce_count += ce_cnt;

	return 0;
}
943
/* RAS operations table for the XGMI/WAFL block */
const struct amdgpu_xgmi_ras_funcs xgmi_ras_funcs = {
	.ras_late_init = amdgpu_xgmi_ras_late_init,
	.ras_fini = amdgpu_xgmi_ras_fini,
	.query_ras_error_count = amdgpu_xgmi_query_ras_error_count,
	.reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count,
};
950