1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/list.h>
25#include "amdgpu.h"
26#include "amdgpu_xgmi.h"
27#include "amdgpu_smu.h"
28#include "amdgpu_ras.h"
29#include "soc15.h"
30#include "df/df_3_6_offset.h"
31#include "xgmi/xgmi_4_0_0_smn.h"
32#include "xgmi/xgmi_4_0_0_sh_mask.h"
33#include "wafl/wafl2_4_0_0_smn.h"
34#include "wafl/wafl2_4_0_0_sh_mask.h"
35
/* Serializes lookups and slot allocation in the global hive table below. */
static DEFINE_MUTEX(xgmi_mutex);

#define AMDGPU_MAX_XGMI_HIVE 8
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4

/*
 * Global registry of known XGMI hives. Slots are handed out in order as
 * new hive ids are discovered; a slot is never returned to the pool while
 * the module is loaded (see amdgpu_xgmi_remove_device()).
 */
static struct amdgpu_hive_info xgmi_hives[AMDGPU_MAX_XGMI_HIVE];
static unsigned hive_count = 0;
43
/*
 * Per-link PCS error status register addresses (SMN space), one entry per
 * PCS instance. NOTE(review): the 0x100000 stride between entries is
 * presumably the per-link SMN aperture size — confirm against the
 * register spec for each ASIC.
 */
static const int xgmi_pcs_err_status_reg_vg20[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
};

/* WAFL (inter-die) PCS error status registers for Vega20. */
static const int wafl_pcs_err_status_reg_vg20[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};

/* Arcturus exposes six XGMI PCS instances (note the gap after +0x100000). */
static const int xgmi_pcs_err_status_reg_arct[] = {
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000,
	smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000,
};

/* WAFL PCS error status registers for Arcturus. */
static const int wafl_pcs_err_status_reg_arct[] = {
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS,
	smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
};
68
/*
 * Decode table for the XGMI PCS error status register: human-readable
 * name plus mask/shift for each error counter field, consumed by
 * amdgpu_xgmi_query_pcs_error_status().
 */
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
	{"XGMI PCS DataLossErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
	{"XGMI PCS TrainingErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)},
	{"XGMI PCS CRCErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)},
	{"XGMI PCS BERExceededErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)},
	{"XGMI PCS TxMetaDataErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"XGMI PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"XGMI PCS DataParityErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)},
	{"XGMI PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"XGMI PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"XGMI PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"XGMI PCS DeskewErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)},
	{"XGMI PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"XGMI PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"XGMI PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"XGMI PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"XGMI PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"XGMI PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"XGMI PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};

/*
 * Decode table for the WAFL PCS error status register — same field set
 * as the XGMI table above, but taken from the WAFL register definition.
 */
static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = {
	{"WAFL PCS DataLossErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)},
	{"WAFL PCS TrainingErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)},
	{"WAFL PCS CRCErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)},
	{"WAFL PCS BERExceededErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)},
	{"WAFL PCS TxMetaDataErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)},
	{"WAFL PCS ReplayBufParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)},
	{"WAFL PCS DataParityErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)},
	{"WAFL PCS ReplayFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)},
	{"WAFL PCS ReplayFifoUnderflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)},
	{"WAFL PCS ElasticFifoOverflowErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)},
	{"WAFL PCS DeskewErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)},
	{"WAFL PCS DataStartupLimitErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)},
	{"WAFL PCS FCInitTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)},
	{"WAFL PCS RecoveryTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)},
	{"WAFL PCS ReadySerialTimeoutErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)},
	{"WAFL PCS ReadySerialAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)},
	{"WAFL PCS RecoveryAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)},
	{"WAFL PCS RecoveryRelockAttemptErr",
	 SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)},
};
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175static ssize_t amdgpu_xgmi_show_hive_id(struct device *dev,
176 struct device_attribute *attr, char *buf)
177{
178 struct amdgpu_hive_info *hive =
179 container_of(attr, struct amdgpu_hive_info, dev_attr);
180
181 return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
182}
183
184static int amdgpu_xgmi_sysfs_create(struct amdgpu_device *adev,
185 struct amdgpu_hive_info *hive)
186{
187 int ret = 0;
188
189 if (WARN_ON(hive->kobj))
190 return -EINVAL;
191
192 hive->kobj = kobject_create_and_add("xgmi_hive_info", &adev->dev->kobj);
193 if (!hive->kobj) {
194 dev_err(adev->dev, "XGMI: Failed to allocate sysfs entry!\n");
195 return -EINVAL;
196 }
197
198 hive->dev_attr = (struct device_attribute) {
199 .attr = {
200 .name = "xgmi_hive_id",
201 .mode = S_IRUGO,
202
203 },
204 .show = amdgpu_xgmi_show_hive_id,
205 };
206
207 ret = sysfs_create_file(hive->kobj, &hive->dev_attr.attr);
208 if (ret) {
209 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_hive_id\n");
210 kobject_del(hive->kobj);
211 kobject_put(hive->kobj);
212 hive->kobj = NULL;
213 }
214
215 return ret;
216}
217
218static void amdgpu_xgmi_sysfs_destroy(struct amdgpu_device *adev,
219 struct amdgpu_hive_info *hive)
220{
221 sysfs_remove_file(hive->kobj, &hive->dev_attr.attr);
222 kobject_del(hive->kobj);
223 kobject_put(hive->kobj);
224 hive->kobj = NULL;
225}
226
227static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
228 struct device_attribute *attr,
229 char *buf)
230{
231 struct drm_device *ddev = dev_get_drvdata(dev);
232 struct amdgpu_device *adev = ddev->dev_private;
233
234 return snprintf(buf, PAGE_SIZE, "%llu\n", adev->gmc.xgmi.node_id);
235
236}
237
238#define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801)
239static ssize_t amdgpu_xgmi_show_error(struct device *dev,
240 struct device_attribute *attr,
241 char *buf)
242{
243 struct drm_device *ddev = dev_get_drvdata(dev);
244 struct amdgpu_device *adev = ddev->dev_private;
245 uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
246 uint64_t fica_out;
247 unsigned int error_count = 0;
248
249 ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
250 ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
251
252 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in);
253 if (fica_out != 0x1f)
254 pr_err("xGMI error counters not enabled!\n");
255
256 fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in);
257
258 if ((fica_out & 0xffff) == 2)
259 error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
260
261 adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
262
263 return snprintf(buf, PAGE_SIZE, "%d\n", error_count);
264}
265
266
/* Per-device sysfs attributes: XGMI node id and accumulated error count. */
static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL);
269
270static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
271 struct amdgpu_hive_info *hive)
272{
273 int ret = 0;
274 char node[10] = { 0 };
275
276
277 ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
278 if (ret) {
279 dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
280 return ret;
281 }
282
283
284 ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
285 if (ret)
286 pr_err("failed to create xgmi_error\n");
287
288
289
290 if (adev != hive->adev) {
291 ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,
292 "xgmi_hive_info");
293 if (ret) {
294 dev_err(adev->dev, "XGMI: Failed to create link to hive info");
295 goto remove_file;
296 }
297 }
298
299 sprintf(node, "node%d", hive->number_devices);
300
301 ret = sysfs_create_link(hive->kobj, &adev->dev->kobj, node);
302 if (ret) {
303 dev_err(adev->dev, "XGMI: Failed to create link from hive info");
304 goto remove_link;
305 }
306
307 goto success;
308
309
310remove_link:
311 sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
312
313remove_file:
314 device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
315
316success:
317 return ret;
318}
319
320static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
321 struct amdgpu_hive_info *hive)
322{
323 char node[10];
324 memset(node, 0, sizeof(node));
325
326 device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
327 device_remove_file(adev->dev, &dev_attr_xgmi_error);
328
329 if (adev != hive->adev)
330 sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info");
331
332 sprintf(node, "node%d", hive->number_devices);
333 sysfs_remove_link(hive->kobj, node);
334
335}
336
337
338
339struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
340{
341 int i;
342 struct amdgpu_hive_info *tmp;
343
344 if (!adev->gmc.xgmi.hive_id)
345 return NULL;
346
347 mutex_lock(&xgmi_mutex);
348
349 for (i = 0 ; i < hive_count; ++i) {
350 tmp = &xgmi_hives[i];
351 if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
352 if (lock)
353 mutex_lock(&tmp->hive_lock);
354 mutex_unlock(&xgmi_mutex);
355 return tmp;
356 }
357 }
358 if (i >= AMDGPU_MAX_XGMI_HIVE) {
359 mutex_unlock(&xgmi_mutex);
360 return NULL;
361 }
362
363
364 tmp = &xgmi_hives[hive_count++];
365
366 if (amdgpu_xgmi_sysfs_create(adev, tmp)) {
367 mutex_unlock(&xgmi_mutex);
368 return NULL;
369 }
370
371 tmp->adev = adev;
372 tmp->hive_id = adev->gmc.xgmi.hive_id;
373 INIT_LIST_HEAD(&tmp->device_list);
374 mutex_init(&tmp->hive_lock);
375 mutex_init(&tmp->reset_lock);
376 task_barrier_init(&tmp->tb);
377
378 if (lock)
379 mutex_lock(&tmp->hive_lock);
380 tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
381 tmp->hi_req_gpu = NULL;
382
383
384
385
386 tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
387 mutex_unlock(&xgmi_mutex);
388
389 return tmp;
390}
391
392int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
393{
394 int ret = 0;
395 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
396 struct amdgpu_device *request_adev = hive->hi_req_gpu ?
397 hive->hi_req_gpu : adev;
398 bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
399 bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
400
401
402 return 0;
403
404 if (!hive || adev->asic_type != CHIP_VEGA20)
405 return 0;
406
407 mutex_lock(&hive->hive_lock);
408
409 if (is_hi_req)
410 hive->hi_req_count++;
411 else
412 hive->hi_req_count--;
413
414
415
416
417
418 if (hive->pstate == pstate ||
419 (!is_hi_req && hive->hi_req_count && !init_low))
420 goto out;
421
422 dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
423
424 ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
425 if (ret) {
426 dev_err(request_adev->dev,
427 "XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
428 request_adev->gmc.xgmi.node_id,
429 request_adev->gmc.xgmi.hive_id, ret);
430 goto out;
431 }
432
433 if (init_low)
434 hive->pstate = hive->hi_req_count ?
435 hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
436 else {
437 hive->pstate = pstate;
438 hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
439 adev : NULL;
440 }
441out:
442 mutex_unlock(&hive->hive_lock);
443 return ret;
444}
445
446int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
447{
448 int ret;
449
450
451 ret = psp_xgmi_set_topology_info(&adev->psp,
452 hive->number_devices,
453 &adev->psp.xgmi_context.top_info);
454 if (ret)
455 dev_err(adev->dev,
456 "XGMI: Set topology failure on device %llx, hive %llx, ret %d",
457 adev->gmc.xgmi.node_id,
458 adev->gmc.xgmi.hive_id, ret);
459
460 return ret;
461}
462
463
464int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
465 struct amdgpu_device *peer_adev)
466{
467 struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
468 int i;
469
470 for (i = 0 ; i < top->num_nodes; ++i)
471 if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
472 return top->nodes[i].num_hops;
473 return -EINVAL;
474}
475
476int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
477{
478 struct psp_xgmi_topology_info *top_info;
479 struct amdgpu_hive_info *hive;
480 struct amdgpu_xgmi *entry;
481 struct amdgpu_device *tmp_adev = NULL;
482
483 int count = 0, ret = 0;
484
485 if (!adev->gmc.xgmi.supported)
486 return 0;
487
488 if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
489 ret = psp_xgmi_initialize(&adev->psp);
490 if (ret) {
491 dev_err(adev->dev,
492 "XGMI: Failed to initialize xgmi session\n");
493 return ret;
494 }
495
496 ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
497 if (ret) {
498 dev_err(adev->dev,
499 "XGMI: Failed to get hive id\n");
500 return ret;
501 }
502
503 ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
504 if (ret) {
505 dev_err(adev->dev,
506 "XGMI: Failed to get node id\n");
507 return ret;
508 }
509 } else {
510 adev->gmc.xgmi.hive_id = 16;
511 adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
512 }
513
514 hive = amdgpu_get_xgmi_hive(adev, 1);
515 if (!hive) {
516 ret = -EINVAL;
517 dev_err(adev->dev,
518 "XGMI: node 0x%llx, can not match hive 0x%llx in the hive list.\n",
519 adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id);
520 goto exit;
521 }
522
523 top_info = &adev->psp.xgmi_context.top_info;
524
525 list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
526 list_for_each_entry(entry, &hive->device_list, head)
527 top_info->nodes[count++].node_id = entry->node_id;
528 top_info->num_nodes = count;
529 hive->number_devices = count;
530
531 task_barrier_add_task(&hive->tb);
532
533 if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
534 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
535
536 if (tmp_adev != adev) {
537 top_info = &tmp_adev->psp.xgmi_context.top_info;
538 top_info->nodes[count - 1].node_id =
539 adev->gmc.xgmi.node_id;
540 top_info->num_nodes = count;
541 }
542 ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
543 if (ret)
544 goto exit;
545 }
546
547
548 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
549 ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
550 &tmp_adev->psp.xgmi_context.top_info);
551 if (ret) {
552 dev_err(tmp_adev->dev,
553 "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
554 tmp_adev->gmc.xgmi.node_id,
555 tmp_adev->gmc.xgmi.hive_id, ret);
556
557 goto exit;
558 }
559 }
560 }
561
562 if (!ret)
563 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
564
565
566 mutex_unlock(&hive->hive_lock);
567exit:
568 if (!ret)
569 dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
570 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id);
571 else
572 dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
573 adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
574 ret);
575
576 return ret;
577}
578
/*
 * Unregister @adev from its hive: drop it from the task barrier, remove
 * its sysfs entries and, if it was the last member, tear down the hive's
 * sysfs kobject and locks. Finally terminates the PSP XGMI session.
 *
 * NOTE(review): the global hive_count is never decremented and the device
 * is not removed from hive->device_list here, so the hive slot is not
 * reclaimed after the last device leaves — confirm whether this is
 * intentional (hives persist for the module lifetime).
 */
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;

	if (!adev->gmc.xgmi.supported)
		return -EINVAL;

	/* Returns with hive->hive_lock held on success. */
	hive = amdgpu_get_xgmi_hive(adev, 1);
	if (!hive)
		return -EINVAL;

	task_barrier_rem_task(&hive->tb);
	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
	mutex_unlock(&hive->hive_lock);

	/* Last member out destroys the hive's sysfs entry and locks. */
	if(!(--hive->number_devices)){
		amdgpu_xgmi_sysfs_destroy(adev, hive);
		mutex_destroy(&hive->hive_lock);
		mutex_destroy(&hive->reset_lock);
	}

	return psp_xgmi_terminate(&adev->psp);
}
602
603int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
604{
605 int r;
606 struct ras_ih_if ih_info = {
607 .cb = NULL,
608 };
609 struct ras_fs_if fs_info = {
610 .sysfs_name = "xgmi_wafl_err_count",
611 };
612
613 if (!adev->gmc.xgmi.supported ||
614 adev->gmc.xgmi.num_physical_nodes == 0)
615 return 0;
616
617 amdgpu_xgmi_reset_ras_error_count(adev);
618
619 if (!adev->gmc.xgmi.ras_if) {
620 adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
621 if (!adev->gmc.xgmi.ras_if)
622 return -ENOMEM;
623 adev->gmc.xgmi.ras_if->block = AMDGPU_RAS_BLOCK__XGMI_WAFL;
624 adev->gmc.xgmi.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
625 adev->gmc.xgmi.ras_if->sub_block_index = 0;
626 strcpy(adev->gmc.xgmi.ras_if->name, "xgmi_wafl");
627 }
628 ih_info.head = fs_info.head = *adev->gmc.xgmi.ras_if;
629 r = amdgpu_ras_late_init(adev, adev->gmc.xgmi.ras_if,
630 &fs_info, &ih_info);
631 if (r || !amdgpu_ras_is_supported(adev, adev->gmc.xgmi.ras_if->block)) {
632 kfree(adev->gmc.xgmi.ras_if);
633 adev->gmc.xgmi.ras_if = NULL;
634 }
635
636 return r;
637}
638
639void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
640{
641 if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL) &&
642 adev->gmc.xgmi.ras_if) {
643 struct ras_common_if *ras_if = adev->gmc.xgmi.ras_if;
644 struct ras_ih_if ih_info = {
645 .cb = NULL,
646 };
647
648 amdgpu_ras_late_fini(adev, ras_if, &ih_info);
649 kfree(ras_if);
650 }
651}
652
653uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
654 uint64_t addr)
655{
656 struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
657 return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
658}
659
/*
 * Clear a PCS error status register.
 *
 * NOTE(review): the all-ones write followed by a zero write looks like a
 * write-1-to-clear sequence plus an explicit reset — confirm against the
 * PCS register specification before changing.
 */
static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
{
	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
	WREG32_PCIE(pcs_status_reg, 0);
}
665
666void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
667{
668 uint32_t i;
669
670 switch (adev->asic_type) {
671 case CHIP_ARCTURUS:
672 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
673 pcs_clear_status(adev,
674 xgmi_pcs_err_status_reg_arct[i]);
675 break;
676 case CHIP_VEGA20:
677 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
678 pcs_clear_status(adev,
679 xgmi_pcs_err_status_reg_vg20[i]);
680 break;
681 default:
682 break;
683 }
684}
685
686static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
687 uint32_t value,
688 uint32_t *ue_count,
689 uint32_t *ce_count,
690 bool is_xgmi_pcs)
691{
692 int i;
693 int ue_cnt;
694
695 if (is_xgmi_pcs) {
696
697
698 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_ras_fields); i ++) {
699 ue_cnt = (value &
700 xgmi_pcs_ras_fields[i].pcs_err_mask) >>
701 xgmi_pcs_ras_fields[i].pcs_err_shift;
702 if (ue_cnt) {
703 dev_info(adev->dev, "%s detected\n",
704 xgmi_pcs_ras_fields[i].err_name);
705 *ue_count += ue_cnt;
706 }
707 }
708 } else {
709
710
711 for (i = 0; i < ARRAY_SIZE(wafl_pcs_ras_fields); i++) {
712 ue_cnt = (value &
713 wafl_pcs_ras_fields[i].pcs_err_mask) >>
714 wafl_pcs_ras_fields[i].pcs_err_shift;
715 if (ue_cnt) {
716 dev_info(adev->dev, "%s detected\n",
717 wafl_pcs_ras_fields[i].err_name);
718 *ue_count += ue_cnt;
719 }
720 }
721 }
722
723 return 0;
724}
725
726int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
727 void *ras_error_status)
728{
729 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
730 int i;
731 uint32_t data;
732 uint32_t ue_cnt = 0, ce_cnt = 0;
733
734 if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
735 return -EINVAL;
736
737 err_data->ue_count = 0;
738 err_data->ce_count = 0;
739
740 switch (adev->asic_type) {
741 case CHIP_ARCTURUS:
742
743 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) {
744 data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]);
745 if (data)
746 amdgpu_xgmi_query_pcs_error_status(adev,
747 data, &ue_cnt, &ce_cnt, true);
748 }
749
750 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
751 data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
752 if (data)
753 amdgpu_xgmi_query_pcs_error_status(adev,
754 data, &ue_cnt, &ce_cnt, false);
755 }
756 break;
757 case CHIP_VEGA20:
758 default:
759
760 for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) {
761 data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]);
762 if (data)
763 amdgpu_xgmi_query_pcs_error_status(adev,
764 data, &ue_cnt, &ce_cnt, true);
765 }
766
767 for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
768 data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
769 if (data)
770 amdgpu_xgmi_query_pcs_error_status(adev,
771 data, &ue_cnt, &ce_cnt, false);
772 }
773 break;
774 }
775
776 amdgpu_xgmi_reset_ras_error_count(adev);
777
778 err_data->ue_count += ue_cnt;
779 err_data->ce_count += ce_cnt;
780
781 return 0;
782}
783