24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include <linux/uaccess.h>
28#include <linux/reboot.h>
29#include <linux/syscalls.h>
30
31#include "amdgpu.h"
32#include "amdgpu_ras.h"
33#include "amdgpu_atomfirmware.h"
34#include "amdgpu_xgmi.h"
35#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
36
37const char *ras_error_string[] = {
38 "none",
39 "parity",
40 "single_correctable",
41 "multi_uncorrectable",
42 "poison",
43};
44
45const char *ras_block_string[] = {
46 "umc",
47 "sdma",
48 "gfx",
49 "mmhub",
50 "athub",
51 "pcie_bif",
52 "hdp",
53 "xgmi_wafl",
54 "df",
55 "smn",
56 "sem",
57 "mp0",
58 "mp1",
59 "fuse",
60};
61
62#define ras_err_str(i) (ras_error_string[ffs(i)])
63#define ras_block_str(i) (ras_block_string[i])
64
65#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
66#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2
67#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
68
69
70#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
71
72enum amdgpu_ras_retire_page_reservation {
73 AMDGPU_RAS_RETIRE_PAGE_RESERVED,
74 AMDGPU_RAS_RETIRE_PAGE_PENDING,
75 AMDGPU_RAS_RETIRE_PAGE_FAULT,
76};
77
78atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
79
80static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
81 uint64_t addr);
82
83static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
84 size_t size, loff_t *pos)
85{
86 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
87 struct ras_query_if info = {
88 .head = obj->head,
89 };
90 ssize_t s;
91 char val[128];
92
93 if (amdgpu_ras_error_query(obj->adev, &info))
94 return -EINVAL;
95
96 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
97 "ue", info.ue_count,
98 "ce", info.ce_count);
99 if (*pos >= s)
100 return 0;
101
102 s -= *pos;
103 s = min_t(u64, s, size);
104
105
106 if (copy_to_user(buf, &val[*pos], s))
107 return -EINVAL;
108
109 *pos += s;
110
111 return s;
112}
113
114static const struct file_operations amdgpu_ras_debugfs_ops = {
115 .owner = THIS_MODULE,
116 .read = amdgpu_ras_debugfs_read,
117 .write = NULL,
118 .llseek = default_llseek
119};
120
121static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
122{
123 int i;
124
125 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
126 *block_id = i;
127 if (strcmp(name, ras_block_str(i)) == 0)
128 return 0;
129 }
130 return -EINVAL;
131}
132
133static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
134 const char __user *buf, size_t size,
135 loff_t *pos, struct ras_debug_if *data)
136{
137 ssize_t s = min_t(u64, 64, size);
138 char str[65];
139 char block_name[33];
140 char err[9] = "ue";
141 int op = -1;
142 int block_id;
143 uint32_t sub_block;
144 u64 address, value;
145
146 if (*pos)
147 return -EINVAL;
148 *pos = size;
149
150 memset(str, 0, sizeof(str));
151 memset(data, 0, sizeof(*data));
152
153 if (copy_from_user(str, buf, s))
154 return -EINVAL;
155
156 if (sscanf(str, "disable %32s", block_name) == 1)
157 op = 0;
158 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
159 op = 1;
160 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
161 op = 2;
162 else if (str[0] && str[1] && str[2] && str[3])
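		/* ASCII input, but no command matched */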
		return -EINVAL;
165
166 if (op != -1) {
167 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
168 return -EINVAL;
169
170 data->head.block = block_id;
171
172 if (!memcmp("ue", err, 2))
173 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
174 else if (!memcmp("ce", err, 2))
175 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
176 else
177 return -EINVAL;
178
179 data->op = op;
180
181 if (op == 2) {
182 if (sscanf(str, "%*s %*s %*s %u %llu %llu",
183 &sub_block, &address, &value) != 3)
184 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
185 &sub_block, &address, &value) != 3)
186 return -EINVAL;
187 data->head.sub_block_index = sub_block;
188 data->inject.address = address;
189 data->inject.value = value;
190 }
191 } else {
192 if (size < sizeof(*data))
193 return -EINVAL;
194
195 if (copy_from_user(data, buf, sizeof(*data)))
196 return -EINVAL;
197 }
198
199 return 0;
200}
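
/*
 * AMDGPU RAS debugfs control interface (ras_ctrl)
 *
 * The node accepts either a struct ras_debug_if written as binary, or a text
 * command in one of the forms parsed above:
 *
 *	disable <block>
 *	enable  <block> <error_type>
 *	inject  <block> <error_type> <sub_block> <address> <value>
 *
 * <block> is one of the names in ras_block_string (umc, sdma, gfx, ...),
 * <error_type> is "ue" or "ce", and the inject operands may be given in
 * decimal or hex ("0x...").
 *
 * Illustrative usage, assuming DRM minor 0 (the actual debugfs path depends
 * on the system):
 *
 *	echo "enable umc ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0x0 0x0 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 */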
277static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
278 size_t size, loff_t *pos)
279{
280 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
281 struct ras_debug_if data;
282 int ret = 0;
283
284 if (amdgpu_ras_intr_triggered()) {
285 DRM_WARN("RAS WARN: error injection currently inaccessible\n");
286 return size;
287 }
288
289 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
290 if (ret)
291 return -EINVAL;
292
293 if (!amdgpu_ras_is_supported(adev, data.head.block))
294 return -EINVAL;
295
296 switch (data.op) {
297 case 0:
298 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
299 break;
300 case 1:
301 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
302 break;
303 case 2:
304 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
305 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
306 ret = -EINVAL;
307 break;
308 }
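
		/*
		 * Skip injection into a UMC page that has already been
		 * retired as bad; only warn in that case.
		 */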
311 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
312 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
313 DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
314 data.inject.address);
315 break;
316 }
317
318
319 ret = amdgpu_ras_error_inject(adev, &data.inject);
320 break;
321 default:
322 ret = -EINVAL;
323 break;
324 }
325
326 if (ret)
327 return -EINVAL;
328
329 return size;
330}
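
/*
 * ras_eeprom_reset: writing anything to this node calls
 * amdgpu_ras_eeprom_reset_table() and thus clears the bad-page table kept in
 * EEPROM.  Illustrative usage, assuming DRM minor 0:
 *
 *	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset
 */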
348static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
349 size_t size, loff_t *pos)
350{
351 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
352 int ret;
353
354 ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);
355
356 return ret == 1 ? size : -EIO;
357}
358
359static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
360 .owner = THIS_MODULE,
361 .read = NULL,
362 .write = amdgpu_ras_debugfs_ctrl_write,
363 .llseek = default_llseek
364};
365
366static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
367 .owner = THIS_MODULE,
368 .read = NULL,
369 .write = amdgpu_ras_debugfs_eeprom_write,
370 .llseek = default_llseek
371};
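
/*
 * RAS sysfs error count interface: each registered block exposes a read-only
 * attribute under the device's "ras" group whose name is supplied by the IP
 * block through ras_fs_if (for UMC this is typically a node such as
 * "umc_err_count"; the exact name is set by the caller).  Reading it returns
 * the accumulated counts in the form:
 *
 *	ue: <count>
 *	ce: <count>
 *
 * Illustrative usage:
 *
 *	cat /sys/class/drm/card0/device/ras/umc_err_count
 */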
394static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
395 struct device_attribute *attr, char *buf)
396{
397 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
398 struct ras_query_if info = {
399 .head = obj->head,
400 };
401
402 if (amdgpu_ras_intr_triggered())
403 return snprintf(buf, PAGE_SIZE,
404 "Query currently inaccessible\n");
405
406 if (amdgpu_ras_error_query(obj->adev, &info))
407 return -EINVAL;
408
409 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
410 "ue", info.ue_count,
411 "ce", info.ce_count);
412}
413
414
415
416#define get_obj(obj) do { (obj)->use++; } while (0)
417#define alive_obj(obj) ((obj)->use)
418
419static inline void put_obj(struct ras_manager *obj)
420{
421 if (obj && --obj->use == 0)
422 list_del(&obj->node);
423 if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
425 }
426}
427
428
429static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
430 struct ras_common_if *head)
431{
432 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
433 struct ras_manager *obj;
434
435 if (!con)
436 return NULL;
437
438 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
439 return NULL;
440
441 obj = &con->objs[head->block];
442
443 if (alive_obj(obj))
444 return NULL;
445
446 obj->head = *head;
447 obj->adev = adev;
448 list_add(&obj->node, &con->head);
449 get_obj(obj);
450
451 return obj;
452}
453
454
455struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
456 struct ras_common_if *head)
457{
458 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
459 struct ras_manager *obj;
460 int i;
461
462 if (!con)
463 return NULL;
464
465 if (head) {
466 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
467 return NULL;
468
469 obj = &con->objs[head->block];
470
471 if (alive_obj(obj)) {
472 WARN_ON(head->block != obj->head.block);
473 return obj;
474 }
475 } else {
476 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
477 obj = &con->objs[i];
478 if (alive_obj(obj)) {
479 WARN_ON(i != obj->head.block);
480 return obj;
481 }
482 }
483 }
484
485 return NULL;
486}
487
488
489
490static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
491 struct ras_common_if *head)
492{
493 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
494
495 return con->hw_supported & BIT(head->block);
496}
497
498static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
499 struct ras_common_if *head)
500{
501 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
502
503 return con->features & BIT(head->block);
504}
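
/*
 * Update only the driver-side state for one RAS block: create or release the
 * ras_manager object and toggle the bit in con->features.  The PSP RAS TA is
 * not contacted here; amdgpu_ras_feature_enable() is the variant that also
 * sends the enable/disable command to the TA.
 */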
510static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
511 struct ras_common_if *head, int enable)
512{
513 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
514 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
515
516
517
518
519
520
521
522 if (!amdgpu_ras_is_feature_allowed(adev, head))
523 return 0;
524 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
525 return 0;
526
527 if (enable) {
528 if (!obj) {
529 obj = amdgpu_ras_create_obj(adev, head);
530 if (!obj)
531 return -EINVAL;
532 } else {
533
534 get_obj(obj);
535 }
536 con->features |= BIT(head->block);
537 } else {
538 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
539 con->features &= ~BIT(head->block);
540 put_obj(obj);
541 }
542 }
543
544 return 0;
545}
546
547
548int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
549 struct ras_common_if *head, bool enable)
550{
551 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
552 union ta_ras_cmd_input info;
553 int ret;
554
555 if (!con)
556 return -EINVAL;
557
558 if (!enable) {
559 info.disable_features = (struct ta_ras_disable_features_input) {
560 .block_id = amdgpu_ras_block_to_ta(head->block),
561 .error_type = amdgpu_ras_error_to_ta(head->type),
562 };
563 } else {
564 info.enable_features = (struct ta_ras_enable_features_input) {
565 .block_id = amdgpu_ras_block_to_ta(head->block),
566 .error_type = amdgpu_ras_error_to_ta(head->type),
567 };
568 }
569
570
571 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
572
573 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
574 return 0;
575
576 if (!amdgpu_ras_intr_triggered()) {
577 ret = psp_ras_enable_features(&adev->psp, &info, enable);
578 if (ret) {
579 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
580 enable ? "enable":"disable",
581 ras_block_str(head->block),
582 ret);
583 if (ret == TA_RAS_STATUS__RESET_NEEDED)
584 return -EAGAIN;
585 return -EINVAL;
586 }
587 }
588
589
590 __amdgpu_ras_feature_enable(adev, head, enable);
591
592 return 0;
593}
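
/*
 * Only used during driver init/resume: when the vbios has already enabled
 * RAS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS), the TA features are assumed to be on,
 * so enabling here mainly needs to set up the driver-side object; a TA
 * failure with -EINVAL is tolerated for that reason.  Disabling first creates
 * the object, then asks the TA to turn the feature off.
 */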
596int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
597 struct ras_common_if *head, bool enable)
598{
599 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
600 int ret;
601
602 if (!con)
603 return -EINVAL;
604
605 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
606 if (enable) {
607
608
609
610
611
612
613 ret = amdgpu_ras_feature_enable(adev, head, 1);
614
615
616
617
618 if (ret == -EINVAL) {
619 ret = __amdgpu_ras_feature_enable(adev, head, 1);
620 if (!ret)
621 DRM_INFO("RAS INFO: %s setup object\n",
622 ras_block_str(head->block));
623 }
624 } else {
625
626 ret = __amdgpu_ras_feature_enable(adev, head, 1);
627 if (ret)
628 return ret;
629
630 ret = amdgpu_ras_feature_enable(adev, head, 0);
631 }
632 } else
633 ret = amdgpu_ras_feature_enable(adev, head, enable);
634
635 return ret;
636}
637
638static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
639 bool bypass)
640{
641 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
642 struct ras_manager *obj, *tmp;
643
644 list_for_each_entry_safe(obj, tmp, &con->head, node) {
645
646
647
648 if (bypass) {
649 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
650 break;
651 } else {
652 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
653 break;
654 }
655 }
656
657 return con->features;
658}
659
660static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
661 bool bypass)
662{
663 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
664 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
665 int i;
666 const enum amdgpu_ras_error_type default_ras_type =
667 AMDGPU_RAS_ERROR__NONE;
668
669 for (i = 0; i < ras_block_count; i++) {
670 struct ras_common_if head = {
671 .block = i,
672 .type = default_ras_type,
673 .sub_block_index = 0,
674 };
675 strcpy(head.name, ras_block_str(i));
676 if (bypass) {
677
678
679
680
681 if (__amdgpu_ras_feature_enable(adev, &head, 1))
682 break;
683 } else {
684 if (amdgpu_ras_feature_enable(adev, &head, 1))
685 break;
686 }
687 }
688
689 return con->features;
690}
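
/*
 * Query the error counters of one RAS block.  The per-IP query callbacks fill
 * a local ras_err_data; the new counts are accumulated into the ras_manager
 * and the running totals are returned through @info.
 */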
694int amdgpu_ras_error_query(struct amdgpu_device *adev,
695 struct ras_query_if *info)
696{
697 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
698 struct ras_err_data err_data = {0, 0, 0, NULL};
699 int i;
700
701 if (!obj)
702 return -EINVAL;
703
704 switch (info->head.block) {
705 case AMDGPU_RAS_BLOCK__UMC:
706 if (adev->umc.funcs->query_ras_error_count)
707 adev->umc.funcs->query_ras_error_count(adev, &err_data);
708
709
710
711 if (adev->umc.funcs->query_ras_error_address)
712 adev->umc.funcs->query_ras_error_address(adev, &err_data);
713 break;
714 case AMDGPU_RAS_BLOCK__SDMA:
715 if (adev->sdma.funcs->query_ras_error_count) {
716 for (i = 0; i < adev->sdma.num_instances; i++)
717 adev->sdma.funcs->query_ras_error_count(adev, i,
718 &err_data);
719 }
720 break;
721 case AMDGPU_RAS_BLOCK__GFX:
722 if (adev->gfx.funcs->query_ras_error_count)
723 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
724 break;
725 case AMDGPU_RAS_BLOCK__MMHUB:
726 if (adev->mmhub.funcs->query_ras_error_count)
727 adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
728 break;
729 case AMDGPU_RAS_BLOCK__PCIE_BIF:
730 if (adev->nbio.funcs->query_ras_error_count)
731 adev->nbio.funcs->query_ras_error_count(adev, &err_data);
732 break;
733 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
734 amdgpu_xgmi_query_ras_error_count(adev, &err_data);
735 break;
736 default:
737 break;
738 }
739
740 obj->err_data.ue_count += err_data.ue_count;
741 obj->err_data.ce_count += err_data.ce_count;
742
743 info->ue_count = obj->err_data.ue_count;
744 info->ce_count = obj->err_data.ce_count;
745
746 if (err_data.ce_count) {
747 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
748 obj->err_data.ce_count, ras_block_str(info->head.block));
749 }
750 if (err_data.ue_count) {
751 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
752 obj->err_data.ue_count, ras_block_str(info->head.block));
753 }
754
755 return 0;
756}
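
/*
 * Inject an error into one RAS block.  GFX injection goes through the GFX IP
 * callback; UMC, MMHUB, XGMI_WAFL and PCIE_BIF requests are forwarded to the
 * RAS TA.  On multi-node XGMI setups the address is first converted to the
 * device-relative physical address.
 */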
759int amdgpu_ras_error_inject(struct amdgpu_device *adev,
760 struct ras_inject_if *info)
761{
762 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
763 struct ta_ras_trigger_error_input block_info = {
764 .block_id = amdgpu_ras_block_to_ta(info->head.block),
765 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
766 .sub_block_index = info->head.sub_block_index,
767 .address = info->address,
768 .value = info->value,
769 };
770 int ret = 0;
771
772 if (!obj)
773 return -EINVAL;
774
775
776 if (adev->gmc.xgmi.num_physical_nodes > 1) {
777 block_info.address =
778 amdgpu_xgmi_get_relative_phy_addr(adev,
779 block_info.address);
780 }
781
782 switch (info->head.block) {
783 case AMDGPU_RAS_BLOCK__GFX:
784 if (adev->gfx.funcs->ras_error_inject)
785 ret = adev->gfx.funcs->ras_error_inject(adev, info);
786 else
787 ret = -EINVAL;
788 break;
789 case AMDGPU_RAS_BLOCK__UMC:
790 case AMDGPU_RAS_BLOCK__MMHUB:
791 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
792 case AMDGPU_RAS_BLOCK__PCIE_BIF:
793 ret = psp_ras_trigger_error(&adev->psp, &block_info);
794 break;
795 default:
796 DRM_INFO("%s error injection is not supported yet\n",
797 ras_block_str(info->head.block));
798 ret = -EINVAL;
799 }
800
801 if (ret)
802 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
803 ras_block_str(info->head.block),
804 ret);
805
806 return ret;
807}
808
809int amdgpu_ras_error_cure(struct amdgpu_device *adev,
810 struct ras_cure_if *info)
811{
812
813 return 0;
814}
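
/*
 * Return the total correctable (is_ce == true) or uncorrectable error count
 * summed over all blocks that currently have a RAS object.
 */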
817unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
818 bool is_ce)
819{
820 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
821 struct ras_manager *obj;
822 struct ras_err_data data = {0, 0};
823
824 if (!con)
825 return 0;
826
827 list_for_each_entry(obj, &con->head, node) {
828 struct ras_query_if info = {
829 .head = obj->head,
830 };
831
832 if (amdgpu_ras_error_query(adev, &info))
833 return 0;
834
835 data.ce_count += info.ce_count;
836 data.ue_count += info.ue_count;
837 }
838
839 return is_ce ? data.ce_count : data.ue_count;
840}
841
842
843
844
845
846static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
847 struct ras_badpage **bps, unsigned int *count);
848
849static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
850{
851 switch (flags) {
852 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
853 return "R";
854 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
855 return "P";
856 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
857 default:
858 return "F";
	}
}
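
/*
 * gpu_vram_bad_pages sysfs format: one line per retired page,
 *
 *	0x<page frame> : 0x<size> : <flag>
 *
 * where <flag> is "R" (reserved, not accessible), "P" (reservation pending)
 * or "F" (reservation failed).  An illustrative line:
 *
 *	0x00000001 : 0x00001000 : R
 */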
892static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
893 struct kobject *kobj, struct bin_attribute *attr,
894 char *buf, loff_t ppos, size_t count)
895{
896 struct amdgpu_ras *con =
897 container_of(attr, struct amdgpu_ras, badpages_attr);
898 struct amdgpu_device *adev = con->adev;
899 const unsigned int element_size =
900 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
901 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
902 unsigned int end = div64_ul(ppos + count - 1, element_size);
903 ssize_t s = 0;
904 struct ras_badpage *bps = NULL;
905 unsigned int bps_count = 0;
906
907 memset(buf, 0, count);
908
909 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
910 return 0;
911
912 for (; start < end && start < bps_count; start++)
913 s += scnprintf(&buf[s], element_size + 1,
914 "0x%08x : 0x%08x : %1s\n",
915 bps[start].bp,
916 bps[start].size,
917 amdgpu_ras_badpage_flags_str(bps[start].flags));
918
919 kfree(bps);
920
921 return s;
922}
923
924static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
925 struct device_attribute *attr, char *buf)
926{
927 struct amdgpu_ras *con =
928 container_of(attr, struct amdgpu_ras, features_attr);
929
930 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
931}
932
933static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
934{
935 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
936 struct attribute *attrs[] = {
937 &con->features_attr.attr,
938 NULL
939 };
940 struct bin_attribute *bin_attrs[] = {
941 &con->badpages_attr,
942 NULL
943 };
944 struct attribute_group group = {
945 .name = "ras",
946 .attrs = attrs,
947 .bin_attrs = bin_attrs,
948 };
949
950 con->features_attr = (struct device_attribute) {
951 .attr = {
952 .name = "features",
953 .mode = S_IRUGO,
954 },
955 .show = amdgpu_ras_sysfs_features_read,
956 };
957
958 con->badpages_attr = (struct bin_attribute) {
959 .attr = {
960 .name = "gpu_vram_bad_pages",
961 .mode = S_IRUGO,
962 },
963 .size = 0,
964 .private = NULL,
965 .read = amdgpu_ras_sysfs_badpages_read,
966 };
967
968 sysfs_attr_init(attrs[0]);
969 sysfs_bin_attr_init(bin_attrs[0]);
970
971 return sysfs_create_group(&adev->dev->kobj, &group);
972}
973
974static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
975{
976 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
977 struct attribute *attrs[] = {
978 &con->features_attr.attr,
979 NULL
980 };
981 struct bin_attribute *bin_attrs[] = {
982 &con->badpages_attr,
983 NULL
984 };
985 struct attribute_group group = {
986 .name = "ras",
987 .attrs = attrs,
988 .bin_attrs = bin_attrs,
989 };
990
991 sysfs_remove_group(&adev->dev->kobj, &group);
992
993 return 0;
994}
995
996int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
997 struct ras_fs_if *head)
998{
999 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1000
1001 if (!obj || obj->attr_inuse)
1002 return -EINVAL;
1003
1004 get_obj(obj);
1005
1006 memcpy(obj->fs_data.sysfs_name,
1007 head->sysfs_name,
1008 sizeof(obj->fs_data.sysfs_name));
1009
1010 obj->sysfs_attr = (struct device_attribute){
1011 .attr = {
1012 .name = obj->fs_data.sysfs_name,
1013 .mode = S_IRUGO,
1014 },
1015 .show = amdgpu_ras_sysfs_read,
1016 };
1017 sysfs_attr_init(&obj->sysfs_attr.attr);
1018
1019 if (sysfs_add_file_to_group(&adev->dev->kobj,
1020 &obj->sysfs_attr.attr,
1021 "ras")) {
1022 put_obj(obj);
1023 return -EINVAL;
1024 }
1025
1026 obj->attr_inuse = 1;
1027
1028 return 0;
1029}
1030
1031int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1032 struct ras_common_if *head)
1033{
1034 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1035
1036 if (!obj || !obj->attr_inuse)
1037 return -EINVAL;
1038
1039 sysfs_remove_file_from_group(&adev->dev->kobj,
1040 &obj->sysfs_attr.attr,
1041 "ras");
1042 obj->attr_inuse = 0;
1043 put_obj(obj);
1044
1045 return 0;
1046}
1047
1048static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1049{
1050 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1051 struct ras_manager *obj, *tmp;
1052
1053 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1054 amdgpu_ras_sysfs_remove(adev, &obj->head);
1055 }
1056
1057 amdgpu_ras_sysfs_remove_feature_node(adev);
1058
1059 return 0;
1060}
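
/* The debugfs nodes below are created under <debugfs>/dri/<minor>/ras/. */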
1082static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1083{
1084 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1085 struct drm_minor *minor = adev->ddev->primary;
1086
1087 con->dir = debugfs_create_dir("ras", minor->debugfs_root);
1088 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
1089 adev, &amdgpu_ras_debugfs_ctrl_ops);
1090 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
1091 adev, &amdgpu_ras_debugfs_eeprom_ops);
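
	/*
	 * auto_reboot: when set, the fatal-error interrupt path is expected to
	 * reboot the machine directly instead of scheduling the normal GPU
	 * recovery.  The flag is only created here; it is consumed elsewhere.
	 */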
1101 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
1102 &con->reboot);
1103}
1104
1105void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1106 struct ras_fs_if *head)
1107{
1108 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1109 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1110
1111 if (!obj || obj->ent)
1112 return;
1113
1114 get_obj(obj);
1115
1116 memcpy(obj->fs_data.debugfs_name,
1117 head->debugfs_name,
1118 sizeof(obj->fs_data.debugfs_name));
1119
1120 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
1121 S_IWUGO | S_IRUGO, con->dir, obj,
1122 &amdgpu_ras_debugfs_ops);
1123}
1124
1125void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1126{
1127 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1128 struct ras_manager *obj;
1129 struct ras_fs_if fs_info;
1130
1131
1132
1133
1134
1135 if (!con)
1136 return;
1137
1138 amdgpu_ras_debugfs_create_ctrl_node(adev);
1139
1140 list_for_each_entry(obj, &con->head, node) {
1141 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1142 (obj->attr_inuse == 1)) {
1143 sprintf(fs_info.debugfs_name, "%s_err_inject",
1144 ras_block_str(obj->head.block));
1145 fs_info.head = obj->head;
1146 amdgpu_ras_debugfs_create(adev, &fs_info);
1147 }
1148 }
1149}
1150
1151void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
1152 struct ras_common_if *head)
1153{
1154 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1155
1156 if (!obj || !obj->ent)
1157 return;
1158
1159 debugfs_remove(obj->ent);
1160 obj->ent = NULL;
1161 put_obj(obj);
1162}
1163
1164static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
1165{
1166 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1167 struct ras_manager *obj, *tmp;
1168
1169 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1170 amdgpu_ras_debugfs_remove(adev, &obj->head);
1171 }
1172
1173 debugfs_remove_recursive(con->dir);
1174 con->dir = NULL;
1175}
1176
1177
1178
1179
1180static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1181{
1182 amdgpu_ras_sysfs_create_feature_node(adev);
1183
1184 return 0;
1185}
1186
1187static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1188{
1189 amdgpu_ras_debugfs_remove_all(adev);
1190 amdgpu_ras_sysfs_remove_all(adev);
1191 return 0;
1192}
1193
1194
1195
1196static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1197{
1198 struct ras_ih_data *data = &obj->ih_data;
1199 struct amdgpu_iv_entry entry;
1200 int ret;
1201 struct ras_err_data err_data = {0, 0, 0, NULL};
1202
1203 while (data->rptr != data->wptr) {
1204 rmb();
1205 memcpy(&entry, &data->ring[data->rptr],
1206 data->element_size);
1207
1208 wmb();
1209 data->rptr = (data->aligned_element_size +
1210 data->rptr) % data->ring_size;
1211
1212
1213
1214
1215 if (data->cb) {
1216 ret = data->cb(obj->adev, &err_data, &entry);
1217
1218
1219
1220
1221
1222 if (ret == AMDGPU_RAS_SUCCESS) {
1223
1224
1225
1226 obj->err_data.ue_count += err_data.ue_count;
1227 obj->err_data.ce_count += err_data.ce_count;
1228 }
1229 }
1230 }
1231}
1232
1233static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1234{
1235 struct ras_ih_data *data =
1236 container_of(work, struct ras_ih_data, ih_work);
1237 struct ras_manager *obj =
1238 container_of(data, struct ras_manager, ih_data);
1239
1240 amdgpu_ras_interrupt_handler(obj);
1241}
1242
1243int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1244 struct ras_dispatch_if *info)
1245{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
1251
1252 if (data->inuse == 0)
1253 return 0;
1254
1255
1256 memcpy(&data->ring[data->wptr], info->entry,
1257 data->element_size);
1258
1259 wmb();
1260 data->wptr = (data->aligned_element_size +
1261 data->wptr) % data->ring_size;
1262
1263 schedule_work(&data->ih_work);
1264
1265 return 0;
1266}
1267
1268int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1269 struct ras_ih_if *info)
1270{
1271 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1272 struct ras_ih_data *data;
1273
1274 if (!obj)
1275 return -EINVAL;
1276
1277 data = &obj->ih_data;
1278 if (data->inuse == 0)
1279 return 0;
1280
1281 cancel_work_sync(&data->ih_work);
1282
1283 kfree(data->ring);
1284 memset(data, 0, sizeof(*data));
1285 put_obj(obj);
1286
1287 return 0;
1288}
1289
1290int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1291 struct ras_ih_if *info)
1292{
1293 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1294 struct ras_ih_data *data;
1295
1296 if (!obj) {
1297
1298 obj = amdgpu_ras_create_obj(adev, &info->head);
1299 if (!obj)
1300 return -EINVAL;
1301 } else
1302 get_obj(obj);
1303
1304 data = &obj->ih_data;
1305
1306 *data = (struct ras_ih_data) {
1307 .inuse = 0,
1308 .cb = info->cb,
1309 .element_size = sizeof(struct amdgpu_iv_entry),
1310 .rptr = 0,
1311 .wptr = 0,
1312 };
1313
1314 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1315
1316 data->aligned_element_size = ALIGN(data->element_size, 8);
1317
1318 data->ring_size = 64 * data->aligned_element_size;
1319 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1320 if (!data->ring) {
1321 put_obj(obj);
1322 return -ENOMEM;
1323 }
1324
1325
1326 data->inuse = 1;
1327
1328 return 0;
1329}
1330
1331static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1332{
1333 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1334 struct ras_manager *obj, *tmp;
1335
1336 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1337 struct ras_ih_if info = {
1338 .head = obj->head,
1339 };
1340 amdgpu_ras_interrupt_remove_handler(adev, &info);
1341 }
1342
1343 return 0;
1344}
1345
1346
1347
1348static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1349{
1350 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1351 struct ras_manager *obj;
1352
1353 if (!con)
1354 return;
1355
1356 list_for_each_entry(obj, &con->head, node) {
1357 struct ras_query_if info = {
1358 .head = obj->head,
1359 };
1360
1361
1362
1363
1364
1365
1366
1367 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1368 continue;
1369
1370 amdgpu_ras_error_query(adev, &info);
1371 }
1372}
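
/*
 * Build an array describing every retired page for the sysfs reader above.
 * On success the caller owns *bps and must kfree() it; each entry carries the
 * page frame, its size and a reservation-status flag.
 */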
1379static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1380 struct ras_badpage **bps, unsigned int *count)
1381{
1382 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1383 struct ras_err_handler_data *data;
1384 int i = 0;
1385 int ret = 0;
1386
1387 if (!con || !con->eh_data || !bps || !count)
1388 return -EINVAL;
1389
1390 mutex_lock(&con->recovery_lock);
1391 data = con->eh_data;
1392 if (!data || data->count == 0) {
1393 *bps = NULL;
1394 ret = -EINVAL;
1395 goto out;
1396 }
1397
1398 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1399 if (!*bps) {
1400 ret = -ENOMEM;
1401 goto out;
1402 }
1403
1404 for (; i < data->count; i++) {
1405 (*bps)[i] = (struct ras_badpage){
1406 .bp = data->bps[i].retired_page,
1407 .size = AMDGPU_GPU_PAGE_SIZE,
1408 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1409 };
1410
1411 if (data->last_reserved <= i)
1412 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1413 else if (data->bps_bo[i] == NULL)
1414 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1415 }
1416
1417 *count = data->count;
1418out:
1419 mutex_unlock(&con->recovery_lock);
1420 return ret;
1421}
1422
1423static void amdgpu_ras_do_recovery(struct work_struct *work)
1424{
1425 struct amdgpu_ras *ras =
1426 container_of(work, struct amdgpu_ras, recovery_work);
1427 struct amdgpu_device *remote_adev = NULL;
1428 struct amdgpu_device *adev = ras->adev;
1429 struct list_head device_list, *device_list_handle = NULL;
1430 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
1431
1432
	/* Build the list of devices whose error counters are logged below. */
	if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
		device_list_handle = &hive->device_list;
	} else {
		INIT_LIST_HEAD(&device_list);
		list_add_tail(&adev->gmc.xgmi.head, &device_list);
		device_list_handle = &device_list;
	}
1439
1440 list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
1441 amdgpu_ras_log_on_err_counter(remote_adev);
1442 }
1443
	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, NULL);
1446 atomic_set(&ras->in_recovery, 0);
1447}
1448
1449
1450static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1451 struct ras_err_handler_data *data, int pages)
1452{
1453 unsigned int old_space = data->count + data->space_left;
1454 unsigned int new_space = old_space + pages;
1455 unsigned int align_space = ALIGN(new_space, 512);
1456 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1457 struct amdgpu_bo **bps_bo =
1458 kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
1459
1460 if (!bps || !bps_bo) {
1461 kfree(bps);
1462 kfree(bps_bo);
1463 return -ENOMEM;
1464 }
1465
1466 if (data->bps) {
1467 memcpy(bps, data->bps,
1468 data->count * sizeof(*data->bps));
1469 kfree(data->bps);
1470 }
1471 if (data->bps_bo) {
1472 memcpy(bps_bo, data->bps_bo,
1473 data->count * sizeof(*data->bps_bo));
1474 kfree(data->bps_bo);
1475 }
1476
1477 data->bps = bps;
1478 data->bps_bo = bps_bo;
1479 data->space_left += align_space - old_space;
1480 return 0;
1481}
1482
1483
1484int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1485 struct eeprom_table_record *bps, int pages)
1486{
1487 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1488 struct ras_err_handler_data *data;
1489 int ret = 0;
1490
1491 if (!con || !con->eh_data || !bps || pages <= 0)
1492 return 0;
1493
1494 mutex_lock(&con->recovery_lock);
1495 data = con->eh_data;
1496 if (!data)
1497 goto out;
1498
1499 if (data->space_left <= pages)
1500 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1501 ret = -ENOMEM;
1502 goto out;
1503 }
1504
1505 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
1506 data->count += pages;
1507 data->space_left -= pages;
1508
1509out:
1510 mutex_unlock(&con->recovery_lock);
1511
1512 return ret;
1513}
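
/*
 * Write the not-yet-saved bad-page records to the EEPROM table.  Meant to be
 * called with con->recovery_lock held (see amdgpu_ras_reserve_bad_pages()).
 */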
1519static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1520{
1521 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1522 struct ras_err_handler_data *data;
1523 struct amdgpu_ras_eeprom_control *control;
1524 int save_count;
1525
1526 if (!con || !con->eh_data)
1527 return 0;
1528
1529 control = &con->eeprom_control;
1530 data = con->eh_data;
1531 save_count = data->count - control->num_recs;
1532
1533 if (save_count > 0)
1534 if (amdgpu_ras_eeprom_process_recods(control,
1535 &data->bps[control->num_recs],
1536 true,
1537 save_count)) {
1538 DRM_ERROR("Failed to save EEPROM table data!");
1539 return -EIO;
1540 }
1541
1542 return 0;
1543}
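
/*
 * Read the bad-page records already stored in EEPROM and append them to the
 * driver's list via amdgpu_ras_add_bad_pages().
 */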
1549static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1550{
1551 struct amdgpu_ras_eeprom_control *control =
1552 &adev->psp.ras.ras->eeprom_control;
1553 struct eeprom_table_record *bps = NULL;
1554 int ret = 0;
1555
1556
1557 if (!control->num_recs)
1558 return ret;
1559
1560 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
1561 if (!bps)
1562 return -ENOMEM;
1563
1564 if (amdgpu_ras_eeprom_process_recods(control, bps, false,
1565 control->num_recs)) {
1566 DRM_ERROR("Failed to load EEPROM table records!");
1567 ret = -EIO;
1568 goto out;
1569 }
1570
1571 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
1572
1573out:
1574 kfree(bps);
1575 return ret;
1576}
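
/*
 * Check whether an address falls on a page that has already been retired.
 * Used, for example, to refuse error injection into known-bad UMC pages.
 */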
1583static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1584 uint64_t addr)
1585{
1586 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1587 struct ras_err_handler_data *data;
1588 int i;
1589 bool ret = false;
1590
1591 if (!con || !con->eh_data)
1592 return ret;
1593
1594 mutex_lock(&con->recovery_lock);
1595 data = con->eh_data;
1596 if (!data)
1597 goto out;
1598
1599 addr >>= AMDGPU_GPU_PAGE_SHIFT;
1600 for (i = 0; i < data->count; i++)
1601 if (addr == data->bps[i].retired_page) {
1602 ret = true;
1603 goto out;
1604 }
1605
1606out:
1607 mutex_unlock(&con->recovery_lock);
1608 return ret;
1609}
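
/*
 * Reserve the VRAM backing every not-yet-reserved bad page, then persist the
 * new records to EEPROM via amdgpu_ras_save_bad_pages().
 */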
1612int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1613{
1614 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1615 struct ras_err_handler_data *data;
1616 uint64_t bp;
1617 struct amdgpu_bo *bo = NULL;
1618 int i, ret = 0;
1619
1620 if (!con || !con->eh_data)
1621 return 0;
1622
1623 mutex_lock(&con->recovery_lock);
1624 data = con->eh_data;
1625 if (!data)
1626 goto out;
1627
1628 for (i = data->last_reserved; i < data->count; i++) {
1629 bp = data->bps[i].retired_page;
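
		/*
		 * Reservation may fail; only warn in that case.  bps_bo[i] is
		 * then left NULL and the page shows up as "F" (faulted) in
		 * the gpu_vram_bad_pages listing.
		 */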
1636 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
1637 AMDGPU_GPU_PAGE_SIZE,
1638 AMDGPU_GEM_DOMAIN_VRAM,
1639 &bo, NULL))
1640 DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp);
1641
1642 data->bps_bo[i] = bo;
1643 data->last_reserved = i + 1;
1644 bo = NULL;
1645 }
1646
1647
1648 ret = amdgpu_ras_save_bad_pages(adev);
1649out:
1650 mutex_unlock(&con->recovery_lock);
1651 return ret;
1652}
1653
1654
1655static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1656{
1657 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1658 struct ras_err_handler_data *data;
1659 struct amdgpu_bo *bo;
1660 int i;
1661
1662 if (!con || !con->eh_data)
1663 return 0;
1664
1665 mutex_lock(&con->recovery_lock);
1666 data = con->eh_data;
1667 if (!data)
1668 goto out;
1669
1670 for (i = data->last_reserved - 1; i >= 0; i--) {
1671 bo = data->bps_bo[i];
1672
1673 amdgpu_bo_free_kernel(&bo, NULL, NULL);
1674
1675 data->bps_bo[i] = bo;
1676 data->last_reserved = i;
1677 }
1678out:
1679 mutex_unlock(&con->recovery_lock);
1680 return 0;
1681}
1682
1683int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1684{
1685 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1686 struct ras_err_handler_data **data;
1687 int ret;
1688
1689 if (con)
1690 data = &con->eh_data;
1691 else
1692 return 0;
1693
1694 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
1695 if (!*data) {
1696 ret = -ENOMEM;
1697 goto out;
1698 }
1699
1700 mutex_init(&con->recovery_lock);
1701 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1702 atomic_set(&con->in_recovery, 0);
1703 con->adev = adev;
1704
1705 ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
1706 if (ret)
1707 goto free;
1708
1709 if (con->eeprom_control.num_recs) {
1710 ret = amdgpu_ras_load_bad_pages(adev);
1711 if (ret)
1712 goto free;
1713 ret = amdgpu_ras_reserve_bad_pages(adev);
1714 if (ret)
1715 goto release;
1716 }
1717
1718 return 0;
1719
1720release:
1721 amdgpu_ras_release_bad_pages(adev);
1722free:
1723 kfree((*data)->bps);
1724 kfree((*data)->bps_bo);
1725 kfree(*data);
1726 con->eh_data = NULL;
1727out:
1728 DRM_WARN("Failed to initialize ras recovery!\n");
1729
1730 return ret;
1731}
1732
1733static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1734{
1735 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1736 struct ras_err_handler_data *data = con->eh_data;
1737
1738
1739 if (!data)
1740 return 0;
1741
1742 cancel_work_sync(&con->recovery_work);
1743 amdgpu_ras_release_bad_pages(adev);
1744
1745 mutex_lock(&con->recovery_lock);
1746 con->eh_data = NULL;
1747 kfree(data->bps);
1748 kfree(data->bps_bo);
1749 kfree(data);
1750 mutex_unlock(&con->recovery_lock);
1751
1752 return 0;
1753}
1754
1755
1756
1757int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1758 unsigned int block)
1759{
1760 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1761
1762 if (!ras)
1763 return -EINVAL;
1764
1765 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1766 return 0;
1767}
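
/*
 * Probe what the hardware/firmware reports as RAS capable (*hw_supported) and
 * then apply the amdgpu_ras_enable and amdgpu_ras_mask module parameters on
 * top of it to form the effective *supported mask.
 */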
1778static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1779 uint32_t *hw_supported, uint32_t *supported)
1780{
1781 *hw_supported = 0;
1782 *supported = 0;
1783
1784 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
1785 (adev->asic_type != CHIP_VEGA20 &&
1786 adev->asic_type != CHIP_ARCTURUS))
1787 return;
1788
1789 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
1790 DRM_INFO("HBM ECC is active.\n");
1791 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
1792 1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("HBM ECC is not present.\n");
1795
1796 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
1797 DRM_INFO("SRAM ECC is active.\n");
1798 *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1799 1 << AMDGPU_RAS_BLOCK__DF);
	} else
		DRM_INFO("SRAM ECC is not present.\n");
1802
1803
1804 *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
1805
1806 *supported = amdgpu_ras_enable == 0 ?
1807 0 : *hw_supported & amdgpu_ras_mask;
1808}
1809
1810int amdgpu_ras_init(struct amdgpu_device *adev)
1811{
1812 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1813 int r;
1814
1815 if (con)
1816 return 0;
1817
1818 con = kmalloc(sizeof(struct amdgpu_ras) +
1819 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1820 GFP_KERNEL|__GFP_ZERO);
1821 if (!con)
1822 return -ENOMEM;
1823
1824 con->objs = (struct ras_manager *)(con + 1);
1825
1826 amdgpu_ras_set_context(adev, con);
1827
1828 amdgpu_ras_check_supported(adev, &con->hw_supported,
1829 &con->supported);
1830 if (!con->hw_supported) {
1831 amdgpu_ras_set_context(adev, NULL);
1832 kfree(con);
1833 return 0;
1834 }
1835
1836 con->features = 0;
1837 INIT_LIST_HEAD(&con->head);
1838
1839 con->flags = RAS_DEFAULT_FLAGS;
1840
1841 if (adev->nbio.funcs->init_ras_controller_interrupt) {
1842 r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
1843 if (r)
1844 return r;
1845 }
1846
1847 if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
1848 r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
1849 if (r)
1850 return r;
1851 }
1852
1853 amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1854
1855 if (amdgpu_ras_fs_init(adev))
1856 goto fs_out;
1857
1858 DRM_INFO("RAS INFO: ras initialized successfully, "
1859 "hardware ability[%x] ras_mask[%x]\n",
1860 con->hw_supported, con->supported);
1861 return 0;
1862fs_out:
1863 amdgpu_ras_set_context(adev, NULL);
1864 kfree(con);
1865
1866 return -EINVAL;
1867}
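
/*
 * Common helper for IP-block late init: enable the RAS feature, register the
 * optional interrupt handler and create the sysfs node.  -EAGAIN from the TA
 * is translated into a reset request on boot rather than a failure, and an
 * unsupported block is simply disabled.
 */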
1870int amdgpu_ras_late_init(struct amdgpu_device *adev,
1871 struct ras_common_if *ras_block,
1872 struct ras_fs_if *fs_info,
1873 struct ras_ih_if *ih_info)
1874{
1875 int r;
1876
1877
1878 if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
1879 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
1880 return 0;
1881 }
1882
1883 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
1884 if (r) {
1885 if (r == -EAGAIN) {
1886
1887 amdgpu_ras_request_reset_on_boot(adev,
1888 ras_block->block);
1889 return 0;
1890 } else if (adev->in_suspend || adev->in_gpu_reset) {
1891
1892
1893 goto cleanup;
1894 } else
1895 return r;
1896 }
1897
1898
1899 if (adev->in_suspend || adev->in_gpu_reset)
1900 return 0;
1901
1902 if (ih_info->cb) {
1903 r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
1904 if (r)
1905 goto interrupt;
1906 }
1907
1908 r = amdgpu_ras_sysfs_create(adev, fs_info);
1909 if (r)
1910 goto sysfs;
1911
1912 return 0;
1913cleanup:
1914 amdgpu_ras_sysfs_remove(adev, ras_block);
1915sysfs:
1916 if (ih_info->cb)
1917 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
1918interrupt:
1919 amdgpu_ras_feature_enable(adev, ras_block, 0);
1920 return r;
1921}
1922
1923
1924void amdgpu_ras_late_fini(struct amdgpu_device *adev,
1925 struct ras_common_if *ras_block,
1926 struct ras_ih_if *ih_info)
1927{
1928 if (!ras_block || !ih_info)
1929 return;
1930
1931 amdgpu_ras_sysfs_remove(adev, ras_block);
1932 if (ih_info->cb)
1933 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
1934 amdgpu_ras_feature_enable(adev, ras_block, 0);
1935}
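
/*
 * Called on boot, resume and after GPU reset.  With vbios-enabled RAS all
 * features are first marked enabled on the driver side, then the blocks the
 * driver does not support are turned back off; a pending INIT_NEED_RESET
 * request triggers a GPU reset here.
 */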
1940void amdgpu_ras_resume(struct amdgpu_device *adev)
1941{
1942 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1943 struct ras_manager *obj, *tmp;
1944
1945 if (!con)
1946 return;
1947
1948 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
1949
1950
1951
1952
1953
1954 amdgpu_ras_enable_all_features(adev, 1);
1955
1956
1957
1958
1959
1960 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1961 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1962 amdgpu_ras_feature_enable(adev, &obj->head, 0);
1963
1964 WARN_ON(alive_obj(obj));
1965 }
1966 }
1967 }
1968
1969 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1970 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1971
1972
1973
1974
1975
1976
1977
1978 amdgpu_ras_disable_all_features(adev, 1);
1979 amdgpu_ras_reset_gpu(adev);
1980 }
1981}
1982
1983void amdgpu_ras_suspend(struct amdgpu_device *adev)
1984{
1985 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1986
1987 if (!con)
1988 return;
1989
1990 amdgpu_ras_disable_all_features(adev, 0);
1991
1992 if (con->features)
1993 amdgpu_ras_disable_all_features(adev, 1);
1994}
1995
1996
1997int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
1998{
1999 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2000
2001 if (!con)
2002 return 0;
2003
2004
2005 amdgpu_ras_disable_all_features(adev, 0);
2006 amdgpu_ras_recovery_fini(adev);
2007 return 0;
2008}
2009
2010int amdgpu_ras_fini(struct amdgpu_device *adev)
2011{
2012 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2013
2014 if (!con)
2015 return 0;
2016
2017 amdgpu_ras_fs_fini(adev);
2018 amdgpu_ras_interrupt_remove_all(adev);
2019
2020 WARN(con->features, "Feature mask is not cleared");
2021
2022 if (con->features)
2023 amdgpu_ras_disable_all_features(adev, 1);
2024
2025 amdgpu_ras_set_context(adev, NULL);
2026 kfree(con);
2027
2028 return 0;
2029}
2030
2031void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
2032{
2033 uint32_t hw_supported, supported;
2034
2035 amdgpu_ras_check_supported(adev, &hw_supported, &supported);
2036 if (!hw_supported)
2037 return;
2038
2039 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
2040 DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
2041
2042 amdgpu_ras_reset_gpu(adev);
2043 }
2044}
2045