1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include <linux/uaccess.h>
28#include <linux/reboot.h>
29#include <linux/syscalls.h>
30
31#include "amdgpu.h"
32#include "amdgpu_ras.h"
33#include "amdgpu_atomfirmware.h"
34#include "amdgpu_xgmi.h"
35#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
36
/*
 * Name shared by the sysfs attribute group and the debugfs directory
 * created further down in this file.
 */
static const char *RAS_FS_NAME = "ras";

/*
 * Printable names of the RAS error types; looked up through ras_err_str().
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

/*
 * Printable names of the RAS blocks; looked up through ras_block_str()
 * and searched by amdgpu_ras_find_block_id_by_name().
 */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

/*
 * ras_err_str() indexes the name table with ffs(i), i.e. the 1-based
 * position of the lowest set bit of @i (0 selects "none").
 * NOTE(review): presumably the error type values are single-bit flags;
 * confirm against the enum in the header.
 * ras_block_str() indexes the block name table directly, no range check.
 */
#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

/* Flag set written back to the RAS context after an EEPROM table reset. */
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)


/* Injection addresses at or above this value (1 << 52) are rejected. */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)


/* 100 * 1024 * 1024; not referenced in the part of the file visible here. */
#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)

/* States mapped to one-letter codes by amdgpu_ras_badpage_flags_str(). */
enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

/* NOTE(review): looks like a global "RAS interrupt seen" flag; users are outside this view. */
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

/* Forward declarations; the definitions are not part of the visible chunk. */
static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);
87
88void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
89{
90 if (adev && amdgpu_ras_get_context(adev))
91 amdgpu_ras_get_context(adev)->error_query_ready = ready;
92}
93
94static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
95{
96 if (adev && amdgpu_ras_get_context(adev))
97 return amdgpu_ras_get_context(adev)->error_query_ready;
98
99 return false;
100}
101
102static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
103 size_t size, loff_t *pos)
104{
105 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
106 struct ras_query_if info = {
107 .head = obj->head,
108 };
109 ssize_t s;
110 char val[128];
111
112 if (amdgpu_ras_error_query(obj->adev, &info))
113 return -EINVAL;
114
115 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
116 "ue", info.ue_count,
117 "ce", info.ce_count);
118 if (*pos >= s)
119 return 0;
120
121 s -= *pos;
122 s = min_t(u64, s, size);
123
124
125 if (copy_to_user(buf, &val[*pos], s))
126 return -EINVAL;
127
128 *pos += s;
129
130 return s;
131}
132
133static const struct file_operations amdgpu_ras_debugfs_ops = {
134 .owner = THIS_MODULE,
135 .read = amdgpu_ras_debugfs_read,
136 .write = NULL,
137 .llseek = default_llseek
138};
139
140static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
141{
142 int i;
143
144 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
145 *block_id = i;
146 if (strcmp(name, ras_block_str(i)) == 0)
147 return 0;
148 }
149 return -EINVAL;
150}
151
/*
 * Parse one write to the ras_ctrl debugfs node into @data.
 *
 * Accepted text forms (at most the first 64 bytes are examined):
 *   "disable <block>"
 *   "enable <block> <err>"
 *   "inject <block> <err> <sub_block> <address> <value>"
 * where <err> must start with "ue" or "ce" and the three inject numbers
 * are either all decimal or all 0x-prefixed hex.
 *
 * If no text form matches and at least one of the first four bytes is
 * zero, the buffer is instead copied verbatim as a struct ras_debug_if.
 *
 * Only a write at offset 0 is accepted; *pos is then advanced by @size.
 * Returns 0 on success and -EINVAL on any failure.
 */
static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	/* "disable" carries no error type, so it keeps this default */
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	/* str is one byte larger than the copy limit and zeroed: always terminated */
	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* four non-zero leading bytes: treated as unrecognized text */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only the first two characters of the error token are compared */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			/* try decimal first; fall back to 0x-prefixed hex */
			if (sscanf(str, "%*s %*s %*s %u %llu %llu",
						&sub_block, &address, &value) != 3)
				if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
							&sub_block, &address, &value) != 3)
					return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		/* binary form: the caller passed a raw struct ras_debug_if */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
297 size_t size, loff_t *pos)
298{
299 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
300 struct ras_debug_if data;
301 int ret = 0;
302
303 if (!amdgpu_ras_get_error_query_ready(adev)) {
304 dev_warn(adev->dev, "RAS WARN: error injection "
305 "currently inaccessible\n");
306 return size;
307 }
308
309 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
310 if (ret)
311 return -EINVAL;
312
313 if (!amdgpu_ras_is_supported(adev, data.head.block))
314 return -EINVAL;
315
316 switch (data.op) {
317 case 0:
318 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
319 break;
320 case 1:
321 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
322 break;
323 case 2:
324 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
325 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
326 dev_warn(adev->dev, "RAS WARN: input address "
327 "0x%llx is invalid.",
328 data.inject.address);
329 ret = -EINVAL;
330 break;
331 }
332
333
334 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
335 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
336 dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
337 "as bad before error injection!\n",
338 data.inject.address);
339 break;
340 }
341
342
343 ret = amdgpu_ras_error_inject(adev, &data.inject);
344 break;
345 default:
346 ret = -EINVAL;
347 break;
348 }
349
350 if (ret)
351 return -EINVAL;
352
353 return size;
354}
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
373 size_t size, loff_t *pos)
374{
375 struct amdgpu_device *adev =
376 (struct amdgpu_device *)file_inode(f)->i_private;
377 int ret;
378
379 ret = amdgpu_ras_eeprom_reset_table(
380 &(amdgpu_ras_get_context(adev)->eeprom_control));
381
382 if (ret == 1) {
383 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
384 return size;
385 } else {
386 return -EIO;
387 }
388}
389
390static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
391 .owner = THIS_MODULE,
392 .read = NULL,
393 .write = amdgpu_ras_debugfs_ctrl_write,
394 .llseek = default_llseek
395};
396
397static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
398 .owner = THIS_MODULE,
399 .read = NULL,
400 .write = amdgpu_ras_debugfs_eeprom_write,
401 .llseek = default_llseek
402};
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
426 struct device_attribute *attr, char *buf)
427{
428 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
429 struct ras_query_if info = {
430 .head = obj->head,
431 };
432
433 if (!amdgpu_ras_get_error_query_ready(obj->adev))
434 return snprintf(buf, PAGE_SIZE,
435 "Query currently inaccessible\n");
436
437 if (amdgpu_ras_error_query(obj->adev, &info))
438 return -EINVAL;
439
440 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
441 "ue", info.ue_count,
442 "ce", info.ce_count);
443}
444
445
446
447#define get_obj(obj) do { (obj)->use++; } while (0)
448#define alive_obj(obj) ((obj)->use)
449
450static inline void put_obj(struct ras_manager *obj)
451{
452 if (obj && --obj->use == 0)
453 list_del(&obj->node);
454 if (obj && obj->use < 0) {
455 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
456 }
457}
458
459
460static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
461 struct ras_common_if *head)
462{
463 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
464 struct ras_manager *obj;
465
466 if (!con)
467 return NULL;
468
469 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
470 return NULL;
471
472 obj = &con->objs[head->block];
473
474 if (alive_obj(obj))
475 return NULL;
476
477 obj->head = *head;
478 obj->adev = adev;
479 list_add(&obj->node, &con->head);
480 get_obj(obj);
481
482 return obj;
483}
484
485
486struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
487 struct ras_common_if *head)
488{
489 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
490 struct ras_manager *obj;
491 int i;
492
493 if (!con)
494 return NULL;
495
496 if (head) {
497 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
498 return NULL;
499
500 obj = &con->objs[head->block];
501
502 if (alive_obj(obj)) {
503 WARN_ON(head->block != obj->head.block);
504 return obj;
505 }
506 } else {
507 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
508 obj = &con->objs[i];
509 if (alive_obj(obj)) {
510 WARN_ON(i != obj->head.block);
511 return obj;
512 }
513 }
514 }
515
516 return NULL;
517}
518
519
520static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
521 const char* invoke_type,
522 const char* block_name,
523 enum ta_ras_status ret)
524{
525 switch (ret) {
526 case TA_RAS_STATUS__SUCCESS:
527 return;
528 case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE:
529 dev_warn(adev->dev,
530 "RAS WARN: %s %s currently unavailable\n",
531 invoke_type,
532 block_name);
533 break;
534 default:
535 dev_err(adev->dev,
536 "RAS ERROR: %s %s error failed ret 0x%X\n",
537 invoke_type,
538 block_name,
539 ret);
540 }
541}
542
543
544static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
545 struct ras_common_if *head)
546{
547 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
548
549 return con->hw_supported & BIT(head->block);
550}
551
552static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
553 struct ras_common_if *head)
554{
555 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
556
557 return con->features & BIT(head->block);
558}
559
560
561
562
563
564static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
565 struct ras_common_if *head, int enable)
566{
567 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
568 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
569
570
571
572
573
574
575
576 if (!amdgpu_ras_is_feature_allowed(adev, head))
577 return 0;
578 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
579 return 0;
580
581 if (enable) {
582 if (!obj) {
583 obj = amdgpu_ras_create_obj(adev, head);
584 if (!obj)
585 return -EINVAL;
586 } else {
587
588 get_obj(obj);
589 }
590 con->features |= BIT(head->block);
591 } else {
592 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
593 con->features &= ~BIT(head->block);
594 put_obj(obj);
595 }
596 }
597
598 return 0;
599}
600
601
602int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
603 struct ras_common_if *head, bool enable)
604{
605 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
606 union ta_ras_cmd_input *info;
607 int ret;
608
609 if (!con)
610 return -EINVAL;
611
612 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
613 if (!info)
614 return -ENOMEM;
615
616 if (!enable) {
617 info->disable_features = (struct ta_ras_disable_features_input) {
618 .block_id = amdgpu_ras_block_to_ta(head->block),
619 .error_type = amdgpu_ras_error_to_ta(head->type),
620 };
621 } else {
622 info->enable_features = (struct ta_ras_enable_features_input) {
623 .block_id = amdgpu_ras_block_to_ta(head->block),
624 .error_type = amdgpu_ras_error_to_ta(head->type),
625 };
626 }
627
628
629 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
630
631 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
632 ret = 0;
633 goto out;
634 }
635
636 if (!amdgpu_ras_intr_triggered()) {
637 ret = psp_ras_enable_features(&adev->psp, info, enable);
638 if (ret) {
639 amdgpu_ras_parse_status_code(adev,
640 enable ? "enable":"disable",
641 ras_block_str(head->block),
642 (enum ta_ras_status)ret);
643 if (ret == TA_RAS_STATUS__RESET_NEEDED)
644 ret = -EAGAIN;
645 else
646 ret = -EINVAL;
647
648 goto out;
649 }
650 }
651
652
653 __amdgpu_ras_feature_enable(adev, head, enable);
654 ret = 0;
655out:
656 kfree(info);
657 return ret;
658}
659
660
661int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
662 struct ras_common_if *head, bool enable)
663{
664 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
665 int ret;
666
667 if (!con)
668 return -EINVAL;
669
670 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
671 if (enable) {
672
673
674
675
676
677
678 ret = amdgpu_ras_feature_enable(adev, head, 1);
679
680
681
682
683 if (ret == -EINVAL) {
684 ret = __amdgpu_ras_feature_enable(adev, head, 1);
685 if (!ret)
686 dev_info(adev->dev,
687 "RAS INFO: %s setup object\n",
688 ras_block_str(head->block));
689 }
690 } else {
691
692 ret = __amdgpu_ras_feature_enable(adev, head, 1);
693 if (ret)
694 return ret;
695
696 ret = amdgpu_ras_feature_enable(adev, head, 0);
697 }
698 } else
699 ret = amdgpu_ras_feature_enable(adev, head, enable);
700
701 return ret;
702}
703
704static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
705 bool bypass)
706{
707 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
708 struct ras_manager *obj, *tmp;
709
710 list_for_each_entry_safe(obj, tmp, &con->head, node) {
711
712
713
714 if (bypass) {
715 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
716 break;
717 } else {
718 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
719 break;
720 }
721 }
722
723 return con->features;
724}
725
726static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
727 bool bypass)
728{
729 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
730 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
731 int i;
732 const enum amdgpu_ras_error_type default_ras_type =
733 AMDGPU_RAS_ERROR__NONE;
734
735 for (i = 0; i < ras_block_count; i++) {
736 struct ras_common_if head = {
737 .block = i,
738 .type = default_ras_type,
739 .sub_block_index = 0,
740 };
741 strcpy(head.name, ras_block_str(i));
742 if (bypass) {
743
744
745
746
747 if (__amdgpu_ras_feature_enable(adev, &head, 1))
748 break;
749 } else {
750 if (amdgpu_ras_feature_enable(adev, &head, 1))
751 break;
752 }
753 }
754
755 return con->features;
756}
757
758
759
760int amdgpu_ras_error_query(struct amdgpu_device *adev,
761 struct ras_query_if *info)
762{
763 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
764 struct ras_err_data err_data = {0, 0, 0, NULL};
765 int i;
766
767 if (!obj)
768 return -EINVAL;
769
770 switch (info->head.block) {
771 case AMDGPU_RAS_BLOCK__UMC:
772 if (adev->umc.funcs->query_ras_error_count)
773 adev->umc.funcs->query_ras_error_count(adev, &err_data);
774
775
776
777 if (adev->umc.funcs->query_ras_error_address)
778 adev->umc.funcs->query_ras_error_address(adev, &err_data);
779 break;
780 case AMDGPU_RAS_BLOCK__SDMA:
781 if (adev->sdma.funcs->query_ras_error_count) {
782 for (i = 0; i < adev->sdma.num_instances; i++)
783 adev->sdma.funcs->query_ras_error_count(adev, i,
784 &err_data);
785 }
786 break;
787 case AMDGPU_RAS_BLOCK__GFX:
788 if (adev->gfx.funcs->query_ras_error_count)
789 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
790 break;
791 case AMDGPU_RAS_BLOCK__MMHUB:
792 if (adev->mmhub.funcs->query_ras_error_count)
793 adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
794 break;
795 case AMDGPU_RAS_BLOCK__PCIE_BIF:
796 if (adev->nbio.funcs->query_ras_error_count)
797 adev->nbio.funcs->query_ras_error_count(adev, &err_data);
798 break;
799 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
800 amdgpu_xgmi_query_ras_error_count(adev, &err_data);
801 break;
802 default:
803 break;
804 }
805
806 obj->err_data.ue_count += err_data.ue_count;
807 obj->err_data.ce_count += err_data.ce_count;
808
809 info->ue_count = obj->err_data.ue_count;
810 info->ce_count = obj->err_data.ce_count;
811
812 if (err_data.ce_count) {
813 dev_info(adev->dev, "%ld correctable hardware errors "
814 "detected in %s block, no user "
815 "action is needed.\n",
816 obj->err_data.ce_count,
817 ras_block_str(info->head.block));
818 }
819 if (err_data.ue_count) {
820 dev_info(adev->dev, "%ld uncorrectable hardware errors "
821 "detected in %s block\n",
822 obj->err_data.ue_count,
823 ras_block_str(info->head.block));
824 }
825
826 return 0;
827}
828
829
830static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
831 struct ta_ras_trigger_error_input *block_info)
832{
833 int ret;
834
835 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
836 dev_warn(adev->dev, "Failed to disallow df cstate");
837
838 if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
839 dev_warn(adev->dev, "Failed to disallow XGMI power down");
840
841 ret = psp_ras_trigger_error(&adev->psp, block_info);
842
843 if (amdgpu_ras_intr_triggered())
844 return ret;
845
846 if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
847 dev_warn(adev->dev, "Failed to allow XGMI power down");
848
849 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
850 dev_warn(adev->dev, "Failed to allow df cstate");
851
852 return ret;
853}
854
855
856int amdgpu_ras_error_inject(struct amdgpu_device *adev,
857 struct ras_inject_if *info)
858{
859 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
860 struct ta_ras_trigger_error_input block_info = {
861 .block_id = amdgpu_ras_block_to_ta(info->head.block),
862 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
863 .sub_block_index = info->head.sub_block_index,
864 .address = info->address,
865 .value = info->value,
866 };
867 int ret = 0;
868
869 if (!obj)
870 return -EINVAL;
871
872
873 if (adev->gmc.xgmi.num_physical_nodes > 1) {
874 block_info.address =
875 amdgpu_xgmi_get_relative_phy_addr(adev,
876 block_info.address);
877 }
878
879 switch (info->head.block) {
880 case AMDGPU_RAS_BLOCK__GFX:
881 if (adev->gfx.funcs->ras_error_inject)
882 ret = adev->gfx.funcs->ras_error_inject(adev, info);
883 else
884 ret = -EINVAL;
885 break;
886 case AMDGPU_RAS_BLOCK__UMC:
887 case AMDGPU_RAS_BLOCK__MMHUB:
888 case AMDGPU_RAS_BLOCK__PCIE_BIF:
889 ret = psp_ras_trigger_error(&adev->psp, &block_info);
890 break;
891 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
892 ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
893 break;
894 default:
895 dev_info(adev->dev, "%s error injection is not supported yet\n",
896 ras_block_str(info->head.block));
897 ret = -EINVAL;
898 }
899
900 amdgpu_ras_parse_status_code(adev,
901 "inject",
902 ras_block_str(info->head.block),
903 (enum ta_ras_status)ret);
904
905 return ret;
906}
907
908
909unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
910 bool is_ce)
911{
912 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
913 struct ras_manager *obj;
914 struct ras_err_data data = {0, 0};
915
916 if (!con)
917 return 0;
918
919 list_for_each_entry(obj, &con->head, node) {
920 struct ras_query_if info = {
921 .head = obj->head,
922 };
923
924 if (amdgpu_ras_error_query(adev, &info))
925 return 0;
926
927 data.ce_count += info.ce_count;
928 data.ue_count += info.ue_count;
929 }
930
931 return is_ce ? data.ce_count : data.ue_count;
932}
933
934
935
936
937
938static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
939 struct ras_badpage **bps, unsigned int *count);
940
941static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
942{
943 switch (flags) {
944 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
945 return "R";
946 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
947 return "P";
948 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
949 default:
950 return "F";
951 }
952}
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
985 struct kobject *kobj, struct bin_attribute *attr,
986 char *buf, loff_t ppos, size_t count)
987{
988 struct amdgpu_ras *con =
989 container_of(attr, struct amdgpu_ras, badpages_attr);
990 struct amdgpu_device *adev = con->adev;
991 const unsigned int element_size =
992 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
993 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
994 unsigned int end = div64_ul(ppos + count - 1, element_size);
995 ssize_t s = 0;
996 struct ras_badpage *bps = NULL;
997 unsigned int bps_count = 0;
998
999 memset(buf, 0, count);
1000
1001 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1002 return 0;
1003
1004 for (; start < end && start < bps_count; start++)
1005 s += scnprintf(&buf[s], element_size + 1,
1006 "0x%08x : 0x%08x : %1s\n",
1007 bps[start].bp,
1008 bps[start].size,
1009 amdgpu_ras_badpage_flags_str(bps[start].flags));
1010
1011 kfree(bps);
1012
1013 return s;
1014}
1015
1016static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1017 struct device_attribute *attr, char *buf)
1018{
1019 struct amdgpu_ras *con =
1020 container_of(attr, struct amdgpu_ras, features_attr);
1021
1022 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
1023}
1024
1025static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1026{
1027 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1028
1029 sysfs_remove_file_from_group(&adev->dev->kobj,
1030 &con->badpages_attr.attr,
1031 RAS_FS_NAME);
1032}
1033
1034static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1035{
1036 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1037 struct attribute *attrs[] = {
1038 &con->features_attr.attr,
1039 NULL
1040 };
1041 struct attribute_group group = {
1042 .name = RAS_FS_NAME,
1043 .attrs = attrs,
1044 };
1045
1046 sysfs_remove_group(&adev->dev->kobj, &group);
1047
1048 return 0;
1049}
1050
1051int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
1052 struct ras_fs_if *head)
1053{
1054 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1055
1056 if (!obj || obj->attr_inuse)
1057 return -EINVAL;
1058
1059 get_obj(obj);
1060
1061 memcpy(obj->fs_data.sysfs_name,
1062 head->sysfs_name,
1063 sizeof(obj->fs_data.sysfs_name));
1064
1065 obj->sysfs_attr = (struct device_attribute){
1066 .attr = {
1067 .name = obj->fs_data.sysfs_name,
1068 .mode = S_IRUGO,
1069 },
1070 .show = amdgpu_ras_sysfs_read,
1071 };
1072 sysfs_attr_init(&obj->sysfs_attr.attr);
1073
1074 if (sysfs_add_file_to_group(&adev->dev->kobj,
1075 &obj->sysfs_attr.attr,
1076 RAS_FS_NAME)) {
1077 put_obj(obj);
1078 return -EINVAL;
1079 }
1080
1081 obj->attr_inuse = 1;
1082
1083 return 0;
1084}
1085
1086int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1087 struct ras_common_if *head)
1088{
1089 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1090
1091 if (!obj || !obj->attr_inuse)
1092 return -EINVAL;
1093
1094 sysfs_remove_file_from_group(&adev->dev->kobj,
1095 &obj->sysfs_attr.attr,
1096 RAS_FS_NAME);
1097 obj->attr_inuse = 0;
1098 put_obj(obj);
1099
1100 return 0;
1101}
1102
1103static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1104{
1105 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1106 struct ras_manager *obj, *tmp;
1107
1108 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1109 amdgpu_ras_sysfs_remove(adev, &obj->head);
1110 }
1111
1112 if (amdgpu_bad_page_threshold != 0)
1113 amdgpu_ras_sysfs_remove_bad_page_node(adev);
1114
1115 amdgpu_ras_sysfs_remove_feature_node(adev);
1116
1117 return 0;
1118}
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1141{
1142 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1143 struct drm_minor *minor = adev_to_drm(adev)->primary;
1144
1145 con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
1146 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
1147 adev, &amdgpu_ras_debugfs_ctrl_ops);
1148 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
1149 adev, &amdgpu_ras_debugfs_eeprom_ops);
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
1160 &con->reboot);
1161
1162
1163
1164
1165
1166 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
1167 con->dir, &con->disable_ras_err_cnt_harvest);
1168}
1169
1170static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1171 struct ras_fs_if *head)
1172{
1173 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1174 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1175
1176 if (!obj || obj->ent)
1177 return;
1178
1179 get_obj(obj);
1180
1181 memcpy(obj->fs_data.debugfs_name,
1182 head->debugfs_name,
1183 sizeof(obj->fs_data.debugfs_name));
1184
1185 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
1186 S_IWUGO | S_IRUGO, con->dir, obj,
1187 &amdgpu_ras_debugfs_ops);
1188}
1189
1190void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1191{
1192 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1193 struct ras_manager *obj;
1194 struct ras_fs_if fs_info;
1195
1196
1197
1198
1199
1200 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1201 return;
1202
1203 amdgpu_ras_debugfs_create_ctrl_node(adev);
1204
1205 list_for_each_entry(obj, &con->head, node) {
1206 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1207 (obj->attr_inuse == 1)) {
1208 sprintf(fs_info.debugfs_name, "%s_err_inject",
1209 ras_block_str(obj->head.block));
1210 fs_info.head = obj->head;
1211 amdgpu_ras_debugfs_create(adev, &fs_info);
1212 }
1213 }
1214}
1215
1216static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
1217 struct ras_common_if *head)
1218{
1219 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1220
1221 if (!obj || !obj->ent)
1222 return;
1223
1224 obj->ent = NULL;
1225 put_obj(obj);
1226}
1227
1228static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
1229{
1230 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1231 struct ras_manager *obj, *tmp;
1232
1233 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1234 amdgpu_ras_debugfs_remove(adev, &obj->head);
1235 }
1236
1237 con->dir = NULL;
1238}
1239
1240
1241
1242static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1243 amdgpu_ras_sysfs_badpages_read, NULL, 0);
1244static DEVICE_ATTR(features, S_IRUGO,
1245 amdgpu_ras_sysfs_features_read, NULL);
1246static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1247{
1248 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1249 struct attribute_group group = {
1250 .name = RAS_FS_NAME,
1251 };
1252 struct attribute *attrs[] = {
1253 &con->features_attr.attr,
1254 NULL
1255 };
1256 struct bin_attribute *bin_attrs[] = {
1257 NULL,
1258 NULL,
1259 };
1260 int r;
1261
1262
1263 con->features_attr = dev_attr_features;
1264 group.attrs = attrs;
1265 sysfs_attr_init(attrs[0]);
1266
1267 if (amdgpu_bad_page_threshold != 0) {
1268
1269 bin_attr_gpu_vram_bad_pages.private = NULL;
1270 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1271 bin_attrs[0] = &con->badpages_attr;
1272 group.bin_attrs = bin_attrs;
1273 sysfs_bin_attr_init(bin_attrs[0]);
1274 }
1275
1276 r = sysfs_create_group(&adev->dev->kobj, &group);
1277 if (r)
1278 dev_err(adev->dev, "Failed to create RAS sysfs group!");
1279
1280 return 0;
1281}
1282
1283static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1284{
1285 if (IS_ENABLED(CONFIG_DEBUG_FS))
1286 amdgpu_ras_debugfs_remove_all(adev);
1287 amdgpu_ras_sysfs_remove_all(adev);
1288 return 0;
1289}
1290
1291
1292
/*
 * Drain the interrupt ring of @obj.
 *
 * Each pending entry is copied out of the ring, the read pointer is
 * advanced, and the entry is passed to the registered callback (if any).
 * When the callback returns AMDGPU_RAS_SUCCESS, the counts it reported in
 * err_data are added to the object's running totals.
 */
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	/*
	 * NOTE(review): err_data is initialized once and not reset between
	 * loop iterations - confirm the callbacks expect that.
	 */
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		/* read barrier before the entry is copied out of the ring */
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		/* write barrier before the read pointer is advanced */
		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* only a successful callback contributes to the totals */
			if (ret == AMDGPU_RAS_SUCCESS) {
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}
1329
1330static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1331{
1332 struct ras_ih_data *data =
1333 container_of(work, struct ras_ih_data, ih_work);
1334 struct ras_manager *obj =
1335 container_of(data, struct ras_manager, ih_data);
1336
1337 amdgpu_ras_interrupt_handler(obj);
1338}
1339
1340int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1341 struct ras_dispatch_if *info)
1342{
1343 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1344 struct ras_ih_data *data = &obj->ih_data;
1345
1346 if (!obj)
1347 return -EINVAL;
1348
1349 if (data->inuse == 0)
1350 return 0;
1351
1352
1353 memcpy(&data->ring[data->wptr], info->entry,
1354 data->element_size);
1355
1356 wmb();
1357 data->wptr = (data->aligned_element_size +
1358 data->wptr) % data->ring_size;
1359
1360 schedule_work(&data->ih_work);
1361
1362 return 0;
1363}
1364
1365int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1366 struct ras_ih_if *info)
1367{
1368 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1369 struct ras_ih_data *data;
1370
1371 if (!obj)
1372 return -EINVAL;
1373
1374 data = &obj->ih_data;
1375 if (data->inuse == 0)
1376 return 0;
1377
1378 cancel_work_sync(&data->ih_work);
1379
1380 kfree(data->ring);
1381 memset(data, 0, sizeof(*data));
1382 put_obj(obj);
1383
1384 return 0;
1385}
1386
1387int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1388 struct ras_ih_if *info)
1389{
1390 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1391 struct ras_ih_data *data;
1392
1393 if (!obj) {
1394
1395 obj = amdgpu_ras_create_obj(adev, &info->head);
1396 if (!obj)
1397 return -EINVAL;
1398 } else
1399 get_obj(obj);
1400
1401 data = &obj->ih_data;
1402
1403 *data = (struct ras_ih_data) {
1404 .inuse = 0,
1405 .cb = info->cb,
1406 .element_size = sizeof(struct amdgpu_iv_entry),
1407 .rptr = 0,
1408 .wptr = 0,
1409 };
1410
1411 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1412
1413 data->aligned_element_size = ALIGN(data->element_size, 8);
1414
1415 data->ring_size = 64 * data->aligned_element_size;
1416 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1417 if (!data->ring) {
1418 put_obj(obj);
1419 return -ENOMEM;
1420 }
1421
1422
1423 data->inuse = 1;
1424
1425 return 0;
1426}
1427
1428static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1429{
1430 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1431 struct ras_manager *obj, *tmp;
1432
1433 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1434 struct ras_ih_if info = {
1435 .head = obj->head,
1436 };
1437 amdgpu_ras_interrupt_remove_handler(adev, &info);
1438 }
1439
1440 return 0;
1441}
1442
1443
1444
1445static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1446{
1447 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1448 struct ras_manager *obj;
1449
1450 if (!con)
1451 return;
1452
1453 list_for_each_entry(obj, &con->head, node) {
1454 struct ras_query_if info = {
1455 .head = obj->head,
1456 };
1457
1458
1459
1460
1461
1462
1463
1464 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1465 continue;
1466
1467 amdgpu_ras_error_query(adev, &info);
1468 }
1469}
1470
1471
1472static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1473 struct ras_query_if *info)
1474{
1475
1476
1477
1478
1479 switch (info->head.block) {
1480 case AMDGPU_RAS_BLOCK__GFX:
1481 if (adev->gfx.funcs->query_ras_error_status)
1482 adev->gfx.funcs->query_ras_error_status(adev);
1483 break;
1484 case AMDGPU_RAS_BLOCK__MMHUB:
1485 if (adev->mmhub.funcs->query_ras_error_status)
1486 adev->mmhub.funcs->query_ras_error_status(adev);
1487 break;
1488 default:
1489 break;
1490 }
1491}
1492
1493static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1494{
1495 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1496 struct ras_manager *obj;
1497
1498 if (!con)
1499 return;
1500
1501 list_for_each_entry(obj, &con->head, node) {
1502 struct ras_query_if info = {
1503 .head = obj->head,
1504 };
1505
1506 amdgpu_ras_error_status_query(adev, &info);
1507 }
1508}
1509
1510
1511
1512
1513
1514
1515static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1516 struct ras_badpage **bps, unsigned int *count)
1517{
1518 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1519 struct ras_err_handler_data *data;
1520 int i = 0;
1521 int ret = 0, status;
1522
1523 if (!con || !con->eh_data || !bps || !count)
1524 return -EINVAL;
1525
1526 mutex_lock(&con->recovery_lock);
1527 data = con->eh_data;
1528 if (!data || data->count == 0) {
1529 *bps = NULL;
1530 ret = -EINVAL;
1531 goto out;
1532 }
1533
1534 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1535 if (!*bps) {
1536 ret = -ENOMEM;
1537 goto out;
1538 }
1539
1540 for (; i < data->count; i++) {
1541 (*bps)[i] = (struct ras_badpage){
1542 .bp = data->bps[i].retired_page,
1543 .size = AMDGPU_GPU_PAGE_SIZE,
1544 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1545 };
1546 status = amdgpu_vram_mgr_query_page_status(
1547 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
1548 data->bps[i].retired_page);
1549 if (status == -EBUSY)
1550 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1551 else if (status == -ENOENT)
1552 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1553 }
1554
1555 *count = data->count;
1556out:
1557 mutex_unlock(&con->recovery_lock);
1558 return ret;
1559}
1560
1561static void amdgpu_ras_do_recovery(struct work_struct *work)
1562{
1563 struct amdgpu_ras *ras =
1564 container_of(work, struct amdgpu_ras, recovery_work);
1565 struct amdgpu_device *remote_adev = NULL;
1566 struct amdgpu_device *adev = ras->adev;
1567 struct list_head device_list, *device_list_handle = NULL;
1568
1569 if (!ras->disable_ras_err_cnt_harvest) {
1570 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
1571
1572
1573 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
1574 device_list_handle = &hive->device_list;
1575 } else {
1576 INIT_LIST_HEAD(&device_list);
1577 list_add_tail(&adev->gmc.xgmi.head, &device_list);
1578 device_list_handle = &device_list;
1579 }
1580
1581 list_for_each_entry(remote_adev,
1582 device_list_handle, gmc.xgmi.head) {
1583 amdgpu_ras_query_err_status(remote_adev);
1584 amdgpu_ras_log_on_err_counter(remote_adev);
1585 }
1586
1587 amdgpu_put_xgmi_hive(hive);
1588 }
1589
1590 if (amdgpu_device_should_recover_gpu(ras->adev))
1591 amdgpu_device_gpu_recover(ras->adev, NULL);
1592 atomic_set(&ras->in_recovery, 0);
1593}
1594
1595
1596static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1597 struct ras_err_handler_data *data, int pages)
1598{
1599 unsigned int old_space = data->count + data->space_left;
1600 unsigned int new_space = old_space + pages;
1601 unsigned int align_space = ALIGN(new_space, 512);
1602 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1603
1604 if (!bps) {
1605 kfree(bps);
1606 return -ENOMEM;
1607 }
1608
1609 if (data->bps) {
1610 memcpy(bps, data->bps,
1611 data->count * sizeof(*data->bps));
1612 kfree(data->bps);
1613 }
1614
1615 data->bps = bps;
1616 data->space_left += align_space - old_space;
1617 return 0;
1618}
1619
1620
1621int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1622 struct eeprom_table_record *bps, int pages)
1623{
1624 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1625 struct ras_err_handler_data *data;
1626 int ret = 0;
1627 uint32_t i;
1628
1629 if (!con || !con->eh_data || !bps || pages <= 0)
1630 return 0;
1631
1632 mutex_lock(&con->recovery_lock);
1633 data = con->eh_data;
1634 if (!data)
1635 goto out;
1636
1637 for (i = 0; i < pages; i++) {
1638 if (amdgpu_ras_check_bad_page_unlock(con,
1639 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
1640 continue;
1641
1642 if (!data->space_left &&
1643 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
1644 ret = -ENOMEM;
1645 goto out;
1646 }
1647
1648 amdgpu_vram_mgr_reserve_range(
1649 ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
1650 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
1651 AMDGPU_GPU_PAGE_SIZE);
1652
1653 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
1654 data->count++;
1655 data->space_left--;
1656 }
1657out:
1658 mutex_unlock(&con->recovery_lock);
1659
1660 return ret;
1661}
1662
1663
1664
1665
1666
1667int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1668{
1669 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1670 struct ras_err_handler_data *data;
1671 struct amdgpu_ras_eeprom_control *control;
1672 int save_count;
1673
1674 if (!con || !con->eh_data)
1675 return 0;
1676
1677 control = &con->eeprom_control;
1678 data = con->eh_data;
1679 save_count = data->count - control->num_recs;
1680
1681 if (save_count > 0) {
1682 if (amdgpu_ras_eeprom_process_recods(control,
1683 &data->bps[control->num_recs],
1684 true,
1685 save_count)) {
1686 dev_err(adev->dev, "Failed to save EEPROM table data!");
1687 return -EIO;
1688 }
1689
1690 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
1691 }
1692
1693 return 0;
1694}
1695
1696
1697
1698
1699
1700static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1701{
1702 struct amdgpu_ras_eeprom_control *control =
1703 &adev->psp.ras.ras->eeprom_control;
1704 struct eeprom_table_record *bps = NULL;
1705 int ret = 0;
1706
1707
1708 if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
1709 return ret;
1710
1711 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
1712 if (!bps)
1713 return -ENOMEM;
1714
1715 if (amdgpu_ras_eeprom_process_recods(control, bps, false,
1716 control->num_recs)) {
1717 dev_err(adev->dev, "Failed to load EEPROM table records!");
1718 ret = -EIO;
1719 goto out;
1720 }
1721
1722 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
1723
1724out:
1725 kfree(bps);
1726 return ret;
1727}
1728
1729static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
1730 uint64_t addr)
1731{
1732 struct ras_err_handler_data *data = con->eh_data;
1733 int i;
1734
1735 addr >>= AMDGPU_GPU_PAGE_SHIFT;
1736 for (i = 0; i < data->count; i++)
1737 if (addr == data->bps[i].retired_page)
1738 return true;
1739
1740 return false;
1741}
1742
1743
1744
1745
1746
1747
1748static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1749 uint64_t addr)
1750{
1751 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1752 bool ret = false;
1753
1754 if (!con || !con->eh_data)
1755 return ret;
1756
1757 mutex_lock(&con->recovery_lock);
1758 ret = amdgpu_ras_check_bad_page_unlock(con, addr);
1759 mutex_unlock(&con->recovery_lock);
1760 return ret;
1761}
1762
1763static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
1764 uint32_t max_length)
1765{
1766 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1767 int tmp_threshold = amdgpu_bad_page_threshold;
1768 u64 val;
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789 if (tmp_threshold < -1)
1790 tmp_threshold = -1;
1791 else if (tmp_threshold > max_length)
1792 tmp_threshold = max_length;
1793
1794 if (tmp_threshold == -1) {
1795 val = adev->gmc.mc_vram_size;
1796 do_div(val, RAS_BAD_PAGE_RATE);
1797 con->bad_page_cnt_threshold = min(lower_32_bits(val),
1798 max_length);
1799 } else {
1800 con->bad_page_cnt_threshold = tmp_threshold;
1801 }
1802}
1803
1804int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1805{
1806 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1807 struct ras_err_handler_data **data;
1808 uint32_t max_eeprom_records_len = 0;
1809 bool exc_err_limit = false;
1810 int ret;
1811
1812 if (con)
1813 data = &con->eh_data;
1814 else
1815 return 0;
1816
1817 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
1818 if (!*data) {
1819 ret = -ENOMEM;
1820 goto out;
1821 }
1822
1823 mutex_init(&con->recovery_lock);
1824 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1825 atomic_set(&con->in_recovery, 0);
1826 con->adev = adev;
1827
1828 max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
1829 amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
1830
1831 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
1832
1833
1834
1835
1836 if (exc_err_limit || ret)
1837 goto free;
1838
1839 if (con->eeprom_control.num_recs) {
1840 ret = amdgpu_ras_load_bad_pages(adev);
1841 if (ret)
1842 goto free;
1843 }
1844
1845 return 0;
1846
1847free:
1848 kfree((*data)->bps);
1849 kfree(*data);
1850 con->eh_data = NULL;
1851out:
1852 dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
1853
1854
1855
1856
1857
1858 if (!exc_err_limit)
1859 ret = 0;
1860 else
1861 ret = -EINVAL;
1862
1863 return ret;
1864}
1865
1866static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1867{
1868 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1869 struct ras_err_handler_data *data = con->eh_data;
1870
1871
1872 if (!data)
1873 return 0;
1874
1875 cancel_work_sync(&con->recovery_work);
1876
1877 mutex_lock(&con->recovery_lock);
1878 con->eh_data = NULL;
1879 kfree(data->bps);
1880 kfree(data);
1881 mutex_unlock(&con->recovery_lock);
1882
1883 return 0;
1884}
1885
1886
1887
1888int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1889 unsigned int block)
1890{
1891 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1892
1893 if (!ras)
1894 return -EINVAL;
1895
1896 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1897 return 0;
1898}
1899
1900static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
1901{
1902 if (adev->asic_type != CHIP_VEGA10 &&
1903 adev->asic_type != CHIP_VEGA20 &&
1904 adev->asic_type != CHIP_ARCTURUS &&
1905 adev->asic_type != CHIP_SIENNA_CICHLID)
1906 return 1;
1907 else
1908 return 0;
1909}
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1921 uint32_t *hw_supported, uint32_t *supported)
1922{
1923 *hw_supported = 0;
1924 *supported = 0;
1925
1926 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
1927 amdgpu_ras_check_asic_type(adev))
1928 return;
1929
1930 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
1931 dev_info(adev->dev, "HBM ECC is active.\n");
1932 *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
1933 1 << AMDGPU_RAS_BLOCK__DF);
1934 } else
1935 dev_info(adev->dev, "HBM ECC is not presented.\n");
1936
1937 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
1938 dev_info(adev->dev, "SRAM ECC is active.\n");
1939 *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1940 1 << AMDGPU_RAS_BLOCK__DF);
1941 } else
1942 dev_info(adev->dev, "SRAM ECC is not presented.\n");
1943
1944
1945 *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
1946
1947 *supported = amdgpu_ras_enable == 0 ?
1948 0 : *hw_supported & amdgpu_ras_mask;
1949 adev->ras_features = *supported;
1950}
1951
1952int amdgpu_ras_init(struct amdgpu_device *adev)
1953{
1954 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1955 int r;
1956
1957 if (con)
1958 return 0;
1959
1960 con = kmalloc(sizeof(struct amdgpu_ras) +
1961 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1962 GFP_KERNEL|__GFP_ZERO);
1963 if (!con)
1964 return -ENOMEM;
1965
1966 con->objs = (struct ras_manager *)(con + 1);
1967
1968 amdgpu_ras_set_context(adev, con);
1969
1970 amdgpu_ras_check_supported(adev, &con->hw_supported,
1971 &con->supported);
1972 if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
1973 r = 0;
1974 goto release_con;
1975 }
1976
1977 con->features = 0;
1978 INIT_LIST_HEAD(&con->head);
1979
1980 con->flags = RAS_DEFAULT_FLAGS;
1981
1982 if (adev->nbio.funcs->init_ras_controller_interrupt) {
1983 r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
1984 if (r)
1985 goto release_con;
1986 }
1987
1988 if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
1989 r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
1990 if (r)
1991 goto release_con;
1992 }
1993
1994 if (amdgpu_ras_fs_init(adev)) {
1995 r = -EINVAL;
1996 goto release_con;
1997 }
1998
1999 dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2000 "hardware ability[%x] ras_mask[%x]\n",
2001 con->hw_supported, con->supported);
2002 return 0;
2003release_con:
2004 amdgpu_ras_set_context(adev, NULL);
2005 kfree(con);
2006
2007 return r;
2008}
2009
2010
2011int amdgpu_ras_late_init(struct amdgpu_device *adev,
2012 struct ras_common_if *ras_block,
2013 struct ras_fs_if *fs_info,
2014 struct ras_ih_if *ih_info)
2015{
2016 int r;
2017
2018
2019 if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2020 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2021 return 0;
2022 }
2023
2024 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2025 if (r) {
2026 if (r == -EAGAIN) {
2027
2028 amdgpu_ras_request_reset_on_boot(adev,
2029 ras_block->block);
2030 return 0;
2031 } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
2032
2033
2034 goto cleanup;
2035 } else
2036 return r;
2037 }
2038
2039
2040 if (adev->in_suspend || amdgpu_in_reset(adev))
2041 return 0;
2042
2043 if (ih_info->cb) {
2044 r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
2045 if (r)
2046 goto interrupt;
2047 }
2048
2049 r = amdgpu_ras_sysfs_create(adev, fs_info);
2050 if (r)
2051 goto sysfs;
2052
2053 return 0;
2054cleanup:
2055 amdgpu_ras_sysfs_remove(adev, ras_block);
2056sysfs:
2057 if (ih_info->cb)
2058 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
2059interrupt:
2060 amdgpu_ras_feature_enable(adev, ras_block, 0);
2061 return r;
2062}
2063
2064
2065void amdgpu_ras_late_fini(struct amdgpu_device *adev,
2066 struct ras_common_if *ras_block,
2067 struct ras_ih_if *ih_info)
2068{
2069 if (!ras_block || !ih_info)
2070 return;
2071
2072 amdgpu_ras_sysfs_remove(adev, ras_block);
2073 if (ih_info->cb)
2074 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
2075 amdgpu_ras_feature_enable(adev, ras_block, 0);
2076}
2077
2078
2079
2080
2081void amdgpu_ras_resume(struct amdgpu_device *adev)
2082{
2083 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2084 struct ras_manager *obj, *tmp;
2085
2086 if (!con)
2087 return;
2088
2089 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2090
2091
2092
2093
2094
2095 amdgpu_ras_enable_all_features(adev, 1);
2096
2097
2098
2099
2100
2101 list_for_each_entry_safe(obj, tmp, &con->head, node) {
2102 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2103 amdgpu_ras_feature_enable(adev, &obj->head, 0);
2104
2105 WARN_ON(alive_obj(obj));
2106 }
2107 }
2108 }
2109
2110 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
2111 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
2112
2113
2114
2115
2116
2117
2118
2119 amdgpu_ras_disable_all_features(adev, 1);
2120 amdgpu_ras_reset_gpu(adev);
2121 }
2122}
2123
2124void amdgpu_ras_suspend(struct amdgpu_device *adev)
2125{
2126 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2127
2128 if (!con)
2129 return;
2130
2131 amdgpu_ras_disable_all_features(adev, 0);
2132
2133 if (con->features)
2134 amdgpu_ras_disable_all_features(adev, 1);
2135}
2136
2137
/*
 * First teardown stage: disable all RAS features and release the recovery
 * data. No-op without a RAS context. Always returns 0.
 */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (ras) {
		amdgpu_ras_disable_all_features(adev, 0);
		amdgpu_ras_recovery_fini(adev);
	}

	return 0;
}
2150
2151int amdgpu_ras_fini(struct amdgpu_device *adev)
2152{
2153 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2154
2155 if (!con)
2156 return 0;
2157
2158 amdgpu_ras_fs_fini(adev);
2159 amdgpu_ras_interrupt_remove_all(adev);
2160
2161 WARN(con->features, "Feature mask is not cleared");
2162
2163 if (con->features)
2164 amdgpu_ras_disable_all_features(adev, 1);
2165
2166 amdgpu_ras_set_context(adev, NULL);
2167 kfree(con);
2168
2169 return 0;
2170}
2171
2172void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
2173{
2174 uint32_t hw_supported, supported;
2175
2176 amdgpu_ras_check_supported(adev, &hw_supported, &supported);
2177 if (!hw_supported)
2178 return;
2179
2180 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
2181 dev_info(adev->dev, "uncorrectable hardware error"
2182 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
2183
2184 amdgpu_ras_reset_gpu(adev);
2185 }
2186}
2187
2188bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
2189{
2190 if (adev->asic_type == CHIP_VEGA20 &&
2191 adev->pm.fw_version <= 0x283400) {
2192 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
2193 amdgpu_ras_intr_triggered();
2194 }
2195
2196 return false;
2197}
2198
2199bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
2200{
2201 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2202 bool exc_err_limit = false;
2203
2204 if (con && (amdgpu_bad_page_threshold != 0))
2205 amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
2206 &exc_err_limit);
2207
2208
2209
2210
2211
2212 return exc_err_limit;
2213}
2214