#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"
#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>

static bool notifier_registered;
#endif
static const char *RAS_FS_NAME = "ras";

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
	"mca",
};

const char *ras_mca_block_string[] = {
	"mca_mp0",
	"mca_mp1",
	"mca_mpio",
	"mca_iohc",
};

struct amdgpu_ras_block_list {
	/* link used to chain this entry on adev->ras_list */
	struct list_head node;

	struct amdgpu_ras_block_object *ras_obj;
};

const char *get_ras_block_str(struct ras_common_if *ras_block)
{
	if (!ras_block)
		return "NULL";

	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
		return "OUT OF RANGE";

	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
		return ras_mca_block_string[ras_block->sub_block_index];

	return ras_block_string[ras_block->block];
}

#define ras_block_str(_BLOCK_) \
	(((_BLOCK_) < ARRAY_SIZE(ras_block_string)) ? ras_block_string[_BLOCK_] : "Out Of Range")

#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* injection addresses are validated against this 52-bit limit */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* used to derive the default bad-page threshold: one page per 100MB of VRAM */
#define RAS_BAD_PAGE_COVER		(100 * 1024 * 1024ULL)

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
119
120static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
121 uint64_t addr);
122static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
123 uint64_t addr);
124#ifdef CONFIG_X86_MCE_AMD
125static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
126struct mce_notifier_adev_list {
127 struct amdgpu_device *devs[MAX_GPU_INSTANCE];
128 int num_gpu;
129};
130static struct mce_notifier_adev_list mce_adev_list;
131#endif
132
133void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
134{
135 if (adev && amdgpu_ras_get_context(adev))
136 amdgpu_ras_get_context(adev)->error_query_ready = ready;
137}
138
139static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
140{
141 if (adev && amdgpu_ras_get_context(adev))
142 return amdgpu_ras_get_context(adev)->error_query_ready;
143
144 return false;
145}
146
147static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
148{
149 struct ras_err_data err_data = {0, 0, 0, NULL};
150 struct eeprom_table_record err_rec;
151
152 if ((address >= adev->gmc.mc_vram_size) ||
153 (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
154 dev_warn(adev->dev,
155 "RAS WARN: input address 0x%llx is invalid.\n",
156 address);
157 return -EINVAL;
158 }
159
160 if (amdgpu_ras_check_bad_page(adev, address)) {
161 dev_warn(adev->dev,
162 "RAS WARN: 0x%llx has already been marked as bad page!\n",
163 address);
164 return 0;
165 }
166
167 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
168 err_data.err_addr = &err_rec;
169 amdgpu_umc_fill_error_record(&err_data, address,
170 (address >> AMDGPU_GPU_PAGE_SHIFT), 0, 0);
171
172 if (amdgpu_bad_page_threshold != 0) {
173 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
174 err_data.err_addr_cnt);
175 amdgpu_ras_save_bad_pages(adev);
176 }
177
178 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
179 dev_warn(adev->dev, "Clear EEPROM:\n");
180 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
181
182 return 0;
183}
184
185static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
186 size_t size, loff_t *pos)
187{
188 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
189 struct ras_query_if info = {
190 .head = obj->head,
191 };
192 ssize_t s;
193 char val[128];
194
195 if (amdgpu_ras_query_error_status(obj->adev, &info))
196 return -EINVAL;
197
198 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
199 "ue", info.ue_count,
200 "ce", info.ce_count);
201 if (*pos >= s)
202 return 0;
203
204 s -= *pos;
205 s = min_t(u64, s, size);
206
207
208 if (copy_to_user(buf, &val[*pos], s))
209 return -EINVAL;
210
211 *pos += s;
212
213 return s;
214}
215
216static const struct file_operations amdgpu_ras_debugfs_ops = {
217 .owner = THIS_MODULE,
218 .read = amdgpu_ras_debugfs_read,
219 .write = NULL,
220 .llseek = default_llseek
221};
222
223static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
224{
225 int i;
226
227 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
228 *block_id = i;
229 if (strcmp(name, ras_block_string[i]) == 0)
230 return 0;
231 }
232 return -EINVAL;
233}
234
235static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
236 const char __user *buf, size_t size,
237 loff_t *pos, struct ras_debug_if *data)
238{
239 ssize_t s = min_t(u64, 64, size);
240 char str[65];
241 char block_name[33];
242 char err[9] = "ue";
243 int op = -1;
244 int block_id;
245 uint32_t sub_block;
246 u64 address, value;
247
248 if (*pos)
249 return -EINVAL;
250 *pos = size;
251
252 memset(str, 0, sizeof(str));
253 memset(data, 0, sizeof(*data));
254
255 if (copy_from_user(str, buf, s))
256 return -EINVAL;
257
258 if (sscanf(str, "disable %32s", block_name) == 1)
259 op = 0;
260 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
261 op = 1;
262 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
263 op = 2;
264 else if (strstr(str, "retire_page") != NULL)
265 op = 3;
266 else if (str[0] && str[1] && str[2] && str[3])
267
268 return -EINVAL;
269
270 if (op != -1) {
271 if (op == 3) {
272 if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
273 sscanf(str, "%*s %llu", &address) != 1)
274 return -EINVAL;
275
276 data->op = op;
277 data->inject.address = address;
278
279 return 0;
280 }
281
282 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
283 return -EINVAL;
284
285 data->head.block = block_id;
286
287 if (!memcmp("ue", err, 2))
288 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
289 else if (!memcmp("ce", err, 2))
290 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
291 else
292 return -EINVAL;
293
294 data->op = op;
295
296 if (op == 2) {
297 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
298 &sub_block, &address, &value) != 3 &&
299 sscanf(str, "%*s %*s %*s %u %llu %llu",
300 &sub_block, &address, &value) != 3)
301 return -EINVAL;
302 data->head.sub_block_index = sub_block;
303 data->inject.address = address;
304 data->inject.value = value;
305 }
306 } else {
307 if (size < sizeof(*data))
308 return -EINVAL;
309
310 if (copy_from_user(data, buf, sizeof(*data)))
311 return -EINVAL;
312 }
313
314 return 0;
315}
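
/*
 * DOC: AMDGPU RAS debugfs control interface (summary derived from the
 * parser above and the write handler below)
 *
 * The "ras_ctrl" debugfs node (created under the per-device "ras" directory
 * by amdgpu_ras_debugfs_create_ctrl_node()) accepts the following commands,
 * as parsed by amdgpu_ras_debugfs_ctrl_parse_data():
 *
 *	disable <block>
 *	enable  <block> <error>
 *	inject  <block> <error> <sub_block> <address> <value>
 *	retire_page <address>
 *
 * <block> is one of the names in ras_block_string[] (umc, sdma, gfx, ...),
 * <error> is "ue" (multi_uncorrectable) or "ce" (single_correctable), and the
 * numeric fields are either all decimal or all 0x-prefixed hex.  "retire_page"
 * marks <address> as a bad page via amdgpu_reserve_page_direct().
 *
 * Example session (device 0, matching the hint printed for ras_eeprom_reset):
 *
 *	echo "enable umc ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0x0 0x0 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * A write that matches none of the text patterns must be a raw
 * struct ras_debug_if of at least sizeof(struct ras_debug_if) bytes;
 * anything else returns -EINVAL.
 */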
396static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
397 const char __user *buf,
398 size_t size, loff_t *pos)
399{
400 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
401 struct ras_debug_if data;
402 int ret = 0;
403
404 if (!amdgpu_ras_get_error_query_ready(adev)) {
405 dev_warn(adev->dev, "RAS WARN: error injection "
406 "currently inaccessible\n");
407 return size;
408 }
409
410 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
411 if (ret)
412 return ret;
413
414 if (data.op == 3) {
415 ret = amdgpu_reserve_page_direct(adev, data.inject.address);
416 if (!ret)
417 return size;
418 else
419 return ret;
420 }
421
422 if (!amdgpu_ras_is_supported(adev, data.head.block))
423 return -EINVAL;
424
425 switch (data.op) {
426 case 0:
427 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
428 break;
429 case 1:
430 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
431 break;
432 case 2:
433 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
434 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
435 dev_warn(adev->dev, "RAS WARN: input address "
436 "0x%llx is invalid.",
437 data.inject.address);
438 ret = -EINVAL;
439 break;
440 }
441
442
443 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
444 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
445 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
446 "already been marked as bad!\n",
447 data.inject.address);
448 break;
449 }
450
451
452 ret = amdgpu_ras_error_inject(adev, &data.inject);
453 break;
454 default:
455 ret = -EINVAL;
456 break;
457 }
458
459 if (ret)
460 return ret;
461
462 return size;
463}
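
/*
 * DOC: AMDGPU RAS debugfs EEPROM table reset interface
 *
 * Writing anything to the "ras_eeprom_reset" debugfs node calls
 * amdgpu_ras_eeprom_reset_table(), wiping the bad-page table stored in the
 * RAS EEPROM and, on success, restoring con->flags to RAS_DEFAULT_FLAGS.
 * This is intended for test setups only; see the warning printed by
 * amdgpu_reserve_page_direct():
 *
 *	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset
 */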
481static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
482 const char __user *buf,
483 size_t size, loff_t *pos)
484{
485 struct amdgpu_device *adev =
486 (struct amdgpu_device *)file_inode(f)->i_private;
487 int ret;
488
489 ret = amdgpu_ras_eeprom_reset_table(
490 &(amdgpu_ras_get_context(adev)->eeprom_control));
491
492 if (!ret) {
493
494
495 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
496 return size;
497 } else {
498 return ret;
499 }
500}
501
502static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
503 .owner = THIS_MODULE,
504 .read = NULL,
505 .write = amdgpu_ras_debugfs_ctrl_write,
506 .llseek = default_llseek
507};
508
509static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
510 .owner = THIS_MODULE,
511 .read = NULL,
512 .write = amdgpu_ras_debugfs_eeprom_write,
513 .llseek = default_llseek
514};
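
/*
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * Each RAS-enabled block gets a "<block>_err_count" attribute in the
 * device's "ras" sysfs group (see amdgpu_ras_sysfs_create()).  Reading it
 * returns the accumulated counts as:
 *
 *	ue: <count>
 *	ce: <count>
 *
 * or "Query currently inaccessible" while error queries are not yet ready,
 * exactly as emitted by amdgpu_ras_sysfs_read() below.
 */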
537static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
538 struct device_attribute *attr, char *buf)
539{
540 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
541 struct ras_query_if info = {
542 .head = obj->head,
543 };
544
545 if (!amdgpu_ras_get_error_query_ready(obj->adev))
546 return sysfs_emit(buf, "Query currently inaccessible\n");
547
548 if (amdgpu_ras_query_error_status(obj->adev, &info))
549 return -EINVAL;
550
551 if (obj->adev->asic_type == CHIP_ALDEBARAN) {
552 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
553 DRM_WARN("Failed to reset error counter and error status");
554 }
555
556 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
557 "ce", info.ce_count);
558}
559
560
561
562#define get_obj(obj) do { (obj)->use++; } while (0)
563#define alive_obj(obj) ((obj)->use)
564
565static inline void put_obj(struct ras_manager *obj)
566{
567 if (obj && (--obj->use == 0))
568 list_del(&obj->node);
569 if (obj && (obj->use < 0))
570 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
571}
572
573
574static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
575 struct ras_common_if *head)
576{
577 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
578 struct ras_manager *obj;
579
580 if (!adev->ras_enabled || !con)
581 return NULL;
582
583 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
584 return NULL;
585
586 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
587 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
588 return NULL;
589
590 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
591 } else
592 obj = &con->objs[head->block];
593
594
595 if (alive_obj(obj))
596 return NULL;
597
598 obj->head = *head;
599 obj->adev = adev;
600 list_add(&obj->node, &con->head);
601 get_obj(obj);
602
603 return obj;
604}
605
606
607struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
608 struct ras_common_if *head)
609{
610 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
611 struct ras_manager *obj;
612 int i;
613
614 if (!adev->ras_enabled || !con)
615 return NULL;
616
617 if (head) {
618 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
619 return NULL;
620
621 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
622 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
623 return NULL;
624
625 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
626 } else
627 obj = &con->objs[head->block];
628
629 if (alive_obj(obj))
630 return obj;
631 } else {
632 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
633 obj = &con->objs[i];
634 if (alive_obj(obj))
635 return obj;
636 }
637 }
638
639 return NULL;
640}
641
642
643
644static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
645 struct ras_common_if *head)
646{
647 return adev->ras_hw_enabled & BIT(head->block);
648}
649
650static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
651 struct ras_common_if *head)
652{
653 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
654
655 return con->features & BIT(head->block);
656}
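
/*
 * Bookkeeping-only helper: creates or releases the ras_manager object and
 * updates the corresponding bit in con->features, without sending any
 * enable/disable command to the RAS TA.  Callers such as
 * amdgpu_ras_feature_enable_on_boot() use it directly when the feature is
 * assumed to be enabled already (AMDGPU_RAS_FLAG_INIT_BY_VBIOS).
 */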
662static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
663 struct ras_common_if *head, int enable)
664{
665 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
666 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
667
668
669
670
671
672
673
674 if (!amdgpu_ras_is_feature_allowed(adev, head))
675 return 0;
676
677 if (enable) {
678 if (!obj) {
679 obj = amdgpu_ras_create_obj(adev, head);
680 if (!obj)
681 return -EINVAL;
682 } else {
683
684 get_obj(obj);
685 }
686 con->features |= BIT(head->block);
687 } else {
688 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
689 con->features &= ~BIT(head->block);
690 put_obj(obj);
691 }
692 }
693
694 return 0;
695}
696
697
698int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
699 struct ras_common_if *head, bool enable)
700{
701 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
702 union ta_ras_cmd_input *info;
703 int ret;
704
705 if (!con)
706 return -EINVAL;
707
708 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
709 if (!info)
710 return -ENOMEM;
711
712 if (!enable) {
713 info->disable_features = (struct ta_ras_disable_features_input) {
714 .block_id = amdgpu_ras_block_to_ta(head->block),
715 .error_type = amdgpu_ras_error_to_ta(head->type),
716 };
717 } else {
718 info->enable_features = (struct ta_ras_enable_features_input) {
719 .block_id = amdgpu_ras_block_to_ta(head->block),
720 .error_type = amdgpu_ras_error_to_ta(head->type),
721 };
722 }
723
724
725 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
726
727 if (!amdgpu_ras_intr_triggered()) {
728 ret = psp_ras_enable_features(&adev->psp, info, enable);
729 if (ret) {
730 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
731 enable ? "enable":"disable",
732 get_ras_block_str(head),
733 amdgpu_ras_is_poison_mode_supported(adev), ret);
734 goto out;
735 }
736 }
737
738
739 __amdgpu_ras_feature_enable(adev, head, enable);
740 ret = 0;
741out:
742 kfree(info);
743 return ret;
744}
745
746
747int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
748 struct ras_common_if *head, bool enable)
749{
750 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
751 int ret;
752
753 if (!con)
754 return -EINVAL;
755
756 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
757 if (enable) {
758
759
760
761
762
763
764 ret = amdgpu_ras_feature_enable(adev, head, 1);
765
766
767
768
769 if (ret == -EINVAL) {
770 ret = __amdgpu_ras_feature_enable(adev, head, 1);
771 if (!ret)
772 dev_info(adev->dev,
773 "RAS INFO: %s setup object\n",
774 get_ras_block_str(head));
775 }
776 } else {
777
778 ret = __amdgpu_ras_feature_enable(adev, head, 1);
779 if (ret)
780 return ret;
781
782
783 if (head->block == AMDGPU_RAS_BLOCK__GFX)
784 con->features |= BIT(head->block);
785
786 ret = amdgpu_ras_feature_enable(adev, head, 0);
787
788
789 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
790 con->features &= ~BIT(head->block);
791 }
792 } else
793 ret = amdgpu_ras_feature_enable(adev, head, enable);
794
795 return ret;
796}
797
798static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
799 bool bypass)
800{
801 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
802 struct ras_manager *obj, *tmp;
803
804 list_for_each_entry_safe(obj, tmp, &con->head, node) {
805
806
807
808 if (bypass) {
809 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
810 break;
811 } else {
812 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
813 break;
814 }
815 }
816
817 return con->features;
818}
819
820static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
821 bool bypass)
822{
823 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
824 int i;
825 const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
826
827 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
828 struct ras_common_if head = {
829 .block = i,
830 .type = default_ras_type,
831 .sub_block_index = 0,
832 };
833
834 if (i == AMDGPU_RAS_BLOCK__MCA)
835 continue;
836
837 if (bypass) {
838
839
840
841
842 if (__amdgpu_ras_feature_enable(adev, &head, 1))
843 break;
844 } else {
845 if (amdgpu_ras_feature_enable(adev, &head, 1))
846 break;
847 }
848 }
849
850 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
851 struct ras_common_if head = {
852 .block = AMDGPU_RAS_BLOCK__MCA,
853 .type = default_ras_type,
854 .sub_block_index = i,
855 };
856
857 if (bypass) {
858
859
860
861
862 if (__amdgpu_ras_feature_enable(adev, &head, 1))
863 break;
864 } else {
865 if (amdgpu_ras_feature_enable(adev, &head, 1))
866 break;
867 }
868 }
869
870 return con->features;
871}
872
873
874static int amdgpu_ras_block_match_default(struct amdgpu_ras_block_object *block_obj,
875 enum amdgpu_ras_block block)
876{
877 if (!block_obj)
878 return -EINVAL;
879
880 if (block_obj->ras_comm.block == block)
881 return 0;
882
883 return -EINVAL;
884}
885
886static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
887 enum amdgpu_ras_block block, uint32_t sub_block_index)
888{
889 struct amdgpu_ras_block_list *node, *tmp;
890 struct amdgpu_ras_block_object *obj;
891
892 if (block >= AMDGPU_RAS_BLOCK__LAST)
893 return NULL;
894
895 if (!amdgpu_ras_is_supported(adev, block))
896 return NULL;
897
898 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
899 if (!node->ras_obj) {
900 dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
901 continue;
902 }
903
904 obj = node->ras_obj;
905 if (obj->ras_block_match) {
906 if (obj->ras_block_match(obj, block, sub_block_index) == 0)
907 return obj;
908 } else {
909 if (amdgpu_ras_block_match_default(obj, block) == 0)
910 return obj;
911 }
912 }
913
914 return NULL;
915}
916
917static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
918{
919 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
920 int ret = 0;
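
	/*
	 * Prefer the ECC information cached by the SMU via
	 * amdgpu_dpm_get_ecc_info(); if that query is not supported
	 * (-EOPNOTSUPP), fall back to reading the UMC error count and error
	 * address registers through the umc ras_block hw_ops.
	 */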
926 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc));
927 if (ret == -EOPNOTSUPP) {
928 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
929 adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
930 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data);
931
932
933
934
935 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
936 adev->umc.ras->ras_block.hw_ops->query_ras_error_address)
937 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data);
938 } else if (!ret) {
939 if (adev->umc.ras &&
940 adev->umc.ras->ecc_info_query_ras_error_count)
941 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data);
942
943 if (adev->umc.ras &&
944 adev->umc.ras->ecc_info_query_ras_error_address)
945 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data);
946 }
947}
948
949
950int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
951 struct ras_query_if *info)
952{
953 struct amdgpu_ras_block_object *block_obj = NULL;
954 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
955 struct ras_err_data err_data = {0, 0, 0, NULL};
956
957 if (!obj)
958 return -EINVAL;
959
960 if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
961 amdgpu_ras_get_ecc_info(adev, &err_data);
962 } else {
963 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
964 if (!block_obj || !block_obj->hw_ops) {
965 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
966 get_ras_block_str(&info->head));
967 return -EINVAL;
968 }
969
970 if (block_obj->hw_ops->query_ras_error_count)
971 block_obj->hw_ops->query_ras_error_count(adev, &err_data);
972
973 if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
974 (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
975 (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
976 if (block_obj->hw_ops->query_ras_error_status)
977 block_obj->hw_ops->query_ras_error_status(adev);
978 }
979 }
980
981 obj->err_data.ue_count += err_data.ue_count;
982 obj->err_data.ce_count += err_data.ce_count;
983
984 info->ue_count = obj->err_data.ue_count;
985 info->ce_count = obj->err_data.ce_count;
986
987 if (err_data.ce_count) {
988 if (adev->smuio.funcs &&
989 adev->smuio.funcs->get_socket_id &&
990 adev->smuio.funcs->get_die_id) {
991 dev_info(adev->dev, "socket: %d, die: %d "
992 "%ld correctable hardware errors "
993 "detected in %s block, no user "
994 "action is needed.\n",
995 adev->smuio.funcs->get_socket_id(adev),
996 adev->smuio.funcs->get_die_id(adev),
997 obj->err_data.ce_count,
998 get_ras_block_str(&info->head));
999 } else {
1000 dev_info(adev->dev, "%ld correctable hardware errors "
1001 "detected in %s block, no user "
1002 "action is needed.\n",
1003 obj->err_data.ce_count,
1004 get_ras_block_str(&info->head));
1005 }
1006 }
1007 if (err_data.ue_count) {
1008 if (adev->smuio.funcs &&
1009 adev->smuio.funcs->get_socket_id &&
1010 adev->smuio.funcs->get_die_id) {
1011 dev_info(adev->dev, "socket: %d, die: %d "
1012 "%ld uncorrectable hardware errors "
1013 "detected in %s block\n",
1014 adev->smuio.funcs->get_socket_id(adev),
1015 adev->smuio.funcs->get_die_id(adev),
1016 obj->err_data.ue_count,
1017 get_ras_block_str(&info->head));
1018 } else {
1019 dev_info(adev->dev, "%ld uncorrectable hardware errors "
1020 "detected in %s block\n",
1021 obj->err_data.ue_count,
1022 get_ras_block_str(&info->head));
1023 }
1024 }
1025
1026 if (!amdgpu_persistent_edc_harvesting_supported(adev))
1027 amdgpu_ras_reset_error_status(adev, info->head.block);
1028
1029 return 0;
1030}
1031
1032int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1033 enum amdgpu_ras_block block)
1034{
1035 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0);
1036
1037 if (!amdgpu_ras_is_supported(adev, block))
1038 return -EINVAL;
1039
1040 if (!block_obj || !block_obj->hw_ops) {
1041 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1042 ras_block_str(block));
1043 return -EINVAL;
1044 }
1045
1046 if (block_obj->hw_ops->reset_ras_error_count)
1047 block_obj->hw_ops->reset_ras_error_count(adev);
1048
1049 if ((block == AMDGPU_RAS_BLOCK__GFX) ||
1050 (block == AMDGPU_RAS_BLOCK__MMHUB)) {
1051 if (block_obj->hw_ops->reset_ras_error_status)
1052 block_obj->hw_ops->reset_ras_error_status(adev);
1053 }
1054
1055 return 0;
1056}
1057
1058
1059int amdgpu_ras_error_inject(struct amdgpu_device *adev,
1060 struct ras_inject_if *info)
1061{
1062 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1063 struct ta_ras_trigger_error_input block_info = {
1064 .block_id = amdgpu_ras_block_to_ta(info->head.block),
1065 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
1066 .sub_block_index = info->head.sub_block_index,
1067 .address = info->address,
1068 .value = info->value,
1069 };
1070 int ret = -EINVAL;
1071 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev,
1072 info->head.block,
1073 info->head.sub_block_index);
1074
1075 if (!obj)
1076 return -EINVAL;
1077
1078 if (!block_obj || !block_obj->hw_ops) {
1079 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1080 get_ras_block_str(&info->head));
1081 return -EINVAL;
1082 }
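
	/*
	 * On a multi-node XGMI setup, convert the injection address with
	 * amdgpu_xgmi_get_relative_phy_addr() so the node-relative physical
	 * address is what gets passed on below.
	 */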
1085 if (adev->gmc.xgmi.num_physical_nodes > 1) {
1086 block_info.address =
1087 amdgpu_xgmi_get_relative_phy_addr(adev,
1088 block_info.address);
1089 }
1090
1091 if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
1092 if (block_obj->hw_ops->ras_error_inject)
1093 ret = block_obj->hw_ops->ras_error_inject(adev, info);
1094 } else {
1095
1096 if (block_obj->hw_ops->ras_error_inject)
1097 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
1098 else
1099 ret = psp_ras_trigger_error(&adev->psp, &block_info);
1100 }
1101
1102 if (ret)
1103 dev_err(adev->dev, "ras inject %s failed %d\n",
1104 get_ras_block_str(&info->head), ret);
1105
1106 return ret;
1107}
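
/**
 * amdgpu_ras_query_error_count - query the accumulated error counts
 * @adev: pointer to the AMD GPU device
 * @ce_count: out parameter for the total correctable error count, may be NULL
 * @ue_count: out parameter for the total uncorrectable error count, may be NULL
 *
 * Walks every RAS block that currently has a manager object and sums the
 * counters reported by amdgpu_ras_query_error_status().  Returns 0 on
 * success, -EOPNOTSUPP when RAS is not enabled on this device, or the error
 * returned by a failing per-block query.
 */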
1120int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1121 unsigned long *ce_count,
1122 unsigned long *ue_count)
1123{
1124 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1125 struct ras_manager *obj;
1126 unsigned long ce, ue;
1127
1128 if (!adev->ras_enabled || !con)
1129 return -EOPNOTSUPP;
1130
1131
1132
1133 if (!ce_count && !ue_count)
1134 return 0;
1135
1136 ce = 0;
1137 ue = 0;
1138 list_for_each_entry(obj, &con->head, node) {
1139 struct ras_query_if info = {
1140 .head = obj->head,
1141 };
1142 int res;
1143
1144 res = amdgpu_ras_query_error_status(adev, &info);
1145 if (res)
1146 return res;
1147
1148 ce += info.ce_count;
1149 ue += info.ue_count;
1150 }
1151
1152 if (ce_count)
1153 *ce_count = ce;
1154
1155 if (ue_count)
1156 *ue_count = ue;
1157
1158 return 0;
1159}
1160
1161
1162
1163
1164
1165static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1166 struct ras_badpage **bps, unsigned int *count);
1167
1168static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1169{
1170 switch (flags) {
1171 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1172 return "R";
1173 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1174 return "P";
1175 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1176 default:
1177 return "F";
1178 }
1179}
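
/*
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * The "gpu_vram_bad_pages" binary attribute lists the retired VRAM pages,
 * one line per page, in the format produced by
 * amdgpu_ras_sysfs_badpages_read() below:
 *
 *	<gpu page frame number> : <page size> : <flag>
 *
 * where <flag> is "R" (page reserved), "P" (reservation pending) or
 * "F" (reservation failed), matching amdgpu_ras_badpage_flags_str().
 */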
1211static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1212 struct kobject *kobj, struct bin_attribute *attr,
1213 char *buf, loff_t ppos, size_t count)
1214{
1215 struct amdgpu_ras *con =
1216 container_of(attr, struct amdgpu_ras, badpages_attr);
1217 struct amdgpu_device *adev = con->adev;
1218 const unsigned int element_size =
1219 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1220 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1221 unsigned int end = div64_ul(ppos + count - 1, element_size);
1222 ssize_t s = 0;
1223 struct ras_badpage *bps = NULL;
1224 unsigned int bps_count = 0;
1225
1226 memset(buf, 0, count);
1227
1228 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1229 return 0;
1230
1231 for (; start < end && start < bps_count; start++)
1232 s += scnprintf(&buf[s], element_size + 1,
1233 "0x%08x : 0x%08x : %1s\n",
1234 bps[start].bp,
1235 bps[start].size,
1236 amdgpu_ras_badpage_flags_str(bps[start].flags));
1237
1238 kfree(bps);
1239
1240 return s;
1241}
1242
1243static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1244 struct device_attribute *attr, char *buf)
1245{
1246 struct amdgpu_ras *con =
1247 container_of(attr, struct amdgpu_ras, features_attr);
1248
1249 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
1250}
1251
1252static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1253{
1254 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1255
1256 sysfs_remove_file_from_group(&adev->dev->kobj,
1257 &con->badpages_attr.attr,
1258 RAS_FS_NAME);
1259}
1260
1261static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1262{
1263 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1264 struct attribute *attrs[] = {
1265 &con->features_attr.attr,
1266 NULL
1267 };
1268 struct attribute_group group = {
1269 .name = RAS_FS_NAME,
1270 .attrs = attrs,
1271 };
1272
1273 sysfs_remove_group(&adev->dev->kobj, &group);
1274
1275 return 0;
1276}
1277
1278int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
1279 struct ras_common_if *head)
1280{
1281 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1282
1283 if (!obj || obj->attr_inuse)
1284 return -EINVAL;
1285
1286 get_obj(obj);
1287
1288 snprintf(obj->fs_data.sysfs_name, sizeof(obj->fs_data.sysfs_name),
1289 "%s_err_count", head->name);
1290
1291 obj->sysfs_attr = (struct device_attribute){
1292 .attr = {
1293 .name = obj->fs_data.sysfs_name,
1294 .mode = S_IRUGO,
1295 },
1296 .show = amdgpu_ras_sysfs_read,
1297 };
1298 sysfs_attr_init(&obj->sysfs_attr.attr);
1299
1300 if (sysfs_add_file_to_group(&adev->dev->kobj,
1301 &obj->sysfs_attr.attr,
1302 RAS_FS_NAME)) {
1303 put_obj(obj);
1304 return -EINVAL;
1305 }
1306
1307 obj->attr_inuse = 1;
1308
1309 return 0;
1310}
1311
1312int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1313 struct ras_common_if *head)
1314{
1315 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1316
1317 if (!obj || !obj->attr_inuse)
1318 return -EINVAL;
1319
1320 sysfs_remove_file_from_group(&adev->dev->kobj,
1321 &obj->sysfs_attr.attr,
1322 RAS_FS_NAME);
1323 obj->attr_inuse = 0;
1324 put_obj(obj);
1325
1326 return 0;
1327}
1328
1329static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1330{
1331 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1332 struct ras_manager *obj, *tmp;
1333
1334 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1335 amdgpu_ras_sysfs_remove(adev, &obj->head);
1336 }
1337
1338 if (amdgpu_bad_page_threshold != 0)
1339 amdgpu_ras_sysfs_remove_bad_page_node(adev);
1340
1341 amdgpu_ras_sysfs_remove_feature_node(adev);
1342
1343 return 0;
1344}
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1367{
1368 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1369 struct drm_minor *minor = adev_to_drm(adev)->primary;
1370 struct dentry *dir;
1371
1372 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
1373 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
1374 &amdgpu_ras_debugfs_ctrl_ops);
1375 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
1376 &amdgpu_ras_debugfs_eeprom_ops);
1377 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
1378 &con->bad_page_cnt_threshold);
1379 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
1380 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
1381 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
1382 &amdgpu_ras_debugfs_eeprom_size_ops);
1383 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
1384 S_IRUGO, dir, adev,
1385 &amdgpu_ras_debugfs_eeprom_table_ops);
1386 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
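
	/*
	 * "auto_reboot" exposes con->reboot.  When set, the fatal-error
	 * handling later in this file is expected to sync filesystems and
	 * reboot the machine (note the <linux/reboot.h> and
	 * <linux/syscalls.h> includes) instead of relying on a GPU reset
	 * alone.
	 */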
1396 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
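
	/*
	 * When set, amdgpu_ras_do_recovery() skips harvesting and logging
	 * the per-block error counters (for the whole XGMI hive, if any)
	 * before the GPU reset is issued.
	 */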
1402 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
1403 &con->disable_ras_err_cnt_harvest);
1404 return dir;
1405}
1406
1407static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1408 struct ras_fs_if *head,
1409 struct dentry *dir)
1410{
1411 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1412
1413 if (!obj || !dir)
1414 return;
1415
1416 get_obj(obj);
1417
1418 memcpy(obj->fs_data.debugfs_name,
1419 head->debugfs_name,
1420 sizeof(obj->fs_data.debugfs_name));
1421
1422 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
1423 obj, &amdgpu_ras_debugfs_ops);
1424}
1425
1426void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1427{
1428 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1429 struct dentry *dir;
1430 struct ras_manager *obj;
1431 struct ras_fs_if fs_info;
1432
1433
1434
1435
1436
1437 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1438 return;
1439
1440 dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1441
1442 list_for_each_entry(obj, &con->head, node) {
1443 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1444 (obj->attr_inuse == 1)) {
1445 sprintf(fs_info.debugfs_name, "%s_err_inject",
1446 get_ras_block_str(&obj->head));
1447 fs_info.head = obj->head;
1448 amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1449 }
1450 }
1451}
1452
1453
1454
1455
1456static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1457 amdgpu_ras_sysfs_badpages_read, NULL, 0);
1458static DEVICE_ATTR(features, S_IRUGO,
1459 amdgpu_ras_sysfs_features_read, NULL);
1460static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1461{
1462 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1463 struct attribute_group group = {
1464 .name = RAS_FS_NAME,
1465 };
1466 struct attribute *attrs[] = {
1467 &con->features_attr.attr,
1468 NULL
1469 };
1470 struct bin_attribute *bin_attrs[] = {
1471 NULL,
1472 NULL,
1473 };
1474 int r;
1475
1476
1477 con->features_attr = dev_attr_features;
1478 group.attrs = attrs;
1479 sysfs_attr_init(attrs[0]);
1480
1481 if (amdgpu_bad_page_threshold != 0) {
1482
1483 bin_attr_gpu_vram_bad_pages.private = NULL;
1484 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1485 bin_attrs[0] = &con->badpages_attr;
1486 group.bin_attrs = bin_attrs;
1487 sysfs_bin_attr_init(bin_attrs[0]);
1488 }
1489
1490 r = sysfs_create_group(&adev->dev->kobj, &group);
1491 if (r)
1492 dev_err(adev->dev, "Failed to create RAS sysfs group!");
1493
1494 return 0;
1495}
1496
1497static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1498{
1499 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1500 struct ras_manager *con_obj, *ip_obj, *tmp;
1501
1502 if (IS_ENABLED(CONFIG_DEBUG_FS)) {
1503 list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
1504 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
1505 if (ip_obj)
1506 put_obj(ip_obj);
1507 }
1508 }
1509
1510 amdgpu_ras_sysfs_remove_all(adev);
1511 return 0;
1512}
1513
1514
1515
1516static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1517{
1518 struct ras_ih_data *data = &obj->ih_data;
1519 struct amdgpu_iv_entry entry;
1520 int ret;
1521 struct ras_err_data err_data = {0, 0, 0, NULL};
1522
1523 while (data->rptr != data->wptr) {
1524 rmb();
1525 memcpy(&entry, &data->ring[data->rptr],
1526 data->element_size);
1527
1528 wmb();
1529 data->rptr = (data->aligned_element_size +
1530 data->rptr) % data->ring_size;
1531
1532 if (data->cb) {
1533 if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
1534 obj->head.block == AMDGPU_RAS_BLOCK__UMC)
1535 dev_info(obj->adev->dev,
1536 "Poison is created, no user action is needed.\n");
1537 else {
1538
1539
1540
1541 memset(&err_data, 0, sizeof(err_data));
1542 ret = data->cb(obj->adev, &err_data, &entry);
1543
1544
1545
1546
1547
1548 if (ret == AMDGPU_RAS_SUCCESS) {
1549
1550
1551
1552 obj->err_data.ue_count += err_data.ue_count;
1553 obj->err_data.ce_count += err_data.ce_count;
1554 }
1555 }
1556 }
1557 }
1558}
1559
1560static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1561{
1562 struct ras_ih_data *data =
1563 container_of(work, struct ras_ih_data, ih_work);
1564 struct ras_manager *obj =
1565 container_of(data, struct ras_manager, ih_data);
1566
1567 amdgpu_ras_interrupt_handler(obj);
1568}
1569
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* the ring has no overflow protection; a slow handler may be lapped */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}
1594
1595int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1596 struct ras_common_if *head)
1597{
1598 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1599 struct ras_ih_data *data;
1600
1601 if (!obj)
1602 return -EINVAL;
1603
1604 data = &obj->ih_data;
1605 if (data->inuse == 0)
1606 return 0;
1607
1608 cancel_work_sync(&data->ih_work);
1609
1610 kfree(data->ring);
1611 memset(data, 0, sizeof(*data));
1612 put_obj(obj);
1613
1614 return 0;
1615}
1616
1617int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1618 struct ras_common_if *head)
1619{
1620 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1621 struct ras_ih_data *data;
1622 struct amdgpu_ras_block_object *ras_obj;
1623
1624 if (!obj) {
1625
1626 obj = amdgpu_ras_create_obj(adev, head);
1627 if (!obj)
1628 return -EINVAL;
1629 } else
1630 get_obj(obj);
1631
1632 ras_obj = container_of(head, struct amdgpu_ras_block_object, ras_comm);
1633
1634 data = &obj->ih_data;
1635
1636 *data = (struct ras_ih_data) {
1637 .inuse = 0,
1638 .cb = ras_obj->ras_cb,
1639 .element_size = sizeof(struct amdgpu_iv_entry),
1640 .rptr = 0,
1641 .wptr = 0,
1642 };
1643
1644 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1645
1646 data->aligned_element_size = ALIGN(data->element_size, 8);
1647
1648 data->ring_size = 64 * data->aligned_element_size;
1649 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1650 if (!data->ring) {
1651 put_obj(obj);
1652 return -ENOMEM;
1653 }
1654
1655
1656 data->inuse = 1;
1657
1658 return 0;
1659}
1660
1661static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1662{
1663 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1664 struct ras_manager *obj, *tmp;
1665
1666 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1667 amdgpu_ras_interrupt_remove_handler(adev, &obj->head);
1668 }
1669
1670 return 0;
1671}
1672
1673
1674
1675static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1676{
1677 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1678 struct ras_manager *obj;
1679
1680 if (!adev->ras_enabled || !con)
1681 return;
1682
1683 list_for_each_entry(obj, &con->head, node) {
1684 struct ras_query_if info = {
1685 .head = obj->head,
1686 };
1687
1688
1689
1690
1691
1692
1693
1694 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1695 continue;
1696
1697
1698
1699
1700
1701
1702
1703 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
1704 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
1705 continue;
1706
1707 amdgpu_ras_query_error_status(adev, &info);
1708 }
1709}
1710
1711
1712static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1713 struct ras_query_if *info)
1714{
1715 struct amdgpu_ras_block_object *block_obj;
1716
1717
1718
1719
1720 if ((info->head.block != AMDGPU_RAS_BLOCK__GFX) &&
1721 (info->head.block != AMDGPU_RAS_BLOCK__MMHUB))
1722 return;
1723
1724 block_obj = amdgpu_ras_get_ras_block(adev,
1725 info->head.block,
1726 info->head.sub_block_index);
1727
1728 if (!block_obj || !block_obj->hw_ops) {
1729 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1730 get_ras_block_str(&info->head));
1731 return;
1732 }
1733
1734 if (block_obj->hw_ops->query_ras_error_status)
1735 block_obj->hw_ops->query_ras_error_status(adev);
1736
1737}
1738
1739static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1740{
1741 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1742 struct ras_manager *obj;
1743
1744 if (!adev->ras_enabled || !con)
1745 return;
1746
1747 list_for_each_entry(obj, &con->head, node) {
1748 struct ras_query_if info = {
1749 .head = obj->head,
1750 };
1751
1752 amdgpu_ras_error_status_query(adev, &info);
1753 }
1754}
1755
1756
1757
1758
1759
1760
1761static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1762 struct ras_badpage **bps, unsigned int *count)
1763{
1764 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1765 struct ras_err_handler_data *data;
1766 int i = 0;
1767 int ret = 0, status;
1768
1769 if (!con || !con->eh_data || !bps || !count)
1770 return -EINVAL;
1771
1772 mutex_lock(&con->recovery_lock);
1773 data = con->eh_data;
1774 if (!data || data->count == 0) {
1775 *bps = NULL;
1776 ret = -EINVAL;
1777 goto out;
1778 }
1779
1780 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1781 if (!*bps) {
1782 ret = -ENOMEM;
1783 goto out;
1784 }
1785
1786 for (; i < data->count; i++) {
1787 (*bps)[i] = (struct ras_badpage){
1788 .bp = data->bps[i].retired_page,
1789 .size = AMDGPU_GPU_PAGE_SIZE,
1790 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1791 };
1792 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
1793 data->bps[i].retired_page);
1794 if (status == -EBUSY)
1795 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1796 else if (status == -ENOENT)
1797 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1798 }
1799
1800 *count = data->count;
1801out:
1802 mutex_unlock(&con->recovery_lock);
1803 return ret;
1804}
1805
1806static void amdgpu_ras_do_recovery(struct work_struct *work)
1807{
1808 struct amdgpu_ras *ras =
1809 container_of(work, struct amdgpu_ras, recovery_work);
1810 struct amdgpu_device *remote_adev = NULL;
1811 struct amdgpu_device *adev = ras->adev;
1812 struct list_head device_list, *device_list_handle = NULL;
1813
1814 if (!ras->disable_ras_err_cnt_harvest) {
1815 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
1816
1817
1818 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
1819 device_list_handle = &hive->device_list;
1820 } else {
1821 INIT_LIST_HEAD(&device_list);
1822 list_add_tail(&adev->gmc.xgmi.head, &device_list);
1823 device_list_handle = &device_list;
1824 }
1825
1826 list_for_each_entry(remote_adev,
1827 device_list_handle, gmc.xgmi.head) {
1828 amdgpu_ras_query_err_status(remote_adev);
1829 amdgpu_ras_log_on_err_counter(remote_adev);
1830 }
1831
1832 amdgpu_put_xgmi_hive(hive);
1833 }
1834
1835 if (amdgpu_device_should_recover_gpu(ras->adev))
1836 amdgpu_device_gpu_recover(ras->adev, NULL);
1837 atomic_set(&ras->in_recovery, 0);
1838}
1839
1840
1841static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1842 struct ras_err_handler_data *data, int pages)
1843{
1844 unsigned int old_space = data->count + data->space_left;
1845 unsigned int new_space = old_space + pages;
1846 unsigned int align_space = ALIGN(new_space, 512);
1847 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1848
	if (!bps)
		return -ENOMEM;
1853
1854 if (data->bps) {
1855 memcpy(bps, data->bps,
1856 data->count * sizeof(*data->bps));
1857 kfree(data->bps);
1858 }
1859
1860 data->bps = bps;
1861 data->space_left += align_space - old_space;
1862 return 0;
1863}
1864
1865
1866int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1867 struct eeprom_table_record *bps, int pages)
1868{
1869 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1870 struct ras_err_handler_data *data;
1871 int ret = 0;
1872 uint32_t i;
1873
1874 if (!con || !con->eh_data || !bps || pages <= 0)
1875 return 0;
1876
1877 mutex_lock(&con->recovery_lock);
1878 data = con->eh_data;
1879 if (!data)
1880 goto out;
1881
1882 for (i = 0; i < pages; i++) {
1883 if (amdgpu_ras_check_bad_page_unlock(con,
1884 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
1885 continue;
1886
1887 if (!data->space_left &&
1888 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
1889 ret = -ENOMEM;
1890 goto out;
1891 }
1892
1893 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
1894 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
1895 AMDGPU_GPU_PAGE_SIZE);
1896
1897 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
1898 data->count++;
1899 data->space_left--;
1900 }
1901out:
1902 mutex_unlock(&con->recovery_lock);
1903
1904 return ret;
1905}
1906
1907
1908
1909
1910
1911int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1912{
1913 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1914 struct ras_err_handler_data *data;
1915 struct amdgpu_ras_eeprom_control *control;
1916 int save_count;
1917
1918 if (!con || !con->eh_data)
1919 return 0;
1920
1921 mutex_lock(&con->recovery_lock);
1922 control = &con->eeprom_control;
1923 data = con->eh_data;
1924 save_count = data->count - control->ras_num_recs;
1925 mutex_unlock(&con->recovery_lock);
1926
1927 if (save_count > 0) {
1928 if (amdgpu_ras_eeprom_append(control,
1929 &data->bps[control->ras_num_recs],
1930 save_count)) {
1931 dev_err(adev->dev, "Failed to save EEPROM table data!");
1932 return -EIO;
1933 }
1934
1935 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
1936 }
1937
1938 return 0;
1939}
1940
1941
1942
1943
1944
1945static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1946{
1947 struct amdgpu_ras_eeprom_control *control =
1948 &adev->psp.ras_context.ras->eeprom_control;
1949 struct eeprom_table_record *bps;
1950 int ret;
1951
1952
1953 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
1954 return 0;
1955
1956 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
1957 if (!bps)
1958 return -ENOMEM;
1959
1960 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
1961 if (ret)
1962 dev_err(adev->dev, "Failed to load EEPROM table records!");
1963 else
1964 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
1965
1966 kfree(bps);
1967 return ret;
1968}
1969
1970static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
1971 uint64_t addr)
1972{
1973 struct ras_err_handler_data *data = con->eh_data;
1974 int i;
1975
1976 addr >>= AMDGPU_GPU_PAGE_SHIFT;
1977 for (i = 0; i < data->count; i++)
1978 if (addr == data->bps[i].retired_page)
1979 return true;
1980
1981 return false;
1982}
1983
1984
1985
1986
1987
1988
1989static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1990 uint64_t addr)
1991{
1992 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1993 bool ret = false;
1994
1995 if (!con || !con->eh_data)
1996 return ret;
1997
1998 mutex_lock(&con->recovery_lock);
1999 ret = amdgpu_ras_check_bad_page_unlock(con, addr);
2000 mutex_unlock(&con->recovery_lock);
2001 return ret;
2002}
2003
2004static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
2005 uint32_t max_count)
2006{
2007 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
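
	/*
	 * Threshold selection, as implemented below:
	 *  - amdgpu_bad_page_threshold < 0 (the default): allow one retired
	 *    page per RAS_BAD_PAGE_COVER (100MB) of VRAM, capped at
	 *    max_count (the EEPROM record capacity passed in by the caller);
	 *  - amdgpu_bad_page_threshold >= 0: use the module parameter
	 *    directly, again capped at max_count.  A value of 0 effectively
	 *    disables bad-page retirement in the paths that test
	 *    amdgpu_bad_page_threshold.
	 */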
2028 if (amdgpu_bad_page_threshold < 0) {
2029 u64 val = adev->gmc.mc_vram_size;
2030
2031 do_div(val, RAS_BAD_PAGE_COVER);
2032 con->bad_page_cnt_threshold = min(lower_32_bits(val),
2033 max_count);
2034 } else {
2035 con->bad_page_cnt_threshold = min_t(int, max_count,
2036 amdgpu_bad_page_threshold);
2037 }
2038}
2039
2040int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2041{
2042 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2043 struct ras_err_handler_data **data;
2044 u32 max_eeprom_records_count = 0;
2045 bool exc_err_limit = false;
2046 int ret;
2047
2048 if (!con)
2049 return 0;
2050
2051
2052
2053
2054
2055
2056 con->adev = adev;
2057
2058 if (!adev->ras_enabled)
2059 return 0;
2060
2061 data = &con->eh_data;
2062 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
2063 if (!*data) {
2064 ret = -ENOMEM;
2065 goto out;
2066 }
2067
2068 mutex_init(&con->recovery_lock);
2069 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
2070 atomic_set(&con->in_recovery, 0);
2071 con->eeprom_control.bad_channel_bitmap = 0;
2072
2073 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
2074 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
2075
2076
2077
2078
2079
2080 if (adev->gmc.xgmi.pending_reset)
2081 return 0;
2082 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
2083
2084
2085
2086
2087 if (exc_err_limit || ret)
2088 goto free;
2089
2090 if (con->eeprom_control.ras_num_recs) {
2091 ret = amdgpu_ras_load_bad_pages(adev);
2092 if (ret)
2093 goto free;
2094
2095 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
2096
2097 if (con->update_channel_flag == true) {
2098 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
2099 con->update_channel_flag = false;
2100 }
2101 }
2102
2103#ifdef CONFIG_X86_MCE_AMD
2104 if ((adev->asic_type == CHIP_ALDEBARAN) &&
2105 (adev->gmc.xgmi.connected_to_cpu))
2106 amdgpu_register_bad_pages_mca_notifier(adev);
2107#endif
2108 return 0;
2109
2110free:
2111 kfree((*data)->bps);
2112 kfree(*data);
2113 con->eh_data = NULL;
2114out:
2115 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
2116
2117
2118
2119
2120
2121 if (!exc_err_limit)
2122 ret = 0;
2123 else
2124 ret = -EINVAL;
2125
2126 return ret;
2127}
2128
2129static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2130{
2131 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2132 struct ras_err_handler_data *data = con->eh_data;
2133
2134
2135 if (!data)
2136 return 0;
2137
2138 cancel_work_sync(&con->recovery_work);
2139
2140 mutex_lock(&con->recovery_lock);
2141 con->eh_data = NULL;
2142 kfree(data->bps);
2143 kfree(data);
2144 mutex_unlock(&con->recovery_lock);
2145
2146 return 0;
2147}
2148
2149
2150static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
2151{
2152 return adev->asic_type == CHIP_VEGA10 ||
2153 adev->asic_type == CHIP_VEGA20 ||
2154 adev->asic_type == CHIP_ARCTURUS ||
2155 adev->asic_type == CHIP_ALDEBARAN ||
2156 adev->asic_type == CHIP_SIENNA_CICHLID;
2157}
2158
2159
2160
2161
2162
2163
2164static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
2165{
2166 struct atom_context *ctx = adev->mode_info.atom_context;
2167
2168 if (!ctx)
2169 return;
2170
2171 if (strnstr(ctx->vbios_version, "D16406",
2172 sizeof(ctx->vbios_version)) ||
2173 strnstr(ctx->vbios_version, "D36002",
2174 sizeof(ctx->vbios_version)))
2175 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2176}
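
/*
 * Determine which RAS blocks the platform supports and record the result:
 *
 *  - adev->ras_hw_enabled: blocks reported as ECC/RAS capable, taken from
 *    the atomfirmware MEM/SRAM ECC queries on discrete parts or a fixed
 *    GFX/SDMA/MMHUB set when the GPU is connected to the CPU die, plus the
 *    VBIOS quirks above, masked by AMDGPU_RAS_BLOCK_MASK;
 *  - adev->ras_enabled: the subset left after applying the
 *    amdgpu_ras_enable and amdgpu_ras_mask module parameters.
 */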
2187static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
2188{
2189 adev->ras_hw_enabled = adev->ras_enabled = 0;
2190
2191 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
2192 !amdgpu_ras_asic_supported(adev))
2193 return;
2194
2195 if (!adev->gmc.xgmi.connected_to_cpu) {
2196 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
2197 dev_info(adev->dev, "MEM ECC is active.\n");
2198 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
2199 1 << AMDGPU_RAS_BLOCK__DF);
2200 } else {
2201 dev_info(adev->dev, "MEM ECC is not presented.\n");
2202 }
2203
2204 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
2205 dev_info(adev->dev, "SRAM ECC is active.\n");
2206 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
2207 1 << AMDGPU_RAS_BLOCK__DF);
2208 } else {
2209 dev_info(adev->dev, "SRAM ECC is not presented.\n");
2210 }
2211 } else {
2212
2213
2214 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
2215 1 << AMDGPU_RAS_BLOCK__SDMA |
2216 1 << AMDGPU_RAS_BLOCK__MMHUB);
2217 }
2218
2219 amdgpu_ras_get_quirks(adev);
2220
2221
2222 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
2223
2224 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
2225 adev->ras_hw_enabled & amdgpu_ras_mask;
2226}
2227
2228static void amdgpu_ras_counte_dw(struct work_struct *work)
2229{
2230 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
2231 ras_counte_delay_work.work);
2232 struct amdgpu_device *adev = con->adev;
2233 struct drm_device *dev = adev_to_drm(adev);
2234 unsigned long ce_count, ue_count;
2235 int res;
2236
2237 res = pm_runtime_get_sync(dev->dev);
2238 if (res < 0)
2239 goto Out;
2240
2241
2242
2243 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
2244 atomic_set(&con->ras_ce_count, ce_count);
2245 atomic_set(&con->ras_ue_count, ue_count);
2246 }
2247
2248 pm_runtime_mark_last_busy(dev->dev);
2249Out:
2250 pm_runtime_put_autosuspend(dev->dev);
2251}
2252
2253int amdgpu_ras_init(struct amdgpu_device *adev)
2254{
2255 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2256 int r;
2257 bool df_poison, umc_poison;
2258
2259 if (con)
2260 return 0;
2261
2262 con = kmalloc(sizeof(struct amdgpu_ras) +
2263 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
2264 sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2265 GFP_KERNEL|__GFP_ZERO);
2266 if (!con)
2267 return -ENOMEM;
2268
2269 con->adev = adev;
2270 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
2271 atomic_set(&con->ras_ce_count, 0);
2272 atomic_set(&con->ras_ue_count, 0);
2273
2274 con->objs = (struct ras_manager *)(con + 1);
2275
2276 amdgpu_ras_set_context(adev, con);
2277
2278 amdgpu_ras_check_supported(adev);
2279
2280 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
2281
2282
2283
2284 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
2285 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
2286
2287 return 0;
2288 }
2289
2290 r = 0;
2291 goto release_con;
2292 }
2293
2294 con->update_channel_flag = false;
2295 con->features = 0;
2296 INIT_LIST_HEAD(&con->head);
2297
2298 con->flags = RAS_DEFAULT_FLAGS;
2299
2300
2301
2302
2303 switch (adev->asic_type) {
2304 case CHIP_VEGA20:
2305 case CHIP_ARCTURUS:
2306 case CHIP_ALDEBARAN:
2307 if (!adev->gmc.xgmi.connected_to_cpu) {
2308 adev->nbio.ras = &nbio_v7_4_ras;
2309 amdgpu_ras_register_ras_block(adev, &adev->nbio.ras->ras_block);
2310 adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
2311 }
2312 break;
2313 default:
2314
2315 break;
2316 }
2317
2318 if (adev->nbio.ras &&
2319 adev->nbio.ras->init_ras_controller_interrupt) {
2320 r = adev->nbio.ras->init_ras_controller_interrupt(adev);
2321 if (r)
2322 goto release_con;
2323 }
2324
2325 if (adev->nbio.ras &&
2326 adev->nbio.ras->init_ras_err_event_athub_interrupt) {
2327 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev);
2328 if (r)
2329 goto release_con;
2330 }
2331
2332
2333 if (adev->gmc.xgmi.connected_to_cpu) {
2334
2335 con->poison_supported = true;
2336 }
2337 else if (adev->df.funcs &&
2338 adev->df.funcs->query_ras_poison_mode &&
2339 adev->umc.ras &&
2340 adev->umc.ras->query_ras_poison_mode) {
2341 df_poison =
2342 adev->df.funcs->query_ras_poison_mode(adev);
2343 umc_poison =
2344 adev->umc.ras->query_ras_poison_mode(adev);
2345
2346 if (df_poison && umc_poison)
2347 con->poison_supported = true;
2348 else if (df_poison != umc_poison)
2349 dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
2350 df_poison, umc_poison);
2351 }
2352
2353 if (amdgpu_ras_fs_init(adev)) {
2354 r = -EINVAL;
2355 goto release_con;
2356 }
2357
2358 dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2359 "hardware ability[%x] ras_mask[%x]\n",
2360 adev->ras_hw_enabled, adev->ras_enabled);
2361
2362 return 0;
2363release_con:
2364 amdgpu_ras_set_context(adev, NULL);
2365 kfree(con);
2366
2367 return r;
2368}
2369
2370int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
2371{
2372 if (adev->gmc.xgmi.connected_to_cpu)
2373 return 1;
2374 return 0;
2375}
2376
2377static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
2378 struct ras_common_if *ras_block)
2379{
2380 struct ras_query_if info = {
2381 .head = *ras_block,
2382 };
2383
2384 if (!amdgpu_persistent_edc_harvesting_supported(adev))
2385 return 0;
2386
2387 if (amdgpu_ras_query_error_status(adev, &info) != 0)
2388 DRM_WARN("RAS init harvest failure");
2389
2390 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
2391 DRM_WARN("RAS init harvest reset failure");
2392
2393 return 0;
2394}
2395
2396bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
2397{
2398 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2399
2400 if (!con)
2401 return false;
2402
2403 return con->poison_supported;
2404}
2405
2406
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			       struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned long ue_count, ce_count;
	int r;

	/* disable the RAS feature for this IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (adev->in_suspend || amdgpu_in_reset(adev)) {
			/* in the resume/reset path, fall through to cleanup
			 * (remove handlers and disable the feature) instead
			 * of failing hard.
			 */
			goto cleanup;
		} else
			return r;
	}

	/* harvest errors that persisted across a warm reset on ASICs that
	 * support it */
	amdgpu_persistent_edc_harvesting(adev, ras_block);

	/* in the resume phase there is no need to recreate the RAS fs nodes */
	if (adev->in_suspend || amdgpu_in_reset(adev))
		return 0;

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ras_block);
		if (r)
			goto cleanup;
	}

	r = amdgpu_ras_sysfs_create(adev, ras_block);
	if (r)
		goto interrupt;

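	/* Cache the error counts present at init time. */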
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	return 0;

interrupt:
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
cleanup:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
					      struct ras_common_if *ras_block)
{
	return amdgpu_ras_block_late_init(adev, ras_block);
}

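/* Tear down the per-block sysfs node and interrupt handler. */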
void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
				struct ras_common_if *ras_block)
{
	struct amdgpu_ras_block_object *ras_obj;

	if (!ras_block)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);

	ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
	if (ras_obj->ras_cb)
		amdgpu_ras_interrupt_remove_handler(adev, ras_block);
}

static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev,
					       struct ras_common_if *ras_block)
{
	amdgpu_ras_block_late_fini(adev, ras_block);
}

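/* Do the init work that depends on IP late init; this runs on boot, on
 * resume, and after a GPU reset.
 */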
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* release the context kept around for the RAS-disabled case */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
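		/* Enable every feature the hardware reports; blocks the
		 * driver does not actually support are cleaned up again just
		 * below.
		 */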
		amdgpu_ras_enable_all_features(adev, 1);

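		/* Walk the enabled objects and drop the ones that are not
		 * supported by the RAS module on this device.
		 */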
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);

				WARN_ON(alive_obj(obj));
			}
		}
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* make sure all RAS fs nodes are gone */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

int amdgpu_ras_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;
	int r;

	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		if (!node->ras_obj) {
			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
			continue;
		}

		obj = node->ras_obj;
		if (obj->ras_late_init) {
			r = obj->ras_late_init(adev, &obj->ras_comm);
			if (r) {
				dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n",
					obj->ras_comm.name, r);
				return r;
			}
		} else
			amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
	}

	return 0;
}

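/* Do the fini work that other IP fini routines depend on. */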
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

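	/* RAS must be disabled on every IP before the IP hw/sw fini runs. */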
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *ras_node, *tmp;
	struct amdgpu_ras_block_object *obj = NULL;
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
		if (ras_node->ras_obj) {
			obj = ras_node->ras_obj;
			if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) &&
			    obj->ras_fini)
				obj->ras_fini(adev, &obj->ras_comm);
			else
				amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm);
		}

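		/* unlink the node from ras_list and free it */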
		list_del(&ras_node->node);
		kfree(ras_node);
	}

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	amdgpu_ras_check_supported(adev);
	if (!adev->ras_hw_enabled)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		dev_info(adev->dev, "uncorrectable hardware error "
			 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}

bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{
	if (adev->asic_type == CHIP_VEGA20 &&
	    adev->pm.fw_version <= 0x283400) {
		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
		       amdgpu_ras_intr_triggered();
	}

	return false;
}

void amdgpu_release_ras_context(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
	}
}

#ifdef CONFIG_X86_MCE_AMD
static struct amdgpu_device *find_adev(uint32_t node_id)
{
	int i;
	struct amdgpu_device *adev = NULL;

	for (i = 0; i < mce_adev_list.num_gpu; i++) {
		adev = mce_adev_list.devs[i];

		if (adev && adev->gmc.xgmi.connected_to_cpu &&
		    adev->gmc.xgmi.physical_node_id == node_id)
			break;
		adev = NULL;
	}

	return adev;
}

#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
#define GPU_ID_OFFSET		8

static int amdgpu_bad_page_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct amdgpu_device *adev = NULL;
	uint32_t gpu_id = 0;
	uint32_t umc_inst = 0;
	uint32_t ch_inst, channel_index = 0;
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct eeprom_table_record err_rec;
	uint64_t retired_page;

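	/* Only handle errors reported by an SMCA UMC_V2 bank (a GPU UMC)
	 * whose extended error code is 0, i.e. a DRAM ECC error.
	 */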
	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
		    (XEC(m->status, 0x3f) == 0x0)))
		return NOTIFY_DONE;

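	/* Nothing to do for correctable errors; only uncorrectable errors
	 * lead to page retirement here.
	 */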
	if (mce_is_correctable(m))
		return NOTIFY_OK;

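	/* The GPU/node id is carried in the MCA_IPID register, offset by
	 * GPU_ID_OFFSET.
	 */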
	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;

	adev = find_adev(gpu_id);
	if (!adev) {
		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
			 gpu_id);
		return NOTIFY_DONE;
	}

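	/* The UMC instance and channel index are also encoded in MCA_IPID. */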
	umc_inst = GET_UMC_INST(m->ipid);
	ch_inst = GET_CHAN_INDEX(m->ipid);

	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
		 umc_inst, ch_inst);

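	/* Translate the UMC channel address into a retired page (SoC
	 * physical address) via the per-ASIC channel index table.
	 */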
	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
					  + ch_inst];

	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
		       ADDR_OF_256B_BLOCK(channel_index) |
		       OFFSET_IN_256B_BLOCK(m->addr);

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
	err_data.err_addr = &err_rec;
	amdgpu_umc_fill_error_record(&err_data, m->addr,
				     retired_page, channel_index, umc_inst);

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev);
	}

	return NOTIFY_OK;
}

static struct notifier_block amdgpu_bad_page_nb = {
	.notifier_call	= amdgpu_bad_page_notifier,
	.priority	= MCE_PRIO_UC,
};

static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{
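	/* Add this adev to the list the MCE notifier uses to map an MCA node
	 * id back to the GPU that owns it (see find_adev()).
	 */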
	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;

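	/* Register the MCE decode chain notifier only once, regardless of
	 * how many GPUs are present.
	 */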
	if (!notifier_registered) {
		mce_register_decode_chain(&amdgpu_bad_page_nb);
		notifier_registered = true;
	}
}
#endif

struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev)
{
	if (!adev)
		return NULL;

	return adev->psp.ras_context.ras;
}

int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con)
{
	if (!adev)
		return -EINVAL;

	adev->psp.ras_context.ras = ras_con;
	return 0;
}

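/* Return nonzero when RAS is enabled for the given block on this device. */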
int amdgpu_ras_is_supported(struct amdgpu_device *adev,
			    unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (block >= AMDGPU_RAS_BLOCK_COUNT)
		return 0;
	return ras && (adev->ras_enabled & (1 << block));
}

int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
		schedule_work(&ras->recovery_work);
	return 0;
}

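/* Register an IP's RAS block object onto the device's RAS list. */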
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
				  struct amdgpu_ras_block_object *ras_block_obj)
{
	struct amdgpu_ras_block_list *ras_node;

	if (!adev || !ras_block_obj)
		return -EINVAL;

	if (!amdgpu_ras_asic_supported(adev))
		return 0;

	ras_node = kzalloc(sizeof(*ras_node), GFP_KERNEL);
	if (!ras_node)
		return -ENOMEM;

	INIT_LIST_HEAD(&ras_node->node);
	ras_node->ras_obj = ras_block_obj;
	list_add_tail(&ras_node->node, &adev->ras_list);

	return 0;
}
