1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include <linux/uaccess.h>
28#include <linux/reboot.h>
29#include <linux/syscalls.h>
30#include <linux/pm_runtime.h>
31
32#include "amdgpu.h"
33#include "amdgpu_ras.h"
34#include "amdgpu_atomfirmware.h"
35#include "amdgpu_xgmi.h"
36#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
37#include "atom.h"
38#ifdef CONFIG_X86_MCE_AMD
39#include <asm/mce.h>
40
41static bool notifier_registered;
42#endif
43static const char *RAS_FS_NAME = "ras";
44
45const char *ras_error_string[] = {
46 "none",
47 "parity",
48 "single_correctable",
49 "multi_uncorrectable",
50 "poison",
51};
52
53const char *ras_block_string[] = {
54 "umc",
55 "sdma",
56 "gfx",
57 "mmhub",
58 "athub",
59 "pcie_bif",
60 "hdp",
61 "xgmi_wafl",
62 "df",
63 "smn",
64 "sem",
65 "mp0",
66 "mp1",
67 "fuse",
68 "mca",
69};
70
71const char *ras_mca_block_string[] = {
72 "mca_mp0",
73 "mca_mp1",
74 "mca_mpio",
75 "mca_iohc",
76};
77
78const char *get_ras_block_str(struct ras_common_if *ras_block)
79{
80 if (!ras_block)
81 return "NULL";
82
83 if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT)
84 return "OUT OF RANGE";
85
86 if (ras_block->block == AMDGPU_RAS_BLOCK__MCA)
87 return ras_mca_block_string[ras_block->sub_block_index];
88
89 return ras_block_string[ras_block->block];
90}
91
92#define ras_err_str(i) (ras_error_string[ffs(i)])
93
94#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
95
96
97#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
98
99
100#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
101
102enum amdgpu_ras_retire_page_reservation {
103 AMDGPU_RAS_RETIRE_PAGE_RESERVED,
104 AMDGPU_RAS_RETIRE_PAGE_PENDING,
105 AMDGPU_RAS_RETIRE_PAGE_FAULT,
106};
107
108atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
109
110static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
111 uint64_t addr);
112static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
113 uint64_t addr);
114#ifdef CONFIG_X86_MCE_AMD
115static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
116struct mce_notifier_adev_list {
117 struct amdgpu_device *devs[MAX_GPU_INSTANCE];
118 int num_gpu;
119};
120static struct mce_notifier_adev_list mce_adev_list;
121#endif
122
123void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
124{
125 if (adev && amdgpu_ras_get_context(adev))
126 amdgpu_ras_get_context(adev)->error_query_ready = ready;
127}
128
129static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
130{
131 if (adev && amdgpu_ras_get_context(adev))
132 return amdgpu_ras_get_context(adev)->error_query_ready;
133
134 return false;
135}
136
137static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
138{
139 struct ras_err_data err_data = {0, 0, 0, NULL};
140 struct eeprom_table_record err_rec;
141
142 if ((address >= adev->gmc.mc_vram_size) ||
143 (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
144 dev_warn(adev->dev,
145 "RAS WARN: input address 0x%llx is invalid.\n",
146 address);
147 return -EINVAL;
148 }
149
150 if (amdgpu_ras_check_bad_page(adev, address)) {
151 dev_warn(adev->dev,
152 "RAS WARN: 0x%llx has already been marked as bad page!\n",
153 address);
154 return 0;
155 }
156
157 memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
158
159 err_rec.address = address;
160 err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
161 err_rec.ts = (uint64_t)ktime_get_real_seconds();
162 err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
163
164 err_data.err_addr = &err_rec;
165 err_data.err_addr_cnt = 1;
166
167 if (amdgpu_bad_page_threshold != 0) {
168 amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
169 err_data.err_addr_cnt);
170 amdgpu_ras_save_bad_pages(adev);
171 }
172
173 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
174 dev_warn(adev->dev, "Clear EEPROM:\n");
175 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
176
177 return 0;
178}
179
180static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
181 size_t size, loff_t *pos)
182{
183 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
184 struct ras_query_if info = {
185 .head = obj->head,
186 };
187 ssize_t s;
188 char val[128];
189
190 if (amdgpu_ras_query_error_status(obj->adev, &info))
191 return -EINVAL;
192
193 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
194 "ue", info.ue_count,
195 "ce", info.ce_count);
196 if (*pos >= s)
197 return 0;
198
199 s -= *pos;
200 s = min_t(u64, s, size);
201
202
203 if (copy_to_user(buf, &val[*pos], s))
204 return -EINVAL;
205
206 *pos += s;
207
208 return s;
209}
210
211static const struct file_operations amdgpu_ras_debugfs_ops = {
212 .owner = THIS_MODULE,
213 .read = amdgpu_ras_debugfs_read,
214 .write = NULL,
215 .llseek = default_llseek
216};
217
218static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
219{
220 int i;
221
222 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
223 *block_id = i;
224 if (strcmp(name, ras_block_string[i]) == 0)
225 return 0;
226 }
227 return -EINVAL;
228}
229
230static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
231 const char __user *buf, size_t size,
232 loff_t *pos, struct ras_debug_if *data)
233{
234 ssize_t s = min_t(u64, 64, size);
235 char str[65];
236 char block_name[33];
237 char err[9] = "ue";
238 int op = -1;
239 int block_id;
240 uint32_t sub_block;
241 u64 address, value;
242
243 if (*pos)
244 return -EINVAL;
245 *pos = size;
246
247 memset(str, 0, sizeof(str));
248 memset(data, 0, sizeof(*data));
249
250 if (copy_from_user(str, buf, s))
251 return -EINVAL;
252
253 if (sscanf(str, "disable %32s", block_name) == 1)
254 op = 0;
255 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
256 op = 1;
257 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
258 op = 2;
259 else if (strstr(str, "retire_page") != NULL)
260 op = 3;
261 else if (str[0] && str[1] && str[2] && str[3])
262
263 return -EINVAL;
264
265 if (op != -1) {
266 if (op == 3) {
267 if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
268 sscanf(str, "%*s %llu", &address) != 1)
269 return -EINVAL;
270
271 data->op = op;
272 data->inject.address = address;
273
274 return 0;
275 }
276
277 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
278 return -EINVAL;
279
280 data->head.block = block_id;
281
282 if (!memcmp("ue", err, 2))
283 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
284 else if (!memcmp("ce", err, 2))
285 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
286 else
287 return -EINVAL;
288
289 data->op = op;
290
291 if (op == 2) {
292 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
293 &sub_block, &address, &value) != 3 &&
294 sscanf(str, "%*s %*s %*s %u %llu %llu",
295 &sub_block, &address, &value) != 3)
296 return -EINVAL;
297 data->head.sub_block_index = sub_block;
298 data->inject.address = address;
299 data->inject.value = value;
300 }
301 } else {
302 if (size < sizeof(*data))
303 return -EINVAL;
304
305 if (copy_from_user(data, buf, sizeof(*data)))
306 return -EINVAL;
307 }
308
309 return 0;
310}
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
392 const char __user *buf,
393 size_t size, loff_t *pos)
394{
395 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
396 struct ras_debug_if data;
397 int ret = 0;
398
399 if (!amdgpu_ras_get_error_query_ready(adev)) {
400 dev_warn(adev->dev, "RAS WARN: error injection "
401 "currently inaccessible\n");
402 return size;
403 }
404
405 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
406 if (ret)
407 return ret;
408
409 if (data.op == 3) {
410 ret = amdgpu_reserve_page_direct(adev, data.inject.address);
411 if (!ret)
412 return size;
413 else
414 return ret;
415 }
416
417 if (!amdgpu_ras_is_supported(adev, data.head.block))
418 return -EINVAL;
419
420 switch (data.op) {
421 case 0:
422 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
423 break;
424 case 1:
425 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
426 break;
427 case 2:
428 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
429 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
430 dev_warn(adev->dev, "RAS WARN: input address "
431 "0x%llx is invalid.",
432 data.inject.address);
433 ret = -EINVAL;
434 break;
435 }
436
437
438 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
439 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
440 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
441 "already been marked as bad!\n",
442 data.inject.address);
443 break;
444 }
445
446
447 ret = amdgpu_ras_error_inject(adev, &data.inject);
448 break;
449 default:
450 ret = -EINVAL;
451 break;
452 }
453
454 if (ret)
455 return -EINVAL;
456
457 return size;
458}
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
477 const char __user *buf,
478 size_t size, loff_t *pos)
479{
480 struct amdgpu_device *adev =
481 (struct amdgpu_device *)file_inode(f)->i_private;
482 int ret;
483
484 ret = amdgpu_ras_eeprom_reset_table(
485 &(amdgpu_ras_get_context(adev)->eeprom_control));
486
487 if (!ret) {
488
489
490 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
491 return size;
492 } else {
493 return ret;
494 }
495}
496
497static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
498 .owner = THIS_MODULE,
499 .read = NULL,
500 .write = amdgpu_ras_debugfs_ctrl_write,
501 .llseek = default_llseek
502};
503
504static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
505 .owner = THIS_MODULE,
506 .read = NULL,
507 .write = amdgpu_ras_debugfs_eeprom_write,
508 .llseek = default_llseek
509};
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
533 struct device_attribute *attr, char *buf)
534{
535 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
536 struct ras_query_if info = {
537 .head = obj->head,
538 };
539
540 if (!amdgpu_ras_get_error_query_ready(obj->adev))
541 return sysfs_emit(buf, "Query currently inaccessible\n");
542
543 if (amdgpu_ras_query_error_status(obj->adev, &info))
544 return -EINVAL;
545
546 if (obj->adev->asic_type == CHIP_ALDEBARAN) {
547 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
548 DRM_WARN("Failed to reset error counter and error status");
549 }
550
551 return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
552 "ce", info.ce_count);
553}
554
555
556
557#define get_obj(obj) do { (obj)->use++; } while (0)
558#define alive_obj(obj) ((obj)->use)
559
560static inline void put_obj(struct ras_manager *obj)
561{
562 if (obj && (--obj->use == 0))
563 list_del(&obj->node);
564 if (obj && (obj->use < 0))
565 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));
566}
567
568
569static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
570 struct ras_common_if *head)
571{
572 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
573 struct ras_manager *obj;
574
575 if (!adev->ras_enabled || !con)
576 return NULL;
577
578 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
579 return NULL;
580
581 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
582 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
583 return NULL;
584
585 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
586 } else
587 obj = &con->objs[head->block];
588
589
590 if (alive_obj(obj))
591 return NULL;
592
593 obj->head = *head;
594 obj->adev = adev;
595 list_add(&obj->node, &con->head);
596 get_obj(obj);
597
598 return obj;
599}
600
601
602struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
603 struct ras_common_if *head)
604{
605 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
606 struct ras_manager *obj;
607 int i;
608
609 if (!adev->ras_enabled || !con)
610 return NULL;
611
612 if (head) {
613 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
614 return NULL;
615
616 if (head->block == AMDGPU_RAS_BLOCK__MCA) {
617 if (head->sub_block_index >= AMDGPU_RAS_MCA_BLOCK__LAST)
618 return NULL;
619
620 obj = &con->objs[AMDGPU_RAS_BLOCK__LAST + head->sub_block_index];
621 } else
622 obj = &con->objs[head->block];
623
624 if (alive_obj(obj))
625 return obj;
626 } else {
627 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT + AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
628 obj = &con->objs[i];
629 if (alive_obj(obj))
630 return obj;
631 }
632 }
633
634 return NULL;
635}
636
637
638
639static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
640 struct ras_common_if *head)
641{
642 return adev->ras_hw_enabled & BIT(head->block);
643}
644
645static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
646 struct ras_common_if *head)
647{
648 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
649
650 return con->features & BIT(head->block);
651}
652
653
654
655
656
657static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
658 struct ras_common_if *head, int enable)
659{
660 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
661 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
662
663
664
665
666
667
668
669 if (!amdgpu_ras_is_feature_allowed(adev, head))
670 return 0;
671
672 if (enable) {
673 if (!obj) {
674 obj = amdgpu_ras_create_obj(adev, head);
675 if (!obj)
676 return -EINVAL;
677 } else {
678
679 get_obj(obj);
680 }
681 con->features |= BIT(head->block);
682 } else {
683 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
684 con->features &= ~BIT(head->block);
685 put_obj(obj);
686 }
687 }
688
689 return 0;
690}
691
692
693int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
694 struct ras_common_if *head, bool enable)
695{
696 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
697 union ta_ras_cmd_input *info;
698 int ret;
699
700 if (!con)
701 return -EINVAL;
702
703 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
704 if (!info)
705 return -ENOMEM;
706
707 if (!enable) {
708 info->disable_features = (struct ta_ras_disable_features_input) {
709 .block_id = amdgpu_ras_block_to_ta(head->block),
710 .error_type = amdgpu_ras_error_to_ta(head->type),
711 };
712 } else {
713 info->enable_features = (struct ta_ras_enable_features_input) {
714 .block_id = amdgpu_ras_block_to_ta(head->block),
715 .error_type = amdgpu_ras_error_to_ta(head->type),
716 };
717 }
718
719
720 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
721
722 if (!amdgpu_ras_intr_triggered()) {
723 ret = psp_ras_enable_features(&adev->psp, info, enable);
724 if (ret) {
725 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
726 enable ? "enable":"disable",
727 get_ras_block_str(head),
728 amdgpu_ras_is_poison_mode_supported(adev), ret);
729 goto out;
730 }
731 }
732
733
734 __amdgpu_ras_feature_enable(adev, head, enable);
735 ret = 0;
736out:
737 kfree(info);
738 return ret;
739}
740
741
742int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
743 struct ras_common_if *head, bool enable)
744{
745 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
746 int ret;
747
748 if (!con)
749 return -EINVAL;
750
751 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
752 if (enable) {
753
754
755
756
757
758
759 ret = amdgpu_ras_feature_enable(adev, head, 1);
760
761
762
763
764 if (ret == -EINVAL) {
765 ret = __amdgpu_ras_feature_enable(adev, head, 1);
766 if (!ret)
767 dev_info(adev->dev,
768 "RAS INFO: %s setup object\n",
769 get_ras_block_str(head));
770 }
771 } else {
772
773 ret = __amdgpu_ras_feature_enable(adev, head, 1);
774 if (ret)
775 return ret;
776
777
778 if (head->block == AMDGPU_RAS_BLOCK__GFX)
779 con->features |= BIT(head->block);
780
781 ret = amdgpu_ras_feature_enable(adev, head, 0);
782
783
784 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
785 con->features &= ~BIT(head->block);
786 }
787 } else
788 ret = amdgpu_ras_feature_enable(adev, head, enable);
789
790 return ret;
791}
792
793static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
794 bool bypass)
795{
796 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
797 struct ras_manager *obj, *tmp;
798
799 list_for_each_entry_safe(obj, tmp, &con->head, node) {
800
801
802
803 if (bypass) {
804 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
805 break;
806 } else {
807 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
808 break;
809 }
810 }
811
812 return con->features;
813}
814
815static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
816 bool bypass)
817{
818 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
819 int i;
820 const enum amdgpu_ras_error_type default_ras_type = AMDGPU_RAS_ERROR__NONE;
821
822 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
823 struct ras_common_if head = {
824 .block = i,
825 .type = default_ras_type,
826 .sub_block_index = 0,
827 };
828
829 if (i == AMDGPU_RAS_BLOCK__MCA)
830 continue;
831
832 if (bypass) {
833
834
835
836
837 if (__amdgpu_ras_feature_enable(adev, &head, 1))
838 break;
839 } else {
840 if (amdgpu_ras_feature_enable(adev, &head, 1))
841 break;
842 }
843 }
844
845 for (i = 0; i < AMDGPU_RAS_MCA_BLOCK_COUNT; i++) {
846 struct ras_common_if head = {
847 .block = AMDGPU_RAS_BLOCK__MCA,
848 .type = default_ras_type,
849 .sub_block_index = i,
850 };
851
852 if (bypass) {
853
854
855
856
857 if (__amdgpu_ras_feature_enable(adev, &head, 1))
858 break;
859 } else {
860 if (amdgpu_ras_feature_enable(adev, &head, 1))
861 break;
862 }
863 }
864
865 return con->features;
866}
867
868
869
870static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
871 struct ras_common_if *ras_block,
872 struct ras_err_data *err_data)
873{
874 switch (ras_block->sub_block_index) {
875 case AMDGPU_RAS_MCA_BLOCK__MP0:
876 if (adev->mca.mp0.ras_funcs &&
877 adev->mca.mp0.ras_funcs->query_ras_error_count)
878 adev->mca.mp0.ras_funcs->query_ras_error_count(adev, &err_data);
879 break;
880 case AMDGPU_RAS_MCA_BLOCK__MP1:
881 if (adev->mca.mp1.ras_funcs &&
882 adev->mca.mp1.ras_funcs->query_ras_error_count)
883 adev->mca.mp1.ras_funcs->query_ras_error_count(adev, &err_data);
884 break;
885 case AMDGPU_RAS_MCA_BLOCK__MPIO:
886 if (adev->mca.mpio.ras_funcs &&
887 adev->mca.mpio.ras_funcs->query_ras_error_count)
888 adev->mca.mpio.ras_funcs->query_ras_error_count(adev, &err_data);
889 break;
890 default:
891 break;
892 }
893}
894
895static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
896{
897 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
898 int ret = 0;
899
900
901
902
903
904 ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
905 if (ret == -EOPNOTSUPP) {
906 if (adev->umc.ras_funcs &&
907 adev->umc.ras_funcs->query_ras_error_count)
908 adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
909
910
911
912
913 if (adev->umc.ras_funcs &&
914 adev->umc.ras_funcs->query_ras_error_address)
915 adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
916 } else if (!ret) {
917 if (adev->umc.ras_funcs &&
918 adev->umc.ras_funcs->ecc_info_query_ras_error_count)
919 adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
920
921 if (adev->umc.ras_funcs &&
922 adev->umc.ras_funcs->ecc_info_query_ras_error_address)
923 adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
924 }
925}
926
927
928int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
929 struct ras_query_if *info)
930{
931 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
932 struct ras_err_data err_data = {0, 0, 0, NULL};
933 int i;
934
935 if (!obj)
936 return -EINVAL;
937
938 switch (info->head.block) {
939 case AMDGPU_RAS_BLOCK__UMC:
940 amdgpu_ras_get_ecc_info(adev, &err_data);
941 break;
942 case AMDGPU_RAS_BLOCK__SDMA:
943 if (adev->sdma.funcs->query_ras_error_count) {
944 for (i = 0; i < adev->sdma.num_instances; i++)
945 adev->sdma.funcs->query_ras_error_count(adev, i,
946 &err_data);
947 }
948 break;
949 case AMDGPU_RAS_BLOCK__GFX:
950 if (adev->gfx.ras_funcs &&
951 adev->gfx.ras_funcs->query_ras_error_count)
952 adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);
953
954 if (adev->gfx.ras_funcs &&
955 adev->gfx.ras_funcs->query_ras_error_status)
956 adev->gfx.ras_funcs->query_ras_error_status(adev);
957 break;
958 case AMDGPU_RAS_BLOCK__MMHUB:
959 if (adev->mmhub.ras_funcs &&
960 adev->mmhub.ras_funcs->query_ras_error_count)
961 adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);
962
963 if (adev->mmhub.ras_funcs &&
964 adev->mmhub.ras_funcs->query_ras_error_status)
965 adev->mmhub.ras_funcs->query_ras_error_status(adev);
966 break;
967 case AMDGPU_RAS_BLOCK__PCIE_BIF:
968 if (adev->nbio.ras_funcs &&
969 adev->nbio.ras_funcs->query_ras_error_count)
970 adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
971 break;
972 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
973 if (adev->gmc.xgmi.ras_funcs &&
974 adev->gmc.xgmi.ras_funcs->query_ras_error_count)
975 adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
976 break;
977 case AMDGPU_RAS_BLOCK__HDP:
978 if (adev->hdp.ras_funcs &&
979 adev->hdp.ras_funcs->query_ras_error_count)
980 adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
981 break;
982 case AMDGPU_RAS_BLOCK__MCA:
983 amdgpu_ras_mca_query_error_status(adev, &info->head, &err_data);
984 break;
985 default:
986 break;
987 }
988
989 obj->err_data.ue_count += err_data.ue_count;
990 obj->err_data.ce_count += err_data.ce_count;
991
992 info->ue_count = obj->err_data.ue_count;
993 info->ce_count = obj->err_data.ce_count;
994
995 if (err_data.ce_count) {
996 if (adev->smuio.funcs &&
997 adev->smuio.funcs->get_socket_id &&
998 adev->smuio.funcs->get_die_id) {
999 dev_info(adev->dev, "socket: %d, die: %d "
1000 "%ld correctable hardware errors "
1001 "detected in %s block, no user "
1002 "action is needed.\n",
1003 adev->smuio.funcs->get_socket_id(adev),
1004 adev->smuio.funcs->get_die_id(adev),
1005 obj->err_data.ce_count,
1006 get_ras_block_str(&info->head));
1007 } else {
1008 dev_info(adev->dev, "%ld correctable hardware errors "
1009 "detected in %s block, no user "
1010 "action is needed.\n",
1011 obj->err_data.ce_count,
1012 get_ras_block_str(&info->head));
1013 }
1014 }
1015 if (err_data.ue_count) {
1016 if (adev->smuio.funcs &&
1017 adev->smuio.funcs->get_socket_id &&
1018 adev->smuio.funcs->get_die_id) {
1019 dev_info(adev->dev, "socket: %d, die: %d "
1020 "%ld uncorrectable hardware errors "
1021 "detected in %s block\n",
1022 adev->smuio.funcs->get_socket_id(adev),
1023 adev->smuio.funcs->get_die_id(adev),
1024 obj->err_data.ue_count,
1025 get_ras_block_str(&info->head));
1026 } else {
1027 dev_info(adev->dev, "%ld uncorrectable hardware errors "
1028 "detected in %s block\n",
1029 obj->err_data.ue_count,
1030 get_ras_block_str(&info->head));
1031 }
1032 }
1033
1034 if (!amdgpu_persistent_edc_harvesting_supported(adev))
1035 amdgpu_ras_reset_error_status(adev, info->head.block);
1036
1037 return 0;
1038}
1039
1040int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
1041 enum amdgpu_ras_block block)
1042{
1043 if (!amdgpu_ras_is_supported(adev, block))
1044 return -EINVAL;
1045
1046 switch (block) {
1047 case AMDGPU_RAS_BLOCK__GFX:
1048 if (adev->gfx.ras_funcs &&
1049 adev->gfx.ras_funcs->reset_ras_error_count)
1050 adev->gfx.ras_funcs->reset_ras_error_count(adev);
1051
1052 if (adev->gfx.ras_funcs &&
1053 adev->gfx.ras_funcs->reset_ras_error_status)
1054 adev->gfx.ras_funcs->reset_ras_error_status(adev);
1055 break;
1056 case AMDGPU_RAS_BLOCK__MMHUB:
1057 if (adev->mmhub.ras_funcs &&
1058 adev->mmhub.ras_funcs->reset_ras_error_count)
1059 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
1060
1061 if (adev->mmhub.ras_funcs &&
1062 adev->mmhub.ras_funcs->reset_ras_error_status)
1063 adev->mmhub.ras_funcs->reset_ras_error_status(adev);
1064 break;
1065 case AMDGPU_RAS_BLOCK__SDMA:
1066 if (adev->sdma.funcs->reset_ras_error_count)
1067 adev->sdma.funcs->reset_ras_error_count(adev);
1068 break;
1069 case AMDGPU_RAS_BLOCK__HDP:
1070 if (adev->hdp.ras_funcs &&
1071 adev->hdp.ras_funcs->reset_ras_error_count)
1072 adev->hdp.ras_funcs->reset_ras_error_count(adev);
1073 break;
1074 default:
1075 break;
1076 }
1077
1078 return 0;
1079}
1080
1081
1082static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
1083 struct ta_ras_trigger_error_input *block_info)
1084{
1085 int ret;
1086
1087 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
1088 dev_warn(adev->dev, "Failed to disallow df cstate");
1089
1090 if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
1091 dev_warn(adev->dev, "Failed to disallow XGMI power down");
1092
1093 ret = psp_ras_trigger_error(&adev->psp, block_info);
1094
1095 if (amdgpu_ras_intr_triggered())
1096 return ret;
1097
1098 if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
1099 dev_warn(adev->dev, "Failed to allow XGMI power down");
1100
1101 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
1102 dev_warn(adev->dev, "Failed to allow df cstate");
1103
1104 return ret;
1105}
1106
1107
1108int amdgpu_ras_error_inject(struct amdgpu_device *adev,
1109 struct ras_inject_if *info)
1110{
1111 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1112 struct ta_ras_trigger_error_input block_info = {
1113 .block_id = amdgpu_ras_block_to_ta(info->head.block),
1114 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
1115 .sub_block_index = info->head.sub_block_index,
1116 .address = info->address,
1117 .value = info->value,
1118 };
1119 int ret = 0;
1120
1121 if (!obj)
1122 return -EINVAL;
1123
1124
1125 if (adev->gmc.xgmi.num_physical_nodes > 1) {
1126 block_info.address =
1127 amdgpu_xgmi_get_relative_phy_addr(adev,
1128 block_info.address);
1129 }
1130
1131 switch (info->head.block) {
1132 case AMDGPU_RAS_BLOCK__GFX:
1133 if (adev->gfx.ras_funcs &&
1134 adev->gfx.ras_funcs->ras_error_inject)
1135 ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
1136 else
1137 ret = -EINVAL;
1138 break;
1139 case AMDGPU_RAS_BLOCK__UMC:
1140 case AMDGPU_RAS_BLOCK__SDMA:
1141 case AMDGPU_RAS_BLOCK__MMHUB:
1142 case AMDGPU_RAS_BLOCK__PCIE_BIF:
1143 case AMDGPU_RAS_BLOCK__MCA:
1144 ret = psp_ras_trigger_error(&adev->psp, &block_info);
1145 break;
1146 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
1147 ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
1148 break;
1149 default:
1150 dev_info(adev->dev, "%s error injection is not supported yet\n",
1151 get_ras_block_str(&info->head));
1152 ret = -EINVAL;
1153 }
1154
1155 if (ret)
1156 dev_err(adev->dev, "ras inject %s failed %d\n",
1157 get_ras_block_str(&info->head), ret);
1158
1159 return ret;
1160}
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
1174 unsigned long *ce_count,
1175 unsigned long *ue_count)
1176{
1177 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1178 struct ras_manager *obj;
1179 unsigned long ce, ue;
1180
1181 if (!adev->ras_enabled || !con)
1182 return -EOPNOTSUPP;
1183
1184
1185
1186 if (!ce_count && !ue_count)
1187 return 0;
1188
1189 ce = 0;
1190 ue = 0;
1191 list_for_each_entry(obj, &con->head, node) {
1192 struct ras_query_if info = {
1193 .head = obj->head,
1194 };
1195 int res;
1196
1197 res = amdgpu_ras_query_error_status(adev, &info);
1198 if (res)
1199 return res;
1200
1201 ce += info.ce_count;
1202 ue += info.ue_count;
1203 }
1204
1205 if (ce_count)
1206 *ce_count = ce;
1207
1208 if (ue_count)
1209 *ue_count = ue;
1210
1211 return 0;
1212}
1213
1214
1215
1216
1217
1218static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1219 struct ras_badpage **bps, unsigned int *count);
1220
1221static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
1222{
1223 switch (flags) {
1224 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
1225 return "R";
1226 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
1227 return "P";
1228 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
1229 default:
1230 return "F";
1231 }
1232}
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
1265 struct kobject *kobj, struct bin_attribute *attr,
1266 char *buf, loff_t ppos, size_t count)
1267{
1268 struct amdgpu_ras *con =
1269 container_of(attr, struct amdgpu_ras, badpages_attr);
1270 struct amdgpu_device *adev = con->adev;
1271 const unsigned int element_size =
1272 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
1273 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
1274 unsigned int end = div64_ul(ppos + count - 1, element_size);
1275 ssize_t s = 0;
1276 struct ras_badpage *bps = NULL;
1277 unsigned int bps_count = 0;
1278
1279 memset(buf, 0, count);
1280
1281 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1282 return 0;
1283
1284 for (; start < end && start < bps_count; start++)
1285 s += scnprintf(&buf[s], element_size + 1,
1286 "0x%08x : 0x%08x : %1s\n",
1287 bps[start].bp,
1288 bps[start].size,
1289 amdgpu_ras_badpage_flags_str(bps[start].flags));
1290
1291 kfree(bps);
1292
1293 return s;
1294}
1295
1296static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1297 struct device_attribute *attr, char *buf)
1298{
1299 struct amdgpu_ras *con =
1300 container_of(attr, struct amdgpu_ras, features_attr);
1301
1302 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
1303}
1304
1305static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1306{
1307 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1308
1309 sysfs_remove_file_from_group(&adev->dev->kobj,
1310 &con->badpages_attr.attr,
1311 RAS_FS_NAME);
1312}
1313
1314static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1315{
1316 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1317 struct attribute *attrs[] = {
1318 &con->features_attr.attr,
1319 NULL
1320 };
1321 struct attribute_group group = {
1322 .name = RAS_FS_NAME,
1323 .attrs = attrs,
1324 };
1325
1326 sysfs_remove_group(&adev->dev->kobj, &group);
1327
1328 return 0;
1329}
1330
1331int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
1332 struct ras_fs_if *head)
1333{
1334 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1335
1336 if (!obj || obj->attr_inuse)
1337 return -EINVAL;
1338
1339 get_obj(obj);
1340
1341 memcpy(obj->fs_data.sysfs_name,
1342 head->sysfs_name,
1343 sizeof(obj->fs_data.sysfs_name));
1344
1345 obj->sysfs_attr = (struct device_attribute){
1346 .attr = {
1347 .name = obj->fs_data.sysfs_name,
1348 .mode = S_IRUGO,
1349 },
1350 .show = amdgpu_ras_sysfs_read,
1351 };
1352 sysfs_attr_init(&obj->sysfs_attr.attr);
1353
1354 if (sysfs_add_file_to_group(&adev->dev->kobj,
1355 &obj->sysfs_attr.attr,
1356 RAS_FS_NAME)) {
1357 put_obj(obj);
1358 return -EINVAL;
1359 }
1360
1361 obj->attr_inuse = 1;
1362
1363 return 0;
1364}
1365
1366int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1367 struct ras_common_if *head)
1368{
1369 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1370
1371 if (!obj || !obj->attr_inuse)
1372 return -EINVAL;
1373
1374 sysfs_remove_file_from_group(&adev->dev->kobj,
1375 &obj->sysfs_attr.attr,
1376 RAS_FS_NAME);
1377 obj->attr_inuse = 0;
1378 put_obj(obj);
1379
1380 return 0;
1381}
1382
1383static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1384{
1385 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1386 struct ras_manager *obj, *tmp;
1387
1388 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1389 amdgpu_ras_sysfs_remove(adev, &obj->head);
1390 }
1391
1392 if (amdgpu_bad_page_threshold != 0)
1393 amdgpu_ras_sysfs_remove_bad_page_node(adev);
1394
1395 amdgpu_ras_sysfs_remove_feature_node(adev);
1396
1397 return 0;
1398}
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1421{
1422 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1423 struct drm_minor *minor = adev_to_drm(adev)->primary;
1424 struct dentry *dir;
1425
1426 dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
1427 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
1428 &amdgpu_ras_debugfs_ctrl_ops);
1429 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
1430 &amdgpu_ras_debugfs_eeprom_ops);
1431 debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
1432 &con->bad_page_cnt_threshold);
1433 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
1434 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
1435 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
1436 &amdgpu_ras_debugfs_eeprom_size_ops);
1437 con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
1438 S_IRUGO, dir, adev,
1439 &amdgpu_ras_debugfs_eeprom_table_ops);
1440 amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);
1451
1452
1453
1454
1455
1456 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
1457 &con->disable_ras_err_cnt_harvest);
1458 return dir;
1459}
1460
1461static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1462 struct ras_fs_if *head,
1463 struct dentry *dir)
1464{
1465 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1466
1467 if (!obj || !dir)
1468 return;
1469
1470 get_obj(obj);
1471
1472 memcpy(obj->fs_data.debugfs_name,
1473 head->debugfs_name,
1474 sizeof(obj->fs_data.debugfs_name));
1475
1476 debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
1477 obj, &amdgpu_ras_debugfs_ops);
1478}
1479
1480void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1481{
1482 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1483 struct dentry *dir;
1484 struct ras_manager *obj;
1485 struct ras_fs_if fs_info;
1486
1487
1488
1489
1490
1491 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1492 return;
1493
1494 dir = amdgpu_ras_debugfs_create_ctrl_node(adev);
1495
1496 list_for_each_entry(obj, &con->head, node) {
1497 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1498 (obj->attr_inuse == 1)) {
1499 sprintf(fs_info.debugfs_name, "%s_err_inject",
1500 get_ras_block_str(&obj->head));
1501 fs_info.head = obj->head;
1502 amdgpu_ras_debugfs_create(adev, &fs_info, dir);
1503 }
1504 }
1505}
1506
1507
1508
1509
1510static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1511 amdgpu_ras_sysfs_badpages_read, NULL, 0);
1512static DEVICE_ATTR(features, S_IRUGO,
1513 amdgpu_ras_sysfs_features_read, NULL);
1514static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1515{
1516 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1517 struct attribute_group group = {
1518 .name = RAS_FS_NAME,
1519 };
1520 struct attribute *attrs[] = {
1521 &con->features_attr.attr,
1522 NULL
1523 };
1524 struct bin_attribute *bin_attrs[] = {
1525 NULL,
1526 NULL,
1527 };
1528 int r;
1529
1530
1531 con->features_attr = dev_attr_features;
1532 group.attrs = attrs;
1533 sysfs_attr_init(attrs[0]);
1534
1535 if (amdgpu_bad_page_threshold != 0) {
1536
1537 bin_attr_gpu_vram_bad_pages.private = NULL;
1538 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1539 bin_attrs[0] = &con->badpages_attr;
1540 group.bin_attrs = bin_attrs;
1541 sysfs_bin_attr_init(bin_attrs[0]);
1542 }
1543
1544 r = sysfs_create_group(&adev->dev->kobj, &group);
1545 if (r)
1546 dev_err(adev->dev, "Failed to create RAS sysfs group!");
1547
1548 return 0;
1549}
1550
1551static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1552{
1553 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1554 struct ras_manager *con_obj, *ip_obj, *tmp;
1555
1556 if (IS_ENABLED(CONFIG_DEBUG_FS)) {
1557 list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
1558 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
1559 if (ip_obj)
1560 put_obj(ip_obj);
1561 }
1562 }
1563
1564 amdgpu_ras_sysfs_remove_all(adev);
1565 return 0;
1566}
1567
1568
1569
1570static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1571{
1572 struct ras_ih_data *data = &obj->ih_data;
1573 struct amdgpu_iv_entry entry;
1574 int ret;
1575 struct ras_err_data err_data = {0, 0, 0, NULL};
1576
1577 while (data->rptr != data->wptr) {
1578 rmb();
1579 memcpy(&entry, &data->ring[data->rptr],
1580 data->element_size);
1581
1582 wmb();
1583 data->rptr = (data->aligned_element_size +
1584 data->rptr) % data->ring_size;
1585
1586 if (data->cb) {
1587 if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
1588 obj->head.block == AMDGPU_RAS_BLOCK__UMC)
1589 dev_info(obj->adev->dev,
1590 "Poison is created, no user action is needed.\n");
1591 else {
1592
1593
1594
1595 memset(&err_data, 0, sizeof(err_data));
1596 ret = data->cb(obj->adev, &err_data, &entry);
1597
1598
1599
1600
1601
1602 if (ret == AMDGPU_RAS_SUCCESS) {
1603
1604
1605
1606 obj->err_data.ue_count += err_data.ue_count;
1607 obj->err_data.ce_count += err_data.ce_count;
1608 }
1609 }
1610 }
1611 }
1612}
1613
1614static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1615{
1616 struct ras_ih_data *data =
1617 container_of(work, struct ras_ih_data, ih_work);
1618 struct ras_manager *obj =
1619 container_of(data, struct ras_manager, ih_data);
1620
1621 amdgpu_ras_interrupt_handler(obj);
1622}
1623
1624int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1625 struct ras_dispatch_if *info)
1626{
1627 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1628 struct ras_ih_data *data = &obj->ih_data;
1629
1630 if (!obj)
1631 return -EINVAL;
1632
1633 if (data->inuse == 0)
1634 return 0;
1635
1636
1637 memcpy(&data->ring[data->wptr], info->entry,
1638 data->element_size);
1639
1640 wmb();
1641 data->wptr = (data->aligned_element_size +
1642 data->wptr) % data->ring_size;
1643
1644 schedule_work(&data->ih_work);
1645
1646 return 0;
1647}
1648
1649int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1650 struct ras_ih_if *info)
1651{
1652 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1653 struct ras_ih_data *data;
1654
1655 if (!obj)
1656 return -EINVAL;
1657
1658 data = &obj->ih_data;
1659 if (data->inuse == 0)
1660 return 0;
1661
1662 cancel_work_sync(&data->ih_work);
1663
1664 kfree(data->ring);
1665 memset(data, 0, sizeof(*data));
1666 put_obj(obj);
1667
1668 return 0;
1669}
1670
1671int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1672 struct ras_ih_if *info)
1673{
1674 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1675 struct ras_ih_data *data;
1676
1677 if (!obj) {
1678
1679 obj = amdgpu_ras_create_obj(adev, &info->head);
1680 if (!obj)
1681 return -EINVAL;
1682 } else
1683 get_obj(obj);
1684
1685 data = &obj->ih_data;
1686
1687 *data = (struct ras_ih_data) {
1688 .inuse = 0,
1689 .cb = info->cb,
1690 .element_size = sizeof(struct amdgpu_iv_entry),
1691 .rptr = 0,
1692 .wptr = 0,
1693 };
1694
1695 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1696
1697 data->aligned_element_size = ALIGN(data->element_size, 8);
1698
1699 data->ring_size = 64 * data->aligned_element_size;
1700 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1701 if (!data->ring) {
1702 put_obj(obj);
1703 return -ENOMEM;
1704 }
1705
1706
1707 data->inuse = 1;
1708
1709 return 0;
1710}
1711
1712static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1713{
1714 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1715 struct ras_manager *obj, *tmp;
1716
1717 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1718 struct ras_ih_if info = {
1719 .head = obj->head,
1720 };
1721 amdgpu_ras_interrupt_remove_handler(adev, &info);
1722 }
1723
1724 return 0;
1725}
1726
1727
1728
1729static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1730{
1731 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1732 struct ras_manager *obj;
1733
1734 if (!adev->ras_enabled || !con)
1735 return;
1736
1737 list_for_each_entry(obj, &con->head, node) {
1738 struct ras_query_if info = {
1739 .head = obj->head,
1740 };
1741
1742
1743
1744
1745
1746
1747
1748 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1749 continue;
1750
1751
1752
1753
1754
1755
1756
1757 if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
1758 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
1759 continue;
1760
1761 amdgpu_ras_query_error_status(adev, &info);
1762 }
1763}
1764
1765
1766static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1767 struct ras_query_if *info)
1768{
1769
1770
1771
1772
1773 switch (info->head.block) {
1774 case AMDGPU_RAS_BLOCK__GFX:
1775 if (adev->gfx.ras_funcs &&
1776 adev->gfx.ras_funcs->query_ras_error_status)
1777 adev->gfx.ras_funcs->query_ras_error_status(adev);
1778 break;
1779 case AMDGPU_RAS_BLOCK__MMHUB:
1780 if (adev->mmhub.ras_funcs &&
1781 adev->mmhub.ras_funcs->query_ras_error_status)
1782 adev->mmhub.ras_funcs->query_ras_error_status(adev);
1783 break;
1784 default:
1785 break;
1786 }
1787}
1788
1789static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1790{
1791 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1792 struct ras_manager *obj;
1793
1794 if (!adev->ras_enabled || !con)
1795 return;
1796
1797 list_for_each_entry(obj, &con->head, node) {
1798 struct ras_query_if info = {
1799 .head = obj->head,
1800 };
1801
1802 amdgpu_ras_error_status_query(adev, &info);
1803 }
1804}
1805
1806
1807
1808
1809
1810
1811static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1812 struct ras_badpage **bps, unsigned int *count)
1813{
1814 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1815 struct ras_err_handler_data *data;
1816 int i = 0;
1817 int ret = 0, status;
1818
1819 if (!con || !con->eh_data || !bps || !count)
1820 return -EINVAL;
1821
1822 mutex_lock(&con->recovery_lock);
1823 data = con->eh_data;
1824 if (!data || data->count == 0) {
1825 *bps = NULL;
1826 ret = -EINVAL;
1827 goto out;
1828 }
1829
1830 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1831 if (!*bps) {
1832 ret = -ENOMEM;
1833 goto out;
1834 }
1835
1836 for (; i < data->count; i++) {
1837 (*bps)[i] = (struct ras_badpage){
1838 .bp = data->bps[i].retired_page,
1839 .size = AMDGPU_GPU_PAGE_SIZE,
1840 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1841 };
1842 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
1843 data->bps[i].retired_page);
1844 if (status == -EBUSY)
1845 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1846 else if (status == -ENOENT)
1847 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1848 }
1849
1850 *count = data->count;
1851out:
1852 mutex_unlock(&con->recovery_lock);
1853 return ret;
1854}
1855
1856static void amdgpu_ras_do_recovery(struct work_struct *work)
1857{
1858 struct amdgpu_ras *ras =
1859 container_of(work, struct amdgpu_ras, recovery_work);
1860 struct amdgpu_device *remote_adev = NULL;
1861 struct amdgpu_device *adev = ras->adev;
1862 struct list_head device_list, *device_list_handle = NULL;
1863
1864 if (!ras->disable_ras_err_cnt_harvest) {
1865 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
1866
1867
1868 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
1869 device_list_handle = &hive->device_list;
1870 } else {
1871 INIT_LIST_HEAD(&device_list);
1872 list_add_tail(&adev->gmc.xgmi.head, &device_list);
1873 device_list_handle = &device_list;
1874 }
1875
1876 list_for_each_entry(remote_adev,
1877 device_list_handle, gmc.xgmi.head) {
1878 amdgpu_ras_query_err_status(remote_adev);
1879 amdgpu_ras_log_on_err_counter(remote_adev);
1880 }
1881
1882 amdgpu_put_xgmi_hive(hive);
1883 }
1884
1885 if (amdgpu_device_should_recover_gpu(ras->adev))
1886 amdgpu_device_gpu_recover(ras->adev, NULL);
1887 atomic_set(&ras->in_recovery, 0);
1888}
1889
1890
1891static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1892 struct ras_err_handler_data *data, int pages)
1893{
1894 unsigned int old_space = data->count + data->space_left;
1895 unsigned int new_space = old_space + pages;
1896 unsigned int align_space = ALIGN(new_space, 512);
1897 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1898
1899 if (!bps) {
1900 kfree(bps);
1901 return -ENOMEM;
1902 }
1903
1904 if (data->bps) {
1905 memcpy(bps, data->bps,
1906 data->count * sizeof(*data->bps));
1907 kfree(data->bps);
1908 }
1909
1910 data->bps = bps;
1911 data->space_left += align_space - old_space;
1912 return 0;
1913}
1914
1915
1916int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1917 struct eeprom_table_record *bps, int pages)
1918{
1919 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1920 struct ras_err_handler_data *data;
1921 int ret = 0;
1922 uint32_t i;
1923
1924 if (!con || !con->eh_data || !bps || pages <= 0)
1925 return 0;
1926
1927 mutex_lock(&con->recovery_lock);
1928 data = con->eh_data;
1929 if (!data)
1930 goto out;
1931
1932 for (i = 0; i < pages; i++) {
1933 if (amdgpu_ras_check_bad_page_unlock(con,
1934 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
1935 continue;
1936
1937 if (!data->space_left &&
1938 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
1939 ret = -ENOMEM;
1940 goto out;
1941 }
1942
1943 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
1944 bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
1945 AMDGPU_GPU_PAGE_SIZE);
1946
1947 memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
1948 data->count++;
1949 data->space_left--;
1950 }
1951out:
1952 mutex_unlock(&con->recovery_lock);
1953
1954 return ret;
1955}
1956
1957
1958
1959
1960
1961int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1962{
1963 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1964 struct ras_err_handler_data *data;
1965 struct amdgpu_ras_eeprom_control *control;
1966 int save_count;
1967
1968 if (!con || !con->eh_data)
1969 return 0;
1970
1971 mutex_lock(&con->recovery_lock);
1972 control = &con->eeprom_control;
1973 data = con->eh_data;
1974 save_count = data->count - control->ras_num_recs;
1975 mutex_unlock(&con->recovery_lock);
1976
1977 if (save_count > 0) {
1978 if (amdgpu_ras_eeprom_append(control,
1979 &data->bps[control->ras_num_recs],
1980 save_count)) {
1981 dev_err(adev->dev, "Failed to save EEPROM table data!");
1982 return -EIO;
1983 }
1984
1985 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
1986 }
1987
1988 return 0;
1989}
1990
1991
1992
1993
1994
1995static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1996{
1997 struct amdgpu_ras_eeprom_control *control =
1998 &adev->psp.ras_context.ras->eeprom_control;
1999 struct eeprom_table_record *bps;
2000 int ret;
2001
2002
2003 if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
2004 return 0;
2005
2006 bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
2007 if (!bps)
2008 return -ENOMEM;
2009
2010 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
2011 if (ret)
2012 dev_err(adev->dev, "Failed to load EEPROM table records!");
2013 else
2014 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);
2015
2016 kfree(bps);
2017 return ret;
2018}
2019
2020static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
2021 uint64_t addr)
2022{
2023 struct ras_err_handler_data *data = con->eh_data;
2024 int i;
2025
2026 addr >>= AMDGPU_GPU_PAGE_SHIFT;
2027 for (i = 0; i < data->count; i++)
2028 if (addr == data->bps[i].retired_page)
2029 return true;
2030
2031 return false;
2032}
2033
2034
2035
2036
2037
2038
2039static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
2040 uint64_t addr)
2041{
2042 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2043 bool ret = false;
2044
2045 if (!con || !con->eh_data)
2046 return ret;
2047
2048 mutex_lock(&con->recovery_lock);
2049 ret = amdgpu_ras_check_bad_page_unlock(con, addr);
2050 mutex_unlock(&con->recovery_lock);
2051 return ret;
2052}
2053
2054static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
2055 uint32_t max_count)
2056{
2057 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078 if (amdgpu_bad_page_threshold < 0) {
2079 u64 val = adev->gmc.mc_vram_size;
2080
2081 do_div(val, RAS_BAD_PAGE_COVER);
2082 con->bad_page_cnt_threshold = min(lower_32_bits(val),
2083 max_count);
2084 } else {
2085 con->bad_page_cnt_threshold = min_t(int, max_count,
2086 amdgpu_bad_page_threshold);
2087 }
2088}
2089
2090int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2091{
2092 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2093 struct ras_err_handler_data **data;
2094 u32 max_eeprom_records_count = 0;
2095 bool exc_err_limit = false;
2096 int ret;
2097
2098 if (!con)
2099 return 0;
2100
2101
2102
2103
2104
2105
2106 con->adev = adev;
2107
2108 if (!adev->ras_enabled)
2109 return 0;
2110
2111 data = &con->eh_data;
2112 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
2113 if (!*data) {
2114 ret = -ENOMEM;
2115 goto out;
2116 }
2117
2118 mutex_init(&con->recovery_lock);
2119 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
2120 atomic_set(&con->in_recovery, 0);
2121
2122 max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
2123 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
2124
2125
2126
2127
2128
2129 if (adev->gmc.xgmi.pending_reset)
2130 return 0;
2131 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
2132
2133
2134
2135
2136 if (exc_err_limit || ret)
2137 goto free;
2138
2139 if (con->eeprom_control.ras_num_recs) {
2140 ret = amdgpu_ras_load_bad_pages(adev);
2141 if (ret)
2142 goto free;
2143
2144 if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
2145 adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
2146 }
2147
2148#ifdef CONFIG_X86_MCE_AMD
2149 if ((adev->asic_type == CHIP_ALDEBARAN) &&
2150 (adev->gmc.xgmi.connected_to_cpu))
2151 amdgpu_register_bad_pages_mca_notifier(adev);
2152#endif
2153 return 0;
2154
2155free:
2156 kfree((*data)->bps);
2157 kfree(*data);
2158 con->eh_data = NULL;
2159out:
2160 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);
2161
2162
2163
2164
2165
2166 if (!exc_err_limit)
2167 ret = 0;
2168 else
2169 ret = -EINVAL;
2170
2171 return ret;
2172}
2173
2174static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2175{
2176 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2177 struct ras_err_handler_data *data = con->eh_data;
2178
2179
2180 if (!data)
2181 return 0;
2182
2183 cancel_work_sync(&con->recovery_work);
2184
2185 mutex_lock(&con->recovery_lock);
2186 con->eh_data = NULL;
2187 kfree(data->bps);
2188 kfree(data);
2189 mutex_unlock(&con->recovery_lock);
2190
2191 return 0;
2192}
2193
2194
2195static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
2196{
2197 return adev->asic_type == CHIP_VEGA10 ||
2198 adev->asic_type == CHIP_VEGA20 ||
2199 adev->asic_type == CHIP_ARCTURUS ||
2200 adev->asic_type == CHIP_ALDEBARAN ||
2201 adev->asic_type == CHIP_SIENNA_CICHLID;
2202}
2203
2204
2205
2206
2207
2208
2209static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
2210{
2211 struct atom_context *ctx = adev->mode_info.atom_context;
2212
2213 if (!ctx)
2214 return;
2215
2216 if (strnstr(ctx->vbios_version, "D16406",
2217 sizeof(ctx->vbios_version)) ||
2218 strnstr(ctx->vbios_version, "D36002",
2219 sizeof(ctx->vbios_version)))
2220 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
2221}
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
2233{
2234 adev->ras_hw_enabled = adev->ras_enabled = 0;
2235
2236 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
2237 !amdgpu_ras_asic_supported(adev))
2238 return;
2239
2240 if (!adev->gmc.xgmi.connected_to_cpu) {
2241 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
2242 dev_info(adev->dev, "MEM ECC is active.\n");
2243 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
2244 1 << AMDGPU_RAS_BLOCK__DF);
2245 } else {
2246 dev_info(adev->dev, "MEM ECC is not presented.\n");
2247 }
2248
2249 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
2250 dev_info(adev->dev, "SRAM ECC is active.\n");
2251 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
2252 1 << AMDGPU_RAS_BLOCK__DF);
2253 } else {
2254 dev_info(adev->dev, "SRAM ECC is not presented.\n");
2255 }
2256 } else {
2257
2258
2259 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
2260 1 << AMDGPU_RAS_BLOCK__SDMA |
2261 1 << AMDGPU_RAS_BLOCK__MMHUB);
2262 }
2263
2264 amdgpu_ras_get_quirks(adev);
2265
2266
2267 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;
2268
2269 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
2270 adev->ras_hw_enabled & amdgpu_ras_mask;
2271}
2272
2273static void amdgpu_ras_counte_dw(struct work_struct *work)
2274{
2275 struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
2276 ras_counte_delay_work.work);
2277 struct amdgpu_device *adev = con->adev;
2278 struct drm_device *dev = adev_to_drm(adev);
2279 unsigned long ce_count, ue_count;
2280 int res;
2281
2282 res = pm_runtime_get_sync(dev->dev);
2283 if (res < 0)
2284 goto Out;
2285
2286
2287
2288 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
2289 atomic_set(&con->ras_ce_count, ce_count);
2290 atomic_set(&con->ras_ue_count, ue_count);
2291 }
2292
2293 pm_runtime_mark_last_busy(dev->dev);
2294Out:
2295 pm_runtime_put_autosuspend(dev->dev);
2296}
2297
2298int amdgpu_ras_init(struct amdgpu_device *adev)
2299{
2300 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2301 int r;
2302 bool df_poison, umc_poison;
2303
2304 if (con)
2305 return 0;
2306
2307 con = kmalloc(sizeof(struct amdgpu_ras) +
2308 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
2309 sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2310 GFP_KERNEL|__GFP_ZERO);
2311 if (!con)
2312 return -ENOMEM;
2313
2314 con->adev = adev;
2315 INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
2316 atomic_set(&con->ras_ce_count, 0);
2317 atomic_set(&con->ras_ue_count, 0);
2318
2319 con->objs = (struct ras_manager *)(con + 1);
2320
2321 amdgpu_ras_set_context(adev, con);
2322
2323 amdgpu_ras_check_supported(adev);
2324
2325 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
2326
2327
2328
2329 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
2330 con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
2331
2332 return 0;
2333 }
2334
2335 r = 0;
2336 goto release_con;
2337 }
2338
2339 con->features = 0;
2340 INIT_LIST_HEAD(&con->head);
2341
2342 con->flags = RAS_DEFAULT_FLAGS;
2343
2344
2345
2346
2347 switch (adev->asic_type) {
2348 case CHIP_VEGA20:
2349 case CHIP_ARCTURUS:
2350 case CHIP_ALDEBARAN:
2351 if (!adev->gmc.xgmi.connected_to_cpu)
2352 adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
2353 break;
2354 default:
2355
2356 break;
2357 }
2358
2359 if (adev->nbio.ras_funcs &&
2360 adev->nbio.ras_funcs->init_ras_controller_interrupt) {
2361 r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
2362 if (r)
2363 goto release_con;
2364 }
2365
2366 if (adev->nbio.ras_funcs &&
2367 adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
2368 r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
2369 if (r)
2370 goto release_con;
2371 }
2372
2373
2374 if (adev->gmc.xgmi.connected_to_cpu) {
2375
2376 con->poison_supported = true;
2377 }
2378 else if (adev->df.funcs &&
2379 adev->df.funcs->query_ras_poison_mode &&
2380 adev->umc.ras_funcs &&
2381 adev->umc.ras_funcs->query_ras_poison_mode) {
2382 df_poison =
2383 adev->df.funcs->query_ras_poison_mode(adev);
2384 umc_poison =
2385 adev->umc.ras_funcs->query_ras_poison_mode(adev);
2386
2387 if (df_poison && umc_poison)
2388 con->poison_supported = true;
2389 else if (df_poison != umc_poison)
2390 dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n",
2391 df_poison, umc_poison);
2392 }
2393
2394 if (amdgpu_ras_fs_init(adev)) {
2395 r = -EINVAL;
2396 goto release_con;
2397 }
2398
2399 dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2400 "hardware ability[%x] ras_mask[%x]\n",
2401 adev->ras_hw_enabled, adev->ras_enabled);
2402
2403 return 0;
2404release_con:
2405 amdgpu_ras_set_context(adev, NULL);
2406 kfree(con);
2407
2408 return r;
2409}
2410
int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
	if (adev->gmc.xgmi.connected_to_cpu)
		return 1;
	return 0;
}

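/* Query and then reset the error status of a block once during late init so
 * that errors carried over from a previous boot do not linger in the
 * counters.
 */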
static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
					struct ras_common_if *ras_block)
{
	struct ras_query_if info = {
		.head = *ras_block,
	};

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		return 0;

	if (amdgpu_ras_query_error_status(adev, &info) != 0)
		DRM_WARN("RAS init harvest failure");

	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
		DRM_WARN("RAS init harvest reset failure");

	return 0;
}

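/* Report the poison support flag cached by amdgpu_ras_init(). */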
bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return false;

	return con->poison_supported;
}

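/* Helper to handle the common RAS work in the IP-specific late init phase:
 * enable the feature, harvest persistent EDC status, and create the
 * interrupt handler and sysfs node on a fresh boot.
 */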
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned long ue_count, ce_count;
	int r;

	/* disable the RAS feature for this IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (adev->in_suspend || amdgpu_in_reset(adev)) {
			/* in the resume phase, if enabling ras fails, clean
			 * up all ras fs nodes and disable ras
			 */
			goto cleanup;
		} else
			return r;
	}

	/* check for errors caused by the previous boot, which the ASIC saved
	 * into persistent storage (EDC harvesting)
	 */
	amdgpu_persistent_edc_harvesting(adev, ras_block);

	/* in the resume phase, there is no need to create ras fs nodes */
	if (adev->in_suspend || amdgpu_in_reset(adev))
		return 0;

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	/* cache the error counts queried at init time */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

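/* Helper to remove the ras sysfs node and interrupt handler of a block. */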
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
}

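/* Do some init work after IP late init as a dependence.
 * It runs in the resume/gpu-reset/boot-up cases.
 */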
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* nothing to resume; drop any leftover context, e.g. the
		 * GFX-only one kept for VEGA20 boards with RAS disabled
		 */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* RAS was brought up by the vbios/PSP at boot; mirror that by
		 * enabling all features in the driver's bookkeeping.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* Some blocks may be masked off by boot parameters or not
		 * implemented at all, so disable those again on their behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should not be any reference left */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}

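/* Disable all RAS features when entering suspend. */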
void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* make sure all ras objects are disabled */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

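/* Do some fini work before IP fini as a dependence. */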
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	/* ras needs to be disabled on all IPs before IP hw/sw fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

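/* Tear down the RAS fs nodes and interrupt handlers and free the context
 * allocated in amdgpu_ras_init().
 */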
int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

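/* Global handler for a fatal (uncorrectable) RAS interrupt; it triggers a
 * full GPU reset the first time it fires.
 */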
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	amdgpu_ras_check_supported(adev);
	if (!adev->ras_hw_enabled)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		dev_info(adev->dev, "uncorrectable hardware error "
			"(ERREVENT_ATHUB_INTERRUPT) detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}

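/* Request an emergency restart when a fatal RAS interrupt fired on VEGA20
 * with SMU firmware <= 0x283400 and BACO reset is not available.
 */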
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{
	if (adev->asic_type == CHIP_VEGA20 &&
	    adev->pm.fw_version <= 0x283400) {
		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
				amdgpu_ras_intr_triggered();
	}

	return false;
}

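/* Drop a RAS context that only carries the GFX feature bit for boards where
 * RAS is otherwise disabled (see amdgpu_ras_init()).
 */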
void amdgpu_release_ras_context(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
	}
}

#ifdef CONFIG_X86_MCE_AMD
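/* Map an XGMI physical node id reported in an MCE back to the amdgpu device
 * it belongs to.
 */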
static struct amdgpu_device *find_adev(uint32_t node_id)
{
	int i;
	struct amdgpu_device *adev = NULL;

	for (i = 0; i < mce_adev_list.num_gpu; i++) {
		adev = mce_adev_list.devs[i];

		if (adev && adev->gmc.xgmi.connected_to_cpu &&
		    adev->gmc.xgmi.physical_node_id == node_id)
			break;
		adev = NULL;
	}

	return adev;
}

/* field extraction helpers for the MCA_IPID register of GPU UMC banks */
#define GET_MCA_IPID_GPUID(m)	(((m) >> 44) & 0xF)
#define GET_UMC_INST(m)		(((m) >> 21) & 0x7)
#define GET_CHAN_INDEX(m)	((((m) >> 12) & 0x3) | (((m) >> 18) & 0x4))
/* the gpu id carried in MCA_IPID is offset by GPU_ID_OFFSET */
#define GPU_ID_OFFSET		8

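/* MCE decode-chain notifier: on an uncorrectable DRAM ECC error reported for
 * a GPU UMC bank, translate the MCA address into a retired page and add it
 * to the bad page list saved in the EEPROM.
 */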
static int amdgpu_bad_page_notifier(struct notifier_block *nb,
				    unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	struct amdgpu_device *adev = NULL;
	uint32_t gpu_id = 0;
	uint32_t umc_inst = 0;
	uint32_t ch_inst, channel_index = 0;
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct eeprom_table_record err_rec;
	uint64_t retired_page;

	/* Only process the error if it was generated in a GPU UMC bank
	 * (bank type SMCA_UMC_V2) with a DramECC extended error code (0),
	 * otherwise bail out.
	 */
	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
	    (XEC(m->status, 0x3f) == 0x0)))
		return NOTIFY_DONE;

	/* correctable errors do not lead to page retirement */
	if (mce_is_correctable(m))
		return NOTIFY_OK;

	/* recover the gpu id from the MCA_IPID field */
	gpu_id = GET_MCA_IPID_GPUID(m->ipid) - GPU_ID_OFFSET;

	adev = find_adev(gpu_id);
	if (!adev) {
		DRM_WARN("%s: Unable to find adev for gpu_id: %d\n", __func__,
			 gpu_id);
		return NOTIFY_DONE;
	}

	/* for an uncorrectable memory error, work out the UMC instance and
	 * channel index from MCA_IPID
	 */
	umc_inst = GET_UMC_INST(m->ipid);
	ch_inst = GET_CHAN_INDEX(m->ipid);

	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
		 umc_inst, ch_inst);

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));

	/* translate the UMC channel address into a physical address */
	channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
					  + ch_inst];

	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
			ADDR_OF_256B_BLOCK(channel_index) |
			OFFSET_IN_256B_BLOCK(m->addr);

	err_rec.address = m->addr;
	err_rec.retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec.ts = (uint64_t)ktime_get_real_seconds();
	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
	err_rec.cu = 0;
	err_rec.mem_channel = channel_index;
	err_rec.mcumc_id = umc_inst;

	err_data.err_addr = &err_rec;
	err_data.err_addr_cnt = 1;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev);
	}

	return NOTIFY_OK;
}

static struct notifier_block amdgpu_bad_page_nb = {
	.notifier_call = amdgpu_bad_page_notifier,
	.priority = MCE_PRIO_UC,
};

static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev)
{
	/* Track the device in mce_adev_list so that the notifier can map the
	 * node id reported in an MCE back to the right amdgpu device.
	 */
	mce_adev_list.devs[mce_adev_list.num_gpu++] = adev;

	/* Register the x86 MCE decode-chain notifier only once, no matter
	 * how many devices are probed.
	 */
	if (!notifier_registered) {
		mce_register_decode_chain(&amdgpu_bad_page_nb);
		notifier_registered = true;
	}
}
#endif
