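/*
 * amdgpu_ras.c -- core RAS (Reliability, Availability, Serviceability)
 * support for amdgpu: feature enable/disable, error query and injection,
 * bad-page retirement, and the sysfs/debugfs interfaces around them.
 */
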
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/reboot.h>
#include <linux/syscalls.h>
#include <linux/pm_runtime.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"

static const char *RAS_FS_NAME = "ras";

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])

#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject addresses are limited to 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* the default bad-page threshold assumes roughly one bad page per
 * 100 MB of VRAM
 */
#define RAS_BAD_PAGE_COVER		(100 * 1024 * 1024ULL)

enum amdgpu_ras_retire_page_reservation {
	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
	AMDGPU_RAS_RETIRE_PAGE_PENDING,
	AMDGPU_RAS_RETIRE_PAGE_FAULT,
};

atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr);
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr);

void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
{
	if (adev && amdgpu_ras_get_context(adev))
		amdgpu_ras_get_context(adev)->error_query_ready = ready;
}

static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
{
	if (adev && amdgpu_ras_get_context(adev))
		return amdgpu_ras_get_context(adev)->error_query_ready;

	return false;
}

static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
{
	struct ras_err_data err_data = {0, 0, 0, NULL};
	struct eeprom_table_record err_rec;

	if ((address >= adev->gmc.mc_vram_size) ||
	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
		dev_warn(adev->dev,
			 "RAS WARN: input address 0x%llx is invalid.\n",
			 address);
		return -EINVAL;
	}

	if (amdgpu_ras_check_bad_page(adev, address)) {
		dev_warn(adev->dev,
			 "RAS WARN: 0x%llx has already been marked as bad page!\n",
			 address);
		return 0;
	}

	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));

	err_rec.address = address;
	err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
	err_rec.ts = (uint64_t)ktime_get_real_seconds();
	err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;

	err_data.err_addr = &err_rec;
	err_data.err_addr_cnt = 1;

	if (amdgpu_bad_page_threshold != 0) {
		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
					 err_data.err_addr_cnt);
		amdgpu_ras_save_bad_pages(adev);
	}

	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
	dev_warn(adev->dev, "Clear EEPROM:\n");
	dev_warn(adev->dev, "	echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");

	return 0;
}

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
					size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	uint32_t sub_block;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (strstr(str, "retire_page") != NULL)
		op = 3;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but the command is not matched */
		return -EINVAL;

	if (op != -1) {
		if (op == 3) {
			if (sscanf(str, "%*s 0x%llx", &address) != 1 &&
			    sscanf(str, "%*s %llu", &address) != 1)
				return -EINVAL;

			data->op = op;
			data->inject.address = address;

			return 0;
		}

		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		/* only ue and ce errors are supported */
		if (!memcmp("ue", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
		else if (!memcmp("ce", err, 2))
			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		else
			return -EINVAL;

		data->op = op;

		if (op == 2) {
			if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
				   &sub_block, &address, &value) != 3 &&
			    sscanf(str, "%*s %*s %*s %u %llu %llu",
				   &sub_block, &address, &value) != 3)
				return -EINVAL;
			data->head.sub_block_index = sub_block;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
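
/*
 * The ras_ctrl debugfs node accepts the following ASCII commands:
 *
 *	echo "disable <block>" > ras_ctrl
 *	echo "enable  <block> <error>" > ras_ctrl
 *	echo "inject  <block> <error> <sub-block> <address> <value>" > ras_ctrl
 *	echo "retire_page <address>" > ras_ctrl
 *
 * where <block> is one of the names in ras_block_string (umc, gfx, ...),
 * <error> is "ue" or "ce", and <address>/<value> may be given in decimal
 * or as 0x-prefixed hex. A binary struct ras_debug_if may be written
 * instead of an ASCII command.
 */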
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
		const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	if (!amdgpu_ras_get_error_query_ready(adev)) {
		dev_warn(adev->dev,
			 "RAS WARN: error injection currently inaccessible\n");
		return size;
	}

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return ret;

	if (data.op == 3) {
		ret = amdgpu_reserve_page_direct(adev, data.inject.address);
		if (!ret)
			return size;
		else
			return ret;
	}

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		if ((data.inject.address >= adev->gmc.mc_vram_size) ||
		    (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
			dev_warn(adev->dev,
				 "RAS WARN: input address 0x%llx is invalid.",
				 data.inject.address);
			ret = -EINVAL;
			break;
		}

		/* refuse to inject into a UMC page that was already retired */
		if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
		    amdgpu_ras_check_bad_page(adev, data.inject.address)) {
			dev_warn(adev->dev,
				 "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
				 data.inject.address);
			break;
		}

		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}
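
/*
 * Writing to the ras_eeprom_reset debugfs node wipes the bad-page table
 * stored in the RAS EEPROM and restores the default RAS flags.
 */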
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
		const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev =
		(struct amdgpu_device *)file_inode(f)->i_private;
	int ret;

	ret = amdgpu_ras_eeprom_reset_table(
		&(amdgpu_ras_get_context(adev)->eeprom_control));

	if (!ret) {
		/* the table was wiped; fall back to the default flags */
		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
		return size;
	} else {
		return ret;
	}
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_eeprom_write,
	.llseek = default_llseek
};
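
/*
 * Each enabled RAS block exposes a sysfs attribute under the "ras" group
 * that reports the block's cached "ue" (uncorrectable) and "ce"
 * (correctable) error counts; amdgpu_ras_sysfs_read() is its show() hook.
 */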
static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (!amdgpu_ras_get_error_query_ready(obj->adev))
		return sysfs_emit(buf, "Query currently inaccessible\n");

	if (amdgpu_ras_query_error_status(obj->adev, &info))
		return -EINVAL;

	/* Aldebaran needs an explicit reset of the counters after a query */
	if (obj->adev->asic_type == CHIP_ALDEBARAN) {
		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
			DRM_WARN("Failed to reset error counter and error status");
	}

	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
			  "ce", info.ce_count);
}

/* ras_manager objects are reference counted with get_obj()/put_obj() */
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && (--obj->use == 0))
		list_del(&obj->node);
	if (obj && (obj->use < 0))
		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", ras_block_str(obj->head.block));
}

/* make one obj and realize it */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* an alive obj already exists for this block */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return the obj equal to head, or the first alive obj when head is NULL */
struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!adev->ras_enabled || !con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}

static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	return adev->ras_hw_enabled & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/* update the software state (ras_manager obj and con->features) for a
 * ras block, without issuing any command to the ras TA
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* if the hardware does not support ras for this block, do not
	 * create the obj; there is nothing to enable or disable
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	/* nothing to do if we are already in the requested state */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* the obj was created elsewhere; take a reference */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input *info;
	int ret;

	if (!con)
		return -EINVAL;

	info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	if (!enable) {
		info->disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info->enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* skip the TA command if we are already in the target state */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
		ret = 0;
		goto out;
	}

	if (!amdgpu_ras_intr_triggered()) {
		ret = psp_ras_enable_features(&adev->psp, info, enable);
		if (ret) {
			dev_err(adev->dev, "ras %s %s failed %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
			goto out;
		}
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);
	ret = 0;
out:
	kfree(info);
	return ret;
}

/* Only used in the device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state: if the current state
			 * equals the target state the cmd does nothing, but
			 * the TA may ask the driver to reset and repost with
			 * error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* an old ras TA can fail to enable ras; fall back to
			 * updating only the software state
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					dev_info(adev->dev,
						"RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object, then issue a ras TA disable cmd */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			/* a gfx block ras disable cmd must be sent to the ras TA */
			if (head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features |= BIT(head->block);

			ret = amdgpu_ras_feature_enable(adev, head, 0);

			/* clear the gfx block ras features flag */
			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX)
				con->features &= ~BIT(head->block);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* when bypass is set, bypass the psp and only clean up the
		 * software state without issuing a TA disable cmd
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		if (bypass) {
			/* bypass the psp: the vbios has already enabled ras
			 * for us, so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}

int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data = {0, 0, 0, NULL};
	int i;

	if (!obj)
		return -EINVAL;

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__UMC:
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->query_ras_error_count)
			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
		/* also query the error addresses so that pages pending
		 * retirement are picked up
		 */
		if (adev->umc.ras_funcs &&
		    adev->umc.ras_funcs->query_ras_error_address)
			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		if (adev->sdma.funcs->query_ras_error_count) {
			for (i = 0; i < adev->sdma.num_instances; i++)
				adev->sdma.funcs->query_ras_error_count(adev, i,
									&err_data);
		}
		break;
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->query_ras_error_count)
			adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data);

		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->query_ras_error_status)
			adev->gfx.ras_funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->query_ras_error_count)
			adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data);

		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->query_ras_error_status)
			adev->mmhub.ras_funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		if (adev->nbio.ras_funcs &&
		    adev->nbio.ras_funcs->query_ras_error_count)
			adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
		if (adev->gmc.xgmi.ras_funcs &&
		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)
			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	case AMDGPU_RAS_BLOCK__HDP:
		if (adev->hdp.ras_funcs &&
		    adev->hdp.ras_funcs->query_ras_error_count)
			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data);
		break;
	default:
		break;
	}

	obj->err_data.ue_count += err_data.ue_count;
	obj->err_data.ce_count += err_data.ce_count;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	if (err_data.ce_count) {
		if (adev->smuio.funcs &&
		    adev->smuio.funcs->get_socket_id &&
		    adev->smuio.funcs->get_die_id) {
			dev_info(adev->dev, "socket: %d, die: %d "
					"%ld correctable hardware errors "
					"detected in %s block, no user "
					"action is needed.\n",
					adev->smuio.funcs->get_socket_id(adev),
					adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ce_count,
					ras_block_str(info->head.block));
		} else {
			dev_info(adev->dev, "%ld correctable hardware errors "
					"detected in %s block, no user "
					"action is needed.\n",
					obj->err_data.ce_count,
					ras_block_str(info->head.block));
		}
	}
	if (err_data.ue_count) {
		if (adev->smuio.funcs &&
		    adev->smuio.funcs->get_socket_id &&
		    adev->smuio.funcs->get_die_id) {
			dev_info(adev->dev, "socket: %d, die: %d "
					"%ld uncorrectable hardware errors "
					"detected in %s block\n",
					adev->smuio.funcs->get_socket_id(adev),
					adev->smuio.funcs->get_die_id(adev),
					obj->err_data.ue_count,
					ras_block_str(info->head.block));
		} else {
			dev_info(adev->dev, "%ld uncorrectable hardware errors "
					"detected in %s block\n",
					obj->err_data.ue_count,
					ras_block_str(info->head.block));
		}
	}

	return 0;
}

int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
		enum amdgpu_ras_block block)
{
	if (!amdgpu_ras_is_supported(adev, block))
		return -EINVAL;

	switch (block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->reset_ras_error_count)
			adev->gfx.ras_funcs->reset_ras_error_count(adev);

		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->reset_ras_error_status)
			adev->gfx.ras_funcs->reset_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->reset_ras_error_count)
			adev->mmhub.ras_funcs->reset_ras_error_count(adev);

		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->reset_ras_error_status)
			adev->mmhub.ras_funcs->reset_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__SDMA:
		if (adev->sdma.funcs->reset_ras_error_count)
			adev->sdma.funcs->reset_ras_error_count(adev);
		break;
	case AMDGPU_RAS_BLOCK__HDP:
		if (adev->hdp.ras_funcs &&
		    adev->hdp.ras_funcs->reset_ras_error_count)
			adev->hdp.ras_funcs->reset_ras_error_count(adev);
		break;
	default:
		break;
	}

	return 0;
}

/* Trigger an XGMI/WAFL error; DF C-states and XGMI power-down must be
 * disallowed while the error is injected
 */
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
		struct ta_ras_trigger_error_input *block_info)
{
	int ret;

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
		dev_warn(adev->dev, "Failed to disallow df cstate");

	if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
		dev_warn(adev->dev, "Failed to disallow XGMI power down");

	ret = psp_ras_trigger_error(&adev->psp, block_info);

	if (amdgpu_ras_intr_triggered())
		return ret;

	if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
		dev_warn(adev->dev, "Failed to allow XGMI power down");

	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "Failed to allow df cstate");

	return ret;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	/* Calculate the XGMI relative offset */
	if (adev->gmc.xgmi.num_physical_nodes > 1) {
		block_info.address =
			amdgpu_xgmi_get_relative_phy_addr(adev,
							  block_info.address);
	}

	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->ras_error_inject)
			ret = adev->gfx.ras_funcs->ras_error_inject(adev, info);
		else
			ret = -EINVAL;
		break;
	case AMDGPU_RAS_BLOCK__UMC:
	case AMDGPU_RAS_BLOCK__SDMA:
	case AMDGPU_RAS_BLOCK__MMHUB:
	case AMDGPU_RAS_BLOCK__PCIE_BIF:
		ret = psp_ras_trigger_error(&adev->psp, &block_info);
		break;
	case AMDGPU_RAS_BLOCK__XGMI_WAFL:
		ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
		break;
	default:
		dev_info(adev->dev, "%s error injection is not supported yet\n",
			 ras_block_str(info->head.block));
		ret = -EINVAL;
	}

	if (ret)
		dev_err(adev->dev, "ras inject %s failed %d\n",
			ras_block_str(info->head.block), ret);

	return ret;
}
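
/**
 * amdgpu_ras_query_error_count - query the error counts of all RAS blocks
 * @adev: pointer to the AMD GPU device
 * @ce_count: set to the total correctable error count, may be NULL
 * @ue_count: set to the total uncorrectable error count, may be NULL
 *
 * Returns 0 on success, or -EOPNOTSUPP if the device does not support RAS.
 */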
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
				 unsigned long *ce_count,
				 unsigned long *ue_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	unsigned long ce, ue;

	if (!adev->ras_enabled || !con)
		return -EOPNOTSUPP;

	/* Don't count since there is nothing to report. */
	if (!ce_count && !ue_count)
		return 0;

	ce = 0;
	ue = 0;
	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};
		int res;

		res = amdgpu_ras_query_error_status(adev, &info);
		if (res)
			return res;

		ce += info.ce_count;
		ue += info.ue_count;
	}

	if (ce_count)
		*ce_count = ce;

	if (ue_count)
		*ue_count = ue;

	return 0;
}

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
		return "R";
	case AMDGPU_RAS_RETIRE_PAGE_PENDING:
		return "P";
	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
	default:
		return "F";
	}
}
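
/*
 * The gpu_vram_bad_pages sysfs node lists retired VRAM pages, one per
 * line, as "address : size : flag", where the flag is "R" (reserved),
 * "P" (reservation pending) or "F" (reservation failed).
 */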
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);

	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
}

static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&con->badpages_attr.attr,
				RAS_FS_NAME);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = RAS_FS_NAME,
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				RAS_FS_NAME)) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				RAS_FS_NAME);
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	if (amdgpu_bad_page_threshold != 0)
		amdgpu_ras_sysfs_remove_bad_page_node(adev);

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
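
/*
 * amdgpu_ras_debugfs_create_ctrl_node() populates the per-device "ras"
 * debugfs directory: ras_ctrl, ras_eeprom_reset, bad_page_cnt_threshold,
 * ras_hw_enabled, ras_enabled, ras_eeprom_size, ras_eeprom_table,
 * auto_reboot and disable_ras_err_cnt_harvest.
 */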
static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev_to_drm(adev)->primary;
	struct dentry *dir;

	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_ctrl_ops);
	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_ops);
	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,
			   &con->bad_page_cnt_threshold);
	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
	debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
			    &amdgpu_ras_debugfs_eeprom_size_ops);
	con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
						       S_IRUGO, dir, adev,
						       &amdgpu_ras_debugfs_eeprom_table_ops);
	amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);

	/* After an uncorrectable error a GPU recovery is usually scheduled,
	 * but recovery is known to sometimes fail to bring the GPU back;
	 * the user can set auto_reboot so that an uncorrectable error
	 * reboots the system instead.
	 */
	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, dir, &con->reboot);

	/* The user can set this flag to keep the hardware error counts of
	 * the RAS IPs from being harvested (cleared) during ras recovery.
	 */
	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644, dir,
			    &con->disable_ras_err_cnt_harvest);
	return dir;
}

static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
				      struct ras_fs_if *head,
				      struct dentry *dir)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || !dir)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	debugfs_create_file(obj->fs_data.debugfs_name, S_IWUGO | S_IRUGO, dir,
			    obj, &amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct dentry *dir;
	struct ras_manager *obj;
	struct ras_fs_if fs_info;

	/* this function is never called in the resume path, so there is no
	 * need to check the suspend and gpu-reset status
	 */
	if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
		return;

	dir = amdgpu_ras_debugfs_create_ctrl_node(adev);

	list_for_each_entry(obj, &con->head, node) {
		if (amdgpu_ras_is_supported(adev, obj->head.block) &&
		    (obj->attr_inuse == 1)) {
			sprintf(fs_info.debugfs_name, "%s_err_inject",
				ras_block_str(obj->head.block));
			fs_info.head = obj->head;
			amdgpu_ras_debugfs_create(adev, &fs_info, dir);
		}
	}
}

static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
		amdgpu_ras_sysfs_badpages_read, NULL, 0);
static DEVICE_ATTR(features, S_IRUGO,
		amdgpu_ras_sysfs_features_read, NULL);
static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute_group group = {
		.name = RAS_FS_NAME,
	};
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		NULL,
		NULL,
	};
	int r;

	/* add the features entry */
	con->features_attr = dev_attr_features;
	group.attrs = attrs;
	sysfs_attr_init(attrs[0]);

	if (amdgpu_bad_page_threshold != 0) {
		/* add the bad-page entry */
		bin_attr_gpu_vram_bad_pages.private = NULL;
		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
		bin_attrs[0] = &con->badpages_attr;
		group.bin_attrs = bin_attrs;
		sysfs_bin_attr_init(bin_attrs[0]);
	}

	r = sysfs_create_group(&adev->dev->kobj, &group);
	if (r)
		dev_err(adev->dev, "Failed to create RAS sysfs group!");

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *con_obj, *ip_obj, *tmp;

	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
		list_for_each_entry_safe(con_obj, tmp, &con->head, node) {
			ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head);
			if (ip_obj)
				put_obj(ip_obj);
		}
	}

	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}

static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;
	struct ras_err_data err_data = {0, 0, 0, NULL};

	while (data->rptr != data->wptr) {
		rmb(); /* make sure the entry is read after wptr */
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb(); /* publish the new rptr before the slot is reused */
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* let the IP-specific callback handle the entry; it may
		 * return updated error counts in err_data
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &err_data, &entry);
			/* an uncorrectable error triggers a reset elsewhere;
			 * here we only accumulate the counts
			 */
			if (ret == AMDGPU_RAS_SUCCESS) {
				/* the counts stay 0 for blocks that do not
				 * count error numbers
				 */
				obj->err_data.ue_count += err_data.ue_count;
				obj->err_data.ce_count += err_data.ce_count;
			}
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* note: the ring has no overflow protection; a burst of entries may
	 * overwrite ones that have not been consumed yet
	 */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb(); /* publish the new entry before moving wptr */
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case the IH is registered before the ras feature is enabled */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* install the callback with an empty ring */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* the IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}

/* traverse all managed blocks and query their error counters so the
 * counts are logged
 */
static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		/* PCIE_BIF has its own ras controller interrupt isr where
		 * its counters are queried, so skip it here to avoid
		 * querying it from the common sync-flood path as well
		 */
		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
			continue;

		amdgpu_ras_query_error_status(adev, &info);
	}
}

static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
					  struct ras_query_if *info)
{
	/* only the GFX and MMHUB blocks need to query their read/write
	 * error status at the current state
	 */
	switch (info->head.block) {
	case AMDGPU_RAS_BLOCK__GFX:
		if (adev->gfx.ras_funcs &&
		    adev->gfx.ras_funcs->query_ras_error_status)
			adev->gfx.ras_funcs->query_ras_error_status(adev);
		break;
	case AMDGPU_RAS_BLOCK__MMHUB:
		if (adev->mmhub.ras_funcs &&
		    adev->mmhub.ras_funcs->query_ras_error_status)
			adev->mmhub.ras_funcs->query_ras_error_status(adev);
		break;
	default:
		break;
	}
}

static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!adev->ras_enabled || !con)
		return;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		amdgpu_ras_error_status_query(adev, &info);
	}
}
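
/*
 * amdgpu_ras_badpages_read() returns 0 on success; the caller must free
 * the returned *bps array.
 */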
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0, status;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		ret = -EINVAL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].retired_page,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
		};
		status = amdgpu_vram_mgr_query_page_status(
				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
				data->bps[i].retired_page);
		if (status == -EBUSY)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
		else if (status == -ENOENT)
			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);
	struct amdgpu_device *remote_adev = NULL;
	struct amdgpu_device *adev = ras->adev;
	struct list_head device_list, *device_list_handle = NULL;

	if (!ras->disable_ras_err_cnt_harvest) {
		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);

		/* build the list of devices whose errors get queried */
		if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
			device_list_handle = &hive->device_list;
		} else {
			INIT_LIST_HEAD(&device_list);
			list_add_tail(&adev->gmc.xgmi.head, &device_list);
			device_list_handle = &device_list;
		}

		list_for_each_entry(remote_adev,
				device_list_handle, gmc.xgmi.head) {
			amdgpu_ras_query_err_status(remote_adev);
			amdgpu_ras_log_on_err_counter(remote_adev);
		}

		amdgpu_put_xgmi_hive(hive);
	}

	if (amdgpu_device_should_recover_gpu(ras->adev))
		amdgpu_device_gpu_recover(ras->adev, NULL);
	atomic_set(&ras->in_recovery, 0);
}

/* alloc/realloc the bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 512);
	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!bps)
		return -ENOMEM;

	if (data->bps) {
		memcpy(bps, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = bps;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		struct eeprom_table_record *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int ret = 0;
	uint32_t i;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = 0; i < pages; i++) {
		if (amdgpu_ras_check_bad_page_unlock(con,
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
			continue;

		if (!data->space_left &&
			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
			ret = -ENOMEM;
			goto out;
		}

		amdgpu_vram_mgr_reserve_range(
			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
			AMDGPU_GPU_PAGE_SIZE);

		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
		data->count++;
		data->space_left--;
	}
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
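
/*
 * Write the bad-page record array to the EEPROM; calls to this function
 * should be protected by con->recovery_lock.
 */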
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_ras_eeprom_control *control;
	int save_count;

	if (!con || !con->eh_data)
		return 0;

	control = &con->eeprom_control;
	data = con->eh_data;
	save_count = data->count - control->ras_num_recs;
	/* only new records are appended */
	if (save_count > 0) {
		if (amdgpu_ras_eeprom_append(control,
					     &data->bps[control->ras_num_recs],
					     save_count)) {
			dev_err(adev->dev, "Failed to save EEPROM table data!");
			return -EIO;
		}

		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
	}

	return 0;
}
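
/*
 * Read the bad-page record array from the EEPROM and reserve the
 * corresponding VRAM pages.
 */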
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras_eeprom_control *control =
		&adev->psp.ras_context.ras->eeprom_control;
	struct eeprom_table_record *bps;
	int ret;

	/* no bad page record, skip the eeprom access */
	if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0)
		return 0;

	bps = kcalloc(control->ras_num_recs, sizeof(*bps), GFP_KERNEL);
	if (!bps)
		return -ENOMEM;

	ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs);
	if (ret)
		dev_err(adev->dev, "Failed to load EEPROM table records!");
	else
		ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs);

	kfree(bps);
	return ret;
}

static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
				uint64_t addr)
{
	struct ras_err_handler_data *data = con->eh_data;
	int i;

	addr >>= AMDGPU_GPU_PAGE_SHIFT;
	for (i = 0; i < data->count; i++)
		if (addr == data->bps[i].retired_page)
			return true;

	return false;
}
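
/*
 * Check whether an address belongs to a retired (bad) page.
 * Note: this check is only meaningful for the UMC block.
 */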
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	bool ret = false;

	if (!con || !con->eh_data)
		return ret;

	mutex_lock(&con->recovery_lock);
	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
					  uint32_t max_count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	/* amdgpu_bad_page_threshold selects the bad-page retirement policy:
	 *  -1: derive a typical threshold from the VRAM size, assuming one
	 *      bad page per RAS_BAD_PAGE_COVER bytes;
	 *   0: bad-page retirement is disabled and the threshold is unused;
	 *  >0: use the given value, capped to the EEPROM record capacity.
	 */
	if (amdgpu_bad_page_threshold < 0) {
		u64 val = adev->gmc.mc_vram_size;

		do_div(val, RAS_BAD_PAGE_COVER);
		con->bad_page_cnt_threshold = min(lower_32_bits(val),
						  max_count);
	} else {
		con->bad_page_cnt_threshold = min_t(int, max_count,
						    amdgpu_bad_page_threshold);
	}
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
	u32 max_eeprom_records_count = 0;
	bool exc_err_limit = false;
	int ret;

	if (!con)
		return 0;

	/* con->adev is set even when RAS is not enabled, so that the RAS
	 * EEPROM can still be reached through debugfs
	 */
	con->adev = adev;

	if (!adev->ras_enabled)
		return 0;

	data = &con->eh_data;
	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
	if (!*data) {
		ret = -ENOMEM;
		goto out;
	}

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);

	max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
	amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);

	/* the SMU might fail to read the EEPROM through I2C while the GPU
	 * is pending on an XGMI reset during probe time, so skip it then
	 */
	if (adev->gmc.xgmi.pending_reset)
		return 0;
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
	/* this call fails when exc_err_limit is true or ret != 0 */
	if (exc_err_limit || ret)
		goto free;

	if (con->eeprom_control.ras_num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			goto free;

		if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num)
			adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs);
	}

	return 0;

free:
	kfree((*data)->bps);
	kfree(*data);
	con->eh_data = NULL;
out:
	dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret);

	/* except for the error-threshold-exceeded case, a failure here does
	 * not fail amdgpu driver init
	 */
	if (!exc_err_limit)
		ret = 0;
	else
		ret = -EINVAL;

	return ret;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	/* recovery_init failed to initialize it, fini is useless */
	if (!data)
		return 0;

	cancel_work_sync(&con->recovery_work);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}

int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
				     unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
{
	return adev->asic_type == CHIP_VEGA10 ||
		adev->asic_type == CHIP_VEGA20 ||
		adev->asic_type == CHIP_ARCTURUS ||
		adev->asic_type == CHIP_ALDEBARAN ||
		adev->asic_type == CHIP_SIENNA_CICHLID;
}
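
/*
 * Workaround for certain VEGA20 workstation SKUs: force enable gfx ras,
 * matched by vbios version string, regardless of the vbios gfx ras flag.
 */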
static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)
{
	struct atom_context *ctx = adev->mode_info.atom_context;

	if (!ctx)
		return;

	if (strnstr(ctx->vbios_version, "D16406",
		    sizeof(ctx->vbios_version)) ||
	    strnstr(ctx->vbios_version, "D36002",
		    sizeof(ctx->vbios_version)))
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);
}
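
/*
 * Record the hardware's ras ability in adev->ras_hw_enabled;
 * adev->ras_enabled further applies the amdgpu_ras_enable and
 * amdgpu_ras_mask module parameters on top of the hardware mask.
 */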
static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{
	adev->ras_hw_enabled = adev->ras_enabled = 0;

	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
	    !amdgpu_ras_asic_supported(adev))
		return;

	if (!adev->gmc.xgmi.connected_to_cpu) {
		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
			dev_info(adev->dev, "MEM ECC is active.\n");
			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC |
						 1 << AMDGPU_RAS_BLOCK__DF);
		} else {
			dev_info(adev->dev, "MEM ECC is not present.\n");
		}

		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
			dev_info(adev->dev, "SRAM ECC is active.\n");
			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
						  1 << AMDGPU_RAS_BLOCK__DF);
		} else {
			dev_info(adev->dev, "SRAM ECC is not present.\n");
		}
	} else {
		/* the driver only manages a few IP blocks' RAS feature when
		 * the GPU is connected to the CPU through XGMI
		 */
		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX |
					 1 << AMDGPU_RAS_BLOCK__SDMA |
					 1 << AMDGPU_RAS_BLOCK__MMHUB);
	}

	amdgpu_ras_get_quirks(adev);

	/* hw_supported needs to be aligned with the RAS block mask */
	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;

	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
		adev->ras_hw_enabled & amdgpu_ras_mask;
}

static void amdgpu_ras_counte_dw(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      ras_counte_delay_work.work);
	struct amdgpu_device *adev = con->adev;
	struct drm_device *dev = adev_to_drm(adev);
	unsigned long ce_count, ue_count;
	int res;

	res = pm_runtime_get_sync(dev->dev);
	if (res < 0)
		goto Out;

	/* Cache the new values. */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	pm_runtime_mark_last_busy(dev->dev);
Out:
	pm_runtime_put_autosuspend(dev->dev);
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int r;

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->adev = adev;
	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
	atomic_set(&con->ras_ce_count, 0);
	atomic_set(&con->ras_ue_count, 0);

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev);

	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {
		/* set the gfx block ras context feature for VEGA20 Gaming;
		 * the ras TA disable cmd is issued during ras late init
		 */
		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {
			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);

			return 0;
		}

		r = 0;
		goto release_con;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);

	con->flags = RAS_DEFAULT_FLAGS;

	/* initialize the nbio ras function ahead of any other ras functions
	 * so the hardware fatal error interrupt can be enabled as early as
	 * possible
	 */
	switch (adev->asic_type) {
	case CHIP_VEGA20:
	case CHIP_ARCTURUS:
	case CHIP_ALDEBARAN:
		if (!adev->gmc.xgmi.connected_to_cpu)
			adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs;
		break;
	default:
		/* nbio ras is not available */
		break;
	}

	if (adev->nbio.ras_funcs &&
	    adev->nbio.ras_funcs->init_ras_controller_interrupt) {
		r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev);
		if (r)
			goto release_con;
	}

	if (adev->nbio.ras_funcs &&
	    adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) {
		r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev);
		if (r)
			goto release_con;
	}

	if (amdgpu_ras_fs_init(adev)) {
		r = -EINVAL;
		goto release_con;
	}

	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
		 "hardware ability[%x] ras_mask[%x]\n",
		 adev->ras_hw_enabled, adev->ras_enabled);

	return 0;
release_con:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return r;
}

int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)
{
	if (adev->gmc.xgmi.connected_to_cpu)
		return 1;
	return 0;
}

static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev,
					    struct ras_common_if *ras_block)
{
	struct ras_query_if info = {
		.head = *ras_block,
	};

	if (!amdgpu_persistent_edc_harvesting_supported(adev))
		return 0;

	if (amdgpu_ras_query_error_status(adev, &info) != 0)
		DRM_WARN("RAS init harvest failure");

	if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0)
		DRM_WARN("RAS init harvest reset failure");

	return 0;
}

/* helper function to handle the common stuff in the IP late init phase */
int amdgpu_ras_late_init(struct amdgpu_device *adev,
			 struct ras_common_if *ras_block,
			 struct ras_fs_if *fs_info,
			 struct ras_ih_if *ih_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	unsigned long ue_count, ce_count;
	int r;

	/* disable the RAS feature per IP block if it is not supported */
	if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
		amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
		return 0;
	}

	r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
	if (r) {
		if (r == -EAGAIN) {
			/* request a gpu reset; this will run again */
			amdgpu_ras_request_reset_on_boot(adev,
					ras_block->block);
			return 0;
		} else if (adev->in_suspend || amdgpu_in_reset(adev)) {
			/* in the resume phase, if enabling ras fails, clean
			 * up all ras fs nodes and disable ras
			 */
			goto cleanup;
		} else
			return r;
	}

	/* check for errors on warm reset for ASICs supporting persistent edc */
	amdgpu_persistent_edc_harvesting(adev, ras_block);

	/* in the resume phase, no need to create ras fs nodes */
	if (adev->in_suspend || amdgpu_in_reset(adev))
		return 0;

	if (ih_info->cb) {
		r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
		if (r)
			goto interrupt;
	}

	r = amdgpu_ras_sysfs_create(adev, fs_info);
	if (r)
		goto sysfs;

	/* Those are the cached values at init. */
	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count) == 0) {
		atomic_set(&con->ras_ce_count, ce_count);
		atomic_set(&con->ras_ue_count, ue_count);
	}

	return 0;
cleanup:
	amdgpu_ras_sysfs_remove(adev, ras_block);
sysfs:
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
interrupt:
	amdgpu_ras_feature_enable(adev, ras_block, 0);
	return r;
}

/* helper function to remove the ras fs node and interrupt handler */
void amdgpu_ras_late_fini(struct amdgpu_device *adev,
			  struct ras_common_if *ras_block,
			  struct ras_ih_if *ih_info)
{
	if (!ras_block || !ih_info)
		return;

	amdgpu_ras_sysfs_remove(adev, ras_block);
	if (ih_info->cb)
		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
	amdgpu_ras_feature_enable(adev, ras_block, 0);
}
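
/*
 * amdgpu_ras_resume() does some init work after IP late init; it runs in
 * the resume, gpu-reset and boot-up paths.
 */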
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!adev->ras_enabled || !con) {
		/* clean the ras context for VEGA20 Gaming after the ras
		 * disable cmd has been sent
		 */
		amdgpu_release_ras_context(adev);

		return;
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them and one or more IPs
		 * may not be implemented yet. So we disable them on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no reference left */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* set up the ras obj state as disabled; this is the
		 * init_by_vbios case. To enable ras afterwards, just enable
		 * it in the normal way; to disable it, the obj must first be
		 * set up as enabled before issuing another TA disable cmd.
		 * See feature_enable_on_boot.
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* make sure all ras objects are disabled */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as a dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	/* need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!adev->ras_enabled || !con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	cancel_delayed_work_sync(&con->ras_counte_delay_work);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}

void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
{
	amdgpu_ras_check_supported(adev);
	if (!adev->ras_hw_enabled)
		return;

	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		dev_info(adev->dev, "uncorrectable hardware error "
			 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");

		amdgpu_ras_reset_gpu(adev);
	}
}

bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
{
	if (adev->asic_type == CHIP_VEGA20 &&
	    adev->pm.fw_version <= 0x283400) {
		return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
			amdgpu_ras_intr_triggered();
	}

	return false;
}

void amdgpu_release_ras_context(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
	}
}