24#include <linux/debugfs.h>
25#include <linux/list.h>
26#include <linux/module.h>
27#include <linux/uaccess.h>
28#include <linux/reboot.h>
29#include <linux/syscalls.h>
30
31#include "amdgpu.h"
32#include "amdgpu_ras.h"
33#include "amdgpu_atomfirmware.h"
34#include "amdgpu_xgmi.h"
35#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
36
37static const char *RAS_FS_NAME = "ras";
38
39const char *ras_error_string[] = {
40 "none",
41 "parity",
42 "single_correctable",
43 "multi_uncorrectable",
44 "poison",
45};
46
47const char *ras_block_string[] = {
48 "umc",
49 "sdma",
50 "gfx",
51 "mmhub",
52 "athub",
53 "pcie_bif",
54 "hdp",
55 "xgmi_wafl",
56 "df",
57 "smn",
58 "sem",
59 "mp0",
60 "mp1",
61 "fuse",
62};
63
64#define ras_err_str(i) (ras_error_string[ffs(i)])
65#define ras_block_str(i) (ras_block_string[i])
66
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

/* typical ECC bad page rate is 1 bad page per 100MB of VRAM */
#define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
74
75enum amdgpu_ras_retire_page_reservation {
76 AMDGPU_RAS_RETIRE_PAGE_RESERVED,
77 AMDGPU_RAS_RETIRE_PAGE_PENDING,
78 AMDGPU_RAS_RETIRE_PAGE_FAULT,
79};
80
81atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
82
83static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
84 uint64_t addr);
85
86void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready)
87{
88 if (adev && amdgpu_ras_get_context(adev))
89 amdgpu_ras_get_context(adev)->error_query_ready = ready;
90}
91
92static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
93{
94 if (adev && amdgpu_ras_get_context(adev))
95 return amdgpu_ras_get_context(adev)->error_query_ready;
96
97 return false;
98}
99
100static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
101 size_t size, loff_t *pos)
102{
103 struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
104 struct ras_query_if info = {
105 .head = obj->head,
106 };
107 ssize_t s;
108 char val[128];
109
110 if (amdgpu_ras_error_query(obj->adev, &info))
111 return -EINVAL;
112
113 s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
114 "ue", info.ue_count,
115 "ce", info.ce_count);
116 if (*pos >= s)
117 return 0;
118
119 s -= *pos;
120 s = min_t(u64, s, size);
121
122
123 if (copy_to_user(buf, &val[*pos], s))
124 return -EINVAL;
125
126 *pos += s;
127
128 return s;
129}
130
131static const struct file_operations amdgpu_ras_debugfs_ops = {
132 .owner = THIS_MODULE,
133 .read = amdgpu_ras_debugfs_read,
134 .write = NULL,
135 .llseek = default_llseek
136};
137
138static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
139{
140 int i;
141
142 for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
143 *block_id = i;
144 if (strcmp(name, ras_block_str(i)) == 0)
145 return 0;
146 }
147 return -EINVAL;
148}
149
150static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
151 const char __user *buf, size_t size,
152 loff_t *pos, struct ras_debug_if *data)
153{
154 ssize_t s = min_t(u64, 64, size);
155 char str[65];
156 char block_name[33];
157 char err[9] = "ue";
158 int op = -1;
159 int block_id;
160 uint32_t sub_block;
161 u64 address, value;
162
163 if (*pos)
164 return -EINVAL;
165 *pos = size;
166
167 memset(str, 0, sizeof(str));
168 memset(data, 0, sizeof(*data));
169
170 if (copy_from_user(str, buf, s))
171 return -EINVAL;
172
173 if (sscanf(str, "disable %32s", block_name) == 1)
174 op = 0;
175 else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
176 op = 1;
177 else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
178 op = 2;
179 else if (str[0] && str[1] && str[2] && str[3])
180
181 return -EINVAL;
182
183 if (op != -1) {
184 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
185 return -EINVAL;
186
187 data->head.block = block_id;
188
189 if (!memcmp("ue", err, 2))
190 data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
191 else if (!memcmp("ce", err, 2))
192 data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
193 else
194 return -EINVAL;
195
196 data->op = op;
197
198 if (op == 2) {
199 if (sscanf(str, "%*s %*s %*s %u %llu %llu",
200 &sub_block, &address, &value) != 3)
201 if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
202 &sub_block, &address, &value) != 3)
203 return -EINVAL;
204 data->head.sub_block_index = sub_block;
205 data->inject.address = address;
206 data->inject.value = value;
207 }
208 } else {
209 if (size < sizeof(*data))
210 return -EINVAL;
211
212 if (copy_from_user(data, buf, sizeof(*data)))
213 return -EINVAL;
214 }
215
216 return 0;
217}
218
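/*
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The "ras_ctrl" debugfs node accepts the command format parsed by
 * amdgpu_ras_debugfs_ctrl_parse_data() above:
 *
 *	disable <block>
 *	enable  <block> <error type>
 *	inject  <block> <error type> <sub-block index> <address> <value>
 *
 * <block> is one of the names in ras_block_string[] (umc, gfx, mmhub, ...)
 * and <error type> is "ue" or "ce".  As a fallback, a struct ras_debug_if
 * can also be written in binary form.
 *
 * A typical shell session might look like the following (the debugfs path
 * assumes the first DRM device; adjust the index for your setup):
 *
 *	echo "enable umc ue" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0x0 0x0 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *
 * Writes are ignored (with a warning) until the error query interface is
 * marked ready via amdgpu_ras_set_error_query_ready().
 */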
294static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
295 size_t size, loff_t *pos)
296{
297 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
298 struct ras_debug_if data;
299 int ret = 0;
300
301 if (!amdgpu_ras_get_error_query_ready(adev)) {
302 dev_warn(adev->dev, "RAS WARN: error injection "
303 "currently inaccessible\n");
304 return size;
305 }
306
307 ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
308 if (ret)
309 return -EINVAL;
310
311 if (!amdgpu_ras_is_supported(adev, data.head.block))
312 return -EINVAL;
313
314 switch (data.op) {
315 case 0:
316 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
317 break;
318 case 1:
319 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
320 break;
321 case 2:
322 if ((data.inject.address >= adev->gmc.mc_vram_size) ||
323 (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
324 dev_warn(adev->dev, "RAS WARN: input address "
325 "0x%llx is invalid.",
326 data.inject.address);
327 ret = -EINVAL;
328 break;
329 }
330
331
332 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
333 amdgpu_ras_check_bad_page(adev, data.inject.address)) {
334 dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
335 "as bad before error injection!\n",
336 data.inject.address);
337 break;
338 }
339
340
341 ret = amdgpu_ras_error_inject(adev, &data.inject);
342 break;
343 default:
344 ret = -EINVAL;
345 break;
346 }
347
348 if (ret)
349 return -EINVAL;
350
351 return size;
352}
353
370static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
371 size_t size, loff_t *pos)
372{
373 struct amdgpu_device *adev =
374 (struct amdgpu_device *)file_inode(f)->i_private;
375 int ret;
376
377 ret = amdgpu_ras_eeprom_reset_table(
378 &(amdgpu_ras_get_context(adev)->eeprom_control));
379
380 if (ret == 1) {
381 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
382 return size;
383 } else {
384 return -EIO;
385 }
386}
387
388static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
389 .owner = THIS_MODULE,
390 .read = NULL,
391 .write = amdgpu_ras_debugfs_ctrl_write,
392 .llseek = default_llseek
393};
394
395static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
396 .owner = THIS_MODULE,
397 .read = NULL,
398 .write = amdgpu_ras_debugfs_eeprom_write,
399 .llseek = default_llseek
400};
401
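/*
 * DOC: AMDGPU RAS sysfs Error Count Interface
 *
 * Each RAS-capable IP block registers a read-only node in the device's
 * "ras" sysfs attribute group via amdgpu_ras_sysfs_create(); the node name
 * is chosen by the caller (typically <block>_err_count, e.g. umc_err_count,
 * an assumption based on the IP-block code rather than enforced here).
 * Reading the node returns the accumulated uncorrectable and correctable
 * error counts in the format produced by amdgpu_ras_sysfs_read() below:
 *
 *	ue: <count>
 *	ce: <count>
 */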
423static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
424 struct device_attribute *attr, char *buf)
425{
426 struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
427 struct ras_query_if info = {
428 .head = obj->head,
429 };
430
431 if (!amdgpu_ras_get_error_query_ready(obj->adev))
432 return snprintf(buf, PAGE_SIZE,
433 "Query currently inaccessible\n");
434
435 if (amdgpu_ras_error_query(obj->adev, &info))
436 return -EINVAL;
437
438 return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
439 "ue", info.ue_count,
440 "ce", info.ce_count);
441}
442
443
444
445#define get_obj(obj) do { (obj)->use++; } while (0)
446#define alive_obj(obj) ((obj)->use)
447
448static inline void put_obj(struct ras_manager *obj)
449{
450 if (obj && --obj->use == 0)
451 list_del(&obj->node);
452 if (obj && obj->use < 0) {
453 DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
454 }
455}
456
457
458static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
459 struct ras_common_if *head)
460{
461 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
462 struct ras_manager *obj;
463
464 if (!con)
465 return NULL;
466
467 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
468 return NULL;
469
470 obj = &con->objs[head->block];
471
472 if (alive_obj(obj))
473 return NULL;
474
475 obj->head = *head;
476 obj->adev = adev;
477 list_add(&obj->node, &con->head);
478 get_obj(obj);
479
480 return obj;
481}
482
483
484struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
485 struct ras_common_if *head)
486{
487 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
488 struct ras_manager *obj;
489 int i;
490
491 if (!con)
492 return NULL;
493
494 if (head) {
495 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
496 return NULL;
497
498 obj = &con->objs[head->block];
499
500 if (alive_obj(obj)) {
501 WARN_ON(head->block != obj->head.block);
502 return obj;
503 }
504 } else {
505 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
506 obj = &con->objs[i];
507 if (alive_obj(obj)) {
508 WARN_ON(i != obj->head.block);
509 return obj;
510 }
511 }
512 }
513
514 return NULL;
515}
516
517
518static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
519 const char* invoke_type,
520 const char* block_name,
521 enum ta_ras_status ret)
522{
523 switch (ret) {
524 case TA_RAS_STATUS__SUCCESS:
525 return;
526 case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE:
527 dev_warn(adev->dev,
528 "RAS WARN: %s %s currently unavailable\n",
529 invoke_type,
530 block_name);
531 break;
532 default:
533 dev_err(adev->dev,
534 "RAS ERROR: %s %s error failed ret 0x%X\n",
535 invoke_type,
536 block_name,
537 ret);
538 }
539}
540
541
542static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
543 struct ras_common_if *head)
544{
545 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
546
547 return con->hw_supported & BIT(head->block);
548}
549
550static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
551 struct ras_common_if *head)
552{
553 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
554
555 return con->features & BIT(head->block);
556}
557
558
559
560
561
562static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
563 struct ras_common_if *head, int enable)
564{
565 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
566 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
567
	/*
	 * Nothing to do when the block has no hardware RAS support or the
	 * feature is already in the requested state.
	 */
574 if (!amdgpu_ras_is_feature_allowed(adev, head))
575 return 0;
576 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
577 return 0;
578
579 if (enable) {
580 if (!obj) {
581 obj = amdgpu_ras_create_obj(adev, head);
582 if (!obj)
583 return -EINVAL;
584 } else {
585
586 get_obj(obj);
587 }
588 con->features |= BIT(head->block);
589 } else {
590 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
591 con->features &= ~BIT(head->block);
592 put_obj(obj);
593 }
594 }
595
596 return 0;
597}
598
599
600int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
601 struct ras_common_if *head, bool enable)
602{
603 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
604 union ta_ras_cmd_input *info;
605 int ret;
606
607 if (!con)
608 return -EINVAL;
609
610 info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
611 if (!info)
612 return -ENOMEM;
613
614 if (!enable) {
615 info->disable_features = (struct ta_ras_disable_features_input) {
616 .block_id = amdgpu_ras_block_to_ta(head->block),
617 .error_type = amdgpu_ras_error_to_ta(head->type),
618 };
619 } else {
620 info->enable_features = (struct ta_ras_enable_features_input) {
621 .block_id = amdgpu_ras_block_to_ta(head->block),
622 .error_type = amdgpu_ras_error_to_ta(head->type),
623 };
624 }
625
626
627 WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
628
629 if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {
630 ret = 0;
631 goto out;
632 }
633
634 if (!amdgpu_ras_intr_triggered()) {
635 ret = psp_ras_enable_features(&adev->psp, info, enable);
636 if (ret) {
637 amdgpu_ras_parse_status_code(adev,
638 enable ? "enable":"disable",
639 ras_block_str(head->block),
640 (enum ta_ras_status)ret);
641 if (ret == TA_RAS_STATUS__RESET_NEEDED)
642 ret = -EAGAIN;
643 else
644 ret = -EINVAL;
645
646 goto out;
647 }
648 }
649
650
651 __amdgpu_ras_feature_enable(adev, head, enable);
652 ret = 0;
653out:
654 kfree(info);
655 return ret;
656}
657
658
659int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
660 struct ras_common_if *head, bool enable)
661{
662 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
663 int ret;
664
665 if (!con)
666 return -EINVAL;
667
668 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
669 if (enable) {
670
671
672
673
674
675
676 ret = amdgpu_ras_feature_enable(adev, head, 1);
677
678
679
680
681 if (ret == -EINVAL) {
682 ret = __amdgpu_ras_feature_enable(adev, head, 1);
683 if (!ret)
684 dev_info(adev->dev,
685 "RAS INFO: %s setup object\n",
686 ras_block_str(head->block));
687 }
688 } else {
689
690 ret = __amdgpu_ras_feature_enable(adev, head, 1);
691 if (ret)
692 return ret;
693
694 ret = amdgpu_ras_feature_enable(adev, head, 0);
695 }
696 } else
697 ret = amdgpu_ras_feature_enable(adev, head, enable);
698
699 return ret;
700}
701
702static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
703 bool bypass)
704{
705 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
706 struct ras_manager *obj, *tmp;
707
708 list_for_each_entry_safe(obj, tmp, &con->head, node) {
709
710
711
712 if (bypass) {
713 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
714 break;
715 } else {
716 if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
717 break;
718 }
719 }
720
721 return con->features;
722}
723
724static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
725 bool bypass)
726{
727 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
728 int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
729 int i;
730 const enum amdgpu_ras_error_type default_ras_type =
731 AMDGPU_RAS_ERROR__NONE;
732
733 for (i = 0; i < ras_block_count; i++) {
734 struct ras_common_if head = {
735 .block = i,
736 .type = default_ras_type,
737 .sub_block_index = 0,
738 };
739 strcpy(head.name, ras_block_str(i));
740 if (bypass) {
741
742
743
744
745 if (__amdgpu_ras_feature_enable(adev, &head, 1))
746 break;
747 } else {
748 if (amdgpu_ras_feature_enable(adev, &head, 1))
749 break;
750 }
751 }
752
753 return con->features;
754}
755
756
757
758int amdgpu_ras_error_query(struct amdgpu_device *adev,
759 struct ras_query_if *info)
760{
761 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
762 struct ras_err_data err_data = {0, 0, 0, NULL};
763 int i;
764
765 if (!obj)
766 return -EINVAL;
767
768 switch (info->head.block) {
769 case AMDGPU_RAS_BLOCK__UMC:
770 if (adev->umc.funcs->query_ras_error_count)
771 adev->umc.funcs->query_ras_error_count(adev, &err_data);
772
773
774
775 if (adev->umc.funcs->query_ras_error_address)
776 adev->umc.funcs->query_ras_error_address(adev, &err_data);
777 break;
778 case AMDGPU_RAS_BLOCK__SDMA:
779 if (adev->sdma.funcs->query_ras_error_count) {
780 for (i = 0; i < adev->sdma.num_instances; i++)
781 adev->sdma.funcs->query_ras_error_count(adev, i,
782 &err_data);
783 }
784 break;
785 case AMDGPU_RAS_BLOCK__GFX:
786 if (adev->gfx.funcs->query_ras_error_count)
787 adev->gfx.funcs->query_ras_error_count(adev, &err_data);
788 break;
789 case AMDGPU_RAS_BLOCK__MMHUB:
790 if (adev->mmhub.funcs->query_ras_error_count)
791 adev->mmhub.funcs->query_ras_error_count(adev, &err_data);
792 break;
793 case AMDGPU_RAS_BLOCK__PCIE_BIF:
794 if (adev->nbio.funcs->query_ras_error_count)
795 adev->nbio.funcs->query_ras_error_count(adev, &err_data);
796 break;
797 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
798 amdgpu_xgmi_query_ras_error_count(adev, &err_data);
799 break;
800 default:
801 break;
802 }
803
804 obj->err_data.ue_count += err_data.ue_count;
805 obj->err_data.ce_count += err_data.ce_count;
806
807 info->ue_count = obj->err_data.ue_count;
808 info->ce_count = obj->err_data.ce_count;
809
810 if (err_data.ce_count) {
811 dev_info(adev->dev, "%ld correctable hardware errors "
812 "detected in %s block, no user "
813 "action is needed.\n",
814 obj->err_data.ce_count,
815 ras_block_str(info->head.block));
816 }
817 if (err_data.ue_count) {
818 dev_info(adev->dev, "%ld uncorrectable hardware errors "
819 "detected in %s block\n",
820 obj->err_data.ue_count,
821 ras_block_str(info->head.block));
822 }
823
824 return 0;
825}
826
827
828static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
829 struct ta_ras_trigger_error_input *block_info)
830{
831 int ret;
832
833 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
834 dev_warn(adev->dev, "Failed to disallow df cstate");
835
836 if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
837 dev_warn(adev->dev, "Failed to disallow XGMI power down");
838
839 ret = psp_ras_trigger_error(&adev->psp, block_info);
840
841 if (amdgpu_ras_intr_triggered())
842 return ret;
843
844 if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
845 dev_warn(adev->dev, "Failed to allow XGMI power down");
846
	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
		dev_warn(adev->dev, "Failed to allow df cstate");
849
850 return ret;
851}
852
853
854int amdgpu_ras_error_inject(struct amdgpu_device *adev,
855 struct ras_inject_if *info)
856{
857 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
858 struct ta_ras_trigger_error_input block_info = {
859 .block_id = amdgpu_ras_block_to_ta(info->head.block),
860 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
861 .sub_block_index = info->head.sub_block_index,
862 .address = info->address,
863 .value = info->value,
864 };
865 int ret = 0;
866
867 if (!obj)
868 return -EINVAL;
869
870
871 if (adev->gmc.xgmi.num_physical_nodes > 1) {
872 block_info.address =
873 amdgpu_xgmi_get_relative_phy_addr(adev,
874 block_info.address);
875 }
876
877 switch (info->head.block) {
878 case AMDGPU_RAS_BLOCK__GFX:
879 if (adev->gfx.funcs->ras_error_inject)
880 ret = adev->gfx.funcs->ras_error_inject(adev, info);
881 else
882 ret = -EINVAL;
883 break;
884 case AMDGPU_RAS_BLOCK__UMC:
885 case AMDGPU_RAS_BLOCK__MMHUB:
886 case AMDGPU_RAS_BLOCK__PCIE_BIF:
887 ret = psp_ras_trigger_error(&adev->psp, &block_info);
888 break;
889 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
890 ret = amdgpu_ras_error_inject_xgmi(adev, &block_info);
891 break;
892 default:
893 dev_info(adev->dev, "%s error injection is not supported yet\n",
894 ras_block_str(info->head.block));
895 ret = -EINVAL;
896 }
897
898 amdgpu_ras_parse_status_code(adev,
899 "inject",
900 ras_block_str(info->head.block),
901 (enum ta_ras_status)ret);
902
903 return ret;
904}
905
906int amdgpu_ras_error_cure(struct amdgpu_device *adev,
907 struct ras_cure_if *info)
908{
909
910 return 0;
911}
912
913
914unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
915 bool is_ce)
916{
917 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
918 struct ras_manager *obj;
919 struct ras_err_data data = {0, 0};
920
921 if (!con)
922 return 0;
923
924 list_for_each_entry(obj, &con->head, node) {
925 struct ras_query_if info = {
926 .head = obj->head,
927 };
928
929 if (amdgpu_ras_error_query(adev, &info))
930 return 0;
931
932 data.ce_count += info.ce_count;
933 data.ue_count += info.ue_count;
934 }
935
936 return is_ce ? data.ce_count : data.ue_count;
937}
938
939
940
941
942
943static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
944 struct ras_badpage **bps, unsigned int *count);
945
946static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
947{
948 switch (flags) {
949 case AMDGPU_RAS_RETIRE_PAGE_RESERVED:
950 return "R";
951 case AMDGPU_RAS_RETIRE_PAGE_PENDING:
952 return "P";
953 case AMDGPU_RAS_RETIRE_PAGE_FAULT:
954 default:
955 return "F";
	}
957}
958
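/*
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
 *
 * When bad page retirement is enabled, amdgpu_ras_fs_init() adds a binary
 * "gpu_vram_bad_pages" node to the "ras" sysfs group.  Each retired page
 * is reported on its own line as
 *
 *	gpu pfn : gpu page size : flags
 *
 * for example (illustrative values only):
 *
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 *
 * flags:
 *	R: reserved, the page has been removed from the VRAM allocator
 *	P: pending, the page has not been reserved yet
 *	F: the driver failed to reserve the page
 */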
989static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
990 struct kobject *kobj, struct bin_attribute *attr,
991 char *buf, loff_t ppos, size_t count)
992{
993 struct amdgpu_ras *con =
994 container_of(attr, struct amdgpu_ras, badpages_attr);
995 struct amdgpu_device *adev = con->adev;
996 const unsigned int element_size =
997 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
998 unsigned int start = div64_ul(ppos + element_size - 1, element_size);
999 unsigned int end = div64_ul(ppos + count - 1, element_size);
1000 ssize_t s = 0;
1001 struct ras_badpage *bps = NULL;
1002 unsigned int bps_count = 0;
1003
1004 memset(buf, 0, count);
1005
1006 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
1007 return 0;
1008
1009 for (; start < end && start < bps_count; start++)
1010 s += scnprintf(&buf[s], element_size + 1,
1011 "0x%08x : 0x%08x : %1s\n",
1012 bps[start].bp,
1013 bps[start].size,
1014 amdgpu_ras_badpage_flags_str(bps[start].flags));
1015
1016 kfree(bps);
1017
1018 return s;
1019}
1020
1021static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
1022 struct device_attribute *attr, char *buf)
1023{
1024 struct amdgpu_ras *con =
1025 container_of(attr, struct amdgpu_ras, features_attr);
1026
1027 return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
1028}
1029
1030static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
1031{
1032 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1033
1034 sysfs_remove_file_from_group(&adev->dev->kobj,
1035 &con->badpages_attr.attr,
1036 RAS_FS_NAME);
1037}
1038
1039static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
1040{
1041 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1042 struct attribute *attrs[] = {
1043 &con->features_attr.attr,
1044 NULL
1045 };
1046 struct attribute_group group = {
1047 .name = RAS_FS_NAME,
1048 .attrs = attrs,
1049 };
1050
1051 sysfs_remove_group(&adev->dev->kobj, &group);
1052
1053 return 0;
1054}
1055
1056int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
1057 struct ras_fs_if *head)
1058{
1059 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1060
1061 if (!obj || obj->attr_inuse)
1062 return -EINVAL;
1063
1064 get_obj(obj);
1065
1066 memcpy(obj->fs_data.sysfs_name,
1067 head->sysfs_name,
1068 sizeof(obj->fs_data.sysfs_name));
1069
1070 obj->sysfs_attr = (struct device_attribute){
1071 .attr = {
1072 .name = obj->fs_data.sysfs_name,
1073 .mode = S_IRUGO,
1074 },
1075 .show = amdgpu_ras_sysfs_read,
1076 };
1077 sysfs_attr_init(&obj->sysfs_attr.attr);
1078
1079 if (sysfs_add_file_to_group(&adev->dev->kobj,
1080 &obj->sysfs_attr.attr,
1081 RAS_FS_NAME)) {
1082 put_obj(obj);
1083 return -EINVAL;
1084 }
1085
1086 obj->attr_inuse = 1;
1087
1088 return 0;
1089}
1090
1091int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
1092 struct ras_common_if *head)
1093{
1094 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1095
1096 if (!obj || !obj->attr_inuse)
1097 return -EINVAL;
1098
1099 sysfs_remove_file_from_group(&adev->dev->kobj,
1100 &obj->sysfs_attr.attr,
1101 RAS_FS_NAME);
1102 obj->attr_inuse = 0;
1103 put_obj(obj);
1104
1105 return 0;
1106}
1107
1108static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
1109{
1110 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1111 struct ras_manager *obj, *tmp;
1112
1113 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1114 amdgpu_ras_sysfs_remove(adev, &obj->head);
1115 }
1116
1117 if (amdgpu_bad_page_threshold != 0)
1118 amdgpu_ras_sysfs_remove_bad_page_node(adev);
1119
1120 amdgpu_ras_sysfs_remove_feature_node(adev);
1121
1122 return 0;
1123}
1124
1125
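/*
 * DOC: AMDGPU RAS debugfs nodes
 *
 * amdgpu_ras_debugfs_create_ctrl_node() below populates a per-device "ras"
 * debugfs directory with:
 *
 *	ras_ctrl         - enable/disable/inject control, see the command
 *	                   format documented above
 *	ras_eeprom_reset - any write wipes the bad-page table stored in the
 *	                   RAS EEPROM
 *	auto_reboot      - boolean policy flag for fatal errors
 *	disable_ras_err_cnt_harvest - skip error-counter harvesting during
 *	                   gpu recovery
 *
 * Per-block <block>_err_inject nodes are added afterwards by
 * amdgpu_ras_debugfs_create_all().
 */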
1145static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
1146{
1147 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1148 struct drm_minor *minor = adev_to_drm(adev)->primary;
1149
1150 con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
1151 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
1152 adev, &amdgpu_ras_debugfs_ctrl_ops);
1153 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
1154 adev, &amdgpu_ras_debugfs_eeprom_ops);
1155
	/*
	 * "auto_reboot" is a policy flag consumed by the fatal-error handling
	 * path outside this file: when set, the system is rebooted after an
	 * uncorrectable error instead of only attempting gpu recovery.
	 */
1164 debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
1165 &con->reboot);
1166
	/*
	 * When set, amdgpu_ras_do_recovery() skips harvesting (querying and
	 * logging) the per-block error counters before the gpu reset.
	 */
1171 debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
1172 con->dir, &con->disable_ras_err_cnt_harvest);
1173}
1174
1175static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
1176 struct ras_fs_if *head)
1177{
1178 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1179 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
1180
1181 if (!obj || obj->ent)
1182 return;
1183
1184 get_obj(obj);
1185
1186 memcpy(obj->fs_data.debugfs_name,
1187 head->debugfs_name,
1188 sizeof(obj->fs_data.debugfs_name));
1189
1190 obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
1191 S_IWUGO | S_IRUGO, con->dir, obj,
1192 &amdgpu_ras_debugfs_ops);
1193}
1194
1195void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
1196{
1197 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1198 struct ras_manager *obj;
1199 struct ras_fs_if fs_info;
1200
1201
1202
1203
1204
1205 if (!IS_ENABLED(CONFIG_DEBUG_FS) || !con)
1206 return;
1207
1208 amdgpu_ras_debugfs_create_ctrl_node(adev);
1209
1210 list_for_each_entry(obj, &con->head, node) {
1211 if (amdgpu_ras_is_supported(adev, obj->head.block) &&
1212 (obj->attr_inuse == 1)) {
1213 sprintf(fs_info.debugfs_name, "%s_err_inject",
1214 ras_block_str(obj->head.block));
1215 fs_info.head = obj->head;
1216 amdgpu_ras_debugfs_create(adev, &fs_info);
1217 }
1218 }
1219}
1220
1221static void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
1222 struct ras_common_if *head)
1223{
1224 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
1225
1226 if (!obj || !obj->ent)
1227 return;
1228
1229 obj->ent = NULL;
1230 put_obj(obj);
1231}
1232
1233static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
1234{
1235 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1236 struct ras_manager *obj, *tmp;
1237
1238 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1239 amdgpu_ras_debugfs_remove(adev, &obj->head);
1240 }
1241
1242 con->dir = NULL;
1243}
1244
1245
1246
1247static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
1248 amdgpu_ras_sysfs_badpages_read, NULL, 0);
1249static DEVICE_ATTR(features, S_IRUGO,
1250 amdgpu_ras_sysfs_features_read, NULL);
1251static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
1252{
1253 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1254 struct attribute_group group = {
1255 .name = RAS_FS_NAME,
1256 };
1257 struct attribute *attrs[] = {
1258 &con->features_attr.attr,
1259 NULL
1260 };
1261 struct bin_attribute *bin_attrs[] = {
1262 NULL,
1263 NULL,
1264 };
1265 int r;
1266
1267
1268 con->features_attr = dev_attr_features;
1269 group.attrs = attrs;
1270 sysfs_attr_init(attrs[0]);
1271
1272 if (amdgpu_bad_page_threshold != 0) {
1273
1274 bin_attr_gpu_vram_bad_pages.private = NULL;
1275 con->badpages_attr = bin_attr_gpu_vram_bad_pages;
1276 bin_attrs[0] = &con->badpages_attr;
1277 group.bin_attrs = bin_attrs;
1278 sysfs_bin_attr_init(bin_attrs[0]);
1279 }
1280
1281 r = sysfs_create_group(&adev->dev->kobj, &group);
1282 if (r)
1283 dev_err(adev->dev, "Failed to create RAS sysfs group!");
1284
1285 return 0;
1286}
1287
1288static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
1289{
1290 if (IS_ENABLED(CONFIG_DEBUG_FS))
1291 amdgpu_ras_debugfs_remove_all(adev);
1292 amdgpu_ras_sysfs_remove_all(adev);
1293 return 0;
1294}
1295
1296
1297
1298static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1299{
1300 struct ras_ih_data *data = &obj->ih_data;
1301 struct amdgpu_iv_entry entry;
1302 int ret;
1303 struct ras_err_data err_data = {0, 0, 0, NULL};
1304
1305 while (data->rptr != data->wptr) {
1306 rmb();
1307 memcpy(&entry, &data->ring[data->rptr],
1308 data->element_size);
1309
1310 wmb();
1311 data->rptr = (data->aligned_element_size +
1312 data->rptr) % data->ring_size;
1313
1314
1315
1316
1317 if (data->cb) {
1318 ret = data->cb(obj->adev, &err_data, &entry);
1319
1320
1321
1322
1323
1324 if (ret == AMDGPU_RAS_SUCCESS) {
1325
1326
1327
1328 obj->err_data.ue_count += err_data.ue_count;
1329 obj->err_data.ce_count += err_data.ce_count;
1330 }
1331 }
1332 }
1333}
1334
1335static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1336{
1337 struct ras_ih_data *data =
1338 container_of(work, struct ras_ih_data, ih_work);
1339 struct ras_manager *obj =
1340 container_of(data, struct ras_manager, ih_data);
1341
1342 amdgpu_ras_interrupt_handler(obj);
1343}
1344
1345int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1346 struct ras_dispatch_if *info)
1347{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;

1354 if (data->inuse == 0)
1355 return 0;
1356
1357
1358 memcpy(&data->ring[data->wptr], info->entry,
1359 data->element_size);
1360
1361 wmb();
1362 data->wptr = (data->aligned_element_size +
1363 data->wptr) % data->ring_size;
1364
1365 schedule_work(&data->ih_work);
1366
1367 return 0;
1368}
1369
1370int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1371 struct ras_ih_if *info)
1372{
1373 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1374 struct ras_ih_data *data;
1375
1376 if (!obj)
1377 return -EINVAL;
1378
1379 data = &obj->ih_data;
1380 if (data->inuse == 0)
1381 return 0;
1382
1383 cancel_work_sync(&data->ih_work);
1384
1385 kfree(data->ring);
1386 memset(data, 0, sizeof(*data));
1387 put_obj(obj);
1388
1389 return 0;
1390}
1391
1392int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1393 struct ras_ih_if *info)
1394{
1395 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1396 struct ras_ih_data *data;
1397
1398 if (!obj) {
1399
1400 obj = amdgpu_ras_create_obj(adev, &info->head);
1401 if (!obj)
1402 return -EINVAL;
1403 } else
1404 get_obj(obj);
1405
1406 data = &obj->ih_data;
1407
1408 *data = (struct ras_ih_data) {
1409 .inuse = 0,
1410 .cb = info->cb,
1411 .element_size = sizeof(struct amdgpu_iv_entry),
1412 .rptr = 0,
1413 .wptr = 0,
1414 };
1415
1416 INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1417
1418 data->aligned_element_size = ALIGN(data->element_size, 8);
1419
1420 data->ring_size = 64 * data->aligned_element_size;
1421 data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1422 if (!data->ring) {
1423 put_obj(obj);
1424 return -ENOMEM;
1425 }
1426
1427
1428 data->inuse = 1;
1429
1430 return 0;
1431}
1432
1433static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1434{
1435 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1436 struct ras_manager *obj, *tmp;
1437
1438 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1439 struct ras_ih_if info = {
1440 .head = obj->head,
1441 };
1442 amdgpu_ras_interrupt_remove_handler(adev, &info);
1443 }
1444
1445 return 0;
1446}
1447
1448
1449
1450static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
1451{
1452 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1453 struct ras_manager *obj;
1454
1455 if (!con)
1456 return;
1457
1458 list_for_each_entry(obj, &con->head, node) {
1459 struct ras_query_if info = {
1460 .head = obj->head,
1461 };
1462
1463
1464
1465
1466
1467
1468
1469 if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
1470 continue;
1471
1472 amdgpu_ras_error_query(adev, &info);
1473 }
1474}
1475
1476
1477void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
1478 struct ras_query_if *info)
1479{
1480
1481
1482
1483
1484 switch (info->head.block) {
1485 case AMDGPU_RAS_BLOCK__GFX:
1486 if (adev->gfx.funcs->query_ras_error_status)
1487 adev->gfx.funcs->query_ras_error_status(adev);
1488 break;
1489 case AMDGPU_RAS_BLOCK__MMHUB:
1490 if (adev->mmhub.funcs->query_ras_error_status)
1491 adev->mmhub.funcs->query_ras_error_status(adev);
1492 break;
1493 default:
1494 break;
1495 }
1496}
1497
1498static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
1499{
1500 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1501 struct ras_manager *obj;
1502
1503 if (!con)
1504 return;
1505
1506 list_for_each_entry(obj, &con->head, node) {
1507 struct ras_query_if info = {
1508 .head = obj->head,
1509 };
1510
1511 amdgpu_ras_error_status_query(adev, &info);
1512 }
1513}
1514
1515
1516
1517
1518
1519
1520static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1521 struct ras_badpage **bps, unsigned int *count)
1522{
1523 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1524 struct ras_err_handler_data *data;
1525 int i = 0;
1526 int ret = 0;
1527
1528 if (!con || !con->eh_data || !bps || !count)
1529 return -EINVAL;
1530
1531 mutex_lock(&con->recovery_lock);
1532 data = con->eh_data;
1533 if (!data || data->count == 0) {
1534 *bps = NULL;
1535 ret = -EINVAL;
1536 goto out;
1537 }
1538
1539 *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1540 if (!*bps) {
1541 ret = -ENOMEM;
1542 goto out;
1543 }
1544
1545 for (; i < data->count; i++) {
1546 (*bps)[i] = (struct ras_badpage){
1547 .bp = data->bps[i].retired_page,
1548 .size = AMDGPU_GPU_PAGE_SIZE,
1549 .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
1550 };
1551
1552 if (data->last_reserved <= i)
1553 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
1554 else if (data->bps_bo[i] == NULL)
1555 (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
1556 }
1557
1558 *count = data->count;
1559out:
1560 mutex_unlock(&con->recovery_lock);
1561 return ret;
1562}
1563
1564static void amdgpu_ras_do_recovery(struct work_struct *work)
1565{
1566 struct amdgpu_ras *ras =
1567 container_of(work, struct amdgpu_ras, recovery_work);
1568 struct amdgpu_device *remote_adev = NULL;
1569 struct amdgpu_device *adev = ras->adev;
1570 struct list_head device_list, *device_list_handle = NULL;
1571
1572 if (!ras->disable_ras_err_cnt_harvest) {
1573 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
1574
1575
1576 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
1577 device_list_handle = &hive->device_list;
1578 } else {
1579 INIT_LIST_HEAD(&device_list);
1580 list_add_tail(&adev->gmc.xgmi.head, &device_list);
1581 device_list_handle = &device_list;
1582 }
1583
1584 list_for_each_entry(remote_adev,
1585 device_list_handle, gmc.xgmi.head) {
1586 amdgpu_ras_query_err_status(remote_adev);
1587 amdgpu_ras_log_on_err_counter(remote_adev);
1588 }
1589
1590 amdgpu_put_xgmi_hive(hive);
1591 }
1592
1593 if (amdgpu_device_should_recover_gpu(ras->adev))
1594 amdgpu_device_gpu_recover(ras->adev, NULL);
1595 atomic_set(&ras->in_recovery, 0);
1596}
1597
1598
1599static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1600 struct ras_err_handler_data *data, int pages)
1601{
1602 unsigned int old_space = data->count + data->space_left;
1603 unsigned int new_space = old_space + pages;
1604 unsigned int align_space = ALIGN(new_space, 512);
1605 void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1606 struct amdgpu_bo **bps_bo =
1607 kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
1608
1609 if (!bps || !bps_bo) {
1610 kfree(bps);
1611 kfree(bps_bo);
1612 return -ENOMEM;
1613 }
1614
1615 if (data->bps) {
1616 memcpy(bps, data->bps,
1617 data->count * sizeof(*data->bps));
1618 kfree(data->bps);
1619 }
1620 if (data->bps_bo) {
1621 memcpy(bps_bo, data->bps_bo,
1622 data->count * sizeof(*data->bps_bo));
1623 kfree(data->bps_bo);
1624 }
1625
1626 data->bps = bps;
1627 data->bps_bo = bps_bo;
1628 data->space_left += align_space - old_space;
1629 return 0;
1630}
1631
1632
1633int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1634 struct eeprom_table_record *bps, int pages)
1635{
1636 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1637 struct ras_err_handler_data *data;
1638 int ret = 0;
1639
1640 if (!con || !con->eh_data || !bps || pages <= 0)
1641 return 0;
1642
1643 mutex_lock(&con->recovery_lock);
1644 data = con->eh_data;
1645 if (!data)
1646 goto out;
1647
1648 if (data->space_left <= pages)
1649 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1650 ret = -ENOMEM;
1651 goto out;
1652 }
1653
1654 memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
1655 data->count += pages;
1656 data->space_left -= pages;
1657
1658out:
1659 mutex_unlock(&con->recovery_lock);
1660
1661 return ret;
1662}
1663
1664
1665
1666
1667
1668static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
1669{
1670 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1671 struct ras_err_handler_data *data;
1672 struct amdgpu_ras_eeprom_control *control;
1673 int save_count;
1674
1675 if (!con || !con->eh_data)
1676 return 0;
1677
1678 control = &con->eeprom_control;
1679 data = con->eh_data;
1680 save_count = data->count - control->num_recs;
1681
1682 if (save_count > 0) {
1683 if (amdgpu_ras_eeprom_process_recods(control,
1684 &data->bps[control->num_recs],
1685 true,
1686 save_count)) {
1687 dev_err(adev->dev, "Failed to save EEPROM table data!");
1688 return -EIO;
1689 }
1690
1691 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
1692 }
1693
1694 return 0;
1695}
1696
1697
1698
1699
1700
1701static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
1702{
1703 struct amdgpu_ras_eeprom_control *control =
1704 &adev->psp.ras.ras->eeprom_control;
1705 struct eeprom_table_record *bps = NULL;
1706 int ret = 0;
1707
1708
1709 if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
1710 return ret;
1711
1712 bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
1713 if (!bps)
1714 return -ENOMEM;
1715
1716 if (amdgpu_ras_eeprom_process_recods(control, bps, false,
1717 control->num_recs)) {
1718 dev_err(adev->dev, "Failed to load EEPROM table records!");
1719 ret = -EIO;
1720 goto out;
1721 }
1722
1723 ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
1724
1725out:
1726 kfree(bps);
1727 return ret;
1728}
1729
1730
1731
1732
1733
1734
1735static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
1736 uint64_t addr)
1737{
1738 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1739 struct ras_err_handler_data *data;
1740 int i;
1741 bool ret = false;
1742
1743 if (!con || !con->eh_data)
1744 return ret;
1745
1746 mutex_lock(&con->recovery_lock);
1747 data = con->eh_data;
1748 if (!data)
1749 goto out;
1750
1751 addr >>= AMDGPU_GPU_PAGE_SHIFT;
1752 for (i = 0; i < data->count; i++)
1753 if (addr == data->bps[i].retired_page) {
1754 ret = true;
1755 goto out;
1756 }
1757
1758out:
1759 mutex_unlock(&con->recovery_lock);
1760 return ret;
1761}
1762
1763static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
1764 uint32_t max_length)
1765{
1766 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1767 int tmp_threshold = amdgpu_bad_page_threshold;
1768 u64 val;
1769
	/*
	 * Clamp the amdgpu_bad_page_threshold module parameter:
	 *  - negative values fall back to the default of one retired page
	 *    per RAS_BAD_PAGE_RATE (100MB) of VRAM, capped to the EEPROM
	 *    table length;
	 *  - values larger than the EEPROM table length are capped to it;
	 *  - 0 (bad page retirement disabled) is handled by the callers.
	 */
1789 if (tmp_threshold < -1)
1790 tmp_threshold = -1;
1791 else if (tmp_threshold > max_length)
1792 tmp_threshold = max_length;
1793
1794 if (tmp_threshold == -1) {
1795 val = adev->gmc.mc_vram_size;
1796 do_div(val, RAS_BAD_PAGE_RATE);
1797 con->bad_page_cnt_threshold = min(lower_32_bits(val),
1798 max_length);
1799 } else {
1800 con->bad_page_cnt_threshold = tmp_threshold;
1801 }
1802}
1803
1804
1805int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1806{
1807 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1808 struct ras_err_handler_data *data;
1809 uint64_t bp;
1810 struct amdgpu_bo *bo = NULL;
1811 int i, ret = 0;
1812
1813
1814 if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
1815 return 0;
1816
1817 mutex_lock(&con->recovery_lock);
1818 data = con->eh_data;
1819 if (!data)
1820 goto out;
1821
1822 for (i = data->last_reserved; i < data->count; i++) {
1823 bp = data->bps[i].retired_page;
		/*
		 * Reservation failures are not fatal: warn and keep going so
		 * the remaining retired pages are still reserved and saved.
		 */
1830 if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
1831 AMDGPU_GPU_PAGE_SIZE,
1832 AMDGPU_GEM_DOMAIN_VRAM,
1833 &bo, NULL))
1834 dev_warn(adev->dev, "RAS WARN: reserve vram for "
1835 "retired page %llx fail\n", bp);
1836
1837 data->bps_bo[i] = bo;
1838 data->last_reserved = i + 1;
1839 bo = NULL;
1840 }
1841
1842
1843 ret = amdgpu_ras_save_bad_pages(adev);
1844out:
1845 mutex_unlock(&con->recovery_lock);
1846 return ret;
1847}
1848
1849
1850static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1851{
1852 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1853 struct ras_err_handler_data *data;
1854 struct amdgpu_bo *bo;
1855 int i;
1856
1857 if (!con || !con->eh_data)
1858 return 0;
1859
1860 mutex_lock(&con->recovery_lock);
1861 data = con->eh_data;
1862 if (!data)
1863 goto out;
1864
1865 for (i = data->last_reserved - 1; i >= 0; i--) {
1866 bo = data->bps_bo[i];
1867
1868 amdgpu_bo_free_kernel(&bo, NULL, NULL);
1869
1870 data->bps_bo[i] = bo;
1871 data->last_reserved = i;
1872 }
1873out:
1874 mutex_unlock(&con->recovery_lock);
1875 return 0;
1876}
1877
1878int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1879{
1880 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1881 struct ras_err_handler_data **data;
1882 uint32_t max_eeprom_records_len = 0;
1883 bool exc_err_limit = false;
1884 int ret;
1885
1886 if (con)
1887 data = &con->eh_data;
1888 else
1889 return 0;
1890
1891 *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
1892 if (!*data) {
1893 ret = -ENOMEM;
1894 goto out;
1895 }
1896
1897 mutex_init(&con->recovery_lock);
1898 INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1899 atomic_set(&con->in_recovery, 0);
1900 con->adev = adev;
1901
1902 max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
1903 amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
1904
1905 ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
1906
1907
1908
1909
1910 if (exc_err_limit || ret)
1911 goto free;
1912
1913 if (con->eeprom_control.num_recs) {
1914 ret = amdgpu_ras_load_bad_pages(adev);
1915 if (ret)
1916 goto free;
1917 ret = amdgpu_ras_reserve_bad_pages(adev);
1918 if (ret)
1919 goto release;
1920 }
1921
1922 return 0;
1923
1924release:
1925 amdgpu_ras_release_bad_pages(adev);
1926free:
1927 kfree((*data)->bps);
1928 kfree((*data)->bps_bo);
1929 kfree(*data);
1930 con->eh_data = NULL;
1931out:
1932 dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
1933
1934
1935
1936
1937
1938 if (!exc_err_limit)
1939 ret = 0;
1940 else
1941 ret = -EINVAL;
1942
1943 return ret;
1944}
1945
1946static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1947{
1948 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1949 struct ras_err_handler_data *data = con->eh_data;
1950
1951
1952 if (!data)
1953 return 0;
1954
1955 cancel_work_sync(&con->recovery_work);
1956 amdgpu_ras_release_bad_pages(adev);
1957
1958 mutex_lock(&con->recovery_lock);
1959 con->eh_data = NULL;
1960 kfree(data->bps);
1961 kfree(data->bps_bo);
1962 kfree(data);
1963 mutex_unlock(&con->recovery_lock);
1964
1965 return 0;
1966}
1967
1968
1969
1970int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1971 unsigned int block)
1972{
1973 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1974
1975 if (!ras)
1976 return -EINVAL;
1977
1978 ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1979 return 0;
1980}
1981
1982static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
1983{
1984 if (adev->asic_type != CHIP_VEGA10 &&
1985 adev->asic_type != CHIP_VEGA20 &&
1986 adev->asic_type != CHIP_ARCTURUS &&
1987 adev->asic_type != CHIP_SIENNA_CICHLID)
1988 return 1;
1989 else
1990 return 0;
1991}
1992
2002static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
2003 uint32_t *hw_supported, uint32_t *supported)
2004{
2005 *hw_supported = 0;
2006 *supported = 0;
2007
2008 if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
2009 amdgpu_ras_check_asic_type(adev))
2010 return;
2011
	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
		dev_info(adev->dev, "HBM ECC is active.\n");
		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC |
				1 << AMDGPU_RAS_BLOCK__DF);
	} else
		dev_info(adev->dev, "HBM ECC is not present.\n");

	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
		dev_info(adev->dev, "SRAM ECC is active.\n");
		*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
				1 << AMDGPU_RAS_BLOCK__DF);
	} else
		dev_info(adev->dev, "SRAM ECC is not present.\n");
2025
2026
2027 *hw_supported &= AMDGPU_RAS_BLOCK_MASK;
2028
2029 *supported = amdgpu_ras_enable == 0 ?
2030 0 : *hw_supported & amdgpu_ras_mask;
2031 adev->ras_features = *supported;
2032}
2033
2034int amdgpu_ras_init(struct amdgpu_device *adev)
2035{
2036 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2037 int r;
2038
2039 if (con)
2040 return 0;
2041
2042 con = kmalloc(sizeof(struct amdgpu_ras) +
2043 sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
2044 GFP_KERNEL|__GFP_ZERO);
2045 if (!con)
2046 return -ENOMEM;
2047
2048 con->objs = (struct ras_manager *)(con + 1);
2049
2050 amdgpu_ras_set_context(adev, con);
2051
2052 amdgpu_ras_check_supported(adev, &con->hw_supported,
2053 &con->supported);
2054 if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
2055 r = 0;
2056 goto release_con;
2057 }
2058
2059 con->features = 0;
2060 INIT_LIST_HEAD(&con->head);
2061
2062 con->flags = RAS_DEFAULT_FLAGS;
2063
2064 if (adev->nbio.funcs->init_ras_controller_interrupt) {
2065 r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
2066 if (r)
2067 goto release_con;
2068 }
2069
2070 if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
2071 r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
2072 if (r)
2073 goto release_con;
2074 }
2075
2076 if (amdgpu_ras_fs_init(adev)) {
2077 r = -EINVAL;
2078 goto release_con;
2079 }
2080
2081 dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
2082 "hardware ability[%x] ras_mask[%x]\n",
2083 con->hw_supported, con->supported);
2084 return 0;
2085release_con:
2086 amdgpu_ras_set_context(adev, NULL);
2087 kfree(con);
2088
2089 return r;
2090}
2091
2092
2093int amdgpu_ras_late_init(struct amdgpu_device *adev,
2094 struct ras_common_if *ras_block,
2095 struct ras_fs_if *fs_info,
2096 struct ras_ih_if *ih_info)
2097{
2098 int r;
2099
2100
2101 if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
2102 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
2103 return 0;
2104 }
2105
2106 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
2107 if (r) {
2108 if (r == -EAGAIN) {
2109
2110 amdgpu_ras_request_reset_on_boot(adev,
2111 ras_block->block);
2112 return 0;
2113 } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
2114
2115
2116 goto cleanup;
2117 } else
2118 return r;
2119 }
2120
2121
2122 if (adev->in_suspend || amdgpu_in_reset(adev))
2123 return 0;
2124
2125 if (ih_info->cb) {
2126 r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
2127 if (r)
2128 goto interrupt;
2129 }
2130
2131 r = amdgpu_ras_sysfs_create(adev, fs_info);
2132 if (r)
2133 goto sysfs;
2134
2135 return 0;
2136cleanup:
2137 amdgpu_ras_sysfs_remove(adev, ras_block);
2138sysfs:
2139 if (ih_info->cb)
2140 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
2141interrupt:
2142 amdgpu_ras_feature_enable(adev, ras_block, 0);
2143 return r;
2144}
2145
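/*
 * Example: how an IP block is expected to hook into amdgpu_ras_late_init().
 * This is a sketch only -- the callback, names and field values below are
 * hypothetical; the real users live in the per-IP files (gfx, sdma, umc,
 * ...), which pick their own block, error type and node names.
 *
 *	static int my_block_process_ras_cb(struct amdgpu_device *adev,
 *					   void *err_data,
 *					   struct amdgpu_iv_entry *entry)
 *	{
 *		// handle the error, fill err_data; the RAS core accumulates
 *		// the counts in amdgpu_ras_interrupt_handler()
 *		return AMDGPU_RAS_SUCCESS;
 *	}
 *
 *	static int my_block_ras_late_init(struct amdgpu_device *adev)
 *	{
 *		struct ras_common_if block = {
 *			.block = AMDGPU_RAS_BLOCK__GFX,
 *			.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
 *			.sub_block_index = 0,
 *			.name = "gfx",
 *		};
 *		struct ras_fs_if fs_info = {
 *			.head = block,
 *			.sysfs_name = "gfx_err_count",
 *			.debugfs_name = "gfx_err_inject",
 *		};
 *		struct ras_ih_if ih_info = {
 *			.head = block,
 *			.cb = my_block_process_ras_cb,
 *		};
 *
 *		return amdgpu_ras_late_init(adev, &block, &fs_info, &ih_info);
 *	}
 */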
2146
2147void amdgpu_ras_late_fini(struct amdgpu_device *adev,
2148 struct ras_common_if *ras_block,
2149 struct ras_ih_if *ih_info)
2150{
2151 if (!ras_block || !ih_info)
2152 return;
2153
2154 amdgpu_ras_sysfs_remove(adev, ras_block);
2155 if (ih_info->cb)
2156 amdgpu_ras_interrupt_remove_handler(adev, ih_info);
2157 amdgpu_ras_feature_enable(adev, ras_block, 0);
2158}
2159
2160
2161
2162
2163void amdgpu_ras_resume(struct amdgpu_device *adev)
2164{
2165 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2166 struct ras_manager *obj, *tmp;
2167
2168 if (!con)
2169 return;
2170
2171 if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
2172
2173
2174
2175
2176
2177 amdgpu_ras_enable_all_features(adev, 1);
2178
2179
2180
2181
2182
2183 list_for_each_entry_safe(obj, tmp, &con->head, node) {
2184 if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
2185 amdgpu_ras_feature_enable(adev, &obj->head, 0);
2186
2187 WARN_ON(alive_obj(obj));
2188 }
2189 }
2190 }
2191
2192 if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
2193 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
2194
2201 amdgpu_ras_disable_all_features(adev, 1);
2202 amdgpu_ras_reset_gpu(adev);
2203 }
2204}
2205
2206void amdgpu_ras_suspend(struct amdgpu_device *adev)
2207{
2208 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2209
2210 if (!con)
2211 return;
2212
2213 amdgpu_ras_disable_all_features(adev, 0);
2214
2215 if (con->features)
2216 amdgpu_ras_disable_all_features(adev, 1);
2217}
2218
2219
2220int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
2221{
2222 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2223
2224 if (!con)
2225 return 0;
2226
2227
2228 amdgpu_ras_disable_all_features(adev, 0);
2229 amdgpu_ras_recovery_fini(adev);
2230 return 0;
2231}
2232
2233int amdgpu_ras_fini(struct amdgpu_device *adev)
2234{
2235 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2236
2237 if (!con)
2238 return 0;
2239
2240 amdgpu_ras_fs_fini(adev);
2241 amdgpu_ras_interrupt_remove_all(adev);
2242
2243 WARN(con->features, "Feature mask is not cleared");
2244
2245 if (con->features)
2246 amdgpu_ras_disable_all_features(adev, 1);
2247
2248 amdgpu_ras_set_context(adev, NULL);
2249 kfree(con);
2250
2251 return 0;
2252}
2253
2254void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
2255{
2256 uint32_t hw_supported, supported;
2257
2258 amdgpu_ras_check_supported(adev, &hw_supported, &supported);
2259 if (!hw_supported)
2260 return;
2261
2262 if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
		dev_info(adev->dev, "uncorrectable hardware error "
			 "(ERREVENT_ATHUB_INTERRUPT) detected!\n");
2265
2266 amdgpu_ras_reset_gpu(adev);
2267 }
2268}
2269
2270bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
2271{
2272 if (adev->asic_type == CHIP_VEGA20 &&
2273 adev->pm.fw_version <= 0x283400) {
2274 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) &&
2275 amdgpu_ras_intr_triggered();
2276 }
2277
2278 return false;
2279}
2280
2281bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
2282{
2283 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2284 bool exc_err_limit = false;
2285
2286 if (con && (amdgpu_bad_page_threshold != 0))
2287 amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
2288 &exc_err_limit);
2289
2290
2291
2292
2293
2294 return exc_err_limit;
2295}
2296