#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* point to bad pages array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space left for new entries */
	int space_left;
	/* the count of entries already reserved in vram */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs entry */
	struct dentry *ent;
	/* sysfs attribute */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

/**
 * struct ras_badpage - bad page structure
 * @bp: the gpu pfn of the bad page
 * @size: the size of the bad page in bytes
 * @flags: 0 reserved, 1 pending for reserve, 2 unable to reserve
 */
struct ras_badpage {
	unsigned int bp;
	unsigned int size;
	unsigned int flags;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

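/* AMDGPU_RAS_FLAG_INIT_BY_VBIOS: the vbios may have enabled ras before the
 * driver loads, so amdgpu_ras_feature_enable_on_boot() sets up objects for
 * already-enabled features instead of only going through the psp/TA path,
 * and amdgpu_ras_resume() creates objects for all blocks and then disables
 * the unsupported ones.
 * AMDGPU_RAS_FLAG_INIT_NEED_RESET: amdgpu_ras_resume() should tear all
 * features down and reset the gpu before normal operation continues.
 */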
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr);
static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr);

static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			/* accept address/value in decimal or hex */
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
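
/**
 * DOC: AMDGPU RAS debugfs control interface
 *
 * The control node accepts either a struct ras_debug_if written to it
 * directly, or an ascii command of the form
 *
 *	echo "op block [error [address value]]" > .../ras/ras_ctrl
 *
 *	op: disable, enable, inject
 *		disable: only the block is needed
 *		enable: the block and error are needed
 *		inject: the error, address and value are needed
 *	block: umc, sdma, gfx, ... (see ras_block_string[] above)
 *	error: ue (multi_uncorrectable), ce (single_correctable)
 *	address/value: decimal or 0x-prefixed hex
 *
 * The result of disable/enable can be checked in the "features" sysfs
 * node; injected error counts show up in the per-block sysfs nodes
 * created by amdgpu_ras_sysfs_create().
 */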
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	struct amdgpu_bo *bo;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_reserve_vram(adev,
				data.inject.address, PAGE_SIZE, &bo);
		if (ret) {
			/* address was offset, now it is absolute. */
			data.inject.address += adev->gmc.vram_start;
			if (data.inject.address > adev->gmc.vram_end)
				break;
		} else
			data.inject.address = amdgpu_bo_gpu_offset(bo);
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		amdgpu_ras_release_vram(adev, &bo);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}

/* obj begin */

#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists, do not create it again */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}

/* return an obj equal to head, or the first when head is NULL */
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}

/* obj end */

/* feature ctl begin */

static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* If the hardware does not support ras, do not create the obj.
	 * But if the hardware does support ras, the obj can be created.
	 * The ras framework checks con->hw_supported to see whether it
	 * needs to do the corresponding initialization.
	 * IPs check con->supported to see whether they need to disable ras.
	 */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* in case the obj was created somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the target state? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		if (ret == TA_RAS_STATUS__RESET_NEEDED)
			return -EAGAIN;
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}

/* Only used in device probe stage and called only once. */
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		if (enable) {
			/* There is no harm in issuing a ras TA cmd regardless
			 * of the current ras state.
			 * If current state == target state, it does nothing.
			 * But sometimes it requests the driver to reset and
			 * repost with error code -EAGAIN.
			 */
			ret = amdgpu_ras_feature_enable(adev, head, 1);
			/* With an old ras TA, we might fail to enable ras.
			 * Log it and just set up the object.
			 * TODO: remove this WA in the future.
			 */
			if (ret == -EINVAL) {
				ret = __amdgpu_ras_feature_enable(adev, head, 1);
				if (!ret)
					DRM_INFO("RAS INFO: %s setup object\n",
						ras_block_str(head->block));
			}
		} else {
			/* setup the object then issue a ras TA disable cmd. */
			ret = __amdgpu_ras_feature_enable(adev, head, 1);
			if (ret)
				return ret;

			ret = amdgpu_ras_feature_enable(adev, head, 0);
		}
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/* bypass psp. vbios has enabled ras for us.
			 * so just create the obj
			 */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}

/* feature ctl end */

/* query/inject/cure begin */

int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;
	/* TODO: might read the register to read the count */

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	if (block_info.block_id != TA_RAS_BLOCK__UMC) {
		DRM_INFO("%s error injection is not supported yet\n",
				ras_block_str(info->head.block));
		return -EINVAL;
	}

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* psp fw has no cure interface for now. */
	return 0;
}

/* get the total error counts on all IPs */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}

/* query/inject/cure end */


/* sysfs begin */

static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);

static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
{
	switch (flags) {
	case 0:
		return "R";
	case 1:
		return "P";
	case 2:
	default:
		return "F";
	}
}

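/**
 * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages interface
 *
 * Userspace can read the bad pages of vram through
 * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 *
 * One line per gpu page, formatted as
 *
 *	gpu pfn : gpu page size : flags
 *
 * in hex, where flags is one of
 *
 *	R: reserved, this gpu page is reserved and not able to use.
 *	P: pending for reserve, this gpu page is marked as pending for reserve.
 *	F: failed to reserve, this gpu page can not be reserved.
 *
 * example:
 *	0x00000001 : 0x00001000 : R
 *	0x00000002 : 0x00001000 : P
 */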
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
		struct kobject *kobj, struct bin_attribute *attr,
		char *buf, loff_t ppos, size_t count)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, badpages_attr);
	struct amdgpu_device *adev = con->adev;
	const unsigned int element_size =
		sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
	unsigned int start = div64_ul(ppos + element_size - 1, element_size);
	unsigned int end = div64_ul(ppos + count - 1, element_size);
	ssize_t s = 0;
	struct ras_badpage *bps = NULL;
	unsigned int bps_count = 0;

	memset(buf, 0, count);

	if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
		return 0;

	for (; start < end && start < bps_count; start++)
		s += scnprintf(&buf[s], element_size + 1,
				"0x%08x : 0x%08x : %1s\n",
				bps[start].bp,
				bps[start].size,
				amdgpu_ras_badpage_flags_str(bps[start].flags));

	kfree(bps);

	return s;
}

static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};

	con->badpages_attr = (struct bin_attribute) {
		.attr = {
			.name = "gpu_vram_bad_pages",
			.mode = S_IRUGO,
		},
		.size = 0,
		.private = NULL,
		.read = amdgpu_ras_sysfs_badpages_read,
	};

	sysfs_attr_init(attrs[0]);
	sysfs_bin_attr_init(bin_attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct bin_attribute *bin_attrs[] = {
		&con->badpages_attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
		.bin_attrs = bin_attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
			&obj->sysfs_attr.attr,
			"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}

/* sysfs end */

/* debugfs begin */

static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;

	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
	con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
			adev, &amdgpu_ras_debugfs_ctrl_ops);
}

void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->ent)
		return;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir, obj,
			&amdgpu_ras_debugfs_ops);
}

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);
}

static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;
}

/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}

/* ras fs end */

/* ih begin */
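
/* The IH path uses a simple byte ring per ras_manager:
 * amdgpu_ras_interrupt_dispatch() (top half) copies the iv entry in at
 * wptr and schedules ih_work; amdgpu_ras_interrupt_handler() (bottom
 * half) consumes entries at rptr and hands them to the IP callback.
 * Entries have a fixed size, so rptr/wptr advance by aligned_element_size.
 */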

static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let the IP handle its data; we may need to get the output
		 * from the callback to update the error type/count, etc.
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need a reset to recover the whole system.
			 * But leave the IP to do that recovery; here we just
			 * dispatch the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need to get the ce count from a register, but
			 * not all IPs save a ce count; some only use one or
			 * two bits to indicate that a ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;

	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries. */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}
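
/* Illustrative only (the helper names below are hypothetical, not from this
 * file): an IP block would typically wire itself up in its late init, e.g.
 *
 *	struct ras_ih_if ih_info = { .head = head, .cb = my_block_process_cb };
 *	struct ras_fs_if fs_info = { .head = head,
 *				     .sysfs_name = "myblock_err_count",
 *				     .debugfs_name = "myblock_err_inject" };
 *
 *	amdgpu_ras_feature_enable_on_boot(adev, &head, true);
 *	amdgpu_ras_interrupt_add_handler(adev, &ih_info);
 *	amdgpu_ras_sysfs_create(adev, &fs_info);
 *	amdgpu_ras_debugfs_create(adev, &fs_info);
 */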

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}

/* ih end */

/* recovery begin */

/* return 0 on success.
 * the caller must free *bps.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = 0;
	int ret = 0;

	if (!con || !con->eh_data || !bps || !count)
		return -EINVAL;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data || data->count == 0) {
		*bps = NULL;
		goto out;
	}

	*bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
	if (!*bps) {
		ret = -ENOMEM;
		goto out;
	}

	for (; i < data->count; i++) {
		(*bps)[i] = (struct ras_badpage){
			.bp = data->bps[i].bp,
			.size = AMDGPU_GPU_PAGE_SIZE,
			.flags = 0,
		};

		if (data->last_reserved <= i)
			(*bps)[i].flags = 1;
		else if (data->bps[i].bo == NULL)
			(*bps)[i].flags = 2;
	}

	*count = data->count;
out:
	mutex_unlock(&con->recovery_lock);
	return ret;
}

static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* unpin the bo, free it and clear *bo_ptr */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}

/* reserve vram with size @offset */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	/* force the bo to be placed exactly at [offset, offset + size) */
	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}

/* alloc/realloc the bps array */
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}

/* it deals with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages)
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;
	/* reserve vram at driver post stage. */
	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when driver unload */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * write the array to eeprom when SMU disabled.
	 */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO
	 * read the array from eeprom when SMU disabled.
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}

/* recovery end */

/* return 0 if ras will reset gpu and repost. */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
		unsigned int block)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	if (!ras)
		return -EINVAL;

	ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
	return 0;
}

/*
 * check hardware's ras ability, which will be saved in hw_supported.
 * if the hardware does not support ras, we can skip some ras initialization
 * and forbid the ras operations from IPs.
 * if software itself, say a boot parameter, limits the ras ability, we still
 * need to allow IPs to do some limited operations, like disable. In such a
 * case, we have to initialize ras as normal, but check in each function
 * whether the operation is allowed or not.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
		0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	if (!con->hw_supported) {
		amdgpu_ras_set_context(adev, NULL);
		kfree(con);
		return 0;
	}

	con->features = 0;
	INIT_LIST_HEAD(&con->head);
	/* Might need to get this info from vbios. */
	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}

/* do some init work after IP late init as dependence.
 * it runs in the resume/gpu reset/booting up cases.
 */
void amdgpu_ras_resume(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* Set up all other IPs which are not implemented. There is a
		 * tricky thing that an IP's actual ras error type should be
		 * MULTI_UNCORRECTABLE, but as the driver does not handle it,
		 * ERROR_NONE makes sense anyway.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* We enable ras on all hw_supported blocks, but the boot
		 * parameter might disable some of them and one or more IPs
		 * may not be implemented yet. So we disable them on behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no reference left. */
				WARN_ON(alive_obj(obj));
			}
		}
	}

	if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
		con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
		/* setup ras obj state as disabled.
		 * for the init_by_vbios case.
		 * if we want to enable ras, just enable it in a normal way.
		 * If we want to disable it, we need to set up the ras obj as
		 * enabled, then issue another TA disable cmd.
		 * See feature_enable_on_boot
		 */
		amdgpu_ras_disable_all_features(adev, 1);
		amdgpu_ras_reset_gpu(adev, 0);
	}
}

void amdgpu_ras_suspend(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return;

	amdgpu_ras_disable_all_features(adev, 0);
	/* Make sure all ras objects are disabled. */
	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);
}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need to disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}