#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"

struct ras_ih_data {
	/* interrupt bottom half */
	struct work_struct ih_work;
	int inuse;
	/* IP callback */
	ras_ih_cb cb;
	/* full of entries */
	unsigned char *ring;
	unsigned int ring_size;
	unsigned int element_size;
	unsigned int aligned_element_size;
	unsigned int rptr;
	unsigned int wptr;
};

struct ras_fs_data {
	char sysfs_name[32];
	char debugfs_name[32];
};

struct ras_err_data {
	unsigned long ue_count;
	unsigned long ce_count;
};

struct ras_err_handler_data {
	/* point to bad pages array */
	struct {
		unsigned long bp;
		struct amdgpu_bo *bo;
	} *bps;
	/* the count of entries */
	int count;
	/* the space can place new entries */
	int space_left;
	/* last reserved entry's index + 1 */
	int last_reserved;
};

struct ras_manager {
	struct ras_common_if head;
	/* reference count */
	int use;
	/* ras block link */
	struct list_head node;
	/* the device */
	struct amdgpu_device *adev;
	/* debugfs entry */
	struct dentry *ent;
	/* sysfs attribute */
	struct device_attribute sysfs_attr;
	int attr_inuse;

	/* fs node name */
	struct ras_fs_data fs_data;

	/* IH data */
	struct ras_ih_data ih_data;

	struct ras_err_data err_data;
};

const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};

const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};

#define ras_err_str(i) (ras_error_string[ffs(i)])
#define ras_block_str(i) (ras_block_string[i])

#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)

static void amdgpu_ras_self_test(struct amdgpu_device *adev)
{
	/* TODO: no self test is implemented yet */
}
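
/*
 * Read handler for the per-block debugfs node created by
 * amdgpu_ras_debugfs_create() below. Reading it queries the block's
 * error counters and returns them as text, e.g. (counter values are
 * illustrative):
 *
 *	ue: 0
 *	ce: 1
 */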
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
		size_t size, loff_t *pos)
{
	struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
	struct ras_query_if info = {
		.head = obj->head,
	};
	ssize_t s;
	char val[128];

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
	if (*pos >= s)
		return 0;

	s -= *pos;
	s = min_t(u64, s, size);

	if (copy_to_user(buf, &val[*pos], s))
		return -EINVAL;

	*pos += s;

	return s;
}

static const struct file_operations amdgpu_ras_debugfs_ops = {
	.owner = THIS_MODULE,
	.read = amdgpu_ras_debugfs_read,
	.write = NULL,
	.llseek = default_llseek
};

static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
		*block_id = i;
		if (strcmp(name, ras_block_str(i)) == 0)
			return 0;
	}
	return -EINVAL;
}

static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
		const char __user *buf, size_t size,
		loff_t *pos, struct ras_debug_if *data)
{
	ssize_t s = min_t(u64, 64, size);
	char str[65];
	char block_name[33];
	char err[9] = "ue";
	int op = -1;
	int block_id;
	u64 address, value;

	if (*pos)
		return -EINVAL;
	*pos = size;

	memset(str, 0, sizeof(str));
	memset(data, 0, sizeof(*data));

	if (copy_from_user(str, buf, s))
		return -EINVAL;

	if (sscanf(str, "disable %32s", block_name) == 1)
		op = 0;
	else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
		op = 1;
	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
		op = 2;
	else if (str[0] && str[1] && str[2] && str[3])
		/* ascii string, but commands are not matched. */
		return -EINVAL;

	if (op != -1) {
		if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
			return -EINVAL;

		data->head.block = block_id;
		data->head.type = memcmp("ue", err, 2) == 0 ?
			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
		data->op = op;

		if (op == 2) {
			/* address/value are accepted in decimal or 0x-hex */
			if (sscanf(str, "%*s %*s %*s %llu %llu",
						&address, &value) != 2)
				if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
							&address, &value) != 2)
					return -EINVAL;
			data->inject.address = address;
			data->inject.value = value;
		}
	} else {
		/* not an ascii command: accept a raw struct ras_debug_if */
		if (size < sizeof(*data))
			return -EINVAL;

		if (copy_from_user(data, buf, sizeof(*data)))
			return -EINVAL;
	}

	return 0;
}
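
/*
 * DOC: RAS debugfs control interface
 *
 * The parser above accepts either an ascii command or a raw
 * struct ras_debug_if. The ascii form is:
 *
 *	<op> <block> [<error> [<address> <value>]]
 *
 * where <op> is one of "disable" (op 0), "enable" (op 1) or "inject"
 * (op 2); <block> is one of the names in ras_block_string[] (e.g.
 * "umc", "gfx"); <error> is "ue" for multi_uncorrectable, anything
 * else (conventionally "ce") for single_correctable; and
 * <address>/<value>, required only by "inject", may be decimal or
 * 0x-prefixed hex.
 *
 * Example usage, assuming debugfs is mounted at /sys/kernel/debug and
 * the card is drm minor 0 (paths vary by system):
 *
 *	echo "enable umc ce" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "inject umc ue 0x0 0x0" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 *	echo "disable umc" > /sys/kernel/debug/dri/0/ras/ras_ctrl
 */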
static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
		size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
	struct ras_debug_if data;
	int ret = 0;

	ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
	if (ret)
		return -EINVAL;

	if (!amdgpu_ras_is_supported(adev, data.head.block))
		return -EINVAL;

	switch (data.op) {
	case 0:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
		break;
	case 1:
		ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
		break;
	case 2:
		ret = amdgpu_ras_error_inject(adev, &data.inject);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	if (ret)
		return -EINVAL;

	return size;
}

static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
	.owner = THIS_MODULE,
	.read = NULL,
	.write = amdgpu_ras_debugfs_ctrl_write,
	.llseek = default_llseek
};

static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
	struct ras_query_if info = {
		.head = obj->head,
	};

	if (amdgpu_ras_error_query(obj->adev, &info))
		return -EINVAL;

	return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
			"ue", info.ue_count,
			"ce", info.ce_count);
}
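
/*
 * Per-block manager objects are reference counted: get_obj()/put_obj()
 * bump and drop ->use, and the object is unlinked from the context's
 * list when the count reaches zero. alive_obj() tests whether the
 * slot is currently in use.
 */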
#define get_obj(obj) do { (obj)->use++; } while (0)
#define alive_obj(obj) ((obj)->use)

static inline void put_obj(struct ras_manager *obj)
{
	if (obj && --obj->use == 0)
		list_del(&obj->node);
	if (obj && obj->use < 0) {
		DRM_ERROR("RAS ERROR: Unbalanced obj(%s) use\n", obj->head.name);
	}
}

/* make one obj and return it. */
static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;

	if (!con)
		return NULL;

	if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
		return NULL;

	obj = &con->objs[head->block];
	/* already exists; the caller should look it up instead */
	if (alive_obj(obj))
		return NULL;

	obj->head = *head;
	obj->adev = adev;
	list_add(&obj->node, &con->head);
	get_obj(obj);

	return obj;
}
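
/*
 * Return the obj matching @head, or, when @head is NULL, the first
 * live obj in the context. NULL means nothing was found.
 */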
static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	int i;

	if (!con)
		return NULL;

	if (head) {
		if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
			return NULL;

		obj = &con->objs[head->block];

		if (alive_obj(obj)) {
			WARN_ON(head->block != obj->head.block);
			return obj;
		}
	} else {
		for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
			obj = &con->objs[i];
			if (alive_obj(obj)) {
				WARN_ON(i != obj->head.block);
				return obj;
			}
		}
	}

	return NULL;
}

/* feature ctl begin */

static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->hw_supported & BIT(head->block);
}

static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	return con->features & BIT(head->block);
}

/*
 * Update the software state (obj lifetime and con->features) for
 * @head without talking to the PSP/TA. Used directly when the
 * hardware state is already what we want, e.g. set up by vbios.
 */
static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, int enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	/* if hardware does not support ras, do not create the obj */
	if (!amdgpu_ras_is_feature_allowed(adev, head))
		return 0;
	/* nothing to do if we are already in the requested state */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	if (enable) {
		if (!obj) {
			obj = amdgpu_ras_create_obj(adev, head);
			if (!obj)
				return -EINVAL;
		} else {
			/* in case the obj was created somewhere else */
			get_obj(obj);
		}
		con->features |= BIT(head->block);
	} else {
		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
			con->features &= ~BIT(head->block);
			put_obj(obj);
		}
	}

	return 0;
}

/* wrapper of psp_ras_enable_features */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	union ta_ras_cmd_input info;
	int ret;

	if (!con)
		return -EINVAL;

	if (!enable) {
		info.disable_features = (struct ta_ras_disable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	} else {
		info.enable_features = (struct ta_ras_enable_features_input) {
			.block_id = amdgpu_ras_block_to_ta(head->block),
			.error_type = amdgpu_ras_error_to_ta(head->type),
		};
	}

	/* Do not enable if it is not allowed. */
	WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
	/* Are we already in the state we are going to set? */
	if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
		return 0;

	ret = psp_ras_enable_features(&adev->psp, &info, enable);
	if (ret) {
		DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
				enable ? "enable":"disable",
				ras_block_str(head->block),
				ret);
		return -EINVAL;
	}

	/* setup the obj */
	__amdgpu_ras_feature_enable(adev, head, enable);

	return 0;
}
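
/*
 * Boot-time variant: when vbios has already enabled ras hardware
 * (AMDGPU_RAS_FLAG_INIT_BY_VBIOS), first mirror that enabled state in
 * software via __amdgpu_ras_feature_enable(), and only then issue a
 * real disable through the TA if the caller asked for disable.
 */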
int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
		struct ras_common_if *head, bool enable)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ret;

	if (!con)
		return -EINVAL;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		ret = __amdgpu_ras_feature_enable(adev, head, 1);
		if (ret)
			return ret;

		if (!enable)
			ret = amdgpu_ras_feature_enable(adev, head, 0);
	} else
		ret = amdgpu_ras_feature_enable(adev, head, enable);

	return ret;
}

static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		/* bypass psp.
		 * aka just release the obj and corresponding flags
		 */
		if (bypass) {
			if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
				break;
		}
	}

	return con->features;
}

static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
		bool bypass)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	const enum amdgpu_ras_error_type default_ras_type =
		AMDGPU_RAS_ERROR__NONE;

	for (i = 0; i < ras_block_count; i++) {
		struct ras_common_if head = {
			.block = i,
			.type = default_ras_type,
			.sub_block_index = 0,
		};
		strcpy(head.name, ras_block_str(i));
		if (bypass) {
			/* bypass psp. vbios enabled ras for us */
			if (__amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		} else {
			if (amdgpu_ras_feature_enable(adev, &head, 1))
				break;
		}
	}

	return con->features;
}
/* feature ctl end */
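
/* query/inject/cure begin */

/*
 * Query the error counters cached for one block. They are accumulated
 * by the interrupt bottom half below; note that in this version only
 * the UE path actually increments them.
 */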
int amdgpu_ras_error_query(struct amdgpu_device *adev,
		struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);

	if (!obj)
		return -EINVAL;

	info->ue_count = obj->err_data.ue_count;
	info->ce_count = obj->err_data.ce_count;

	return 0;
}

/* wrapper of psp_ras_trigger_error */
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
		struct ras_inject_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ta_ras_trigger_error_input block_info = {
		.block_id = amdgpu_ras_block_to_ta(info->head.block),
		.inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
		.sub_block_index = info->head.sub_block_index,
		.address = info->address,
		.value = info->value,
	};
	int ret = 0;

	if (!obj)
		return -EINVAL;

	ret = psp_ras_trigger_error(&adev->psp, &block_info);
	if (ret)
		DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
				ras_block_str(info->head.block),
				ret);

	return ret;
}

int amdgpu_ras_error_cure(struct amdgpu_device *adev,
		struct ras_cure_if *info)
{
	/* TODO: psp does not expose a cure interface yet */
	return 0;
}
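
/*
 * Sum the ue or ce counters across every live obj. Returns the total,
 * or -EINVAL if the context is missing or any per-block query fails;
 * note the int return type conflates counts with the error code.
 */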
int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
		bool is_ce)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj;
	struct ras_err_data data = {0, 0};

	if (!con)
		return -EINVAL;

	list_for_each_entry(obj, &con->head, node) {
		struct ras_query_if info = {
			.head = obj->head,
		};

		if (amdgpu_ras_error_query(adev, &info))
			return -EINVAL;

		data.ce_count += info.ce_count;
		data.ue_count += info.ue_count;
	}

	return is_ce ? data.ce_count : data.ue_count;
}
/* query/inject/cure end */
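
/* sysfs begin */

/*
 * Show handler for the "features" node in the device's "ras" sysfs
 * group. Reading it prints the feature mask plus one line per block,
 * e.g. (values are illustrative):
 *
 *	feature mask: 0x00000001
 *	umc: single_correctable
 *	sdma: disabled
 */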
static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct amdgpu_ras *con =
		container_of(attr, struct amdgpu_ras, features_attr);
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;
	struct ras_common_if head;
	int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
	int i;
	ssize_t s;
	struct ras_manager *obj;

	s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);

	for (i = 0; i < ras_block_count; i++) {
		head.block = i;

		if (amdgpu_ras_is_feature_enabled(adev, &head)) {
			obj = amdgpu_ras_find_obj(adev, &head);
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: %s\n",
					ras_block_str(i),
					ras_err_str(obj->head.type));
		} else
			s += scnprintf(&buf[s], PAGE_SIZE - s,
					"%s: disabled\n",
					ras_block_str(i));
	}

	return s;
}

static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	con->features_attr = (struct device_attribute) {
		.attr = {
			.name = "features",
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_features_read,
	};
	sysfs_attr_init(attrs[0]);

	return sysfs_create_group(&adev->dev->kobj, &group);
}

static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct attribute *attrs[] = {
		&con->features_attr.attr,
		NULL
	};
	struct attribute_group group = {
		.name = "ras",
		.attrs = attrs,
	};

	sysfs_remove_group(&adev->dev->kobj, &group);

	return 0;
}

int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);

	if (!obj || obj->attr_inuse)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.sysfs_name,
			head->sysfs_name,
			sizeof(obj->fs_data.sysfs_name));

	obj->sysfs_attr = (struct device_attribute){
		.attr = {
			.name = obj->fs_data.sysfs_name,
			.mode = S_IRUGO,
		},
		.show = amdgpu_ras_sysfs_read,
	};
	sysfs_attr_init(&obj->sysfs_attr.attr);

	if (sysfs_add_file_to_group(&adev->dev->kobj,
				&obj->sysfs_attr.attr,
				"ras")) {
		put_obj(obj);
		return -EINVAL;
	}

	obj->attr_inuse = 1;

	return 0;
}

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->attr_inuse)
		return -EINVAL;

	sysfs_remove_file_from_group(&adev->dev->kobj,
			&obj->sysfs_attr.attr,
			"ras");
	obj->attr_inuse = 0;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_sysfs_remove(adev, &obj->head);
	}

	amdgpu_ras_sysfs_remove_feature_node(adev);

	return 0;
}
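
/* sysfs end */

/* debugfs begin */

/*
 * Create the per-device "ras" directory under the drm minor's debugfs
 * root, plus the "ras_ctrl" node in it; on a typical system this
 * shows up as /sys/kernel/debug/dri/<n>/ras/ras_ctrl (see the DOC
 * comment above for the accepted commands).
 */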
static int amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct drm_minor *minor = adev->ddev->primary;
	struct dentry *root = minor->debugfs_root, *dir;
	struct dentry *ent;

	dir = debugfs_create_dir("ras", root);
	if (IS_ERR(dir))
		return -EINVAL;

	con->dir = dir;

	ent = debugfs_create_file("ras_ctrl",
			S_IWUGO | S_IRUGO, con->dir,
			adev, &amdgpu_ras_debugfs_ctrl_ops);
	if (IS_ERR(ent)) {
		debugfs_remove(con->dir);
		return -EINVAL;
	}

	con->ent = ent;
	return 0;
}

int amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
		struct ras_fs_if *head)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
	struct dentry *ent;

	if (!obj || obj->ent)
		return -EINVAL;

	get_obj(obj);

	memcpy(obj->fs_data.debugfs_name,
			head->debugfs_name,
			sizeof(obj->fs_data.debugfs_name));

	ent = debugfs_create_file(obj->fs_data.debugfs_name,
			S_IWUGO | S_IRUGO, con->dir,
			obj, &amdgpu_ras_debugfs_ops);

	if (IS_ERR(ent)) {
		/* drop the reference taken above */
		put_obj(obj);
		return -EINVAL;
	}

	obj->ent = ent;

	return 0;
}

int amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
		struct ras_common_if *head)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);

	if (!obj || !obj->ent)
		return 0;

	debugfs_remove(obj->ent);
	obj->ent = NULL;
	put_obj(obj);

	return 0;
}

static int amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		amdgpu_ras_debugfs_remove(adev, &obj->head);
	}

	debugfs_remove(con->ent);
	debugfs_remove(con->dir);
	con->dir = NULL;
	con->ent = NULL;

	return 0;
}
/* debugfs end */

/* ras fs */

static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
{
	amdgpu_ras_sysfs_create_feature_node(adev);
	amdgpu_ras_debugfs_create_ctrl_node(adev);

	return 0;
}

static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
{
	amdgpu_ras_debugfs_remove_all(adev);
	amdgpu_ras_sysfs_remove_all(adev);
	return 0;
}
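
/* ih begin */

/*
 * Bottom half of the RAS interrupt path: drain entries from the
 * per-obj ring buffer that amdgpu_ras_interrupt_dispatch() fills from
 * the top half, and hand each one to the IP callback. rptr is only
 * advanced here and wptr only in the dispatcher, so the two sides
 * never write the same field.
 */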
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
{
	struct ras_ih_data *data = &obj->ih_data;
	struct amdgpu_iv_entry entry;
	int ret;

	while (data->rptr != data->wptr) {
		rmb();
		memcpy(&entry, &data->ring[data->rptr],
				data->element_size);

		wmb();
		data->rptr = (data->aligned_element_size +
				data->rptr) % data->ring_size;

		/* Let IP handle its data, maybe we need get the output
		 * from the callback to update the error type/count, etc
		 */
		if (data->cb) {
			ret = data->cb(obj->adev, &entry);
			/* ue will trigger an interrupt, and in that case
			 * we need do a reset to recover the whole system.
			 * But leave IP do that recovery, here we just dispatch
			 * the error.
			 */
			if (ret == AMDGPU_RAS_UE) {
				obj->err_data.ue_count++;
			}
			/* Might need get ce count by register, but not all IP
			 * saves ce count, some IP just use one bit or two bits
			 * to indicate ce happened.
			 */
		}
	}
}

static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
{
	struct ras_ih_data *data =
		container_of(work, struct ras_ih_data, ih_work);
	struct ras_manager *obj =
		container_of(data, struct ras_manager, ih_data);

	amdgpu_ras_interrupt_handler(obj);
}

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
		struct ras_dispatch_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	/* Might be overflow... */
	memcpy(&data->ring[data->wptr], info->entry,
			data->element_size);

	wmb();
	data->wptr = (data->aligned_element_size +
			data->wptr) % data->ring_size;

	schedule_work(&data->ih_work);

	return 0;
}

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj)
		return -EINVAL;

	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	cancel_work_sync(&data->ih_work);

	kfree(data->ring);
	memset(data, 0, sizeof(*data));
	put_obj(obj);

	return 0;
}

int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
		struct ras_ih_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_ih_data *data;

	if (!obj) {
		/* in case we register the IH before enabling ras feature */
		obj = amdgpu_ras_create_obj(adev, &info->head);
		if (!obj)
			return -EINVAL;
	} else
		get_obj(obj);

	data = &obj->ih_data;
	/* add the callback */
	*data = (struct ras_ih_data) {
		.inuse = 0,
		.cb = info->cb,
		.element_size = sizeof(struct amdgpu_iv_entry),
		.rptr = 0,
		.wptr = 0,
	};

	INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);

	data->aligned_element_size = ALIGN(data->element_size, 8);
	/* the ring can store 64 iv entries */
	data->ring_size = 64 * data->aligned_element_size;
	data->ring = kmalloc(data->ring_size, GFP_KERNEL);
	if (!data->ring) {
		put_obj(obj);
		return -ENOMEM;
	}

	/* IH is ready */
	data->inuse = 1;

	return 0;
}

static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	list_for_each_entry_safe(obj, tmp, &con->head, node) {
		struct ras_ih_if info = {
			.head = obj->head,
		};
		amdgpu_ras_interrupt_remove_handler(adev, &info);
	}

	return 0;
}
/* ih end */
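
/* recovery begin */

/*
 * Bad-page handling: faulty VRAM pages are remembered in
 * con->eh_data->bps and taken out of circulation by pinning a kernel
 * BO over each page, so the allocator cannot hand them out again.
 * GPU reset itself is deferred to a worker.
 */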
static void amdgpu_ras_do_recovery(struct work_struct *work)
{
	struct amdgpu_ras *ras =
		container_of(work, struct amdgpu_ras, recovery_work);

	amdgpu_device_gpu_recover(ras->adev, 0);
	atomic_set(&ras->in_recovery, 0);
}

static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr)
{
	/* unpin and free the reserved bo */
	amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
	return 0;
}
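
/*
 * Reserve the specific VRAM range [offset, offset + size) by creating
 * a kernel BO, restricting its placement to exactly those pfns, and
 * pinning it there. Success means nobody else can be given that page.
 */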
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr)
{
	struct ttm_operation_ctx ctx = { false, false };
	struct amdgpu_bo_param bp;
	int r = 0;
	int i;
	struct amdgpu_bo *bo;

	if (bo_ptr)
		*bo_ptr = NULL;
	memset(&bp, 0, sizeof(bp));
	bp.size = size;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
		AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.type = ttm_bo_type_kernel;
	bp.resv = NULL;

	r = amdgpu_bo_create(adev, &bp, &bo);
	if (r)
		return -EINVAL;

	r = amdgpu_bo_reserve(bo, false);
	if (r)
		goto error_reserve;

	offset = ALIGN(offset, PAGE_SIZE);
	for (i = 0; i < bo->placement.num_placement; ++i) {
		bo->placements[i].fpfn = offset >> PAGE_SHIFT;
		bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
	}

	ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
	r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
	if (r)
		goto error_pin;

	r = amdgpu_bo_pin_restricted(bo,
			AMDGPU_GEM_DOMAIN_VRAM,
			offset,
			offset + size);
	if (r)
		goto error_pin;

	if (bo_ptr)
		*bo_ptr = bo;

	amdgpu_bo_unreserve(bo);
	return r;

error_pin:
	amdgpu_bo_unreserve(bo);
error_reserve:
	amdgpu_bo_unref(&bo);
	return r;
}
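
/*
 * Grow the bps array so at least @pages more entries fit: allocate a
 * new array rounded up to a multiple of 1024 entries, copy the old
 * records over and free the old array.
 */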
static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
		struct ras_err_handler_data *data, int pages)
{
	unsigned int old_space = data->count + data->space_left;
	unsigned int new_space = old_space + pages;
	unsigned int align_space = ALIGN(new_space, 1024);
	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;

	if (data->bps) {
		memcpy(tmp, data->bps,
				data->count * sizeof(*data->bps));
		kfree(data->bps);
	}

	data->bps = tmp;
	data->space_left += align_space - old_space;
	return 0;
}
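
/*
 * Record @pages bad page numbers under the recovery lock, growing the
 * array first if not enough free slots remain. The pages are only
 * recorded here; amdgpu_ras_reserve_bad_pages() pins them later.
 */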
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
		unsigned long *bps, int pages)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	int i = pages;
	int ret = 0;

	if (!con || !con->eh_data || !bps || pages <= 0)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	if (data->space_left <= pages) {
		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
			ret = -ENOMEM;
			goto out;
		}
	}

	while (i--)
		data->bps[data->count++].bp = bps[i];

	data->space_left -= pages;
out:
	mutex_unlock(&con->recovery_lock);

	return ret;
}
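
/*
 * Pin every not-yet-reserved bad page; called during gpu recovery and
 * on init after the saved list is loaded. A failed reservation is
 * logged and the NULL bo recorded, but last_reserved still advances,
 * so that page will not be retried.
 */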
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	uint64_t bp;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved; i < data->count; i++) {
		bp = data->bps[i].bp;

		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
					PAGE_SIZE, &bo))
			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);

		data->bps[i].bo = bo;
		data->last_reserved = i + 1;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

/* called when driver unload */
static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data;
	struct amdgpu_bo *bo;
	int i;

	if (!con || !con->eh_data)
		return 0;

	mutex_lock(&con->recovery_lock);
	data = con->eh_data;
	if (!data)
		goto out;

	for (i = data->last_reserved - 1; i >= 0; i--) {
		bo = data->bps[i].bo;

		amdgpu_ras_release_vram(adev, &bo);

		data->bps[i].bo = bo;
		data->last_reserved = i;
	}
out:
	mutex_unlock(&con->recovery_lock);
	return 0;
}

static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO: persisting the bad page records is not implemented yet */
	return 0;
}

static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO: loading previously saved bad page records is not
	 * implemented yet
	 */
	return 0;
}

static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data = &con->eh_data;

	*data = kmalloc(sizeof(**data),
			GFP_KERNEL|__GFP_ZERO);
	if (!*data)
		return -ENOMEM;

	mutex_init(&con->recovery_lock);
	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
	atomic_set(&con->in_recovery, 0);
	con->adev = adev;

	amdgpu_ras_load_bad_pages(adev);
	amdgpu_ras_reserve_bad_pages(adev);

	return 0;
}

static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *data = con->eh_data;

	cancel_work_sync(&con->recovery_work);
	amdgpu_ras_save_bad_pages(adev);
	amdgpu_ras_release_bad_pages(adev);

	mutex_lock(&con->recovery_lock);
	con->eh_data = NULL;
	kfree(data->bps);
	kfree(data);
	mutex_unlock(&con->recovery_lock);

	return 0;
}
/* recovery end */

/*
 * Probe the device's RAS ability. hw_supported records what the
 * hardware can do (currently: every block, if a bare-metal VEGA20
 * vbios reports memory or sram ECC); supported is hw_supported
 * further masked by the amdgpu_ras_enable and amdgpu_ras_mask module
 * parameters, so software may restrict but never extend the hardware
 * ability.
 */
static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
		uint32_t *hw_supported, uint32_t *supported)
{
	*hw_supported = 0;
	*supported = 0;

	if (amdgpu_sriov_vf(adev) ||
			adev->asic_type != CHIP_VEGA20)
		return;

	if (adev->is_atom_fw &&
			(amdgpu_atomfirmware_mem_ecc_supported(adev) ||
			 amdgpu_atomfirmware_sram_ecc_supported(adev)))
		*hw_supported = AMDGPU_RAS_BLOCK_MASK;

	*supported = amdgpu_ras_enable == 0 ?
			0 : *hw_supported & amdgpu_ras_mask;
}

int amdgpu_ras_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con)
		return 0;

	con = kmalloc(sizeof(struct amdgpu_ras) +
			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
			GFP_KERNEL|__GFP_ZERO);
	if (!con)
		return -ENOMEM;

	con->objs = (struct ras_manager *)(con + 1);

	amdgpu_ras_set_context(adev, con);

	amdgpu_ras_check_supported(adev, &con->hw_supported,
			&con->supported);
	con->features = 0;
	INIT_LIST_HEAD(&con->head);

	con->flags = RAS_DEFAULT_FLAGS;

	if (amdgpu_ras_recovery_init(adev))
		goto recovery_out;

	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

	if (amdgpu_ras_fs_init(adev))
		goto fs_out;

	amdgpu_ras_self_test(adev);

	DRM_INFO("RAS INFO: ras initialized successfully, "
			"hardware ability[%x] ras_mask[%x]\n",
			con->hw_supported, con->supported);
	return 0;
fs_out:
	amdgpu_ras_recovery_fini(adev);
recovery_out:
	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return -EINVAL;
}
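
/*
 * Runs after IP late init (and again on resume/gpu reset). When the
 * vbios pre-enabled ras, adopt that state in software for every
 * block, then turn off the blocks the running configuration does not
 * actually support.
 */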
void amdgpu_ras_post_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_manager *obj, *tmp;

	if (!con)
		return;

	if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
		/* vbios has enabled ras for all supported blocks;
		 * mirror that in software, bypassing psp.
		 */
		amdgpu_ras_enable_all_features(adev, 1);

		/* boot parameters may forbid some of those blocks,
		 * so disable them on its behalf.
		 */
		list_for_each_entry_safe(obj, tmp, &con->head, node) {
			if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
				amdgpu_ras_feature_enable(adev, &obj->head, 0);
				/* there should be no leftover reference */
				WARN_ON(alive_obj(obj));
			}
		}
	}
}

/* do some fini work before IP fini as dependence */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	/* Need disable ras on all IPs here before ip [hw/sw]fini */
	amdgpu_ras_disable_all_features(adev, 0);
	amdgpu_ras_recovery_fini(adev);
	return 0;
}

int amdgpu_ras_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con)
		return 0;

	amdgpu_ras_fs_fini(adev);
	amdgpu_ras_interrupt_remove_all(adev);

	WARN(con->features, "Feature mask is not cleared");

	if (con->features)
		amdgpu_ras_disable_all_features(adev, 1);

	amdgpu_ras_set_context(adev, NULL);
	kfree(con);

	return 0;
}