1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24#ifndef _AMDGPU_RAS_H
25#define _AMDGPU_RAS_H
26
27#include <linux/debugfs.h>
28#include <linux/list.h>
29#include "amdgpu.h"
30#include "amdgpu_psp.h"
31#include "ta_ras_if.h"
32
/*
 * Hardware blocks known to the RAS code.
 *
 * The identifiers are consecutive, starting at zero. They double as bit
 * positions: amdgpu_ras_is_supported() tests bit (1 << block) of the
 * per-device support mask, and AMDGPU_RAS_BLOCK_MASK is derived from the
 * number of entries. amdgpu_ras_block_to_ta() maps each entry to the
 * matching enum ta_ras_block value.
 *
 * AMDGPU_RAS_BLOCK__LAST is not a block. It is left without an explicit
 * value so that it always equals the number of blocks listed above it.
 */
enum amdgpu_ras_block {
	AMDGPU_RAS_BLOCK__UMC		= 0,
	AMDGPU_RAS_BLOCK__SDMA		= 1,
	AMDGPU_RAS_BLOCK__GFX		= 2,
	AMDGPU_RAS_BLOCK__MMHUB		= 3,
	AMDGPU_RAS_BLOCK__ATHUB		= 4,
	AMDGPU_RAS_BLOCK__PCIE_BIF	= 5,
	AMDGPU_RAS_BLOCK__HDP		= 6,
	AMDGPU_RAS_BLOCK__XGMI_WAFL	= 7,
	AMDGPU_RAS_BLOCK__DF		= 8,
	AMDGPU_RAS_BLOCK__SMN		= 9,
	AMDGPU_RAS_BLOCK__SEM		= 10,
	AMDGPU_RAS_BLOCK__MP0		= 11,
	AMDGPU_RAS_BLOCK__MP1		= 12,
	AMDGPU_RAS_BLOCK__FUSE		= 13,

	/* Keep last: one past the highest valid block id. */
	AMDGPU_RAS_BLOCK__LAST
};
51
/* Number of RAS blocks: the one-past-the-end enumerator of amdgpu_ras_block. */
#define AMDGPU_RAS_BLOCK_COUNT	AMDGPU_RAS_BLOCK__LAST
/*
 * Unsigned long long mask with bits 0 .. AMDGPU_RAS_BLOCK_COUNT - 1 set,
 * i.e. one bit per RAS block. Valid while the block count stays below 64.
 */
#define AMDGPU_RAS_BLOCK_MASK	(~(~0ULL << AMDGPU_RAS_BLOCK_COUNT))
54
/*
 * Kinds of RAS error.
 *
 * Every type other than NONE occupies a distinct single bit, which is why
 * the values are written as shifts. amdgpu_ras_error_to_ta() converts one
 * of these values to the corresponding enum ta_ras_error_type; it accepts
 * exactly the five values below and warns on anything else.
 */
enum amdgpu_ras_error_type {
	AMDGPU_RAS_ERROR__NONE			= 0,
	AMDGPU_RAS_ERROR__PARITY		= 1 << 0,
	AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE	= 1 << 1,
	AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE	= 1 << 2,
	AMDGPU_RAS_ERROR__POISON		= 1 << 3,
};
62
/*
 * Result codes used by the RAS code.
 *
 * AMDGPU_RAS_SUCCESS is zero; every other code is non-zero.
 * NOTE(review): UE and CE presumably stand for "uncorrectable error" and
 * "correctable error" (compare ue_count / ce_count in struct ras_query_if).
 * The meaning of PT is not visible in this header - confirm against the
 * users of this enum.
 */
enum amdgpu_ras_ret {
	AMDGPU_RAS_SUCCESS	= 0,
	AMDGPU_RAS_FAIL		= 1,
	AMDGPU_RAS_UE		= 2,
	AMDGPU_RAS_CE		= 3,
	AMDGPU_RAS_PT		= 4,
};
70
71struct ras_common_if {
72 enum amdgpu_ras_block block;
73 enum amdgpu_ras_error_type type;
74 uint32_t sub_block_index;
75
76 char name[32];
77};
78
/*
 * Callback type stored in struct ras_ih_if::cb.
 *
 * Receives the device and an interrupt vector entry and returns an int.
 * NOTE(review): the meaning of the return value is not defined in this
 * header; confirm against the code that invokes the callback.
 */
typedef int (*ras_ih_cb)(struct amdgpu_device *adev,
			 struct amdgpu_iv_entry *entry);
81
82struct amdgpu_ras {
83
84
85 uint32_t hw_supported;
86
87 uint32_t supported;
88 uint32_t features;
89 struct list_head head;
90
91 struct dentry *dir;
92
93 struct dentry *ent;
94
95 struct device_attribute features_attr;
96 struct bin_attribute badpages_attr;
97
98 struct ras_manager *objs;
99
100
101 struct work_struct recovery_work;
102 atomic_t in_recovery;
103 struct amdgpu_device *adev;
104
105 struct ras_err_handler_data *eh_data;
106 struct mutex recovery_lock;
107
108 uint32_t flags;
109};
110
111
112
113struct ras_fs_if {
114 struct ras_common_if head;
115 char sysfs_name[32];
116 char debugfs_name[32];
117};
118
119struct ras_query_if {
120 struct ras_common_if head;
121 unsigned long ue_count;
122 unsigned long ce_count;
123};
124
125struct ras_inject_if {
126 struct ras_common_if head;
127 uint64_t address;
128 uint64_t value;
129};
130
131struct ras_cure_if {
132 struct ras_common_if head;
133 uint64_t address;
134};
135
136struct ras_ih_if {
137 struct ras_common_if head;
138 ras_ih_cb cb;
139};
140
141struct ras_dispatch_if {
142 struct ras_common_if head;
143 struct amdgpu_iv_entry *entry;
144};
145
146struct ras_debug_if {
147 union {
148 struct ras_common_if head;
149 struct ras_inject_if inject;
150 };
151 int op;
152};
153
154
155
156
157
158
159
160
161
162
163
164
165
166
/*
 * Accessors for the per-device RAS context pointer kept in
 * adev->psp.ras.ras.
 *
 * amdgpu_ras_get_context() evaluates to that pointer, which may be NULL.
 * amdgpu_ras_set_context() assigns @ras_con to it and, being an assignment
 * expression, evaluates to the stored value. Each macro evaluates its
 * arguments exactly once.
 */
#define amdgpu_ras_get_context(adev)		((adev)->psp.ras.ras)
#define amdgpu_ras_set_context(adev, ras_con)	((adev)->psp.ras.ras = (ras_con))
169
170
171static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
172 unsigned int block)
173{
174 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
175
176 if (block >= AMDGPU_RAS_BLOCK_COUNT)
177 return 0;
178 return ras && (ras->supported & (1 << block));
179}
180
/*
 * The functions below are implemented outside this header. Only their
 * signatures are visible here, so the notes are limited to what the
 * parameter lists show.
 */

/* @block is a RAS block index (see enum amdgpu_ras_block). */
int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
				     unsigned int block);

/* Resume / suspend entry points for the RAS state of @adev. */
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);

/*
 * NOTE(review): @is_ce presumably selects correctable (true) versus
 * uncorrectable (false) errors - confirm with the implementation.
 */
int amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce);

/*
 * NOTE(review): @bps presumably points to @pages entries describing bad
 * pages; the unit of each entry is not visible here - confirm.
 */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
			     unsigned long *bps, int pages);

int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
195
196static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
197 bool is_baco)
198{
199 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
200
201 if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
202 schedule_work(&ras->recovery_work);
203 return 0;
204}
205
206static inline enum ta_ras_block
207amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
208 switch (block) {
209 case AMDGPU_RAS_BLOCK__UMC:
210 return TA_RAS_BLOCK__UMC;
211 case AMDGPU_RAS_BLOCK__SDMA:
212 return TA_RAS_BLOCK__SDMA;
213 case AMDGPU_RAS_BLOCK__GFX:
214 return TA_RAS_BLOCK__GFX;
215 case AMDGPU_RAS_BLOCK__MMHUB:
216 return TA_RAS_BLOCK__MMHUB;
217 case AMDGPU_RAS_BLOCK__ATHUB:
218 return TA_RAS_BLOCK__ATHUB;
219 case AMDGPU_RAS_BLOCK__PCIE_BIF:
220 return TA_RAS_BLOCK__PCIE_BIF;
221 case AMDGPU_RAS_BLOCK__HDP:
222 return TA_RAS_BLOCK__HDP;
223 case AMDGPU_RAS_BLOCK__XGMI_WAFL:
224 return TA_RAS_BLOCK__XGMI_WAFL;
225 case AMDGPU_RAS_BLOCK__DF:
226 return TA_RAS_BLOCK__DF;
227 case AMDGPU_RAS_BLOCK__SMN:
228 return TA_RAS_BLOCK__SMN;
229 case AMDGPU_RAS_BLOCK__SEM:
230 return TA_RAS_BLOCK__SEM;
231 case AMDGPU_RAS_BLOCK__MP0:
232 return TA_RAS_BLOCK__MP0;
233 case AMDGPU_RAS_BLOCK__MP1:
234 return TA_RAS_BLOCK__MP1;
235 case AMDGPU_RAS_BLOCK__FUSE:
236 return TA_RAS_BLOCK__FUSE;
237 default:
238 WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
239 return TA_RAS_BLOCK__UMC;
240 }
241}
242
243static inline enum ta_ras_error_type
244amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
245 switch (error) {
246 case AMDGPU_RAS_ERROR__NONE:
247 return TA_RAS_ERROR__NONE;
248 case AMDGPU_RAS_ERROR__PARITY:
249 return TA_RAS_ERROR__PARITY;
250 case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
251 return TA_RAS_ERROR__SINGLE_CORRECTABLE;
252 case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
253 return TA_RAS_ERROR__MULTI_UNCORRECTABLE;
254 case AMDGPU_RAS_ERROR__POISON:
255 return TA_RAS_ERROR__POISON;
256 default:
257 WARN_ONCE(1, "RAS ERROR: unexpected error type %d\n", error);
258 return TA_RAS_ERROR__NONE;
259 }
260}
261
262
/*
 * The functions below are implemented outside this header. Only their
 * signatures are visible here; return-value conventions must be confirmed
 * against the implementation.
 */

/* Set-up and tear-down entry points for the RAS state of @adev. */
int amdgpu_ras_init(struct amdgpu_device *adev);
int amdgpu_ras_fini(struct amdgpu_device *adev);
int amdgpu_ras_pre_fini(struct amdgpu_device *adev);

/* Feature control: @head identifies the target, @enable the requested state. */
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
			      struct ras_common_if *head, bool enable);

int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
				      struct ras_common_if *head, bool enable);

/* sysfs: creation takes a struct ras_fs_if, removal only the common header. */
int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
			    struct ras_fs_if *head);

int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
			    struct ras_common_if *head);

/* debugfs: same argument shapes as the sysfs pair, but no return value. */
void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
			       struct ras_fs_if *head);

void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
			       struct ras_common_if *head);

/* Error query and injection; see struct ras_query_if / struct ras_inject_if. */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
			   struct ras_query_if *info);

int amdgpu_ras_error_inject(struct amdgpu_device *adev,
			    struct ras_inject_if *info);

/* Interrupt handling; see struct ras_ih_if / struct ras_dispatch_if. */
int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
				     struct ras_ih_if *info);

int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
					struct ras_ih_if *info);

int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
				  struct ras_dispatch_if *info);
299#endif
300