1 | /* |
2 | * Copyright 2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | * |
23 | */ |
24 | #include <linux/list.h> |
25 | #include "amdgpu.h" |
26 | #include "amdgpu_xgmi.h" |
27 | #include "amdgpu_ras.h" |
28 | #include "soc15.h" |
29 | #include "df/df_3_6_offset.h" |
30 | #include "xgmi/xgmi_4_0_0_smn.h" |
31 | #include "xgmi/xgmi_4_0_0_sh_mask.h" |
32 | #include "xgmi/xgmi_6_1_0_sh_mask.h" |
33 | #include "wafl/wafl2_4_0_0_smn.h" |
34 | #include "wafl/wafl2_4_0_0_sh_mask.h" |
35 | |
36 | #include "amdgpu_reset.h" |
37 | |
38 | #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c |
39 | #define smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK 0x11a00218 |
40 | #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 |
41 | #define smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK 0x12200218 |
42 | |
43 | static DEFINE_MUTEX(xgmi_mutex); |
44 | |
45 | #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4 |
46 | |
47 | static LIST_HEAD(xgmi_hive_list); |
48 | |
49 | static const int xgmi_pcs_err_status_reg_vg20[] = { |
50 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, |
51 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, |
52 | }; |
53 | |
54 | static const int wafl_pcs_err_status_reg_vg20[] = { |
55 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, |
56 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, |
57 | }; |
58 | |
59 | static const int xgmi_pcs_err_status_reg_arct[] = { |
60 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS, |
61 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x100000, |
62 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x500000, |
63 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x600000, |
64 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x700000, |
65 | smnXGMI0_PCS_GOPX16_PCS_ERROR_STATUS + 0x800000, |
66 | }; |
67 | |
/* same as vg20 */
69 | static const int wafl_pcs_err_status_reg_arct[] = { |
70 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, |
71 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000, |
72 | }; |
73 | |
74 | static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { |
75 | smnPCS_XGMI3X16_PCS_ERROR_STATUS, |
76 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, |
77 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, |
78 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, |
79 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, |
80 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, |
81 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, |
82 | smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 |
83 | }; |
84 | |
85 | static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[] = { |
86 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK, |
87 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000, |
88 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x200000, |
89 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x300000, |
90 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x400000, |
91 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x500000, |
92 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x600000, |
93 | smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x700000 |
94 | }; |
95 | |
96 | static const int walf_pcs_err_status_reg_aldebaran[] = { |
97 | smnPCS_GOPX1_PCS_ERROR_STATUS, |
98 | smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 |
99 | }; |
100 | |
101 | static const int walf_pcs_err_noncorrectable_mask_reg_aldebaran[] = { |
102 | smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK, |
103 | smnPCS_GOPX1_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000 |
104 | }; |
105 | |
106 | static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = { |
107 | {"XGMI PCS DataLossErr" , |
108 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)}, |
109 | {"XGMI PCS TrainingErr" , |
110 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TrainingErr)}, |
111 | {"XGMI PCS CRCErr" , |
112 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, CRCErr)}, |
113 | {"XGMI PCS BERExceededErr" , |
114 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, BERExceededErr)}, |
115 | {"XGMI PCS TxMetaDataErr" , |
116 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, TxMetaDataErr)}, |
117 | {"XGMI PCS ReplayBufParityErr" , |
118 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayBufParityErr)}, |
119 | {"XGMI PCS DataParityErr" , |
120 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataParityErr)}, |
121 | {"XGMI PCS ReplayFifoOverflowErr" , |
122 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, |
123 | {"XGMI PCS ReplayFifoUnderflowErr" , |
124 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, |
125 | {"XGMI PCS ElasticFifoOverflowErr" , |
126 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, |
127 | {"XGMI PCS DeskewErr" , |
128 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DeskewErr)}, |
129 | {"XGMI PCS DataStartupLimitErr" , |
130 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataStartupLimitErr)}, |
131 | {"XGMI PCS FCInitTimeoutErr" , |
132 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, |
133 | {"XGMI PCS RecoveryTimeoutErr" , |
134 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, |
135 | {"XGMI PCS ReadySerialTimeoutErr" , |
136 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, |
137 | {"XGMI PCS ReadySerialAttemptErr" , |
138 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, |
139 | {"XGMI PCS RecoveryAttemptErr" , |
140 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, |
141 | {"XGMI PCS RecoveryRelockAttemptErr" , |
142 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, |
143 | }; |
144 | |
145 | static const struct amdgpu_pcs_ras_field wafl_pcs_ras_fields[] = { |
146 | {"WAFL PCS DataLossErr" , |
147 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataLossErr)}, |
148 | {"WAFL PCS TrainingErr" , |
149 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TrainingErr)}, |
150 | {"WAFL PCS CRCErr" , |
151 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, CRCErr)}, |
152 | {"WAFL PCS BERExceededErr" , |
153 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, BERExceededErr)}, |
154 | {"WAFL PCS TxMetaDataErr" , |
155 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, TxMetaDataErr)}, |
156 | {"WAFL PCS ReplayBufParityErr" , |
157 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayBufParityErr)}, |
158 | {"WAFL PCS DataParityErr" , |
159 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataParityErr)}, |
160 | {"WAFL PCS ReplayFifoOverflowErr" , |
161 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, |
162 | {"WAFL PCS ReplayFifoUnderflowErr" , |
163 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, |
164 | {"WAFL PCS ElasticFifoOverflowErr" , |
165 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, |
166 | {"WAFL PCS DeskewErr" , |
167 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DeskewErr)}, |
168 | {"WAFL PCS DataStartupLimitErr" , |
169 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, DataStartupLimitErr)}, |
170 | {"WAFL PCS FCInitTimeoutErr" , |
171 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, FCInitTimeoutErr)}, |
172 | {"WAFL PCS RecoveryTimeoutErr" , |
173 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, |
174 | {"WAFL PCS ReadySerialTimeoutErr" , |
175 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, |
176 | {"WAFL PCS ReadySerialAttemptErr" , |
177 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, |
178 | {"WAFL PCS RecoveryAttemptErr" , |
179 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryAttemptErr)}, |
180 | {"WAFL PCS RecoveryRelockAttemptErr" , |
181 | SOC15_REG_FIELD(PCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, |
182 | }; |
183 | |
184 | static const struct amdgpu_pcs_ras_field xgmi3x16_pcs_ras_fields[] = { |
185 | {"XGMI3X16 PCS DataLossErr" , |
186 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataLossErr)}, |
187 | {"XGMI3X16 PCS TrainingErr" , |
188 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TrainingErr)}, |
189 | {"XGMI3X16 PCS FlowCtrlAckErr" , |
190 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlAckErr)}, |
191 | {"XGMI3X16 PCS RxFifoUnderflowErr" , |
192 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoUnderflowErr)}, |
193 | {"XGMI3X16 PCS RxFifoOverflowErr" , |
194 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxFifoOverflowErr)}, |
195 | {"XGMI3X16 PCS CRCErr" , |
196 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, CRCErr)}, |
197 | {"XGMI3X16 PCS BERExceededErr" , |
198 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, BERExceededErr)}, |
199 | {"XGMI3X16 PCS TxVcidDataErr" , |
200 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxVcidDataErr)}, |
201 | {"XGMI3X16 PCS ReplayBufParityErr" , |
202 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayBufParityErr)}, |
203 | {"XGMI3X16 PCS DataParityErr" , |
204 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataParityErr)}, |
205 | {"XGMI3X16 PCS ReplayFifoOverflowErr" , |
206 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoOverflowErr)}, |
207 | {"XGMI3X16 PCS ReplayFifoUnderflowErr" , |
208 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayFifoUnderflowErr)}, |
209 | {"XGMI3X16 PCS ElasticFifoOverflowErr" , |
210 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ElasticFifoOverflowErr)}, |
211 | {"XGMI3X16 PCS DeskewErr" , |
212 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DeskewErr)}, |
213 | {"XGMI3X16 PCS FlowCtrlCRCErr" , |
214 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FlowCtrlCRCErr)}, |
215 | {"XGMI3X16 PCS DataStartupLimitErr" , |
216 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, DataStartupLimitErr)}, |
217 | {"XGMI3X16 PCS FCInitTimeoutErr" , |
218 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, FCInitTimeoutErr)}, |
219 | {"XGMI3X16 PCS RecoveryTimeoutErr" , |
220 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryTimeoutErr)}, |
221 | {"XGMI3X16 PCS ReadySerialTimeoutErr" , |
222 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialTimeoutErr)}, |
223 | {"XGMI3X16 PCS ReadySerialAttemptErr" , |
224 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReadySerialAttemptErr)}, |
225 | {"XGMI3X16 PCS RecoveryAttemptErr" , |
226 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryAttemptErr)}, |
227 | {"XGMI3X16 PCS RecoveryRelockAttemptErr" , |
228 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RecoveryRelockAttemptErr)}, |
229 | {"XGMI3X16 PCS ReplayAttemptErr" , |
230 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, ReplayAttemptErr)}, |
231 | {"XGMI3X16 PCS SyncHdrErr" , |
232 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, SyncHdrErr)}, |
233 | {"XGMI3X16 PCS TxReplayTimeoutErr" , |
234 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, TxReplayTimeoutErr)}, |
235 | {"XGMI3X16 PCS RxReplayTimeoutErr" , |
236 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxReplayTimeoutErr)}, |
237 | {"XGMI3X16 PCS LinkSubTxTimeoutErr" , |
238 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubTxTimeoutErr)}, |
239 | {"XGMI3X16 PCS LinkSubRxTimeoutErr" , |
240 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, LinkSubRxTimeoutErr)}, |
241 | {"XGMI3X16 PCS RxCMDPktErr" , |
242 | SOC15_REG_FIELD(PCS_XGMI3X16_PCS_ERROR_STATUS, RxCMDPktErr)}, |
243 | }; |
244 | |
245 | /** |
246 | * DOC: AMDGPU XGMI Support |
247 | * |
* XGMI is a high-speed interconnect that joins multiple GPU cards
* into a homogeneous memory space that is organized by a collective
* hive ID and individual node IDs, both of which are 64-bit numbers.
251 | * |
252 | * The file xgmi_device_id contains the unique per GPU device ID and |
253 | * is stored in the /sys/class/drm/card${cardno}/device/ directory. |
254 | * |
255 | * Inside the device directory a sub-directory 'xgmi_hive_info' is |
256 | * created which contains the hive ID and the list of nodes. |
257 | * |
258 | * The hive ID is stored in: |
259 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/xgmi_hive_id |
260 | * |
261 | * The node information is stored in numbered directories: |
262 | * /sys/class/drm/card${cardno}/device/xgmi_hive_info/node${nodeno}/xgmi_device_id |
263 | * |
* Each device has its own xgmi_hive_info directory with a mirrored
* set of node sub-directories.
*
* The XGMI memory space is built by laying out each node's
* power-of-two-padded VRAM segment contiguously, one after another.
269 | * |
270 | */ |
271 | |
272 | static struct attribute amdgpu_xgmi_hive_id = { |
.name = "xgmi_hive_id",
274 | .mode = S_IRUGO |
275 | }; |
276 | |
277 | static struct attribute *amdgpu_xgmi_hive_attrs[] = { |
278 | &amdgpu_xgmi_hive_id, |
279 | NULL |
280 | }; |
281 | ATTRIBUTE_GROUPS(amdgpu_xgmi_hive); |
282 | |
283 | static ssize_t amdgpu_xgmi_show_attrs(struct kobject *kobj, |
284 | struct attribute *attr, char *buf) |
285 | { |
286 | struct amdgpu_hive_info *hive = container_of( |
287 | kobj, struct amdgpu_hive_info, kobj); |
288 | |
289 | if (attr == &amdgpu_xgmi_hive_id) |
return snprintf(buf, PAGE_SIZE, "%llu\n", hive->hive_id);
291 | |
292 | return 0; |
293 | } |
294 | |
295 | static void amdgpu_xgmi_hive_release(struct kobject *kobj) |
296 | { |
297 | struct amdgpu_hive_info *hive = container_of( |
298 | kobj, struct amdgpu_hive_info, kobj); |
299 | |
amdgpu_reset_put_reset_domain(hive->reset_domain);
hive->reset_domain = NULL;

mutex_destroy(&hive->hive_lock);
kfree(hive);
305 | } |
306 | |
307 | static const struct sysfs_ops amdgpu_xgmi_hive_ops = { |
308 | .show = amdgpu_xgmi_show_attrs, |
309 | }; |
310 | |
311 | static const struct kobj_type amdgpu_xgmi_hive_type = { |
312 | .release = amdgpu_xgmi_hive_release, |
313 | .sysfs_ops = &amdgpu_xgmi_hive_ops, |
314 | .default_groups = amdgpu_xgmi_hive_groups, |
315 | }; |
316 | |
317 | static ssize_t amdgpu_xgmi_show_device_id(struct device *dev, |
318 | struct device_attribute *attr, |
319 | char *buf) |
320 | { |
321 | struct drm_device *ddev = dev_get_drvdata(dev); |
322 | struct amdgpu_device *adev = drm_to_adev(ddev); |
323 | |
return sysfs_emit(buf, "%llu\n", adev->gmc.xgmi.node_id);
325 | |
326 | } |
327 | |
328 | static ssize_t amdgpu_xgmi_show_physical_id(struct device *dev, |
329 | struct device_attribute *attr, |
330 | char *buf) |
331 | { |
332 | struct drm_device *ddev = dev_get_drvdata(dev); |
333 | struct amdgpu_device *adev = drm_to_adev(ddev); |
334 | |
return sysfs_emit(buf, "%u\n", adev->gmc.xgmi.physical_node_id);
336 | |
337 | } |
338 | |
339 | static ssize_t amdgpu_xgmi_show_num_hops(struct device *dev, |
340 | struct device_attribute *attr, |
341 | char *buf) |
342 | { |
343 | struct drm_device *ddev = dev_get_drvdata(dev); |
344 | struct amdgpu_device *adev = drm_to_adev(ddev); |
345 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
346 | int i; |
347 | |
348 | for (i = 0; i < top->num_nodes; i++) |
sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_hops);

return sysfs_emit(buf, "%s\n", buf);
352 | } |
353 | |
354 | static ssize_t amdgpu_xgmi_show_num_links(struct device *dev, |
355 | struct device_attribute *attr, |
356 | char *buf) |
357 | { |
358 | struct drm_device *ddev = dev_get_drvdata(dev); |
359 | struct amdgpu_device *adev = drm_to_adev(ddev); |
360 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
361 | int i; |
362 | |
363 | for (i = 0; i < top->num_nodes; i++) |
sprintf(buf + 3 * i, "%02x ", top->nodes[i].num_links);

return sysfs_emit(buf, "%s\n", buf);
367 | } |
368 | |
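/*
 * AMDGPU_XGMI_SET_FICAA() composes a DF FICAA (fabric indirect config
 * access) control word: the register offset @o is OR'ed with the fixed
 * control bits used for the xGMI error counter accesses below.
 */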
369 | #define AMDGPU_XGMI_SET_FICAA(o) ((o) | 0x456801) |
370 | static ssize_t amdgpu_xgmi_show_error(struct device *dev, |
371 | struct device_attribute *attr, |
372 | char *buf) |
373 | { |
374 | struct drm_device *ddev = dev_get_drvdata(dev); |
375 | struct amdgpu_device *adev = drm_to_adev(ddev); |
376 | uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in; |
377 | uint64_t fica_out; |
378 | unsigned int error_count = 0; |
379 | |
380 | ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200); |
381 | ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208); |
382 | |
383 | if ((!adev->df.funcs) || |
384 | (!adev->df.funcs->get_fica) || |
385 | (!adev->df.funcs->set_fica)) |
386 | return -EINVAL; |
387 | |
388 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_ctl_in); |
389 | if (fica_out != 0x1f) |
pr_err("xGMI error counters not enabled!\n");
391 | |
392 | fica_out = adev->df.funcs->get_fica(adev, ficaa_pie_status_in); |
393 | |
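/*
 * Only a status value of 2 in the low 16 bits is treated as valid;
 * the reported error count is then the sum of the two most
 * significant bits of the FICA output.
 */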
394 | if ((fica_out & 0xffff) == 2) |
395 | error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63); |
396 | |
397 | adev->df.funcs->set_fica(adev, ficaa_pie_status_in, 0, 0); |
398 | |
return sysfs_emit(buf, "%u\n", error_count);
400 | } |
401 | |
402 | |
403 | static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL); |
static DEVICE_ATTR(xgmi_physical_id, S_IRUGO, amdgpu_xgmi_show_physical_id, NULL);
405 | static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, NULL); |
406 | static DEVICE_ATTR(xgmi_num_hops, S_IRUGO, amdgpu_xgmi_show_num_hops, NULL); |
407 | static DEVICE_ATTR(xgmi_num_links, S_IRUGO, amdgpu_xgmi_show_num_links, NULL); |
408 | |
409 | static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev, |
410 | struct amdgpu_hive_info *hive) |
411 | { |
412 | int ret = 0; |
413 | char node[10] = { 0 }; |
414 | |
/* Create xgmi device id file */
ret = device_create_file(adev->dev, &dev_attr_xgmi_device_id);
if (ret) {
dev_err(adev->dev, "XGMI: Failed to create device file xgmi_device_id\n");
return ret;
}

ret = device_create_file(adev->dev, &dev_attr_xgmi_physical_id);
if (ret) {
dev_err(adev->dev, "XGMI: Failed to create device file xgmi_physical_id\n");
return ret;
}

/* Create xgmi error file */
ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
if (ret)
pr_err("failed to create xgmi_error\n");

/* Create xgmi num hops file */
ret = device_create_file(adev->dev, &dev_attr_xgmi_num_hops);
if (ret)
pr_err("failed to create xgmi_num_hops\n");

/* Create xgmi num links file */
ret = device_create_file(adev->dev, &dev_attr_xgmi_num_links);
if (ret)
pr_err("failed to create xgmi_num_links\n");

/* Create a sysfs link to the hive info folder on every device except the one hosting it */
if (hive->kobj.parent != (&adev->dev->kobj)) {
ret = sysfs_create_link(&adev->dev->kobj, &hive->kobj,
"xgmi_hive_info");
if (ret) {
dev_err(adev->dev, "XGMI: Failed to create link to hive info");
goto remove_file;
}
}

sprintf(node, "node%d", atomic_read(&hive->number_devices));
/* Create a sysfs link from the hive folder back to this device */
ret = sysfs_create_link(&hive->kobj, &adev->dev->kobj, node);
if (ret) {
dev_err(adev->dev, "XGMI: Failed to create link from hive info");
goto remove_link;
}

goto success;


remove_link:
sysfs_remove_link(&adev->dev->kobj, adev_to_drm(adev)->unique);

remove_file:
device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
device_remove_file(adev->dev, &dev_attr_xgmi_error);
device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
device_remove_file(adev->dev, &dev_attr_xgmi_num_links);
473 | |
474 | success: |
475 | return ret; |
476 | } |
477 | |
478 | static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, |
479 | struct amdgpu_hive_info *hive) |
480 | { |
481 | char node[10]; |
482 | memset(node, 0, sizeof(node)); |
483 | |
device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
device_remove_file(adev->dev, &dev_attr_xgmi_physical_id);
device_remove_file(adev->dev, &dev_attr_xgmi_error);
device_remove_file(adev->dev, &dev_attr_xgmi_num_hops);
device_remove_file(adev->dev, &dev_attr_xgmi_num_links);

if (hive->kobj.parent != (&adev->dev->kobj))
sysfs_remove_link(&adev->dev->kobj, "xgmi_hive_info");

sprintf(node, "node%d", atomic_read(&hive->number_devices));
sysfs_remove_link(&hive->kobj, node);
495 | |
496 | } |
497 | |
498 | |
499 | |
500 | struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) |
501 | { |
502 | struct amdgpu_hive_info *hive = NULL; |
503 | int ret; |
504 | |
505 | if (!adev->gmc.xgmi.hive_id) |
506 | return NULL; |
507 | |
508 | if (adev->hive) { |
kobject_get(&adev->hive->kobj);
510 | return adev->hive; |
511 | } |
512 | |
513 | mutex_lock(&xgmi_mutex); |
514 | |
515 | list_for_each_entry(hive, &xgmi_hive_list, node) { |
516 | if (hive->hive_id == adev->gmc.xgmi.hive_id) |
517 | goto pro_end; |
518 | } |
519 | |
hive = kzalloc(sizeof(*hive), GFP_KERNEL);
if (!hive) {
dev_err(adev->dev, "XGMI: allocation failed\n");
523 | ret = -ENOMEM; |
524 | hive = NULL; |
525 | goto pro_end; |
526 | } |
527 | |
/* initialize a new hive if one doesn't exist */
ret = kobject_init_and_add(&hive->kobj,
&amdgpu_xgmi_hive_type,
&adev->dev->kobj,
"%s", "xgmi_hive_info");
if (ret) {
dev_err(adev->dev, "XGMI: failed initializing kobject for xgmi hive\n");
kobject_put(&hive->kobj);
536 | hive = NULL; |
537 | goto pro_end; |
538 | } |
539 | |
540 | /** |
541 | * Only init hive->reset_domain for none SRIOV configuration. For SRIOV, |
542 | * Host driver decide how to reset the GPU either through FLR or chain reset. |
543 | * Guest side will get individual notifications from the host for the FLR |
544 | * if necessary. |
545 | */ |
546 | if (!amdgpu_sriov_vf(adev)) { |
547 | /** |
548 | * Avoid recreating reset domain when hive is reconstructed for the case |
549 | * of reset the devices in the XGMI hive during probe for passthrough GPU |
550 | * See https://www.spinics.net/lists/amd-gfx/msg58836.html |
551 | */ |
552 | if (adev->reset_domain->type != XGMI_HIVE) { |
553 | hive->reset_domain = |
554 | amdgpu_reset_create_reset_domain(type: XGMI_HIVE, wq_name: "amdgpu-reset-hive" ); |
555 | if (!hive->reset_domain) { |
556 | dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n" ); |
557 | ret = -ENOMEM; |
558 | kobject_put(kobj: &hive->kobj); |
559 | hive = NULL; |
560 | goto pro_end; |
561 | } |
562 | } else { |
563 | amdgpu_reset_get_reset_domain(domain: adev->reset_domain); |
564 | hive->reset_domain = adev->reset_domain; |
565 | } |
566 | } |
567 | |
568 | hive->hive_id = adev->gmc.xgmi.hive_id; |
INIT_LIST_HEAD(&hive->device_list);
INIT_LIST_HEAD(&hive->node);
mutex_init(&hive->hive_lock);
atomic_set(&hive->number_devices, 0);
task_barrier_init(&hive->tb);
574 | hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; |
575 | hive->hi_req_gpu = NULL; |
576 | |
577 | /* |
578 | * hive pstate on boot is high in vega20 so we have to go to low |
579 | * pstate on after boot. |
580 | */ |
581 | hive->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; |
582 | list_add_tail(new: &hive->node, head: &xgmi_hive_list); |
583 | |
584 | pro_end: |
585 | if (hive) |
kobject_get(&hive->kobj);
mutex_unlock(&xgmi_mutex);
588 | return hive; |
589 | } |
590 | |
591 | void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive) |
592 | { |
593 | if (hive) |
kobject_put(&hive->kobj);
595 | } |
596 | |
597 | int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) |
598 | { |
599 | int ret = 0; |
600 | struct amdgpu_hive_info *hive; |
601 | struct amdgpu_device *request_adev; |
602 | bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; |
603 | bool init_low; |
604 | |
605 | hive = amdgpu_get_xgmi_hive(adev); |
606 | if (!hive) |
607 | return 0; |
608 | |
609 | request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; |
610 | init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; |
611 | amdgpu_put_xgmi_hive(hive); |
612 | /* fw bug so temporarily disable pstate switching */ |
613 | return 0; |
614 | |
615 | if (!hive || adev->asic_type != CHIP_VEGA20) |
616 | return 0; |
617 | |
618 | mutex_lock(&hive->hive_lock); |
619 | |
620 | if (is_hi_req) |
621 | hive->hi_req_count++; |
622 | else |
623 | hive->hi_req_count--; |
624 | |
625 | /* |
626 | * Vega20 only needs single peer to request pstate high for the hive to |
627 | * go high but all peers must request pstate low for the hive to go low |
628 | */ |
629 | if (hive->pstate == pstate || |
630 | (!is_hi_req && hive->hi_req_count && !init_low)) |
631 | goto out; |
632 | |
dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);

ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
if (ret) {
dev_err(request_adev->dev,
"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
request_adev->gmc.xgmi.node_id,
request_adev->gmc.xgmi.hive_id, ret);
641 | goto out; |
642 | } |
643 | |
644 | if (init_low) |
645 | hive->pstate = hive->hi_req_count ? |
646 | hive->pstate : AMDGPU_XGMI_PSTATE_MIN; |
647 | else { |
648 | hive->pstate = pstate; |
649 | hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? |
650 | adev : NULL; |
651 | } |
652 | out: |
mutex_unlock(&hive->hive_lock);
654 | return ret; |
655 | } |
656 | |
657 | int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev) |
658 | { |
659 | int ret; |
660 | |
661 | if (amdgpu_sriov_vf(adev)) |
662 | return 0; |
663 | |
/* Each psp needs to set the latest topology */
ret = psp_xgmi_set_topology_info(&adev->psp,
atomic_read(&hive->number_devices),
&adev->psp.xgmi_context.top_info);
if (ret)
dev_err(adev->dev,
"XGMI: Set topology failure on device %llx, hive %llx, ret %d",
adev->gmc.xgmi.node_id,
adev->gmc.xgmi.hive_id, ret);
673 | |
674 | return ret; |
675 | } |
676 | |
677 | |
678 | /* |
679 | * NOTE psp_xgmi_node_info.num_hops layout is as follows: |
680 | * num_hops[7:6] = link type (0 = xGMI2, 1 = xGMI3, 2/3 = reserved) |
681 | * num_hops[5:3] = reserved |
682 | * num_hops[2:0] = number of hops |
683 | */ |
684 | int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, |
685 | struct amdgpu_device *peer_adev) |
686 | { |
687 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
688 | uint8_t num_hops_mask = 0x7; |
689 | int i; |
690 | |
for (i = 0; i < top->num_nodes; ++i)
692 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) |
693 | return top->nodes[i].num_hops & num_hops_mask; |
694 | return -EINVAL; |
695 | } |
696 | |
697 | int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, |
698 | struct amdgpu_device *peer_adev) |
699 | { |
700 | struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info; |
701 | int i; |
702 | |
for (i = 0; i < top->num_nodes; ++i)
704 | if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) |
705 | return top->nodes[i].num_links; |
706 | return -EINVAL; |
707 | } |
708 | |
709 | /* |
710 | * Devices that support extended data require the entire hive to initialize with |
711 | * the shared memory buffer flag set. |
712 | * |
713 | * Hive locks and conditions apply - see amdgpu_xgmi_add_device |
714 | */ |
715 | static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive, |
716 | bool set_extended_data) |
717 | { |
718 | struct amdgpu_device *tmp_adev; |
719 | int ret; |
720 | |
721 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
ret = psp_xgmi_initialize(&tmp_adev->psp, set_extended_data, false);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Failed to initialize xgmi session for data partition %i\n",
set_extended_data);
727 | return ret; |
728 | } |
729 | |
730 | } |
731 | |
732 | return 0; |
733 | } |
734 | |
735 | int amdgpu_xgmi_add_device(struct amdgpu_device *adev) |
736 | { |
737 | struct psp_xgmi_topology_info *top_info; |
738 | struct amdgpu_hive_info *hive; |
739 | struct amdgpu_xgmi *entry; |
740 | struct amdgpu_device *tmp_adev = NULL; |
741 | |
742 | int count = 0, ret = 0; |
743 | |
744 | if (!adev->gmc.xgmi.supported) |
745 | return 0; |
746 | |
747 | if (!adev->gmc.xgmi.pending_reset && |
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) {
dev_err(adev->dev,
"XGMI: Failed to initialize xgmi session\n");
return ret;
}

ret = psp_xgmi_get_hive_id(&adev->psp, &adev->gmc.xgmi.hive_id);
if (ret) {
dev_err(adev->dev,
"XGMI: Failed to get hive id\n");
return ret;
}

ret = psp_xgmi_get_node_id(&adev->psp, &adev->gmc.xgmi.node_id);
if (ret) {
dev_err(adev->dev,
"XGMI: Failed to get node id\n");
return ret;
}
} else {
adev->gmc.xgmi.hive_id = 16;
adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
772 | } |
773 | |
774 | hive = amdgpu_get_xgmi_hive(adev); |
775 | if (!hive) { |
776 | ret = -EINVAL; |
dev_err(adev->dev,
"XGMI: node 0x%llx, cannot match hive 0x%llx in the hive list.\n",
779 | adev->gmc.xgmi.node_id, adev->gmc.xgmi.hive_id); |
780 | goto exit; |
781 | } |
782 | mutex_lock(&hive->hive_lock); |
783 | |
784 | top_info = &adev->psp.xgmi_context.top_info; |
785 | |
list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
list_for_each_entry(entry, &hive->device_list, head)
top_info->nodes[count++].node_id = entry->node_id;
top_info->num_nodes = count;
atomic_set(&hive->number_devices, count);

task_barrier_add_task(&hive->tb);
793 | |
if (!adev->gmc.xgmi.pending_reset &&
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
796 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
797 | /* update node list for other device in the hive */ |
798 | if (tmp_adev != adev) { |
799 | top_info = &tmp_adev->psp.xgmi_context.top_info; |
800 | top_info->nodes[count - 1].node_id = |
801 | adev->gmc.xgmi.node_id; |
802 | top_info->num_nodes = count; |
803 | } |
ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
805 | if (ret) |
806 | goto exit_unlock; |
807 | } |
808 | |
809 | /* get latest topology info for each device from psp */ |
810 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, false);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
/* TODO: continue with some nodes failed, or disable the whole hive */
819 | goto exit_unlock; |
820 | } |
821 | } |
822 | |
823 | /* get topology again for hives that support extended data */ |
824 | if (adev->psp.xgmi_context.supports_extended_data) { |
825 | |
826 | /* initialize the hive to get extended data. */ |
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
828 | if (ret) |
829 | goto exit_unlock; |
830 | |
831 | /* get the extended data. */ |
832 | list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { |
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, true);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
838 | tmp_adev->gmc.xgmi.node_id, |
839 | tmp_adev->gmc.xgmi.hive_id, ret); |
840 | goto exit_unlock; |
841 | } |
842 | } |
843 | |
844 | /* initialize the hive to get non-extended data for the next round. */ |
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
846 | if (ret) |
847 | goto exit_unlock; |
848 | |
849 | } |
850 | } |
851 | |
852 | if (!ret && !adev->gmc.xgmi.pending_reset) |
853 | ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); |
854 | |
855 | exit_unlock: |
mutex_unlock(&hive->hive_lock);
857 | exit: |
858 | if (!ret) { |
859 | adev->hive = hive; |
dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
861 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id); |
862 | } else { |
863 | amdgpu_put_xgmi_hive(hive); |
dev_err(adev->dev, "XGMI: Failed to add node %d, hive 0x%llx ret: %d\n",
865 | adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id, |
866 | ret); |
867 | } |
868 | |
869 | return ret; |
870 | } |
871 | |
872 | int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) |
873 | { |
874 | struct amdgpu_hive_info *hive = adev->hive; |
875 | |
876 | if (!adev->gmc.xgmi.supported) |
877 | return -EINVAL; |
878 | |
879 | if (!hive) |
880 | return -EINVAL; |
881 | |
882 | mutex_lock(&hive->hive_lock); |
task_barrier_rem_task(&hive->tb);
amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
if (hive->hi_req_gpu == adev)
hive->hi_req_gpu = NULL;
list_del(&adev->gmc.xgmi.head);
mutex_unlock(&hive->hive_lock);

amdgpu_put_xgmi_hive(hive);
adev->hive = NULL;

if (atomic_dec_return(&hive->number_devices) == 0) {
/* Remove the hive from global hive list */
mutex_lock(&xgmi_mutex);
list_del(&hive->node);
mutex_unlock(&xgmi_mutex);
898 | |
899 | amdgpu_put_xgmi_hive(hive); |
900 | } |
901 | |
902 | return 0; |
903 | } |
904 | |
905 | static int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block) |
906 | { |
907 | if (!adev->gmc.xgmi.supported || |
908 | adev->gmc.xgmi.num_physical_nodes == 0) |
909 | return 0; |
910 | |
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
912 | |
913 | return amdgpu_ras_block_late_init(adev, ras_block); |
914 | } |
915 | |
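/*
 * Translate a node-local physical address into the hive-wide address
 * space: each node's power-of-two-padded VRAM occupies one
 * node_segment_size slot, indexed by its physical node id.
 */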
916 | uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev, |
917 | uint64_t addr) |
918 | { |
struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;

return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
921 | } |
922 | |
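/* Clear a PCS error status register by writing all ones, then zero. */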
923 | static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) |
924 | { |
925 | WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); |
926 | WREG32_PCIE(pcs_status_reg, 0); |
927 | } |
928 | |
929 | static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) |
930 | { |
931 | uint32_t i; |
932 | |
933 | switch (adev->asic_type) { |
934 | case CHIP_ARCTURUS: |
935 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) |
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_arct[i]);
break;
case CHIP_VEGA20:
for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
pcs_clear_status(adev,
xgmi_pcs_err_status_reg_vg20[i]);
break;
case CHIP_ALDEBARAN:
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
xgmi3x16_pcs_err_status_reg_aldebaran[i]);
for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
pcs_clear_status(adev,
walf_pcs_err_status_reg_aldebaran[i]);
951 | break; |
952 | default: |
953 | break; |
954 | } |
955 | } |
956 | |
957 | static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, |
958 | uint32_t value, |
959 | uint32_t mask_value, |
960 | uint32_t *ue_count, |
961 | uint32_t *ce_count, |
962 | bool is_xgmi_pcs, |
963 | bool check_mask) |
964 | { |
965 | int i; |
966 | int ue_cnt = 0; |
967 | const struct amdgpu_pcs_ras_field *pcs_ras_fields = NULL; |
968 | uint32_t field_array_size = 0; |
969 | |
970 | if (is_xgmi_pcs) { |
if (amdgpu_ip_version(adev, XGMI_HWIP, 0) ==
972 | IP_VERSION(6, 1, 0)) { |
973 | pcs_ras_fields = &xgmi3x16_pcs_ras_fields[0]; |
974 | field_array_size = ARRAY_SIZE(xgmi3x16_pcs_ras_fields); |
975 | } else { |
976 | pcs_ras_fields = &xgmi_pcs_ras_fields[0]; |
977 | field_array_size = ARRAY_SIZE(xgmi_pcs_ras_fields); |
978 | } |
979 | } else { |
980 | pcs_ras_fields = &wafl_pcs_ras_fields[0]; |
981 | field_array_size = ARRAY_SIZE(wafl_pcs_ras_fields); |
982 | } |
983 | |
984 | if (check_mask) |
985 | value = value & ~mask_value; |
986 | |
/* query xgmi/wafl pcs error status;
 * only UE is supported */
989 | for (i = 0; value && i < field_array_size; i++) { |
990 | ue_cnt = (value & |
991 | pcs_ras_fields[i].pcs_err_mask) >> |
992 | pcs_ras_fields[i].pcs_err_shift; |
993 | if (ue_cnt) { |
dev_info(adev->dev, "%s detected\n",
995 | pcs_ras_fields[i].err_name); |
996 | *ue_count += ue_cnt; |
997 | } |
998 | |
999 | /* reset bit value if the bit is checked */ |
1000 | value &= ~(pcs_ras_fields[i].pcs_err_mask); |
1001 | } |
1002 | |
1003 | return 0; |
1004 | } |
1005 | |
1006 | static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, |
1007 | void *ras_error_status) |
1008 | { |
1009 | struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; |
1010 | int i; |
1011 | uint32_t data, mask_data = 0; |
1012 | uint32_t ue_cnt = 0, ce_cnt = 0; |
1013 | |
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL))
return;
1016 | |
1017 | err_data->ue_count = 0; |
1018 | err_data->ce_count = 0; |
1019 | |
1020 | switch (adev->asic_type) { |
1021 | case CHIP_ARCTURUS: |
1022 | /* check xgmi pcs error */ |
1023 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) { |
1024 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_arct[i]); |
1025 | if (data) |
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, true, false);
}
/* check wafl pcs error */
for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_arct); i++) {
data = RREG32_PCIE(wafl_pcs_err_status_reg_arct[i]);
if (data)
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, false, false);
1035 | } |
1036 | break; |
1037 | case CHIP_VEGA20: |
1038 | /* check xgmi pcs error */ |
1039 | for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) { |
1040 | data = RREG32_PCIE(xgmi_pcs_err_status_reg_vg20[i]); |
1041 | if (data) |
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, true, false);
}
/* check wafl pcs error */
for (i = 0; i < ARRAY_SIZE(wafl_pcs_err_status_reg_vg20); i++) {
data = RREG32_PCIE(wafl_pcs_err_status_reg_vg20[i]);
if (data)
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, false, false);
1051 | } |
1052 | break; |
1053 | case CHIP_ALDEBARAN: |
1054 | /* check xgmi3x16 pcs error */ |
1055 | for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { |
1056 | data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); |
1057 | mask_data = |
1058 | RREG32_PCIE(xgmi3x16_pcs_err_noncorrectable_mask_reg_aldebaran[i]); |
1059 | if (data) |
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, true, true);
1062 | } |
1063 | /* check wafl pcs error */ |
1064 | for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { |
1065 | data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); |
1066 | mask_data = |
1067 | RREG32_PCIE(walf_pcs_err_noncorrectable_mask_reg_aldebaran[i]); |
1068 | if (data) |
amdgpu_xgmi_query_pcs_error_status(adev, data,
mask_data, &ue_cnt, &ce_cnt, false, true);
1071 | } |
1072 | break; |
1073 | default: |
dev_warn(adev->dev, "XGMI RAS error query not supported");
1075 | break; |
1076 | } |
1077 | |
amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__XGMI_WAFL);
1079 | |
1080 | err_data->ue_count += ue_cnt; |
1081 | err_data->ce_count += ce_cnt; |
1082 | } |
1083 | |
1084 | /* Trigger XGMI/WAFL error */ |
1085 | static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, |
1086 | void *inject_if, uint32_t instance_mask) |
1087 | { |
1088 | int ret = 0; |
1089 | struct ta_ras_trigger_error_input *block_info = |
1090 | (struct ta_ras_trigger_error_input *)inject_if; |
1091 | |
if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
dev_warn(adev->dev, "Failed to disallow df cstate");

if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DISALLOW))
dev_warn(adev->dev, "Failed to disallow XGMI power down");

ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);

if (amdgpu_ras_intr_triggered())
return ret;

if (amdgpu_dpm_set_xgmi_plpd_mode(adev, XGMI_PLPD_DEFAULT))
dev_warn(adev->dev, "Failed to allow XGMI power down");

if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
dev_warn(adev->dev, "Failed to allow df cstate");
1108 | |
1109 | return ret; |
1110 | } |
1111 | |
1112 | struct amdgpu_ras_block_hw_ops xgmi_ras_hw_ops = { |
1113 | .query_ras_error_count = amdgpu_xgmi_query_ras_error_count, |
1114 | .reset_ras_error_count = amdgpu_xgmi_reset_ras_error_count, |
1115 | .ras_error_inject = amdgpu_ras_error_inject_xgmi, |
1116 | }; |
1117 | |
1118 | struct amdgpu_xgmi_ras xgmi_ras = { |
1119 | .ras_block = { |
1120 | .hw_ops = &xgmi_ras_hw_ops, |
1121 | .ras_late_init = amdgpu_xgmi_ras_late_init, |
1122 | }, |
1123 | }; |
1124 | |
1125 | int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) |
1126 | { |
1127 | int err; |
1128 | struct amdgpu_xgmi_ras *ras; |
1129 | |
1130 | if (!adev->gmc.xgmi.ras) |
1131 | return 0; |
1132 | |
1133 | ras = adev->gmc.xgmi.ras; |
err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
if (err) {
dev_err(adev->dev, "Failed to register xgmi_wafl_pcs ras block!\n");
return err;
}

strcpy(ras->ras_block.ras_comm.name, "xgmi_wafl");
1141 | ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__XGMI_WAFL; |
1142 | ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; |
1143 | adev->gmc.xgmi.ras_if = &ras->ras_block.ras_comm; |
1144 | |
1145 | return 0; |
1146 | } |
1147 | |