1 | /* |
2 | * Copyright 2014-2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ |
22 | |
23 | #define pr_fmt(fmt) "kfd2kgd: " fmt |
24 | |
25 | #include <linux/module.h> |
26 | #include <linux/fdtable.h> |
27 | #include <linux/uaccess.h> |
28 | #include <linux/firmware.h> |
29 | #include <linux/mmu_context.h> |
30 | #include <drm/drmP.h> |
31 | #include "amdgpu.h" |
32 | #include "amdgpu_amdkfd.h" |
33 | #include "amdgpu_ucode.h" |
34 | #include "soc15_hw_ip.h" |
35 | #include "gc/gc_9_0_offset.h" |
36 | #include "gc/gc_9_0_sh_mask.h" |
37 | #include "vega10_enum.h" |
38 | #include "sdma0/sdma0_4_0_offset.h" |
39 | #include "sdma0/sdma0_4_0_sh_mask.h" |
40 | #include "sdma1/sdma1_4_0_offset.h" |
41 | #include "sdma1/sdma1_4_0_sh_mask.h" |
42 | #include "athub/athub_1_0_offset.h" |
43 | #include "athub/athub_1_0_sh_mask.h" |
44 | #include "oss/osssys_4_0_offset.h" |
45 | #include "oss/osssys_4_0_sh_mask.h" |
46 | #include "soc15_common.h" |
47 | #include "v9_structs.h" |
48 | #include "soc15.h" |
49 | #include "soc15d.h" |
50 | #include "mmhub_v1_0.h" |
51 | #include "gfxhub_v1_0.h" |
52 | |
53 | |
54 | #define V9_PIPE_PER_MEC (4) |
55 | #define V9_QUEUES_PER_PIPE_MEC (8) |
56 | |
57 | enum hqd_dequeue_request_type { |
58 | NO_ACTION = 0, |
59 | DRAIN_PIPE, |
60 | RESET_WAVES |
61 | }; |
62 | |
63 | /* |
64 | * Register access functions |
65 | */ |
66 | |
67 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, |
68 | uint32_t sh_mem_config, |
69 | uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit, |
70 | uint32_t sh_mem_bases); |
71 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
72 | unsigned int vmid); |
73 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id); |
74 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
75 | uint32_t queue_id, uint32_t __user *wptr, |
76 | uint32_t wptr_shift, uint32_t wptr_mask, |
77 | struct mm_struct *mm); |
78 | static int kgd_hqd_dump(struct kgd_dev *kgd, |
79 | uint32_t pipe_id, uint32_t queue_id, |
80 | uint32_t (**dump)[2], uint32_t *n_regs); |
81 | static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, |
82 | uint32_t __user *wptr, struct mm_struct *mm); |
83 | static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, |
84 | uint32_t engine_id, uint32_t queue_id, |
85 | uint32_t (**dump)[2], uint32_t *n_regs); |
86 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, |
87 | uint32_t pipe_id, uint32_t queue_id); |
88 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd); |
89 | static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, |
90 | enum kfd_preempt_type reset_type, |
91 | unsigned int utimeout, uint32_t pipe_id, |
92 | uint32_t queue_id); |
93 | static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, |
94 | unsigned int utimeout); |
95 | static int kgd_address_watch_disable(struct kgd_dev *kgd); |
96 | static int kgd_address_watch_execute(struct kgd_dev *kgd, |
97 | unsigned int watch_point_id, |
98 | uint32_t cntl_val, |
99 | uint32_t addr_hi, |
100 | uint32_t addr_lo); |
101 | static int kgd_wave_control_execute(struct kgd_dev *kgd, |
102 | uint32_t gfx_index_val, |
103 | uint32_t sq_cmd); |
104 | static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, |
105 | unsigned int watch_point_id, |
106 | unsigned int reg_offset); |
107 | |
108 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, |
109 | uint8_t vmid); |
110 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, |
111 | uint8_t vmid); |
112 | static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, |
113 | uint64_t page_table_base); |
114 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type); |
115 | static void set_scratch_backing_va(struct kgd_dev *kgd, |
116 | uint64_t va, uint32_t vmid); |
117 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid); |
118 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid); |
119 | |
/* Because REG_GET_FIELD() is used, this function is kept in the
 * ASIC-specific file.
 */
123 | static int amdgpu_amdkfd_get_tile_config(struct kgd_dev *kgd, |
124 | struct tile_config *config) |
125 | { |
126 | struct amdgpu_device *adev = (struct amdgpu_device *)kgd; |
127 | |
128 | config->gb_addr_config = adev->gfx.config.gb_addr_config; |
129 | |
130 | config->tile_config_ptr = adev->gfx.config.tile_mode_array; |
131 | config->num_tile_configs = |
132 | ARRAY_SIZE(adev->gfx.config.tile_mode_array); |
133 | config->macro_tile_config_ptr = |
134 | adev->gfx.config.macrotile_mode_array; |
135 | config->num_macro_tile_configs = |
136 | ARRAY_SIZE(adev->gfx.config.macrotile_mode_array); |
137 | |
138 | return 0; |
139 | } |
140 | |
141 | static const struct kfd2kgd_calls kfd2kgd = { |
142 | .program_sh_mem_settings = kgd_program_sh_mem_settings, |
143 | .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, |
144 | .init_interrupts = kgd_init_interrupts, |
145 | .hqd_load = kgd_hqd_load, |
146 | .hqd_sdma_load = kgd_hqd_sdma_load, |
147 | .hqd_dump = kgd_hqd_dump, |
148 | .hqd_sdma_dump = kgd_hqd_sdma_dump, |
149 | .hqd_is_occupied = kgd_hqd_is_occupied, |
150 | .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, |
151 | .hqd_destroy = kgd_hqd_destroy, |
152 | .hqd_sdma_destroy = kgd_hqd_sdma_destroy, |
153 | .address_watch_disable = kgd_address_watch_disable, |
154 | .address_watch_execute = kgd_address_watch_execute, |
155 | .wave_control_execute = kgd_wave_control_execute, |
156 | .address_watch_get_offset = kgd_address_watch_get_offset, |
157 | .get_atc_vmid_pasid_mapping_pasid = |
158 | get_atc_vmid_pasid_mapping_pasid, |
159 | .get_atc_vmid_pasid_mapping_valid = |
160 | get_atc_vmid_pasid_mapping_valid, |
161 | .get_fw_version = get_fw_version, |
162 | .set_scratch_backing_va = set_scratch_backing_va, |
163 | .get_tile_config = amdgpu_amdkfd_get_tile_config, |
164 | .set_vm_context_page_table_base = set_vm_context_page_table_base, |
165 | .invalidate_tlbs = invalidate_tlbs, |
166 | .invalidate_tlbs_vmid = invalidate_tlbs_vmid, |
167 | .get_hive_id = amdgpu_amdkfd_get_hive_id, |
168 | }; |
169 | |
170 | struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void) |
171 | { |
172 | return (struct kfd2kgd_calls *)&kfd2kgd; |
173 | } |
174 | |
175 | static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd) |
176 | { |
177 | return (struct amdgpu_device *)kgd; |
178 | } |
179 | |
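/* lock_srbm/unlock_srbm bracket register accesses that depend on the
 * currently selected MEC, pipe, queue and VMID. The srbm_mutex
 * serializes this selection against other users of GRBM/SRBM indexing.
 */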
180 | static void lock_srbm(struct kgd_dev *kgd, uint32_t mec, uint32_t pipe, |
181 | uint32_t queue, uint32_t vmid) |
182 | { |
183 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
184 | |
185 | mutex_lock(&adev->srbm_mutex); |
186 | soc15_grbm_select(adev, mec, pipe, queue, vmid); |
187 | } |
188 | |
189 | static void unlock_srbm(struct kgd_dev *kgd) |
190 | { |
191 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
192 | |
193 | soc15_grbm_select(adev, 0, 0, 0, 0); |
194 | mutex_unlock(&adev->srbm_mutex); |
195 | } |
196 | |
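/* acquire_queue/release_queue select a compute queue for the HQD
 * register accesses that follow; the MEC and pipe are derived from the
 * KFD pipe_id.
 */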
197 | static void acquire_queue(struct kgd_dev *kgd, uint32_t pipe_id, |
198 | uint32_t queue_id) |
199 | { |
200 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
201 | |
202 | uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
203 | uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
204 | |
205 | lock_srbm(kgd, mec, pipe, queue_id, 0); |
206 | } |
207 | |
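/* Bit identifying this queue in the 32-bit WPTR poll queue mask
 * (CP_PQ_WPTR_POLL_CNTL1). Queues are numbered pipe-major within the
 * MEC and the index wraps at 32.
 */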
208 | static uint32_t get_queue_mask(struct amdgpu_device *adev, |
209 | uint32_t pipe_id, uint32_t queue_id) |
210 | { |
211 | unsigned int bit = (pipe_id * adev->gfx.mec.num_queue_per_pipe + |
212 | queue_id) & 31; |
213 | |
214 | return ((uint32_t)1) << bit; |
215 | } |
216 | |
217 | static void release_queue(struct kgd_dev *kgd) |
218 | { |
219 | unlock_srbm(kgd); |
220 | } |
221 | |
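/* Program the per-VMID SH_MEM aperture registers. The VMID is selected
 * via SRBM for the duration of the writes.
 */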
222 | static void kgd_program_sh_mem_settings(struct kgd_dev *kgd, uint32_t vmid, |
223 | uint32_t sh_mem_config, |
224 | uint32_t sh_mem_ape1_base, |
225 | uint32_t sh_mem_ape1_limit, |
226 | uint32_t sh_mem_bases) |
227 | { |
228 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
229 | |
230 | lock_srbm(kgd, 0, 0, 0, vmid); |
231 | |
232 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); |
233 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); |
234 | /* APE1 no longer exists on GFX9 */ |
235 | |
236 | unlock_srbm(kgd); |
237 | } |
238 | |
239 | static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, |
240 | unsigned int vmid) |
241 | { |
242 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
243 | |
244 | /* |
245 | * We have to assume that there is no outstanding mapping. |
246 | * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because |
247 | * a mapping is in progress or because a mapping finished |
248 | * and the SW cleared it. |
249 | * So the protocol is to always wait & clear. |
250 | */ |
251 | uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | |
252 | ATC_VMID0_PASID_MAPPING__VALID_MASK; |
253 | |
254 | /* |
255 | * need to do this twice, once for gfx and once for mmhub |
256 | * for ATC add 16 to VMID for mmhub, for IH different registers. |
257 | * ATC_VMID0..15 registers are separate from ATC_VMID16..31. |
258 | */ |
259 | |
260 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, |
261 | pasid_mapping); |
262 | |
263 | while (!(RREG32(SOC15_REG_OFFSET( |
264 | ATHUB, 0, |
265 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & |
266 | (1U << vmid))) |
267 | cpu_relax(); |
268 | |
269 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, |
270 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), |
271 | 1U << vmid); |
272 | |
273 | /* Mapping vmid to pasid also for IH block */ |
274 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, |
275 | pasid_mapping); |
276 | |
277 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, |
278 | pasid_mapping); |
279 | |
280 | while (!(RREG32(SOC15_REG_OFFSET( |
281 | ATHUB, 0, |
282 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & |
283 | (1U << (vmid + 16)))) |
284 | cpu_relax(); |
285 | |
286 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, |
287 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), |
288 | 1U << (vmid + 16)); |
289 | |
290 | /* Mapping vmid to pasid also for IH block */ |
291 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, |
292 | pasid_mapping); |
293 | return 0; |
294 | } |
295 | |
/* TODO - the RING0 form of this field is obsolete; it seems to date
 * back to SI but still works.
 */
299 | |
300 | static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) |
301 | { |
302 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
303 | uint32_t mec; |
304 | uint32_t pipe; |
305 | |
306 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
307 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
308 | |
309 | lock_srbm(kgd, mec, pipe, 0, 0); |
310 | |
311 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCPC_INT_CNTL), |
312 | CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | |
313 | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); |
314 | |
315 | unlock_srbm(kgd); |
316 | |
317 | return 0; |
318 | } |
319 | |
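/* Return the register offset of the RLC queue block for the given SDMA
 * engine and queue, relative to the mmSDMA0_RLC0_* register names, so
 * callers can simply add mmSDMA0_RLC0_<reg> for any engine/queue.
 */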
320 | static uint32_t get_sdma_base_addr(struct amdgpu_device *adev, |
321 | unsigned int engine_id, |
322 | unsigned int queue_id) |
323 | { |
324 | uint32_t base[2] = { |
325 | SOC15_REG_OFFSET(SDMA0, 0, |
326 | mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL, |
327 | SOC15_REG_OFFSET(SDMA1, 0, |
328 | mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL |
329 | }; |
330 | uint32_t retval; |
331 | |
332 | retval = base[engine_id] + queue_id * (mmSDMA0_RLC1_RB_CNTL - |
333 | mmSDMA0_RLC0_RB_CNTL); |
334 | |
335 | pr_debug("sdma base address: 0x%x\n" , retval); |
336 | |
337 | return retval; |
338 | } |
339 | |
340 | static inline struct v9_mqd *get_mqd(void *mqd) |
341 | { |
342 | return (struct v9_mqd *)mqd; |
343 | } |
344 | |
345 | static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) |
346 | { |
347 | return (struct v9_sdma_mqd *)mqd; |
348 | } |
349 | |
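/* Load a compute MQD into its HQD: restore the HQD register range from
 * the MQD image, enable the doorbell, optionally arm one-shot WPTR
 * polling from user memory, start the EOP fetcher and activate the
 * queue.
 */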
350 | static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, |
351 | uint32_t queue_id, uint32_t __user *wptr, |
352 | uint32_t wptr_shift, uint32_t wptr_mask, |
353 | struct mm_struct *mm) |
354 | { |
355 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
356 | struct v9_mqd *m; |
357 | uint32_t *mqd_hqd; |
358 | uint32_t reg, hqd_base, data; |
359 | |
360 | m = get_mqd(mqd); |
361 | |
362 | acquire_queue(kgd, pipe_id, queue_id); |
363 | |
	/* HIQ is set during driver init period with vmid set to 0 */
365 | if (m->cp_hqd_vmid == 0) { |
366 | uint32_t value, mec, pipe; |
367 | |
368 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
369 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
370 | |
371 | pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n" , |
372 | mec, pipe, queue_id); |
373 | value = RREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS)); |
374 | value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1, |
375 | ((mec << 5) | (pipe << 3) | queue_id | 0x80)); |
376 | WREG32(SOC15_REG_OFFSET(GC, 0, mmRLC_CP_SCHEDULERS), value); |
377 | } |
378 | |
379 | /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ |
380 | mqd_hqd = &m->cp_mqd_base_addr_lo; |
381 | hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); |
382 | |
383 | for (reg = hqd_base; |
384 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) |
385 | WREG32(reg, mqd_hqd[reg - hqd_base]); |
386 | |
387 | |
388 | /* Activate doorbell logic before triggering WPTR poll. */ |
389 | data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, |
390 | CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); |
391 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); |
392 | |
393 | if (wptr) { |
394 | /* Don't read wptr with get_user because the user |
395 | * context may not be accessible (if this function |
396 | * runs in a work queue). Instead trigger a one-shot |
397 | * polling read from memory in the CP. This assumes |
398 | * that wptr is GPU-accessible in the queue's VMID via |
399 | * ATC or SVM. WPTR==RPTR before starting the poll so |
400 | * the CP starts fetching new commands from the right |
401 | * place. |
402 | * |
403 | * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit |
404 | * tricky. Assume that the queue didn't overflow. The |
405 | * number of valid bits in the 32-bit RPTR depends on |
406 | * the queue size. The remaining bits are taken from |
407 | * the saved 64-bit WPTR. If the WPTR wrapped, add the |
408 | * queue size. |
409 | */ |
410 | uint32_t queue_size = |
411 | 2 << REG_GET_FIELD(m->cp_hqd_pq_control, |
412 | CP_HQD_PQ_CONTROL, QUEUE_SIZE); |
413 | uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); |
414 | |
415 | if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) |
416 | guessed_wptr += queue_size; |
417 | guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); |
418 | guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; |
419 | |
420 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), |
421 | lower_32_bits(guessed_wptr)); |
422 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), |
423 | upper_32_bits(guessed_wptr)); |
424 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), |
425 | lower_32_bits((uintptr_t)wptr)); |
426 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), |
427 | upper_32_bits((uintptr_t)wptr)); |
428 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1), |
429 | get_queue_mask(adev, pipe_id, queue_id)); |
430 | } |
431 | |
432 | /* Start the EOP fetcher */ |
433 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), |
434 | REG_SET_FIELD(m->cp_hqd_eop_rptr, |
435 | CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); |
436 | |
437 | data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); |
438 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); |
439 | |
440 | release_queue(kgd); |
441 | |
442 | return 0; |
443 | } |
444 | |
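/* Snapshot the HQD register range as {offset, value} pairs for
 * debugfs. On success *dump points to a kmalloc'ed array that the
 * caller is expected to free.
 */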
445 | static int kgd_hqd_dump(struct kgd_dev *kgd, |
446 | uint32_t pipe_id, uint32_t queue_id, |
447 | uint32_t (**dump)[2], uint32_t *n_regs) |
448 | { |
449 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
450 | uint32_t i = 0, reg; |
451 | #define HQD_N_REGS 56 |
452 | #define DUMP_REG(addr) do { \ |
453 | if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ |
454 | break; \ |
455 | (*dump)[i][0] = (addr) << 2; \ |
456 | (*dump)[i++][1] = RREG32(addr); \ |
457 | } while (0) |
458 | |
459 | *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); |
460 | if (*dump == NULL) |
461 | return -ENOMEM; |
462 | |
463 | acquire_queue(kgd, pipe_id, queue_id); |
464 | |
465 | for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); |
466 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) |
467 | DUMP_REG(reg); |
468 | |
469 | release_queue(kgd); |
470 | |
471 | WARN_ON_ONCE(i != HQD_N_REGS); |
472 | *n_regs = i; |
473 | |
474 | return 0; |
475 | } |
476 | |
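/* Load an SDMA RLC queue from its MQD: disable the ring buffer, wait
 * for the context to drain, program the doorbell and ring pointers
 * (reading the user-mode WPTR when possible, otherwise falling back to
 * the saved RPTR), then re-enable the ring buffer.
 */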
477 | static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd, |
478 | uint32_t __user *wptr, struct mm_struct *mm) |
479 | { |
480 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
481 | struct v9_sdma_mqd *m; |
482 | uint32_t sdma_base_addr, sdmax_gfx_context_cntl; |
483 | unsigned long end_jiffies; |
484 | uint32_t data; |
485 | uint64_t data64; |
486 | uint64_t __user *wptr64 = (uint64_t __user *)wptr; |
487 | |
488 | m = get_sdma_mqd(mqd); |
489 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, |
490 | m->sdma_queue_id); |
491 | sdmax_gfx_context_cntl = m->sdma_engine_id ? |
492 | SOC15_REG_OFFSET(SDMA1, 0, mmSDMA1_GFX_CONTEXT_CNTL) : |
493 | SOC15_REG_OFFSET(SDMA0, 0, mmSDMA0_GFX_CONTEXT_CNTL); |
494 | |
495 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, |
496 | m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); |
497 | |
498 | end_jiffies = msecs_to_jiffies(2000) + jiffies; |
499 | while (true) { |
500 | data = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); |
501 | if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
502 | break; |
503 | if (time_after(jiffies, end_jiffies)) |
504 | return -ETIME; |
505 | usleep_range(500, 1000); |
506 | } |
507 | data = RREG32(sdmax_gfx_context_cntl); |
508 | data = REG_SET_FIELD(data, SDMA0_GFX_CONTEXT_CNTL, |
509 | RESUME_CTX, 0); |
510 | WREG32(sdmax_gfx_context_cntl, data); |
511 | |
512 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL_OFFSET, |
513 | m->sdmax_rlcx_doorbell_offset); |
514 | |
515 | data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, |
516 | ENABLE, 1); |
517 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, data); |
518 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR, m->sdmax_rlcx_rb_rptr); |
519 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI, |
520 | m->sdmax_rlcx_rb_rptr_hi); |
521 | |
522 | WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); |
523 | if (read_user_wptr(mm, wptr64, data64)) { |
524 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, |
525 | lower_32_bits(data64)); |
526 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, |
527 | upper_32_bits(data64)); |
528 | } else { |
529 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR, |
530 | m->sdmax_rlcx_rb_rptr); |
531 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_WPTR_HI, |
532 | m->sdmax_rlcx_rb_rptr_hi); |
533 | } |
534 | WREG32(sdma_base_addr + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); |
535 | |
536 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); |
537 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_BASE_HI, |
538 | m->sdmax_rlcx_rb_base_hi); |
539 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, |
540 | m->sdmax_rlcx_rb_rptr_addr_lo); |
541 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, |
542 | m->sdmax_rlcx_rb_rptr_addr_hi); |
543 | |
544 | data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, |
545 | RB_ENABLE, 1); |
546 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, data); |
547 | |
548 | return 0; |
549 | } |
550 | |
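/* Dump the SDMA RLC queue registers in four contiguous ranges; the
 * HQD_N_REGS count (19+6+7+10) mirrors the sizes of those ranges.
 */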
551 | static int kgd_hqd_sdma_dump(struct kgd_dev *kgd, |
552 | uint32_t engine_id, uint32_t queue_id, |
553 | uint32_t (**dump)[2], uint32_t *n_regs) |
554 | { |
555 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
556 | uint32_t sdma_base_addr = get_sdma_base_addr(adev, engine_id, queue_id); |
557 | uint32_t i = 0, reg; |
558 | #undef HQD_N_REGS |
559 | #define HQD_N_REGS (19+6+7+10) |
560 | |
561 | *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); |
562 | if (*dump == NULL) |
563 | return -ENOMEM; |
564 | |
565 | for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) |
566 | DUMP_REG(sdma_base_addr + reg); |
567 | for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) |
568 | DUMP_REG(sdma_base_addr + reg); |
569 | for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; |
570 | reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) |
571 | DUMP_REG(sdma_base_addr + reg); |
572 | for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; |
573 | reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) |
574 | DUMP_REG(sdma_base_addr + reg); |
575 | |
576 | WARN_ON_ONCE(i != HQD_N_REGS); |
577 | *n_regs = i; |
578 | |
579 | return 0; |
580 | } |
581 | |
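/* A queue slot is occupied if the HQD is active and its ring base
 * matches the queue address being checked.
 */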
582 | static bool kgd_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, |
583 | uint32_t pipe_id, uint32_t queue_id) |
584 | { |
585 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
586 | uint32_t act; |
587 | bool retval = false; |
588 | uint32_t low, high; |
589 | |
590 | acquire_queue(kgd, pipe_id, queue_id); |
591 | act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); |
592 | if (act) { |
593 | low = lower_32_bits(queue_address >> 8); |
594 | high = upper_32_bits(queue_address >> 8); |
595 | |
596 | if (low == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE)) && |
597 | high == RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_BASE_HI))) |
598 | retval = true; |
599 | } |
600 | release_queue(kgd); |
601 | return retval; |
602 | } |
603 | |
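/* An SDMA queue is considered occupied while its ring buffer is
 * enabled.
 */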
604 | static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd) |
605 | { |
606 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
607 | struct v9_sdma_mqd *m; |
608 | uint32_t sdma_base_addr; |
609 | uint32_t sdma_rlc_rb_cntl; |
610 | |
611 | m = get_sdma_mqd(mqd); |
612 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, |
613 | m->sdma_queue_id); |
614 | |
615 | sdma_rlc_rb_cntl = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); |
616 | |
617 | if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) |
618 | return true; |
619 | |
620 | return false; |
621 | } |
622 | |
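/* Preempt or reset a compute queue: write a dequeue request of the
 * requested type and poll CP_HQD_ACTIVE until the HQD goes idle or the
 * timeout (in ms) expires.
 */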
623 | static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, |
624 | enum kfd_preempt_type reset_type, |
625 | unsigned int utimeout, uint32_t pipe_id, |
626 | uint32_t queue_id) |
627 | { |
628 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
629 | enum hqd_dequeue_request_type type; |
630 | unsigned long end_jiffies; |
631 | uint32_t temp; |
632 | struct v9_mqd *m = get_mqd(mqd); |
633 | |
634 | if (adev->in_gpu_reset) |
635 | return -EIO; |
636 | |
637 | acquire_queue(kgd, pipe_id, queue_id); |
638 | |
639 | if (m->cp_hqd_vmid == 0) |
640 | WREG32_FIELD15(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); |
641 | |
642 | switch (reset_type) { |
643 | case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: |
644 | type = DRAIN_PIPE; |
645 | break; |
646 | case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: |
647 | type = RESET_WAVES; |
648 | break; |
649 | default: |
650 | type = DRAIN_PIPE; |
651 | break; |
652 | } |
653 | |
654 | WREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); |
655 | |
656 | end_jiffies = (utimeout * HZ / 1000) + jiffies; |
657 | while (true) { |
658 | temp = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); |
659 | if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) |
660 | break; |
661 | if (time_after(jiffies, end_jiffies)) { |
662 | pr_err("cp queue preemption time out.\n" ); |
663 | release_queue(kgd); |
664 | return -ETIME; |
665 | } |
666 | usleep_range(500, 1000); |
667 | } |
668 | |
669 | release_queue(kgd); |
670 | return 0; |
671 | } |
672 | |
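/* Stop an SDMA RLC queue: disable the ring buffer, wait for the
 * context to go idle, then save the current read pointer back into the
 * MQD so the queue can be restored later.
 */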
673 | static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, |
674 | unsigned int utimeout) |
675 | { |
676 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
677 | struct v9_sdma_mqd *m; |
678 | uint32_t sdma_base_addr; |
679 | uint32_t temp; |
680 | unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; |
681 | |
682 | m = get_sdma_mqd(mqd); |
683 | sdma_base_addr = get_sdma_base_addr(adev, m->sdma_engine_id, |
684 | m->sdma_queue_id); |
685 | |
686 | temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL); |
687 | temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; |
688 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, temp); |
689 | |
690 | while (true) { |
691 | temp = RREG32(sdma_base_addr + mmSDMA0_RLC0_CONTEXT_STATUS); |
692 | if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
693 | break; |
694 | if (time_after(jiffies, end_jiffies)) |
695 | return -ETIME; |
696 | usleep_range(500, 1000); |
697 | } |
698 | |
699 | WREG32(sdma_base_addr + mmSDMA0_RLC0_DOORBELL, 0); |
700 | WREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL, |
701 | RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_CNTL) | |
702 | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); |
703 | |
704 | m->sdmax_rlcx_rb_rptr = RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR); |
705 | m->sdmax_rlcx_rb_rptr_hi = |
706 | RREG32(sdma_base_addr + mmSDMA0_RLC0_RB_RPTR_HI); |
707 | |
708 | return 0; |
709 | } |
710 | |
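/* Helpers to query the ATC VMID-to-PASID mapping registers: one
 * returns the VALID bit, the other the mapped PASID value.
 */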
711 | static bool get_atc_vmid_pasid_mapping_valid(struct kgd_dev *kgd, |
712 | uint8_t vmid) |
713 | { |
714 | uint32_t reg; |
715 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
716 | |
717 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) |
718 | + vmid); |
719 | return reg & ATC_VMID0_PASID_MAPPING__VALID_MASK; |
720 | } |
721 | |
722 | static uint16_t get_atc_vmid_pasid_mapping_pasid(struct kgd_dev *kgd, |
723 | uint8_t vmid) |
724 | { |
725 | uint32_t reg; |
726 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
727 | |
728 | reg = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) |
729 | + vmid); |
730 | return reg & ATC_VMID0_PASID_MAPPING__PASID_MASK; |
731 | } |
732 | |
733 | static void write_vmid_invalidate_request(struct kgd_dev *kgd, uint8_t vmid) |
734 | { |
735 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
736 | |
737 | /* Use legacy mode tlb invalidation. |
738 | * |
739 | * Currently on Raven the code below is broken for anything but |
740 | * legacy mode due to a MMHUB power gating problem. A workaround |
741 | * is for MMHUB to wait until the condition PER_VMID_INVALIDATE_REQ |
742 | * == PER_VMID_INVALIDATE_ACK instead of simply waiting for the ack |
743 | * bit. |
744 | * |
745 | * TODO 1: agree on the right set of invalidation registers for |
746 | * KFD use. Use the last one for now. Invalidate both GC and |
747 | * MMHUB. |
748 | * |
	 * TODO 2: support range-based invalidation; requires a kfd2kgd
	 * interface change.
751 | */ |
752 | amdgpu_gmc_flush_gpu_tlb(adev, vmid, 0); |
753 | } |
754 | |
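/* Flush TLB entries for a PASID by submitting an INVALIDATE_TLBS
 * packet on the KIQ ring and polling for its fence with the device
 * timeout.
 */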
755 | static int invalidate_tlbs_with_kiq(struct amdgpu_device *adev, uint16_t pasid) |
756 | { |
757 | signed long r; |
758 | uint32_t seq; |
759 | struct amdgpu_ring *ring = &adev->gfx.kiq.ring; |
760 | |
761 | spin_lock(&adev->gfx.kiq.ring_lock); |
762 | amdgpu_ring_alloc(ring, 12); /* fence + invalidate_tlbs package*/ |
763 | amdgpu_ring_write(ring, PACKET3(PACKET3_INVALIDATE_TLBS, 0)); |
764 | amdgpu_ring_write(ring, |
765 | PACKET3_INVALIDATE_TLBS_DST_SEL(1) | |
766 | PACKET3_INVALIDATE_TLBS_ALL_HUB(1) | |
767 | PACKET3_INVALIDATE_TLBS_PASID(pasid) | |
768 | PACKET3_INVALIDATE_TLBS_FLUSH_TYPE(0)); /* legacy */ |
769 | amdgpu_fence_emit_polling(ring, &seq); |
770 | amdgpu_ring_commit(ring); |
771 | spin_unlock(&adev->gfx.kiq.ring_lock); |
772 | |
773 | r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); |
774 | if (r < 1) { |
775 | DRM_ERROR("wait for kiq fence error: %ld.\n" , r); |
776 | return -ETIME; |
777 | } |
778 | |
779 | return 0; |
780 | } |
781 | |
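/* Flush TLBs for a process: use the KIQ when its scheduler is ready,
 * otherwise fall back to MMIO by locating the VMID currently mapped to
 * the PASID and invalidating that VMID directly.
 */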
782 | static int invalidate_tlbs(struct kgd_dev *kgd, uint16_t pasid) |
783 | { |
784 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
785 | int vmid; |
786 | struct amdgpu_ring *ring = &adev->gfx.kiq.ring; |
787 | |
788 | if (adev->in_gpu_reset) |
789 | return -EIO; |
790 | |
791 | if (ring->sched.ready) |
792 | return invalidate_tlbs_with_kiq(adev, pasid); |
793 | |
794 | for (vmid = 0; vmid < 16; vmid++) { |
795 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) |
796 | continue; |
797 | if (get_atc_vmid_pasid_mapping_valid(kgd, vmid)) { |
798 | if (get_atc_vmid_pasid_mapping_pasid(kgd, vmid) |
799 | == pasid) { |
800 | write_vmid_invalidate_request(kgd, vmid); |
801 | break; |
802 | } |
803 | } |
804 | } |
805 | |
806 | return 0; |
807 | } |
808 | |
809 | static int invalidate_tlbs_vmid(struct kgd_dev *kgd, uint16_t vmid) |
810 | { |
811 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
812 | |
813 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { |
814 | pr_err("non kfd vmid %d\n" , vmid); |
815 | return 0; |
816 | } |
817 | |
818 | write_vmid_invalidate_request(kgd, vmid); |
819 | return 0; |
820 | } |
821 | |
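/* GFX9 does not implement the address watch interface here;
 * kgd_address_watch_disable, kgd_address_watch_execute and
 * kgd_address_watch_get_offset are no-op stubs.
 */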
822 | static int kgd_address_watch_disable(struct kgd_dev *kgd) |
823 | { |
824 | return 0; |
825 | } |
826 | |
827 | static int kgd_address_watch_execute(struct kgd_dev *kgd, |
828 | unsigned int watch_point_id, |
829 | uint32_t cntl_val, |
830 | uint32_t addr_hi, |
831 | uint32_t addr_lo) |
832 | { |
833 | return 0; |
834 | } |
835 | |
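/* Apply a wave control command: select the targets named by
 * gfx_index_val in GRBM_GFX_INDEX, issue the SQ_CMD, then restore
 * GRBM_GFX_INDEX to broadcast mode.
 */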
836 | static int kgd_wave_control_execute(struct kgd_dev *kgd, |
837 | uint32_t gfx_index_val, |
838 | uint32_t sq_cmd) |
839 | { |
840 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
841 | uint32_t data = 0; |
842 | |
843 | mutex_lock(&adev->grbm_idx_mutex); |
844 | |
845 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), gfx_index_val); |
846 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CMD), sq_cmd); |
847 | |
848 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
849 | INSTANCE_BROADCAST_WRITES, 1); |
850 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
851 | SH_BROADCAST_WRITES, 1); |
852 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
853 | SE_BROADCAST_WRITES, 1); |
854 | |
855 | WREG32(SOC15_REG_OFFSET(GC, 0, mmGRBM_GFX_INDEX), data); |
856 | mutex_unlock(&adev->grbm_idx_mutex); |
857 | |
858 | return 0; |
859 | } |
860 | |
861 | static uint32_t kgd_address_watch_get_offset(struct kgd_dev *kgd, |
862 | unsigned int watch_point_id, |
863 | unsigned int reg_offset) |
864 | { |
865 | return 0; |
866 | } |
867 | |
868 | static void set_scratch_backing_va(struct kgd_dev *kgd, |
869 | uint64_t va, uint32_t vmid) |
870 | { |
871 | /* No longer needed on GFXv9. The scratch base address is |
872 | * passed to the shader by the CP. It's the user mode driver's |
873 | * responsibility. |
874 | */ |
875 | } |
876 | |
877 | /* FIXME: Does this need to be ASIC-specific code? */ |
878 | static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type) |
879 | { |
880 | struct amdgpu_device *adev = (struct amdgpu_device *) kgd; |
881 | const union amdgpu_firmware_header *hdr; |
882 | |
883 | switch (type) { |
884 | case KGD_ENGINE_PFP: |
885 | hdr = (const union amdgpu_firmware_header *)adev->gfx.pfp_fw->data; |
886 | break; |
887 | |
888 | case KGD_ENGINE_ME: |
889 | hdr = (const union amdgpu_firmware_header *)adev->gfx.me_fw->data; |
890 | break; |
891 | |
892 | case KGD_ENGINE_CE: |
893 | hdr = (const union amdgpu_firmware_header *)adev->gfx.ce_fw->data; |
894 | break; |
895 | |
896 | case KGD_ENGINE_MEC1: |
897 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec_fw->data; |
898 | break; |
899 | |
900 | case KGD_ENGINE_MEC2: |
901 | hdr = (const union amdgpu_firmware_header *)adev->gfx.mec2_fw->data; |
902 | break; |
903 | |
904 | case KGD_ENGINE_RLC: |
905 | hdr = (const union amdgpu_firmware_header *)adev->gfx.rlc_fw->data; |
906 | break; |
907 | |
908 | case KGD_ENGINE_SDMA1: |
909 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[0].fw->data; |
910 | break; |
911 | |
912 | case KGD_ENGINE_SDMA2: |
913 | hdr = (const union amdgpu_firmware_header *)adev->sdma.instance[1].fw->data; |
914 | break; |
915 | |
916 | default: |
917 | return 0; |
918 | } |
919 | |
920 | if (hdr == NULL) |
921 | return 0; |
922 | |
	/* Only 12 bits in use */
924 | return hdr->common.ucode_version; |
925 | } |
926 | |
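/* Program the page table base for a KFD VMID into both the GFX and MM
 * hubs so that CP and SDMA accesses translate through the same page
 * tables.
 */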
927 | static void set_vm_context_page_table_base(struct kgd_dev *kgd, uint32_t vmid, |
928 | uint64_t page_table_base) |
929 | { |
930 | struct amdgpu_device *adev = get_amdgpu_device(kgd); |
931 | |
932 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { |
933 | pr_err("trying to set page table base for wrong VMID %u\n" , |
934 | vmid); |
935 | return; |
936 | } |
937 | |
938 | /* TODO: take advantage of per-process address space size. For |
939 | * now, all processes share the same address space size, like |
940 | * on GFX8 and older. |
941 | */ |
942 | mmhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base); |
943 | |
944 | gfxhub_v1_0_setup_vm_pt_regs(adev, vmid, page_table_base); |
945 | } |
946 | |