/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "vega10_enum.h"
#include "sdma0/sdma0_4_0_offset.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "sdma1/sdma1_4_0_offset.h"
#include "sdma1/sdma1_4_0_sh_mask.h"
#include "athub/athub_1_0_offset.h"
#include "athub/athub_1_0_sh_mask.h"
#include "oss/osssys_4_0_offset.h"
#include "oss/osssys_4_0_sh_mask.h"
#include "soc15_common.h"
#include "v9_structs.h"
#include "soc15.h"
#include "soc15d.h"
#include "gfx_v9_0.h"
#include "amdgpu_amdkfd_gfx_v9.h"
#include <uapi/linux/kfd_ioctl.h>

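/* Dequeue request types written to CP_HQD_DEQUEUE_REQUEST. The numeric
 * values are assumed to match the hardware encoding of the dequeue
 * request field: drain in-flight waves, reset (kill) them, or save
 * their state (CWSR) before the queue is unmapped.
 */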
enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

static void kgd_gfx_v9_lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
				 uint32_t queue, uint32_t vmid, uint32_t inst)
{
	mutex_lock(&adev->srbm_mutex);
	soc15_grbm_select(adev, mec, pipe, queue, vmid, GET_INST(GC, inst));
}

static void kgd_gfx_v9_unlock_srbm(struct amdgpu_device *adev, uint32_t inst)
{
	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
	mutex_unlock(&adev->srbm_mutex);
}

void kgd_gfx_v9_acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
			      uint32_t queue_id, uint32_t inst)
{
	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	kgd_gfx_v9_lock_srbm(adev, mec, pipe, queue_id, 0, inst);
}

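/* Build the single-bit queue mask used for CP_PQ_WPTR_POLL_CNTL1: queues
 * are numbered linearly across pipes, so with e.g. 8 queues per pipe,
 * pipe 1 queue 2 maps to bit 10.
 */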
uint64_t kgd_gfx_v9_get_queue_mask(struct amdgpu_device *adev,
				   uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}

void kgd_gfx_v9_release_queue(struct amdgpu_device *adev, uint32_t inst)
{
	kgd_gfx_v9_unlock_srbm(adev, inst);
}

void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases, uint32_t inst)
{
	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst);

	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSH_MEM_CONFIG), sh_mem_config);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	kgd_gfx_v9_unlock_srbm(adev, inst);
}

int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid,
				      unsigned int vmid, uint32_t inst)
{
	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
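	/* A pasid of 0 clears the mapping: the VALID bit stays unset. */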
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
					ATC_VMID0_PASID_MAPPING__VALID_MASK;

	/*
	 * need to do this twice, once for gfx and once for mmhub
	 * for ATC add 16 to VMID for mmhub, for IH different registers.
	 * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
	 */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << (vmid + 16))))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << (vmid + 16));

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
	       pasid_mapping);
	return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id,
			       uint32_t inst)
{
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	kgd_gfx_v9_lock_srbm(adev, mec, pipe, 0, 0, inst);

	WREG32_SOC15(GC, GET_INST(GC, inst), mmCPC_INT_CNTL,
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	kgd_gfx_v9_unlock_srbm(adev, inst);

	return 0;
}

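/* Compute the dword offset of a queue's RLC register block: start from the
 * per-engine base and add queue_id times the per-queue stride
 * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL). For example, engine 1
 * queue 2 resolves to the SDMA1 base plus two strides.
 */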
static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base = 0;
	uint32_t sdma_rlc_reg_offset;

	switch (engine_id) {
	default:
		dev_warn(adev->dev,
			 "Invalid sdma engine id (%d), using engine id 0\n",
			 engine_id);
		fallthrough;
	case 0:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	case 1:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
				mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	}

	sdma_rlc_reg_offset = sdma_engine_reg_base
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
		 queue_id, sdma_rlc_reg_offset);

	return sdma_rlc_reg_offset;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

int kgd_gfx_v9_hqd_load(struct amdgpu_device *adev, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t __user *wptr, uint32_t wptr_shift,
			uint32_t wptr_mask, struct mm_struct *mm,
			uint32_t inst)
{
	struct v9_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);

	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL),
				data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
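		/* CP_HQD_PQ_CONTROL.QUEUE_SIZE is assumed to hold
		 * log2(ring size in dwords) - 1, so "2 <<" below
		 * recovers the ring size in dwords.
		 */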
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uintptr_t)wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uintptr_t)wptr));
		WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_PQ_WPTR_POLL_CNTL1,
		       (uint32_t)kgd_gfx_v9_get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE), data);

	kgd_gfx_v9_release_queue(adev, inst);

	return 0;
}

int kgd_gfx_v9_hiq_mqd_load(struct amdgpu_device *adev, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off, uint32_t inst)
{
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq[inst].ring;
	struct v9_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq[inst].ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

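	/* MAP_QUEUES is 7 dwords total: the PACKET3 header plus six payload
	 * dwords, matching the ring space reserved above.
	 */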
	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /* queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
	kgd_gfx_v9_release_queue(adev, inst);

	return r;
}

int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs, uint32_t inst)
{
	uint32_t i = 0, reg;
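	/* Each dump entry is a (byte offset, value) pair; the dword register
	 * index is shifted left by 2 to form the byte offset.
	 */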
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	for (reg = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	kgd_gfx_v9_release_queue(adev, inst);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static int kgd_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

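	/* Disable the ring buffer first; the engine must drain to idle
	 * before the queue's registers can be safely reprogrammed.
	 */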
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
				m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)
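	/* The four terms above match the four register ranges dumped below. */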

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

bool kgd_gfx_v9_hqd_is_occupied(struct amdgpu_device *adev,
				uint64_t queue_address, uint32_t pipe_id,
				uint32_t queue_id, uint32_t inst)
{
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
	act = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
	if (act) {
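		/* CP_HQD_PQ_BASE holds the 256-byte-aligned queue address,
		 * hence the >> 8 before comparing.
		 */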
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE) &&
		    high == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI))
			retval = true;
	}
	kgd_gfx_v9_release_queue(adev, inst);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id, uint32_t inst)
{
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v9_mqd *m = get_mqd(mqd);

	if (amdgpu_in_reset(adev))
		return -EIO;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15_RLC(GC, GET_INST(GC, inst), RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE:
		type = SAVE_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			kgd_gfx_v9_release_queue(adev, inst);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	kgd_gfx_v9_release_queue(adev, inst);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
				unsigned int utimeout)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
					uint32_t gfx_index_val,
					uint32_t sq_cmd, uint32_t inst)
{
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, gfx_index_val);
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_CMD, sq_cmd);

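	/* Restore GRBM_GFX_INDEX to broadcast mode so later register writes
	 * are not targeted at a single SE/SH/instance.
	 */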
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SH_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

/*
 * GFX9 helper for wave launch stall requirements on debug trap setting.
 *
 * vmid:
 *   Target VMID to stall/unstall.
 *
 * stall:
 *   0-unstall wave launch (enable), 1-stall wave launch (disable).
 *   After wavefront launch has been stalled, allocated waves must drain from
 *   SPI in order for debug trap settings to take effect on those waves.
 *   This is roughly a ~96 clock cycle wait on SPI where a read on
 *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
 *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
 *
 *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
 *   because GFX9.4.1 cannot support multi-process debugging due to trap
 *   configuration and masking being limited to global scope. Always assume
 *   single process conditions.
 */
#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY	3
void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
					uint32_t vmid,
					bool stall)
{
	int i;
	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1))
		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
							stall ? 1 << vmid : 0);
	else
		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
							stall ? 1 : 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	if (!stall)
		return;

	for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
}

/*
 * restore_dbg_registers is ignored here but is a general interface requirement
 * for devices that support GFXOFF and where the RLC save/restore list
 * does not support hw registers for debugging i.e. the driver has to manually
 * initialize the debug mode registers after it has disabled GFX off during the
 * debug session.
 */
uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
				bool restore_dbg_registers,
				uint32_t vmid)
{
	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

/*
 * keep_trap_enabled is ignored here but is a general interface requirement
 * for devices that support multi-process debugging where the performance
 * overhead from trap temporary setup needs to be bypassed when the debug
 * session has ended.
 */
uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
					bool keep_trap_enabled,
					uint32_t vmid)
{
	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
					uint32_t trap_override,
					uint32_t *trap_mask_supported)
{
	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;

	/* The SPI_GDBG_TRAP_MASK register is global and affects all
	 * processes. Only allow OR-ing the address-watch bit, since
	 * this only affects processes under the debugger. Other bits
	 * should stay 0 to avoid the debugger interfering with other
	 * processes.
	 */
	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
		return -EINVAL;

	return 0;
}

uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
					     uint32_t vmid,
					     uint32_t trap_override,
					     uint32_t trap_mask_bits,
					     uint32_t trap_mask_request,
					     uint32_t *trap_mask_prev,
					     uint32_t kfd_dbg_cntl_prev)
{
	uint32_t data, wave_cntl_prev;

	mutex_lock(&adev->grbm_idx_mutex);

	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);

	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
		(*trap_mask_prev & ~trap_mask_request);

	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);

	/* We need to preserve wave launch mode stall settings. */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
					uint8_t wave_launch_mode,
					uint32_t vmid)
{
	uint32_t data = 0;
	bool is_mode_set = !!wave_launch_mode;

	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
		VMID_MASK, is_mode_set ? 1 << vmid : 0);
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
		MODE, is_mode_set ? wave_launch_mode : 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t watch_id,
					uint32_t watch_mode,
					uint32_t debug_vmid,
					uint32_t inst)
{
	uint32_t watch_address_high;
	uint32_t watch_address_low;
	uint32_t watch_address_cntl;

	watch_address_cntl = 0;

	watch_address_low = lower_32_bits(watch_address);
	watch_address_high = upper_32_bits(watch_address) & 0xffff;

	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VMID,
			debug_vmid);
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MODE,
			watch_mode);
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MASK,
			watch_address_mask >> 6);

	/* Turning off this watch point until we set all the registers */
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VALID,
			0);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_high);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_low);

	/* Enable the watch point */
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VALID,
			1);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	return 0;
}

uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
					uint32_t watch_id)
{
	uint32_t watch_address_cntl;

	watch_address_cntl = 0;

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	return 0;
}

/* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
 * The values read are:
 *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
 *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
 *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
 *     gws_wait_time            -- Wait Count for Global Wave Syncs.
 *     que_sleep_wait_time      -- Wait Count for Queue Sleep.
 *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
 *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
 *     deq_retry_wait_time      -- Wait Count for Dequeue Retry.
 */
void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
					uint32_t *wait_times,
					uint32_t inst)
{
	*wait_times = RREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
			mmCP_IQ_WAIT_TIME2));
}

void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
			uint32_t vmid, uint64_t page_table_base)
{
	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);

	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}

static void lock_spi_csq_mutexes(struct amdgpu_device *adev)
{
	mutex_lock(&adev->srbm_mutex);
	mutex_lock(&adev->grbm_idx_mutex);
}

static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
{
	mutex_unlock(&adev->grbm_idx_mutex);
	mutex_unlock(&adev->srbm_mutex);
}

/**
 * get_wave_count: Read device registers to get number of waves in flight for
 * a particular queue. The method also returns the VMID associated with the
 * queue.
 *
 * @adev: Handle of device whose registers are to be read
 * @queue_idx: Index of queue in the queue-map bit-field
 * @wave_cnt: Output parameter updated with number of waves in flight
 * @vmid: Output parameter updated with VMID of queue whose wave count
 *        is being collected
 * @inst: xcc's instance number on a multi-XCC setup
 */
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
		int *wave_cnt, int *vmid, uint32_t inst)
{
	int pipe_idx;
	int queue_slot;
	unsigned int reg_val;

	/*
	 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
	 * parameters to read out waves in flight. Get VMID if there are
	 * non-zero waves in flight.
	 */
	*vmid = 0xFF;
	*wave_cnt = 0;
	pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
	queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
	reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
			 queue_slot);
	*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
	if (*wave_cnt != 0)
		*vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
			 CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
}

/**
 * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each
 * shader engine and aggregates the number of waves that are in flight for the
 * process whose pasid is provided as a parameter. The process could have ZERO
 * or more queues running and submitting waves to compute units.
 *
 * @adev: Handle of device from which to get number of waves in flight
 * @pasid: Identifies the process for which this query call is invoked
 * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
 *                  belong to process with given pasid
 * @max_waves_per_cu: Output parameter updated with maximum number of waves
 *                    possible per Compute Unit
 * @inst: xcc's instance number on a multi-XCC setup
 *
 * Note: It's possible that the device has too many queues (oversubscription)
 * in which case a VMID could be remapped to a different PASID. This could lead
 * to an inaccurate wave count. Following is a high-level sequence:
 *    Time T1: vmid = getVmid(); vmid is associated with Pasid P1
 *    Time T2: pasid = getPasid(vmid); vmid is associated with Pasid P2
 * In the sequence above wave count obtained from time T1 will be incorrectly
 * lost or added to total wave count.
 *
 * The registers that provide the waves in flight are:
 *
 *   SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a
 *   queue is slotted, OFF if there is no queue. A process could have ZERO or
 *   more queues slotted and submitting waves to be run on compute units. Even
 *   when a queue is slotted it is possible that it has zero wave fronts in
 *   flight; this can happen when the queue is waiting on top-of-pipe events,
 *   e.g. a waitRegMem command.
 *
 * For each bit that is ON from above:
 *
 *   Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
 *   number of waves that are in flight for the queue at specified index. The
 *   index ranges from 0 to 7.
 *
 *   If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
 *   of the wave(s).
 *
 *   Determine if the VMID from the above step maps to the pasid provided as a
 *   parameter. If it matches, aggregate the wave count. A VMID that does not
 *   match the pasid is a normal condition, i.e. a device is expected to
 *   support multiple queues from multiple processes.
 *
 * Reading registers referenced above involves programming GRBM appropriately
 */
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
		int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
{
	int qidx;
	int vmid;
	int se_idx;
	int sh_idx;
	int se_cnt;
	int sh_cnt;
	int wave_cnt;
	int queue_map;
	int pasid_tmp;
	int max_queue_cnt;
	int vmid_wave_cnt = 0;
	DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);

	lock_spi_csq_mutexes(adev);
	soc15_grbm_select(adev, 1, 0, 0, 0, inst);

	/*
	 * Iterate through the shader engines and arrays of the device
	 * to get number of waves in flight
	 */
	bitmap_complement(cp_queue_bitmap, adev->gfx.mec_bitmap[0].queue_bitmap,
			  AMDGPU_MAX_QUEUES);
	max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
			adev->gfx.mec.num_queue_per_pipe;
	sh_cnt = adev->gfx.config.max_sh_per_se;
	se_cnt = adev->gfx.config.max_shader_engines;
	for (se_idx = 0; se_idx < se_cnt; se_idx++) {
		for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {

			amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
			queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);

			/*
			 * Assumption: queue map encodes following schema: four
			 * pipes per each micro-engine, with each pipe mapping
			 * eight queues. This schema is true for GFX9 devices
			 * and must be verified for newer device families
			 */
			for (qidx = 0; qidx < max_queue_cnt; qidx++) {

				/* Skip queues that are not associated with
				 * compute functions
				 */
				if (!test_bit(qidx, cp_queue_bitmap))
					continue;

				if (!(queue_map & (1 << qidx)))
					continue;

				/* Get number of waves in flight and aggregate them */
				get_wave_count(adev, qidx, &wave_cnt, &vmid,
						inst);
				if (wave_cnt != 0) {
					pasid_tmp =
					  RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
						 mmIH_VMID_0_LUT) + vmid);
					if (pasid_tmp == pasid)
						vmid_wave_cnt += wave_cnt;
				}
			}
		}
	}

	amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
	soc15_grbm_select(adev, 0, 0, 0, 0, inst);
	unlock_spi_csq_mutexes(adev);

	/* Update the output parameters and return */
	*pasid_wave_cnt = vmid_wave_cnt;
	*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
				adev->gfx.cu_info.max_waves_per_simd;
}

void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
		uint32_t wait_times,
		uint32_t grace_period,
		uint32_t *reg_offset,
		uint32_t *reg_data)
{
	*reg_data = wait_times;

	/*
	 * The CP cannot handle a 0 grace period input and will result in
	 * an infinite grace period being set so set to 1 to prevent this.
	 */
	if (grace_period == 0)
		grace_period = 1;

	*reg_data = REG_SET_FIELD(*reg_data,
			CP_IQ_WAIT_TIME2,
			SCH_WAVE,
			grace_period);

	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
}

void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, uint32_t inst)
{
	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst);

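	/* SQ_SHADER_TBA/TMA hold 256-byte-aligned addresses, hence the
	 * >> 8 shifts below.
	 */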
	/*
	 * Program TBA registers
	 */
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_LO,
			lower_32_bits(tba_addr >> 8));
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_HI,
			upper_32_bits(tba_addr >> 8));

	/*
	 * Program TMA registers
	 */
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_LO,
			lower_32_bits(tma_addr >> 8));
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_HI,
			upper_32_bits(tma_addr >> 8));

	kgd_gfx_v9_unlock_srbm(adev, inst);
}

const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
	.init_interrupts = kgd_gfx_v9_init_interrupts,
	.hqd_load = kgd_gfx_v9_hqd_load,
	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_gfx_v9_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
	.get_atc_vmid_pasid_mapping_info =
			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
	.set_address_watch = kgd_gfx_v9_set_address_watch,
	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
};