/*
 * Copyright 2014-2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_9_0_offset.h"
#include "gc/gc_9_0_sh_mask.h"
#include "vega10_enum.h"
#include "sdma0/sdma0_4_0_offset.h"
#include "sdma0/sdma0_4_0_sh_mask.h"
#include "sdma1/sdma1_4_0_offset.h"
#include "sdma1/sdma1_4_0_sh_mask.h"
#include "athub/athub_1_0_offset.h"
#include "athub/athub_1_0_sh_mask.h"
#include "oss/osssys_4_0_offset.h"
#include "oss/osssys_4_0_sh_mask.h"
#include "soc15_common.h"
#include "v9_structs.h"
#include "soc15.h"
#include "soc15d.h"
#include "gfx_v9_0.h"
#include "amdgpu_amdkfd_gfx_v9.h"
#include <uapi/linux/kfd_ioctl.h>

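/* Dequeue request types written to CP_HQD_DEQUEUE_REQUEST. The numeric
 * values are assumed to match the hardware encoding of the dequeue
 * request field: drain in-flight waves, reset (kill) them, or save
 * their state (CWSR) before the queue is unmapped.
 */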
enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

static void kgd_gfx_v9_lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
				 uint32_t queue, uint32_t vmid, uint32_t inst)
{
	mutex_lock(&adev->srbm_mutex);
	soc15_grbm_select(adev, mec, pipe, queue, vmid, GET_INST(GC, inst));
}

static void kgd_gfx_v9_unlock_srbm(struct amdgpu_device *adev, uint32_t inst)
{
	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
	mutex_unlock(&adev->srbm_mutex);
}

void kgd_gfx_v9_acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
			      uint32_t queue_id, uint32_t inst)
{
	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	kgd_gfx_v9_lock_srbm(adev, mec, pipe, queue_id, 0, inst);
}

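/* Build the single-bit queue mask used for CP_PQ_WPTR_POLL_CNTL1: queues
 * are numbered linearly across pipes, so with e.g. 8 queues per pipe,
 * pipe 1 queue 2 maps to bit 10.
 */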
uint64_t kgd_gfx_v9_get_queue_mask(struct amdgpu_device *adev,
				   uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}

void kgd_gfx_v9_release_queue(struct amdgpu_device *adev, uint32_t inst)
{
	kgd_gfx_v9_unlock_srbm(adev, inst);
}

void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases, uint32_t inst)
{
	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst);

	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSH_MEM_CONFIG), sh_mem_config);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSH_MEM_BASES), sh_mem_bases);
	/* APE1 no longer exists on GFX9 */

	kgd_gfx_v9_unlock_srbm(adev, inst);
}

int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid,
				      unsigned int vmid, uint32_t inst)
{
	/*
	 * We have to assume that there is no outstanding mapping.
	 * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because
	 * a mapping is in progress or because a mapping finished
	 * and the SW cleared it.
	 * So the protocol is to always wait & clear.
	 */
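	/* A pasid of 0 clears the mapping: the VALID bit stays unset. */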
	uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid |
					ATC_VMID0_PASID_MAPPING__VALID_MASK;

	/*
	 * need to do this twice, once for gfx and once for mmhub
	 * for ATC add 16 to VMID for mmhub, for IH different registers.
	 * ATC_VMID0..15 registers are separate from ATC_VMID16..31.
	 */

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << vmid)))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << vmid);

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid,
	       pasid_mapping);

	WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid,
	       pasid_mapping);

	while (!(RREG32(SOC15_REG_OFFSET(
				ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) &
		 (1U << (vmid + 16))))
		cpu_relax();

	WREG32(SOC15_REG_OFFSET(ATHUB, 0,
				mmATC_VMID_PASID_MAPPING_UPDATE_STATUS),
	       1U << (vmid + 16));

	/* Mapping vmid to pasid also for IH block */
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid,
	       pasid_mapping);
	return 0;
}

/* TODO - RING0 form of field is obsolete, seems to date back to SI
 * but still works
 */

int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id,
			       uint32_t inst)
{
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	kgd_gfx_v9_lock_srbm(adev, mec, pipe, 0, 0, inst);

	WREG32_SOC15(GC, GET_INST(GC, inst), mmCPC_INT_CNTL,
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	kgd_gfx_v9_unlock_srbm(adev, inst);

	return 0;
}

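/* Compute the dword offset of a queue's RLC register block: start from the
 * per-engine base and add queue_id times the per-queue stride
 * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL). For example, engine 1
 * queue 2 resolves to the SDMA1 base plus two strides.
 */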
static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
				unsigned int engine_id,
				unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base = 0;
	uint32_t sdma_rlc_reg_offset;

	switch (engine_id) {
	default:
		dev_warn(adev->dev,
			 "Invalid sdma engine id (%d), using engine id 0\n",
			 engine_id);
		fallthrough;
	case 0:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	case 1:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
				mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL;
		break;
	}

	sdma_rlc_reg_offset = sdma_engine_reg_base
		+ queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
		 queue_id, sdma_rlc_reg_offset);

	return sdma_rlc_reg_offset;
}

static inline struct v9_mqd *get_mqd(void *mqd)
{
	return (struct v9_mqd *)mqd;
}

static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v9_sdma_mqd *)mqd;
}

int kgd_gfx_v9_hqd_load(struct amdgpu_device *adev, void *mqd,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t __user *wptr, uint32_t wptr_shift,
			uint32_t wptr_mask, struct mm_struct *mm,
			uint32_t inst)
{
	struct v9_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++)
		WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);

	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL),
				data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
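		/* CP_HQD_PQ_CONTROL.QUEUE_SIZE is assumed to hold
		 * log2(ring size in dwords) - 1, so "2 <<" below
		 * recovers the ring size in dwords.
		 */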
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uintptr_t)wptr));
		WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uintptr_t)wptr));
		WREG32_SOC15(GC, GET_INST(GC, inst), mmCP_PQ_WPTR_POLL_CNTL1,
		       (uint32_t)kgd_gfx_v9_get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE), data);

	kgd_gfx_v9_release_queue(adev, inst);

	return 0;
}

int kgd_gfx_v9_hiq_mqd_load(struct amdgpu_device *adev, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off, uint32_t inst)
{
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq[inst].ring;
	struct v9_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq[inst].ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

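	/* MAP_QUEUES is 7 dwords total: the PACKET3 header plus six payload
	 * dwords, matching the ring space reserved above.
	 */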
	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /* queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
	kgd_gfx_v9_release_queue(adev, inst);

	return r;
}

int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs, uint32_t inst)
{
	uint32_t i = 0, reg;
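	/* Each dump entry is a (byte offset, value) pair; the dword register
	 * index is shifted left by 2 to form the byte offset.
	 */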
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	for (reg = SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	kgd_gfx_v9_release_queue(adev, inst);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

static int kgd_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

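	/* Disable the ring buffer first; the engine must drain to idle
	 * before the queue's registers can be safely reprogrammed.
	 */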
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR,
				m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI,
				m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI,
			m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO,
			m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI,
			m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data);

	return 0;
}

static int kgd_hqd_sdma_dump(struct amdgpu_device *adev,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (19+6+7+10)
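	/* The four terms above match the four register ranges dumped below. */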

	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN;
	     reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = mmSDMA0_RLC0_MIDCMD_DATA0;
	     reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

bool kgd_gfx_v9_hqd_is_occupied(struct amdgpu_device *adev,
				uint64_t queue_address, uint32_t pipe_id,
				uint32_t queue_id, uint32_t inst)
{
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
	act = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
	if (act) {
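		/* CP_HQD_PQ_BASE holds the 256-byte-aligned queue address,
		 * hence the >> 8 before comparing.
		 */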
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE) &&
		    high == RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_BASE_HI))
			retval = true;
	}
	kgd_gfx_v9_release_queue(adev, inst);
	return retval;
}

static bool kgd_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd,
				enum kfd_preempt_type reset_type,
				unsigned int utimeout, uint32_t pipe_id,
				uint32_t queue_id, uint32_t inst)
{
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v9_mqd *m = get_mqd(mqd);

	if (amdgpu_in_reset(adev))
		return -EIO;

	kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15_RLC(GC, GET_INST(GC, inst), RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE:
		type = SAVE_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32_RLC(SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_ACTIVE);
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue preemption time out.\n");
			kgd_gfx_v9_release_queue(adev, inst);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	kgd_gfx_v9_release_queue(adev, inst);
	return 0;
}

static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
				unsigned int utimeout)
{
	struct v9_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL);
	temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS);
		if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL,
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) |
		SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI);

	return 0;
}

bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
					uint8_t vmid, uint16_t *p_pasid)
{
	uint32_t value;

	value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING)
		     + vmid);
	*p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK;

	return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK);
}

int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev,
					uint32_t gfx_index_val,
					uint32_t sq_cmd, uint32_t inst)
{
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, gfx_index_val);
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_CMD, sq_cmd);

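	/* Restore GRBM_GFX_INDEX to broadcast mode so later register writes
	 * are not targeted at a single SE/SH/instance.
	 */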
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SH_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32_SOC15_RLC_SHADOW(GC, GET_INST(GC, inst), mmGRBM_GFX_INDEX, data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

/*
 * GFX9 helper for wave launch stall requirements on debug trap setting.
 *
 * vmid:
 *   Target VMID to stall/unstall.
 *
 * stall:
 *   0-unstall wave launch (enable), 1-stall wave launch (disable).
 *   After wavefront launch has been stalled, allocated waves must drain from
 *   SPI in order for debug trap settings to take effect on those waves.
 *   This is roughly a ~96 clock cycle wait on SPI where a read on
 *   SPI_GDBG_WAVE_CNTL translates to ~32 clock cycles.
 *   KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY indicates the number of reads required.
 *
 *   NOTE: We can afford to clear the entire STALL_VMID field on unstall
 *   because GFX9.4.1 cannot support multi-process debugging due to trap
 *   configuration and masking being limited to global scope. Always assume
 *   single process conditions.
 */
#define KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY	3
void kgd_gfx_v9_set_wave_launch_stall(struct amdgpu_device *adev,
					uint32_t vmid,
					bool stall)
{
	int i;
	uint32_t data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));

	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 1))
		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_VMID,
							stall ? 1 << vmid : 0);
	else
		data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL, STALL_RA,
							stall ? 1 : 0);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), data);

	if (!stall)
		return;

	for (i = 0; i < KGD_GFX_V9_WAVE_LAUNCH_SPI_DRAIN_LATENCY; i++)
		RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));
}

/*
 * restore_dbg_registers is ignored here but is a general interface requirement
 * for devices that support GFXOFF and where the RLC save/restore list
 * does not support hw registers for debugging i.e. the driver has to manually
 * initialize the debug mode registers after it has disabled GFX off during the
 * debug session.
 */
uint32_t kgd_gfx_v9_enable_debug_trap(struct amdgpu_device *adev,
				bool restore_dbg_registers,
				uint32_t vmid)
{
	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

/*
 * keep_trap_enabled is ignored here but is a general interface requirement
 * for devices that support multi-process debugging where the performance
 * overhead from trap temporary setup needs to be bypassed when the debug
 * session has ended.
 */
uint32_t kgd_gfx_v9_disable_debug_trap(struct amdgpu_device *adev,
					bool keep_trap_enabled,
					uint32_t vmid)
{
	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

int kgd_gfx_v9_validate_trap_override_request(struct amdgpu_device *adev,
					uint32_t trap_override,
					uint32_t *trap_mask_supported)
{
	*trap_mask_supported &= KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH;

	/* The SPI_GDBG_TRAP_MASK register is global and affects all
	 * processes. Only allow OR-ing the address-watch bit, since
	 * this only affects processes under the debugger. Other bits
	 * should stay 0 to avoid the debugger interfering with other
	 * processes.
	 */
	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR)
		return -EINVAL;

	return 0;
}

uint32_t kgd_gfx_v9_set_wave_launch_trap_override(struct amdgpu_device *adev,
					     uint32_t vmid,
					     uint32_t trap_override,
					     uint32_t trap_mask_bits,
					     uint32_t trap_mask_request,
					     uint32_t *trap_mask_prev,
					     uint32_t kfd_dbg_cntl_prev)
{
	uint32_t data, wave_cntl_prev;

	mutex_lock(&adev->grbm_idx_mutex);

	wave_cntl_prev = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL));

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK));
	*trap_mask_prev = REG_GET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN);

	trap_mask_bits = (trap_mask_bits & trap_mask_request) |
		(*trap_mask_prev & ~trap_mask_request);

	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, EXCP_EN, trap_mask_bits);
	data = REG_SET_FIELD(data, SPI_GDBG_TRAP_MASK, REPLACE, trap_override);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), data);

	/* We need to preserve wave launch mode stall settings. */
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL), wave_cntl_prev);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

uint32_t kgd_gfx_v9_set_wave_launch_mode(struct amdgpu_device *adev,
					uint8_t wave_launch_mode,
					uint32_t vmid)
{
	uint32_t data = 0;
	bool is_mode_set = !!wave_launch_mode;

	mutex_lock(&adev->grbm_idx_mutex);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
		VMID_MASK, is_mode_set ? 1 << vmid : 0);
	data = REG_SET_FIELD(data, SPI_GDBG_WAVE_CNTL2,
		MODE, is_mode_set ? wave_launch_mode : 0);
	WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_WAVE_CNTL2), data);

	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

#define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H)
uint32_t kgd_gfx_v9_set_address_watch(struct amdgpu_device *adev,
					uint64_t watch_address,
					uint32_t watch_address_mask,
					uint32_t watch_id,
					uint32_t watch_mode,
					uint32_t debug_vmid,
					uint32_t inst)
{
	uint32_t watch_address_high;
	uint32_t watch_address_low;
	uint32_t watch_address_cntl;

	watch_address_cntl = 0;

	watch_address_low = lower_32_bits(watch_address);
	watch_address_high = upper_32_bits(watch_address) & 0xffff;

	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VMID,
			debug_vmid);
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MODE,
			watch_mode);
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MASK,
			watch_address_mask >> 6);

	/* Turning off this watch point until we set all the registers */
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VALID,
			0);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_high);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_low);

	/* Enable the watch point */
	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VALID,
			1);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	return 0;
}

uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
					uint32_t watch_id)
{
	uint32_t watch_address_cntl;

	watch_address_cntl = 0;

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_cntl);

	return 0;
}

/* kgd_gfx_v9_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values
 * The values read are:
 *     ib_offload_wait_time     -- Wait Count for Indirect Buffer Offloads.
 *     atomic_offload_wait_time -- Wait Count for L2 and GDS Atomics Offloads.
 *     wrm_offload_wait_time    -- Wait Count for WAIT_REG_MEM Offloads.
 *     gws_wait_time            -- Wait Count for Global Wave Syncs.
 *     que_sleep_wait_time      -- Wait Count for Queue Sleep.
 *     sch_wave_wait_time       -- Wait Count for Scheduling Wave Message.
 *     sem_rearm_wait_time      -- Wait Count for Semaphore re-arm.
 *     deq_retry_wait_time      -- Wait Count for Dequeue Retry.
 */
void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
					uint32_t *wait_times,
					uint32_t inst)
{
	*wait_times = RREG32(SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
			mmCP_IQ_WAIT_TIME2));
}

void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
			uint32_t vmid, uint64_t page_table_base)
{
	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);

	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}

static void lock_spi_csq_mutexes(struct amdgpu_device *adev)
{
	mutex_lock(&adev->srbm_mutex);
	mutex_lock(&adev->grbm_idx_mutex);
}

static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
{
	mutex_unlock(&adev->grbm_idx_mutex);
	mutex_unlock(&adev->srbm_mutex);
}

/**
 * get_wave_count: Read device registers to get number of waves in flight for
 * a particular queue. The method also returns the VMID associated with the
 * queue.
 *
 * @adev: Handle of device whose registers are to be read
 * @queue_idx: Index of queue in the queue-map bit-field
 * @wave_cnt: Output parameter updated with number of waves in flight
 * @vmid: Output parameter updated with VMID of queue whose wave count
 *        is being collected
 * @inst: xcc's instance number on a multi-XCC setup
 */
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
		int *wave_cnt, int *vmid, uint32_t inst)
{
	int pipe_idx;
	int queue_slot;
	unsigned int reg_val;

	/*
	 * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
	 * parameters to read out waves in flight. Get VMID if there are
	 * non-zero waves in flight.
	 */
	*vmid = 0xFF;
	*wave_cnt = 0;
	pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
	queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
	reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
			 queue_slot);
	*wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
	if (*wave_cnt != 0)
		*vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
			 CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
}

/**
 * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each
 * shader engine and aggregates the number of waves that are in flight for the
 * process whose pasid is provided as a parameter. The process could have ZERO
 * or more queues running and submitting waves to compute units.
 *
 * @adev: Handle of device from which to get number of waves in flight
 * @pasid: Identifies the process for which this query call is invoked
 * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
 *                  belong to process with given pasid
 * @max_waves_per_cu: Output parameter updated with maximum number of waves
 *                    possible per Compute Unit
 * @inst: xcc's instance number on a multi-XCC setup
 *
 * Note: It's possible that the device has too many queues (oversubscription)
 * in which case a VMID could be remapped to a different PASID. This could lead
 * to an inaccurate wave count. Following is a high-level sequence:
 *    Time T1: vmid = getVmid(); vmid is associated with Pasid P1
 *    Time T2: pasid = getPasid(vmid); vmid is associated with Pasid P2
 * In the sequence above wave count obtained from time T1 will be incorrectly
 * lost or added to total wave count.
 *
 * The registers that provide the waves in flight are:
 *
 *   SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a
 *   queue is slotted, OFF if there is no queue. A process could have ZERO or
 *   more queues slotted and submitting waves to be run on compute units. Even
 *   when a queue is slotted it is possible that it has zero wave fronts in
 *   flight; this can happen when the queue is waiting on top-of-pipe events,
 *   e.g. a waitRegMem command.
 *
 * For each bit that is ON from above:
 *
 *   Read (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
 *   number of waves that are in flight for the queue at specified index. The
 *   index ranges from 0 to 7.
 *
 *   If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
 *   of the wave(s).
 *
 *   Determine if the VMID from the above step maps to the pasid provided as a
 *   parameter. If it matches, aggregate the wave count. A VMID that does not
 *   match the pasid is a normal condition, i.e. a device is expected to
 *   support multiple queues from multiple processes.
 *
 * Reading registers referenced above involves programming GRBM appropriately
 */
void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
		int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
{
	int qidx;
	int vmid;
	int se_idx;
	int sh_idx;
	int se_cnt;
	int sh_cnt;
	int wave_cnt;
	int queue_map;
	int pasid_tmp;
	int max_queue_cnt;
	int vmid_wave_cnt = 0;
	DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);

	lock_spi_csq_mutexes(adev);
	soc15_grbm_select(adev, 1, 0, 0, 0, inst);

	/*
	 * Iterate through the shader engines and arrays of the device
	 * to get number of waves in flight
	 */
	bitmap_complement(cp_queue_bitmap, adev->gfx.mec_bitmap[0].queue_bitmap,
			  AMDGPU_MAX_QUEUES);
	max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
			adev->gfx.mec.num_queue_per_pipe;
	sh_cnt = adev->gfx.config.max_sh_per_se;
	se_cnt = adev->gfx.config.max_shader_engines;
	for (se_idx = 0; se_idx < se_cnt; se_idx++) {
		for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {

			amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
			queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);

			/*
			 * Assumption: queue map encodes following schema: four
			 * pipes per each micro-engine, with each pipe mapping
			 * eight queues. This schema is true for GFX9 devices
			 * and must be verified for newer device families
			 */
			for (qidx = 0; qidx < max_queue_cnt; qidx++) {

				/* Skip queues that are not associated with
				 * compute functions
				 */
				if (!test_bit(qidx, cp_queue_bitmap))
					continue;

				if (!(queue_map & (1 << qidx)))
					continue;

				/* Get number of waves in flight and aggregate them */
				get_wave_count(adev, qidx, &wave_cnt, &vmid,
						inst);
				if (wave_cnt != 0) {
					pasid_tmp =
					  RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
						 mmIH_VMID_0_LUT) + vmid);
					if (pasid_tmp == pasid)
						vmid_wave_cnt += wave_cnt;
				}
			}
		}
	}

	amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
	soc15_grbm_select(adev, 0, 0, 0, 0, inst);
	unlock_spi_csq_mutexes(adev);

	/* Update the output parameters and return */
	*pasid_wave_cnt = vmid_wave_cnt;
	*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
				adev->gfx.cu_info.max_waves_per_simd;
}

void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
		uint32_t wait_times,
		uint32_t grace_period,
		uint32_t *reg_offset,
		uint32_t *reg_data)
{
	*reg_data = wait_times;

	/*
	 * The CP cannot handle a 0 grace period input and will result in
	 * an infinite grace period being set so set to 1 to prevent this.
	 */
	if (grace_period == 0)
		grace_period = 1;

	*reg_data = REG_SET_FIELD(*reg_data,
			CP_IQ_WAIT_TIME2,
			SCH_WAVE,
			grace_period);

	*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
}

void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
		uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, uint32_t inst)
{
	kgd_gfx_v9_lock_srbm(adev, 0, 0, 0, vmid, inst);

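	/* SQ_SHADER_TBA/TMA hold 256-byte-aligned addresses, hence the
	 * >> 8 shifts below.
	 */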
	/*
	 * Program TBA registers
	 */
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_LO,
			lower_32_bits(tba_addr >> 8));
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TBA_HI,
			upper_32_bits(tba_addr >> 8));

	/*
	 * Program TMA registers
	 */
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_LO,
			lower_32_bits(tma_addr >> 8));
	WREG32_SOC15(GC, GET_INST(GC, inst), mmSQ_SHADER_TMA_HI,
			upper_32_bits(tma_addr >> 8));

	kgd_gfx_v9_unlock_srbm(adev, inst);
}

const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
	.program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings,
	.set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
	.init_interrupts = kgd_gfx_v9_init_interrupts,
	.hqd_load = kgd_gfx_v9_hqd_load,
	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
	.hqd_sdma_load = kgd_hqd_sdma_load,
	.hqd_dump = kgd_gfx_v9_hqd_dump,
	.hqd_sdma_dump = kgd_hqd_sdma_dump,
	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
	.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
	.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
	.get_atc_vmid_pasid_mapping_info =
			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
	.set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base,
	.enable_debug_trap = kgd_gfx_v9_enable_debug_trap,
	.disable_debug_trap = kgd_gfx_v9_disable_debug_trap,
	.validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request,
	.set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override,
	.set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode,
	.set_address_watch = kgd_gfx_v9_set_address_watch,
	.clear_address_watch = kgd_gfx_v9_clear_address_watch,
	.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
	.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
	.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
	.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
};