1 | //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert wait instructions for memory reads and writes. |
11 | /// |
12 | /// Memory reads and writes are issued asynchronously, so we need to insert |
13 | /// S_WAITCNT instructions when we want to access any of their results or |
14 | /// overwrite any register that's used asynchronously. |
15 | /// |
16 | /// TODO: This pass currently keeps one timeline per hardware counter. A more |
17 | /// finely-grained approach that keeps one timeline per event type could |
18 | /// sometimes get away with generating weaker s_waitcnt instructions. For |
19 | /// example, when both SMEM and LDS are in flight and we need to wait for |
20 | /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, |
21 | /// but the pass will currently generate a conservative lgkmcnt(0) because |
22 | /// multiple event types are in flight. |
23 | // |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "AMDGPU.h" |
27 | #include "GCNSubtarget.h" |
28 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
29 | #include "SIMachineFunctionInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/MapVector.h" |
32 | #include "llvm/ADT/PostOrderIterator.h" |
33 | #include "llvm/ADT/Sequence.h" |
34 | #include "llvm/Analysis/AliasAnalysis.h" |
35 | #include "llvm/CodeGen/MachineLoopInfo.h" |
36 | #include "llvm/CodeGen/MachinePostDominators.h" |
37 | #include "llvm/InitializePasses.h" |
38 | #include "llvm/Support/DebugCounter.h" |
39 | #include "llvm/TargetParser/TargetParser.h" |
40 | using namespace llvm; |
41 | |
42 | #define DEBUG_TYPE "si-insert-waitcnts" |
43 | |
44 | DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp" , |
45 | "Force emit s_waitcnt expcnt(0) instrs" ); |
46 | DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm" , |
47 | "Force emit s_waitcnt lgkmcnt(0) instrs" ); |
48 | DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm" , |
49 | "Force emit s_waitcnt vmcnt(0) instrs" ); |
50 | |
51 | static cl::opt<bool> ForceEmitZeroFlag( |
52 | "amdgpu-waitcnt-forcezero" , |
53 | cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)" ), |
54 | cl::init(Val: false), cl::Hidden); |
55 | |
56 | namespace { |
57 | // Class of object that encapsulates latest instruction counter score |
58 | // associated with the operand. Used for determining whether |
59 | // s_waitcnt instruction needs to be emitted. |
60 | |
// Hardware wait counters, in the order used to index the scoring tables
// below. The first four exist on all generations; SAMPLE/BVH/KM split out
// of the older counters on gfx12+.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Export counter.
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
73 | } // namespace |
74 | |
75 | namespace llvm { |
// Mark InstCounterType as iterable so llvm::enum_seq() (used by
// inst_counter_types() below) can generate ranges over it.
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
79 | } // namespace llvm |
80 | |
81 | namespace { |
82 | // Return an iterator over all counters between LOAD_CNT (the first counter) |
83 | // and \c MaxCounter (exclusive, default value yields an enumeration over |
84 | // all counters). |
85 | auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { |
86 | return enum_seq(Begin: LOAD_CNT, End: MaxCounter); |
87 | } |
88 | |
89 | using RegInterval = std::pair<int, int>; |
90 | |
// Per-subtarget maximum value each wait counter can hold; queried through
// WaitcntBrackets::getWaitCountMax().
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
100 | |
// Hardware encoding bounds of the VGPR and SGPR ranges; used by
// WaitcntBrackets::getRegInterval() to map an operand's encoding to a
// scoreboard slot (see the range asserts there).
struct RegisterEncoding {
  unsigned VGPR0; // Encoding of the first VGPR.
  unsigned VGPRL; // Encoding of the last VGPR.
  unsigned SGPR0; // Encoding of the first SGPR.
  unsigned SGPRL; // Encoding of the last SGPR.
};
107 | |
// Events that make a counter pending. eventCounter() maps each of these to
// the InstCounterType it increments via the per-generation mask tables
// returned by WaitcntGenerator::getWaitEventMask().
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
127 | |
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  // NOTE(review): the two enumerator names below were missing from the
  // source as received; NUM_EXTRA_VGPRS is certain (referenced later in this
  // file), EXTRA_VGPR_LDS is reconstructed — confirm against upstream.
  NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
147 | |
148 | // Enumerate different types of result-returning VMEM operations. Although |
149 | // s_waitcnt orders them all with a single vmcnt counter, in the absence of |
150 | // s_waitcnt only instructions of the same VmemType are guaranteed to write |
151 | // their results in order -- so there is no need to insert an s_waitcnt between |
152 | // two instructions of the same type that write the same vgpr. |
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types (array-size sentinel, not a real type).
  NUM_VMEM_TYPES
};
162 | |
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true. Entry order must match the InstCounterType enum, which
// indexes this array.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT};
170 | |
171 | static bool updateVMCntOnly(const MachineInstr &Inst) { |
172 | return SIInstrInfo::isVMEM(MI: Inst) || SIInstrInfo::isFLATGlobal(MI: Inst) || |
173 | SIInstrInfo::isFLATScratch(MI: Inst); |
174 | } |
175 | |
#ifndef NDEBUG
// True when only the four pre-gfx12 ("normal") counters are in use.
// Compiled only in asserts builds (guarded by NDEBUG), so it is intended
// for assertions.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
181 | |
182 | VmemType getVmemType(const MachineInstr &Inst) { |
183 | assert(updateVMCntOnly(Inst)); |
184 | if (!SIInstrInfo::isMIMG(MI: Inst) && !SIInstrInfo::isVIMAGE(MI: Inst) && |
185 | !SIInstrInfo::isVSAMPLE(MI: Inst)) |
186 | return VMEM_NOSAMPLER; |
187 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode()); |
188 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
189 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode); |
190 | return BaseInfo->BVH ? VMEM_BVH |
191 | : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; |
192 | } |
193 | |
194 | unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
195 | switch (T) { |
196 | case LOAD_CNT: |
197 | return Wait.LoadCnt; |
198 | case EXP_CNT: |
199 | return Wait.ExpCnt; |
200 | case DS_CNT: |
201 | return Wait.DsCnt; |
202 | case STORE_CNT: |
203 | return Wait.StoreCnt; |
204 | case SAMPLE_CNT: |
205 | return Wait.SampleCnt; |
206 | case BVH_CNT: |
207 | return Wait.BvhCnt; |
208 | case KM_CNT: |
209 | return Wait.KmCnt; |
210 | default: |
211 | llvm_unreachable("bad InstCounterType" ); |
212 | } |
213 | } |
214 | |
215 | void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { |
216 | unsigned &WC = getCounterRef(Wait, T); |
217 | WC = std::min(a: WC, b: Count); |
218 | } |
219 | |
220 | void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
221 | getCounterRef(Wait, T) = ~0u; |
222 | } |
223 | |
224 | unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
225 | return getCounterRef(Wait, T); |
226 | } |
227 | |
228 | // Mapping from event to counter according to the table masks. |
229 | InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { |
230 | for (auto T : inst_counter_types()) { |
231 | if (masks[T] & (1 << E)) |
232 | return T; |
233 | } |
234 | llvm_unreachable("event type has no associated counter" ); |
235 | } |
236 | |
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
245 | class WaitcntBrackets { |
246 | public: |
247 | WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, |
248 | HardwareLimits Limits, RegisterEncoding Encoding, |
249 | const unsigned *WaitEventMaskForInst, |
250 | InstCounterType SmemAccessCounter) |
251 | : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), |
252 | Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), |
253 | SmemAccessCounter(SmemAccessCounter) {} |
254 | |
255 | unsigned getWaitCountMax(InstCounterType T) const { |
256 | switch (T) { |
257 | case LOAD_CNT: |
258 | return Limits.LoadcntMax; |
259 | case DS_CNT: |
260 | return Limits.DscntMax; |
261 | case EXP_CNT: |
262 | return Limits.ExpcntMax; |
263 | case STORE_CNT: |
264 | return Limits.StorecntMax; |
265 | case SAMPLE_CNT: |
266 | return Limits.SamplecntMax; |
267 | case BVH_CNT: |
268 | return Limits.BvhcntMax; |
269 | case KM_CNT: |
270 | return Limits.KmcntMax; |
271 | default: |
272 | break; |
273 | } |
274 | return 0; |
275 | } |
276 | |
277 | unsigned getScoreLB(InstCounterType T) const { |
278 | assert(T < NUM_INST_CNTS); |
279 | return ScoreLBs[T]; |
280 | } |
281 | |
282 | unsigned getScoreUB(InstCounterType T) const { |
283 | assert(T < NUM_INST_CNTS); |
284 | return ScoreUBs[T]; |
285 | } |
286 | |
287 | unsigned getScoreRange(InstCounterType T) const { |
288 | return getScoreUB(T) - getScoreLB(T); |
289 | } |
290 | |
291 | unsigned getRegScore(int GprNo, InstCounterType T) const { |
292 | if (GprNo < NUM_ALL_VGPRS) { |
293 | return VgprScores[T][GprNo]; |
294 | } |
295 | assert(T == SmemAccessCounter); |
296 | return SgprScores[GprNo - NUM_ALL_VGPRS]; |
297 | } |
298 | |
299 | bool merge(const WaitcntBrackets &Other); |
300 | |
301 | RegInterval getRegInterval(const MachineInstr *MI, |
302 | const MachineRegisterInfo *MRI, |
303 | const SIRegisterInfo *TRI, unsigned OpNo) const; |
304 | |
305 | bool counterOutOfOrder(InstCounterType T) const; |
306 | void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; |
307 | void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; |
308 | void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; |
309 | void applyWaitcnt(const AMDGPU::Waitcnt &Wait); |
310 | void applyWaitcnt(InstCounterType T, unsigned Count); |
311 | void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, |
312 | const MachineRegisterInfo *MRI, WaitEventType E, |
313 | MachineInstr &MI); |
314 | |
315 | unsigned hasPendingEvent() const { return PendingEvents; } |
316 | unsigned hasPendingEvent(WaitEventType E) const { |
317 | return PendingEvents & (1 << E); |
318 | } |
319 | unsigned hasPendingEvent(InstCounterType T) const { |
320 | unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; |
321 | assert((HasPending != 0) == (getScoreRange(T) != 0)); |
322 | return HasPending; |
323 | } |
324 | |
325 | bool hasMixedPendingEvents(InstCounterType T) const { |
326 | unsigned Events = hasPendingEvent(T); |
327 | // Return true if more than one bit is set in Events. |
328 | return Events & (Events - 1); |
329 | } |
330 | |
331 | bool hasPendingFlat() const { |
332 | return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && |
333 | LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || |
334 | (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && |
335 | LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); |
336 | } |
337 | |
338 | void setPendingFlat() { |
339 | LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; |
340 | LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; |
341 | } |
342 | |
343 | // Return true if there might be pending writes to the specified vgpr by VMEM |
344 | // instructions with types different from V. |
345 | bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { |
346 | assert(GprNo < NUM_ALL_VGPRS); |
347 | return VgprVmemTypes[GprNo] & ~(1 << V); |
348 | } |
349 | |
350 | void clearVgprVmemTypes(int GprNo) { |
351 | assert(GprNo < NUM_ALL_VGPRS); |
352 | VgprVmemTypes[GprNo] = 0; |
353 | } |
354 | |
355 | void setStateOnFunctionEntryOrReturn() { |
356 | setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT)); |
357 | PendingEvents |= WaitEventMaskForInst[STORE_CNT]; |
358 | } |
359 | |
360 | ArrayRef<const MachineInstr *> getLDSDMAStores() const { |
361 | return LDSDMAStores; |
362 | } |
363 | |
364 | void print(raw_ostream &); |
365 | void dump() { print(dbgs()); } |
366 | |
367 | private: |
368 | struct MergeInfo { |
369 | unsigned OldLB; |
370 | unsigned OtherLB; |
371 | unsigned MyShift; |
372 | unsigned OtherShift; |
373 | }; |
374 | static bool mergeScore(const MergeInfo &M, unsigned &Score, |
375 | unsigned OtherScore); |
376 | |
377 | void setScoreLB(InstCounterType T, unsigned Val) { |
378 | assert(T < NUM_INST_CNTS); |
379 | ScoreLBs[T] = Val; |
380 | } |
381 | |
382 | void setScoreUB(InstCounterType T, unsigned Val) { |
383 | assert(T < NUM_INST_CNTS); |
384 | ScoreUBs[T] = Val; |
385 | |
386 | if (T != EXP_CNT) |
387 | return; |
388 | |
389 | if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT)) |
390 | ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT); |
391 | } |
392 | |
393 | void setRegScore(int GprNo, InstCounterType T, unsigned Val) { |
394 | if (GprNo < NUM_ALL_VGPRS) { |
395 | VgprUB = std::max(a: VgprUB, b: GprNo); |
396 | VgprScores[T][GprNo] = Val; |
397 | } else { |
398 | assert(T == SmemAccessCounter); |
399 | SgprUB = std::max(a: SgprUB, b: GprNo - NUM_ALL_VGPRS); |
400 | SgprScores[GprNo - NUM_ALL_VGPRS] = Val; |
401 | } |
402 | } |
403 | |
404 | void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, |
405 | const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, |
406 | unsigned OpNo, unsigned Val); |
407 | |
408 | const GCNSubtarget *ST = nullptr; |
409 | InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; |
410 | HardwareLimits Limits = {}; |
411 | RegisterEncoding Encoding = {}; |
412 | const unsigned *WaitEventMaskForInst; |
413 | InstCounterType SmemAccessCounter; |
414 | unsigned ScoreLBs[NUM_INST_CNTS] = {0}; |
415 | unsigned ScoreUBs[NUM_INST_CNTS] = {0}; |
416 | unsigned PendingEvents = 0; |
417 | // Remember the last flat memory operation. |
418 | unsigned LastFlat[NUM_INST_CNTS] = {0}; |
419 | // wait_cnt scores for every vgpr. |
420 | // Keep track of the VgprUB and SgprUB to make merge at join efficient. |
421 | int VgprUB = -1; |
422 | int SgprUB = -1; |
423 | unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; |
424 | // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt |
425 | // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. |
426 | unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; |
427 | // Bitmask of the VmemTypes of VMEM instructions that might have a pending |
428 | // write to each vgpr. |
429 | unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; |
430 | // Store representative LDS DMA operations. The only useful info here is |
431 | // alias info. One store is kept per unique AAInfo. |
432 | SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; |
433 | }; |
434 | |
435 | // This abstracts the logic for generating and updating S_WAIT* instructions |
436 | // away from the analysis that determines where they are needed. This was |
437 | // done because the set of counters and instructions for waiting on them |
438 | // underwent a major shift with gfx12, sufficiently so that having this |
439 | // abstraction allows the main analysis logic to be simpler than it would |
440 | // otherwise have had to become. |
441 | class WaitcntGenerator { |
442 | protected: |
443 | const GCNSubtarget *ST = nullptr; |
444 | const SIInstrInfo *TII = nullptr; |
445 | AMDGPU::IsaVersion IV; |
446 | InstCounterType MaxCounter; |
447 | |
448 | public: |
449 | WaitcntGenerator() {} |
450 | WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter) |
451 | : ST(ST), TII(ST->getInstrInfo()), |
452 | IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter) {} |
453 | |
454 | // Edits an existing sequence of wait count instructions according |
455 | // to an incoming Waitcnt value, which is itself updated to reflect |
456 | // any new wait count instructions which may need to be generated by |
457 | // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits |
458 | // were made. |
459 | // |
460 | // This editing will usually be merely updated operands, but it may also |
461 | // delete instructions if the incoming Wait value indicates they are not |
462 | // needed. It may also remove existing instructions for which a wait |
463 | // is needed if it can be determined that it is better to generate new |
464 | // instructions later, as can happen on gfx12. |
465 | virtual bool |
466 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
467 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
468 | MachineBasicBlock::instr_iterator It) const = 0; |
469 | |
470 | // Transform a soft waitcnt into a normal one. |
471 | bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; |
472 | |
473 | // Generates new wait count instructions according to the value of |
474 | // Wait, returning true if any new instructions were created. |
475 | virtual bool createNewWaitcnt(MachineBasicBlock &Block, |
476 | MachineBasicBlock::instr_iterator It, |
477 | AMDGPU::Waitcnt Wait) = 0; |
478 | |
479 | // Returns an array of bit masks which can be used to map values in |
480 | // WaitEventType to corresponding counter values in InstCounterType. |
481 | virtual const unsigned *getWaitEventMask() const = 0; |
482 | |
483 | // Returns a new waitcnt with all counters except VScnt set to 0. If |
484 | // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. |
485 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; |
486 | |
487 | virtual ~WaitcntGenerator() = default; |
488 | |
489 | // Create a mask value from the initializer list of wait event types. |
490 | static constexpr unsigned |
491 | eventMask(std::initializer_list<WaitEventType> Events) { |
492 | unsigned Mask = 0; |
493 | for (auto &E : Events) |
494 | Mask |= 1 << E; |
495 | |
496 | return Mask; |
497 | } |
498 | }; |
499 | |
500 | class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { |
501 | public: |
502 | WaitcntGeneratorPreGFX12() {} |
503 | WaitcntGeneratorPreGFX12(const GCNSubtarget *ST) |
504 | : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {} |
505 | |
506 | bool |
507 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
508 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
509 | MachineBasicBlock::instr_iterator It) const override; |
510 | |
511 | bool createNewWaitcnt(MachineBasicBlock &Block, |
512 | MachineBasicBlock::instr_iterator It, |
513 | AMDGPU::Waitcnt Wait) override; |
514 | |
515 | const unsigned *getWaitEventMask() const override { |
516 | assert(ST); |
517 | |
518 | static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { |
519 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, |
520 | VMEM_BVH_READ_ACCESS}), |
521 | eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), |
522 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
523 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
524 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
525 | 0, |
526 | 0, |
527 | 0}; |
528 | |
529 | return WaitEventMaskForInstPreGFX12; |
530 | } |
531 | |
532 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
533 | }; |
534 | |
535 | class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { |
536 | public: |
537 | WaitcntGeneratorGFX12Plus() {} |
538 | WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter) |
539 | : WaitcntGenerator(ST, MaxCounter) {} |
540 | |
541 | bool |
542 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
543 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
544 | MachineBasicBlock::instr_iterator It) const override; |
545 | |
546 | bool createNewWaitcnt(MachineBasicBlock &Block, |
547 | MachineBasicBlock::instr_iterator It, |
548 | AMDGPU::Waitcnt Wait) override; |
549 | |
550 | const unsigned *getWaitEventMask() const override { |
551 | assert(ST); |
552 | |
553 | static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { |
554 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}), |
555 | eventMask(Events: {LDS_ACCESS, GDS_ACCESS}), |
556 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
557 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
558 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
559 | eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}), |
560 | eventMask(Events: {VMEM_BVH_READ_ACCESS}), |
561 | eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE})}; |
562 | |
563 | return WaitEventMaskForInstGFX12Plus; |
564 | } |
565 | |
566 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
567 | }; |
568 | |
569 | class SIInsertWaitcnts : public MachineFunctionPass { |
570 | private: |
571 | const GCNSubtarget *ST = nullptr; |
572 | const SIInstrInfo *TII = nullptr; |
573 | const SIRegisterInfo *TRI = nullptr; |
574 | const MachineRegisterInfo *MRI = nullptr; |
575 | |
576 | DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; |
577 | DenseMap<MachineBasicBlock *, bool> ; |
578 | MachineLoopInfo *MLI; |
579 | MachinePostDominatorTree *PDT; |
580 | AliasAnalysis *AA = nullptr; |
581 | |
582 | struct BlockInfo { |
583 | std::unique_ptr<WaitcntBrackets> Incoming; |
584 | bool Dirty = true; |
585 | }; |
586 | |
587 | InstCounterType SmemAccessCounter; |
588 | |
589 | MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; |
590 | |
591 | // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 |
592 | // because of amdgpu-waitcnt-forcezero flag |
593 | bool ForceEmitZeroWaitcnts; |
594 | bool ForceEmitWaitcnt[NUM_INST_CNTS]; |
595 | |
596 | bool OptNone; |
597 | |
598 | // In any given run of this pass, WCG will point to one of these two |
599 | // generator objects, which must have been re-initialised before use |
600 | // from a value made using a subtarget constructor. |
601 | WaitcntGeneratorPreGFX12 WCGPreGFX12; |
602 | WaitcntGeneratorGFX12Plus WCGGFX12Plus; |
603 | |
604 | WaitcntGenerator *WCG = nullptr; |
605 | |
606 | // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS |
607 | // message. |
608 | DenseSet<MachineInstr *> ReleaseVGPRInsts; |
609 | |
610 | InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; |
611 | |
612 | public: |
613 | static char ID; |
614 | |
615 | SIInsertWaitcnts() : MachineFunctionPass(ID) { |
616 | (void)ForceExpCounter; |
617 | (void)ForceLgkmCounter; |
618 | (void)ForceVMCounter; |
619 | } |
620 | |
621 | bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); |
622 | bool isPreheaderToFlush(MachineBasicBlock &MBB, |
623 | WaitcntBrackets &ScoreBrackets); |
624 | bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; |
625 | bool runOnMachineFunction(MachineFunction &MF) override; |
626 | |
627 | StringRef getPassName() const override { |
628 | return "SI insert wait instructions" ; |
629 | } |
630 | |
631 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
632 | AU.setPreservesCFG(); |
633 | AU.addRequired<MachineLoopInfo>(); |
634 | AU.addRequired<MachinePostDominatorTree>(); |
635 | AU.addUsedIfAvailable<AAResultsWrapperPass>(); |
636 | AU.addPreserved<AAResultsWrapperPass>(); |
637 | MachineFunctionPass::getAnalysisUsage(AU); |
638 | } |
639 | |
640 | bool isForceEmitWaitcnt() const { |
641 | for (auto T : inst_counter_types()) |
642 | if (ForceEmitWaitcnt[T]) |
643 | return true; |
644 | return false; |
645 | } |
646 | |
647 | void setForceEmitWaitcnt() { |
648 | // For non-debug builds, ForceEmitWaitcnt has been initialized to false; |
649 | // For debug builds, get the debug counter info and adjust if need be |
650 | #ifndef NDEBUG |
651 | if (DebugCounter::isCounterSet(ID: ForceExpCounter) && |
652 | DebugCounter::shouldExecute(CounterName: ForceExpCounter)) { |
653 | ForceEmitWaitcnt[EXP_CNT] = true; |
654 | } else { |
655 | ForceEmitWaitcnt[EXP_CNT] = false; |
656 | } |
657 | |
658 | if (DebugCounter::isCounterSet(ID: ForceLgkmCounter) && |
659 | DebugCounter::shouldExecute(CounterName: ForceLgkmCounter)) { |
660 | ForceEmitWaitcnt[DS_CNT] = true; |
661 | ForceEmitWaitcnt[KM_CNT] = true; |
662 | } else { |
663 | ForceEmitWaitcnt[DS_CNT] = false; |
664 | ForceEmitWaitcnt[KM_CNT] = false; |
665 | } |
666 | |
667 | if (DebugCounter::isCounterSet(ID: ForceVMCounter) && |
668 | DebugCounter::shouldExecute(CounterName: ForceVMCounter)) { |
669 | ForceEmitWaitcnt[LOAD_CNT] = true; |
670 | ForceEmitWaitcnt[SAMPLE_CNT] = true; |
671 | ForceEmitWaitcnt[BVH_CNT] = true; |
672 | } else { |
673 | ForceEmitWaitcnt[LOAD_CNT] = false; |
674 | ForceEmitWaitcnt[SAMPLE_CNT] = false; |
675 | ForceEmitWaitcnt[BVH_CNT] = false; |
676 | } |
677 | #endif // NDEBUG |
678 | } |
679 | |
680 | // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or |
681 | // FLAT instruction. |
682 | WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { |
683 | // Maps VMEM access types to their corresponding WaitEventType. |
684 | static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { |
685 | VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; |
686 | |
687 | assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); |
688 | // LDS DMA loads are also stores, but on the LDS side. On the VMEM side |
689 | // these should use VM_CNT. |
690 | if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst)) |
691 | return VMEM_ACCESS; |
692 | if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)) { |
693 | // FLAT and SCRATCH instructions may access scratch. Other VMEM |
694 | // instructions do not. |
695 | if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst)) |
696 | return SCRATCH_WRITE_ACCESS; |
697 | return VMEM_WRITE_ACCESS; |
698 | } |
699 | if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst)) |
700 | return VMEM_READ_ACCESS; |
701 | return VmemReadMapping[getVmemType(Inst)]; |
702 | } |
703 | |
704 | bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; |
705 | bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; |
706 | bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; |
707 | bool generateWaitcntInstBefore(MachineInstr &MI, |
708 | WaitcntBrackets &ScoreBrackets, |
709 | MachineInstr *OldWaitcntInstr, |
710 | bool FlushVmCnt); |
711 | bool generateWaitcnt(AMDGPU::Waitcnt Wait, |
712 | MachineBasicBlock::instr_iterator It, |
713 | MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, |
714 | MachineInstr *OldWaitcntInstr); |
715 | void updateEventWaitcntAfter(MachineInstr &Inst, |
716 | WaitcntBrackets *ScoreBrackets); |
717 | bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, |
718 | WaitcntBrackets &ScoreBrackets); |
719 | }; |
720 | |
721 | } // end anonymous namespace |
722 | |
723 | RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, |
724 | const MachineRegisterInfo *MRI, |
725 | const SIRegisterInfo *TRI, |
726 | unsigned OpNo) const { |
727 | const MachineOperand &Op = MI->getOperand(i: OpNo); |
728 | if (!TRI->isInAllocatableClass(Op.getReg())) |
729 | return {-1, -1}; |
730 | |
731 | // A use via a PW operand does not need a waitcnt. |
732 | // A partial write is not a WAW. |
733 | assert(!Op.getSubReg() || !Op.isUndef()); |
734 | |
735 | RegInterval Result; |
736 | |
737 | unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & |
738 | AMDGPU::HWEncoding::REG_IDX_MASK; |
739 | |
740 | if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) { |
741 | assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); |
742 | Result.first = Reg - Encoding.VGPR0; |
743 | if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg())) |
744 | Result.first += AGPR_OFFSET; |
745 | assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); |
746 | } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) { |
747 | assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); |
748 | Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; |
749 | assert(Result.first >= NUM_ALL_VGPRS && |
750 | Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); |
751 | } |
752 | // TODO: Handle TTMP |
753 | // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... |
754 | else |
755 | return {-1, -1}; |
756 | |
757 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); |
758 | unsigned Size = TRI->getRegSizeInBits(*RC); |
759 | Result.second = Result.first + ((Size + 16) / 32); |
760 | |
761 | return Result; |
762 | } |
763 | |
764 | void WaitcntBrackets::setExpScore(const MachineInstr *MI, |
765 | const SIInstrInfo *TII, |
766 | const SIRegisterInfo *TRI, |
767 | const MachineRegisterInfo *MRI, unsigned OpNo, |
768 | unsigned Val) { |
769 | RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); |
770 | assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); |
771 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
772 | setRegScore(GprNo: RegNo, T: EXP_CNT, Val); |
773 | } |
774 | } |
775 | |
/// Record that \p Inst raised wait event \p E: advance the upper bound of the
/// counter associated with the event, and attach the new score to every
/// register whose contents only become valid once the event completes.
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error(reason: "InsertWaitcnt score wraparound" );
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents |= 1 << E;
  setScoreUB(T, Val: CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(MI: Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export.)
      if (AddrOpIdx != -1) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: AddrOpIdx, Val: CurrScore);
      }

      if (Inst.mayStore()) {
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
          setExpScore(
              &Inst, TII, TRI, MRI,
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
              CurrScore);
        }
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Returning DS atomics (other than the excluded opcodes above)
        // protect every vector-register use operand.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(i: I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
            setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      }
    } else if (TII->isMUBUF(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isLDSDIR(MI: Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setExpScore(
          &Inst, TII, TRI, MRI,
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
          CurrScore);
    } else {
      if (TII->isEXP(MI: Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(i: I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
            setRegScore(
                GprNo: TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                T: EXP_CNT, Val: CurrScore);
          }
        }
      }
      // Generic fallback: score every vector-register use operand.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(i: I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(MRI: *MRI, Reg: MO.getReg())) {
          setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
    for (int RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else /* all counters other than EXP_CNT, e.g. LOAD/DS/STORE/KM_CNT */ {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(i: I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, OpNo: I);
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(GprNo: RegNo, T, Val: CurrScore);
      }
    }
    if (Inst.mayStore() &&
        (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object. Do not attempt to use one of
        // the limited LDSDMAStores for something we will not be able to use
        // anyway.
        if (!AAI || !AAI.Scope)
          break;
        // Reuse an existing slot whose recorded store carries matching AA
        // info; Slot remains 0 if no match is found.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
          break;
        LDSDMAStores.push_back(Elt: &Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, Val: CurrScore);
      // NOTE(review): slot 0 appears to act as the catch-all LDS score; it is
      // bumped as well for stores placed in a dedicated slot.
      if (Slot)
        setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, Val: CurrScore);
    }
  }
}
970 | |
/// Dump the score range of every counter and, when events are outstanding,
/// the relative score of each tracked register, to \p OS (debugging aid).
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types(MaxCounter)) {
    unsigned SR = getScoreRange(T);

    // Counter names differ between targets with and without extended wait
    // counts.
    switch (T) {
    case LOAD_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM" ) << "_CNT("
         << SR << "): " ;
      break;
    case DS_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM" ) << "_CNT("
         << SR << "): " ;
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "): " ;
      break;
    case STORE_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS" ) << "_CNT("
         << SR << "): " ;
      break;
    case SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "): " ;
      break;
    case BVH_CNT:
      OS << " BVH_CNT(" << SR << "): " ;
      break;
    case KM_CNT:
      OS << " KM_CNT(" << SR << "): " ;
      break;
    default:
      OS << " UNKNOWN(" << SR << "): " ;
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(GprNo: J, T);
        // Scores at or below the lower bound have already been waited for.
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " " ;
        } else {
          OS << RelScore << ":ds " ;
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == SmemAccessCounter) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " " ;
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
1036 | |
/// Simplify the waitcnt, in the sense of removing redundant counts. Each
/// individual count in \p Wait is reset to ~0u ("no wait") when the score
/// bracket shows the wait cannot have any effect; see the per-counter
/// overload for the criterion. (Note: this returns void; whether any wait
/// remains needed must be queried on \p Wait itself afterwards.)
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
  simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
  simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
  simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
  simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
  simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
  simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
}
1048 | |
1049 | void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, |
1050 | unsigned &Count) const { |
1051 | // The number of outstanding events for this type, T, can be calculated |
1052 | // as (UB - LB). If the current Count is greater than or equal to the number |
1053 | // of outstanding events, then the wait for this counter is redundant. |
1054 | if (Count >= getScoreRange(T)) |
1055 | Count = ~0u; |
1056 | } |
1057 | |
/// Compute the wait (if any) required on counter \p T before the register
/// slot \p RegNo may be accessed, and merge it into \p Wait.
void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                    AMDGPU::Waitcnt &Wait) const {
  unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, Count: 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, Count: 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1);
      addWait(Wait, T, Count: NeededWait);
    }
  }
}
1086 | |
1087 | void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { |
1088 | applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt); |
1089 | applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt); |
1090 | applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1091 | applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt); |
1092 | applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt); |
1093 | applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt); |
1094 | applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt); |
1095 | } |
1096 | |
/// Update the bracket of counter \p T to reflect that a wait for \p Count
/// has been executed: raise the lower bound past every event that must now
/// have completed.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  // Waiting for >= UB events cannot tighten the bracket (UB - Count <= LB).
  if (Count >= UB)
    return;
  if (Count != 0) {
    // A non-zero wait gives no per-event guarantee if this counter can
    // decrement out of order, so leave the bounds untouched in that case.
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
  } else {
    // A wait for zero retires everything outstanding on this counter.
    setScoreLB(T, Val: UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
1110 | |
1111 | // Where there are multiple types of event in the bracket of a counter, |
1112 | // the decrement may go out of order. |
1113 | bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { |
1114 | // Scalar memory read always can go out of order. |
1115 | if (T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) |
1116 | return true; |
1117 | return hasMixedPendingEvents(T); |
1118 | } |
1119 | |
// Legacy pass-manager registration, including the analyses this pass
// depends on.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false,
                    false)

// The address of ID serves as the unique identifier of the pass.
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1130 | |
1131 | FunctionPass *llvm::createSIInsertWaitcntsPass() { |
1132 | return new SIInsertWaitcnts(); |
1133 | } |
1134 | |
1135 | static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, |
1136 | unsigned NewEnc) { |
1137 | int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: OpName); |
1138 | assert(OpIdx >= 0); |
1139 | |
1140 | MachineOperand &MO = MI.getOperand(i: OpIdx); |
1141 | |
1142 | if (NewEnc == MO.getImm()) |
1143 | return false; |
1144 | |
1145 | MO.setImm(NewEnc); |
1146 | return true; |
1147 | } |
1148 | |
1149 | /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, |
1150 | /// and if so, which counter it is waiting on. |
1151 | static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { |
1152 | switch (Opcode) { |
1153 | case AMDGPU::S_WAIT_LOADCNT: |
1154 | return LOAD_CNT; |
1155 | case AMDGPU::S_WAIT_EXPCNT: |
1156 | return EXP_CNT; |
1157 | case AMDGPU::S_WAIT_STORECNT: |
1158 | return STORE_CNT; |
1159 | case AMDGPU::S_WAIT_SAMPLECNT: |
1160 | return SAMPLE_CNT; |
1161 | case AMDGPU::S_WAIT_BVHCNT: |
1162 | return BVH_CNT; |
1163 | case AMDGPU::S_WAIT_DSCNT: |
1164 | return DS_CNT; |
1165 | case AMDGPU::S_WAIT_KMCNT: |
1166 | return KM_CNT; |
1167 | default: |
1168 | return {}; |
1169 | } |
1170 | } |
1171 | |
1172 | bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { |
1173 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode()); |
1174 | if (Opcode == Waitcnt->getOpcode()) |
1175 | return false; |
1176 | |
1177 | Waitcnt->setDesc(TII->get(Opcode)); |
1178 | return true; |
1179 | } |
1180 | |
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
/// \returns true if any instruction was modified or erased.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // At most one instruction of each kind survives; all others are merged
  // into it and erased.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    if (II.isMetaInstruction())
      continue;

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool IsSoft = Opcode != II.getOpcode();

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(i: 0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
      Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    // Rewrite the surviving S_WAITCNT with the merged counts and record
    // those waits in the score brackets.
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);

    ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
    // These counts are now covered by the preexisting instruction, so the
    // caller must not emit them again.
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1277 | |
/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions before \p It for
/// any required counters in \p Wait.
/// \returns true if any instruction was inserted.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    // NOTE(review): the register operand is SGPR_NULL (marked undef), which
    // appears to select the immediate-only form — confirm against ISA docs.
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1318 | |
1319 | AMDGPU::Waitcnt |
1320 | WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1321 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); |
1322 | } |
1323 | |
1324 | AMDGPU::Waitcnt |
1325 | WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1326 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); |
1327 | } |
1328 | |
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
/// \returns true if any instruction was modified, inserted or erased.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // Surviving wait instructions, at most one per kind: the two combined
  // forms plus one slot per single-counter instruction.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    if (II.isMetaInstruction())
      continue;

    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool IsSoft = Opcode != II.getOpcode();

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else {
      // Must be one of the single-counter S_WAIT_*CNT instructions.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
      addWait(Wait, T: CT.value(), Count: OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.DsCnt != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.
    SmallVector<MachineInstr **, 2> WaitsToErase;

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Rewrite or remove the surviving single-counter instructions.
  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, T: CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
      setNoWait(Wait, T: CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
1510 | |
/// Generate S_WAIT_*CNT instructions before \p It for any required counters
/// in \p Wait, preferring the combined LOADCNT_DSCNT / STORECNT_DSCNT forms
/// when two counters can be covered by one instruction.
/// \returns true if any instruction was inserted.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      // Both counters are now covered by the combined instruction.
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, T: CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1574 | |
1575 | static bool readsVCCZ(const MachineInstr &MI) { |
1576 | unsigned Opc = MI.getOpcode(); |
1577 | return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && |
1578 | !MI.getOperand(1).isUndef(); |
1579 | } |
1580 | |
1581 | /// \returns true if the callee inserts an s_waitcnt 0 on function entry. |
1582 | static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { |
1583 | // Currently all conventions wait, but this may not always be the case. |
1584 | // |
1585 | // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make |
1586 | // senses to omit the wait and do it in the caller. |
1587 | return true; |
1588 | } |
1589 | |
1590 | /// \returns true if the callee is expected to wait for any outstanding waits |
1591 | /// before returning. |
1592 | static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { |
1593 | return true; |
1594 | } |
1595 | |
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
///
/// \param MI               instruction the wait is inserted before.
/// \param ScoreBrackets    current counter-score state, updated by callee.
/// \param OldWaitcntInstr  first preexisting wait instruction before \p MI
///                         (may be null) to merge with.
/// \returns true if the function was modified.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                WaitcntBrackets &ScoreBrackets,
                                                MachineInstr *OldWaitcntInstr,
                                                bool FlushVmCnt) {
  setForceEmitWaitcnt();

  // Meta instructions (e.g. debug values) never access memory.
  if (MI.isMetaInstruction())
    return false;

  // Accumulates the per-counter waits required before MI; ~0u per field means
  // "no wait needed".
  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores. In this case it can be useful to send a message to explicitly
  // release all VGPRs before the stores have completed, but it is only safe to
  // do this if:
  // * there are no outstanding scratch stores
  // * we are not in Dynamic VGPR mode
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
        ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 &&
        !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))
      ReleaseVGPRInsts.insert(V: &MI);
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
            AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
    Wait.LoadCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
                               ScoreBrackets->getScoreUB(DS_CNT));
            // LDS may have to wait for VMcnt after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
                                 ScoreBrackets->getScoreUB(LOAD_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
              ScoreBrackets->getScoreUB(DS_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
              ScoreBrackets->getScoreUB(LOAD_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(i: CallAddrOpIdx).isReg()) {
        // Wait for any outstanding SMEM load that feeds the call target
        // address register(s).
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);

        // Likewise protect the return-address destination register(s)
        // against a WAW hazard with in-flight SMEM loads.
        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        // A store to an address previously read by an SMEM load must wait for
        // that load (RAW through memory on the scalar counter).
        if (Memop->isStore() && SLoadAddresses.count(Val: Ptr)) {
          addWait(Wait, T: SmemAccessCounter, Count: 0);
          if (PDT->dominates(A: MI.getParent(), B: SLoadAddresses.find(Val: Ptr)->second))
            SLoadAddresses.erase(Val: Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        bool FoundAliasingStore = false;
        // Only objects with alias scope info were added to LDSDMAScopes array.
        // In the absense of the scope info we will not be able to disambiguate
        // aliasing here. There is no need to try searching for a corresponding
        // store slot. This is conservatively correct because in that case we
        // will produce a wait using the first (general) LDS DMA wait slot which
        // will wait on all of them anyway.
        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) {
              FoundAliasingStore = true;
              // Slot 0 is the general LDS DMA slot; aliasing stores use
              // per-store slots starting at RegNo + 1.
              ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait);
            }
          }
        }
        if (!FoundAliasingStore)
          ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(i: I);
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);

        const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they're guaranteed to write their
            // results in order anyway.
            if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(GprNo: RegNo,
                                                       V: getVmemType(Inst: MI))) {
              ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(T: SAMPLE_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(T: BVH_CNT, RegNo, Wait);
              ScoreBrackets.clearVgprVmemTypes(GprNo: RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
            }
            ScoreBrackets.determineWait(T: DS_CNT, RegNo, Wait);
          } else {
            // Scalar registers only interact with the SMEM access counter.
            ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
          }
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need to ensure the subtarget is capable of backing off barrier
  // instructions in case there are any outstanding memory operations that may
  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
      Wait.DsCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // Debugging aids: optionally force waits on all (or individual) counters.
  if (ForceEmitZeroWaitcnts)
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;

  // Caller asked to flush the VMEM-related counters (e.g. loop preheader).
  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
      Wait.BvhCnt = 0;
  }

  return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
1901 | |
/// Materialize the waits in \p Wait before \p It in \p Block: first try to
/// fold them into preexisting waitcnt instructions (\p OldWaitcntInstr),
/// then fold any remaining ExpCnt into a following VINTERP's waitexp field,
/// and finally emit new wait instructions for whatever is left.
/// \returns true if the block was modified.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(MI: *It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the instruction's own waitexp; never loosen it.
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u; // Handled by the VINTERP; no standalone wait needed.

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // Emit explicit wait instructions for any counters still outstanding.
  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
1939 | |
1940 | // This is a flat memory operation. Check to see if it has memory tokens other |
1941 | // than LDS. Other address spaces supported by flat memory operations involve |
1942 | // global memory. |
1943 | bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { |
1944 | assert(TII->isFLAT(MI)); |
1945 | |
1946 | // All flat instructions use the VMEM counter. |
1947 | assert(TII->usesVM_CNT(MI)); |
1948 | |
1949 | // If there are no memory operands then conservatively assume the flat |
1950 | // operation may access VMEM. |
1951 | if (MI.memoperands_empty()) |
1952 | return true; |
1953 | |
1954 | // See if any memory operand specifies an address space that involves VMEM. |
1955 | // Flat operations only supported FLAT, LOCAL (LDS), or address spaces |
1956 | // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION |
1957 | // (GDS) address space is not supported by flat operations. Therefore, simply |
1958 | // return true unless only the LDS address space is found. |
1959 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1960 | unsigned AS = Memop->getAddrSpace(); |
1961 | assert(AS != AMDGPUAS::REGION_ADDRESS); |
1962 | if (AS != AMDGPUAS::LOCAL_ADDRESS) |
1963 | return true; |
1964 | } |
1965 | |
1966 | return false; |
1967 | } |
1968 | |
1969 | // This is a flat memory operation. Check to see if it has memory tokens for |
1970 | // either LDS or FLAT. |
1971 | bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { |
1972 | assert(TII->isFLAT(MI)); |
1973 | |
1974 | // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. |
1975 | if (!TII->usesLGKM_CNT(MI)) |
1976 | return false; |
1977 | |
1978 | // If in tgsplit mode then there can be no use of LDS. |
1979 | if (ST->isTgSplitEnabled()) |
1980 | return false; |
1981 | |
1982 | // If there are no memory operands then conservatively assume the flat |
1983 | // operation may access LDS. |
1984 | if (MI.memoperands_empty()) |
1985 | return true; |
1986 | |
1987 | // See if any memory operand specifies an address space that involves LDS. |
1988 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1989 | unsigned AS = Memop->getAddrSpace(); |
1990 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) |
1991 | return true; |
1992 | } |
1993 | |
1994 | return false; |
1995 | } |
1996 | |
// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
// \returns true if \p MI may access the scratch (private) address space.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // SCRATCH instructions always access scratch.
  if (TII->isFLATScratch(MI))
    return true;

  // GLOBAL instructions never access scratch.
  if (TII->isFLATGlobal(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  // A FLAT-addressed operand may also resolve to scratch at runtime.
  return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  });
}
2022 | |
2023 | static bool isCacheInvOrWBInst(MachineInstr &Inst) { |
2024 | auto Opc = Inst.getOpcode(); |
2025 | return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || |
2026 | Opc == AMDGPU::GLOBAL_WBINV; |
2027 | } |
2028 | |
/// Record the counter events produced by \p Inst into \p ScoreBrackets,
/// classifying the instruction (DS/GDS, FLAT, VMEM, SMEM, call, export, ...)
/// and bumping the corresponding counter upper bounds and operand scores.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      // GDS also locks GPRs until the export-like access is granted.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(MI: Inst)) {
    // TODO: Track this properly.
    if (isCacheInvOrWBInst(Inst))
      return;

    assert(Inst.mayLoadOrStore());

    // Number of address spaces (VMEM / LDS) the flat access may touch.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(MI: Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(MI: Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(MI: Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
                                 Inst);

    // On some subtargets VMEM writes/atomics also tie up EXP.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(MI: Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(MI: Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(MI: Inst)) {
    // VINTERP carries its own waitexp; treat it as an applied EXP wait.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
  } else if (SIInstrInfo::isEXP(MI: Inst)) {
    // Classify the export by its target: parameter, position, or other
    // (GPR-locking) export.
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      // These scalar operations complete via the SMEM counter.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
      break;
    }
  }
}
2126 | |
2127 | bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, |
2128 | unsigned OtherScore) { |
2129 | unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; |
2130 | unsigned OtherShifted = |
2131 | OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; |
2132 | Score = std::max(a: MyShifted, b: OtherShifted); |
2133 | return OtherShifted > MyShifted; |
2134 | } |
2135 | |
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Track the widest register ranges seen on either path.
  VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
  SgprUB = std::max(a: SgprUB, b: Other.SgprUB);

  for (auto T : inst_counter_types(MaxCounter)) {
    // Merge event flags for this counter
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    // Any event pending only on the other path tightens our state.
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter. The merged bracket keeps our lower
    // bound and extends the upper bound to cover the larger pending range.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error(reason: "waitcnt score overflow" );

    // Shift amounts used by mergeScore to rebase individual scores into the
    // merged bracket's coordinates.
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);

    // SGPR scores are only tracked for the SMEM access counter.
    if (T == SmemAccessCounter) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, Score&: SgprScores[J], OtherScore: Other.SgprScores[J]);
    }
  }

  // Union the per-VGPR VMEM type masks; any newly-set bit tightens state.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
2189 | |
/// \returns true if \p Inst is any form of wait instruction this pass
/// manages: legacy S_WAITCNT, S_WAITCNT_VSCNT with a null SGPR, the GFX12+
/// combined waits, or any single extended-counter wait. Soft waitcnt
/// opcodes are normalized first so they are recognized too.
static bool isWaitInstr(MachineInstr &Inst) {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
  return Opcode == AMDGPU::S_WAITCNT ||
         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         counterTypeForInstr(Opcode).has_value();
}
2199 | |
2200 | // Generate s_waitcnt instructions where needed. |
2201 | bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, |
2202 | MachineBasicBlock &Block, |
2203 | WaitcntBrackets &ScoreBrackets) { |
2204 | bool Modified = false; |
2205 | |
2206 | LLVM_DEBUG({ |
2207 | dbgs() << "*** Block" << Block.getNumber() << " ***" ; |
2208 | ScoreBrackets.dump(); |
2209 | }); |
2210 | |
2211 | // Track the correctness of vccz through this basic block. There are two |
2212 | // reasons why it might be incorrect; see ST->hasReadVCCZBug() and |
2213 | // ST->partialVCCWritesUpdateVCCZ(). |
2214 | bool VCCZCorrect = true; |
2215 | if (ST->hasReadVCCZBug()) { |
2216 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2217 | // to vcc and then issued an smem load. |
2218 | VCCZCorrect = false; |
2219 | } else if (!ST->partialVCCWritesUpdateVCCZ()) { |
2220 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2221 | // to vcc_lo or vcc_hi. |
2222 | VCCZCorrect = false; |
2223 | } |
2224 | |
2225 | // Walk over the instructions. |
2226 | MachineInstr *OldWaitcntInstr = nullptr; |
2227 | |
2228 | for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), |
2229 | E = Block.instr_end(); |
2230 | Iter != E;) { |
2231 | MachineInstr &Inst = *Iter; |
2232 | |
2233 | // Track pre-existing waitcnts that were added in earlier iterations or by |
2234 | // the memory legalizer. |
2235 | if (isWaitInstr(Inst)) { |
2236 | if (!OldWaitcntInstr) |
2237 | OldWaitcntInstr = &Inst; |
2238 | ++Iter; |
2239 | continue; |
2240 | } |
2241 | |
2242 | bool FlushVmCnt = Block.getFirstTerminator() == Inst && |
2243 | isPreheaderToFlush(MBB&: Block, ScoreBrackets); |
2244 | |
2245 | // Generate an s_waitcnt instruction to be placed before Inst, if needed. |
2246 | Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr, |
2247 | FlushVmCnt); |
2248 | OldWaitcntInstr = nullptr; |
2249 | |
2250 | // Restore vccz if it's not known to be correct already. |
2251 | bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst); |
2252 | |
2253 | // Don't examine operands unless we need to track vccz correctness. |
2254 | if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { |
2255 | if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) || |
2256 | Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) { |
2257 | // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. |
2258 | if (!ST->partialVCCWritesUpdateVCCZ()) |
2259 | VCCZCorrect = false; |
2260 | } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) { |
2261 | // There is a hardware bug on CI/SI where SMRD instruction may corrupt |
2262 | // vccz bit, so when we detect that an instruction may read from a |
2263 | // corrupt vccz bit, we need to: |
2264 | // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD |
2265 | // operations to complete. |
2266 | // 2. Restore the correct value of vccz by writing the current value |
2267 | // of vcc back to vcc. |
2268 | if (ST->hasReadVCCZBug() && |
2269 | ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) { |
2270 | // Writes to vcc while there's an outstanding smem read may get |
2271 | // clobbered as soon as any read completes. |
2272 | VCCZCorrect = false; |
2273 | } else { |
2274 | // Writes to vcc will fix any incorrect value in vccz. |
2275 | VCCZCorrect = true; |
2276 | } |
2277 | } |
2278 | } |
2279 | |
2280 | if (TII->isSMRD(MI: Inst)) { |
2281 | for (const MachineMemOperand *Memop : Inst.memoperands()) { |
2282 | // No need to handle invariant loads when avoiding WAR conflicts, as |
2283 | // there cannot be a vector store to the same memory location. |
2284 | if (!Memop->isInvariant()) { |
2285 | const Value *Ptr = Memop->getValue(); |
2286 | SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent())); |
2287 | } |
2288 | } |
2289 | if (ST->hasReadVCCZBug()) { |
2290 | // This smem read could complete and clobber vccz at any time. |
2291 | VCCZCorrect = false; |
2292 | } |
2293 | } |
2294 | |
2295 | updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets); |
2296 | |
2297 | #if 0 // TODO: implement resource type check controlled by options with ub = LB. |
2298 | // If this instruction generates a S_SETVSKIP because it is an |
2299 | // indexed resource, and we are on Tahiti, then it will also force |
2300 | // an S_WAITCNT vmcnt(0) |
2301 | if (RequireCheckResourceType(Inst, context)) { |
2302 | // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. |
2303 | ScoreBrackets->setScoreLB(LOAD_CNT, |
2304 | ScoreBrackets->getScoreUB(LOAD_CNT)); |
2305 | } |
2306 | #endif |
2307 | |
2308 | if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { |
2309 | AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( |
2310 | IncludeVSCnt: Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)); |
2311 | ScoreBrackets.simplifyWaitcnt(Wait); |
2312 | Modified |= generateWaitcnt(Wait, It: std::next(x: Inst.getIterator()), Block, |
2313 | ScoreBrackets, /*OldWaitcntInstr=*/nullptr); |
2314 | } |
2315 | |
2316 | LLVM_DEBUG({ |
2317 | Inst.print(dbgs()); |
2318 | ScoreBrackets.dump(); |
2319 | }); |
2320 | |
2321 | // TODO: Remove this work-around after fixing the scheduler and enable the |
2322 | // assert above. |
2323 | if (RestoreVCCZ) { |
2324 | // Restore the vccz bit. Any time a value is written to vcc, the vcc |
2325 | // bit is updated, so we can restore the bit by reading the value of |
2326 | // vcc and then writing it back to the register. |
2327 | BuildMI(Block, Inst, Inst.getDebugLoc(), |
2328 | TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), |
2329 | TRI->getVCC()) |
2330 | .addReg(TRI->getVCC()); |
2331 | VCCZCorrect = true; |
2332 | Modified = true; |
2333 | } |
2334 | |
2335 | ++Iter; |
2336 | } |
2337 | |
2338 | // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if |
2339 | // needed. |
2340 | AMDGPU::Waitcnt Wait; |
2341 | if (Block.getFirstTerminator() == Block.end() && |
2342 | isPreheaderToFlush(MBB&: Block, ScoreBrackets)) { |
2343 | if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT)) |
2344 | Wait.LoadCnt = 0; |
2345 | if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT)) |
2346 | Wait.SampleCnt = 0; |
2347 | if (ScoreBrackets.hasPendingEvent(T: BVH_CNT)) |
2348 | Wait.BvhCnt = 0; |
2349 | } |
2350 | |
2351 | // Combine or remove any redundant waitcnts at the end of the block. |
2352 | Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets, |
2353 | OldWaitcntInstr); |
2354 | |
2355 | return Modified; |
2356 | } |
2357 | |
2358 | // Return true if the given machine basic block is a preheader of a loop in |
2359 | // which we want to flush the vmcnt counter, and false otherwise. |
2360 | bool SIInsertWaitcnts::(MachineBasicBlock &MBB, |
2361 | WaitcntBrackets &ScoreBrackets) { |
2362 | auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false); |
2363 | if (!IsInserted) |
2364 | return Iterator->second; |
2365 | |
2366 | MachineBasicBlock *Succ = MBB.getSingleSuccessor(); |
2367 | if (!Succ) |
2368 | return false; |
2369 | |
2370 | MachineLoop *Loop = MLI->getLoopFor(BB: Succ); |
2371 | if (!Loop) |
2372 | return false; |
2373 | |
2374 | if (Loop->getLoopPreheader() == &MBB && |
2375 | shouldFlushVmCnt(ML: Loop, Brackets&: ScoreBrackets)) { |
2376 | Iterator->second = true; |
2377 | return true; |
2378 | } |
2379 | |
2380 | return false; |
2381 | } |
2382 | |
2383 | bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { |
2384 | return SIInstrInfo::isVMEM(MI) || |
2385 | (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); |
2386 | } |
2387 | |
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // Register-interval numbers used (resp. defined by a VMEM load) inside the
  // loop. A register appearing in both sets means a value is loaded inside
  // the loop and used inside the loop, which invalidates both criteria.
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      // Scan all vector-register operands of the instruction.
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(i: I);
        if (!Op.isReg() || !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(V: RegNo))
              return false;
            VgprUse.insert(V: RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
                    Brackets.getScoreLB(T: LOAD_CNT) ||
                Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
                    Brackets.getScoreLB(T: SAMPLE_CNT) ||
                Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
                    Brackets.getScoreLB(T: BVH_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(V: RegNo))
              return false;
            VgprDef.insert(V: RegNo);
          }
      }
    }
  }
  // Criterion 1: stores only, no vscnt counter on this target.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Criterion 2: loads whose results are unused in the loop (a used in-loop
  // load would have returned false above).
  return HasVMemLoad && UsesVgprLoadedOutside;
}
2454 | |
// Pass entry point: insert the waitcnt instructions needed by MF.
// Runs a reverse-post-order fixpoint over the CFG, propagating score
// brackets into successors until no block's incoming state changes.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();
  // Alias analysis is optional; only use it when the pass manager provides it.
  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());

  // Select the waitcnt generator matching the target's counter scheme:
  // extended (separate) counters on newer targets, the legacy combined
  // counters otherwise.
  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
    WCG = &WCGPreGFX12;
  }

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);

  OptNone = MF.getFunction().hasOptNone() ||
            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;

  // Maximum encodable value for each counter, queried per ISA version. The
  // Loadcnt/Dscnt fields double as Vmcnt/Lgkmcnt on pre-extended targets.
  HardwareLimits Limits = {};
  if (ST->hasExtendedWaitCounts()) {
    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
    Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
  } else {
    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
  }
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
  Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
  Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Hardware encoding ranges for VGPRs and SGPRs, used by the score brackets
  // to map registers to slots.
  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.

    // Skip past PHIs and meta instructions to find the insertion point.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;

    if (ST->hasExtendedWaitCounts()) {
      // Extended counters: the combined load/ds wait plus one zero-wait per
      // remaining counter type.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
      for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      // Legacy targets: a single s_waitcnt 0 covers everything.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    // Seed the entry block with the conservative function-entry state.
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
        args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
        args&: SmemAccessCounter);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert(KV: {MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      // Initialize the working brackets from the block's incoming state, or
      // to a fresh default state if none has been recorded yet. Reuse the
      // existing allocation when possible.
      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(
              args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
              args&: SmemAccessCounter);
        else
          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
                                      WaitEventMaskForInst, SmemAccessCounter);
      }

      Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing state into each successor. The first
        // successor without incoming state can take ownership of Brackets by
        // move; the rest get copies or are merged into their existing state.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Key: Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            // A dirtied block at or before the current RPO position forces
            // another fixpoint iteration.
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
            }
          } else if (SuccBI.Incoming->merge(Other: *Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    // Collect all program-exit blocks and note whether any scalar store
    // exists in the function.
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(Elt: &MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          // Track whether a flush is still pending: a scalar store after a
          // flush requires another flush before termination.
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(MI: *I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    // Some targets need an s_nop before the dealloc message.
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}
2680 | |