1 | //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// Insert wait instructions for memory reads and writes. |
11 | /// |
12 | /// Memory reads and writes are issued asynchronously, so we need to insert |
13 | /// S_WAITCNT instructions when we want to access any of their results or |
14 | /// overwrite any register that's used asynchronously. |
15 | /// |
16 | /// TODO: This pass currently keeps one timeline per hardware counter. A more |
17 | /// finely-grained approach that keeps one timeline per event type could |
18 | /// sometimes get away with generating weaker s_waitcnt instructions. For |
19 | /// example, when both SMEM and LDS are in flight and we need to wait for |
20 | /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient, |
21 | /// but the pass will currently generate a conservative lgkmcnt(0) because |
22 | /// multiple event types are in flight. |
23 | // |
24 | //===----------------------------------------------------------------------===// |
25 | |
26 | #include "AMDGPU.h" |
27 | #include "GCNSubtarget.h" |
28 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
29 | #include "SIMachineFunctionInfo.h" |
30 | #include "Utils/AMDGPUBaseInfo.h" |
31 | #include "llvm/ADT/MapVector.h" |
32 | #include "llvm/ADT/PostOrderIterator.h" |
33 | #include "llvm/ADT/Sequence.h" |
34 | #include "llvm/Analysis/AliasAnalysis.h" |
35 | #include "llvm/CodeGen/MachineLoopInfo.h" |
36 | #include "llvm/CodeGen/MachinePostDominators.h" |
37 | #include "llvm/InitializePasses.h" |
38 | #include "llvm/Support/DebugCounter.h" |
39 | #include "llvm/TargetParser/TargetParser.h" |
40 | using namespace llvm; |
41 | |
42 | #define DEBUG_TYPE "si-insert-waitcnts" |
43 | |
44 | DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp" , |
45 | "Force emit s_waitcnt expcnt(0) instrs" ); |
46 | DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm" , |
47 | "Force emit s_waitcnt lgkmcnt(0) instrs" ); |
48 | DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm" , |
49 | "Force emit s_waitcnt vmcnt(0) instrs" ); |
50 | |
51 | static cl::opt<bool> ForceEmitZeroFlag( |
52 | "amdgpu-waitcnt-forcezero" , |
53 | cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)" ), |
54 | cl::init(Val: false), cl::Hidden); |
55 | |
56 | namespace { |
57 | // Class of object that encapsulates latest instruction counter score |
58 | // associated with the operand. Used for determining whether |
59 | // s_waitcnt instruction needs to be emitted. |
60 | |
// Hardware wait counters, in the order used to index the scoring tables
// below. The first four exist on all generations; SAMPLE/BVH/KM split out
// of the older counters on gfx12+.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Export counter.
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
73 | } // namespace |
74 | |
75 | namespace llvm { |
// Mark InstCounterType as iterable so llvm::enum_seq() (used by
// inst_counter_types() below) can generate ranges over it.
template <> struct enum_iteration_traits<InstCounterType> {
  static constexpr bool is_iterable = true;
};
79 | } // namespace llvm |
80 | |
81 | namespace { |
82 | // Return an iterator over all counters between LOAD_CNT (the first counter) |
83 | // and \c MaxCounter (exclusive, default value yields an enumeration over |
84 | // all counters). |
85 | auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) { |
86 | return enum_seq(Begin: LOAD_CNT, End: MaxCounter); |
87 | } |
88 | |
89 | using RegInterval = std::pair<int, int>; |
90 | |
// Per-subtarget maximum value each wait counter can hold; queried through
// WaitcntBrackets::getWaitCountMax().
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
100 | |
// Hardware encoding bounds of the VGPR and SGPR ranges; used by
// WaitcntBrackets::getRegInterval() to map an operand's encoding to a
// scoreboard slot (see the range asserts there).
struct RegisterEncoding {
  unsigned VGPR0; // Encoding of the first VGPR.
  unsigned VGPRL; // Encoding of the last VGPR.
  unsigned SGPR0; // Encoding of the first SGPR.
  unsigned SGPRL; // Encoding of the last SGPR.
};
107 | |
// Events that make a counter pending. eventCounter() maps each of these to
// the InstCounterType it increments via the per-generation mask tables
// returned by WaitcntGenerator::getWaitEventMask().
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
127 | |
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  // NOTE(review): the two enumerator names below were missing from the
  // source as received; NUM_EXTRA_VGPRS is certain (referenced later in this
  // file), EXTRA_VGPR_LDS is reconstructed — confirm against upstream.
  NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
147 | |
148 | // Enumerate different types of result-returning VMEM operations. Although |
149 | // s_waitcnt orders them all with a single vmcnt counter, in the absence of |
150 | // s_waitcnt only instructions of the same VmemType are guaranteed to write |
151 | // their results in order -- so there is no need to insert an s_waitcnt between |
152 | // two instructions of the same type that write the same vgpr. |
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  // Number of VMEM types (array-size sentinel, not a real type).
  NUM_VMEM_TYPES
};
162 | |
// Maps values of InstCounterType to the instruction that waits on that
// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
// returns true. Entry order must match the InstCounterType enum, which
// indexes this array.
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT};
170 | |
171 | static bool updateVMCntOnly(const MachineInstr &Inst) { |
172 | return SIInstrInfo::isVMEM(MI: Inst) || SIInstrInfo::isFLATGlobal(MI: Inst) || |
173 | SIInstrInfo::isFLATScratch(MI: Inst); |
174 | } |
175 | |
#ifndef NDEBUG
// True when only the four pre-gfx12 ("normal") counters are in use.
// Compiled only in asserts builds (guarded by NDEBUG), so it is intended
// for assertions.
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}
#endif // NDEBUG
181 | |
182 | VmemType getVmemType(const MachineInstr &Inst) { |
183 | assert(updateVMCntOnly(Inst)); |
184 | if (!SIInstrInfo::isMIMG(MI: Inst) && !SIInstrInfo::isVIMAGE(MI: Inst) && |
185 | !SIInstrInfo::isVSAMPLE(MI: Inst)) |
186 | return VMEM_NOSAMPLER; |
187 | const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc: Inst.getOpcode()); |
188 | const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = |
189 | AMDGPU::getMIMGBaseOpcodeInfo(BaseOpcode: Info->BaseOpcode); |
190 | return BaseInfo->BVH ? VMEM_BVH |
191 | : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER; |
192 | } |
193 | |
194 | unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
195 | switch (T) { |
196 | case LOAD_CNT: |
197 | return Wait.LoadCnt; |
198 | case EXP_CNT: |
199 | return Wait.ExpCnt; |
200 | case DS_CNT: |
201 | return Wait.DsCnt; |
202 | case STORE_CNT: |
203 | return Wait.StoreCnt; |
204 | case SAMPLE_CNT: |
205 | return Wait.SampleCnt; |
206 | case BVH_CNT: |
207 | return Wait.BvhCnt; |
208 | case KM_CNT: |
209 | return Wait.KmCnt; |
210 | default: |
211 | llvm_unreachable("bad InstCounterType" ); |
212 | } |
213 | } |
214 | |
215 | void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { |
216 | unsigned &WC = getCounterRef(Wait, T); |
217 | WC = std::min(a: WC, b: Count); |
218 | } |
219 | |
220 | void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
221 | getCounterRef(Wait, T) = ~0u; |
222 | } |
223 | |
224 | unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) { |
225 | return getCounterRef(Wait, T); |
226 | } |
227 | |
228 | // Mapping from event to counter according to the table masks. |
229 | InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { |
230 | for (auto T : inst_counter_types()) { |
231 | if (masks[T] & (1 << E)) |
232 | return T; |
233 | } |
234 | llvm_unreachable("event type has no associated counter" ); |
235 | } |
236 | |
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket,
// wait count may get decreased out of order, therefore we need to put in
// "s_waitcnt 0" before use.
245 | class WaitcntBrackets { |
246 | public: |
247 | WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, |
248 | HardwareLimits Limits, RegisterEncoding Encoding, |
249 | const unsigned *WaitEventMaskForInst, |
250 | InstCounterType SmemAccessCounter) |
251 | : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), |
252 | Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst), |
253 | SmemAccessCounter(SmemAccessCounter) {} |
254 | |
255 | unsigned getWaitCountMax(InstCounterType T) const { |
256 | switch (T) { |
257 | case LOAD_CNT: |
258 | return Limits.LoadcntMax; |
259 | case DS_CNT: |
260 | return Limits.DscntMax; |
261 | case EXP_CNT: |
262 | return Limits.ExpcntMax; |
263 | case STORE_CNT: |
264 | return Limits.StorecntMax; |
265 | case SAMPLE_CNT: |
266 | return Limits.SamplecntMax; |
267 | case BVH_CNT: |
268 | return Limits.BvhcntMax; |
269 | case KM_CNT: |
270 | return Limits.KmcntMax; |
271 | default: |
272 | break; |
273 | } |
274 | return 0; |
275 | } |
276 | |
277 | unsigned getScoreLB(InstCounterType T) const { |
278 | assert(T < NUM_INST_CNTS); |
279 | return ScoreLBs[T]; |
280 | } |
281 | |
282 | unsigned getScoreUB(InstCounterType T) const { |
283 | assert(T < NUM_INST_CNTS); |
284 | return ScoreUBs[T]; |
285 | } |
286 | |
287 | unsigned getScoreRange(InstCounterType T) const { |
288 | return getScoreUB(T) - getScoreLB(T); |
289 | } |
290 | |
291 | unsigned getRegScore(int GprNo, InstCounterType T) const { |
292 | if (GprNo < NUM_ALL_VGPRS) { |
293 | return VgprScores[T][GprNo]; |
294 | } |
295 | assert(T == SmemAccessCounter); |
296 | return SgprScores[GprNo - NUM_ALL_VGPRS]; |
297 | } |
298 | |
299 | bool merge(const WaitcntBrackets &Other); |
300 | |
301 | RegInterval getRegInterval(const MachineInstr *MI, |
302 | const MachineRegisterInfo *MRI, |
303 | const SIRegisterInfo *TRI, unsigned OpNo) const; |
304 | |
305 | bool counterOutOfOrder(InstCounterType T) const; |
306 | void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; |
307 | void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; |
308 | void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const; |
309 | void applyWaitcnt(const AMDGPU::Waitcnt &Wait); |
310 | void applyWaitcnt(InstCounterType T, unsigned Count); |
311 | void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, |
312 | const MachineRegisterInfo *MRI, WaitEventType E, |
313 | MachineInstr &MI); |
314 | |
315 | unsigned hasPendingEvent() const { return PendingEvents; } |
316 | unsigned hasPendingEvent(WaitEventType E) const { |
317 | return PendingEvents & (1 << E); |
318 | } |
319 | unsigned hasPendingEvent(InstCounterType T) const { |
320 | unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; |
321 | assert((HasPending != 0) == (getScoreRange(T) != 0)); |
322 | return HasPending; |
323 | } |
324 | |
325 | bool hasMixedPendingEvents(InstCounterType T) const { |
326 | unsigned Events = hasPendingEvent(T); |
327 | // Return true if more than one bit is set in Events. |
328 | return Events & (Events - 1); |
329 | } |
330 | |
331 | bool hasPendingFlat() const { |
332 | return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && |
333 | LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || |
334 | (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && |
335 | LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); |
336 | } |
337 | |
338 | void setPendingFlat() { |
339 | LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; |
340 | LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; |
341 | } |
342 | |
343 | // Return true if there might be pending writes to the specified vgpr by VMEM |
344 | // instructions with types different from V. |
345 | bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const { |
346 | assert(GprNo < NUM_ALL_VGPRS); |
347 | return VgprVmemTypes[GprNo] & ~(1 << V); |
348 | } |
349 | |
350 | void clearVgprVmemTypes(int GprNo) { |
351 | assert(GprNo < NUM_ALL_VGPRS); |
352 | VgprVmemTypes[GprNo] = 0; |
353 | } |
354 | |
355 | void setStateOnFunctionEntryOrReturn() { |
356 | setScoreUB(T: STORE_CNT, Val: getScoreUB(T: STORE_CNT) + getWaitCountMax(T: STORE_CNT)); |
357 | PendingEvents |= WaitEventMaskForInst[STORE_CNT]; |
358 | } |
359 | |
360 | ArrayRef<const MachineInstr *> getLDSDMAStores() const { |
361 | return LDSDMAStores; |
362 | } |
363 | |
364 | void print(raw_ostream &); |
365 | void dump() { print(dbgs()); } |
366 | |
367 | private: |
368 | struct MergeInfo { |
369 | unsigned OldLB; |
370 | unsigned OtherLB; |
371 | unsigned MyShift; |
372 | unsigned OtherShift; |
373 | }; |
374 | static bool mergeScore(const MergeInfo &M, unsigned &Score, |
375 | unsigned OtherScore); |
376 | |
377 | void setScoreLB(InstCounterType T, unsigned Val) { |
378 | assert(T < NUM_INST_CNTS); |
379 | ScoreLBs[T] = Val; |
380 | } |
381 | |
382 | void setScoreUB(InstCounterType T, unsigned Val) { |
383 | assert(T < NUM_INST_CNTS); |
384 | ScoreUBs[T] = Val; |
385 | |
386 | if (T != EXP_CNT) |
387 | return; |
388 | |
389 | if (getScoreRange(T: EXP_CNT) > getWaitCountMax(T: EXP_CNT)) |
390 | ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(T: EXP_CNT); |
391 | } |
392 | |
393 | void setRegScore(int GprNo, InstCounterType T, unsigned Val) { |
394 | if (GprNo < NUM_ALL_VGPRS) { |
395 | VgprUB = std::max(a: VgprUB, b: GprNo); |
396 | VgprScores[T][GprNo] = Val; |
397 | } else { |
398 | assert(T == SmemAccessCounter); |
399 | SgprUB = std::max(a: SgprUB, b: GprNo - NUM_ALL_VGPRS); |
400 | SgprScores[GprNo - NUM_ALL_VGPRS] = Val; |
401 | } |
402 | } |
403 | |
404 | void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII, |
405 | const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, |
406 | unsigned OpNo, unsigned Val); |
407 | |
408 | const GCNSubtarget *ST = nullptr; |
409 | InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; |
410 | HardwareLimits Limits = {}; |
411 | RegisterEncoding Encoding = {}; |
412 | const unsigned *WaitEventMaskForInst; |
413 | InstCounterType SmemAccessCounter; |
414 | unsigned ScoreLBs[NUM_INST_CNTS] = {0}; |
415 | unsigned ScoreUBs[NUM_INST_CNTS] = {0}; |
416 | unsigned PendingEvents = 0; |
417 | // Remember the last flat memory operation. |
418 | unsigned LastFlat[NUM_INST_CNTS] = {0}; |
419 | // wait_cnt scores for every vgpr. |
420 | // Keep track of the VgprUB and SgprUB to make merge at join efficient. |
421 | int VgprUB = -1; |
422 | int SgprUB = -1; |
423 | unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; |
424 | // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt |
425 | // pre-gfx12) or KM_CNT (gfx12+ only) are relevant. |
426 | unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0}; |
427 | // Bitmask of the VmemTypes of VMEM instructions that might have a pending |
428 | // write to each vgpr. |
429 | unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; |
430 | // Store representative LDS DMA operations. The only useful info here is |
431 | // alias info. One store is kept per unique AAInfo. |
432 | SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; |
433 | }; |
434 | |
435 | // This abstracts the logic for generating and updating S_WAIT* instructions |
436 | // away from the analysis that determines where they are needed. This was |
437 | // done because the set of counters and instructions for waiting on them |
438 | // underwent a major shift with gfx12, sufficiently so that having this |
439 | // abstraction allows the main analysis logic to be simpler than it would |
440 | // otherwise have had to become. |
441 | class WaitcntGenerator { |
442 | protected: |
443 | const GCNSubtarget *ST = nullptr; |
444 | const SIInstrInfo *TII = nullptr; |
445 | AMDGPU::IsaVersion IV; |
446 | InstCounterType MaxCounter; |
447 | |
448 | public: |
449 | WaitcntGenerator() {} |
450 | WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter) |
451 | : ST(ST), TII(ST->getInstrInfo()), |
452 | IV(AMDGPU::getIsaVersion(GPU: ST->getCPU())), MaxCounter(MaxCounter) {} |
453 | |
454 | // Edits an existing sequence of wait count instructions according |
455 | // to an incoming Waitcnt value, which is itself updated to reflect |
456 | // any new wait count instructions which may need to be generated by |
457 | // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits |
458 | // were made. |
459 | // |
460 | // This editing will usually be merely updated operands, but it may also |
461 | // delete instructions if the incoming Wait value indicates they are not |
462 | // needed. It may also remove existing instructions for which a wait |
463 | // is needed if it can be determined that it is better to generate new |
464 | // instructions later, as can happen on gfx12. |
465 | virtual bool |
466 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
467 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
468 | MachineBasicBlock::instr_iterator It) const = 0; |
469 | |
470 | // Transform a soft waitcnt into a normal one. |
471 | bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const; |
472 | |
473 | // Generates new wait count instructions according to the value of |
474 | // Wait, returning true if any new instructions were created. |
475 | virtual bool createNewWaitcnt(MachineBasicBlock &Block, |
476 | MachineBasicBlock::instr_iterator It, |
477 | AMDGPU::Waitcnt Wait) = 0; |
478 | |
479 | // Returns an array of bit masks which can be used to map values in |
480 | // WaitEventType to corresponding counter values in InstCounterType. |
481 | virtual const unsigned *getWaitEventMask() const = 0; |
482 | |
483 | // Returns a new waitcnt with all counters except VScnt set to 0. If |
484 | // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u. |
485 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0; |
486 | |
487 | virtual ~WaitcntGenerator() = default; |
488 | |
489 | // Create a mask value from the initializer list of wait event types. |
490 | static constexpr unsigned |
491 | eventMask(std::initializer_list<WaitEventType> Events) { |
492 | unsigned Mask = 0; |
493 | for (auto &E : Events) |
494 | Mask |= 1 << E; |
495 | |
496 | return Mask; |
497 | } |
498 | }; |
499 | |
500 | class WaitcntGeneratorPreGFX12 : public WaitcntGenerator { |
501 | public: |
502 | WaitcntGeneratorPreGFX12() {} |
503 | WaitcntGeneratorPreGFX12(const GCNSubtarget *ST) |
504 | : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {} |
505 | |
506 | bool |
507 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
508 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
509 | MachineBasicBlock::instr_iterator It) const override; |
510 | |
511 | bool createNewWaitcnt(MachineBasicBlock &Block, |
512 | MachineBasicBlock::instr_iterator It, |
513 | AMDGPU::Waitcnt Wait) override; |
514 | |
515 | const unsigned *getWaitEventMask() const override { |
516 | assert(ST); |
517 | |
518 | static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = { |
519 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, |
520 | VMEM_BVH_READ_ACCESS}), |
521 | eventMask(Events: {SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}), |
522 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
523 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
524 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
525 | 0, |
526 | 0, |
527 | 0}; |
528 | |
529 | return WaitEventMaskForInstPreGFX12; |
530 | } |
531 | |
532 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
533 | }; |
534 | |
535 | class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { |
536 | public: |
537 | WaitcntGeneratorGFX12Plus() {} |
538 | WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter) |
539 | : WaitcntGenerator(ST, MaxCounter) {} |
540 | |
541 | bool |
542 | applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets, |
543 | MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait, |
544 | MachineBasicBlock::instr_iterator It) const override; |
545 | |
546 | bool createNewWaitcnt(MachineBasicBlock &Block, |
547 | MachineBasicBlock::instr_iterator It, |
548 | AMDGPU::Waitcnt Wait) override; |
549 | |
550 | const unsigned *getWaitEventMask() const override { |
551 | assert(ST); |
552 | |
553 | static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = { |
554 | eventMask(Events: {VMEM_ACCESS, VMEM_READ_ACCESS}), |
555 | eventMask(Events: {LDS_ACCESS, GDS_ACCESS}), |
556 | eventMask(Events: {EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS, |
557 | EXP_POS_ACCESS, EXP_LDS_ACCESS}), |
558 | eventMask(Events: {VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}), |
559 | eventMask(Events: {VMEM_SAMPLER_READ_ACCESS}), |
560 | eventMask(Events: {VMEM_BVH_READ_ACCESS}), |
561 | eventMask(Events: {SMEM_ACCESS, SQ_MESSAGE})}; |
562 | |
563 | return WaitEventMaskForInstGFX12Plus; |
564 | } |
565 | |
566 | virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; |
567 | }; |
568 | |
569 | class SIInsertWaitcnts : public MachineFunctionPass { |
570 | private: |
571 | const GCNSubtarget *ST = nullptr; |
572 | const SIInstrInfo *TII = nullptr; |
573 | const SIRegisterInfo *TRI = nullptr; |
574 | const MachineRegisterInfo *MRI = nullptr; |
575 | |
576 | DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses; |
577 | DenseMap<MachineBasicBlock *, bool> ; |
578 | MachineLoopInfo *MLI; |
579 | MachinePostDominatorTree *PDT; |
580 | AliasAnalysis *AA = nullptr; |
581 | |
582 | struct BlockInfo { |
583 | std::unique_ptr<WaitcntBrackets> Incoming; |
584 | bool Dirty = true; |
585 | }; |
586 | |
587 | InstCounterType SmemAccessCounter; |
588 | |
589 | MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; |
590 | |
591 | // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0 |
592 | // because of amdgpu-waitcnt-forcezero flag |
593 | bool ForceEmitZeroWaitcnts; |
594 | bool ForceEmitWaitcnt[NUM_INST_CNTS]; |
595 | |
596 | bool OptNone; |
597 | |
598 | // In any given run of this pass, WCG will point to one of these two |
599 | // generator objects, which must have been re-initialised before use |
600 | // from a value made using a subtarget constructor. |
601 | WaitcntGeneratorPreGFX12 WCGPreGFX12; |
602 | WaitcntGeneratorGFX12Plus WCGGFX12Plus; |
603 | |
604 | WaitcntGenerator *WCG = nullptr; |
605 | |
606 | // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS |
607 | // message. |
608 | DenseSet<MachineInstr *> ReleaseVGPRInsts; |
609 | |
610 | InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; |
611 | |
612 | public: |
613 | static char ID; |
614 | |
615 | SIInsertWaitcnts() : MachineFunctionPass(ID) { |
616 | (void)ForceExpCounter; |
617 | (void)ForceLgkmCounter; |
618 | (void)ForceVMCounter; |
619 | } |
620 | |
621 | bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); |
622 | bool isPreheaderToFlush(MachineBasicBlock &MBB, |
623 | WaitcntBrackets &ScoreBrackets); |
624 | bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; |
625 | bool runOnMachineFunction(MachineFunction &MF) override; |
626 | |
627 | StringRef getPassName() const override { |
628 | return "SI insert wait instructions" ; |
629 | } |
630 | |
631 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
632 | AU.setPreservesCFG(); |
633 | AU.addRequired<MachineLoopInfo>(); |
634 | AU.addRequired<MachinePostDominatorTree>(); |
635 | AU.addUsedIfAvailable<AAResultsWrapperPass>(); |
636 | AU.addPreserved<AAResultsWrapperPass>(); |
637 | MachineFunctionPass::getAnalysisUsage(AU); |
638 | } |
639 | |
640 | bool isForceEmitWaitcnt() const { |
641 | for (auto T : inst_counter_types()) |
642 | if (ForceEmitWaitcnt[T]) |
643 | return true; |
644 | return false; |
645 | } |
646 | |
647 | void setForceEmitWaitcnt() { |
648 | // For non-debug builds, ForceEmitWaitcnt has been initialized to false; |
649 | // For debug builds, get the debug counter info and adjust if need be |
650 | #ifndef NDEBUG |
651 | if (DebugCounter::isCounterSet(ID: ForceExpCounter) && |
652 | DebugCounter::shouldExecute(CounterName: ForceExpCounter)) { |
653 | ForceEmitWaitcnt[EXP_CNT] = true; |
654 | } else { |
655 | ForceEmitWaitcnt[EXP_CNT] = false; |
656 | } |
657 | |
658 | if (DebugCounter::isCounterSet(ID: ForceLgkmCounter) && |
659 | DebugCounter::shouldExecute(CounterName: ForceLgkmCounter)) { |
660 | ForceEmitWaitcnt[DS_CNT] = true; |
661 | ForceEmitWaitcnt[KM_CNT] = true; |
662 | } else { |
663 | ForceEmitWaitcnt[DS_CNT] = false; |
664 | ForceEmitWaitcnt[KM_CNT] = false; |
665 | } |
666 | |
667 | if (DebugCounter::isCounterSet(ID: ForceVMCounter) && |
668 | DebugCounter::shouldExecute(CounterName: ForceVMCounter)) { |
669 | ForceEmitWaitcnt[LOAD_CNT] = true; |
670 | ForceEmitWaitcnt[SAMPLE_CNT] = true; |
671 | ForceEmitWaitcnt[BVH_CNT] = true; |
672 | } else { |
673 | ForceEmitWaitcnt[LOAD_CNT] = false; |
674 | ForceEmitWaitcnt[SAMPLE_CNT] = false; |
675 | ForceEmitWaitcnt[BVH_CNT] = false; |
676 | } |
677 | #endif // NDEBUG |
678 | } |
679 | |
680 | // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or |
681 | // FLAT instruction. |
682 | WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const { |
683 | // Maps VMEM access types to their corresponding WaitEventType. |
684 | static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = { |
685 | VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}; |
686 | |
687 | assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst)); |
688 | // LDS DMA loads are also stores, but on the LDS side. On the VMEM side |
689 | // these should use VM_CNT. |
690 | if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(MI: Inst)) |
691 | return VMEM_ACCESS; |
692 | if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)) { |
693 | // FLAT and SCRATCH instructions may access scratch. Other VMEM |
694 | // instructions do not. |
695 | if (SIInstrInfo::isFLAT(MI: Inst) && mayAccessScratchThroughFlat(MI: Inst)) |
696 | return SCRATCH_WRITE_ACCESS; |
697 | return VMEM_WRITE_ACCESS; |
698 | } |
699 | if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(MI: Inst)) |
700 | return VMEM_READ_ACCESS; |
701 | return VmemReadMapping[getVmemType(Inst)]; |
702 | } |
703 | |
704 | bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const; |
705 | bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; |
706 | bool mayAccessScratchThroughFlat(const MachineInstr &MI) const; |
707 | bool generateWaitcntInstBefore(MachineInstr &MI, |
708 | WaitcntBrackets &ScoreBrackets, |
709 | MachineInstr *OldWaitcntInstr, |
710 | bool FlushVmCnt); |
711 | bool generateWaitcnt(AMDGPU::Waitcnt Wait, |
712 | MachineBasicBlock::instr_iterator It, |
713 | MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets, |
714 | MachineInstr *OldWaitcntInstr); |
715 | void updateEventWaitcntAfter(MachineInstr &Inst, |
716 | WaitcntBrackets *ScoreBrackets); |
717 | bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block, |
718 | WaitcntBrackets &ScoreBrackets); |
719 | }; |
720 | |
721 | } // end anonymous namespace |
722 | |
723 | RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, |
724 | const MachineRegisterInfo *MRI, |
725 | const SIRegisterInfo *TRI, |
726 | unsigned OpNo) const { |
727 | const MachineOperand &Op = MI->getOperand(i: OpNo); |
728 | if (!TRI->isInAllocatableClass(Op.getReg())) |
729 | return {-1, -1}; |
730 | |
731 | // A use via a PW operand does not need a waitcnt. |
732 | // A partial write is not a WAW. |
733 | assert(!Op.getSubReg() || !Op.isUndef()); |
734 | |
735 | RegInterval Result; |
736 | |
737 | unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) & |
738 | AMDGPU::HWEncoding::REG_IDX_MASK; |
739 | |
740 | if (TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) { |
741 | assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); |
742 | Result.first = Reg - Encoding.VGPR0; |
743 | if (TRI->isAGPR(MRI: *MRI, Reg: Op.getReg())) |
744 | Result.first += AGPR_OFFSET; |
745 | assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); |
746 | } else if (TRI->isSGPRReg(MRI: *MRI, Reg: Op.getReg())) { |
747 | assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); |
748 | Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; |
749 | assert(Result.first >= NUM_ALL_VGPRS && |
750 | Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); |
751 | } |
752 | // TODO: Handle TTMP |
753 | // else if (TRI->isTTMP(*MRI, Reg.getReg())) ... |
754 | else |
755 | return {-1, -1}; |
756 | |
757 | const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg()); |
758 | unsigned Size = TRI->getRegSizeInBits(*RC); |
759 | Result.second = Result.first + ((Size + 16) / 32); |
760 | |
761 | return Result; |
762 | } |
763 | |
764 | void WaitcntBrackets::setExpScore(const MachineInstr *MI, |
765 | const SIInstrInfo *TII, |
766 | const SIRegisterInfo *TRI, |
767 | const MachineRegisterInfo *MRI, unsigned OpNo, |
768 | unsigned Val) { |
769 | RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo); |
770 | assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg())); |
771 | for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { |
772 | setRegScore(GprNo: RegNo, T: EXP_CNT, Val); |
773 | } |
774 | } |
775 | |
/// Record that \p Inst raised wait event \p E: advance the upper bound of the
/// counter associated with the event, and attach the new score to every
/// register whose contents only become valid once the event completes.
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                    const SIRegisterInfo *TRI,
                                    const MachineRegisterInfo *MRI,
                                    WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(masks: WaitEventMaskForInst, E);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  if (CurrScore == 0)
    report_fatal_error(reason: "InsertWaitcnt score wraparound" );
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not.
  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
  PendingEvents |= 1 << E;
  setScoreUB(T, Val: CurrScore);

  if (T == EXP_CNT) {
    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(MI: Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      int AddrOpIdx =
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
      // All GDS operations must protect their address register (same as
      // export.)
      if (AddrOpIdx != -1) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: AddrOpIdx, Val: CurrScore);
      }

      if (Inst.mayStore()) {
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
          setExpScore(
              &Inst, TII, TRI, MRI,
              AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
              CurrScore);
        }
        if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (SIInstrInfo::isAtomicRet(MI: Inst) && !SIInstrInfo::isGWS(MI: Inst) &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        // Returning DS atomics (other than the excluded opcodes above)
        // protect every vector-register use operand.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(i: I);
          if (Op.isReg() && !Op.isDef() &&
              TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg())) {
            setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      }
    } else if (TII->isMUBUF(MI: Inst)) {
      if (Inst.mayStore()) {
        setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: 0, Val: CurrScore);
      } else if (SIInstrInfo::isAtomicRet(MI: Inst)) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isLDSDIR(MI: Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setExpScore(
          &Inst, TII, TRI, MRI,
          AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
          CurrScore);
    } else {
      if (TII->isEXP(MI: Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(i: I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRI: *MRI, Reg: DefMO.getReg())) {
            setRegScore(
                GprNo: TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
                T: EXP_CNT, Val: CurrScore);
          }
        }
      }
      // Generic fallback: score every vector-register use operand.
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(i: I);
        if (MO.isReg() && !MO.isDef() &&
            TRI->isVectorRegister(MRI: *MRI, Reg: MO.getReg())) {
          setExpScore(MI: &Inst, TII, TRI, MRI, OpNo: I, Val: CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
    for (int RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else /* all counters other than EXP_CNT, e.g. LOAD/DS/STORE/KM_CNT */ {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      auto &Op = Inst.getOperand(i: I);
      if (!Op.isReg() || !Op.isDef())
        continue;
      RegInterval Interval = getRegInterval(MI: &Inst, MRI, TRI, OpNo: I);
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          // updateVMCntOnly should only leave us with VGPRs
          // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
          // defs. That's required for a sane index into `VgprMemTypes` below
          assert(TRI->isVectorRegister(*MRI, Op.getReg()));
          VmemType V = getVmemType(Inst);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= 1 << V;
        }
      }
      for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(GprNo: RegNo, T, Val: CurrScore);
      }
    }
    if (Inst.mayStore() &&
        (TII->isDS(MI: Inst) || TII->mayWriteLDSThroughDMA(MI: Inst))) {
      // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
      // written can be accessed. A load from LDS to VMEM does not need a wait.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        // Comparing just AA info does not guarantee memoperands are equal
        // in general, but this is so for LDS DMA in practice.
        auto AAI = MemOp->getAAInfo();
        // Alias scope information gives a way to definitely identify an
        // original memory object and practically produced in the module LDS
        // lowering pass. If there is no scope available we will not be able
        // to disambiguate LDS aliasing as after the module lowering all LDS
        // is squashed into a single big object. Do not attempt to use one of
        // the limited LDSDMAStores for something we will not be able to use
        // anyway.
        if (!AAI || !AAI.Scope)
          break;
        // Reuse an existing slot whose recorded store carries matching AA
        // info; Slot remains 0 if no match is found.
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
          break;
        LDSDMAStores.push_back(Elt: &Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, Val: CurrScore);
      // NOTE(review): slot 0 appears to act as the catch-all LDS score; it is
      // bumped as well for stores placed in a dedicated slot.
      if (Slot)
        setRegScore(GprNo: SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, Val: CurrScore);
    }
  }
}
970 | |
/// Dump the score range of every counter and, when events are outstanding,
/// the relative score of each tracked register, to \p OS (debugging aid).
void WaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (auto T : inst_counter_types(MaxCounter)) {
    unsigned SR = getScoreRange(T);

    // Counter names differ between targets with and without extended wait
    // counts.
    switch (T) {
    case LOAD_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM" ) << "_CNT("
         << SR << "): " ;
      break;
    case DS_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM" ) << "_CNT("
         << SR << "): " ;
      break;
    case EXP_CNT:
      OS << " EXP_CNT(" << SR << "): " ;
      break;
    case STORE_CNT:
      OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS" ) << "_CNT("
         << SR << "): " ;
      break;
    case SAMPLE_CNT:
      OS << " SAMPLE_CNT(" << SR << "): " ;
      break;
    case BVH_CNT:
      OS << " BVH_CNT(" << SR << "): " ;
      break;
    case KM_CNT:
      OS << " KM_CNT(" << SR << "): " ;
      break;
    default:
      OS << " UNKNOWN(" << SR << "): " ;
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(GprNo: J, T);
        // Scores at or below the lower bound have already been waited for.
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " " ;
        } else {
          OS << RelScore << ":ds " ;
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == SmemAccessCounter) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(GprNo: J + NUM_ALL_VGPRS, T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " " ;
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}
1036 | |
/// Simplify the waitcnt, in the sense of removing redundant counts. Each
/// individual count in \p Wait is reset to ~0u ("no wait") when the score
/// bracket shows the wait cannot have any effect; see the per-counter
/// overload for the criterion. (Note: this returns void; whether any wait
/// remains needed must be queried on \p Wait itself afterwards.)
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(T: LOAD_CNT, Count&: Wait.LoadCnt);
  simplifyWaitcnt(T: EXP_CNT, Count&: Wait.ExpCnt);
  simplifyWaitcnt(T: DS_CNT, Count&: Wait.DsCnt);
  simplifyWaitcnt(T: STORE_CNT, Count&: Wait.StoreCnt);
  simplifyWaitcnt(T: SAMPLE_CNT, Count&: Wait.SampleCnt);
  simplifyWaitcnt(T: BVH_CNT, Count&: Wait.BvhCnt);
  simplifyWaitcnt(T: KM_CNT, Count&: Wait.KmCnt);
}
1048 | |
1049 | void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, |
1050 | unsigned &Count) const { |
1051 | // The number of outstanding events for this type, T, can be calculated |
1052 | // as (UB - LB). If the current Count is greater than or equal to the number |
1053 | // of outstanding events, then the wait for this counter is redundant. |
1054 | if (Count >= getScoreRange(T)) |
1055 | Count = ~0u; |
1056 | } |
1057 | |
/// Compute the wait (if any) required on counter \p T before the register
/// slot \p RegNo may be accessed, and merge it into \p Wait.
void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
                                    AMDGPU::Waitcnt &Wait) const {
  unsigned ScoreToWait = getRegScore(GprNo: RegNo, T);

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      addWait(Wait, T, Count: 0);
    } else if (counterOutOfOrder(T)) {
      // Counter can get decremented out-of-order when there
      // are multiple types event in the bracket. Also emit an s_wait counter
      // with a conservative value of 0 for the counter.
      addWait(Wait, T, Count: 0);
    } else {
      // If a counter has been maxed out avoid overflow by waiting for
      // MAX(CounterType) - 1 instead.
      unsigned NeededWait = std::min(a: UB - ScoreToWait, b: getWaitCountMax(T) - 1);
      addWait(Wait, T, Count: NeededWait);
    }
  }
}
1086 | |
1087 | void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { |
1088 | applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt); |
1089 | applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt); |
1090 | applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt); |
1091 | applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt); |
1092 | applyWaitcnt(T: SAMPLE_CNT, Count: Wait.SampleCnt); |
1093 | applyWaitcnt(T: BVH_CNT, Count: Wait.BvhCnt); |
1094 | applyWaitcnt(T: KM_CNT, Count: Wait.KmCnt); |
1095 | } |
1096 | |
/// Update the bracket of counter \p T to reflect that a wait for \p Count
/// has been executed: raise the lower bound past every event that must now
/// have completed.
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  // Waiting for >= UB events cannot tighten the bracket (UB - Count <= LB).
  if (Count >= UB)
    return;
  if (Count != 0) {
    // A non-zero wait gives no per-event guarantee if this counter can
    // decrement out of order, so leave the bounds untouched in that case.
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, Val: std::max(a: getScoreLB(T), b: UB - Count));
  } else {
    // A wait for zero retires everything outstanding on this counter.
    setScoreLB(T, Val: UB);
    PendingEvents &= ~WaitEventMaskForInst[T];
  }
}
1110 | |
1111 | // Where there are multiple types of event in the bracket of a counter, |
1112 | // the decrement may go out of order. |
1113 | bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { |
1114 | // Scalar memory read always can go out of order. |
1115 | if (T == SmemAccessCounter && hasPendingEvent(E: SMEM_ACCESS)) |
1116 | return true; |
1117 | return hasMixedPendingEvents(T); |
1118 | } |
1119 | |
// Legacy pass-manager registration, including the analyses this pass
// depends on.
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts" , false,
                    false)

// The address of ID serves as the unique identifier of the pass.
char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1130 | |
1131 | FunctionPass *llvm::createSIInsertWaitcntsPass() { |
1132 | return new SIInsertWaitcnts(); |
1133 | } |
1134 | |
1135 | static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, |
1136 | unsigned NewEnc) { |
1137 | int OpIdx = AMDGPU::getNamedOperandIdx(Opcode: MI.getOpcode(), NamedIdx: OpName); |
1138 | assert(OpIdx >= 0); |
1139 | |
1140 | MachineOperand &MO = MI.getOperand(i: OpIdx); |
1141 | |
1142 | if (NewEnc == MO.getImm()) |
1143 | return false; |
1144 | |
1145 | MO.setImm(NewEnc); |
1146 | return true; |
1147 | } |
1148 | |
1149 | /// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, |
1150 | /// and if so, which counter it is waiting on. |
1151 | static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) { |
1152 | switch (Opcode) { |
1153 | case AMDGPU::S_WAIT_LOADCNT: |
1154 | return LOAD_CNT; |
1155 | case AMDGPU::S_WAIT_EXPCNT: |
1156 | return EXP_CNT; |
1157 | case AMDGPU::S_WAIT_STORECNT: |
1158 | return STORE_CNT; |
1159 | case AMDGPU::S_WAIT_SAMPLECNT: |
1160 | return SAMPLE_CNT; |
1161 | case AMDGPU::S_WAIT_BVHCNT: |
1162 | return BVH_CNT; |
1163 | case AMDGPU::S_WAIT_DSCNT: |
1164 | return DS_CNT; |
1165 | case AMDGPU::S_WAIT_KMCNT: |
1166 | return KM_CNT; |
1167 | default: |
1168 | return {}; |
1169 | } |
1170 | } |
1171 | |
1172 | bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const { |
1173 | unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Waitcnt->getOpcode()); |
1174 | if (Opcode == Waitcnt->getOpcode()) |
1175 | return false; |
1176 | |
1177 | Waitcnt->setDesc(TII->get(Opcode)); |
1178 | return true; |
1179 | } |
1180 | |
/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
/// from \p Wait that were added by previous passes. Currently this pass
/// conservatively assumes that these preexisting waits are required for
/// correctness.
/// \returns true if any instruction was modified or erased.
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  // At most one instruction of each kind survives; all others are merged
  // into it and erased.
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    if (II.isMetaInstruction())
      continue;

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool IsSoft = Opcode != II.getOpcode();

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(i: 0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(Version: IV, Encoded: IEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);

      // Merge consecutive waitcnt of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(T: InstCounterType::STORE_CNT, Count&: OldVSCnt);
      Wait.StoreCnt = std::min(a: Wait.StoreCnt, b: OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    // Rewrite the surviving S_WAITCNT with the merged counts and record
    // those waits in the score brackets.
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntInstr);

    ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(T: EXP_CNT, Count: Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
    // These counts are now covered by the preexisting instruction, so the
    // caller must not emit them again.
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs()
                         << "applyPreexistingWaitcnt\n"
                         << "New Instr at block end: " << *WaitcntInstr << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(Waitcnt: WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applyPreexistingWaitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applyPreexistingWaitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
1277 | |
/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions before \p It for
/// any required counters in \p Wait.
/// \returns true if any instruction was inserted.
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Waits for VMcnt, LKGMcnt and/or EXPcnt are encoded together into a
  // single instruction while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(Version: IV, Decoded: Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    // NOTE(review): the register operand is SGPR_NULL (marked undef), which
    // appears to select the immediate-only form — confirm against ISA docs.
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1318 | |
1319 | AMDGPU::Waitcnt |
1320 | WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1321 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u); |
1322 | } |
1323 | |
1324 | AMDGPU::Waitcnt |
1325 | WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const { |
1326 | return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0); |
1327 | } |
1328 | |
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
/// were added by previous passes. Currently this pass conservatively
/// assumes that these preexisting waits are required for correctness.
/// \returns true if any instruction was modified, inserted or erased.
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  // Surviving wait instructions, at most one per kind: the two combined
  // forms plus one slot per single-counter instruction.
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  for (auto &II :
       make_early_inc_range(Range: make_range(x: OldWaitcntInstr.getIterator(), y: It))) {
    if (II.isMetaInstruction())
      continue;

    MachineInstr **UpdatableInstr;

    // Update required wait count. If this is a soft waitcnt (= it was added
    // by an earlier pass), it may be entirely removed.

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: II.getOpcode());
    bool IsSoft = Opcode != II.getOpcode();

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(Version: IV, LoadcntDscnt: OldEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(Version: IV, StorecntDscnt: OldEnc);
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(Wait&: OldWait);
      Wait = Wait.combined(Other: OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else {
      // Must be one of the single-counter S_WAIT_*CNT instructions.
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (IsSoft)
        ScoreBrackets.simplifyWaitcnt(T: CT.value(), Count&: OldCnt);
      addWait(Wait, T: CT.value(), Count: OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnt of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
    // to be waited for. Otherwise, let the instruction be deleted so
    // the appropriate single counter wait instruction can be inserted
    // instead, when new S_WAIT_*CNT instructions are inserted by
    // createNewWaitcnt(). As a side effect, resetting the wait counts will
    // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
    // the loop below that deals with single counter instructions.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(T: LOAD_CNT, Count: Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    // Similarly for S_WAIT_STORECNT_DSCNT.
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(Waitcnt: CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(T: STORE_CNT, Count: Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(T: DS_CNT, Count: Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for an opportunity to convert existing S_WAIT_LOADCNT,
  // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
  // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
  // instructions so that createNewWaitcnt() will create new combined
  // instructions to replace them.

  if (Wait.DsCnt != ~0u) {
    // This is a vector of addresses in WaitInstrs pointing to instructions
    // that should be removed if they are present.
    SmallVector<MachineInstr **, 2> WaitsToErase;

    // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
    // both) need to be waited for, ensure that there are no existing
    // individual wait count instructions for these.

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(Elt: &WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(Elt: &WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;

      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  // Rewrite or remove the surviving single-counter instructions.
  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, T: CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(Waitcnt: WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(T: CT, Count: NewCnt);
      setNoWait(Wait, T: CT);

      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
                     ? dbgs() << "applyPreexistingWaitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applyPreexistingWaitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
1510 | |
/// Generate S_WAIT_*CNT instructions before \p It for any required counters
/// in \p Wait, preferring the combined LOADCNT_DSCNT / STORECNT_DSCNT forms
/// when two counters can be covered by one instruction.
/// \returns true if any instruction was inserted.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  const DebugLoc &DL = Block.findDebugLoc(MBBI: It);

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(Version: IV, Decoded: Wait);

      SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
                      .addImm(Enc);

      // Both counters are now covered by the combined instruction.
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(Version: IV, Decoded: Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate an instruction for any remaining counter that needs
  // waiting for.

  for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, T: CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n" ;
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
1574 | |
1575 | static bool readsVCCZ(const MachineInstr &MI) { |
1576 | unsigned Opc = MI.getOpcode(); |
1577 | return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) && |
1578 | !MI.getOperand(1).isUndef(); |
1579 | } |
1580 | |
1581 | /// \returns true if the callee inserts an s_waitcnt 0 on function entry. |
1582 | static bool callWaitsOnFunctionEntry(const MachineInstr &MI) { |
1583 | // Currently all conventions wait, but this may not always be the case. |
1584 | // |
1585 | // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make |
1586 | // senses to omit the wait and do it in the caller. |
1587 | return true; |
1588 | } |
1589 | |
1590 | /// \returns true if the callee is expected to wait for any outstanding waits |
1591 | /// before returning. |
1592 | static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { |
1593 | return true; |
1594 | } |
1595 | |
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
/// flush the vmcnt counter here.
///
/// \param MI               instruction the wait is inserted before.
/// \param ScoreBrackets    current counter-score state, updated by callee.
/// \param OldWaitcntInstr  first preexisting wait instruction before \p MI
///                         (may be null) to merge with.
/// \returns true if the function was modified.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                WaitcntBrackets &ScoreBrackets,
                                                MachineInstr *OldWaitcntInstr,
                                                bool FlushVmCnt) {
  setForceEmitWaitcnt();

  // Meta instructions (e.g. debug values) never access memory.
  if (MI.isMetaInstruction())
    return false;

  // Accumulates the per-counter waits required before MI; ~0u per field means
  // "no wait needed".
  AMDGPU::Waitcnt Wait;

  // FIXME: This should have already been handled by the memory legalizer.
  // Removing this currently doesn't affect any lit tests, but we need to
  // verify that nothing was relying on this. The number of buffer invalidates
  // being handled here should not be expanded.
  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
      MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
      MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
      MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::SI_RETURN ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores. In this case it can be useful to send a message to explicitly
  // release all VGPRs before the stores have completed, but it is only safe to
  // do this if:
  // * there are no outstanding scratch stores
  // * we are not in Dynamic VGPR mode
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
        ScoreBrackets.getScoreRange(T: STORE_CNT) != 0 &&
        !ScoreBrackets.hasPendingEvent(E: SCRATCH_WRITE_ACCESS))
      ReleaseVGPRInsts.insert(V: &MI);
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
            AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
    Wait.LoadCnt = 0;
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
                               ScoreBrackets->getScoreUB(DS_CNT));
            // LDS may have to wait for VMcnt after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
                                 ScoreBrackets->getScoreUB(LOAD_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
              ScoreBrackets->getScoreUB(DS_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
              ScoreBrackets->getScoreUB(LOAD_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      if (ScoreBrackets.hasPendingEvent(E: EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(E: EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(E: EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(E: GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The function is going to insert a wait on everything in its prolog.
      // This still needs to be careful if the call target is a load (e.g. a GOT
      // load). We also need to check WAW dependency with saved PC.
      Wait = AMDGPU::Waitcnt();

      int CallAddrOpIdx =
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

      if (MI.getOperand(i: CallAddrOpIdx).isReg()) {
        // Wait for any outstanding SMEM load that feeds the call target
        // address register(s).
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: CallAddrOpIdx);

        for (int RegNo = CallAddrOpInterval.first;
             RegNo < CallAddrOpInterval.second; ++RegNo)
          ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);

        // Likewise protect the return-address destination register(s)
        // against a WAW hazard with in-flight SMEM loads.
        int RtnAddrOpIdx =
            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
        if (RtnAddrOpIdx != -1) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: RtnAddrOpIdx);

          for (int RegNo = RtnAddrOpInterval.first;
               RegNo < RtnAddrOpInterval.second; ++RegNo)
            ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
        }
      }
    } else {
      // FIXME: Should not be relying on memoperands.
      // Look at the source operands of every instruction to see if
      // any of them results from a previous memory operation that affects
      // its current usage. If so, an s_waitcnt instruction needs to be
      // emitted.
      // If the source operand was defined by a load, add the s_waitcnt
      // instruction.
      //
      // Two cases are handled for destination operands:
      // 1) If the destination operand was defined by a load, add the s_waitcnt
      // instruction to guarantee the right WAW order.
      // 2) If a destination operand that was used by a recent export/store ins,
      // add s_waitcnt on exp_cnt to guarantee the WAR order.

      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        // A store to an address previously read by an SMEM load must wait for
        // that load (RAW through memory on the scalar counter).
        if (Memop->isStore() && SLoadAddresses.count(Val: Ptr)) {
          addWait(Wait, T: SmemAccessCounter, Count: 0);
          if (PDT->dominates(A: MI.getParent(), B: SLoadAddresses.find(Val: Ptr)->second))
            SLoadAddresses.erase(Val: Ptr);
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        bool FoundAliasingStore = false;
        // Only objects with alias scope info were added to LDSDMAScopes array.
        // In the absense of the scope info we will not be able to disambiguate
        // aliasing here. There is no need to try searching for a corresponding
        // store slot. This is conservatively correct because in that case we
        // will produce a wait using the first (general) LDS DMA wait slot which
        // will wait on all of them anyway.
        if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, Other: *LDSDMAStores[I], UseTBAA: true)) {
              FoundAliasingStore = true;
              // Slot 0 is the general LDS DMA slot; aliasing stores use
              // per-store slots starting at RegNo + 1.
              ScoreBrackets.determineWait(T: LOAD_CNT, RegNo: RegNo + I + 1, Wait);
            }
          }
        }
        if (!FoundAliasingStore)
          ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
        MachineOperand &Op = MI.getOperand(i: I);
        if (!Op.isReg())
          continue;

        // If the instruction does not read tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);

        const bool IsVGPR = TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg());
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          if (IsVGPR) {
            // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
            // previous write and this write are the same type of VMEM
            // instruction, in which case they're guaranteed to write their
            // results in order anyway.
            if (Op.isUse() || !updateVMCntOnly(Inst: MI) ||
                ScoreBrackets.hasOtherPendingVmemTypes(GprNo: RegNo,
                                                       V: getVmemType(Inst: MI))) {
              ScoreBrackets.determineWait(T: LOAD_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(T: SAMPLE_CNT, RegNo, Wait);
              ScoreBrackets.determineWait(T: BVH_CNT, RegNo, Wait);
              ScoreBrackets.clearVgprVmemTypes(GprNo: RegNo);
            }
            if (Op.isDef() || ScoreBrackets.hasPendingEvent(E: EXP_LDS_ACCESS)) {
              ScoreBrackets.determineWait(T: EXP_CNT, RegNo, Wait);
            }
            ScoreBrackets.determineWait(T: DS_CNT, RegNo, Wait);
          } else {
            // Scalar registers only interact with the SMEM access counter.
            ScoreBrackets.determineWait(T: SmemAccessCounter, RegNo, Wait);
          }
        }
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need to ensure the subtarget is capable of backing off barrier
  // instructions in case there are any outstanding memory operations that may
  // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(Other: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  //       after fixing the scheduler. Also, the Shader Compiler code is
  //       independent of target.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
    if (ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) {
      Wait.DsCnt = 0;
    }
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // Debugging aids: optionally force waits on all (or individual) counters.
  if (ForceEmitZeroWaitcnts)
    Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);

  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;

  // Caller asked to flush the VMEM-related counters (e.g. loop preheader).
  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(T: BVH_CNT))
      Wait.BvhCnt = 0;
  }

  return generateWaitcnt(Wait, It: MI.getIterator(), Block&: *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
1901 | |
/// Materialize the waits in \p Wait before \p It in \p Block: first try to
/// fold them into preexisting waitcnt instructions (\p OldWaitcntInstr),
/// then fold any remaining ExpCnt into a following VINTERP's waitexp field,
/// and finally emit new wait instructions for whatever is left.
/// \returns true if the block was modified.
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, OldWaitcntInstr&: *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so, now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(MI: *It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    // Only tighten the instruction's own waitexp; never loosen it.
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u; // Handled by the VINTERP; no standalone wait needed.

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  // Emit explicit wait instructions for any counters still outstanding.
  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
1939 | |
1940 | // This is a flat memory operation. Check to see if it has memory tokens other |
1941 | // than LDS. Other address spaces supported by flat memory operations involve |
1942 | // global memory. |
1943 | bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const { |
1944 | assert(TII->isFLAT(MI)); |
1945 | |
1946 | // All flat instructions use the VMEM counter. |
1947 | assert(TII->usesVM_CNT(MI)); |
1948 | |
1949 | // If there are no memory operands then conservatively assume the flat |
1950 | // operation may access VMEM. |
1951 | if (MI.memoperands_empty()) |
1952 | return true; |
1953 | |
1954 | // See if any memory operand specifies an address space that involves VMEM. |
1955 | // Flat operations only supported FLAT, LOCAL (LDS), or address spaces |
1956 | // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION |
1957 | // (GDS) address space is not supported by flat operations. Therefore, simply |
1958 | // return true unless only the LDS address space is found. |
1959 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1960 | unsigned AS = Memop->getAddrSpace(); |
1961 | assert(AS != AMDGPUAS::REGION_ADDRESS); |
1962 | if (AS != AMDGPUAS::LOCAL_ADDRESS) |
1963 | return true; |
1964 | } |
1965 | |
1966 | return false; |
1967 | } |
1968 | |
1969 | // This is a flat memory operation. Check to see if it has memory tokens for |
1970 | // either LDS or FLAT. |
1971 | bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const { |
1972 | assert(TII->isFLAT(MI)); |
1973 | |
1974 | // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter. |
1975 | if (!TII->usesLGKM_CNT(MI)) |
1976 | return false; |
1977 | |
1978 | // If in tgsplit mode then there can be no use of LDS. |
1979 | if (ST->isTgSplitEnabled()) |
1980 | return false; |
1981 | |
1982 | // If there are no memory operands then conservatively assume the flat |
1983 | // operation may access LDS. |
1984 | if (MI.memoperands_empty()) |
1985 | return true; |
1986 | |
1987 | // See if any memory operand specifies an address space that involves LDS. |
1988 | for (const MachineMemOperand *Memop : MI.memoperands()) { |
1989 | unsigned AS = Memop->getAddrSpace(); |
1990 | if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) |
1991 | return true; |
1992 | } |
1993 | |
1994 | return false; |
1995 | } |
1996 | |
// This is a flat memory operation. Check to see if it has memory tokens for
// either scratch or FLAT.
// \returns true if \p MI may access the scratch (private) address space.
bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
    const MachineInstr &MI) const {
  assert(TII->isFLAT(MI));

  // SCRATCH instructions always access scratch.
  if (TII->isFLATScratch(MI))
    return true;

  // GLOBAL instructions never access scratch.
  if (TII->isFLATGlobal(MI))
    return false;

  // If there are no memory operands then conservatively assume the flat
  // operation may access scratch.
  if (MI.memoperands_empty())
    return true;

  // See if any memory operand specifies an address space that involves scratch.
  // A FLAT-addressed operand may also resolve to scratch at runtime.
  return any_of(Range: MI.memoperands(), P: [](const MachineMemOperand *Memop) {
    unsigned AS = Memop->getAddrSpace();
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  });
}
2022 | |
2023 | static bool isCacheInvOrWBInst(MachineInstr &Inst) { |
2024 | auto Opc = Inst.getOpcode(); |
2025 | return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || |
2026 | Opc == AMDGPU::GLOBAL_WBINV; |
2027 | } |
2028 | |
/// Record the counter events produced by \p Inst into \p ScoreBrackets,
/// classifying the instruction (DS/GDS, FLAT, VMEM, SMEM, call, export, ...)
/// and bumping the corresponding counter upper bounds and operand scores.
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.

  if (TII->isDS(MI: Inst) && TII->usesLGKM_CNT(MI: Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      // GDS also locks GPRs until the export-like access is granted.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(MI: Inst)) {
    // TODO: Track this properly.
    if (isCacheInvOrWBInst(Inst))
      return;

    assert(Inst.mayLoadOrStore());

    // Number of address spaces (VMEM / LDS) the flat access may touch.
    int FlatASCount = 0;

    if (mayAccessVMEMThroughFlat(MI: Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
                                   Inst);
    }

    if (mayAccessLDSThroughFlat(MI: Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: LDS_ACCESS, Inst);
    }

    // A Flat memory operation must access at least one address space.
    assert(FlatASCount);

    // This is a flat memory operation that access both VMEM and LDS, so note it
    // - it will require that both the VM and LGKM be flushed to zero if it is
    // pending when a VM or LGKM dependency occurs.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(MI: Inst) &&
             !llvm::AMDGPU::getMUBUFIsBufferInv(Opc: Inst.getOpcode())) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: getVmemWaitEventType(Inst),
                                 Inst);

    // On some subtargets VMEM writes/atomics also tie up EXP.
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(MI: Inst))) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(MI: Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(MI: Inst)) {
      // Act as a wait on everything
      ScoreBrackets->applyWaitcnt(
          Wait: WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(Wait: AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(MI: Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(MI: Inst)) {
    // VINTERP carries its own waitexp; treat it as an applied EXP wait.
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(T: EXP_CNT, Count: Imm);
  } else if (SIInstrInfo::isEXP(MI: Inst)) {
    // Classify the export by its target: parameter, position, or other
    // (GPR-locking) export.
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: EXP_GPR_LOCK, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
    case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
    case AMDGPU::S_BARRIER_LEAVE:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      // These scalar operations complete via the SMEM counter.
      ScoreBrackets->updateByEvent(TII, TRI, MRI, E: SMEM_ACCESS, Inst);
      break;
    }
  }
}
2126 | |
2127 | bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score, |
2128 | unsigned OtherScore) { |
2129 | unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift; |
2130 | unsigned OtherShifted = |
2131 | OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift; |
2132 | Score = std::max(a: MyShifted, b: OtherShifted); |
2133 | return OtherShifted > MyShifted; |
2134 | } |
2135 | |
/// Merge the pending events and associated score brackets of \p Other into
/// this brackets status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  // Track the widest register ranges seen on either path.
  VgprUB = std::max(a: VgprUB, b: Other.VgprUB);
  SgprUB = std::max(a: SgprUB, b: Other.SgprUB);

  for (auto T : inst_counter_types(MaxCounter)) {
    // Merge event flags for this counter
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    // Any event pending only on the other path tightens our state.
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter. The merged bracket keeps our lower
    // bound and extends the upper bound to cover the larger pending range.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(a: MyPending, b: OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error(reason: "waitcnt score overflow" );

    // Shift amounts used by mergeScore to rebase individual scores into the
    // merged bracket's coordinates.
    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, Score&: LastFlat[T], OtherScore: Other.LastFlat[T]);

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, Score&: VgprScores[T][J], OtherScore: Other.VgprScores[T][J]);

    // SGPR scores are only tracked for the SMEM access counter.
    if (T == SmemAccessCounter) {
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |= mergeScore(M, Score&: SgprScores[J], OtherScore: Other.SgprScores[J]);
    }
  }

  // Union the per-VGPR VMEM type masks; any newly-set bit tightens state.
  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
2189 | |
/// \returns true if \p Inst is any form of wait instruction this pass
/// manages: legacy S_WAITCNT, S_WAITCNT_VSCNT with a null SGPR, the GFX12+
/// combined waits, or any single extended-counter wait. Soft waitcnt
/// opcodes are normalized first so they are recognized too.
static bool isWaitInstr(MachineInstr &Inst) {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode: Inst.getOpcode());
  return Opcode == AMDGPU::S_WAITCNT ||
         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         counterTypeForInstr(Opcode).has_value();
}
2199 | |
2200 | // Generate s_waitcnt instructions where needed. |
2201 | bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, |
2202 | MachineBasicBlock &Block, |
2203 | WaitcntBrackets &ScoreBrackets) { |
2204 | bool Modified = false; |
2205 | |
2206 | LLVM_DEBUG({ |
2207 | dbgs() << "*** Block" << Block.getNumber() << " ***" ; |
2208 | ScoreBrackets.dump(); |
2209 | }); |
2210 | |
2211 | // Track the correctness of vccz through this basic block. There are two |
2212 | // reasons why it might be incorrect; see ST->hasReadVCCZBug() and |
2213 | // ST->partialVCCWritesUpdateVCCZ(). |
2214 | bool VCCZCorrect = true; |
2215 | if (ST->hasReadVCCZBug()) { |
2216 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2217 | // to vcc and then issued an smem load. |
2218 | VCCZCorrect = false; |
2219 | } else if (!ST->partialVCCWritesUpdateVCCZ()) { |
2220 | // vccz could be incorrect at a basic block boundary if a predecessor wrote |
2221 | // to vcc_lo or vcc_hi. |
2222 | VCCZCorrect = false; |
2223 | } |
2224 | |
2225 | // Walk over the instructions. |
2226 | MachineInstr *OldWaitcntInstr = nullptr; |
2227 | |
2228 | for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(), |
2229 | E = Block.instr_end(); |
2230 | Iter != E;) { |
2231 | MachineInstr &Inst = *Iter; |
2232 | |
2233 | // Track pre-existing waitcnts that were added in earlier iterations or by |
2234 | // the memory legalizer. |
2235 | if (isWaitInstr(Inst)) { |
2236 | if (!OldWaitcntInstr) |
2237 | OldWaitcntInstr = &Inst; |
2238 | ++Iter; |
2239 | continue; |
2240 | } |
2241 | |
2242 | bool FlushVmCnt = Block.getFirstTerminator() == Inst && |
2243 | isPreheaderToFlush(MBB&: Block, ScoreBrackets); |
2244 | |
2245 | // Generate an s_waitcnt instruction to be placed before Inst, if needed. |
2246 | Modified |= generateWaitcntInstBefore(MI&: Inst, ScoreBrackets, OldWaitcntInstr, |
2247 | FlushVmCnt); |
2248 | OldWaitcntInstr = nullptr; |
2249 | |
2250 | // Restore vccz if it's not known to be correct already. |
2251 | bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(MI: Inst); |
2252 | |
2253 | // Don't examine operands unless we need to track vccz correctness. |
2254 | if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) { |
2255 | if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) || |
2256 | Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) { |
2257 | // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz. |
2258 | if (!ST->partialVCCWritesUpdateVCCZ()) |
2259 | VCCZCorrect = false; |
2260 | } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) { |
2261 | // There is a hardware bug on CI/SI where SMRD instruction may corrupt |
2262 | // vccz bit, so when we detect that an instruction may read from a |
2263 | // corrupt vccz bit, we need to: |
2264 | // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD |
2265 | // operations to complete. |
2266 | // 2. Restore the correct value of vccz by writing the current value |
2267 | // of vcc back to vcc. |
2268 | if (ST->hasReadVCCZBug() && |
2269 | ScoreBrackets.hasPendingEvent(E: SMEM_ACCESS)) { |
2270 | // Writes to vcc while there's an outstanding smem read may get |
2271 | // clobbered as soon as any read completes. |
2272 | VCCZCorrect = false; |
2273 | } else { |
2274 | // Writes to vcc will fix any incorrect value in vccz. |
2275 | VCCZCorrect = true; |
2276 | } |
2277 | } |
2278 | } |
2279 | |
2280 | if (TII->isSMRD(MI: Inst)) { |
2281 | for (const MachineMemOperand *Memop : Inst.memoperands()) { |
2282 | // No need to handle invariant loads when avoiding WAR conflicts, as |
2283 | // there cannot be a vector store to the same memory location. |
2284 | if (!Memop->isInvariant()) { |
2285 | const Value *Ptr = Memop->getValue(); |
2286 | SLoadAddresses.insert(KV: std::pair(Ptr, Inst.getParent())); |
2287 | } |
2288 | } |
2289 | if (ST->hasReadVCCZBug()) { |
2290 | // This smem read could complete and clobber vccz at any time. |
2291 | VCCZCorrect = false; |
2292 | } |
2293 | } |
2294 | |
2295 | updateEventWaitcntAfter(Inst, ScoreBrackets: &ScoreBrackets); |
2296 | |
2297 | #if 0 // TODO: implement resource type check controlled by options with ub = LB. |
2298 | // If this instruction generates a S_SETVSKIP because it is an |
2299 | // indexed resource, and we are on Tahiti, then it will also force |
2300 | // an S_WAITCNT vmcnt(0) |
2301 | if (RequireCheckResourceType(Inst, context)) { |
2302 | // Force the score to as if an S_WAITCNT vmcnt(0) is emitted. |
2303 | ScoreBrackets->setScoreLB(LOAD_CNT, |
2304 | ScoreBrackets->getScoreUB(LOAD_CNT)); |
2305 | } |
2306 | #endif |
2307 | |
2308 | if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { |
2309 | AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt( |
2310 | IncludeVSCnt: Inst.mayStore() && !SIInstrInfo::isAtomicRet(MI: Inst)); |
2311 | ScoreBrackets.simplifyWaitcnt(Wait); |
2312 | Modified |= generateWaitcnt(Wait, It: std::next(x: Inst.getIterator()), Block, |
2313 | ScoreBrackets, /*OldWaitcntInstr=*/nullptr); |
2314 | } |
2315 | |
2316 | LLVM_DEBUG({ |
2317 | Inst.print(dbgs()); |
2318 | ScoreBrackets.dump(); |
2319 | }); |
2320 | |
2321 | // TODO: Remove this work-around after fixing the scheduler and enable the |
2322 | // assert above. |
2323 | if (RestoreVCCZ) { |
2324 | // Restore the vccz bit. Any time a value is written to vcc, the vcc |
2325 | // bit is updated, so we can restore the bit by reading the value of |
2326 | // vcc and then writing it back to the register. |
2327 | BuildMI(Block, Inst, Inst.getDebugLoc(), |
2328 | TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), |
2329 | TRI->getVCC()) |
2330 | .addReg(TRI->getVCC()); |
2331 | VCCZCorrect = true; |
2332 | Modified = true; |
2333 | } |
2334 | |
2335 | ++Iter; |
2336 | } |
2337 | |
2338 | // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if |
2339 | // needed. |
2340 | AMDGPU::Waitcnt Wait; |
2341 | if (Block.getFirstTerminator() == Block.end() && |
2342 | isPreheaderToFlush(MBB&: Block, ScoreBrackets)) { |
2343 | if (ScoreBrackets.hasPendingEvent(T: LOAD_CNT)) |
2344 | Wait.LoadCnt = 0; |
2345 | if (ScoreBrackets.hasPendingEvent(T: SAMPLE_CNT)) |
2346 | Wait.SampleCnt = 0; |
2347 | if (ScoreBrackets.hasPendingEvent(T: BVH_CNT)) |
2348 | Wait.BvhCnt = 0; |
2349 | } |
2350 | |
2351 | // Combine or remove any redundant waitcnts at the end of the block. |
2352 | Modified |= generateWaitcnt(Wait, It: Block.instr_end(), Block, ScoreBrackets, |
2353 | OldWaitcntInstr); |
2354 | |
2355 | return Modified; |
2356 | } |
2357 | |
2358 | // Return true if the given machine basic block is a preheader of a loop in |
2359 | // which we want to flush the vmcnt counter, and false otherwise. |
2360 | bool SIInsertWaitcnts::(MachineBasicBlock &MBB, |
2361 | WaitcntBrackets &ScoreBrackets) { |
2362 | auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(Key: &MBB, Args: false); |
2363 | if (!IsInserted) |
2364 | return Iterator->second; |
2365 | |
2366 | MachineBasicBlock *Succ = MBB.getSingleSuccessor(); |
2367 | if (!Succ) |
2368 | return false; |
2369 | |
2370 | MachineLoop *Loop = MLI->getLoopFor(BB: Succ); |
2371 | if (!Loop) |
2372 | return false; |
2373 | |
2374 | if (Loop->getLoopPreheader() == &MBB && |
2375 | shouldFlushVmCnt(ML: Loop, Brackets&: ScoreBrackets)) { |
2376 | Iterator->second = true; |
2377 | return true; |
2378 | } |
2379 | |
2380 | return false; |
2381 | } |
2382 | |
2383 | bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { |
2384 | return SIInstrInfo::isVMEM(MI) || |
2385 | (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI)); |
2386 | } |
2387 | |
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
//    vgpr containing a value that is loaded outside of the loop. (Only on
//    targets with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value that is loaded
//    outside of the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  // Register-interval numbers used (resp. defined by a VMEM load) inside the
  // loop. A register appearing in both sets means a value is loaded inside
  // the loop and used inside the loop, which invalidates both criteria.
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        if (MI.mayLoad())
          HasVMemLoad = true;
        if (MI.mayStore())
          HasVMemStore = true;
      }
      // Scan all vector-register operands of the instruction.
      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
        MachineOperand &Op = MI.getOperand(i: I);
        if (!Op.isReg() || !TRI->isVectorRegister(MRI: *MRI, Reg: Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(MI: &MI, MRI, TRI, OpNo: I);
        // Vgpr use
        if (Op.isUse()) {
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprDef.contains(V: RegNo))
              return false;
            VgprUse.insert(V: RegNo);
            // If at least one of Op's registers is in the score brackets, the
            // value is likely loaded outside of the loop.
            if (Brackets.getRegScore(GprNo: RegNo, T: LOAD_CNT) >
                    Brackets.getScoreLB(T: LOAD_CNT) ||
                Brackets.getRegScore(GprNo: RegNo, T: SAMPLE_CNT) >
                    Brackets.getScoreLB(T: SAMPLE_CNT) ||
                Brackets.getRegScore(GprNo: RegNo, T: BVH_CNT) >
                    Brackets.getScoreLB(T: BVH_CNT)) {
              UsesVgprLoadedOutside = true;
              break;
            }
          }
        }
        // VMem load vgpr def
        else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(V: RegNo))
              return false;
            VgprDef.insert(V: RegNo);
          }
      }
    }
  }
  // Criterion 1: stores only, no vscnt counter on this target.
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Criterion 2: loads whose results are unused in the loop (a used in-loop
  // load would have returned false above).
  return HasVMemLoad && UsesVgprLoadedOutside;
}
2454 | |
// Pass entry point: insert the waitcnt instructions needed by MF.
// Runs a reverse-post-order fixpoint over the CFG, propagating score
// brackets into successors until no block's incoming state changes.
bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MLI = &getAnalysis<MachineLoopInfo>();
  PDT = &getAnalysis<MachinePostDominatorTree>();
  // Alias analysis is optional; only use it when the pass manager provides it.
  if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(GPU: ST->getCPU());

  // Select the waitcnt generator matching the target's counter scheme:
  // extended (separate) counters on newer targets, the legacy combined
  // counters otherwise.
  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
    WCG = &WCGPreGFX12;
  }

  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(masks: WaitEventMaskForInst, E: SMEM_ACCESS);

  OptNone = MF.getFunction().hasOptNone() ||
            MF.getTarget().getOptLevel() == CodeGenOptLevel::None;

  // Maximum encodable value for each counter, queried per ISA version. The
  // Loadcnt/Dscnt fields double as Vmcnt/Lgkmcnt on pre-extended targets.
  HardwareLimits Limits = {};
  if (ST->hasExtendedWaitCounts()) {
    Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(Version: IV);
    Limits.DscntMax = AMDGPU::getDscntBitMask(Version: IV);
  } else {
    Limits.LoadcntMax = AMDGPU::getVmcntBitMask(Version: IV);
    Limits.DscntMax = AMDGPU::getLgkmcntBitMask(Version: IV);
  }
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(Version: IV);
  Limits.StorecntMax = AMDGPU::getStorecntBitMask(Version: IV);
  Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(Version: IV);
  Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(Version: IV);
  Limits.KmcntMax = AMDGPU::getKmcntBitMask(Version: IV);

  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);

  // Hardware encoding ranges for VGPRs and SGPRs, used by the score brackets
  // to map registers to slots.
  RegisterEncoding Encoding = {};
  Encoding.VGPR0 =
      TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
  Encoding.SGPR0 =
      TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;

  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to do the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.

    // Skip past PHIs and meta instructions to find the insertion point.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;

    if (ST->hasExtendedWaitCounts()) {
      // Extended counters: the combined load/ds wait plus one zero-wait per
      // remaining counter type.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
          .addImm(0);
      for (auto CT : inst_counter_types(MaxCounter: NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      // Legacy targets: a single s_waitcnt 0 covers everything.
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    // Seed the entry block with the conservative function-entry state.
    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
        args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
        args&: SmemAccessCounter);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fix point is reached.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.insert(KV: {MBB, BlockInfo()});

  std::unique_ptr<WaitcntBrackets> Brackets;
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      // Initialize the working brackets from the block's incoming state, or
      // to a fresh default state if none has been recorded yet. Reuse the
      // existing allocation when possible.
      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(args&: *BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(
              args&: ST, args&: MaxCounter, args&: Limits, args&: Encoding, args&: WaitEventMaskForInst,
              args&: SmemAccessCounter);
        else
          *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
                                      WaitEventMaskForInst, SmemAccessCounter);
      }

      Modified |= insertWaitcntInBlock(MF, Block&: *MBB, ScoreBrackets&: *Brackets);
      BI.Dirty = false;

      if (Brackets->hasPendingEvent()) {
        // Propagate the outgoing state into each successor. The first
        // successor without incoming state can take ownership of Brackets by
        // move; the rest get copies or are merged into their existing state.
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto SuccBII = BlockInfos.find(Key: Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            // A dirtied block at or before the current RPO position forces
            // another fixpoint iteration.
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(args&: *Brackets);
            }
          } else if (SuccBI.Incoming->merge(Other: *Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);

  if (ST->hasScalarStores()) {
    // Collect all program-exit blocks and note whether any scalar store
    // exists in the function.
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(Elt: &MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed or else the next
      // wave to reuse the same scratch memory can be clobbered.
      //
      // Insert s_dcache_wb at wave termination points if there were any scalar
      // stores, and only if the cache hasn't already been flushed. This could
      // be improved by looking across blocks for flushes in postdominating
      // blocks from the stores but an explicitly requested flush is probably
      // very rare.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          // Track whether a flush is still pending: a scalar store after a
          // flush requires another flush before termination.
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(MI: *I))
            SeenDCacheWB = false;

          // FIXME: It would be better to insert this before a waitcnt if any.
          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }

  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
  // instructions.
  for (MachineInstr *MI : ReleaseVGPRInsts) {
    // Some targets need an s_nop before the dealloc message.
    if (ST->requiresNopBeforeDeallocVGPRs()) {
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
    }
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
            TII->get(AMDGPU::S_SENDMSG))
        .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
    Modified = true;
  }
  ReleaseVGPRInsts.clear();

  return Modified;
}
2680 | |