1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25#include "llvm/Support/ErrorHandling.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35 public AMDGPUSubtarget {
36public:
  // Re-export the AMDGPUSubtarget::getMaxWavesPerEU name in this class's scope.
  // NOTE(review): presumably needed because this class declares its own
  // getMaxWavesPerEU overloads that would otherwise hide it - confirm.
  using AMDGPUSubtarget::getMaxWavesPerEU;
38
  // TrapHandlerAbi and TrapID (the two enums that follow) are documented at:
  // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  //
  // getTrapHandlerAbi() yields AMDHSA when isAmdHsaOS() is true and NONE
  // otherwise.
  enum class TrapHandlerAbi {
    NONE   = 0x00,
    AMDHSA = 0x01,
  };
45
  // Trap identifiers; the numeric values are specified by the trap handler ABI
  // section of https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapID {
    LLVMAMDHSATrap      = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };
50
private:
  /// GlobalISel related APIs.
  ///
  /// Each object is held through a unique_ptr and handed out as a raw pointer
  /// by the matching accessor (getCallLowering(), getInlineAsmLowering(),
  /// getInstructionSelector(), getLegalizerInfo(), getRegBankInfo()).
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
protected:
  // Basic subtarget description.
  Triple TargetTriple;
  // Exposed through getTargetID(); also queried for the XNACK and SRAMECC
  // settings (see isXNACKEnabled() and d16PreservesUnusedBits()).
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  // Stored as unsigned; getGeneration() converts it to Generation.
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  int LDSBankCount = 0;
  // Returned by getMaxPrivateElementSize() unless flat scratch is in use.
  unsigned MaxPrivateElementSize = 0;
67
  // Possibly statically set by tablegen, but may want to be overridden.
  // All three default to false.
  bool FastDenormalF32 = false;
  bool HalfRate64Ops = false;
  bool FullRate64Ops = false;
72
  // Dynamically set bits that enable features. Every flag starts out false.
  bool FlatForGlobal = false;
  bool AutoWaitcntBeforeBarrier = false;
  bool BackOffBarrier = false;
  bool UnalignedScratchAccess = false;
  // Combined with the per-memory-kind Unaligned*Access flags by the
  // hasUnaligned*AccessEnabled() queries.
  bool UnalignedAccessMode = false;
  bool HasApertureRegs = false;
  bool SupportsXNACK = false;
  bool KernargPreload = false;
82
  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for XNACK; query isXNACKEnabled() instead.
  bool EnableXNACK = false;
86
  // Mode flags read back through isTgSplitEnabled(), isCuModeEnabled(),
  // isTrapHandlerEnabled() and isPreciseMemoryEnabled() respectively.
  bool EnableTgSplit = false;
  bool EnableCuMode = false;
  bool TrapHandler = false;
  bool EnablePreciseMemory = false;
91
  // Used as options. Every option defaults to off.
  bool EnableLoadStoreOpt = false;
  bool EnableUnsafeDSOffsetFolding = false;
  bool EnableSIScheduler = false;
  // Only takes effect together with CIInsts (see useDS128()).
  bool EnableDS128 = false;
  bool EnablePRTStrictNull = false;
  bool DumpCode = false;
99
  // Static subtarget properties set by tablegen. Each flag defaults to false
  // and is surfaced through a same-named has*()/is*() query further down.
  bool FP64 = false;
  bool FMA = false;
  bool MIMG_R128 = false;
  bool CIInsts = false;
  bool GFX8Insts = false;
  bool GFX9Insts = false;
  bool GFX90AInsts = false;
  bool GFX940Insts = false;
  bool GFX10Insts = false;
  bool GFX11Insts = false;
  bool GFX12Insts = false;
  bool GFX10_3Insts = false;
  bool GFX7GFX8GFX9Insts = false;
  bool SGPRInitBug = false;
  bool UserSGPRInit16Bug = false;
  bool NegativeScratchOffsetBug = false;
  bool NegativeUnalignedScratchOffsetBug = false;
  bool HasSMemRealTime = false;
  bool HasIntClamp = false;
  bool HasFmaMixInsts = false;
  bool HasMovrel = false;
  bool HasVGPRIndexMode = false;
  bool HasScalarDwordx3Loads = false;
  bool HasScalarStores = false;
  bool HasScalarAtomics = false;
  bool HasSDWAOmod = false;
  bool HasSDWAScalar = false;
  bool HasSDWASdst = false;
  bool HasSDWAMac = false;
  bool HasSDWAOutModsVOPC = false;
  bool HasDPP = false;
  bool HasDPP8 = false;
  bool HasDPALU_DPP = false;
  bool HasDPPSrc1SGPR = false;
  bool HasPackedFP32Ops = false;
  bool HasImageInsts = false;
  bool HasExtendedImageInsts = false;
  bool HasR128A16 = false;
  bool HasA16 = false;
  bool HasG16 = false;
  bool HasNSAEncoding = false;
  bool HasPartialNSAEncoding = false;
  bool GFX10_AEncoding = false;
  bool GFX10_BEncoding = false;
  bool HasDLInsts = false;
  bool HasFmacF64Inst = false;
  bool HasDot1Insts = false;
  bool HasDot2Insts = false;
  bool HasDot3Insts = false;
  bool HasDot4Insts = false;
  bool HasDot5Insts = false;
  bool HasDot6Insts = false;
  bool HasDot7Insts = false;
  bool HasDot8Insts = false;
  bool HasDot9Insts = false;
  bool HasDot10Insts = false;
  bool HasDot11Insts = false;
  bool HasMAIInsts = false;
  bool HasFP8Insts = false;
  bool HasFP8ConversionInsts = false;
  bool HasPkFmacF16Inst = false;
  bool HasAtomicDsPkAdd16Insts = false;
  bool HasAtomicFlatPkAdd16Insts = false;
  bool HasAtomicFaddRtnInsts = false;
  bool HasAtomicFaddNoRtnInsts = false;
  bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
  bool HasAtomicBufferGlobalPkAddF16Insts = false;
  bool HasAtomicCSubNoRtnInsts = false;
  bool HasAtomicGlobalPkAddBF16Inst = false;
  bool HasFlatAtomicFaddF32Inst = false;
  bool HasDefaultComponentZero = false;
  bool HasDefaultComponentBroadcast = false;
  /// The maximum number of instructions that may be placed within an S_CLAUSE,
  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
  /// indicates a lack of S_CLAUSE support (this is what hasHardClauses()
  /// tests).
  unsigned MaxHardClauseLength = 0;
  bool SupportsSRAMECC = false;
178
  // This should not be used directly. 'TargetID' tracks the dynamic settings
  // for SRAMECC; d16PreservesUnusedBits() reads it from there.
  bool EnableSRAMECC = false;
182
  // Further boolean subtarget flags; every one defaults to false.
  bool HasNoSdstCMPX = false;
  bool HasVscnt = false;
  bool HasGetWaveIdInst = false;
  bool HasSMemTimeInst = false;
  bool HasShaderCyclesRegister = false;
  bool HasShaderCyclesHiLoRegisters = false;
  bool HasVOP3Literal = false;
  bool HasNoDataDepHazard = false;
  bool FlatAddressSpace = false;
  bool FlatInstOffsets = false;
  bool FlatGlobalInsts = false;
  bool FlatScratchInsts = false;
  bool ScalarFlatScratchInsts = false;
  bool HasArchitectedFlatScratch = false;
  // Only honoured when FlatScratchInsts is also set (see enableFlatScratch()).
  bool EnableFlatScratch = false;
  bool HasArchitectedSGPRs = false;
  bool HasGDS = false;
  bool HasGWS = false;
  bool AddNoCarryInsts = false;
  bool HasUnpackedD16VMem = false;
  // Masked by EnableCuMode in hasLDSMisalignedBug().
  bool LDSMisalignedBug = false;
  bool HasMFMAInlineLiteralBug = false;
  bool UnalignedBufferAccess = false;
  bool UnalignedDSAccess = false;
  bool HasPackedTID = false;
  // Read and written through get/setScalarizeGlobalBehavior().
  bool ScalarizeGlobal = false;
  bool HasSALUFloatInsts = false;
  bool HasVGPRSingleUseHintInsts = false;
  bool HasPseudoScalarTrans = false;
  bool HasRestrictedSOffset = false;
213
  // Another group of boolean subtarget flags, all defaulting to false.
  bool HasVcmpxPermlaneHazard = false;
  bool HasVMEMtoScalarWriteHazard = false;
  bool HasSMEMtoVectorWriteHazard = false;
  bool HasInstFwdPrefetchBug = false;
  bool HasVcmpxExecWARHazard = false;
  bool HasLdsBranchVmemWARHazard = false;
  bool HasNSAtoVMEMBug = false;
  bool HasNSAClauseBug = false;
  bool HasOffset3fBug = false;
  bool HasFlatSegmentOffsetBug = false;
  bool HasImageStoreD16Bug = false;
  bool HasImageGather4D16Bug = false;
  bool HasMSAALoadDstSelBug = false;
  bool HasPrivEnabledTrap2NopBug = false;
  bool Has1_5xVGPRs = false;
  bool HasMADIntraFwdBug = false;
  bool HasVOPDInsts = false;
  bool HasVALUTransUseHazard = false;
  bool HasForceStoreSC0SC1 = false;
233
  // Reported by requiresCodeObjectV6().
  bool RequiresCOV6 = false;
235
  // Dummy feature to use for assembler in tablegen. No accessor in this part
  // of the class reads it.
  bool FeatureDisable = false;
238
  // Handed out by getSelectionDAGInfo().
  SelectionDAGTargetInfo TSInfo;
private:
  // Handed out by getInstrInfo(); getRegisterInfo() also goes through it.
  SIInstrInfo InstrInfo;
  // Handed out by getTargetLowering().
  SITargetLowering TLInfo;
  // Handed out by getFrameLowering().
  SIFrameLowering FrameLowering;
244
245public:
  /// Constructor; declared here, defined out of line.
  /// \param TT  target triple.
  /// \param GPU GPU (processor) name.
  /// \param FS  feature string.
  /// \param TM  the owning GCN target machine.
  /// NOTE(review): parameter roles are read off their names and types - confirm
  /// against the definition.
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  /// Destructor override; declared here, defined out of line.
  ~GCNSubtarget() override;
249
  /// Declared here, defined out of line. Takes the same triple / GPU name /
  /// feature-string arguments as the constructor and returns a reference to a
  /// GCNSubtarget.
  /// NOTE(review): presumably returns *this so it can be used in the
  /// constructor's initializer list - confirm against the definition.
  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                   StringRef GPU, StringRef FS);
252
253 const SIInstrInfo *getInstrInfo() const override {
254 return &InstrInfo;
255 }
256
257 const SIFrameLowering *getFrameLowering() const override {
258 return &FrameLowering;
259 }
260
261 const SITargetLowering *getTargetLowering() const override {
262 return &TLInfo;
263 }
264
265 const SIRegisterInfo *getRegisterInfo() const override {
266 return &InstrInfo.getRegisterInfo();
267 }
268
269 const CallLowering *getCallLowering() const override {
270 return CallLoweringInfo.get();
271 }
272
273 const InlineAsmLowering *getInlineAsmLowering() const override {
274 return InlineAsmLoweringInfo.get();
275 }
276
277 InstructionSelector *getInstructionSelector() const override {
278 return InstSelector.get();
279 }
280
281 const LegalizerInfo *getLegalizerInfo() const override {
282 return Legalizer.get();
283 }
284
285 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
286 return RegBankInfo.get();
287 }
288
289 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
290 return TargetID;
291 }
292
293 // Nothing implemented, just prevent crashes on use.
294 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
295 return &TSInfo;
296 }
297
298 const InstrItineraryData *getInstrItineraryData() const override {
299 return &InstrItins;
300 }
301
  /// Declaration only; the body is provided elsewhere.
  /// NOTE(review): presumably the tablegen-generated feature parser that fills
  /// in the flags above from \p FS - confirm.
  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
303
304 Generation getGeneration() const {
305 return (Generation)Gen;
306 }
307
308 unsigned getMaxWaveScratchSize() const {
309 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
310 if (getGeneration() >= GFX12) {
311 // 18-bit field in units of 64-dword.
312 return (64 * 4) * ((1 << 18) - 1);
313 }
314 if (getGeneration() == GFX11) {
315 // 15-bit field in units of 64-dword.
316 return (64 * 4) * ((1 << 15) - 1);
317 }
318 // 13-bit field in units of 256-dword.
319 return (256 * 4) * ((1 << 13) - 1);
320 }
321
322 /// Return the number of high bits known to be zero for a frame index.
323 unsigned getKnownHighZeroBitsForFrameIndex() const {
324 return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
325 }
326
327 int getLDSBankCount() const {
328 return LDSBankCount;
329 }
330
331 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
332 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
333 }
334
  /// Declaration only; defined out of line. Takes an instruction opcode.
  /// NOTE(review): presumably the number of constant-bus operands \p Opcode may
  /// use - confirm against the definition.
  unsigned getConstantBusLimit(unsigned Opcode) const;
336
  /// \returns true if an instruction that produces a 16-bit result in a 32-bit
  /// register implicitly zeroes the high 16 bits, rather than preserving the
  /// original value. Declaration only; defined out of line.
  bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
341
342 bool supportsWGP() const { return getGeneration() >= GFX10; }
343
344 bool hasIntClamp() const {
345 return HasIntClamp;
346 }
347
348 bool hasFP64() const {
349 return FP64;
350 }
351
352 bool hasMIMG_R128() const {
353 return MIMG_R128;
354 }
355
356 bool hasHWFP64() const {
357 return FP64;
358 }
359
360 bool hasHalfRate64Ops() const {
361 return HalfRate64Ops;
362 }
363
364 bool hasFullRate64Ops() const {
365 return FullRate64Ops;
366 }
367
368 bool hasAddr64() const {
369 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
370 }
371
372 bool hasFlat() const {
373 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
374 }
375
376 // Return true if the target only has the reverse operand versions of VALU
377 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
378 bool hasOnlyRevVALUShifts() const {
379 return getGeneration() >= VOLCANIC_ISLANDS;
380 }
381
382 bool hasFractBug() const {
383 return getGeneration() == SOUTHERN_ISLANDS;
384 }
385
386 bool hasBFE() const {
387 return true;
388 }
389
390 bool hasBFI() const {
391 return true;
392 }
393
394 bool hasBFM() const {
395 return hasBFE();
396 }
397
398 bool hasBCNT(unsigned Size) const {
399 return true;
400 }
401
402 bool hasFFBL() const {
403 return true;
404 }
405
406 bool hasFFBH() const {
407 return true;
408 }
409
410 bool hasMed3_16() const {
411 return getGeneration() >= AMDGPUSubtarget::GFX9;
412 }
413
414 bool hasMin3Max3_16() const {
415 return getGeneration() >= AMDGPUSubtarget::GFX9;
416 }
417
418 bool hasFmaMixInsts() const {
419 return HasFmaMixInsts;
420 }
421
422 bool hasCARRY() const {
423 return true;
424 }
425
426 bool hasFMA() const {
427 return FMA;
428 }
429
430 bool hasSwap() const {
431 return GFX9Insts;
432 }
433
434 bool hasScalarPackInsts() const {
435 return GFX9Insts;
436 }
437
438 bool hasScalarMulHiInsts() const {
439 return GFX9Insts;
440 }
441
442 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
443
444 TrapHandlerAbi getTrapHandlerAbi() const {
445 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
446 }
447
448 bool supportsGetDoorbellID() const {
449 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
450 return getGeneration() >= GFX9;
451 }
452
453 /// True if the offset field of DS instructions works as expected. On SI, the
454 /// offset uses a 16-bit adder and does not always wrap properly.
455 bool hasUsableDSOffset() const {
456 return getGeneration() >= SEA_ISLANDS;
457 }
458
459 bool unsafeDSOffsetFoldingEnabled() const {
460 return EnableUnsafeDSOffsetFolding;
461 }
462
463 /// Condition output from div_scale is usable.
464 bool hasUsableDivScaleConditionOutput() const {
465 return getGeneration() != SOUTHERN_ISLANDS;
466 }
467
468 /// Extra wait hazard is needed in some cases before
469 /// s_cbranch_vccnz/s_cbranch_vccz.
470 bool hasReadVCCZBug() const {
471 return getGeneration() <= SEA_ISLANDS;
472 }
473
474 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
475 bool partialVCCWritesUpdateVCCZ() const {
476 return getGeneration() >= GFX10;
477 }
478
479 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
480 /// was written by a VALU instruction.
481 bool hasSMRDReadVALUDefHazard() const {
482 return getGeneration() == SOUTHERN_ISLANDS;
483 }
484
485 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
486 /// SGPR was written by a VALU Instruction.
487 bool hasVMEMReadSGPRVALUDefHazard() const {
488 return getGeneration() >= VOLCANIC_ISLANDS;
489 }
490
491 bool hasRFEHazards() const {
492 return getGeneration() >= VOLCANIC_ISLANDS;
493 }
494
495 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
496 unsigned getSetRegWaitStates() const {
497 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
498 }
499
500 bool dumpCode() const {
501 return DumpCode;
502 }
503
  /// Return the amount of LDS that can be used without restricting the
  /// occupancy to fewer than \p WaveCount waves. Declaration only; defined out
  /// of line.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;
508
509 bool supportsMinMaxDenormModes() const {
510 return getGeneration() >= AMDGPUSubtarget::GFX9;
511 }
512
513 /// \returns If target supports S_DENORM_MODE.
514 bool hasDenormModeInst() const {
515 return getGeneration() >= AMDGPUSubtarget::GFX10;
516 }
517
518 bool useFlatForGlobal() const {
519 return FlatForGlobal;
520 }
521
522 /// \returns If target supports ds_read/write_b128 and user enables generation
523 /// of ds_read/write_b128.
524 bool useDS128() const {
525 return CIInsts && EnableDS128;
526 }
527
528 /// \return If target supports ds_read/write_b96/128.
529 bool hasDS96AndDS128() const {
530 return CIInsts;
531 }
532
533 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
534 bool haveRoundOpsF64() const {
535 return CIInsts;
536 }
537
538 /// \returns If MUBUF instructions always perform range checking, even for
539 /// buffer resources used for private memory access.
540 bool privateMemoryResourceIsRangeChecked() const {
541 return getGeneration() < AMDGPUSubtarget::GFX9;
542 }
543
544 /// \returns If target requires PRT Struct NULL support (zero result registers
545 /// for sparse texture support).
546 bool usePRTStrictNull() const {
547 return EnablePRTStrictNull;
548 }
549
550 bool hasAutoWaitcntBeforeBarrier() const {
551 return AutoWaitcntBeforeBarrier;
552 }
553
554 /// \returns true if the target supports backing off of s_barrier instructions
555 /// when an exception is raised.
556 bool supportsBackOffBarrier() const {
557 return BackOffBarrier;
558 }
559
560 bool hasUnalignedBufferAccess() const {
561 return UnalignedBufferAccess;
562 }
563
564 bool hasUnalignedBufferAccessEnabled() const {
565 return UnalignedBufferAccess && UnalignedAccessMode;
566 }
567
568 bool hasUnalignedDSAccess() const {
569 return UnalignedDSAccess;
570 }
571
572 bool hasUnalignedDSAccessEnabled() const {
573 return UnalignedDSAccess && UnalignedAccessMode;
574 }
575
576 bool hasUnalignedScratchAccess() const {
577 return UnalignedScratchAccess;
578 }
579
580 bool hasUnalignedAccessMode() const {
581 return UnalignedAccessMode;
582 }
583
584 bool hasApertureRegs() const {
585 return HasApertureRegs;
586 }
587
588 bool isTrapHandlerEnabled() const {
589 return TrapHandler;
590 }
591
592 bool isXNACKEnabled() const {
593 return TargetID.isXnackOnOrAny();
594 }
595
596 bool isTgSplitEnabled() const {
597 return EnableTgSplit;
598 }
599
600 bool isCuModeEnabled() const {
601 return EnableCuMode;
602 }
603
604 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
605
606 bool hasFlatAddressSpace() const {
607 return FlatAddressSpace;
608 }
609
610 bool hasFlatScrRegister() const {
611 return hasFlatAddressSpace();
612 }
613
614 bool hasFlatInstOffsets() const {
615 return FlatInstOffsets;
616 }
617
618 bool hasFlatGlobalInsts() const {
619 return FlatGlobalInsts;
620 }
621
622 bool hasFlatScratchInsts() const {
623 return FlatScratchInsts;
624 }
625
626 // Check if target supports ST addressing mode with FLAT scratch instructions.
627 // The ST addressing mode means no registers are used, either VGPR or SGPR,
628 // but only immediate offset is swizzled and added to the FLAT scratch base.
629 bool hasFlatScratchSTMode() const {
630 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
631 }
632
633 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; }
634
635 bool hasScalarFlatScratchInsts() const {
636 return ScalarFlatScratchInsts;
637 }
638
639 bool enableFlatScratch() const {
640 return flatScratchIsArchitected() ||
641 (EnableFlatScratch && hasFlatScratchInsts());
642 }
643
644 bool hasGlobalAddTidInsts() const {
645 return GFX10_BEncoding;
646 }
647
648 bool hasAtomicCSub() const {
649 return GFX10_BEncoding;
650 }
651
652 // BUFFER/FLAT/GLOBAL_ATOMIC_ADD/MIN/MAX_F64
653 bool hasBufferFlatGlobalAtomicsF64() const { return hasGFX90AInsts(); }
654
655 bool hasExportInsts() const {
656 return !hasGFX940Insts();
657 }
658
659 bool hasVINTERPEncoding() const {
660 return GFX11Insts;
661 }
662
663 // DS_ADD_F64/DS_ADD_RTN_F64
664 bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
665
666 bool hasMultiDwordFlatScratchAddressing() const {
667 return getGeneration() >= GFX9;
668 }
669
670 bool hasFlatSegmentOffsetBug() const {
671 return HasFlatSegmentOffsetBug;
672 }
673
674 bool hasFlatLgkmVMemCountInOrder() const {
675 return getGeneration() > GFX9;
676 }
677
678 bool hasD16LoadStore() const {
679 return getGeneration() >= GFX9;
680 }
681
682 bool d16PreservesUnusedBits() const {
683 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
684 }
685
686 bool hasD16Images() const {
687 return getGeneration() >= VOLCANIC_ISLANDS;
688 }
689
690 /// Return if most LDS instructions have an m0 use that require m0 to be
691 /// initialized.
692 bool ldsRequiresM0Init() const {
693 return getGeneration() < GFX9;
694 }
695
696 // True if the hardware rewinds and replays GWS operations if a wave is
697 // preempted.
698 //
699 // If this is false, a GWS operation requires testing if a nack set the
700 // MEM_VIOL bit, and repeating if so.
701 bool hasGWSAutoReplay() const {
702 return getGeneration() >= GFX9;
703 }
704
705 /// \returns if target has ds_gws_sema_release_all instruction.
706 bool hasGWSSemaReleaseAll() const {
707 return CIInsts;
708 }
709
710 /// \returns true if the target has integer add/sub instructions that do not
711 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
712 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
713 /// for saturation.
714 bool hasAddNoCarry() const {
715 return AddNoCarryInsts;
716 }
717
718 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
719
720 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
721
722 bool hasUnpackedD16VMem() const {
723 return HasUnpackedD16VMem;
724 }
725
726 // Covers VS/PS/CS graphics shaders
727 bool isMesaGfxShader(const Function &F) const {
728 return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
729 }
730
731 bool hasMad64_32() const {
732 return getGeneration() >= SEA_ISLANDS;
733 }
734
735 bool hasSDWAOmod() const {
736 return HasSDWAOmod;
737 }
738
739 bool hasSDWAScalar() const {
740 return HasSDWAScalar;
741 }
742
743 bool hasSDWASdst() const {
744 return HasSDWASdst;
745 }
746
747 bool hasSDWAMac() const {
748 return HasSDWAMac;
749 }
750
751 bool hasSDWAOutModsVOPC() const {
752 return HasSDWAOutModsVOPC;
753 }
754
755 bool hasDLInsts() const {
756 return HasDLInsts;
757 }
758
759 bool hasFmacF64Inst() const { return HasFmacF64Inst; }
760
761 bool hasDot1Insts() const {
762 return HasDot1Insts;
763 }
764
765 bool hasDot2Insts() const {
766 return HasDot2Insts;
767 }
768
769 bool hasDot3Insts() const {
770 return HasDot3Insts;
771 }
772
773 bool hasDot4Insts() const {
774 return HasDot4Insts;
775 }
776
777 bool hasDot5Insts() const {
778 return HasDot5Insts;
779 }
780
781 bool hasDot6Insts() const {
782 return HasDot6Insts;
783 }
784
785 bool hasDot7Insts() const {
786 return HasDot7Insts;
787 }
788
789 bool hasDot8Insts() const {
790 return HasDot8Insts;
791 }
792
793 bool hasDot9Insts() const {
794 return HasDot9Insts;
795 }
796
797 bool hasDot10Insts() const {
798 return HasDot10Insts;
799 }
800
801 bool hasDot11Insts() const {
802 return HasDot11Insts;
803 }
804
805 bool hasMAIInsts() const {
806 return HasMAIInsts;
807 }
808
809 bool hasFP8Insts() const {
810 return HasFP8Insts;
811 }
812
813 bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
814
815 bool hasPkFmacF16Inst() const {
816 return HasPkFmacF16Inst;
817 }
818
819 bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
820
821 bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
822
823 bool hasAtomicFaddInsts() const {
824 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
825 }
826
827 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
828
829 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
830
831 bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
832 return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
833 }
834
835 bool hasAtomicBufferGlobalPkAddF16Insts() const {
836 return HasAtomicBufferGlobalPkAddF16Insts;
837 }
838
839 bool hasAtomicGlobalPkAddBF16Inst() const {
840 return HasAtomicGlobalPkAddBF16Inst;
841 }
842
843 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
844
845 bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
846
847 bool hasDefaultComponentBroadcast() const {
848 return HasDefaultComponentBroadcast;
849 }
850
851 bool hasNoSdstCMPX() const {
852 return HasNoSdstCMPX;
853 }
854
855 bool hasVscnt() const {
856 return HasVscnt;
857 }
858
859 bool hasGetWaveIdInst() const {
860 return HasGetWaveIdInst;
861 }
862
863 bool hasSMemTimeInst() const {
864 return HasSMemTimeInst;
865 }
866
867 bool hasShaderCyclesRegister() const {
868 return HasShaderCyclesRegister;
869 }
870
871 bool hasShaderCyclesHiLoRegisters() const {
872 return HasShaderCyclesHiLoRegisters;
873 }
874
875 bool hasVOP3Literal() const {
876 return HasVOP3Literal;
877 }
878
879 bool hasNoDataDepHazard() const {
880 return HasNoDataDepHazard;
881 }
882
883 bool vmemWriteNeedsExpWaitcnt() const {
884 return getGeneration() < SEA_ISLANDS;
885 }
886
887 bool hasInstPrefetch() const {
888 return getGeneration() == GFX10 || getGeneration() == GFX11;
889 }
890
891 bool hasPrefetch() const { return GFX12Insts; }
892
893 // Has s_cmpk_* instructions.
894 bool hasSCmpK() const { return getGeneration() < GFX12; }
895
896 // Scratch is allocated in 256 dword per wave blocks for the entire
897 // wavefront. When viewed from the perspective of an arbitrary workitem, this
898 // is 4-byte aligned.
899 //
900 // Only 4-byte alignment is really needed to access anything. Transformations
901 // on the pointer value itself may rely on the alignment / known low bits of
902 // the pointer. Set this to something above the minimum to avoid needing
903 // dynamic realignment in common cases.
904 Align getStackAlignment() const { return Align(16); }
905
906 bool enableMachineScheduler() const override {
907 return true;
908 }
909
  /// Override declared here; the body is defined out of line.
  bool useAA() const override;
911
912 bool enableSubRegLiveness() const override {
913 return true;
914 }
915
916 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
917 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
918
  // Static wrappers.
  // Takes a generic TargetSubtargetInfo; declaration only, defined out of line.
  // NOTE(review): presumably forwards to the member hasHalfRate64Ops() after
  // casting \p STI to GCNSubtarget - confirm against the definition.
  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
921
922 // XXX - Why is this here if it isn't in the default pass set?
923 bool enableEarlyIfConversion() const override {
924 return true;
925 }
926
  /// Override declared here; the body is defined out of line. Receives the
  /// scheduling policy by mutable reference together with the number of
  /// instructions in the region.
  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           unsigned NumRegionInstrs) const override;
929
  /// Override declared here; the body is defined out of line.
  void mirFileLoaded(MachineFunction &MF) const override;
931
932 unsigned getMaxNumUserSGPRs() const {
933 return AMDGPU::getMaxNumUserSGPRs(*this);
934 }
935
936 bool hasSMemRealTime() const {
937 return HasSMemRealTime;
938 }
939
940 bool hasMovrel() const {
941 return HasMovrel;
942 }
943
944 bool hasVGPRIndexMode() const {
945 return HasVGPRIndexMode;
946 }
947
  /// Declaration only; the body is defined out of line.
  bool useVGPRIndexMode() const;
949
950 bool hasScalarCompareEq64() const {
951 return getGeneration() >= VOLCANIC_ISLANDS;
952 }
953
954 bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
955
956 bool hasScalarStores() const {
957 return HasScalarStores;
958 }
959
960 bool hasScalarAtomics() const {
961 return HasScalarAtomics;
962 }
963
964 bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
965 bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
966
967 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
968 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
969
970 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
971 bool hasPermLane64() const { return getGeneration() >= GFX11; }
972
973 bool hasDPP() const {
974 return HasDPP;
975 }
976
977 bool hasDPPBroadcasts() const {
978 return HasDPP && getGeneration() < GFX10;
979 }
980
981 bool hasDPPWavefrontShifts() const {
982 return HasDPP && getGeneration() < GFX10;
983 }
984
985 bool hasDPP8() const {
986 return HasDPP8;
987 }
988
989 bool hasDPALU_DPP() const {
990 return HasDPALU_DPP;
991 }
992
993 bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
994
995 bool hasPackedFP32Ops() const {
996 return HasPackedFP32Ops;
997 }
998
999 // Has V_PK_MOV_B32 opcode
1000 bool hasPkMovB32() const {
1001 return GFX90AInsts;
1002 }
1003
1004 bool hasFmaakFmamkF32Insts() const {
1005 return getGeneration() >= GFX10 || hasGFX940Insts();
1006 }
1007
1008 bool hasImageInsts() const {
1009 return HasImageInsts;
1010 }
1011
1012 bool hasExtendedImageInsts() const {
1013 return HasExtendedImageInsts;
1014 }
1015
1016 bool hasR128A16() const {
1017 return HasR128A16;
1018 }
1019
1020 bool hasA16() const { return HasA16; }
1021
1022 bool hasG16() const { return HasG16; }
1023
1024 bool hasOffset3fBug() const {
1025 return HasOffset3fBug;
1026 }
1027
1028 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1029
1030 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1031
1032 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1033
1034 bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1035
1036 bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1037
1038 bool hasNSAEncoding() const { return HasNSAEncoding; }
1039
1040 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1041
1042 bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1043
1044 unsigned getNSAMaxSize(bool HasSampler = false) const {
1045 return AMDGPU::getNSAMaxSize(*this, HasSampler);
1046 }
1047
1048 bool hasGFX10_AEncoding() const {
1049 return GFX10_AEncoding;
1050 }
1051
1052 bool hasGFX10_BEncoding() const {
1053 return GFX10_BEncoding;
1054 }
1055
1056 bool hasGFX10_3Insts() const {
1057 return GFX10_3Insts;
1058 }
1059
  /// Declaration only; the body is defined out of line.
  bool hasMadF16() const;
1061
1062 bool hasMovB64() const { return GFX940Insts; }
1063
1064 bool hasLshlAddB64() const { return GFX940Insts; }
1065
1066 bool enableSIScheduler() const {
1067 return EnableSIScheduler;
1068 }
1069
1070 bool loadStoreOptEnabled() const {
1071 return EnableLoadStoreOpt;
1072 }
1073
1074 bool hasSGPRInitBug() const {
1075 return SGPRInitBug;
1076 }
1077
1078 bool hasUserSGPRInit16Bug() const {
1079 return UserSGPRInit16Bug && isWave32();
1080 }
1081
1082 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1083
1084 bool hasNegativeUnalignedScratchOffsetBug() const {
1085 return NegativeUnalignedScratchOffsetBug;
1086 }
1087
1088 bool hasMFMAInlineLiteralBug() const {
1089 return HasMFMAInlineLiteralBug;
1090 }
1091
1092 bool has12DWordStoreHazard() const {
1093 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1094 }
1095
1096 // \returns true if the subtarget supports DWORDX3 load/store instructions.
1097 bool hasDwordx3LoadStores() const {
1098 return CIInsts;
1099 }
1100
1101 bool hasReadM0MovRelInterpHazard() const {
1102 return getGeneration() == AMDGPUSubtarget::GFX9;
1103 }
1104
1105 bool hasReadM0SendMsgHazard() const {
1106 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1107 getGeneration() <= AMDGPUSubtarget::GFX9;
1108 }
1109
1110 bool hasReadM0LdsDmaHazard() const {
1111 return getGeneration() == AMDGPUSubtarget::GFX9;
1112 }
1113
1114 bool hasReadM0LdsDirectHazard() const {
1115 return getGeneration() == AMDGPUSubtarget::GFX9;
1116 }
1117
1118 bool hasVcmpxPermlaneHazard() const {
1119 return HasVcmpxPermlaneHazard;
1120 }
1121
1122 bool hasVMEMtoScalarWriteHazard() const {
1123 return HasVMEMtoScalarWriteHazard;
1124 }
1125
1126 bool hasSMEMtoVectorWriteHazard() const {
1127 return HasSMEMtoVectorWriteHazard;
1128 }
1129
1130 bool hasLDSMisalignedBug() const {
1131 return LDSMisalignedBug && !EnableCuMode;
1132 }
1133
1134 bool hasInstFwdPrefetchBug() const {
1135 return HasInstFwdPrefetchBug;
1136 }
1137
1138 bool hasVcmpxExecWARHazard() const {
1139 return HasVcmpxExecWARHazard;
1140 }
1141
1142 bool hasLdsBranchVmemWARHazard() const {
1143 return HasLdsBranchVmemWARHazard;
1144 }
1145
1146 // Shift amount of a 64 bit shift cannot be a highest allocated register
1147 // if also at the end of the allocation block.
1148 bool hasShift64HighRegBug() const {
1149 return GFX90AInsts && !GFX940Insts;
1150 }
1151
1152 // Has one cycle hazard on transcendental instruction feeding a
1153 // non transcendental VALU.
1154 bool hasTransForwardingHazard() const { return GFX940Insts; }
1155
1156 // Has one cycle hazard on a VALU instruction partially writing dst with
1157 // a shift of result bits feeding another VALU instruction.
1158 bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1159
1160 // Cannot use op_sel with v_dot instructions.
1161 bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }
1162
1163 // Does not have HW interlocs for VALU writing and then reading SGPRs.
1164 bool hasVDecCoExecHazard() const {
1165 return GFX940Insts;
1166 }
1167
1168 bool hasNSAtoVMEMBug() const {
1169 return HasNSAtoVMEMBug;
1170 }
1171
1172 bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1173
1174 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
1175
1176 bool hasGFX90AInsts() const { return GFX90AInsts; }
1177
1178 bool hasFPAtomicToDenormModeHazard() const {
1179 return getGeneration() == GFX10;
1180 }
1181
1182 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1183
1184 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1185
1186 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1187
1188 bool hasVALUPartialForwardingHazard() const {
1189 return getGeneration() == GFX11;
1190 }
1191
1192 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1193
1194 bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1195
1196 bool requiresCodeObjectV6() const { return RequiresCOV6; }
1197
1198 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1199
1200 /// Return if operations acting on VGPR tuples require even alignment.
1201 bool needsAlignedVGPRs() const { return GFX90AInsts; }
1202
1203 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1204 bool hasSPackHL() const { return GFX11Insts; }
1205
1206 /// Return true if the target's EXP instruction has the COMPR flag, which
1207 /// affects the meaning of the EN (enable) bits.
1208 bool hasCompressedExport() const { return !GFX11Insts; }
1209
1210 /// Return true if the target's EXP instruction supports the NULL export
1211 /// target.
1212 bool hasNullExportTarget() const { return !GFX11Insts; }
1213
1214 bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1215
1216 bool hasVOPDInsts() const { return HasVOPDInsts; }
1217
1218 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1219
1220 /// Return true if the target has the S_DELAY_ALU instruction.
1221 bool hasDelayAlu() const { return GFX11Insts; }
1222
1223 bool hasPackedTID() const { return HasPackedTID; }
1224
1225 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1226 // hasGFX90AInsts is also true.
1227 bool hasGFX940Insts() const { return GFX940Insts; }
1228
1229 bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1230
1231 bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1232
1233 bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1234
1235 bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1236
1237 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1238 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1239 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1240
1241 /// \returns The maximum number of instructions that can be enclosed in an
1242 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1243 /// instruction.
1244 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1245
1246 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1247 /// SGPRs
1248 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1249
1250 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1251 /// VGPRs
1252 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1253
1254 /// Return occupancy for the given function. Used LDS and a number of
1255 /// registers if provided.
1256 /// Note, occupancy can be affected by the scratch allocation as well, but
1257 /// we do not have enough information to compute it.
1258 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1259 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1260
1261 /// \returns true if the flat_scratch register should be initialized with the
1262 /// pointer to the wave's scratch memory rather than a size and offset.
1263 bool flatScratchIsPointer() const {
1264 return getGeneration() >= AMDGPUSubtarget::GFX9;
1265 }
1266
1267 /// \returns true if the flat_scratch register is initialized by the HW.
1268 /// In this case it is readonly.
1269 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1270
1271 /// \returns true if the architected SGPRs are enabled.
1272 bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1273
1274 /// \returns true if Global Data Share is supported.
1275 bool hasGDS() const { return HasGDS; }
1276
1277 /// \returns true if Global Wave Sync is supported.
1278 bool hasGWS() const { return HasGWS; }
1279
1280 /// \returns true if the machine has merged shaders in which s0-s7 are
1281 /// reserved by the hardware and user SGPRs start at s8
1282 bool hasMergedShaders() const {
1283 return getGeneration() >= GFX9;
1284 }
1285
1286 // \returns true if the target supports the pre-NGG legacy geometry path.
1287 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1288
1289 // \returns true if preloading kernel arguments is supported.
1290 bool hasKernargPreload() const { return KernargPreload; }
1291
1292 // \returns true if the target has split barriers feature
1293 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1294
1295 // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1296 bool hasCvtFP8VOP1Bug() const { return true; }
1297
1298 // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1299 // no-return form.
1300 bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1301
1302 // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1303 bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1304
1305 // \returns true if the target has IEEE kernel descriptor mode bit
1306 bool hasIEEEMode() const { return getGeneration() < GFX12; }
1307
1308 // \returns true if the target has IEEE fminimum/fmaximum instructions
1309 bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1310
1311 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1312 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1313
1314 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1315 /// values.
1316 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1317
1318 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1319 // of sign-extending.
1320 bool hasGetPCZeroExtension() const { return GFX12Insts; }
1321
1322 /// \returns SGPR allocation granularity supported by the subtarget.
1323 unsigned getSGPRAllocGranule() const {
1324 return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1325 }
1326
1327 /// \returns SGPR encoding granularity supported by the subtarget.
1328 unsigned getSGPREncodingGranule() const {
1329 return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1330 }
1331
1332 /// \returns Total number of SGPRs supported by the subtarget.
1333 unsigned getTotalNumSGPRs() const {
1334 return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1335 }
1336
1337 /// \returns Addressable number of SGPRs supported by the subtarget.
1338 unsigned getAddressableNumSGPRs() const {
1339 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1340 }
1341
1342 /// \returns Minimum number of SGPRs that meets the given number of waves per
1343 /// execution unit requirement supported by the subtarget.
1344 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1345 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1346 }
1347
1348 /// \returns Maximum number of SGPRs that meets the given number of waves per
1349 /// execution unit requirement supported by the subtarget.
1350 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1351 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1352 }
1353
1354 /// \returns Reserved number of SGPRs. This is common
1355 /// utility function called by MachineFunction and
1356 /// Function variants of getReservedNumSGPRs.
1357 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1358 /// \returns Reserved number of SGPRs for given machine function \p MF.
1359 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1360
1361 /// \returns Reserved number of SGPRs for given function \p F.
1362 unsigned getReservedNumSGPRs(const Function &F) const;
1363
1364 /// \returns max num SGPRs. This is the common utility
1365 /// function called by MachineFunction and Function
1366 /// variants of getMaxNumSGPRs.
1367 unsigned getBaseMaxNumSGPRs(const Function &F,
1368 std::pair<unsigned, unsigned> WavesPerEU,
1369 unsigned PreloadedSGPRs,
1370 unsigned ReservedNumSGPRs) const;
1371
1372 /// \returns Maximum number of SGPRs that meets number of waves per execution
1373 /// unit requirement for function \p MF, or number of SGPRs explicitly
1374 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1375 ///
1376 /// \returns Value that meets number of waves per execution unit requirement
1377 /// if explicitly requested value cannot be converted to integer, violates
1378 /// subtarget's specifications, or does not meet number of waves per execution
1379 /// unit requirement.
1380 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1381
1382 /// \returns Maximum number of SGPRs that meets number of waves per execution
1383 /// unit requirement for function \p F, or number of SGPRs explicitly
1384 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1385 ///
1386 /// \returns Value that meets number of waves per execution unit requirement
1387 /// if explicitly requested value cannot be converted to integer, violates
1388 /// subtarget's specifications, or does not meet number of waves per execution
1389 /// unit requirement.
1390 unsigned getMaxNumSGPRs(const Function &F) const;
1391
1392 /// \returns VGPR allocation granularity supported by the subtarget.
1393 unsigned getVGPRAllocGranule() const {
1394 return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1395 }
1396
1397 /// \returns VGPR encoding granularity supported by the subtarget.
1398 unsigned getVGPREncodingGranule() const {
1399 return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1400 }
1401
1402 /// \returns Total number of VGPRs supported by the subtarget.
1403 unsigned getTotalNumVGPRs() const {
1404 return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1405 }
1406
1407 /// \returns Addressable number of architectural VGPRs supported by the
1408 /// subtarget.
1409 unsigned getAddressableNumArchVGPRs() const {
1410 return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1411 }
1412
1413 /// \returns Addressable number of VGPRs supported by the subtarget.
1414 unsigned getAddressableNumVGPRs() const {
1415 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1416 }
1417
1418 /// \returns the minimum number of VGPRs that will prevent achieving more than
1419 /// the specified number of waves \p WavesPerEU.
1420 unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1421 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1422 }
1423
1424 /// \returns the maximum number of VGPRs that can be used and still achieved
1425 /// at least the specified number of waves \p WavesPerEU.
1426 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1427 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1428 }
1429
1430 /// \returns max num VGPRs. This is the common utility function
1431 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
1432 unsigned getBaseMaxNumVGPRs(const Function &F,
1433 std::pair<unsigned, unsigned> WavesPerEU) const;
1434 /// \returns Maximum number of VGPRs that meets number of waves per execution
1435 /// unit requirement for function \p F, or number of VGPRs explicitly
1436 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1437 ///
1438 /// \returns Value that meets number of waves per execution unit requirement
1439 /// if explicitly requested value cannot be converted to integer, violates
1440 /// subtarget's specifications, or does not meet number of waves per execution
1441 /// unit requirement.
1442 unsigned getMaxNumVGPRs(const Function &F) const;
1443
1444 unsigned getMaxNumAGPRs(const Function &F) const {
1445 return getMaxNumVGPRs(F);
1446 }
1447
1448 /// \returns Maximum number of VGPRs that meets number of waves per execution
1449 /// unit requirement for function \p MF, or number of VGPRs explicitly
1450 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1451 ///
1452 /// \returns Value that meets number of waves per execution unit requirement
1453 /// if explicitly requested value cannot be converted to integer, violates
1454 /// subtarget's specifications, or does not meet number of waves per execution
1455 /// unit requirement.
1456 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1457
1458 void getPostRAMutations(
1459 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1460 const override;
1461
1462 std::unique_ptr<ScheduleDAGMutation>
1463 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const;
1464
1465 bool isWave32() const {
1466 return getWavefrontSize() == 32;
1467 }
1468
1469 bool isWave64() const {
1470 return getWavefrontSize() == 64;
1471 }
1472
1473 const TargetRegisterClass *getBoolRC() const {
1474 return getRegisterInfo()->getBoolRC();
1475 }
1476
1477 /// \returns Maximum number of work groups per compute unit supported by the
1478 /// subtarget and limited by given \p FlatWorkGroupSize.
1479 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1480 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1481 }
1482
1483 /// \returns Minimum flat work group size supported by the subtarget.
1484 unsigned getMinFlatWorkGroupSize() const override {
1485 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1486 }
1487
1488 /// \returns Maximum flat work group size supported by the subtarget.
1489 unsigned getMaxFlatWorkGroupSize() const override {
1490 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1491 }
1492
1493 /// \returns Number of waves per execution unit required to support the given
1494 /// \p FlatWorkGroupSize.
1495 unsigned
1496 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1497 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1498 }
1499
1500 /// \returns Minimum number of waves per execution unit supported by the
1501 /// subtarget.
1502 unsigned getMinWavesPerEU() const override {
1503 return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1504 }
1505
1506 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1507 SDep &Dep,
1508 const TargetSchedModel *SchedModel) const override;
1509
1510 // \returns true if it's beneficial on this subtarget for the scheduler to
1511 // cluster stores as well as loads.
1512 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1513
1514 // \returns the number of address arguments from which to enable MIMG NSA
1515 // on supported architectures.
1516 unsigned getNSAThreshold(const MachineFunction &MF) const;
1517
1518 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
1519 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1520 bool requiresNopBeforeDeallocVGPRs() const {
1521 // Currently all targets that support the dealloc VGPRs message also require
1522 // the nop.
1523 return true;
1524 }
1525};
1526
1527class GCNUserSGPRUsageInfo {
1528public:
1529 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1530
1531 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1532
1533 bool hasDispatchPtr() const { return DispatchPtr; }
1534
1535 bool hasQueuePtr() const { return QueuePtr; }
1536
1537 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1538
1539 bool hasDispatchID() const { return DispatchID; }
1540
1541 bool hasFlatScratchInit() const { return FlatScratchInit; }
1542
1543 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1544
1545 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1546
1547 unsigned getNumFreeUserSGPRs();
1548
1549 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1550
1551 enum UserSGPRID : unsigned {
1552 ImplicitBufferPtrID = 0,
1553 PrivateSegmentBufferID = 1,
1554 DispatchPtrID = 2,
1555 QueuePtrID = 3,
1556 KernargSegmentPtrID = 4,
1557 DispatchIdID = 5,
1558 FlatScratchInitID = 6,
1559 PrivateSegmentSizeID = 7
1560 };
1561
1562 // Returns the size in number of SGPRs for preload user SGPR field.
1563 static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1564 switch (ID) {
1565 case ImplicitBufferPtrID:
1566 return 2;
1567 case PrivateSegmentBufferID:
1568 return 4;
1569 case DispatchPtrID:
1570 return 2;
1571 case QueuePtrID:
1572 return 2;
1573 case KernargSegmentPtrID:
1574 return 2;
1575 case DispatchIdID:
1576 return 2;
1577 case FlatScratchInitID:
1578 return 2;
1579 case PrivateSegmentSizeID:
1580 return 1;
1581 }
1582 llvm_unreachable("Unknown UserSGPRID.");
1583 }
1584
1585 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1586
1587private:
1588 const GCNSubtarget &ST;
1589
1590 // Private memory buffer
1591 // Compute directly in sgpr[0:1]
1592 // Other shaders indirect 64-bits at sgpr[0:1]
1593 bool ImplicitBufferPtr = false;
1594
1595 bool PrivateSegmentBuffer = false;
1596
1597 bool DispatchPtr = false;
1598
1599 bool QueuePtr = false;
1600
1601 bool KernargSegmentPtr = false;
1602
1603 bool DispatchID = false;
1604
1605 bool FlatScratchInit = false;
1606
1607 unsigned NumKernargPreloadSGPRs = 0;
1608
1609 unsigned NumUsedUserSGPRs = 0;
1610};
1611
1612} // end namespace llvm
1613
1614#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1615

// Source code of llvm/lib/Target/AMDGPU/GCNSubtarget.h