GCNSubtarget.h source code [llvm/lib/Target/AMDGPU/GCNSubtarget.h]

//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//==-----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// AMD GCN specific subclass of TargetSubtarget.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15	#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17	#include "AMDGPUCallLowering.h"
18	#include "AMDGPURegisterBankInfo.h"
19	#include "AMDGPUSubtarget.h"
20	#include "SIFrameLowering.h"
21	#include "SIISelLowering.h"
22	#include "SIInstrInfo.h"
23	#include "Utils/AMDGPUBaseInfo.h"
24	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
25	#include "llvm/Support/ErrorHandling.h"
26
27	#define GET_SUBTARGETINFO_HEADER
28	#include "AMDGPUGenSubtargetInfo.inc"
29
30	namespace llvm {
31
32	class GCNTargetMachine;
33
34	class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
35	public AMDGPUSubtarget {
36	public:
37	using AMDGPUSubtarget::getMaxWavesPerEU;
38
39	// Following 2 enums are documented at:
40	// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI selector. getTrapHandlerAbi() yields AMDHSA when
/// isAmdHsaOS() holds and NONE otherwise.
enum class TrapHandlerAbi {
  NONE = 0x00,
  AMDHSA = 0x01,
};
45
/// Trap identifiers; the numeric values are fixed by the trap handler ABI
/// documentation referenced above.
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
50
51	private:
52	/// GlobalISel related APIs.
53	std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
54	std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
55	std::unique_ptr<InstructionSelector> InstSelector;
56	std::unique_ptr<LegalizerInfo> Legalizer;
57	std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
58
59	protected:
60	// Basic subtarget description.
61	Triple TargetTriple;
62	AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
63	unsigned Gen = INVALID;
64	InstrItineraryData InstrItins;
65	int LDSBankCount = `0`;
66	unsigned MaxPrivateElementSize = `0`;
67
68	// Possibly statically set by tablegen, but may want to be overridden.
69	bool FastDenormalF32 = false;
70	bool HalfRate64Ops = false;
71	bool FullRate64Ops = false;
72
73	// Dynamically set bits that enable features.
74	bool FlatForGlobal = false;
75	bool AutoWaitcntBeforeBarrier = false;
76	bool BackOffBarrier = false;
77	bool UnalignedScratchAccess = false;
78	bool UnalignedAccessMode = false;
79	bool HasApertureRegs = false;
80	bool SupportsXNACK = false;
81	bool KernargPreload = false;
82
83	// This should not be used directly. 'TargetID' tracks the dynamic settings
84	// for XNACK.
85	bool EnableXNACK = false;
86
87	bool EnableTgSplit = false;
88	bool EnableCuMode = false;
89	bool TrapHandler = false;
90	bool EnablePreciseMemory = false;
91
92	// Used as options.
93	bool EnableLoadStoreOpt = false;
94	bool EnableUnsafeDSOffsetFolding = false;
95	bool EnableSIScheduler = false;
96	bool EnableDS128 = false;
97	bool EnablePRTStrictNull = false;
98	bool DumpCode = false;
99
100	// Subtarget statically properties set by tablegen
101	bool FP64 = false;
102	bool FMA = false;
103	bool MIMG_R128 = false;
104	bool CIInsts = false;
105	bool GFX8Insts = false;
106	bool GFX9Insts = false;
107	bool GFX90AInsts = false;
108	bool GFX940Insts = false;
109	bool GFX10Insts = false;
110	bool GFX11Insts = false;
111	bool GFX12Insts = false;
112	bool GFX10_3Insts = false;
113	bool GFX7GFX8GFX9Insts = false;
114	bool SGPRInitBug = false;
115	bool UserSGPRInit16Bug = false;
116	bool NegativeScratchOffsetBug = false;
117	bool NegativeUnalignedScratchOffsetBug = false;
118	bool HasSMemRealTime = false;
119	bool HasIntClamp = false;
120	bool HasFmaMixInsts = false;
121	bool HasMovrel = false;
122	bool HasVGPRIndexMode = false;
123	bool HasScalarDwordx3Loads = false;
124	bool HasScalarStores = false;
125	bool HasScalarAtomics = false;
126	bool HasSDWAOmod = false;
127	bool HasSDWAScalar = false;
128	bool HasSDWASdst = false;
129	bool HasSDWAMac = false;
130	bool HasSDWAOutModsVOPC = false;
131	bool HasDPP = false;
132	bool HasDPP8 = false;
133	bool HasDPALU_DPP = false;
134	bool HasDPPSrc1SGPR = false;
135	bool HasPackedFP32Ops = false;
136	bool HasImageInsts = false;
137	bool HasExtendedImageInsts = false;
138	bool HasR128A16 = false;
139	bool HasA16 = false;
140	bool HasG16 = false;
141	bool HasNSAEncoding = false;
142	bool HasPartialNSAEncoding = false;
143	bool GFX10_AEncoding = false;
144	bool GFX10_BEncoding = false;
145	bool HasDLInsts = false;
146	bool HasFmacF64Inst = false;
147	bool HasDot1Insts = false;
148	bool HasDot2Insts = false;
149	bool HasDot3Insts = false;
150	bool HasDot4Insts = false;
151	bool HasDot5Insts = false;
152	bool HasDot6Insts = false;
153	bool HasDot7Insts = false;
154	bool HasDot8Insts = false;
155	bool HasDot9Insts = false;
156	bool HasDot10Insts = false;
157	bool HasDot11Insts = false;
158	bool HasMAIInsts = false;
159	bool HasFP8Insts = false;
160	bool HasFP8ConversionInsts = false;
161	bool HasPkFmacF16Inst = false;
162	bool HasAtomicDsPkAdd16Insts = false;
163	bool HasAtomicFlatPkAdd16Insts = false;
164	bool HasAtomicFaddRtnInsts = false;
165	bool HasAtomicFaddNoRtnInsts = false;
166	bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
167	bool HasAtomicBufferGlobalPkAddF16Insts = false;
168	bool HasAtomicCSubNoRtnInsts = false;
169	bool HasAtomicGlobalPkAddBF16Inst = false;
170	bool HasFlatAtomicFaddF32Inst = false;
171	bool HasDefaultComponentZero = false;
172	bool HasDefaultComponentBroadcast = false;
173	/// The maximum number of instructions that may be placed within an S_CLAUSE,
174	/// which is one greater than the maximum argument to S_CLAUSE. A value of 0
175	/// indicates a lack of S_CLAUSE support.
176	unsigned MaxHardClauseLength = `0`;
177	bool SupportsSRAMECC = false;
178
179	// This should not be used directly. 'TargetID' tracks the dynamic settings
180	// for SRAMECC.
181	bool EnableSRAMECC = false;
182
183	bool HasNoSdstCMPX = false;
184	bool HasVscnt = false;
185	bool HasGetWaveIdInst = false;
186	bool HasSMemTimeInst = false;
187	bool HasShaderCyclesRegister = false;
188	bool HasShaderCyclesHiLoRegisters = false;
189	bool HasVOP3Literal = false;
190	bool HasNoDataDepHazard = false;
191	bool FlatAddressSpace = false;
192	bool FlatInstOffsets = false;
193	bool FlatGlobalInsts = false;
194	bool FlatScratchInsts = false;
195	bool ScalarFlatScratchInsts = false;
196	bool HasArchitectedFlatScratch = false;
197	bool EnableFlatScratch = false;
198	bool HasArchitectedSGPRs = false;
199	bool HasGDS = false;
200	bool HasGWS = false;
201	bool AddNoCarryInsts = false;
202	bool HasUnpackedD16VMem = false;
203	bool LDSMisalignedBug = false;
204	bool HasMFMAInlineLiteralBug = false;
205	bool UnalignedBufferAccess = false;
206	bool UnalignedDSAccess = false;
207	bool HasPackedTID = false;
208	bool ScalarizeGlobal = false;
209	bool HasSALUFloatInsts = false;
210	bool HasVGPRSingleUseHintInsts = false;
211	bool HasPseudoScalarTrans = false;
212	bool HasRestrictedSOffset = false;
213
214	bool HasVcmpxPermlaneHazard = false;
215	bool HasVMEMtoScalarWriteHazard = false;
216	bool HasSMEMtoVectorWriteHazard = false;
217	bool HasInstFwdPrefetchBug = false;
218	bool HasVcmpxExecWARHazard = false;
219	bool HasLdsBranchVmemWARHazard = false;
220	bool HasNSAtoVMEMBug = false;
221	bool HasNSAClauseBug = false;
222	bool HasOffset3fBug = false;
223	bool HasFlatSegmentOffsetBug = false;
224	bool HasImageStoreD16Bug = false;
225	bool HasImageGather4D16Bug = false;
226	bool HasMSAALoadDstSelBug = false;
227	bool HasPrivEnabledTrap2NopBug = false;
228	bool Has1_5xVGPRs = false;
229	bool HasMADIntraFwdBug = false;
230	bool HasVOPDInsts = false;
231	bool HasVALUTransUseHazard = false;
232	bool HasForceStoreSC0SC1 = false;
233
234	bool RequiresCOV6 = false;
235
236	// Dummy feature to use for assembler in tablegen.
237	bool FeatureDisable = false;
238
239	SelectionDAGTargetInfo TSInfo;
240	private:
241	SIInstrInfo InstrInfo;
242	SITargetLowering TLInfo;
243	SIFrameLowering FrameLowering;
244
245	public:
246	GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
247	const GCNTargetMachine &TM);
248	~GCNSubtarget() override;
249
250	GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
251	StringRef GPU, StringRef FS);
252
253	const SIInstrInfo getInstrInfo() const* override {
254	return &InstrInfo;
255	}
256
257	const SIFrameLowering getFrameLowering() const* override {
258	return &FrameLowering;
259	}
260
261	const SITargetLowering getTargetLowering() const* override {
262	return &TLInfo;
263	}
264
265	const SIRegisterInfo getRegisterInfo() const* override {
266	return &InstrInfo.getRegisterInfo();
267	}
268
269	const CallLowering getCallLowering() const* override {
270	return CallLoweringInfo.get();
271	}
272
273	const InlineAsmLowering getInlineAsmLowering() const* override {
274	return InlineAsmLoweringInfo.get();
275	}
276
277	InstructionSelector getInstructionSelector() const* override {
278	return InstSelector.get();
279	}
280
281	const LegalizerInfo getLegalizerInfo() const* override {
282	return Legalizer.get();
283	}
284
285	const AMDGPURegisterBankInfo getRegBankInfo() const* override {
286	return RegBankInfo.get();
287	}
288
289	const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
290	return TargetID;
291	}
292
293	// Nothing implemented, just prevent crashes on use.
294	const SelectionDAGTargetInfo getSelectionDAGInfo() const* override {
295	return &TSInfo;
296	}
297
298	const InstrItineraryData getInstrItineraryData() const* override {
299	return &InstrItins;
300	}
301
302	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
303
304	Generation getGeneration() const {
305	return (Generation)Gen;
306	}
307
308	unsigned getMaxWaveScratchSize() const {
309	// See COMPUTE_TMPRING_SIZE.WAVESIZE.
310	if (getGeneration() >= GFX12) {
311	// 18-bit field in units of 64-dword.
312	return (`64` * `4`) * ((`1` << `18`) - `1`);
313	}
314	if (getGeneration() == GFX11) {
315	// 15-bit field in units of 64-dword.
316	return (`64` * `4`) * ((`1` << `15`) - `1`);
317	}
318	// 13-bit field in units of 256-dword.
319	return (`256` * `4`) * ((`1` << `13`) - `1`);
320	}
321
322	/// Return the number of high bits known to be zero for a frame index.
323	unsigned getKnownHighZeroBitsForFrameIndex() const {
324	return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
325	}
326
327	int getLDSBankCount() const {
328	return LDSBankCount;
329	}
330
331	unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
332	return (ForBufferRSrc \|\| !enableFlatScratch()) ? MaxPrivateElementSize : `16`;
333	}
334
335	unsigned getConstantBusLimit(unsigned Opcode) const;
336
337	/// Returns if the result of this instruction with a 16-bit result returned in
338	/// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
339	/// the original value.
340	bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
341
342	bool supportsWGP() const { return getGeneration() >= GFX10; }
343
344	bool hasIntClamp() const {
345	return HasIntClamp;
346	}
347
348	bool hasFP64() const {
349	return FP64;
350	}
351
352	bool hasMIMG_R128() const {
353	return MIMG_R128;
354	}
355
356	bool hasHWFP64() const {
357	return FP64;
358	}
359
360	bool hasHalfRate64Ops() const {
361	return HalfRate64Ops;
362	}
363
364	bool hasFullRate64Ops() const {
365	return FullRate64Ops;
366	}
367
368	bool hasAddr64() const {
369	return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
370	}
371
372	bool hasFlat() const {
373	return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
374	}
375
376	// Return true if the target only has the reverse operand versions of VALU
377	// shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
378	bool hasOnlyRevVALUShifts() const {
379	return getGeneration() >= VOLCANIC_ISLANDS;
380	}
381
382	bool hasFractBug() const {
383	return getGeneration() == SOUTHERN_ISLANDS;
384	}
385
386	bool hasBFE() const {
387	return true;
388	}
389
390	bool hasBFI() const {
391	return true;
392	}
393
394	bool hasBFM() const {
395	return hasBFE();
396	}
397
398	bool hasBCNT(unsigned Size) const {
399	return true;
400	}
401
402	bool hasFFBL() const {
403	return true;
404	}
405
406	bool hasFFBH() const {
407	return true;
408	}
409
410	bool hasMed3_16() const {
411	return getGeneration() >= AMDGPUSubtarget::GFX9;
412	}
413
414	bool hasMin3Max3_16() const {
415	return getGeneration() >= AMDGPUSubtarget::GFX9;
416	}
417
418	bool hasFmaMixInsts() const {
419	return HasFmaMixInsts;
420	}
421
422	bool hasCARRY() const {
423	return true;
424	}
425
426	bool hasFMA() const {
427	return FMA;
428	}
429
430	bool hasSwap() const {
431	return GFX9Insts;
432	}
433
434	bool hasScalarPackInsts() const {
435	return GFX9Insts;
436	}
437
438	bool hasScalarMulHiInsts() const {
439	return GFX9Insts;
440	}
441
442	bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
443
444	TrapHandlerAbi getTrapHandlerAbi() const {
445	return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
446	}
447
448	bool supportsGetDoorbellID() const {
449	// The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
450	return getGeneration() >= GFX9;
451	}
452
453	/// True if the offset field of DS instructions works as expected. On SI, the
454	/// offset uses a 16-bit adder and does not always wrap properly.
455	bool hasUsableDSOffset() const {
456	return getGeneration() >= SEA_ISLANDS;
457	}
458
459	bool unsafeDSOffsetFoldingEnabled() const {
460	return EnableUnsafeDSOffsetFolding;
461	}
462
463	/// Condition output from div_scale is usable.
464	bool hasUsableDivScaleConditionOutput() const {
465	return getGeneration() != SOUTHERN_ISLANDS;
466	}
467
468	/// Extra wait hazard is needed in some cases before
469	/// s_cbranch_vccnz/s_cbranch_vccz.
470	bool hasReadVCCZBug() const {
471	return getGeneration() <= SEA_ISLANDS;
472	}
473
474	/// Writes to VCC_LO/VCC_HI update the VCCZ flag.
475	bool partialVCCWritesUpdateVCCZ() const {
476	return getGeneration() >= GFX10;
477	}
478
479	/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
480	/// was written by a VALU instruction.
481	bool hasSMRDReadVALUDefHazard() const {
482	return getGeneration() == SOUTHERN_ISLANDS;
483	}
484
485	/// A read of an SGPR by a VMEM instruction requires 5 wait states when the
486	/// SGPR was written by a VALU Instruction.
487	bool hasVMEMReadSGPRVALUDefHazard() const {
488	return getGeneration() >= VOLCANIC_ISLANDS;
489	}
490
491	bool hasRFEHazards() const {
492	return getGeneration() >= VOLCANIC_ISLANDS;
493	}
494
495	/// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
496	unsigned getSetRegWaitStates() const {
497	return getGeneration() <= SEA_ISLANDS ? `1` : `2`;
498	}
499
500	bool dumpCode() const {
501	return DumpCode;
502	}
503
504	/// Return the amount of LDS that can be used that will not restrict the
505	/// occupancy lower than WaveCount.
506	unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
507	const Function &) const;
508
509	bool supportsMinMaxDenormModes() const {
510	return getGeneration() >= AMDGPUSubtarget::GFX9;
511	}
512
513	/// \returns If target supports S_DENORM_MODE.
514	bool hasDenormModeInst() const {
515	return getGeneration() >= AMDGPUSubtarget::GFX10;
516	}
517
518	bool useFlatForGlobal() const {
519	return FlatForGlobal;
520	}
521
522	/// \returns If target supports ds_read/write_b128 and user enables generation
523	/// of ds_read/write_b128.
524	bool useDS128() const {
525	return CIInsts && EnableDS128;
526	}
527
528	/// \return If target supports ds_read/write_b96/128.
529	bool hasDS96AndDS128() const {
530	return CIInsts;
531	}
532
533	/// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
534	bool haveRoundOpsF64() const {
535	return CIInsts;
536	}
537
538	/// \returns If MUBUF instructions always perform range checking, even for
539	/// buffer resources used for private memory access.
540	bool privateMemoryResourceIsRangeChecked() const {
541	return getGeneration() < AMDGPUSubtarget::GFX9;
542	}
543
/// \returns If target requires PRT Strict NULL support (zero result registers
/// for sparse texture support).
546	bool usePRTStrictNull() const {
547	return EnablePRTStrictNull;
548	}
549
550	bool hasAutoWaitcntBeforeBarrier() const {
551	return AutoWaitcntBeforeBarrier;
552	}
553
554	/// \returns true if the target supports backing off of s_barrier instructions
555	/// when an exception is raised.
556	bool supportsBackOffBarrier() const {
557	return BackOffBarrier;
558	}
559
560	bool hasUnalignedBufferAccess() const {
561	return UnalignedBufferAccess;
562	}
563
564	bool hasUnalignedBufferAccessEnabled() const {
565	return UnalignedBufferAccess && UnalignedAccessMode;
566	}
567
568	bool hasUnalignedDSAccess() const {
569	return UnalignedDSAccess;
570	}
571
572	bool hasUnalignedDSAccessEnabled() const {
573	return UnalignedDSAccess && UnalignedAccessMode;
574	}
575
576	bool hasUnalignedScratchAccess() const {
577	return UnalignedScratchAccess;
578	}
579
580	bool hasUnalignedAccessMode() const {
581	return UnalignedAccessMode;
582	}
583
584	bool hasApertureRegs() const {
585	return HasApertureRegs;
586	}
587
588	bool isTrapHandlerEnabled() const {
589	return TrapHandler;
590	}
591
592	bool isXNACKEnabled() const {
593	return TargetID.isXnackOnOrAny();
594	}
595
596	bool isTgSplitEnabled() const {
597	return EnableTgSplit;
598	}
599
600	bool isCuModeEnabled() const {
601	return EnableCuMode;
602	}
603
604	bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
605
606	bool hasFlatAddressSpace() const {
607	return FlatAddressSpace;
608	}
609
610	bool hasFlatScrRegister() const {
611	return hasFlatAddressSpace();
612	}
613
614	bool hasFlatInstOffsets() const {
615	return FlatInstOffsets;
616	}
617
618	bool hasFlatGlobalInsts() const {
619	return FlatGlobalInsts;
620	}
621
622	bool hasFlatScratchInsts() const {
623	return FlatScratchInsts;
624	}
625
626	// Check if target supports ST addressing mode with FLAT scratch instructions.
627	// The ST addressing mode means no registers are used, either VGPR or SGPR,
628	// but only immediate offset is swizzled and added to the FLAT scratch base.
629	bool hasFlatScratchSTMode() const {
630	return hasFlatScratchInsts() && (hasGFX10_3Insts() \|\| hasGFX940Insts());
631	}
632
633	bool hasFlatScratchSVSMode() const { return GFX940Insts \|\| GFX11Insts; }
634
635	bool hasScalarFlatScratchInsts() const {
636	return ScalarFlatScratchInsts;
637	}
638
639	bool enableFlatScratch() const {
640	return flatScratchIsArchitected() \|\|
641	(EnableFlatScratch && hasFlatScratchInsts());
642	}
643
644	bool hasGlobalAddTidInsts() const {
645	return GFX10_BEncoding;
646	}
647
648	bool hasAtomicCSub() const {
649	return GFX10_BEncoding;
650	}
651
652	// BUFFER/FLAT/GLOBAL_ATOMIC_ADD/MIN/MAX_F64
653	bool hasBufferFlatGlobalAtomicsF64() const { return hasGFX90AInsts(); }
654
655	bool hasExportInsts() const {
656	return !hasGFX940Insts();
657	}
658
659	bool hasVINTERPEncoding() const {
660	return GFX11Insts;
661	}
662
663	// DS_ADD_F64/DS_ADD_RTN_F64
664	bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
665
666	bool hasMultiDwordFlatScratchAddressing() const {
667	return getGeneration() >= GFX9;
668	}
669
670	bool hasFlatSegmentOffsetBug() const {
671	return HasFlatSegmentOffsetBug;
672	}
673
674	bool hasFlatLgkmVMemCountInOrder() const {
675	return getGeneration() > GFX9;
676	}
677
678	bool hasD16LoadStore() const {
679	return getGeneration() >= GFX9;
680	}
681
682	bool d16PreservesUnusedBits() const {
683	return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
684	}
685
686	bool hasD16Images() const {
687	return getGeneration() >= VOLCANIC_ISLANDS;
688	}
689
690	/// Return if most LDS instructions have an m0 use that require m0 to be
691	/// initialized.
692	bool ldsRequiresM0Init() const {
693	return getGeneration() < GFX9;
694	}
695
696	// True if the hardware rewinds and replays GWS operations if a wave is
697	// preempted.
698	//
699	// If this is false, a GWS operation requires testing if a nack set the
700	// MEM_VIOL bit, and repeating if so.
701	bool hasGWSAutoReplay() const {
702	return getGeneration() >= GFX9;
703	}
704
705	/// \returns if target has ds_gws_sema_release_all instruction.
706	bool hasGWSSemaReleaseAll() const {
707	return CIInsts;
708	}
709
710	/// \returns true if the target has integer add/sub instructions that do not
711	/// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
712	/// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
713	/// for saturation.
714	bool hasAddNoCarry() const {
715	return AddNoCarryInsts;
716	}
717
718	bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
719
720	bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
721
722	bool hasUnpackedD16VMem() const {
723	return HasUnpackedD16VMem;
724	}
725
726	// Covers VS/PS/CS graphics shaders
727	bool isMesaGfxShader(const Function &F) const {
728	return isMesa3DOS() && AMDGPU::isShader(CC: F.getCallingConv());
729	}
730
731	bool hasMad64_32() const {
732	return getGeneration() >= SEA_ISLANDS;
733	}
734
735	bool hasSDWAOmod() const {
736	return HasSDWAOmod;
737	}
738
739	bool hasSDWAScalar() const {
740	return HasSDWAScalar;
741	}
742
743	bool hasSDWASdst() const {
744	return HasSDWASdst;
745	}
746
747	bool hasSDWAMac() const {
748	return HasSDWAMac;
749	}
750
751	bool hasSDWAOutModsVOPC() const {
752	return HasSDWAOutModsVOPC;
753	}
754
755	bool hasDLInsts() const {
756	return HasDLInsts;
757	}
758
759	bool hasFmacF64Inst() const { return HasFmacF64Inst; }
760
761	bool hasDot1Insts() const {
762	return HasDot1Insts;
763	}
764
765	bool hasDot2Insts() const {
766	return HasDot2Insts;
767	}
768
769	bool hasDot3Insts() const {
770	return HasDot3Insts;
771	}
772
773	bool hasDot4Insts() const {
774	return HasDot4Insts;
775	}
776
777	bool hasDot5Insts() const {
778	return HasDot5Insts;
779	}
780
781	bool hasDot6Insts() const {
782	return HasDot6Insts;
783	}
784
785	bool hasDot7Insts() const {
786	return HasDot7Insts;
787	}
788
789	bool hasDot8Insts() const {
790	return HasDot8Insts;
791	}
792
793	bool hasDot9Insts() const {
794	return HasDot9Insts;
795	}
796
797	bool hasDot10Insts() const {
798	return HasDot10Insts;
799	}
800
801	bool hasDot11Insts() const {
802	return HasDot11Insts;
803	}
804
805	bool hasMAIInsts() const {
806	return HasMAIInsts;
807	}
808
809	bool hasFP8Insts() const {
810	return HasFP8Insts;
811	}
812
813	bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
814
815	bool hasPkFmacF16Inst() const {
816	return HasPkFmacF16Inst;
817	}
818
819	bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
820
821	bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
822
823	bool hasAtomicFaddInsts() const {
824	return HasAtomicFaddRtnInsts \|\| HasAtomicFaddNoRtnInsts;
825	}
826
827	bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; }
828
829	bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
830
831	bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
832	return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
833	}
834
835	bool hasAtomicBufferGlobalPkAddF16Insts() const {
836	return HasAtomicBufferGlobalPkAddF16Insts;
837	}
838
839	bool hasAtomicGlobalPkAddBF16Inst() const {
840	return HasAtomicGlobalPkAddBF16Inst;
841	}
842
843	bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
844
845	bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
846
847	bool hasDefaultComponentBroadcast() const {
848	return HasDefaultComponentBroadcast;
849	}
850
851	bool hasNoSdstCMPX() const {
852	return HasNoSdstCMPX;
853	}
854
855	bool hasVscnt() const {
856	return HasVscnt;
857	}
858
859	bool hasGetWaveIdInst() const {
860	return HasGetWaveIdInst;
861	}
862
863	bool hasSMemTimeInst() const {
864	return HasSMemTimeInst;
865	}
866
867	bool hasShaderCyclesRegister() const {
868	return HasShaderCyclesRegister;
869	}
870
871	bool hasShaderCyclesHiLoRegisters() const {
872	return HasShaderCyclesHiLoRegisters;
873	}
874
875	bool hasVOP3Literal() const {
876	return HasVOP3Literal;
877	}
878
879	bool hasNoDataDepHazard() const {
880	return HasNoDataDepHazard;
881	}
882
883	bool vmemWriteNeedsExpWaitcnt() const {
884	return getGeneration() < SEA_ISLANDS;
885	}
886
887	bool hasInstPrefetch() const {
888	return getGeneration() == GFX10 \|\| getGeneration() == GFX11;
889	}
890
891	bool hasPrefetch() const { return GFX12Insts; }
892
// Has s_cmpk_* instructions.
894	bool hasSCmpK() const { return getGeneration() < GFX12; }
895
896	// Scratch is allocated in 256 dword per wave blocks for the entire
897	// wavefront. When viewed from the perspective of an arbitrary workitem, this
898	// is 4-byte aligned.
899	//
900	// Only 4-byte alignment is really needed to access anything. Transformations
901	// on the pointer value itself may rely on the alignment / known low bits of
902	// the pointer. Set this to something above the minimum to avoid needing
903	// dynamic realignment in common cases.
904	Align getStackAlignment() const { return Align (`16`); }
905
906	bool enableMachineScheduler() const override {
907	return true;
908	}
909
910	bool useAA() const override;
911
912	bool enableSubRegLiveness() const override {
913	return true;
914	}
915
916	void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
917	bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
918
919	// static wrappers
920	static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
921
922	// XXX - Why is this here if it isn't in the default pass set?
923	bool enableEarlyIfConversion() const override {
924	return true;
925	}
926
927	void overrideSchedPolicy(MachineSchedPolicy &Policy,
928	unsigned NumRegionInstrs) const override;
929
930	void mirFileLoaded(MachineFunction &MF) const override;
931
932	unsigned getMaxNumUserSGPRs() const {
933	return AMDGPU::getMaxNumUserSGPRs(*this);
934	}
935
936	bool hasSMemRealTime() const {
937	return HasSMemRealTime;
938	}
939
940	bool hasMovrel() const {
941	return HasMovrel;
942	}
943
944	bool hasVGPRIndexMode() const {
945	return HasVGPRIndexMode;
946	}
947
948	bool useVGPRIndexMode() const;
949
950	bool hasScalarCompareEq64() const {
951	return getGeneration() >= VOLCANIC_ISLANDS;
952	}
953
954	bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
955
956	bool hasScalarStores() const {
957	return HasScalarStores;
958	}
959
960	bool hasScalarAtomics() const {
961	return HasScalarAtomics;
962	}
963
964	bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
965	bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
966
967	/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
968	bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
969
970	/// \returns true if the subtarget has the v_permlane64_b32 instruction.
971	bool hasPermLane64() const { return getGeneration() >= GFX11; }
972
973	bool hasDPP() const {
974	return HasDPP;
975	}
976
977	bool hasDPPBroadcasts() const {
978	return HasDPP && getGeneration() < GFX10;
979	}
980
981	bool hasDPPWavefrontShifts() const {
982	return HasDPP && getGeneration() < GFX10;
983	}
984
985	bool hasDPP8() const {
986	return HasDPP8;
987	}
988
989	bool hasDPALU_DPP() const {
990	return HasDPALU_DPP;
991	}
992
993	bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
994
995	bool hasPackedFP32Ops() const {
996	return HasPackedFP32Ops;
997	}
998
999	// Has V_PK_MOV_B32 opcode
1000	bool hasPkMovB32() const {
1001	return GFX90AInsts;
1002	}
1003
1004	bool hasFmaakFmamkF32Insts() const {
1005	return getGeneration() >= GFX10 \|\| hasGFX940Insts();
1006	}
1007
1008	bool hasImageInsts() const {
1009	return HasImageInsts;
1010	}
1011
1012	bool hasExtendedImageInsts() const {
1013	return HasExtendedImageInsts;
1014	}
1015
1016	bool hasR128A16() const {
1017	return HasR128A16;
1018	}
1019
1020	bool hasA16() const { return HasA16; }
1021
1022	bool hasG16() const { return HasG16; }
1023
1024	bool hasOffset3fBug() const {
1025	return HasOffset3fBug;
1026	}
1027
1028	bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
1029
1030	bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
1031
1032	bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
1033
1034	bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
1035
1036	bool hasPrivEnabledTrap2NopBug() const { return HasPrivEnabledTrap2NopBug; }
1037
1038	bool hasNSAEncoding() const { return HasNSAEncoding; }
1039
1040	bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
1041
1042	bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
1043
1044	unsigned getNSAMaxSize(bool HasSampler = false) const {
1045	return AMDGPU::getNSAMaxSize(*this, HasSampler);
1046	}
1047
1048	bool hasGFX10_AEncoding() const {
1049	return GFX10_AEncoding;
1050	}
1051
1052	bool hasGFX10_BEncoding() const {
1053	return GFX10_BEncoding;
1054	}
1055
1056	bool hasGFX10_3Insts() const {
1057	return GFX10_3Insts;
1058	}
1059
1060	bool hasMadF16() const;
1061
1062	bool hasMovB64() const { return GFX940Insts; }
1063
1064	bool hasLshlAddB64() const { return GFX940Insts; }
1065
1066	bool enableSIScheduler() const {
1067	return EnableSIScheduler;
1068	}
1069
1070	bool loadStoreOptEnabled() const {
1071	return EnableLoadStoreOpt;
1072	}
1073
1074	bool hasSGPRInitBug() const {
1075	return SGPRInitBug;
1076	}
1077
1078	bool hasUserSGPRInit16Bug() const {
1079	return UserSGPRInit16Bug && isWave32();
1080	}
1081
1082	bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
1083
1084	bool hasNegativeUnalignedScratchOffsetBug() const {
1085	return NegativeUnalignedScratchOffsetBug;
1086	}
1087
1088	bool hasMFMAInlineLiteralBug() const {
1089	return HasMFMAInlineLiteralBug;
1090	}
1091
1092	bool has12DWordStoreHazard() const {
1093	return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1094	}
1095
1096	// \returns true if the subtarget supports DWORDX3 load/store instructions.
1097	bool hasDwordx3LoadStores() const {
1098	return CIInsts;
1099	}
1100
1101	bool hasReadM0MovRelInterpHazard() const {
1102	return getGeneration() == AMDGPUSubtarget::GFX9;
1103	}
1104
1105	bool hasReadM0SendMsgHazard() const {
1106	return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1107	getGeneration() <= AMDGPUSubtarget::GFX9;
1108	}
1109
1110	bool hasReadM0LdsDmaHazard() const {
1111	return getGeneration() == AMDGPUSubtarget::GFX9;
1112	}
1113
1114	bool hasReadM0LdsDirectHazard() const {
1115	return getGeneration() == AMDGPUSubtarget::GFX9;
1116	}
1117
1118	bool hasVcmpxPermlaneHazard() const {
1119	return HasVcmpxPermlaneHazard;
1120	}
1121
1122	bool hasVMEMtoScalarWriteHazard() const {
1123	return HasVMEMtoScalarWriteHazard;
1124	}
1125
1126	bool hasSMEMtoVectorWriteHazard() const {
1127	return HasSMEMtoVectorWriteHazard;
1128	}
1129
1130	bool hasLDSMisalignedBug() const {
1131	return LDSMisalignedBug && !EnableCuMode;
1132	}
1133
1134	bool hasInstFwdPrefetchBug() const {
1135	return HasInstFwdPrefetchBug;
1136	}
1137
1138	bool hasVcmpxExecWARHazard() const {
1139	return HasVcmpxExecWARHazard;
1140	}
1141
1142	bool hasLdsBranchVmemWARHazard() const {
1143	return HasLdsBranchVmemWARHazard;
1144	}
1145
1146	// Shift amount of a 64 bit shift cannot be a highest allocated register
1147	// if also at the end of the allocation block.
1148	bool hasShift64HighRegBug() const {
1149	return GFX90AInsts && !GFX940Insts;
1150	}
1151
1152	// Has one cycle hazard on transcendental instruction feeding a
1153	// non transcendental VALU.
1154	bool hasTransForwardingHazard() const { return GFX940Insts; }
1155
1156	// Has one cycle hazard on a VALU instruction partially writing dst with
1157	// a shift of result bits feeding another VALU instruction.
1158	bool hasDstSelForwardingHazard() const { return GFX940Insts; }
1159
1160	// Cannot use op_sel with v_dot instructions.
1161	bool hasDOTOpSelHazard() const { return GFX940Insts \|\| GFX11Insts; }
1162
1163	// Does not have HW interlocs for VALU writing and then reading SGPRs.
1164	bool hasVDecCoExecHazard() const {
1165	return GFX940Insts;
1166	}
1167
1168	bool hasNSAtoVMEMBug() const {
1169	return HasNSAtoVMEMBug;
1170	}
1171
1172	bool hasNSAClauseBug() const { return HasNSAClauseBug; }
1173
1174	bool hasHardClauses() const { return MaxHardClauseLength > `0`; }
1175
1176	bool hasGFX90AInsts() const { return GFX90AInsts; }
1177
1178	bool hasFPAtomicToDenormModeHazard() const {
1179	return getGeneration() == GFX10;
1180	}
1181
1182	bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
1183
1184	bool hasLdsDirect() const { return getGeneration() >= GFX11; }
1185
1186	bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
1187
1188	bool hasVALUPartialForwardingHazard() const {
1189	return getGeneration() == GFX11;
1190	}
1191
1192	bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
1193
1194	bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
1195
1196	bool requiresCodeObjectV6() const { return RequiresCOV6; }
1197
1198	bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
1199
1200	/// Return if operations acting on VGPR tuples require even alignment.
1201	bool needsAlignedVGPRs() const { return GFX90AInsts; }
1202
1203	/// Return true if the target has the S_PACK_HL_B32_B16 instruction.
1204	bool hasSPackHL() const { return GFX11Insts; }
1205
1206	/// Return true if the target's EXP instruction has the COMPR flag, which
1207	/// affects the meaning of the EN (enable) bits.
1208	bool hasCompressedExport() const { return !GFX11Insts; }
1209
1210	/// Return true if the target's EXP instruction supports the NULL export
1211	/// target.
1212	bool hasNullExportTarget() const { return !GFX11Insts; }
1213
1214	bool has1_5xVGPRs() const { return Has1_5xVGPRs; }
1215
1216	bool hasVOPDInsts() const { return HasVOPDInsts; }
1217
1218	bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
1219
1220	/// Return true if the target has the S_DELAY_ALU instruction.
1221	bool hasDelayAlu() const { return GFX11Insts; }
1222
1223	bool hasPackedTID() const { return HasPackedTID; }
1224
1225	// GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that
1226	// hasGFX90AInsts is also true.
1227	bool hasGFX940Insts() const { return GFX940Insts; }
1228
1229	bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
1230
1231	bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
1232
1233	bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
1234
1235	bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
1236
1237	/// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
1238	/// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
1239	bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
1240
1241	/// \returns The maximum number of instructions that can be enclosed in an
1242	/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
1243	/// instruction.
1244	unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
1245
1246	/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1247	/// SGPRs
1248	unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1249
1250	/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1251	/// VGPRs
1252	unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1253
1254	/// Return occupancy for the given function. Used LDS and a number of
1255	/// registers if provided.
1256	/// Note, occupancy can be affected by the scratch allocation as well, but
1257	/// we do not have enough information to compute it.
1258	unsigned computeOccupancy(const Function &F, unsigned LDSSize = `0`,
1259	unsigned NumSGPRs = `0`, unsigned NumVGPRs = `0`) const;
1260
1261	/// \returns true if the flat_scratch register should be initialized with the
1262	/// pointer to the wave's scratch memory rather than a size and offset.
1263	bool flatScratchIsPointer() const {
1264	return getGeneration() >= AMDGPUSubtarget::GFX9;
1265	}
1266
1267	/// \returns true if the flat_scratch register is initialized by the HW.
1268	/// In this case it is readonly.
1269	bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1270
1271	/// \returns true if the architected SGPRs are enabled.
1272	bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
1273
1274	/// \returns true if Global Data Share is supported.
1275	bool hasGDS() const { return HasGDS; }
1276
1277	/// \returns true if Global Wave Sync is supported.
1278	bool hasGWS() const { return HasGWS; }
1279
1280	/// \returns true if the machine has merged shaders in which s0-s7 are
1281	/// reserved by the hardware and user SGPRs start at s8
1282	bool hasMergedShaders() const {
1283	return getGeneration() >= GFX9;
1284	}
1285
1286	// \returns true if the target supports the pre-NGG legacy geometry path.
1287	bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
1288
1289	// \returns true if preloading kernel arguments is supported.
1290	bool hasKernargPreload() const { return KernargPreload; }
1291
1292	// \returns true if the target has split barriers feature
1293	bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
1294
1295	// \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
1296	bool hasCvtFP8VOP1Bug() const { return true; }
1297
1298	// \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
1299	// no-return form.
1300	bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
1301
1302	// \returns true if the target has DX10_CLAMP kernel descriptor mode bit
1303	bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
1304
1305	// \returns true if the target has IEEE kernel descriptor mode bit
1306	bool hasIEEEMode() const { return getGeneration() < GFX12; }
1307
1308	// \returns true if the target has IEEE fminimum/fmaximum instructions
1309	bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
1310
1311	// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
1312	bool hasRrWGMode() const { return getGeneration() >= GFX12; }
1313
1314	/// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
1315	/// values.
1316	bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
1317
1318	// \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
1319	// of sign-extending.
1320	bool hasGetPCZeroExtension() const { return GFX12Insts; }
1321
1322	/// \returns SGPR allocation granularity supported by the subtarget.
1323	unsigned getSGPRAllocGranule() const {
1324	return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1325	}
1326
1327	/// \returns SGPR encoding granularity supported by the subtarget.
1328	unsigned getSGPREncodingGranule() const {
1329	return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1330	}
1331
1332	/// \returns Total number of SGPRs supported by the subtarget.
1333	unsigned getTotalNumSGPRs() const {
1334	return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1335	}
1336
1337	/// \returns Addressable number of SGPRs supported by the subtarget.
1338	unsigned getAddressableNumSGPRs() const {
1339	return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1340	}
1341
1342	/// \returns Minimum number of SGPRs that meets the given number of waves per
1343	/// execution unit requirement supported by the subtarget.
1344	unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1345	return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1346	}
1347
1348	/// \returns Maximum number of SGPRs that meets the given number of waves per
1349	/// execution unit requirement supported by the subtarget.
1350	unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1351	return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1352	}
1353
/// \returns Reserved number of SGPRs. This is the common
/// utility function called by the MachineFunction and
/// Function variants of getReservedNumSGPRs.
1357	unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
1358	/// \returns Reserved number of SGPRs for given machine function \p MF.
1359	unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1360
1361	/// \returns Reserved number of SGPRs for given function \p F.
1362	unsigned getReservedNumSGPRs(const Function &F) const;
1363
1364	/// \returns max num SGPRs. This is the common utility
1365	/// function called by MachineFunction and Function
1366	/// variants of getMaxNumSGPRs.
1367	unsigned getBaseMaxNumSGPRs(const Function &F,
1368	std::pair<unsigned, unsigned> WavesPerEU,
1369	unsigned PreloadedSGPRs,
1370	unsigned ReservedNumSGPRs) const;
1371
1372	/// \returns Maximum number of SGPRs that meets number of waves per execution
1373	/// unit requirement for function \p MF, or number of SGPRs explicitly
1374	/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1375	///
1376	/// \returns Value that meets number of waves per execution unit requirement
1377	/// if explicitly requested value cannot be converted to integer, violates
1378	/// subtarget's specifications, or does not meet number of waves per execution
1379	/// unit requirement.
1380	unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1381
1382	/// \returns Maximum number of SGPRs that meets number of waves per execution
1383	/// unit requirement for function \p F, or number of SGPRs explicitly
1384	/// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
1385	///
1386	/// \returns Value that meets number of waves per execution unit requirement
1387	/// if explicitly requested value cannot be converted to integer, violates
1388	/// subtarget's specifications, or does not meet number of waves per execution
1389	/// unit requirement.
1390	unsigned getMaxNumSGPRs(const Function &F) const;
1391
1392	/// \returns VGPR allocation granularity supported by the subtarget.
1393	unsigned getVGPRAllocGranule() const {
1394	return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1395	}
1396
1397	/// \returns VGPR encoding granularity supported by the subtarget.
1398	unsigned getVGPREncodingGranule() const {
1399	return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1400	}
1401
1402	/// \returns Total number of VGPRs supported by the subtarget.
1403	unsigned getTotalNumVGPRs() const {
1404	return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1405	}
1406
1407	/// \returns Addressable number of architectural VGPRs supported by the
1408	/// subtarget.
1409	unsigned getAddressableNumArchVGPRs() const {
1410	return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
1411	}
1412
1413	/// \returns Addressable number of VGPRs supported by the subtarget.
1414	unsigned getAddressableNumVGPRs() const {
1415	return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1416	}
1417
1418	/// \returns the minimum number of VGPRs that will prevent achieving more than
1419	/// the specified number of waves \p WavesPerEU.
1420	unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1421	return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1422	}
1423
1424	/// \returns the maximum number of VGPRs that can be used and still achieved
1425	/// at least the specified number of waves \p WavesPerEU.
1426	unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1427	return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1428	}
1429
1430	/// \returns max num VGPRs. This is the common utility function
1431	/// called by MachineFunction and Function variants of getMaxNumVGPRs.
1432	unsigned getBaseMaxNumVGPRs(const Function &F,
1433	std::pair<unsigned, unsigned> WavesPerEU) const;
1434	/// \returns Maximum number of VGPRs that meets number of waves per execution
1435	/// unit requirement for function \p F, or number of VGPRs explicitly
1436	/// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
1437	///
1438	/// \returns Value that meets number of waves per execution unit requirement
1439	/// if explicitly requested value cannot be converted to integer, violates
1440	/// subtarget's specifications, or does not meet number of waves per execution
1441	/// unit requirement.
1442	unsigned getMaxNumVGPRs(const Function &F) const;
1443
1444	unsigned getMaxNumAGPRs(const Function &F) const {
1445	return getMaxNumVGPRs(F);
1446	}
1447
1448	/// \returns Maximum number of VGPRs that meets number of waves per execution
1449	/// unit requirement for function \p MF, or number of VGPRs explicitly
1450	/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1451	///
1452	/// \returns Value that meets number of waves per execution unit requirement
1453	/// if explicitly requested value cannot be converted to integer, violates
1454	/// subtarget's specifications, or does not meet number of waves per execution
1455	/// unit requirement.
1456	unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1457
1458	void getPostRAMutations(
1459	std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1460	const override;
1461
1462	std::unique_ptr<ScheduleDAGMutation>
1463	createFillMFMAShadowMutation(const TargetInstrInfo TII) const*;
1464
1465	bool isWave32() const {
1466	return getWavefrontSize() == `32`;
1467	}
1468
1469	bool isWave64() const {
1470	return getWavefrontSize() == `64`;
1471	}
1472
1473	const TargetRegisterClass getBoolRC() const* {
1474	return getRegisterInfo()->getBoolRC();
1475	}
1476
1477	/// \returns Maximum number of work groups per compute unit supported by the
1478	/// subtarget and limited by given \p FlatWorkGroupSize.
1479	unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1480	return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1481	}
1482
1483	/// \returns Minimum flat work group size supported by the subtarget.
1484	unsigned getMinFlatWorkGroupSize() const override {
1485	return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1486	}
1487
1488	/// \returns Maximum flat work group size supported by the subtarget.
1489	unsigned getMaxFlatWorkGroupSize() const override {
1490	return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1491	}
1492
1493	/// \returns Number of waves per execution unit required to support the given
1494	/// \p FlatWorkGroupSize.
1495	unsigned
1496	getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1497	return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1498	}
1499
1500	/// \returns Minimum number of waves per execution unit supported by the
1501	/// subtarget.
1502	unsigned getMinWavesPerEU() const override {
1503	return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1504	}
1505
1506	void adjustSchedDependency(SUnit Def, int* DefOpIdx, SUnit Use, int* UseOpIdx,
1507	SDep &Dep,
1508	const TargetSchedModel SchedModel) const* override;
1509
1510	// \returns true if it's beneficial on this subtarget for the scheduler to
1511	// cluster stores as well as loads.
1512	bool shouldClusterStores() const { return getGeneration() >= GFX11; }
1513
1514	// \returns the number of address arguments from which to enable MIMG NSA
1515	// on supported architectures.
1516	unsigned getNSAThreshold(const MachineFunction &MF) const;
1517
1518	// \returns true if the subtarget has a hazard requiring an "s_nop 0"
1519	// instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
1520	bool requiresNopBeforeDeallocVGPRs() const {
1521	// Currently all targets that support the dealloc VGPRs message also require
1522	// the nop.
1523	return true;
1524	}
1525	};
1526
1527	class GCNUserSGPRUsageInfo {
1528	public:
1529	bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1530
1531	bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1532
1533	bool hasDispatchPtr() const { return DispatchPtr; }
1534
1535	bool hasQueuePtr() const { return QueuePtr; }
1536
1537	bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1538
1539	bool hasDispatchID() const { return DispatchID; }
1540
1541	bool hasFlatScratchInit() const { return FlatScratchInit; }
1542
1543	unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1544
1545	unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1546
1547	unsigned getNumFreeUserSGPRs();
1548
1549	void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1550
1551	enum UserSGPRID : unsigned {
1552	ImplicitBufferPtrID = `0`,
1553	PrivateSegmentBufferID = `1`,
1554	DispatchPtrID = `2`,
1555	QueuePtrID = `3`,
1556	KernargSegmentPtrID = `4`,
1557	DispatchIdID = `5`,
1558	FlatScratchInitID = `6`,
1559	PrivateSegmentSizeID = `7`
1560	};
1561
1562	// Returns the size in number of SGPRs for preload user SGPR field.
1563	static unsigned getNumUserSGPRForField(UserSGPRID ID) {
1564	switch (ID) {
1565	case ImplicitBufferPtrID:
1566	return `2`;
1567	case PrivateSegmentBufferID:
1568	return `4`;
1569	case DispatchPtrID:
1570	return `2`;
1571	case QueuePtrID:
1572	return `2`;
1573	case KernargSegmentPtrID:
1574	return `2`;
1575	case DispatchIdID:
1576	return `2`;
1577	case FlatScratchInitID:
1578	return `2`;
1579	case PrivateSegmentSizeID:
1580	return `1`;
1581	}
1582	llvm_unreachable("Unknown UserSGPRID.");
1583	}
1584
1585	GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1586
1587	private:
1588	const GCNSubtarget &ST;
1589
1590	// Private memory buffer
1591	// Compute directly in sgpr[0:1]
1592	// Other shaders indirect 64-bits at sgpr[0:1]
1593	bool ImplicitBufferPtr = false;
1594
1595	bool PrivateSegmentBuffer = false;
1596
1597	bool DispatchPtr = false;
1598
1599	bool QueuePtr = false;
1600
1601	bool KernargSegmentPtr = false;
1602
1603	bool DispatchID = false;
1604
1605	bool FlatScratchInit = false;
1606
1607	unsigned NumKernargPreloadSGPRs = `0`;
1608
1609	unsigned NumUsedUserSGPRs = `0`;
1610	};
1611
1612	} // end namespace llvm
1613
1614	#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1615

source code of llvm/lib/Target/AMDGPU/GCNSubtarget.h