1 | //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | /// \file |
10 | /// This pass does misc. AMDGPU optimizations on IR before instruction |
11 | /// selection. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "AMDGPU.h" |
16 | #include "AMDGPUTargetMachine.h" |
17 | #include "SIModeRegisterDefaults.h" |
18 | #include "llvm/Analysis/AssumptionCache.h" |
19 | #include "llvm/Analysis/ConstantFolding.h" |
20 | #include "llvm/Analysis/TargetLibraryInfo.h" |
21 | #include "llvm/Analysis/UniformityAnalysis.h" |
22 | #include "llvm/Analysis/ValueTracking.h" |
23 | #include "llvm/CodeGen/TargetPassConfig.h" |
24 | #include "llvm/IR/Dominators.h" |
25 | #include "llvm/IR/IRBuilder.h" |
26 | #include "llvm/IR/InstVisitor.h" |
27 | #include "llvm/IR/IntrinsicsAMDGPU.h" |
28 | #include "llvm/IR/PatternMatch.h" |
29 | #include "llvm/InitializePasses.h" |
30 | #include "llvm/Pass.h" |
31 | #include "llvm/Support/KnownBits.h" |
32 | #include "llvm/Transforms/Utils/IntegerDivision.h" |
33 | #include "llvm/Transforms/Utils/Local.h" |
34 | |
35 | #define DEBUG_TYPE "amdgpu-codegenprepare" |
36 | |
37 | using namespace llvm; |
38 | using namespace llvm::PatternMatch; |
39 | |
40 | namespace { |
41 | |
42 | static cl::opt<bool> WidenLoads( |
43 | "amdgpu-codegenprepare-widen-constant-loads" , |
44 | cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare" ), |
45 | cl::ReallyHidden, |
46 | cl::init(Val: false)); |
47 | |
48 | static cl::opt<bool> Widen16BitOps( |
49 | "amdgpu-codegenprepare-widen-16-bit-ops" , |
50 | cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare" ), |
51 | cl::ReallyHidden, |
52 | cl::init(Val: true)); |
53 | |
54 | static cl::opt<bool> |
55 | BreakLargePHIs("amdgpu-codegenprepare-break-large-phis" , |
56 | cl::desc("Break large PHI nodes for DAGISel" ), |
57 | cl::ReallyHidden, cl::init(Val: true)); |
58 | |
59 | static cl::opt<bool> |
60 | ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis" , |
61 | cl::desc("For testing purposes, always break large " |
62 | "PHIs even if it isn't profitable." ), |
63 | cl::ReallyHidden, cl::init(Val: false)); |
64 | |
65 | static cl::opt<unsigned> BreakLargePHIsThreshold( |
66 | "amdgpu-codegenprepare-break-large-phis-threshold" , |
67 | cl::desc("Minimum type size in bits for breaking large PHI nodes" ), |
68 | cl::ReallyHidden, cl::init(Val: 32)); |
69 | |
70 | static cl::opt<bool> UseMul24Intrin( |
71 | "amdgpu-codegenprepare-mul24" , |
72 | cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare" ), |
73 | cl::ReallyHidden, |
74 | cl::init(Val: true)); |
75 | |
76 | // Legalize 64-bit division by using the generic IR expansion. |
77 | static cl::opt<bool> ExpandDiv64InIR( |
78 | "amdgpu-codegenprepare-expand-div64" , |
79 | cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare" ), |
80 | cl::ReallyHidden, |
81 | cl::init(Val: false)); |
82 | |
83 | // Leave all division operations as they are. This supersedes ExpandDiv64InIR |
84 | // and is used for testing the legalizer. |
85 | static cl::opt<bool> DisableIDivExpand( |
86 | "amdgpu-codegenprepare-disable-idiv-expansion" , |
87 | cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare" ), |
88 | cl::ReallyHidden, |
89 | cl::init(Val: false)); |
90 | |
91 | // Disable processing of fdiv so we can better test the backend implementations. |
92 | static cl::opt<bool> DisableFDivExpand( |
93 | "amdgpu-codegenprepare-disable-fdiv-expansion" , |
94 | cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare" ), |
95 | cl::ReallyHidden, |
96 | cl::init(Val: false)); |
97 | |
98 | class AMDGPUCodeGenPrepareImpl |
99 | : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { |
100 | public: |
101 | const GCNSubtarget *ST = nullptr; |
102 | const TargetLibraryInfo *TLInfo = nullptr; |
103 | AssumptionCache *AC = nullptr; |
104 | DominatorTree *DT = nullptr; |
105 | UniformityInfo *UA = nullptr; |
106 | Module *Mod = nullptr; |
107 | const DataLayout *DL = nullptr; |
108 | bool HasUnsafeFPMath = false; |
109 | bool HasFP32DenormalFlush = false; |
110 | bool FlowChanged = false; |
111 | mutable Function *SqrtF32 = nullptr; |
112 | mutable Function *LdexpF32 = nullptr; |
113 | |
114 | DenseMap<const PHINode *, bool> BreakPhiNodesCache; |
115 | |
116 | Function *getSqrtF32() const { |
117 | if (SqrtF32) |
118 | return SqrtF32; |
119 | |
120 | LLVMContext &Ctx = Mod->getContext(); |
121 | SqrtF32 = Intrinsic::getDeclaration(M: Mod, Intrinsic::id: amdgcn_sqrt, |
122 | Tys: {Type::getFloatTy(C&: Ctx)}); |
123 | return SqrtF32; |
124 | } |
125 | |
126 | Function *getLdexpF32() const { |
127 | if (LdexpF32) |
128 | return LdexpF32; |
129 | |
130 | LLVMContext &Ctx = Mod->getContext(); |
131 | LdexpF32 = Intrinsic::getDeclaration( |
132 | M: Mod, Intrinsic::id: ldexp, Tys: {Type::getFloatTy(C&: Ctx), Type::getInt32Ty(C&: Ctx)}); |
133 | return LdexpF32; |
134 | } |
135 | |
136 | bool canBreakPHINode(const PHINode &I); |
137 | |
  /// \returns \p T's base element bit width.
143 | unsigned getBaseElementBitWidth(const Type *T) const; |
144 | |
145 | /// \returns Equivalent 32 bit integer type for given type \p T. For example, |
146 | /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> |
147 | /// is returned. |
148 | Type *getI32Ty(IRBuilder<> &B, const Type *T) const; |
149 | |
150 | /// \returns True if binary operation \p I is a signed binary operation, false |
151 | /// otherwise. |
152 | bool isSigned(const BinaryOperator &I) const; |
153 | |
154 | /// \returns True if the condition of 'select' operation \p I comes from a |
155 | /// signed 'icmp' operation, false otherwise. |
156 | bool isSigned(const SelectInst &I) const; |
157 | |
158 | /// \returns True if type \p T needs to be promoted to 32 bit integer type, |
159 | /// false otherwise. |
160 | bool needsPromotionToI32(const Type *T) const; |
161 | |
162 | /// Return true if \p T is a legal scalar floating point type. |
163 | bool isLegalFloatingTy(const Type *T) const; |
164 | |
165 | /// Wrapper to pass all the arguments to computeKnownFPClass |
166 | KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested, |
167 | const Instruction *CtxI) const { |
168 | return llvm::computeKnownFPClass(V, DL: *DL, InterestedClasses: Interested, Depth: 0, TLI: TLInfo, AC, CxtI: CtxI, |
169 | DT); |
170 | } |
171 | |
172 | bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const { |
173 | return HasFP32DenormalFlush || |
174 | computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal(); |
175 | } |
176 | |
177 | /// Promotes uniform binary operation \p I to equivalent 32 bit binary |
178 | /// operation. |
179 | /// |
180 | /// \details \p I's base element bit width must be greater than 1 and less |
  /// than or equal to 16. Promotion is done by sign or zero extending operands to
182 | /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and |
183 | /// truncating the result of 32 bit binary operation back to \p I's original |
184 | /// type. Division operation is not promoted. |
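  ///
  /// For example (illustrative IR; wrap flags omitted), a uniform i16 add
  ///   %r = add i16 %a, %b
  /// becomes
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16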
185 | /// |
186 | /// \returns True if \p I is promoted to equivalent 32 bit binary operation, |
187 | /// false otherwise. |
188 | bool promoteUniformOpToI32(BinaryOperator &I) const; |
189 | |
190 | /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. |
191 | /// |
192 | /// \details \p I's base element bit width must be greater than 1 and less |
  /// than or equal to 16. Promotion is done by sign or zero extending operands to
194 | /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. |
195 | /// |
196 | /// \returns True. |
197 | bool promoteUniformOpToI32(ICmpInst &I) const; |
198 | |
199 | /// Promotes uniform 'select' operation \p I to 32 bit 'select' |
200 | /// operation. |
201 | /// |
202 | /// \details \p I's base element bit width must be greater than 1 and less |
  /// than or equal to 16. Promotion is done by sign or zero extending operands to
204 | /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the |
205 | /// result of 32 bit 'select' operation back to \p I's original type. |
206 | /// |
207 | /// \returns True. |
208 | bool promoteUniformOpToI32(SelectInst &I) const; |
209 | |
210 | /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' |
211 | /// intrinsic. |
212 | /// |
213 | /// \details \p I's base element bit width must be greater than 1 and less |
  /// than or equal to 16. Promotion is done by zero extending the operand to 32
215 | /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the |
216 | /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the |
217 | /// shift amount is 32 minus \p I's base element bit width), and truncating |
218 | /// the result of the shift operation back to \p I's original type. |
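  ///
  /// For example (illustrative IR), for an i16 operand
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// becomes
  ///   %x32 = zext i16 %x to i32
  ///   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  ///   %s   = lshr i32 %r32, 16
  ///   %r   = trunc i32 %s to i16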
219 | /// |
220 | /// \returns True. |
221 | bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; |
222 | |
  /// \returns The minimum number of bits needed to store the value of \p Op as
  /// an unsigned integer. Truncating to this size and then zero-extending to
  /// the original size will not change the value.
226 | unsigned numBitsUnsigned(Value *Op) const; |
227 | |
  /// \returns The minimum number of bits needed to store the value of \p Op as a
229 | /// signed integer. Truncating to this size and then sign-extending to |
230 | /// the original size will not change the value. |
231 | unsigned numBitsSigned(Value *Op) const; |
232 | |
233 | /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24. |
234 | /// SelectionDAG has an issue where an and asserting the bits are known |
235 | bool replaceMulWithMul24(BinaryOperator &I) const; |
236 | |
  /// Perform the same fold as the equivalently named function in DAGCombiner.
  /// Since we expand some divisions here, we need to perform this fold before
  /// the expansion obscures the pattern.
239 | bool foldBinOpIntoSelect(BinaryOperator &I) const; |
240 | |
241 | bool divHasSpecialOptimization(BinaryOperator &I, |
242 | Value *Num, Value *Den) const; |
243 | int getDivNumBits(BinaryOperator &I, |
244 | Value *Num, Value *Den, |
245 | unsigned AtLeast, bool Signed) const; |
246 | |
247 | /// Expands 24 bit div or rem. |
248 | Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, |
249 | Value *Num, Value *Den, |
250 | bool IsDiv, bool IsSigned) const; |
251 | |
252 | Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I, |
253 | Value *Num, Value *Den, unsigned NumBits, |
254 | bool IsDiv, bool IsSigned) const; |
255 | |
256 | /// Expands 32 bit div or rem. |
257 | Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, |
258 | Value *Num, Value *Den) const; |
259 | |
260 | Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I, |
261 | Value *Num, Value *Den) const; |
262 | void expandDivRem64(BinaryOperator &I) const; |
263 | |
  /// Widen a scalar load.
  ///
  /// \details Widen a scalar load for uniform, small-type loads from constant
  /// memory to a full 32 bits, then truncate the loaded value to allow a
  /// scalar load instead of a vector load.
  ///
270 | /// \returns True. |
271 | |
272 | bool canWidenScalarExtLoad(LoadInst &I) const; |
273 | |
274 | Value *matchFractPat(IntrinsicInst &I); |
275 | Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg); |
276 | |
277 | bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF, |
278 | FastMathFlags SqrtFMF) const; |
279 | |
280 | Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den, |
281 | FastMathFlags DivFMF, FastMathFlags SqrtFMF, |
282 | const Instruction *CtxI) const; |
283 | |
284 | Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den, |
285 | FastMathFlags FMF, const Instruction *CtxI) const; |
286 | Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den, |
287 | float ReqdAccuracy) const; |
288 | |
289 | Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den, |
290 | FastMathFlags DivFMF, FastMathFlags SqrtFMF, |
291 | Value *RsqOp, const Instruction *FDiv, |
292 | float ReqdAccuracy) const; |
293 | |
294 | std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder, |
295 | Value *Src) const; |
296 | |
297 | Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src, |
298 | bool IsNegative) const; |
299 | Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS, |
300 | FastMathFlags FMF) const; |
301 | Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, |
302 | FastMathFlags FMF) const; |
303 | |
304 | public: |
305 | bool visitFDiv(BinaryOperator &I); |
306 | |
307 | bool visitInstruction(Instruction &I) { return false; } |
308 | bool visitBinaryOperator(BinaryOperator &I); |
309 | bool visitLoadInst(LoadInst &I); |
310 | bool visitICmpInst(ICmpInst &I); |
311 | bool visitSelectInst(SelectInst &I); |
312 | bool visitPHINode(PHINode &I); |
313 | |
314 | bool visitIntrinsicInst(IntrinsicInst &I); |
315 | bool visitBitreverseIntrinsicInst(IntrinsicInst &I); |
316 | bool visitMinNum(IntrinsicInst &I); |
317 | bool visitSqrt(IntrinsicInst &I); |
318 | bool run(Function &F); |
319 | }; |
320 | |
321 | class AMDGPUCodeGenPrepare : public FunctionPass { |
322 | private: |
323 | AMDGPUCodeGenPrepareImpl Impl; |
324 | |
325 | public: |
326 | static char ID; |
327 | AMDGPUCodeGenPrepare() : FunctionPass(ID) { |
328 | initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry()); |
329 | } |
330 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
331 | AU.addRequired<AssumptionCacheTracker>(); |
332 | AU.addRequired<UniformityInfoWrapperPass>(); |
333 | AU.addRequired<TargetLibraryInfoWrapperPass>(); |
334 | |
335 | // FIXME: Division expansion needs to preserve the dominator tree. |
336 | if (!ExpandDiv64InIR) |
337 | AU.setPreservesAll(); |
338 | } |
339 | bool runOnFunction(Function &F) override; |
340 | bool doInitialization(Module &M) override; |
341 | StringRef getPassName() const override { return "AMDGPU IR optimizations" ; } |
342 | }; |
343 | |
344 | } // end anonymous namespace |
345 | |
346 | bool AMDGPUCodeGenPrepareImpl::run(Function &F) { |
347 | BreakPhiNodesCache.clear(); |
348 | bool MadeChange = false; |
349 | |
350 | Function::iterator NextBB; |
351 | for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { |
352 | BasicBlock *BB = &*FI; |
353 | NextBB = std::next(x: FI); |
354 | |
355 | BasicBlock::iterator Next; |
356 | for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; |
357 | I = Next) { |
358 | Next = std::next(x: I); |
359 | |
360 | MadeChange |= visit(I&: *I); |
361 | |
362 | if (Next != E) { // Control flow changed |
363 | BasicBlock *NextInstBB = Next->getParent(); |
364 | if (NextInstBB != BB) { |
365 | BB = NextInstBB; |
366 | E = BB->end(); |
367 | FE = F.end(); |
368 | } |
369 | } |
370 | } |
371 | } |
372 | return MadeChange; |
373 | } |
374 | |
375 | unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { |
376 | assert(needsPromotionToI32(T) && "T does not need promotion to i32" ); |
377 | |
378 | if (T->isIntegerTy()) |
379 | return T->getIntegerBitWidth(); |
380 | return cast<VectorType>(Val: T)->getElementType()->getIntegerBitWidth(); |
381 | } |
382 | |
383 | Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { |
384 | assert(needsPromotionToI32(T) && "T does not need promotion to i32" ); |
385 | |
386 | if (T->isIntegerTy()) |
387 | return B.getInt32Ty(); |
388 | return FixedVectorType::get(ElementType: B.getInt32Ty(), FVTy: cast<FixedVectorType>(Val: T)); |
389 | } |
390 | |
391 | bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { |
392 | return I.getOpcode() == Instruction::AShr || |
393 | I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; |
394 | } |
395 | |
396 | bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { |
397 | return isa<ICmpInst>(Val: I.getOperand(i_nocapture: 0)) ? |
398 | cast<ICmpInst>(Val: I.getOperand(i_nocapture: 0))->isSigned() : false; |
399 | } |
400 | |
401 | bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { |
402 | if (!Widen16BitOps) |
403 | return false; |
404 | |
405 | const IntegerType *IntTy = dyn_cast<IntegerType>(Val: T); |
406 | if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) |
407 | return true; |
408 | |
409 | if (const VectorType *VT = dyn_cast<VectorType>(Val: T)) { |
410 | // TODO: The set of packed operations is more limited, so may want to |
411 | // promote some anyway. |
412 | if (ST->hasVOP3PInsts()) |
413 | return false; |
414 | |
415 | return needsPromotionToI32(T: VT->getElementType()); |
416 | } |
417 | |
418 | return false; |
419 | } |
420 | |
421 | bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { |
422 | return Ty->isFloatTy() || Ty->isDoubleTy() || |
423 | (Ty->isHalfTy() && ST->has16BitInsts()); |
424 | } |
425 | |
426 | // Return true if the op promoted to i32 should have nsw set. |
427 | static bool promotedOpIsNSW(const Instruction &I) { |
428 | switch (I.getOpcode()) { |
429 | case Instruction::Shl: |
430 | case Instruction::Add: |
431 | case Instruction::Sub: |
432 | return true; |
433 | case Instruction::Mul: |
434 | return I.hasNoUnsignedWrap(); |
435 | default: |
436 | return false; |
437 | } |
438 | } |
439 | |
440 | // Return true if the op promoted to i32 should have nuw set. |
441 | static bool promotedOpIsNUW(const Instruction &I) { |
442 | switch (I.getOpcode()) { |
443 | case Instruction::Shl: |
444 | case Instruction::Add: |
445 | case Instruction::Mul: |
446 | return true; |
447 | case Instruction::Sub: |
448 | return I.hasNoUnsignedWrap(); |
449 | default: |
450 | return false; |
451 | } |
452 | } |
453 | |
454 | bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { |
455 | Type *Ty = I.getType(); |
456 | const DataLayout &DL = Mod->getDataLayout(); |
457 | int TySize = DL.getTypeSizeInBits(Ty); |
458 | Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty); |
459 | |
460 | return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(I: &I); |
461 | } |
462 | |
463 | bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { |
464 | assert(needsPromotionToI32(I.getType()) && |
465 | "I does not need promotion to i32" ); |
466 | |
467 | if (I.getOpcode() == Instruction::SDiv || |
468 | I.getOpcode() == Instruction::UDiv || |
469 | I.getOpcode() == Instruction::SRem || |
470 | I.getOpcode() == Instruction::URem) |
471 | return false; |
472 | |
473 | IRBuilder<> Builder(&I); |
474 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
475 | |
476 | Type *I32Ty = getI32Ty(B&: Builder, T: I.getType()); |
477 | Value *ExtOp0 = nullptr; |
478 | Value *ExtOp1 = nullptr; |
479 | Value *ExtRes = nullptr; |
480 | Value *TruncRes = nullptr; |
481 | |
482 | if (isSigned(I)) { |
483 | ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty); |
484 | ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
485 | } else { |
486 | ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty); |
487 | ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
488 | } |
489 | |
490 | ExtRes = Builder.CreateBinOp(Opc: I.getOpcode(), LHS: ExtOp0, RHS: ExtOp1); |
491 | if (Instruction *Inst = dyn_cast<Instruction>(Val: ExtRes)) { |
492 | if (promotedOpIsNSW(I: cast<Instruction>(Val&: I))) |
493 | Inst->setHasNoSignedWrap(); |
494 | |
495 | if (promotedOpIsNUW(I: cast<Instruction>(Val&: I))) |
496 | Inst->setHasNoUnsignedWrap(); |
497 | |
498 | if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(Val: &I)) |
499 | Inst->setIsExact(ExactOp->isExact()); |
500 | } |
501 | |
502 | TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType()); |
503 | |
504 | I.replaceAllUsesWith(V: TruncRes); |
505 | I.eraseFromParent(); |
506 | |
507 | return true; |
508 | } |
509 | |
510 | bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { |
511 | assert(needsPromotionToI32(I.getOperand(0)->getType()) && |
512 | "I does not need promotion to i32" ); |
513 | |
514 | IRBuilder<> Builder(&I); |
515 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
516 | |
517 | Type *I32Ty = getI32Ty(B&: Builder, T: I.getOperand(i_nocapture: 0)->getType()); |
518 | Value *ExtOp0 = nullptr; |
519 | Value *ExtOp1 = nullptr; |
520 | Value *NewICmp = nullptr; |
521 | |
522 | if (I.isSigned()) { |
523 | ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty); |
524 | ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
525 | } else { |
526 | ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty); |
527 | ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
528 | } |
529 | NewICmp = Builder.CreateICmp(P: I.getPredicate(), LHS: ExtOp0, RHS: ExtOp1); |
530 | |
531 | I.replaceAllUsesWith(V: NewICmp); |
532 | I.eraseFromParent(); |
533 | |
534 | return true; |
535 | } |
536 | |
537 | bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { |
538 | assert(needsPromotionToI32(I.getType()) && |
539 | "I does not need promotion to i32" ); |
540 | |
541 | IRBuilder<> Builder(&I); |
542 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
543 | |
544 | Type *I32Ty = getI32Ty(B&: Builder, T: I.getType()); |
545 | Value *ExtOp1 = nullptr; |
546 | Value *ExtOp2 = nullptr; |
547 | Value *ExtRes = nullptr; |
548 | Value *TruncRes = nullptr; |
549 | |
550 | if (isSigned(I)) { |
551 | ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
552 | ExtOp2 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty); |
553 | } else { |
554 | ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty); |
555 | ExtOp2 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty); |
556 | } |
557 | ExtRes = Builder.CreateSelect(C: I.getOperand(i_nocapture: 0), True: ExtOp1, False: ExtOp2); |
558 | TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType()); |
559 | |
560 | I.replaceAllUsesWith(V: TruncRes); |
561 | I.eraseFromParent(); |
562 | |
563 | return true; |
564 | } |
565 | |
566 | bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( |
567 | IntrinsicInst &I) const { |
568 | assert(I.getIntrinsicID() == Intrinsic::bitreverse && |
569 | "I must be bitreverse intrinsic" ); |
570 | assert(needsPromotionToI32(I.getType()) && |
571 | "I does not need promotion to i32" ); |
572 | |
573 | IRBuilder<> Builder(&I); |
574 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
575 | |
576 | Type *I32Ty = getI32Ty(B&: Builder, T: I.getType()); |
577 | Function *I32 = |
578 | Intrinsic::getDeclaration(M: Mod, Intrinsic::id: bitreverse, Tys: { I32Ty }); |
579 | Value *ExtOp = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty); |
580 | Value *ExtRes = Builder.CreateCall(Callee: I32, Args: { ExtOp }); |
581 | Value *LShrOp = |
582 | Builder.CreateLShr(LHS: ExtRes, RHS: 32 - getBaseElementBitWidth(T: I.getType())); |
583 | Value *TruncRes = |
584 | Builder.CreateTrunc(V: LShrOp, DestTy: I.getType()); |
585 | |
586 | I.replaceAllUsesWith(V: TruncRes); |
587 | I.eraseFromParent(); |
588 | |
589 | return true; |
590 | } |
591 | |
592 | unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { |
593 | return computeKnownBits(V: Op, DL: *DL, Depth: 0, AC).countMaxActiveBits(); |
594 | } |
595 | |
596 | unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const { |
597 | return ComputeMaxSignificantBits(Op, DL: *DL, Depth: 0, AC); |
598 | } |
599 | |
static void extractValues(IRBuilder<> &Builder,
601 | SmallVectorImpl<Value *> &Values, Value *V) { |
602 | auto *VT = dyn_cast<FixedVectorType>(Val: V->getType()); |
603 | if (!VT) { |
604 | Values.push_back(Elt: V); |
605 | return; |
606 | } |
607 | |
608 | for (int I = 0, E = VT->getNumElements(); I != E; ++I) |
609 | Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I)); |
610 | } |
611 | |
612 | static Value *insertValues(IRBuilder<> &Builder, |
613 | Type *Ty, |
614 | SmallVectorImpl<Value *> &Values) { |
615 | if (!Ty->isVectorTy()) { |
616 | assert(Values.size() == 1); |
617 | return Values[0]; |
618 | } |
619 | |
620 | Value *NewVal = PoisonValue::get(T: Ty); |
621 | for (int I = 0, E = Values.size(); I != E; ++I) |
622 | NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I); |
623 | |
624 | return NewVal; |
625 | } |
626 | |
627 | bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { |
628 | if (I.getOpcode() != Instruction::Mul) |
629 | return false; |
630 | |
631 | Type *Ty = I.getType(); |
632 | unsigned Size = Ty->getScalarSizeInBits(); |
633 | if (Size <= 16 && ST->has16BitInsts()) |
634 | return false; |
635 | |
636 | // Prefer scalar if this could be s_mul_i32 |
637 | if (UA->isUniform(I: &I)) |
638 | return false; |
639 | |
640 | Value *LHS = I.getOperand(i_nocapture: 0); |
641 | Value *RHS = I.getOperand(i_nocapture: 1); |
642 | IRBuilder<> Builder(&I); |
643 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
644 | |
645 | unsigned LHSBits = 0, RHSBits = 0; |
646 | bool IsSigned = false; |
647 | |
648 | if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS)) <= 24 && |
649 | (RHSBits = numBitsUnsigned(Op: RHS)) <= 24) { |
650 | IsSigned = false; |
651 | |
652 | } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(Op: LHS)) <= 24 && |
653 | (RHSBits = numBitsSigned(Op: RHS)) <= 24) { |
654 | IsSigned = true; |
655 | |
656 | } else |
657 | return false; |
658 | |
659 | SmallVector<Value *, 4> LHSVals; |
660 | SmallVector<Value *, 4> RHSVals; |
661 | SmallVector<Value *, 4> ResultVals; |
662 | extractValues(Builder, Values&: LHSVals, V: LHS); |
663 | extractValues(Builder, Values&: RHSVals, V: RHS); |
664 | |
665 | IntegerType *I32Ty = Builder.getInt32Ty(); |
666 | IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty; |
667 | Type *DstTy = LHSVals[0]->getType(); |
668 | |
669 | for (int I = 0, E = LHSVals.size(); I != E; ++I) { |
670 | Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty) |
671 | : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty); |
672 | Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty) |
673 | : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty); |
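    // The mul24 intrinsics only read the low 24 bits of each operand, so this
    // is safe only because we proved above that both operands fit in 24 bits.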
674 | Intrinsic::ID ID = |
675 | IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24; |
676 | Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS}); |
677 | Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy) |
678 | : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy); |
679 | ResultVals.push_back(Elt: Result); |
680 | } |
681 | |
682 | Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals); |
683 | NewVal->takeName(V: &I); |
684 | I.replaceAllUsesWith(V: NewVal); |
685 | I.eraseFromParent(); |
686 | |
687 | return true; |
688 | } |
689 | |
// Find a select instruction, which may have been cast. This is mostly to deal
691 | // with cases where i16 selects were promoted here to i32. |
692 | static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) { |
693 | Cast = nullptr; |
694 | if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V)) |
695 | return Sel; |
696 | |
697 | if ((Cast = dyn_cast<CastInst>(Val: V))) { |
698 | if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0))) |
699 | return Sel; |
700 | } |
701 | |
702 | return nullptr; |
703 | } |
704 | |
705 | bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { |
706 | // Don't do this unless the old select is going away. We want to eliminate the |
707 | // binary operator, not replace a binop with a select. |
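  //
  // For example (illustrative), a single-use select with constant arms feeding
  // a binop with a constant operand, such as
  //   %s = select i1 %c, i32 16, i32 32
  //   %r = udiv i32 1024, %s
  // can be folded so the division happens on the arms:
  //   %r = select i1 %c, i32 64, i32 32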
708 | int SelOpNo = 0; |
709 | |
710 | CastInst *CastOp; |
711 | |
712 | // TODO: Should probably try to handle some cases with multiple |
713 | // users. Duplicating the select may be profitable for division. |
714 | SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp); |
715 | if (!Sel || !Sel->hasOneUse()) { |
716 | SelOpNo = 1; |
717 | Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp); |
718 | } |
719 | |
720 | if (!Sel || !Sel->hasOneUse()) |
721 | return false; |
722 | |
723 | Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue()); |
724 | Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue()); |
725 | Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1)); |
726 | if (!CBO || !CT || !CF) |
727 | return false; |
728 | |
729 | if (CastOp) { |
730 | if (!CastOp->hasOneUse()) |
731 | return false; |
732 | CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL: *DL); |
733 | CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL: *DL); |
734 | } |
735 | |
736 | // TODO: Handle special 0/-1 cases DAG combine does, although we only really |
737 | // need to handle divisions here. |
738 | Constant *FoldedT = SelOpNo ? |
739 | ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL: *DL) : |
740 | ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL: *DL); |
741 | if (!FoldedT || isa<ConstantExpr>(Val: FoldedT)) |
742 | return false; |
743 | |
744 | Constant *FoldedF = SelOpNo ? |
745 | ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL: *DL) : |
746 | ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL: *DL); |
747 | if (!FoldedF || isa<ConstantExpr>(Val: FoldedF)) |
748 | return false; |
749 | |
750 | IRBuilder<> Builder(&BO); |
751 | Builder.SetCurrentDebugLocation(BO.getDebugLoc()); |
752 | if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO)) |
753 | Builder.setFastMathFlags(FPOp->getFastMathFlags()); |
754 | |
755 | Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(), |
756 | True: FoldedT, False: FoldedF); |
757 | NewSelect->takeName(V: &BO); |
758 | BO.replaceAllUsesWith(V: NewSelect); |
759 | BO.eraseFromParent(); |
760 | if (CastOp) |
761 | CastOp->eraseFromParent(); |
762 | Sel->eraseFromParent(); |
763 | return true; |
764 | } |
765 | |
766 | std::pair<Value *, Value *> |
767 | AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder, |
768 | Value *Src) const { |
769 | Type *Ty = Src->getType(); |
770 | Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp, |
771 | {Ty, Builder.getInt32Ty()}, Src); |
772 | Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0}); |
773 | |
774 | // Bypass the bug workaround for the exponent result since it doesn't matter. |
775 | // TODO: Does the bug workaround even really need to consider the exponent |
776 | // result? It's unspecified by the spec. |
777 | |
778 | Value *FrexpExp = |
779 | ST->hasFractBug() |
780 | ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp, |
781 | {Builder.getInt32Ty(), Ty}, Src) |
782 | : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1}); |
783 | return {FrexpMant, FrexpExp}; |
784 | } |
785 | |
786 | /// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals. |
787 | Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder, |
788 | Value *Src, |
789 | bool IsNegative) const { |
790 | // Same as for 1.0, but expand the sign out of the constant. |
791 | // -1.0 / x -> rcp (fneg x) |
792 | if (IsNegative) |
793 | Src = Builder.CreateFNeg(V: Src); |
794 | |
795 | // The rcp instruction doesn't support denormals, so scale the input |
796 | // out of the denormal range and convert at the end. |
797 | // |
798 | // Expand as 2^-n * (1.0 / (x * 2^n)) |
799 | |
800 | // TODO: Skip scaling if input is known never denormal and the input |
801 | // range won't underflow to denormal. The hard part is knowing the |
802 | // result. We need a range check, the result could be denormal for |
803 | // 0x1p+126 < den <= 0x1p+127. |
804 | auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src); |
805 | Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp); |
806 | Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: FrexpMant); |
807 | return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor}); |
808 | } |
809 | |
810 | /// Emit a 2ulp expansion for fdiv by using frexp for input scaling. |
811 | Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, |
812 | Value *RHS, |
813 | FastMathFlags FMF) const { |
  // If we have to work around the fract/frexp bug, we're worse off than
815 | // using the fdiv.fast expansion. The full safe expansion is faster if we have |
816 | // fast FMA. |
817 | if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() && |
818 | (!FMF.noNaNs() || !FMF.noInfs())) |
819 | return nullptr; |
820 | |
821 | // We're scaling the LHS to avoid a denormal input, and scale the denominator |
822 | // to avoid large values underflowing the result. |
823 | auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS); |
824 | |
825 | Value *Rcp = |
826 | Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: FrexpMantRHS); |
827 | |
828 | auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS); |
829 | Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp); |
830 | |
831 | // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the |
832 | // result. |
833 | Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS); |
834 | return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff}); |
835 | } |
836 | |
837 | /// Emit a sqrt that handles denormals and is accurate to 2ulp. |
838 | Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder, |
839 | Value *Src, |
840 | FastMathFlags FMF) const { |
841 | Type *Ty = Src->getType(); |
842 | APFloat SmallestNormal = |
843 | APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics()); |
844 | Value *NeedScale = |
845 | Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal)); |
846 | |
847 | ConstantInt *Zero = Builder.getInt32(C: 0); |
848 | Value *InputScaleFactor = |
849 | Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero); |
850 | |
851 | Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor}); |
852 | |
853 | Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled); |
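  // sqrt(x * 2^32) == sqrt(x) * 2^16, so the ldexp by -16 below undoes the
  // input scaling.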
854 | |
855 | Value *OutputScaleFactor = |
856 | Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero); |
857 | return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor}); |
858 | } |
859 | |
860 | /// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals. |
861 | static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, |
862 | bool IsNegative) { |
863 | // bool need_scale = x < 0x1p-126f; |
864 | // float input_scale = need_scale ? 0x1.0p+24f : 1.0f; |
865 | // float output_scale = need_scale ? 0x1.0p+12f : 1.0f; |
866 | // rsq(x * input_scale) * output_scale; |
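  //
  // This works because rsq(x * 2^24) == rsq(x) * 2^-12, so multiplying the
  // result by 2^12 restores the unscaled value.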
867 | |
868 | Type *Ty = Src->getType(); |
869 | APFloat SmallestNormal = |
870 | APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics()); |
871 | Value *NeedScale = |
872 | Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal)); |
873 | Constant *One = ConstantFP::get(Ty, V: 1.0); |
874 | Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24); |
875 | Constant *OutputScale = |
876 | ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12); |
877 | |
878 | Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One); |
879 | |
880 | Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor); |
881 | Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rsq, V: ScaledInput); |
882 | Value *OutputScaleFactor = Builder.CreateSelect( |
883 | C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One); |
884 | |
885 | return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor); |
886 | } |
887 | |
888 | bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp, |
889 | FastMathFlags DivFMF, |
890 | FastMathFlags SqrtFMF) const { |
891 | // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. |
892 | if (!DivFMF.allowContract() || !SqrtFMF.allowContract()) |
893 | return false; |
894 | |
895 | // v_rsq_f32 gives 1ulp |
896 | return SqrtFMF.approxFunc() || HasUnsafeFPMath || |
897 | SqrtOp->getFPAccuracy() >= 1.0f; |
898 | } |
899 | |
900 | Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq( |
901 | IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF, |
902 | const FastMathFlags SqrtFMF, const Instruction *CtxI) const { |
903 | // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp. |
904 | assert(DivFMF.allowContract() && SqrtFMF.allowContract()); |
905 | |
906 | // rsq_f16 is accurate to 0.51 ulp. |
907 | // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. |
908 | // rsq_f64 is never accurate. |
909 | const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num); |
910 | if (!CLHS) |
911 | return nullptr; |
912 | |
913 | assert(Den->getType()->isFloatTy()); |
914 | |
915 | bool IsNegative = false; |
916 | |
917 | // TODO: Handle other numerator values with arcp. |
918 | if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) { |
919 | // Add in the sqrt flags. |
920 | IRBuilder<>::FastMathFlagGuard Guard(Builder); |
921 | Builder.setFastMathFlags(DivFMF | SqrtFMF); |
922 | |
923 | if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath || |
924 | canIgnoreDenormalInput(V: Den, CtxI)) { |
925 | Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rsq, V: Den); |
926 | // -1.0 / sqrt(x) -> fneg(rsq(x)) |
927 | return IsNegative ? Builder.CreateFNeg(V: Result) : Result; |
928 | } |
929 | |
930 | return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative); |
931 | } |
932 | |
933 | return nullptr; |
934 | } |
935 | |
936 | // Optimize fdiv with rcp: |
937 | // |
938 | // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is |
939 | // allowed with unsafe-fp-math or afn. |
940 | // |
// a/b -> a*rcp(b) when arcp is allowed, and we only need to provide ULP 1.0
942 | Value * |
943 | AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num, |
944 | Value *Den, FastMathFlags FMF, |
945 | const Instruction *CtxI) const { |
946 | // rcp_f16 is accurate to 0.51 ulp. |
947 | // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed. |
948 | // rcp_f64 is never accurate. |
949 | assert(Den->getType()->isFloatTy()); |
950 | |
951 | if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) { |
952 | bool IsNegative = false; |
953 | if (CLHS->isExactlyValue(V: 1.0) || |
954 | (IsNegative = CLHS->isExactlyValue(V: -1.0))) { |
955 | Value *Src = Den; |
956 | |
957 | if (HasFP32DenormalFlush || FMF.approxFunc()) { |
958 | // -1.0 / x -> 1.0 / fneg(x) |
959 | if (IsNegative) |
960 | Src = Builder.CreateFNeg(V: Src); |
961 | |
        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation have a worst case error of 1 ulp.
964 | // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK |
965 | // to use it as long as we aren't trying to use denormals. |
966 | // |
967 | // v_rcp_f16 and v_rsq_f16 DO support denormals. |
968 | |
969 | // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't |
970 | // insert rsq intrinsic here. |
971 | |
972 | // 1.0 / x -> rcp(x) |
973 | return Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: Src); |
974 | } |
975 | |
976 | // TODO: If the input isn't denormal, and we know the input exponent isn't |
977 | // big enough to introduce a denormal we can avoid the scaling. |
978 | return emitRcpIEEE1ULP(Builder, Src, IsNegative); |
979 | } |
980 | } |
981 | |
982 | if (FMF.allowReciprocal()) { |
983 | // x / y -> x * (1.0 / y) |
984 | |
985 | // TODO: Could avoid denormal scaling and use raw rcp if we knew the output |
986 | // will never underflow. |
987 | if (HasFP32DenormalFlush || FMF.approxFunc()) { |
988 | Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: Den); |
989 | return Builder.CreateFMul(L: Num, R: Recip); |
990 | } |
991 | |
992 | Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false); |
993 | return Builder.CreateFMul(L: Num, R: Recip); |
994 | } |
995 | |
996 | return nullptr; |
997 | } |
998 | |
999 | // optimize with fdiv.fast: |
1000 | // |
1001 | // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. |
1002 | // |
1003 | // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. |
1004 | // |
1005 | // NOTE: optimizeWithRcp should be tried first because rcp is the preference. |
1006 | Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( |
1007 | IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const { |
1008 | // fdiv.fast can achieve 2.5 ULP accuracy. |
1009 | if (ReqdAccuracy < 2.5f) |
1010 | return nullptr; |
1011 | |
1012 | // Only have fdiv.fast for f32. |
1013 | assert(Den->getType()->isFloatTy()); |
1014 | |
1015 | bool NumIsOne = false; |
1016 | if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) { |
1017 | if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0)) |
1018 | NumIsOne = true; |
1019 | } |
1020 | |
  // fdiv.fast does not support denormals, but using it for 1.0/x is always fine.
1022 | // |
1023 | // TODO: This works for any value with a specific known exponent range, don't |
1024 | // just limit to constant 1. |
1025 | if (!HasFP32DenormalFlush && !NumIsOne) |
1026 | return nullptr; |
1027 | |
1028 | return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); |
1029 | } |
1030 | |
1031 | Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( |
1032 | IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF, |
1033 | FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst, |
1034 | float ReqdDivAccuracy) const { |
1035 | if (RsqOp) { |
1036 | Value *Rsq = |
1037 | optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst); |
1038 | if (Rsq) |
1039 | return Rsq; |
1040 | } |
1041 | |
1042 | Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst); |
1043 | if (Rcp) |
1044 | return Rcp; |
1045 | |
1046 | // In the basic case fdiv_fast has the same instruction count as the frexp div |
1047 | // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can |
1048 | // potentially be fused into a user. Also, materialization of the constants |
1049 | // can be reused for multiple instances. |
1050 | Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy); |
1051 | if (FDivFast) |
1052 | return FDivFast; |
1053 | |
1054 | return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF); |
1055 | } |
1056 | |
// Optimization is performed based on fpmath, fast math flags, and the denormal
// mode to expand fdiv with either rcp or fdiv.fast.
1059 | // |
1060 | // With rcp: |
1061 | // 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is |
1062 | // allowed with unsafe-fp-math or afn. |
1063 | // |
1064 | // a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn. |
1065 | // |
1066 | // With fdiv.fast: |
1067 | // a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed. |
1068 | // |
1069 | // 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp. |
1070 | // |
1071 | // NOTE: rcp is the preference in cases that both are legal. |
1072 | bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { |
1073 | if (DisableFDivExpand) |
1074 | return false; |
1075 | |
1076 | Type *Ty = FDiv.getType()->getScalarType(); |
1077 | if (!Ty->isFloatTy()) |
1078 | return false; |
1079 | |
1080 | // The f64 rcp/rsq approximations are pretty inaccurate. We can do an |
1081 | // expansion around them in codegen. f16 is good enough to always use. |
1082 | |
1083 | const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv); |
1084 | const FastMathFlags DivFMF = FPOp->getFastMathFlags(); |
1085 | const float ReqdAccuracy = FPOp->getFPAccuracy(); |
1086 | |
1087 | FastMathFlags SqrtFMF; |
1088 | |
1089 | Value *Num = FDiv.getOperand(i_nocapture: 0); |
1090 | Value *Den = FDiv.getOperand(i_nocapture: 1); |
1091 | |
1092 | Value *RsqOp = nullptr; |
1093 | auto *DenII = dyn_cast<IntrinsicInst>(Val: Den); |
1094 | if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt && |
1095 | DenII->hasOneUse()) { |
1096 | const auto *SqrtOp = cast<FPMathOperator>(Val: DenII); |
1097 | SqrtFMF = SqrtOp->getFastMathFlags(); |
1098 | if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF)) |
1099 | RsqOp = SqrtOp->getOperand(i: 0); |
1100 | } |
1101 | |
1102 | // Inaccurate rcp is allowed with unsafe-fp-math or afn. |
1103 | // |
1104 | // Defer to codegen to handle this. |
1105 | // |
1106 | // TODO: Decide on an interpretation for interactions between afn + arcp + |
1107 | // !fpmath, and make it consistent between here and codegen. For now, defer |
1108 | // expansion of afn to codegen. The current interpretation is so aggressive we |
1109 | // don't need any pre-consideration here when we have better information. A |
1110 | // more conservative interpretation could use handling here. |
1111 | const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc(); |
1112 | if (!RsqOp && AllowInaccurateRcp) |
1113 | return false; |
1114 | |
1115 | // Defer the correct implementations to codegen. |
1116 | if (ReqdAccuracy < 1.0f) |
1117 | return false; |
1118 | |
1119 | IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator())); |
1120 | Builder.setFastMathFlags(DivFMF); |
1121 | Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); |
1122 | |
1123 | SmallVector<Value *, 4> NumVals; |
1124 | SmallVector<Value *, 4> DenVals; |
1125 | SmallVector<Value *, 4> RsqDenVals; |
1126 | extractValues(Builder, Values&: NumVals, V: Num); |
1127 | extractValues(Builder, Values&: DenVals, V: Den); |
1128 | |
1129 | if (RsqOp) |
1130 | extractValues(Builder, Values&: RsqDenVals, V: RsqOp); |
1131 | |
1132 | SmallVector<Value *, 4> ResultVals(NumVals.size()); |
1133 | for (int I = 0, E = NumVals.size(); I != E; ++I) { |
1134 | Value *NumElt = NumVals[I]; |
1135 | Value *DenElt = DenVals[I]; |
1136 | Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr; |
1137 | |
1138 | Value *NewElt = |
1139 | visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt, |
1140 | FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy); |
1141 | if (!NewElt) { |
1142 | // Keep the original, but scalarized. |
1143 | |
1144 | // This has the unfortunate side effect of sometimes scalarizing when |
1145 | // we're not going to do anything. |
1146 | NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt); |
1147 | if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt)) |
1148 | NewEltInst->copyMetadata(SrcInst: FDiv); |
1149 | } |
1150 | |
1151 | ResultVals[I] = NewElt; |
1152 | } |
1153 | |
1154 | Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals); |
1155 | |
1156 | if (NewVal) { |
1157 | FDiv.replaceAllUsesWith(V: NewVal); |
1158 | NewVal->takeName(V: &FDiv); |
1159 | RecursivelyDeleteTriviallyDeadInstructions(V: &FDiv, TLI: TLInfo); |
1160 | } |
1161 | |
1162 | return true; |
1163 | } |
1164 | |
1165 | static bool hasUnsafeFPMath(const Function &F) { |
1166 | Attribute Attr = F.getFnAttribute(Kind: "unsafe-fp-math" ); |
1167 | return Attr.getValueAsBool(); |
1168 | } |
1169 | |
1170 | static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, |
1171 | Value *LHS, Value *RHS) { |
1172 | Type *I32Ty = Builder.getInt32Ty(); |
1173 | Type *I64Ty = Builder.getInt64Ty(); |
1174 | |
1175 | Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty); |
1176 | Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty); |
1177 | Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64); |
1178 | Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty); |
1179 | Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32)); |
1180 | Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty); |
1181 | return std::pair(Lo, Hi); |
1182 | } |
1183 | |
1184 | static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { |
1185 | return getMul64(Builder, LHS, RHS).second; |
1186 | } |
1187 | |
1188 | /// Figure out how many bits are really needed for this division. \p AtLeast is |
/// an optimization hint to bypass the second ComputeNumSignBits call if the
1190 | /// first one is insufficient. Returns -1 on failure. |
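/// For example, for a 32-bit sdiv where both operands have at least 25 sign
/// bits, 32 - 25 + 1 = 8 bits are sufficient, so the 24-bit expansion applies.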
1191 | int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num, |
1192 | Value *Den, unsigned AtLeast, |
1193 | bool IsSigned) const { |
1194 | const DataLayout &DL = Mod->getDataLayout(); |
1195 | unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL, Depth: 0, AC, CxtI: &I); |
1196 | if (LHSSignBits < AtLeast) |
1197 | return -1; |
1198 | |
1199 | unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL, Depth: 0, AC, CxtI: &I); |
1200 | if (RHSSignBits < AtLeast) |
1201 | return -1; |
1202 | |
1203 | unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits); |
1204 | unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits; |
1205 | if (IsSigned) |
1206 | ++DivBits; |
1207 | return DivBits; |
1208 | } |
1209 | |
1210 | // The fractional part of a float is enough to accurately represent up to |
1211 | // a 24-bit signed integer. |
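// (An f32 significand provides 24 bits including the implicit leading one, so
// every integer of magnitude up to 2^24 is exactly representable.)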
1212 | Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder, |
1213 | BinaryOperator &I, Value *Num, |
1214 | Value *Den, bool IsDiv, |
1215 | bool IsSigned) const { |
1216 | unsigned SSBits = Num->getType()->getScalarSizeInBits(); |
1217 | // If Num bits <= 24, assume 0 signbits. |
1218 | unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned); |
1219 | int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned); |
1220 | if (DivBits == -1) |
1221 | return nullptr; |
1222 | return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned); |
1223 | } |
1224 | |
1225 | Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( |
1226 | IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den, |
1227 | unsigned DivBits, bool IsDiv, bool IsSigned) const { |
1228 | Type *I32Ty = Builder.getInt32Ty(); |
1229 | Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty); |
1230 | Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty); |
1231 | |
1232 | Type *F32Ty = Builder.getFloatTy(); |
1233 | ConstantInt *One = Builder.getInt32(C: 1); |
1234 | Value *JQ = One; |
1235 | |
1236 | if (IsSigned) { |
1237 | // char|short jq = ia ^ ib; |
1238 | JQ = Builder.CreateXor(LHS: Num, RHS: Den); |
1239 | |
1240 | // jq = jq >> (bitsize - 2) |
1241 | JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30)); |
1242 | |
1243 | // jq = jq | 0x1 |
1244 | JQ = Builder.CreateOr(LHS: JQ, RHS: One); |
1245 | } |
1246 | |
1247 | // int ia = (int)LHS; |
1248 | Value *IA = Num; |
1249 | |
  // int ib = (int)RHS;
1251 | Value *IB = Den; |
1252 | |
1253 | // float fa = (float)ia; |
1254 | Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty) |
1255 | : Builder.CreateUIToFP(V: IA, DestTy: F32Ty); |
1256 | |
1257 | // float fb = (float)ib; |
1258 | Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty) |
1259 | : Builder.CreateUIToFP(V: IB,DestTy: F32Ty); |
1260 | |
1261 | Function *RcpDecl = Intrinsic::getDeclaration(M: Mod, Intrinsic::id: amdgcn_rcp, |
1262 | Tys: Builder.getFloatTy()); |
1263 | Value *RCP = Builder.CreateCall(Callee: RcpDecl, Args: { FB }); |
1264 | Value *FQM = Builder.CreateFMul(L: FA, R: RCP); |
1265 | |
1266 | // fq = trunc(fqm); |
1267 | CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::ID: trunc, V: FQM); |
1268 | FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags()); |
1269 | |
1270 | // float fqneg = -fq; |
1271 | Value *FQNeg = Builder.CreateFNeg(V: FQ); |
1272 | |
1273 | // float fr = mad(fqneg, fb, fa); |
1274 | auto FMAD = !ST->hasMadMacF32Insts() |
1275 | ? Intrinsic::fma |
1276 | : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; |
1277 | Value *FR = Builder.CreateIntrinsic(FMAD, |
1278 | {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); |
1279 | |
1280 | // int iq = (int)fq; |
1281 | Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty) |
1282 | : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty); |
1283 | |
1284 | // fr = fabs(fr); |
1285 | FR = Builder.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: FR, FMFSource: FQ); |
1286 | |
1287 | // fb = fabs(fb); |
1288 | FB = Builder.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: FB, FMFSource: FQ); |
1289 | |
1290 | // int cv = fr >= fb; |
1291 | Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB); |
1292 | |
1293 | // jq = (cv ? jq : 0); |
1294 | JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0)); |
1295 | |
1296 | // dst = iq + jq; |
1297 | Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ); |
1298 | |
1299 | Value *Res = Div; |
1300 | if (!IsDiv) { |
1301 | // Rem needs compensation, it's easier to recompute it |
1302 | Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den); |
1303 | Res = Builder.CreateSub(LHS: Num, RHS: Rem); |
1304 | } |
1305 | |
1306 | if (DivBits != 0 && DivBits < 32) { |
1307 | // Extend in register from the number of bits this divide really is. |
1308 | if (IsSigned) { |
1309 | int InRegBits = 32 - DivBits; |
1310 | |
1311 | Res = Builder.CreateShl(LHS: Res, RHS: InRegBits); |
1312 | Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits); |
1313 | } else { |
1314 | ConstantInt *TruncMask |
1315 | = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1); |
1316 | Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask); |
1317 | } |
1318 | } |
1319 | |
1320 | return Res; |
1321 | } |
1322 | |
// Try to recognize special cases for which the DAG will emit better expansions
// than the general expansion we do here.
1325 | |
1326 | // TODO: It would be better to just directly handle those optimizations here. |
1327 | bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I, |
1328 | Value *Num, |
1329 | Value *Den) const { |
1330 | if (Constant *C = dyn_cast<Constant>(Val: Den)) { |
1331 | // Arbitrary constants get a better expansion as long as a wider mulhi is |
1332 | // legal. |
1333 | if (C->getType()->getScalarSizeInBits() <= 32) |
1334 | return true; |
1335 | |
1336 | // TODO: Sdiv check for not exact for some reason. |
1337 | |
1338 | // If there's no wider mulhi, there's only a better expansion for powers of |
1339 | // two. |
1340 | // TODO: Should really know for each vector element. |
1341 | if (isKnownToBeAPowerOfTwo(V: C, DL: *DL, OrZero: true, Depth: 0, AC, CxtI: &I, DT)) |
1342 | return true; |
1343 | |
1344 | return false; |
1345 | } |
1346 | |
1347 | if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) { |
1348 | // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 |
1349 | if (BinOpDen->getOpcode() == Instruction::Shl && |
1350 | isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) && |
1351 | isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), DL: *DL, OrZero: true, |
1352 | Depth: 0, AC, CxtI: &I, DT)) { |
1353 | return true; |
1354 | } |
1355 | } |
1356 | |
1357 | return false; |
1358 | } |
1359 | |
1360 | static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) { |
1361 | // Check whether the sign can be determined statically. |
1362 | KnownBits Known = computeKnownBits(V, DL: *DL); |
1363 | if (Known.isNegative()) |
1364 | return Constant::getAllOnesValue(Ty: V->getType()); |
1365 | if (Known.isNonNegative()) |
1366 | return Constant::getNullValue(Ty: V->getType()); |
1367 | return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31)); |
1368 | } |
1369 | |
1370 | Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, |
1371 | BinaryOperator &I, Value *X, |
1372 | Value *Y) const { |
1373 | Instruction::BinaryOps Opc = I.getOpcode(); |
1374 | assert(Opc == Instruction::URem || Opc == Instruction::UDiv || |
1375 | Opc == Instruction::SRem || Opc == Instruction::SDiv); |
1376 | |
1377 | FastMathFlags FMF; |
1378 | FMF.setFast(); |
1379 | Builder.setFastMathFlags(FMF); |
1380 | |
1381 | if (divHasSpecialOptimization(I, Num: X, Den: Y)) |
1382 | return nullptr; // Keep it for later optimization. |
1383 | |
1384 | bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; |
1385 | bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; |
1386 | |
1387 | Type *Ty = X->getType(); |
1388 | Type *I32Ty = Builder.getInt32Ty(); |
1389 | Type *F32Ty = Builder.getFloatTy(); |
1390 | |
1391 | if (Ty->getScalarSizeInBits() != 32) { |
1392 | if (IsSigned) { |
1393 | X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty); |
1394 | Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty); |
1395 | } else { |
1396 | X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty); |
1397 | Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty); |
1398 | } |
1399 | } |
1400 | |
1401 | if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) { |
1402 | return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) : |
1403 | Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty); |
1404 | } |
1405 | |
1406 | ConstantInt *Zero = Builder.getInt32(C: 0); |
1407 | ConstantInt *One = Builder.getInt32(C: 1); |
1408 | |
1409 | Value *Sign = nullptr; |
1410 | if (IsSigned) { |
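// Take absolute values of the operands using the identity
// abs(v) == (v + sign) ^ sign, where sign == v >> 31 (arithmetic). For
// example, v == -5 gives sign == -1 and (-5 + -1) ^ -1 == 5. The result's
// sign is restored after the unsigned expansion below.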
1411 | Value *SignX = getSign32(V: X, Builder, DL); |
1412 | Value *SignY = getSign32(V: Y, Builder, DL); |
1413 | // Remainder sign is the same as LHS |
1414 | Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX; |
1415 | |
1416 | X = Builder.CreateAdd(LHS: X, RHS: SignX); |
1417 | Y = Builder.CreateAdd(LHS: Y, RHS: SignY); |
1418 | |
1419 | X = Builder.CreateXor(LHS: X, RHS: SignX); |
1420 | Y = Builder.CreateXor(LHS: Y, RHS: SignY); |
1421 | } |
1422 | |
1423 | // The algorithm here is based on ideas from "Software Integer Division", Tom |
1424 | // Rodeheffer, August 2008. |
1425 | // |
1426 | // unsigned udiv(unsigned x, unsigned y) { |
1427 | // // Initial estimate of inv(y). The constant is less than 2^32 to ensure |
1428 | // // that this is a lower bound on inv(y), even if some of the calculations |
1429 | // // round up. |
1430 | // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y)); |
1431 | // |
1432 | // // One round of UNR (Unsigned integer Newton-Raphson) to improve z. |
1433 | // // Empirically this is guaranteed to give a "two-y" lower bound on |
1434 | // // inv(y). |
1435 | // z += umulh(z, -y * z); |
1436 | // |
1437 | // // Quotient/remainder estimate. |
1438 | // unsigned q = umulh(x, z); |
1439 | // unsigned r = x - q * y; |
1440 | // |
1441 | // // Two rounds of quotient/remainder refinement. |
1442 | // if (r >= y) { |
1443 | // ++q; |
1444 | // r -= y; |
1445 | // } |
1446 | // if (r >= y) { |
1447 | // ++q; |
1448 | // r -= y; |
1449 | // } |
1450 | // |
1451 | // return q; |
1452 | // } |
1453 | |
1454 | // Initial estimate of inv(y). |
1455 | Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty); |
Function *Rcp = Intrinsic::getDeclaration(M: Mod, id: Intrinsic::amdgcn_rcp, Tys: F32Ty);
1457 | Value *RcpY = Builder.CreateCall(Callee: Rcp, Args: {FloatY}); |
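// 0x4F7FFFFE is the f32 value 4294966784.0 == 2^32 - 512, i.e. the
// (4294967296.0 - 512.0) scale factor from the pseudocode above.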
1458 | Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE)); |
1459 | Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale); |
1460 | Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty); |
1461 | |
1462 | // One round of UNR. |
1463 | Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y); |
1464 | Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z); |
1465 | Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ)); |
1466 | |
1467 | // Quotient/remainder estimate. |
1468 | Value *Q = getMulHu(Builder, LHS: X, RHS: Z); |
1469 | Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y)); |
1470 | |
1471 | // First quotient/remainder refinement. |
1472 | Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y); |
1473 | if (IsDiv) |
1474 | Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q); |
1475 | R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R); |
1476 | |
1477 | // Second quotient/remainder refinement. |
1478 | Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y); |
1479 | Value *Res; |
1480 | if (IsDiv) |
1481 | Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q); |
1482 | else |
1483 | Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R); |
1484 | |
1485 | if (IsSigned) { |
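// (Res ^ Sign) - Sign undoes the conditional negation: it negates Res when
// Sign is -1 and leaves it unchanged when Sign is 0.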
1486 | Res = Builder.CreateXor(LHS: Res, RHS: Sign); |
1487 | Res = Builder.CreateSub(LHS: Res, RHS: Sign); |
1488 | Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty); |
1489 | } else { |
1490 | Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty); |
1491 | } |
1492 | return Res; |
1493 | } |
1494 | |
1495 | Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder, |
1496 | BinaryOperator &I, Value *Num, |
1497 | Value *Den) const { |
1498 | if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den)) |
1499 | return nullptr; // Keep it for later optimization. |
1500 | |
1501 | Instruction::BinaryOps Opc = I.getOpcode(); |
1502 | |
1503 | bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv; |
1504 | bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem; |
1505 | |
1506 | int NumDivBits = getDivNumBits(I, Num, Den, AtLeast: 32, IsSigned); |
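// -1 means the operands could not be shown to be narrow enough to shrink
// the 64-bit divide.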
1507 | if (NumDivBits == -1) |
1508 | return nullptr; |
1509 | |
1510 | Value *Narrowed = nullptr; |
1511 | if (NumDivBits <= 24) { |
1512 | Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits, |
1513 | IsDiv, IsSigned); |
1514 | } else if (NumDivBits <= 32) { |
1515 | Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den); |
1516 | } |
1517 | |
1518 | if (Narrowed) { |
1519 | return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) : |
1520 | Builder.CreateZExt(V: Narrowed, DestTy: Num->getType()); |
1521 | } |
1522 | |
1523 | return nullptr; |
1524 | } |
1525 | |
1526 | void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const { |
1527 | Instruction::BinaryOps Opc = I.getOpcode(); |
1528 | // Do the general expansion. |
1529 | if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) { |
1530 | expandDivisionUpTo64Bits(Div: &I); |
1531 | return; |
1532 | } |
1533 | |
1534 | if (Opc == Instruction::URem || Opc == Instruction::SRem) { |
1535 | expandRemainderUpTo64Bits(Rem: &I); |
1536 | return; |
1537 | } |
1538 | |
llvm_unreachable("not a division");
1540 | } |
1541 | |
1542 | bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { |
1543 | if (foldBinOpIntoSelect(BO&: I)) |
1544 | return true; |
1545 | |
1546 | if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType()) && |
1547 | UA->isUniform(I: &I) && promoteUniformOpToI32(I)) |
1548 | return true; |
1549 | |
1550 | if (UseMul24Intrin && replaceMulWithMul24(I)) |
1551 | return true; |
1552 | |
1553 | bool Changed = false; |
1554 | Instruction::BinaryOps Opc = I.getOpcode(); |
1555 | Type *Ty = I.getType(); |
1556 | Value *NewDiv = nullptr; |
1557 | unsigned ScalarSize = Ty->getScalarSizeInBits(); |
1558 | |
1559 | SmallVector<BinaryOperator *, 8> Div64ToExpand; |
1560 | |
1561 | if ((Opc == Instruction::URem || Opc == Instruction::UDiv || |
1562 | Opc == Instruction::SRem || Opc == Instruction::SDiv) && |
1563 | ScalarSize <= 64 && |
1564 | !DisableIDivExpand) { |
1565 | Value *Num = I.getOperand(i_nocapture: 0); |
1566 | Value *Den = I.getOperand(i_nocapture: 1); |
1567 | IRBuilder<> Builder(&I); |
1568 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
1569 | |
1570 | if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) { |
1571 | NewDiv = PoisonValue::get(T: VT); |
1572 | |
1573 | for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { |
1574 | Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N); |
1575 | Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N); |
1576 | |
1577 | Value *NewElt; |
1578 | if (ScalarSize <= 32) { |
1579 | NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN); |
1580 | if (!NewElt) |
1581 | NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN); |
1582 | } else { |
1583 | // See if this 64-bit division can be shrunk to 32/24-bits before |
1584 | // producing the general expansion. |
1585 | NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN); |
1586 | if (!NewElt) { |
1587 | // The general 64-bit expansion introduces control flow and doesn't |
1588 | // return the new value. Just insert a scalar copy and defer |
1589 | // expanding it. |
1590 | NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN); |
1591 | Div64ToExpand.push_back(Elt: cast<BinaryOperator>(Val: NewElt)); |
1592 | } |
1593 | } |
1594 | |
1595 | NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N); |
1596 | } |
1597 | } else { |
1598 | if (ScalarSize <= 32) |
1599 | NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den); |
1600 | else { |
1601 | NewDiv = shrinkDivRem64(Builder, I, Num, Den); |
1602 | if (!NewDiv) |
1603 | Div64ToExpand.push_back(Elt: &I); |
1604 | } |
1605 | } |
1606 | |
1607 | if (NewDiv) { |
1608 | I.replaceAllUsesWith(V: NewDiv); |
1609 | I.eraseFromParent(); |
1610 | Changed = true; |
1611 | } |
1612 | } |
1613 | |
1614 | if (ExpandDiv64InIR) { |
1615 | // TODO: We get much worse code in specially handled constant cases. |
1616 | for (BinaryOperator *Div : Div64ToExpand) { |
1617 | expandDivRem64(I&: *Div); |
1618 | FlowChanged = true; |
1619 | Changed = true; |
1620 | } |
1621 | } |
1622 | |
1623 | return Changed; |
1624 | } |
1625 | |
1626 | bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { |
1627 | if (!WidenLoads) |
1628 | return false; |
1629 | |
1630 | if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || |
1631 | I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && |
1632 | canWidenScalarExtLoad(I)) { |
1633 | IRBuilder<> Builder(&I); |
1634 | Builder.SetCurrentDebugLocation(I.getDebugLoc()); |
1635 | |
1636 | Type *I32Ty = Builder.getInt32Ty(); |
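// Load a full i32 from the same pointer; the original narrow value is
// recovered below with a trunc + bitcast, so users of the load are
// unchanged.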
1637 | LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand()); |
1638 | WidenLoad->copyMetadata(SrcInst: I); |
1639 | |
1640 | // If we have range metadata, we need to convert the type, and not make |
1641 | // assumptions about the high bits. |
1642 | if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) { |
1643 | ConstantInt *Lower = |
1644 | mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0)); |
1645 | |
1646 | if (Lower->isNullValue()) { |
1647 | WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr); |
1648 | } else { |
1649 | Metadata *LowAndHigh[] = { |
1650 | ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))), |
1651 | // Don't make assumptions about the high bits. |
1652 | ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0)) |
1653 | }; |
1654 | |
1655 | WidenLoad->setMetadata(KindID: LLVMContext::MD_range, |
1656 | Node: MDNode::get(Context&: Mod->getContext(), MDs: LowAndHigh)); |
1657 | } |
1658 | } |
1659 | |
1660 | int TySize = Mod->getDataLayout().getTypeSizeInBits(Ty: I.getType()); |
1661 | Type *IntNTy = Builder.getIntNTy(N: TySize); |
1662 | Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy); |
1663 | Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType()); |
1664 | I.replaceAllUsesWith(V: ValOrig); |
1665 | I.eraseFromParent(); |
1666 | return true; |
1667 | } |
1668 | |
1669 | return false; |
1670 | } |
1671 | |
1672 | bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { |
1673 | bool Changed = false; |
1674 | |
1675 | if (ST->has16BitInsts() && needsPromotionToI32(T: I.getOperand(i_nocapture: 0)->getType()) && |
1676 | UA->isUniform(I: &I)) |
1677 | Changed |= promoteUniformOpToI32(I); |
1678 | |
1679 | return Changed; |
1680 | } |
1681 | |
1682 | bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { |
1683 | Value *Cond = I.getCondition(); |
1684 | Value *TrueVal = I.getTrueValue(); |
1685 | Value *FalseVal = I.getFalseValue(); |
1686 | Value *CmpVal; |
1687 | FCmpInst::Predicate Pred; |
1688 | |
1689 | if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType())) { |
1690 | if (UA->isUniform(I: &I)) |
1691 | return promoteUniformOpToI32(I); |
1692 | return false; |
1693 | } |
1694 | |
1695 | // Match fract pattern with nan check. |
1696 | if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN()))) |
1697 | return false; |
1698 | |
1699 | FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I); |
1700 | if (!FPOp) |
1701 | return false; |
1702 | |
1703 | IRBuilder<> Builder(&I); |
1704 | Builder.setFastMathFlags(FPOp->getFastMathFlags()); |
1705 | |
1706 | auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal); |
1707 | auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal); |
1708 | |
1709 | Value *Fract = nullptr; |
1710 | if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse && |
1711 | CmpVal == matchFractPat(I&: *IIFalse)) { |
1712 | // isnan(x) ? x : fract(x) |
1713 | Fract = applyFractPat(Builder, FractArg: CmpVal); |
1714 | } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue && |
1715 | CmpVal == matchFractPat(I&: *IITrue)) { |
1716 | // !isnan(x) ? fract(x) : x |
1717 | Fract = applyFractPat(Builder, FractArg: CmpVal); |
1718 | } else |
1719 | return false; |
1720 | |
1721 | Fract->takeName(V: &I); |
1722 | I.replaceAllUsesWith(V: Fract); |
1723 | RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI: TLInfo); |
1724 | return true; |
1725 | } |
1726 | |
1727 | static bool areInSameBB(const Value *A, const Value *B) { |
1728 | const auto *IA = dyn_cast<Instruction>(Val: A); |
1729 | const auto *IB = dyn_cast<Instruction>(Val: B); |
1730 | return IA && IB && IA->getParent() == IB->getParent(); |
1731 | } |
1732 | |
1733 | // Helper for breaking large PHIs that returns true when an extractelement on V |
1734 | // is likely to be folded away by the DAG combiner. |
1735 | static bool isInterestingPHIIncomingValue(const Value *V) { |
1736 | const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType()); |
1737 | if (!FVT) |
1738 | return false; |
1739 | |
1740 | const Value *CurVal = V; |
1741 | |
1742 | // Check for insertelements, keeping track of the elements covered. |
1743 | BitVector EltsCovered(FVT->getNumElements()); |
1744 | while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) { |
1745 | const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2)); |
1746 | |
// Non-constant index / out-of-bounds index -> folding is unlikely.
1748 | // The latter is more of a sanity check because canonical IR should just |
1749 | // have replaced those with poison. |
1750 | if (!Idx || Idx->getSExtValue() >= FVT->getNumElements()) |
1751 | return false; |
1752 | |
1753 | const auto *VecSrc = IE->getOperand(i_nocapture: 0); |
1754 | |
1755 | // If the vector source is another instruction, it must be in the same basic |
1756 | // block. Otherwise, the DAGCombiner won't see the whole thing and is |
1757 | // unlikely to be able to do anything interesting here. |
1758 | if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE)) |
1759 | return false; |
1760 | |
1761 | CurVal = VecSrc; |
1762 | EltsCovered.set(Idx->getSExtValue()); |
1763 | |
1764 | // All elements covered. |
1765 | if (EltsCovered.all()) |
1766 | return true; |
1767 | } |
1768 | |
1769 | // We either didn't find a single insertelement, or the insertelement chain |
1770 | // ended before all elements were covered. Check for other interesting values. |
1771 | |
1772 | // Constants are always interesting because we can just constant fold the |
1773 | // extractelements. |
1774 | if (isa<Constant>(Val: CurVal)) |
1775 | return true; |
1776 | |
1777 | // shufflevector is likely to be profitable if either operand is a constant, |
1778 | // or if either source is in the same block. |
1779 | // This is because shufflevector is most often lowered as a series of |
1780 | // insert/extract elements anyway. |
1781 | if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) { |
1782 | return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) || |
1783 | areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) || |
1784 | areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1)); |
1785 | } |
1786 | |
1787 | return false; |
1788 | } |
1789 | |
1790 | static void collectPHINodes(const PHINode &I, |
1791 | SmallPtrSet<const PHINode *, 8> &SeenPHIs) { |
1792 | const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I); |
1793 | if (!Inserted) |
1794 | return; |
1795 | |
1796 | for (const Value *Inc : I.incoming_values()) { |
1797 | if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc)) |
1798 | collectPHINodes(I: *PhiInc, SeenPHIs); |
1799 | } |
1800 | |
1801 | for (const User *U : I.users()) { |
1802 | if (const auto *PhiU = dyn_cast<PHINode>(Val: U)) |
1803 | collectPHINodes(I: *PhiU, SeenPHIs); |
1804 | } |
1805 | } |
1806 | |
1807 | bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) { |
1808 | // Check in the cache first. |
1809 | if (const auto It = BreakPhiNodesCache.find(Val: &I); |
1810 | It != BreakPhiNodesCache.end()) |
1811 | return It->second; |
1812 | |
1813 | // We consider PHI nodes as part of "chains", so given a PHI node I, we |
1814 | // recursively consider all its users and incoming values that are also PHI |
1815 | // nodes. We then make a decision about all of those PHIs at once. Either they |
1816 | // all get broken up, or none of them do. That way, we avoid cases where a |
1817 | // single PHI is/is not broken and we end up reforming/exploding a vector |
1818 | // multiple times, or even worse, doing it in a loop. |
1819 | SmallPtrSet<const PHINode *, 8> WorkList; |
1820 | collectPHINodes(I, SeenPHIs&: WorkList); |
1821 | |
1822 | #ifndef NDEBUG |
1823 | // Check that none of the PHI nodes in the worklist are in the map. If some of |
1824 | // them are, it means we're not good enough at collecting related PHIs. |
1825 | for (const PHINode *WLP : WorkList) { |
1826 | assert(BreakPhiNodesCache.count(WLP) == 0); |
1827 | } |
1828 | #endif |
1829 | |
1830 | // To consider a PHI profitable to break, we need to see some interesting |
// incoming values. At least two-thirds (rounded up) of the PHIs in the
// worklist must have one for all of them to be considered breakable.
1833 | // |
1834 | // This threshold has been determined through performance testing. |
1835 | // |
1836 | // Note that the computation below is equivalent to |
1837 | // |
1838 | // (unsigned)ceil((K / 3.0) * 2) |
1839 | // |
1840 | // It's simply written this way to avoid mixing integral/FP arithmetic. |
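// For example, a worklist of 5 PHIs gives alignTo(10, 3) / 3 == 4, which
// matches ceil((5 / 3.0) * 2).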
1841 | const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3); |
1842 | unsigned NumBreakablePHIs = 0; |
1843 | bool CanBreak = false; |
1844 | for (const PHINode *Cur : WorkList) { |
1845 | // Don't break PHIs that have no interesting incoming values. That is, where |
1846 | // there is no clear opportunity to fold the "extractelement" instructions |
1847 | // we would add. |
1848 | // |
1849 | // Note: IC does not run after this pass, so we're only interested in the |
1850 | // foldings that the DAG combiner can do. |
1851 | if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) { |
1852 | if (++NumBreakablePHIs >= Threshold) { |
1853 | CanBreak = true; |
1854 | break; |
1855 | } |
1856 | } |
1857 | } |
1858 | |
1859 | for (const PHINode *Cur : WorkList) |
1860 | BreakPhiNodesCache[Cur] = CanBreak; |
1861 | |
1862 | return CanBreak; |
1863 | } |
1864 | |
1865 | /// Helper class for "break large PHIs" (visitPHINode). |
1866 | /// |
1867 | /// This represents a slice of a PHI's incoming value, which is made up of: |
1868 | /// - The type of the slice (Ty) |
1869 | /// - The index in the incoming value's vector where the slice starts (Idx) |
1870 | /// - The number of elements in the slice (NumElts). |
1871 | /// It also keeps track of the NewPHI node inserted for this particular slice. |
1872 | /// |
1873 | /// Slice examples: |
1874 | /// <4 x i64> -> Split into four i64 slices. |
1875 | /// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1] |
/// <5 x i16> -> Split into 2 <2 x i16> slices + an i16 tail.
1877 | /// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1] |
1878 | class VectorSlice { |
1879 | public: |
1880 | VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts) |
1881 | : Ty(Ty), Idx(Idx), NumElts(NumElts) {} |
1882 | |
1883 | Type *Ty = nullptr; |
1884 | unsigned Idx = 0; |
1885 | unsigned NumElts = 0; |
1886 | PHINode *NewPHI = nullptr; |
1887 | |
1888 | /// Slice \p Inc according to the information contained within this slice. |
1889 | /// This is cached, so if called multiple times for the same \p BB & \p Inc |
1890 | /// pair, it returns the same Sliced value as well. |
1891 | /// |
1892 | /// Note this *intentionally* does not return the same value for, say, |
1893 | /// [%bb.0, %0] & [%bb.1, %0] as: |
1894 | /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then |
1895 | /// the value in bb.1 may not be reachable from bb.0 if it's its |
1896 | /// predecessor.) |
1897 | /// - We also want to make our extract instructions as local as possible so |
1898 | /// the DAG has better chances of folding them out. Duplicating them like |
1899 | /// that is beneficial in that regard. |
1900 | /// |
/// This is both a minor optimization to avoid creating duplicate
/// instructions and a requirement for correctness. It is not forbidden
1903 | /// for a PHI node to have the same [BB, Val] pair multiple times. If we |
1904 | /// returned a new value each time, those previously identical pairs would all |
1905 | /// have different incoming values (from the same block) and it'd cause a "PHI |
1906 | /// node has multiple entries for the same basic block with different incoming |
1907 | /// values!" verifier error. |
1908 | Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) { |
1909 | Value *&Res = SlicedVals[{BB, Inc}]; |
1910 | if (Res) |
1911 | return Res; |
1912 | |
1913 | IRBuilder<> B(BB->getTerminator()); |
1914 | if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc)) |
1915 | B.SetCurrentDebugLocation(IncInst->getDebugLoc()); |
1916 | |
1917 | if (NumElts > 1) { |
1918 | SmallVector<int, 4> Mask; |
1919 | for (unsigned K = Idx; K < (Idx + NumElts); ++K) |
1920 | Mask.push_back(Elt: K); |
1921 | Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName); |
1922 | } else |
1923 | Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName); |
1924 | |
1925 | return Res; |
1926 | } |
1927 | |
1928 | private: |
1929 | SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals; |
1930 | }; |
1931 | |
1932 | bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { |
1933 | // Break-up fixed-vector PHIs into smaller pieces. |
1934 | // Default threshold is 32, so it breaks up any vector that's >32 bits into |
// its elements, or into 32-bit pieces (for 8/16-bit elements).
1936 | // |
1937 | // This is only helpful for DAGISel because it doesn't handle large PHIs as |
1938 | // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg. |
1939 | // With large, odd-sized PHIs we may end up needing many `build_vector` |
1940 | // operations with most elements being "undef". This inhibits a lot of |
1941 | // optimization opportunities and can result in unreasonably high register |
1942 | // pressure and the inevitable stack spilling. |
1943 | if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption) |
1944 | return false; |
1945 | |
1946 | FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType()); |
1947 | if (!FVT || FVT->getNumElements() == 1 || |
1948 | DL->getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold) |
1949 | return false; |
1950 | |
1951 | if (!ForceBreakLargePHIs && !canBreakPHINode(I)) |
1952 | return false; |
1953 | |
1954 | std::vector<VectorSlice> Slices; |
1955 | |
1956 | Type *EltTy = FVT->getElementType(); |
1957 | { |
1958 | unsigned Idx = 0; |
// For 8/16-bit element types, don't scalarize fully; break the vector into
// as many 32-bit slices as we can, and scalarize the tail.
1961 | const unsigned EltSize = DL->getTypeSizeInBits(Ty: EltTy); |
1962 | const unsigned NumElts = FVT->getNumElements(); |
1963 | if (EltSize == 8 || EltSize == 16) { |
1964 | const unsigned SubVecSize = (32 / EltSize); |
1965 | Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize); |
1966 | for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End; |
1967 | Idx += SubVecSize) |
1968 | Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize); |
1969 | } |
1970 | |
1971 | // Scalarize all remaining elements. |
1972 | for (; Idx < NumElts; ++Idx) |
1973 | Slices.emplace_back(args&: EltTy, args&: Idx, args: 1); |
1974 | } |
1975 | |
1976 | assert(Slices.size() > 1); |
1977 | |
1978 | // Create one PHI per vector piece. The "VectorSlice" class takes care of |
1979 | // creating the necessary instruction to extract the relevant slices of each |
1980 | // incoming value. |
1981 | IRBuilder<> B(I.getParent()); |
1982 | B.SetCurrentDebugLocation(I.getDebugLoc()); |
1983 | |
1984 | unsigned IncNameSuffix = 0; |
1985 | for (VectorSlice &S : Slices) { |
// We need to reset the builder on each iteration, because getSlicedVal may
1987 | // have inserted something into I's BB. |
1988 | B.SetInsertPoint(I.getParent()->getFirstNonPHI()); |
1989 | S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues()); |
1990 | |
1991 | for (const auto &[Idx, BB] : enumerate(First: I.blocks())) { |
1992 | S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx), |
1993 | NewValName: "largephi.extractslice" + |
1994 | std::to_string(val: IncNameSuffix++)), |
1995 | BB); |
1996 | } |
1997 | } |
1998 | |
1999 | // And replace this PHI with a vector of all the previous PHI values. |
2000 | Value *Vec = PoisonValue::get(T: FVT); |
2001 | unsigned NameSuffix = 0; |
2002 | for (VectorSlice &S : Slices) { |
2003 | const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++); |
2004 | if (S.NumElts > 1) |
2005 | Vec = |
2006 | B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: B.getInt64(C: S.Idx), Name: ValName); |
2007 | else |
2008 | Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName); |
2009 | } |
2010 | |
2011 | I.replaceAllUsesWith(V: Vec); |
2012 | I.eraseFromParent(); |
2013 | return true; |
2014 | } |
2015 | |
2016 | bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { |
2017 | switch (I.getIntrinsicID()) { |
2018 | case Intrinsic::bitreverse: |
2019 | return visitBitreverseIntrinsicInst(I); |
2020 | case Intrinsic::minnum: |
2021 | return visitMinNum(I); |
2022 | case Intrinsic::sqrt: |
2023 | return visitSqrt(I); |
2024 | default: |
2025 | return false; |
2026 | } |
2027 | } |
2028 | |
2029 | bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { |
2030 | bool Changed = false; |
2031 | |
2032 | if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType()) && |
2033 | UA->isUniform(I: &I)) |
2034 | Changed |= promoteUniformBitreverseToI32(I); |
2035 | |
2036 | return Changed; |
2037 | } |
2038 | |
/// Match the non-nan fract pattern:
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
///
/// Only matches if fract is a useful instruction for the subtarget. Does not
/// account for the nan handling; the instruction has a nan check on the input
/// value.
2044 | Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) { |
2045 | if (ST->hasFractBug()) |
2046 | return nullptr; |
2047 | |
2048 | if (I.getIntrinsicID() != Intrinsic::minnum) |
2049 | return nullptr; |
2050 | |
2051 | Type *Ty = I.getType(); |
2052 | if (!isLegalFloatingTy(Ty: Ty->getScalarType())) |
2053 | return nullptr; |
2054 | |
2055 | Value *Arg0 = I.getArgOperand(i: 0); |
2056 | Value *Arg1 = I.getArgOperand(i: 1); |
2057 | |
2058 | const APFloat *C; |
2059 | if (!match(V: Arg1, P: m_APFloat(Res&: C))) |
2060 | return nullptr; |
2061 | |
2062 | APFloat One(1.0); |
2063 | bool LosesInfo; |
2064 | One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo); |
2065 | |
2066 | // Match nextafter(1.0, -1) |
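// i.e. the largest value strictly less than 1.0 in the constant's
// semantics (0x1.fffffep-1 for f32).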
2067 | One.next(nextDown: true); |
2068 | if (One != *C) |
2069 | return nullptr; |
2070 | |
2071 | Value *FloorSrc; |
2072 | if (match(Arg0, m_FSub(m_Value(FloorSrc), |
2073 | m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc))))) |
2074 | return FloorSrc; |
2075 | return nullptr; |
2076 | } |
2077 | |
2078 | Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, |
2079 | Value *FractArg) { |
2080 | SmallVector<Value *, 4> FractVals; |
2081 | extractValues(Builder, Values&: FractVals, V: FractArg); |
2082 | |
2083 | SmallVector<Value *, 4> ResultVals(FractVals.size()); |
2084 | |
2085 | Type *Ty = FractArg->getType()->getScalarType(); |
2086 | for (unsigned I = 0, E = FractVals.size(); I != E; ++I) { |
2087 | ResultVals[I] = |
2088 | Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]}); |
2089 | } |
2090 | |
2091 | return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals); |
2092 | } |
2093 | |
2094 | bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) { |
2095 | Value *FractArg = matchFractPat(I); |
2096 | if (!FractArg) |
2097 | return false; |
2098 | |
2099 | // Match pattern for fract intrinsic in contexts where the nan check has been |
2100 | // optimized out (and hope the knowledge the source can't be nan wasn't lost). |
2101 | if (!I.hasNoNaNs() && |
2102 | !isKnownNeverNaN(V: FractArg, /*Depth=*/0, SQ: SimplifyQuery(*DL, TLInfo))) |
2103 | return false; |
2104 | |
2105 | IRBuilder<> Builder(&I); |
2106 | FastMathFlags FMF = I.getFastMathFlags(); |
2107 | FMF.setNoNaNs(); |
2108 | Builder.setFastMathFlags(FMF); |
2109 | |
2110 | Value *Fract = applyFractPat(Builder, FractArg); |
2111 | Fract->takeName(V: &I); |
2112 | I.replaceAllUsesWith(V: Fract); |
2113 | |
2114 | RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI: TLInfo); |
2115 | return true; |
2116 | } |
2117 | |
2118 | static bool isOneOrNegOne(const Value *Val) { |
2119 | const APFloat *C; |
2120 | return match(V: Val, P: m_APFloat(Res&: C)) && C->getExactLog2Abs() == 0; |
2121 | } |
2122 | |
2123 | // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. |
2124 | bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { |
2125 | Type *Ty = Sqrt.getType()->getScalarType(); |
2126 | if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts())) |
2127 | return false; |
2128 | |
2129 | const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt); |
2130 | FastMathFlags SqrtFMF = FPOp->getFastMathFlags(); |
2131 | |
2132 | // We're trying to handle the fast-but-not-that-fast case only. The lowering |
2133 | // of fast llvm.sqrt will give the raw instruction anyway. |
2134 | if (SqrtFMF.approxFunc() || HasUnsafeFPMath) |
2135 | return false; |
2136 | |
2137 | const float ReqdAccuracy = FPOp->getFPAccuracy(); |
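// This is the maximum error permitted by the !fpmath metadata, in ULPs;
// it is 0.0 when the metadata is absent.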
2138 | |
2139 | // Defer correctly rounded expansion to codegen. |
2140 | if (ReqdAccuracy < 1.0f) |
2141 | return false; |
2142 | |
2143 | // FIXME: This is an ugly hack for this pass using forward iteration instead |
2144 | // of reverse. If it worked like a normal combiner, the rsq would form before |
2145 | // we saw a sqrt call. |
2146 | auto *FDiv = |
2147 | dyn_cast_or_null<FPMathOperator>(Val: Sqrt.getUniqueUndroppableUser()); |
2148 | if (FDiv && FDiv->getOpcode() == Instruction::FDiv && |
2149 | FDiv->getFPAccuracy() >= 1.0f && |
2150 | canOptimizeWithRsq(SqrtOp: FPOp, DivFMF: FDiv->getFastMathFlags(), SqrtFMF) && |
2151 | // TODO: We should also handle the arcp case for the fdiv with non-1 value |
2152 | isOneOrNegOne(Val: FDiv->getOperand(i: 0))) |
2153 | return false; |
2154 | |
2155 | Value *SrcVal = Sqrt.getOperand(i_nocapture: 0); |
2156 | bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt); |
2157 | |
2158 | // The raw instruction is 1 ulp, but the correction for denormal handling |
2159 | // brings it to 2. |
2160 | if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f) |
2161 | return false; |
2162 | |
2163 | IRBuilder<> Builder(&Sqrt); |
2164 | SmallVector<Value *, 4> SrcVals; |
2165 | extractValues(Builder, Values&: SrcVals, V: SrcVal); |
2166 | |
2167 | SmallVector<Value *, 4> ResultVals(SrcVals.size()); |
2168 | for (int I = 0, E = SrcVals.size(); I != E; ++I) { |
2169 | if (CanTreatAsDAZ) |
2170 | ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]); |
2171 | else |
2172 | ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF); |
2173 | } |
2174 | |
2175 | Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals); |
2176 | NewSqrt->takeName(V: &Sqrt); |
2177 | Sqrt.replaceAllUsesWith(V: NewSqrt); |
2178 | Sqrt.eraseFromParent(); |
2179 | return true; |
2180 | } |
2181 | |
2182 | bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { |
2183 | Impl.Mod = &M; |
2184 | Impl.DL = &Impl.Mod->getDataLayout(); |
2185 | Impl.SqrtF32 = nullptr; |
2186 | Impl.LdexpF32 = nullptr; |
2187 | return false; |
2188 | } |
2189 | |
2190 | bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { |
2191 | if (skipFunction(F)) |
2192 | return false; |
2193 | |
2194 | auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); |
2195 | if (!TPC) |
2196 | return false; |
2197 | |
2198 | const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); |
2199 | Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); |
2200 | Impl.ST = &TM.getSubtarget<GCNSubtarget>(F); |
2201 | Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
2202 | Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo(); |
2203 | auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); |
2204 | Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr; |
2205 | Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); |
2206 | SIModeRegisterDefaults Mode(F, *Impl.ST); |
2207 | Impl.HasFP32DenormalFlush = |
2208 | Mode.FP32Denormals == DenormalMode::getPreserveSign(); |
2209 | return Impl.run(F); |
2210 | } |
2211 | |
2212 | PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F, |
2213 | FunctionAnalysisManager &FAM) { |
2214 | AMDGPUCodeGenPrepareImpl Impl; |
2215 | Impl.Mod = F.getParent(); |
2216 | Impl.DL = &Impl.Mod->getDataLayout(); |
2217 | Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F); |
2218 | Impl.ST = &TM.getSubtarget<GCNSubtarget>(F); |
2219 | Impl.AC = &FAM.getResult<AssumptionAnalysis>(IR&: F); |
2220 | Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(IR&: F); |
2221 | Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F); |
2222 | Impl.HasUnsafeFPMath = hasUnsafeFPMath(F); |
2223 | SIModeRegisterDefaults Mode(F, *Impl.ST); |
2224 | Impl.HasFP32DenormalFlush = |
2225 | Mode.FP32Denormals == DenormalMode::getPreserveSign(); |
2226 | PreservedAnalyses PA = PreservedAnalyses::none(); |
2227 | if (!Impl.FlowChanged) |
2228 | PA.preserveSet<CFGAnalyses>(); |
2229 | return Impl.run(F) ? PA : PreservedAnalyses::all(); |
2230 | } |
2231 | |
2232 | INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, |
2233 | "AMDGPU IR optimizations" , false, false) |
2234 | INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) |
2235 | INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) |
2236 | INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) |
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2238 | false, false) |
2239 | |
2240 | char AMDGPUCodeGenPrepare::ID = 0; |
2241 | |
2242 | FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { |
2243 | return new AMDGPUCodeGenPrepare(); |
2244 | } |
2245 | |