//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

static cl::opt<bool> Widen16BitOps(
  "amdgpu-codegenprepare-widen-16-bit-ops",
  cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),
                   cl::ReallyHidden, cl::init(true));

static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),
                        cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
  "amdgpu-codegenprepare-mul24",
  cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
  "amdgpu-codegenprepare-expand-div64",
  cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
  "amdgpu-codegenprepare-disable-idiv-expansion",
  cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
  "amdgpu-codegenprepare-disable-fdiv-expansion",
  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(false));
97
98class AMDGPUCodeGenPrepareImpl
99 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
100public:
101 const GCNSubtarget *ST = nullptr;
102 const TargetLibraryInfo *TLInfo = nullptr;
103 AssumptionCache *AC = nullptr;
104 DominatorTree *DT = nullptr;
105 UniformityInfo *UA = nullptr;
106 Module *Mod = nullptr;
107 const DataLayout *DL = nullptr;
108 bool HasUnsafeFPMath = false;
109 bool HasFP32DenormalFlush = false;
110 bool FlowChanged = false;
111 mutable Function *SqrtF32 = nullptr;
112 mutable Function *LdexpF32 = nullptr;
113
114 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
115
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = Mod->getContext();
    SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
                                        {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = Mod->getContext();
    LdexpF32 = Intrinsic::getDeclaration(
        Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }
135
136 bool canBreakPHINode(const PHINode &I);
137
  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;
144
145 /// \returns Equivalent 32 bit integer type for given type \p T. For example,
146 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
147 /// is returned.
148 Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
149
150 /// \returns True if binary operation \p I is a signed binary operation, false
151 /// otherwise.
152 bool isSigned(const BinaryOperator &I) const;
153
154 /// \returns True if the condition of 'select' operation \p I comes from a
155 /// signed 'icmp' operation, false otherwise.
156 bool isSigned(const SelectInst &I) const;
157
158 /// \returns True if type \p T needs to be promoted to 32 bit integer type,
159 /// false otherwise.
160 bool needsPromotionToI32(const Type *T) const;
161
162 /// Return true if \p T is a legal scalar floating point type.
163 bool isLegalFloatingTy(const Type *T) const;
164
165 /// Wrapper to pass all the arguments to computeKnownFPClass
166 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
167 const Instruction *CtxI) const {
168 return llvm::computeKnownFPClass(V, DL: *DL, InterestedClasses: Interested, Depth: 0, TLI: TLInfo, AC, CxtI: CtxI,
169 DT);
170 }
171
172 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
173 return HasFP32DenormalFlush ||
174 computeKnownFPClass(V, Interested: fcSubnormal, CtxI).isKnownNeverSubnormal();
175 }
176
177 /// Promotes uniform binary operation \p I to equivalent 32 bit binary
178 /// operation.
179 ///
180 /// \details \p I's base element bit width must be greater than 1 and less
181 /// than or equal 16. Promotion is done by sign or zero extending operands to
182 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
183 /// truncating the result of 32 bit binary operation back to \p I's original
184 /// type. Division operation is not promoted.
185 ///
186 /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
187 /// false otherwise.
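  ///
  /// Illustrative sketch (assuming a uniform i16 add, which is zero-extended
  /// since 'add' is not treated as signed here):
  ///   %r = add i16 %a, %b
  /// becomes:
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add nuw nsw i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16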
188 bool promoteUniformOpToI32(BinaryOperator &I) const;
189
190 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
191 ///
192 /// \details \p I's base element bit width must be greater than 1 and less
193 /// than or equal 16. Promotion is done by sign or zero extending operands to
194 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
195 ///
196 /// \returns True.
197 bool promoteUniformOpToI32(ICmpInst &I) const;
198
199 /// Promotes uniform 'select' operation \p I to 32 bit 'select'
200 /// operation.
201 ///
202 /// \details \p I's base element bit width must be greater than 1 and less
203 /// than or equal 16. Promotion is done by sign or zero extending operands to
204 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
205 /// result of 32 bit 'select' operation back to \p I's original type.
206 ///
207 /// \returns True.
208 bool promoteUniformOpToI32(SelectInst &I) const;
209
210 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
211 /// intrinsic.
212 ///
213 /// \details \p I's base element bit width must be greater than 1 and less
214 /// than or equal 16. Promotion is done by zero extending the operand to 32
215 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
216 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
217 /// shift amount is 32 minus \p I's base element bit width), and truncating
218 /// the result of the shift operation back to \p I's original type.
219 ///
220 /// \returns True.
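  ///
  /// Illustrative sketch for a uniform i16 operand:
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// becomes:
  ///   %x32 = zext i16 %x to i32
  ///   %b   = call i32 @llvm.bitreverse.i32(i32 %x32)
  ///   %s   = lshr i32 %b, 16
  ///   %r   = trunc i32 %s to i16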
221 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
222
223 /// \returns The minimum number of bits needed to store the value of \Op as an
224 /// unsigned integer. Truncating to this size and then zero-extending to
225 /// the original will not change the value.
226 unsigned numBitsUnsigned(Value *Op) const;
227
228 /// \returns The minimum number of bits needed to store the value of \Op as a
229 /// signed integer. Truncating to this size and then sign-extending to
230 /// the original size will not change the value.
231 unsigned numBitsSigned(Value *Op) const;
232
233 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
234 /// SelectionDAG has an issue where an and asserting the bits are known
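  ///
  /// Illustrative sketch (assuming a divergent i32 multiply whose operands are
  /// both known to fit in 24 bits unsigned):
  ///   %r = mul i32 %a, %b
  /// becomes:
  ///   %r = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)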
235 bool replaceMulWithMul24(BinaryOperator &I) const;
236
  /// Perform the same transformation as the equivalently named function in
  /// DAGCombiner. Since we expand some divisions here, we need to perform this
  /// before the expansion obscures the pattern.
239 bool foldBinOpIntoSelect(BinaryOperator &I) const;
240
241 bool divHasSpecialOptimization(BinaryOperator &I,
242 Value *Num, Value *Den) const;
243 int getDivNumBits(BinaryOperator &I,
244 Value *Num, Value *Den,
245 unsigned AtLeast, bool Signed) const;
246
247 /// Expands 24 bit div or rem.
248 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
249 Value *Num, Value *Den,
250 bool IsDiv, bool IsSigned) const;
251
252 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
253 Value *Num, Value *Den, unsigned NumBits,
254 bool IsDiv, bool IsSigned) const;
255
256 /// Expands 32 bit div or rem.
257 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
258 Value *Num, Value *Den) const;
259
260 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
261 Value *Num, Value *Den) const;
262 void expandDivRem64(BinaryOperator &I) const;
263
  /// Check whether a scalar load can be widened.
  ///
  /// \details Sub-dword, uniform loads from constant address space memory are
  /// widened to a full 32 bits and the result is then truncated, allowing a
  /// scalar load to be used instead of a vector load.
  ///
  /// \returns True if load \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;
273
274 Value *matchFractPat(IntrinsicInst &I);
275 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
276
277 bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
278 FastMathFlags SqrtFMF) const;
279
280 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
281 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
282 const Instruction *CtxI) const;
283
284 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
285 FastMathFlags FMF, const Instruction *CtxI) const;
286 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
287 float ReqdAccuracy) const;
288
289 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
290 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
291 Value *RsqOp, const Instruction *FDiv,
292 float ReqdAccuracy) const;
293
294 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
295 Value *Src) const;
296
297 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
298 bool IsNegative) const;
299 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
300 FastMathFlags FMF) const;
301 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
302 FastMathFlags FMF) const;
303
304public:
305 bool visitFDiv(BinaryOperator &I);
306
307 bool visitInstruction(Instruction &I) { return false; }
308 bool visitBinaryOperator(BinaryOperator &I);
309 bool visitLoadInst(LoadInst &I);
310 bool visitICmpInst(ICmpInst &I);
311 bool visitSelectInst(SelectInst &I);
312 bool visitPHINode(PHINode &I);
313
314 bool visitIntrinsicInst(IntrinsicInst &I);
315 bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
316 bool visitMinNum(IntrinsicInst &I);
317 bool visitSqrt(IntrinsicInst &I);
318 bool run(Function &F);
319};
320
321class AMDGPUCodeGenPrepare : public FunctionPass {
322private:
323 AMDGPUCodeGenPrepareImpl Impl;
324
325public:
326 static char ID;
327 AMDGPUCodeGenPrepare() : FunctionPass(ID) {
328 initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
329 }
330 void getAnalysisUsage(AnalysisUsage &AU) const override {
331 AU.addRequired<AssumptionCacheTracker>();
332 AU.addRequired<UniformityInfoWrapperPass>();
333 AU.addRequired<TargetLibraryInfoWrapperPass>();
334
335 // FIXME: Division expansion needs to preserve the dominator tree.
336 if (!ExpandDiv64InIR)
337 AU.setPreservesAll();
338 }
339 bool runOnFunction(Function &F) override;
340 bool doInitialization(Module &M) override;
341 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
342};
343
344} // end anonymous namespace
345
346bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
347 BreakPhiNodesCache.clear();
348 bool MadeChange = false;
349
350 Function::iterator NextBB;
351 for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
352 BasicBlock *BB = &*FI;
353 NextBB = std::next(x: FI);
354
355 BasicBlock::iterator Next;
356 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
357 I = Next) {
358 Next = std::next(x: I);
359
360 MadeChange |= visit(I&: *I);
361
362 if (Next != E) { // Control flow changed
363 BasicBlock *NextInstBB = Next->getParent();
364 if (NextInstBB != BB) {
365 BB = NextInstBB;
366 E = BB->end();
367 FE = F.end();
368 }
369 }
370 }
371 }
372 return MadeChange;
373}
374
375unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
376 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
377
378 if (T->isIntegerTy())
379 return T->getIntegerBitWidth();
380 return cast<VectorType>(Val: T)->getElementType()->getIntegerBitWidth();
381}
382
383Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
384 assert(needsPromotionToI32(T) && "T does not need promotion to i32");
385
386 if (T->isIntegerTy())
387 return B.getInt32Ty();
388 return FixedVectorType::get(ElementType: B.getInt32Ty(), FVTy: cast<FixedVectorType>(Val: T));
389}
390
391bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
392 return I.getOpcode() == Instruction::AShr ||
393 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
394}
395
396bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
397 return isa<ICmpInst>(Val: I.getOperand(i_nocapture: 0)) ?
398 cast<ICmpInst>(Val: I.getOperand(i_nocapture: 0))->isSigned() : false;
399}
400
401bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
402 if (!Widen16BitOps)
403 return false;
404
405 const IntegerType *IntTy = dyn_cast<IntegerType>(Val: T);
406 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
407 return true;
408
409 if (const VectorType *VT = dyn_cast<VectorType>(Val: T)) {
410 // TODO: The set of packed operations is more limited, so may want to
411 // promote some anyway.
412 if (ST->hasVOP3PInsts())
413 return false;
414
415 return needsPromotionToI32(T: VT->getElementType());
416 }
417
418 return false;
419}
420
421bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
422 return Ty->isFloatTy() || Ty->isDoubleTy() ||
423 (Ty->isHalfTy() && ST->has16BitInsts());
424}
425
426// Return true if the op promoted to i32 should have nsw set.
427static bool promotedOpIsNSW(const Instruction &I) {
428 switch (I.getOpcode()) {
429 case Instruction::Shl:
430 case Instruction::Add:
431 case Instruction::Sub:
432 return true;
433 case Instruction::Mul:
434 return I.hasNoUnsignedWrap();
435 default:
436 return false;
437 }
438}
439
440// Return true if the op promoted to i32 should have nuw set.
441static bool promotedOpIsNUW(const Instruction &I) {
442 switch (I.getOpcode()) {
443 case Instruction::Shl:
444 case Instruction::Add:
445 case Instruction::Mul:
446 return true;
447 case Instruction::Sub:
448 return I.hasNoUnsignedWrap();
449 default:
450 return false;
451 }
452}
453
454bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
455 Type *Ty = I.getType();
456 const DataLayout &DL = Mod->getDataLayout();
457 int TySize = DL.getTypeSizeInBits(Ty);
458 Align Alignment = DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty);
459
460 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(I: &I);
461}
462
463bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
464 assert(needsPromotionToI32(I.getType()) &&
465 "I does not need promotion to i32");
466
467 if (I.getOpcode() == Instruction::SDiv ||
468 I.getOpcode() == Instruction::UDiv ||
469 I.getOpcode() == Instruction::SRem ||
470 I.getOpcode() == Instruction::URem)
471 return false;
472
473 IRBuilder<> Builder(&I);
474 Builder.SetCurrentDebugLocation(I.getDebugLoc());
475
476 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
477 Value *ExtOp0 = nullptr;
478 Value *ExtOp1 = nullptr;
479 Value *ExtRes = nullptr;
480 Value *TruncRes = nullptr;
481
482 if (isSigned(I)) {
483 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
484 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
485 } else {
486 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
487 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
488 }
489
490 ExtRes = Builder.CreateBinOp(Opc: I.getOpcode(), LHS: ExtOp0, RHS: ExtOp1);
491 if (Instruction *Inst = dyn_cast<Instruction>(Val: ExtRes)) {
492 if (promotedOpIsNSW(I: cast<Instruction>(Val&: I)))
493 Inst->setHasNoSignedWrap();
494
495 if (promotedOpIsNUW(I: cast<Instruction>(Val&: I)))
496 Inst->setHasNoUnsignedWrap();
497
498 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(Val: &I))
499 Inst->setIsExact(ExactOp->isExact());
500 }
501
502 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
503
504 I.replaceAllUsesWith(V: TruncRes);
505 I.eraseFromParent();
506
507 return true;
508}
509
510bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
511 assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
512 "I does not need promotion to i32");
513
514 IRBuilder<> Builder(&I);
515 Builder.SetCurrentDebugLocation(I.getDebugLoc());
516
517 Type *I32Ty = getI32Ty(B&: Builder, T: I.getOperand(i_nocapture: 0)->getType());
518 Value *ExtOp0 = nullptr;
519 Value *ExtOp1 = nullptr;
520 Value *NewICmp = nullptr;
521
522 if (I.isSigned()) {
523 ExtOp0 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
524 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
525 } else {
526 ExtOp0 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
527 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
528 }
529 NewICmp = Builder.CreateICmp(P: I.getPredicate(), LHS: ExtOp0, RHS: ExtOp1);
530
531 I.replaceAllUsesWith(V: NewICmp);
532 I.eraseFromParent();
533
534 return true;
535}
536
537bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
538 assert(needsPromotionToI32(I.getType()) &&
539 "I does not need promotion to i32");
540
541 IRBuilder<> Builder(&I);
542 Builder.SetCurrentDebugLocation(I.getDebugLoc());
543
544 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
545 Value *ExtOp1 = nullptr;
546 Value *ExtOp2 = nullptr;
547 Value *ExtRes = nullptr;
548 Value *TruncRes = nullptr;
549
550 if (isSigned(I)) {
551 ExtOp1 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
552 ExtOp2 = Builder.CreateSExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
553 } else {
554 ExtOp1 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 1), DestTy: I32Ty);
555 ExtOp2 = Builder.CreateZExt(V: I.getOperand(i_nocapture: 2), DestTy: I32Ty);
556 }
557 ExtRes = Builder.CreateSelect(C: I.getOperand(i_nocapture: 0), True: ExtOp1, False: ExtOp2);
558 TruncRes = Builder.CreateTrunc(V: ExtRes, DestTy: I.getType());
559
560 I.replaceAllUsesWith(V: TruncRes);
561 I.eraseFromParent();
562
563 return true;
564}
565
566bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
567 IntrinsicInst &I) const {
568 assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
569 "I must be bitreverse intrinsic");
570 assert(needsPromotionToI32(I.getType()) &&
571 "I does not need promotion to i32");
572
573 IRBuilder<> Builder(&I);
574 Builder.SetCurrentDebugLocation(I.getDebugLoc());
575
576 Type *I32Ty = getI32Ty(B&: Builder, T: I.getType());
577 Function *I32 =
578 Intrinsic::getDeclaration(M: Mod, Intrinsic::id: bitreverse, Tys: { I32Ty });
579 Value *ExtOp = Builder.CreateZExt(V: I.getOperand(i_nocapture: 0), DestTy: I32Ty);
580 Value *ExtRes = Builder.CreateCall(Callee: I32, Args: { ExtOp });
581 Value *LShrOp =
582 Builder.CreateLShr(LHS: ExtRes, RHS: 32 - getBaseElementBitWidth(T: I.getType()));
583 Value *TruncRes =
584 Builder.CreateTrunc(V: LShrOp, DestTy: I.getType());
585
586 I.replaceAllUsesWith(V: TruncRes);
587 I.eraseFromParent();
588
589 return true;
590}
591
592unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
593 return computeKnownBits(V: Op, DL: *DL, Depth: 0, AC).countMaxActiveBits();
594}
595
596unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
597 return ComputeMaxSignificantBits(Op, DL: *DL, Depth: 0, AC);
598}
599
600static void extractValues(IRBuilder<> &Builder,
601 SmallVectorImpl<Value *> &Values, Value *V) {
602 auto *VT = dyn_cast<FixedVectorType>(Val: V->getType());
603 if (!VT) {
604 Values.push_back(Elt: V);
605 return;
606 }
607
608 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
609 Values.push_back(Elt: Builder.CreateExtractElement(Vec: V, Idx: I));
610}
611
612static Value *insertValues(IRBuilder<> &Builder,
613 Type *Ty,
614 SmallVectorImpl<Value *> &Values) {
615 if (!Ty->isVectorTy()) {
616 assert(Values.size() == 1);
617 return Values[0];
618 }
619
620 Value *NewVal = PoisonValue::get(T: Ty);
621 for (int I = 0, E = Values.size(); I != E; ++I)
622 NewVal = Builder.CreateInsertElement(Vec: NewVal, NewElt: Values[I], Idx: I);
623
624 return NewVal;
625}
626
627bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
628 if (I.getOpcode() != Instruction::Mul)
629 return false;
630
631 Type *Ty = I.getType();
632 unsigned Size = Ty->getScalarSizeInBits();
633 if (Size <= 16 && ST->has16BitInsts())
634 return false;
635
636 // Prefer scalar if this could be s_mul_i32
637 if (UA->isUniform(I: &I))
638 return false;
639
640 Value *LHS = I.getOperand(i_nocapture: 0);
641 Value *RHS = I.getOperand(i_nocapture: 1);
642 IRBuilder<> Builder(&I);
643 Builder.SetCurrentDebugLocation(I.getDebugLoc());
644
645 unsigned LHSBits = 0, RHSBits = 0;
646 bool IsSigned = false;
647
648 if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(Op: LHS)) <= 24 &&
649 (RHSBits = numBitsUnsigned(Op: RHS)) <= 24) {
650 IsSigned = false;
651
652 } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(Op: LHS)) <= 24 &&
653 (RHSBits = numBitsSigned(Op: RHS)) <= 24) {
654 IsSigned = true;
655
656 } else
657 return false;
658
659 SmallVector<Value *, 4> LHSVals;
660 SmallVector<Value *, 4> RHSVals;
661 SmallVector<Value *, 4> ResultVals;
662 extractValues(Builder, Values&: LHSVals, V: LHS);
663 extractValues(Builder, Values&: RHSVals, V: RHS);
664
665 IntegerType *I32Ty = Builder.getInt32Ty();
666 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
667 Type *DstTy = LHSVals[0]->getType();
668
669 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
670 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(V: LHSVals[I], DestTy: I32Ty)
671 : Builder.CreateZExtOrTrunc(V: LHSVals[I], DestTy: I32Ty);
672 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(V: RHSVals[I], DestTy: I32Ty)
673 : Builder.CreateZExtOrTrunc(V: RHSVals[I], DestTy: I32Ty);
674 Intrinsic::ID ID =
675 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
676 Value *Result = Builder.CreateIntrinsic(ID, Types: {IntrinTy}, Args: {LHS, RHS});
677 Result = IsSigned ? Builder.CreateSExtOrTrunc(V: Result, DestTy: DstTy)
678 : Builder.CreateZExtOrTrunc(V: Result, DestTy: DstTy);
679 ResultVals.push_back(Elt: Result);
680 }
681
682 Value *NewVal = insertValues(Builder, Ty, Values&: ResultVals);
683 NewVal->takeName(V: &I);
684 I.replaceAllUsesWith(V: NewVal);
685 I.eraseFromParent();
686
687 return true;
688}
689
// Find a select instruction, which may have been cast. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
692static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
693 Cast = nullptr;
694 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: V))
695 return Sel;
696
697 if ((Cast = dyn_cast<CastInst>(Val: V))) {
698 if (SelectInst *Sel = dyn_cast<SelectInst>(Val: Cast->getOperand(i_nocapture: 0)))
699 return Sel;
700 }
701
702 return nullptr;
703}
704
705bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
706 // Don't do this unless the old select is going away. We want to eliminate the
707 // binary operator, not replace a binop with a select.
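  //
  // Illustrative sketch: given a single-use select of constants whose user's
  // other operand is also constant,
  //   %s = select i1 %c, i32 4, i32 8
  //   %d = udiv i32 16, %s
  // folds to
  //   %d = select i1 %c, i32 4, i32 2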
708 int SelOpNo = 0;
709
710 CastInst *CastOp;
711
712 // TODO: Should probably try to handle some cases with multiple
713 // users. Duplicating the select may be profitable for division.
714 SelectInst *Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 0), Cast&: CastOp);
715 if (!Sel || !Sel->hasOneUse()) {
716 SelOpNo = 1;
717 Sel = findSelectThroughCast(V: BO.getOperand(i_nocapture: 1), Cast&: CastOp);
718 }
719
720 if (!Sel || !Sel->hasOneUse())
721 return false;
722
723 Constant *CT = dyn_cast<Constant>(Val: Sel->getTrueValue());
724 Constant *CF = dyn_cast<Constant>(Val: Sel->getFalseValue());
725 Constant *CBO = dyn_cast<Constant>(Val: BO.getOperand(i_nocapture: SelOpNo ^ 1));
726 if (!CBO || !CT || !CF)
727 return false;
728
729 if (CastOp) {
730 if (!CastOp->hasOneUse())
731 return false;
732 CT = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CT, DestTy: BO.getType(), DL: *DL);
733 CF = ConstantFoldCastOperand(Opcode: CastOp->getOpcode(), C: CF, DestTy: BO.getType(), DL: *DL);
734 }
735
736 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
737 // need to handle divisions here.
738 Constant *FoldedT = SelOpNo ?
739 ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CT, DL: *DL) :
740 ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CT, RHS: CBO, DL: *DL);
741 if (!FoldedT || isa<ConstantExpr>(Val: FoldedT))
742 return false;
743
744 Constant *FoldedF = SelOpNo ?
745 ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CBO, RHS: CF, DL: *DL) :
746 ConstantFoldBinaryOpOperands(Opcode: BO.getOpcode(), LHS: CF, RHS: CBO, DL: *DL);
747 if (!FoldedF || isa<ConstantExpr>(Val: FoldedF))
748 return false;
749
750 IRBuilder<> Builder(&BO);
751 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
752 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Val: &BO))
753 Builder.setFastMathFlags(FPOp->getFastMathFlags());
754
755 Value *NewSelect = Builder.CreateSelect(C: Sel->getCondition(),
756 True: FoldedT, False: FoldedF);
757 NewSelect->takeName(V: &BO);
758 BO.replaceAllUsesWith(V: NewSelect);
759 BO.eraseFromParent();
760 if (CastOp)
761 CastOp->eraseFromParent();
762 Sel->eraseFromParent();
763 return true;
764}
765
766std::pair<Value *, Value *>
767AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
768 Value *Src) const {
769 Type *Ty = Src->getType();
770 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
771 {Ty, Builder.getInt32Ty()}, Src);
772 Value *FrexpMant = Builder.CreateExtractValue(Agg: Frexp, Idxs: {0});
773
774 // Bypass the bug workaround for the exponent result since it doesn't matter.
775 // TODO: Does the bug workaround even really need to consider the exponent
776 // result? It's unspecified by the spec.
777
778 Value *FrexpExp =
779 ST->hasFractBug()
780 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
781 {Builder.getInt32Ty(), Ty}, Src)
782 : Builder.CreateExtractValue(Agg: Frexp, Idxs: {1});
783 return {FrexpMant, FrexpExp};
784}
785
786/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
787Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
788 Value *Src,
789 bool IsNegative) const {
790 // Same as for 1.0, but expand the sign out of the constant.
791 // -1.0 / x -> rcp (fneg x)
792 if (IsNegative)
793 Src = Builder.CreateFNeg(V: Src);
794
795 // The rcp instruction doesn't support denormals, so scale the input
796 // out of the denormal range and convert at the end.
797 //
798 // Expand as 2^-n * (1.0 / (x * 2^n))
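  //
  // Roughly the IR emitted (sketch; ignores the fract-bug workaround path for
  // the exponent and the optional fneg of the input):
  //   %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
  //   %mant  = extractvalue { float, i32 } %frexp, 0
  //   %exp   = extractvalue { float, i32 } %frexp, 1
  //   %scale = sub i32 0, %exp
  //   %rcp   = call float @llvm.amdgcn.rcp.f32(float %mant)
  //   %res   = call float @llvm.ldexp.f32.i32(float %rcp, i32 %scale)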
799
800 // TODO: Skip scaling if input is known never denormal and the input
801 // range won't underflow to denormal. The hard part is knowing the
802 // result. We need a range check, the result could be denormal for
803 // 0x1p+126 < den <= 0x1p+127.
804 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
805 Value *ScaleFactor = Builder.CreateNeg(V: FrexpExp);
806 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: FrexpMant);
807 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Rcp, ScaleFactor});
808}
809
810/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
811Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
812 Value *RHS,
813 FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than using
  // the fdiv.fast expansion. The full safe expansion is faster if we have fast
  // FMA.
817 if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
818 (!FMF.noNaNs() || !FMF.noInfs()))
819 return nullptr;
820
821 // We're scaling the LHS to avoid a denormal input, and scale the denominator
822 // to avoid large values underflowing the result.
823 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, Src: RHS);
824
825 Value *Rcp =
826 Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: FrexpMantRHS);
827
828 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, Src: LHS);
829 Value *Mul = Builder.CreateFMul(L: FrexpMantLHS, R: Rcp);
830
831 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
832 // result.
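  //
  // That is, with lhs = ml * 2^el and rhs = mr * 2^er (mantissas in [0.5, 1)):
  //   lhs / rhs = (ml / mr) * 2^(el - er) = ldexp(ml * rcp(mr), el - er)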
833 Value *ExpDiff = Builder.CreateSub(LHS: FrexpExpLHS, RHS: FrexpExpRHS);
834 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Mul, ExpDiff});
835}
836
837/// Emit a sqrt that handles denormals and is accurate to 2ulp.
838Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
839 Value *Src,
840 FastMathFlags FMF) const {
841 Type *Ty = Src->getType();
842 APFloat SmallestNormal =
843 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
844 Value *NeedScale =
845 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
846
847 ConstantInt *Zero = Builder.getInt32(C: 0);
848 Value *InputScaleFactor =
849 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: 32), False: Zero);
850
851 Value *Scaled = Builder.CreateCall(Callee: getLdexpF32(), Args: {Src, InputScaleFactor});
852
853 Value *Sqrt = Builder.CreateCall(Callee: getSqrtF32(), Args: Scaled);
854
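  // Since sqrt(x * 2^32) == sqrt(x) * 2^16, the input scaling is undone below
  // by ldexp'ing the result with -16.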
855 Value *OutputScaleFactor =
856 Builder.CreateSelect(C: NeedScale, True: Builder.getInt32(C: -16), False: Zero);
857 return Builder.CreateCall(Callee: getLdexpF32(), Args: {Sqrt, OutputScaleFactor});
858}
859
860/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
861static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
862 bool IsNegative) {
863 // bool need_scale = x < 0x1p-126f;
864 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
865 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
866 // rsq(x * input_scale) * output_scale;
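  //
  // (rsq(x * 2^24) == rsq(x) * 2^-12, hence the 2^12 output scale to
  // compensate.)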
867
868 Type *Ty = Src->getType();
869 APFloat SmallestNormal =
870 APFloat::getSmallestNormalized(Sem: Ty->getFltSemantics());
871 Value *NeedScale =
872 Builder.CreateFCmpOLT(LHS: Src, RHS: ConstantFP::get(Ty, V: SmallestNormal));
873 Constant *One = ConstantFP::get(Ty, V: 1.0);
874 Constant *InputScale = ConstantFP::get(Ty, V: 0x1.0p+24);
875 Constant *OutputScale =
876 ConstantFP::get(Ty, V: IsNegative ? -0x1.0p+12 : 0x1.0p+12);
877
878 Value *InputScaleFactor = Builder.CreateSelect(C: NeedScale, True: InputScale, False: One);
879
880 Value *ScaledInput = Builder.CreateFMul(L: Src, R: InputScaleFactor);
881 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rsq, V: ScaledInput);
882 Value *OutputScaleFactor = Builder.CreateSelect(
883 C: NeedScale, True: OutputScale, False: IsNegative ? ConstantFP::get(Ty, V: -1.0) : One);
884
885 return Builder.CreateFMul(L: Rsq, R: OutputScaleFactor);
886}
887
888bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
889 FastMathFlags DivFMF,
890 FastMathFlags SqrtFMF) const {
891 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
892 if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
893 return false;
894
895 // v_rsq_f32 gives 1ulp
896 return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
897 SqrtOp->getFPAccuracy() >= 1.0f;
898}
899
900Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
901 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
902 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
903 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
904 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
905
906 // rsq_f16 is accurate to 0.51 ulp.
907 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
908 // rsq_f64 is never accurate.
909 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num);
910 if (!CLHS)
911 return nullptr;
912
913 assert(Den->getType()->isFloatTy());
914
915 bool IsNegative = false;
916
917 // TODO: Handle other numerator values with arcp.
918 if (CLHS->isExactlyValue(V: 1.0) || (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
919 // Add in the sqrt flags.
920 IRBuilder<>::FastMathFlagGuard Guard(Builder);
921 Builder.setFastMathFlags(DivFMF | SqrtFMF);
922
923 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
924 canIgnoreDenormalInput(V: Den, CtxI)) {
925 Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rsq, V: Den);
926 // -1.0 / sqrt(x) -> fneg(rsq(x))
927 return IsNegative ? Builder.CreateFNeg(V: Result) : Result;
928 }
929
930 return emitRsqIEEE1ULP(Builder, Src: Den, IsNegative);
931 }
932
933 return nullptr;
934}
935
936// Optimize fdiv with rcp:
937//
938// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
939// allowed with unsafe-fp-math or afn.
940//
941// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
942Value *
943AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
944 Value *Den, FastMathFlags FMF,
945 const Instruction *CtxI) const {
946 // rcp_f16 is accurate to 0.51 ulp.
947 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
948 // rcp_f64 is never accurate.
949 assert(Den->getType()->isFloatTy());
950
951 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Val: Num)) {
952 bool IsNegative = false;
953 if (CLHS->isExactlyValue(V: 1.0) ||
954 (IsNegative = CLHS->isExactlyValue(V: -1.0))) {
955 Value *Src = Den;
956
957 if (HasFP32DenormalFlush || FMF.approxFunc()) {
958 // -1.0 / x -> 1.0 / fneg(x)
959 if (IsNegative)
960 Src = Builder.CreateFNeg(V: Src);
961
962 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
963 // the CI documentation has a worst case error of 1 ulp.
964 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
965 // to use it as long as we aren't trying to use denormals.
966 //
967 // v_rcp_f16 and v_rsq_f16 DO support denormals.
968
969 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
970 // insert rsq intrinsic here.
971
972 // 1.0 / x -> rcp(x)
973 return Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: Src);
974 }
975
976 // TODO: If the input isn't denormal, and we know the input exponent isn't
977 // big enough to introduce a denormal we can avoid the scaling.
978 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
979 }
980 }
981
982 if (FMF.allowReciprocal()) {
983 // x / y -> x * (1.0 / y)
984
985 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
986 // will never underflow.
987 if (HasFP32DenormalFlush || FMF.approxFunc()) {
988 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::ID: amdgcn_rcp, V: Den);
989 return Builder.CreateFMul(L: Num, R: Recip);
990 }
991
992 Value *Recip = emitRcpIEEE1ULP(Builder, Src: Den, IsNegative: false);
993 return Builder.CreateFMul(L: Num, R: Recip);
994 }
995
996 return nullptr;
997}
998
999// optimize with fdiv.fast:
1000//
1001// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1002//
1003// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1004//
1005// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
1006Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
1007 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
1008 // fdiv.fast can achieve 2.5 ULP accuracy.
1009 if (ReqdAccuracy < 2.5f)
1010 return nullptr;
1011
1012 // Only have fdiv.fast for f32.
1013 assert(Den->getType()->isFloatTy());
1014
1015 bool NumIsOne = false;
1016 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Val: Num)) {
1017 if (CNum->isExactlyValue(V: +1.0) || CNum->isExactlyValue(V: -1.0))
1018 NumIsOne = true;
1019 }
1020
  // fdiv.fast does not support denormals, but it is always fine to use for
  // 1.0/x.
1022 //
1023 // TODO: This works for any value with a specific known exponent range, don't
1024 // just limit to constant 1.
1025 if (!HasFP32DenormalFlush && !NumIsOne)
1026 return nullptr;
1027
1028 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
1029}
1030
1031Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
1032 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
1033 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
1034 float ReqdDivAccuracy) const {
1035 if (RsqOp) {
1036 Value *Rsq =
1037 optimizeWithRsq(Builder, Num, Den: RsqOp, DivFMF, SqrtFMF, CtxI: FDivInst);
1038 if (Rsq)
1039 return Rsq;
1040 }
1041
1042 Value *Rcp = optimizeWithRcp(Builder, Num, Den, FMF: DivFMF, CtxI: FDivInst);
1043 if (Rcp)
1044 return Rcp;
1045
1046 // In the basic case fdiv_fast has the same instruction count as the frexp div
1047 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
1048 // potentially be fused into a user. Also, materialization of the constants
1049 // can be reused for multiple instances.
1050 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdAccuracy: ReqdDivAccuracy);
1051 if (FDivFast)
1052 return FDivFast;
1053
1054 return emitFrexpDiv(Builder, LHS: Num, RHS: Den, FMF: DivFMF);
1055}
1056
// Optimization is performed based on fpmath, fast math flags, and the denormal
// mode to expand fdiv with either rcp or fdiv.fast.
1059//
1060// With rcp:
1061// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
1062// allowed with unsafe-fp-math or afn.
1063//
1064// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
1065//
1066// With fdiv.fast:
1067// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
1068//
1069// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
1070//
1071// NOTE: rcp is the preference in cases that both are legal.
1072bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
1073 if (DisableFDivExpand)
1074 return false;
1075
1076 Type *Ty = FDiv.getType()->getScalarType();
1077 if (!Ty->isFloatTy())
1078 return false;
1079
1080 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
1081 // expansion around them in codegen. f16 is good enough to always use.
1082
1083 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &FDiv);
1084 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
1085 const float ReqdAccuracy = FPOp->getFPAccuracy();
1086
1087 FastMathFlags SqrtFMF;
1088
1089 Value *Num = FDiv.getOperand(i_nocapture: 0);
1090 Value *Den = FDiv.getOperand(i_nocapture: 1);
1091
1092 Value *RsqOp = nullptr;
1093 auto *DenII = dyn_cast<IntrinsicInst>(Val: Den);
1094 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
1095 DenII->hasOneUse()) {
1096 const auto *SqrtOp = cast<FPMathOperator>(Val: DenII);
1097 SqrtFMF = SqrtOp->getFastMathFlags();
1098 if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
1099 RsqOp = SqrtOp->getOperand(i: 0);
1100 }
1101
1102 // Inaccurate rcp is allowed with unsafe-fp-math or afn.
1103 //
1104 // Defer to codegen to handle this.
1105 //
1106 // TODO: Decide on an interpretation for interactions between afn + arcp +
1107 // !fpmath, and make it consistent between here and codegen. For now, defer
1108 // expansion of afn to codegen. The current interpretation is so aggressive we
1109 // don't need any pre-consideration here when we have better information. A
1110 // more conservative interpretation could use handling here.
1111 const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
1112 if (!RsqOp && AllowInaccurateRcp)
1113 return false;
1114
1115 // Defer the correct implementations to codegen.
1116 if (ReqdAccuracy < 1.0f)
1117 return false;
1118
1119 IRBuilder<> Builder(FDiv.getParent(), std::next(x: FDiv.getIterator()));
1120 Builder.setFastMathFlags(DivFMF);
1121 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
1122
1123 SmallVector<Value *, 4> NumVals;
1124 SmallVector<Value *, 4> DenVals;
1125 SmallVector<Value *, 4> RsqDenVals;
1126 extractValues(Builder, Values&: NumVals, V: Num);
1127 extractValues(Builder, Values&: DenVals, V: Den);
1128
1129 if (RsqOp)
1130 extractValues(Builder, Values&: RsqDenVals, V: RsqOp);
1131
1132 SmallVector<Value *, 4> ResultVals(NumVals.size());
1133 for (int I = 0, E = NumVals.size(); I != E; ++I) {
1134 Value *NumElt = NumVals[I];
1135 Value *DenElt = DenVals[I];
1136 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
1137
1138 Value *NewElt =
1139 visitFDivElement(Builder, Num: NumElt, Den: DenElt, DivFMF, SqrtFMF, RsqOp: RsqDenElt,
1140 FDivInst: cast<Instruction>(Val: FPOp), ReqdDivAccuracy: ReqdAccuracy);
1141 if (!NewElt) {
1142 // Keep the original, but scalarized.
1143
1144 // This has the unfortunate side effect of sometimes scalarizing when
1145 // we're not going to do anything.
1146 NewElt = Builder.CreateFDiv(L: NumElt, R: DenElt);
1147 if (auto *NewEltInst = dyn_cast<Instruction>(Val: NewElt))
1148 NewEltInst->copyMetadata(SrcInst: FDiv);
1149 }
1150
1151 ResultVals[I] = NewElt;
1152 }
1153
1154 Value *NewVal = insertValues(Builder, Ty: FDiv.getType(), Values&: ResultVals);
1155
1156 if (NewVal) {
1157 FDiv.replaceAllUsesWith(V: NewVal);
1158 NewVal->takeName(V: &FDiv);
1159 RecursivelyDeleteTriviallyDeadInstructions(V: &FDiv, TLI: TLInfo);
1160 }
1161
1162 return true;
1163}
1164
1165static bool hasUnsafeFPMath(const Function &F) {
1166 Attribute Attr = F.getFnAttribute(Kind: "unsafe-fp-math");
1167 return Attr.getValueAsBool();
1168}
1169
1170static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
1171 Value *LHS, Value *RHS) {
1172 Type *I32Ty = Builder.getInt32Ty();
1173 Type *I64Ty = Builder.getInt64Ty();
1174
1175 Value *LHS_EXT64 = Builder.CreateZExt(V: LHS, DestTy: I64Ty);
1176 Value *RHS_EXT64 = Builder.CreateZExt(V: RHS, DestTy: I64Ty);
1177 Value *MUL64 = Builder.CreateMul(LHS: LHS_EXT64, RHS: RHS_EXT64);
1178 Value *Lo = Builder.CreateTrunc(V: MUL64, DestTy: I32Ty);
1179 Value *Hi = Builder.CreateLShr(LHS: MUL64, RHS: Builder.getInt64(C: 32));
1180 Hi = Builder.CreateTrunc(V: Hi, DestTy: I32Ty);
1181 return std::pair(Lo, Hi);
1182}
1183
1184static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1185 return getMul64(Builder, LHS, RHS).second;
1186}
1187
/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if the
/// first one is insufficient. Returns -1 on failure.
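/// For example, for an i64 sdiv whose operands each have at least 40 known
/// sign bits, this returns 64 - 40 + 1 = 25.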
1191int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1192 Value *Den, unsigned AtLeast,
1193 bool IsSigned) const {
1194 const DataLayout &DL = Mod->getDataLayout();
1195 unsigned LHSSignBits = ComputeNumSignBits(Op: Num, DL, Depth: 0, AC, CxtI: &I);
1196 if (LHSSignBits < AtLeast)
1197 return -1;
1198
1199 unsigned RHSSignBits = ComputeNumSignBits(Op: Den, DL, Depth: 0, AC, CxtI: &I);
1200 if (RHSSignBits < AtLeast)
1201 return -1;
1202
1203 unsigned SignBits = std::min(a: LHSSignBits, b: RHSSignBits);
1204 unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
1205 if (IsSigned)
1206 ++DivBits;
1207 return DivBits;
1208}
1209
// A float's 24-bit significand is enough to accurately represent an integer of
// up to 24 bits, so the division can be done in floating point.
1212Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1213 BinaryOperator &I, Value *Num,
1214 Value *Den, bool IsDiv,
1215 bool IsSigned) const {
1216 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1217 // If Num bits <= 24, assume 0 signbits.
1218 unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
1219 int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
1220 if (DivBits == -1)
1221 return nullptr;
1222 return expandDivRem24Impl(Builder, I, Num, Den, NumBits: DivBits, IsDiv, IsSigned);
1223}
1224
1225Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1226 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1227 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1228 Type *I32Ty = Builder.getInt32Ty();
1229 Num = Builder.CreateTrunc(V: Num, DestTy: I32Ty);
1230 Den = Builder.CreateTrunc(V: Den, DestTy: I32Ty);
1231
1232 Type *F32Ty = Builder.getFloatTy();
1233 ConstantInt *One = Builder.getInt32(C: 1);
1234 Value *JQ = One;
1235
1236 if (IsSigned) {
1237 // char|short jq = ia ^ ib;
1238 JQ = Builder.CreateXor(LHS: Num, RHS: Den);
1239
1240 // jq = jq >> (bitsize - 2)
1241 JQ = Builder.CreateAShr(LHS: JQ, RHS: Builder.getInt32(C: 30));
1242
1243 // jq = jq | 0x1
1244 JQ = Builder.CreateOr(LHS: JQ, RHS: One);
1245 }
1246
1247 // int ia = (int)LHS;
1248 Value *IA = Num;
1249
  // int ib = (int)RHS;
1251 Value *IB = Den;
1252
1253 // float fa = (float)ia;
1254 Value *FA = IsSigned ? Builder.CreateSIToFP(V: IA, DestTy: F32Ty)
1255 : Builder.CreateUIToFP(V: IA, DestTy: F32Ty);
1256
1257 // float fb = (float)ib;
1258 Value *FB = IsSigned ? Builder.CreateSIToFP(V: IB,DestTy: F32Ty)
1259 : Builder.CreateUIToFP(V: IB,DestTy: F32Ty);
1260
1261 Function *RcpDecl = Intrinsic::getDeclaration(M: Mod, Intrinsic::id: amdgcn_rcp,
1262 Tys: Builder.getFloatTy());
1263 Value *RCP = Builder.CreateCall(Callee: RcpDecl, Args: { FB });
1264 Value *FQM = Builder.CreateFMul(L: FA, R: RCP);
1265
1266 // fq = trunc(fqm);
1267 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::ID: trunc, V: FQM);
1268 FQ->copyFastMathFlags(FMF: Builder.getFastMathFlags());
1269
1270 // float fqneg = -fq;
1271 Value *FQNeg = Builder.CreateFNeg(V: FQ);
1272
1273 // float fr = mad(fqneg, fb, fa);
1274 auto FMAD = !ST->hasMadMacF32Insts()
1275 ? Intrinsic::fma
1276 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1277 Value *FR = Builder.CreateIntrinsic(FMAD,
1278 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1279
1280 // int iq = (int)fq;
1281 Value *IQ = IsSigned ? Builder.CreateFPToSI(V: FQ, DestTy: I32Ty)
1282 : Builder.CreateFPToUI(V: FQ, DestTy: I32Ty);
1283
1284 // fr = fabs(fr);
1285 FR = Builder.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: FR, FMFSource: FQ);
1286
1287 // fb = fabs(fb);
1288 FB = Builder.CreateUnaryIntrinsic(Intrinsic::ID: fabs, V: FB, FMFSource: FQ);
1289
1290 // int cv = fr >= fb;
1291 Value *CV = Builder.CreateFCmpOGE(LHS: FR, RHS: FB);
1292
1293 // jq = (cv ? jq : 0);
1294 JQ = Builder.CreateSelect(C: CV, True: JQ, False: Builder.getInt32(C: 0));
1295
1296 // dst = iq + jq;
1297 Value *Div = Builder.CreateAdd(LHS: IQ, RHS: JQ);
1298
1299 Value *Res = Div;
1300 if (!IsDiv) {
1301 // Rem needs compensation, it's easier to recompute it
1302 Value *Rem = Builder.CreateMul(LHS: Div, RHS: Den);
1303 Res = Builder.CreateSub(LHS: Num, RHS: Rem);
1304 }
1305
1306 if (DivBits != 0 && DivBits < 32) {
1307 // Extend in register from the number of bits this divide really is.
1308 if (IsSigned) {
1309 int InRegBits = 32 - DivBits;
1310
1311 Res = Builder.CreateShl(LHS: Res, RHS: InRegBits);
1312 Res = Builder.CreateAShr(LHS: Res, RHS: InRegBits);
1313 } else {
1314 ConstantInt *TruncMask
1315 = Builder.getInt32(C: (UINT64_C(1) << DivBits) - 1);
1316 Res = Builder.CreateAnd(LHS: Res, RHS: TruncMask);
1317 }
1318 }
1319
1320 return Res;
1321}
1322
// Try to recognize special cases for which the DAG will emit better expansions
// than the general expansion we do here.
1325
1326// TODO: It would be better to just directly handle those optimizations here.
1327bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1328 Value *Num,
1329 Value *Den) const {
1330 if (Constant *C = dyn_cast<Constant>(Val: Den)) {
1331 // Arbitrary constants get a better expansion as long as a wider mulhi is
1332 // legal.
1333 if (C->getType()->getScalarSizeInBits() <= 32)
1334 return true;
1335
1336 // TODO: Sdiv check for not exact for some reason.
1337
1338 // If there's no wider mulhi, there's only a better expansion for powers of
1339 // two.
1340 // TODO: Should really know for each vector element.
1341 if (isKnownToBeAPowerOfTwo(V: C, DL: *DL, OrZero: true, Depth: 0, AC, CxtI: &I, DT))
1342 return true;
1343
1344 return false;
1345 }
1346
1347 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Val: Den)) {
1348 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1349 if (BinOpDen->getOpcode() == Instruction::Shl &&
1350 isa<Constant>(Val: BinOpDen->getOperand(i_nocapture: 0)) &&
1351 isKnownToBeAPowerOfTwo(V: BinOpDen->getOperand(i_nocapture: 0), DL: *DL, OrZero: true,
1352 Depth: 0, AC, CxtI: &I, DT)) {
1353 return true;
1354 }
1355 }
1356
1357 return false;
1358}
1359
1360static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
1361 // Check whether the sign can be determined statically.
1362 KnownBits Known = computeKnownBits(V, DL: *DL);
1363 if (Known.isNegative())
1364 return Constant::getAllOnesValue(Ty: V->getType());
1365 if (Known.isNonNegative())
1366 return Constant::getNullValue(Ty: V->getType());
1367 return Builder.CreateAShr(LHS: V, RHS: Builder.getInt32(C: 31));
1368}
1369
1370Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1371 BinaryOperator &I, Value *X,
1372 Value *Y) const {
1373 Instruction::BinaryOps Opc = I.getOpcode();
1374 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1375 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1376
1377 FastMathFlags FMF;
1378 FMF.setFast();
1379 Builder.setFastMathFlags(FMF);
1380
1381 if (divHasSpecialOptimization(I, Num: X, Den: Y))
1382 return nullptr; // Keep it for later optimization.
1383
1384 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1385 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1386
1387 Type *Ty = X->getType();
1388 Type *I32Ty = Builder.getInt32Ty();
1389 Type *F32Ty = Builder.getFloatTy();
1390
1391 if (Ty->getScalarSizeInBits() != 32) {
1392 if (IsSigned) {
1393 X = Builder.CreateSExtOrTrunc(V: X, DestTy: I32Ty);
1394 Y = Builder.CreateSExtOrTrunc(V: Y, DestTy: I32Ty);
1395 } else {
1396 X = Builder.CreateZExtOrTrunc(V: X, DestTy: I32Ty);
1397 Y = Builder.CreateZExtOrTrunc(V: Y, DestTy: I32Ty);
1398 }
1399 }
1400
1401 if (Value *Res = expandDivRem24(Builder, I, Num: X, Den: Y, IsDiv, IsSigned)) {
1402 return IsSigned ? Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty) :
1403 Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1404 }
1405
1406 ConstantInt *Zero = Builder.getInt32(C: 0);
1407 ConstantInt *One = Builder.getInt32(C: 1);
1408
1409 Value *Sign = nullptr;
1410 if (IsSigned) {
1411 Value *SignX = getSign32(V: X, Builder, DL);
1412 Value *SignY = getSign32(V: Y, Builder, DL);
1413 // Remainder sign is the same as LHS
1414 Sign = IsDiv ? Builder.CreateXor(LHS: SignX, RHS: SignY) : SignX;
1415
1416 X = Builder.CreateAdd(LHS: X, RHS: SignX);
1417 Y = Builder.CreateAdd(LHS: Y, RHS: SignY);
1418
1419 X = Builder.CreateXor(LHS: X, RHS: SignX);
1420 Y = Builder.CreateXor(LHS: Y, RHS: SignY);
1421 }
1422
1423 // The algorithm here is based on ideas from "Software Integer Division", Tom
1424 // Rodeheffer, August 2008.
1425 //
1426 // unsigned udiv(unsigned x, unsigned y) {
1427 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1428 // // that this is a lower bound on inv(y), even if some of the calculations
1429 // // round up.
1430 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1431 //
1432 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1433 // // Empirically this is guaranteed to give a "two-y" lower bound on
1434 // // inv(y).
1435 // z += umulh(z, -y * z);
1436 //
1437 // // Quotient/remainder estimate.
1438 // unsigned q = umulh(x, z);
1439 // unsigned r = x - q * y;
1440 //
1441 // // Two rounds of quotient/remainder refinement.
1442 // if (r >= y) {
1443 // ++q;
1444 // r -= y;
1445 // }
1446 // if (r >= y) {
1447 // ++q;
1448 // r -= y;
1449 // }
1450 //
1451 // return q;
1452 // }
1453
1454 // Initial estimate of inv(y).
1455 Value *FloatY = Builder.CreateUIToFP(V: Y, DestTy: F32Ty);
1456 Function *Rcp = Intrinsic::getDeclaration(M: Mod, Intrinsic::id: amdgcn_rcp, Tys: F32Ty);
1457 Value *RcpY = Builder.CreateCall(Callee: Rcp, Args: {FloatY});
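  // 0x4F7FFFFE is the bit pattern of 4294966784.0f, i.e. the
  // "(4294967296.0 - 512.0)" constant from the reference implementation above.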
1458 Constant *Scale = ConstantFP::get(Ty: F32Ty, V: llvm::bit_cast<float>(from: 0x4F7FFFFE));
1459 Value *ScaledY = Builder.CreateFMul(L: RcpY, R: Scale);
1460 Value *Z = Builder.CreateFPToUI(V: ScaledY, DestTy: I32Ty);
1461
1462 // One round of UNR.
1463 Value *NegY = Builder.CreateSub(LHS: Zero, RHS: Y);
1464 Value *NegYZ = Builder.CreateMul(LHS: NegY, RHS: Z);
1465 Z = Builder.CreateAdd(LHS: Z, RHS: getMulHu(Builder, LHS: Z, RHS: NegYZ));
1466
1467 // Quotient/remainder estimate.
1468 Value *Q = getMulHu(Builder, LHS: X, RHS: Z);
1469 Value *R = Builder.CreateSub(LHS: X, RHS: Builder.CreateMul(LHS: Q, RHS: Y));
1470
1471 // First quotient/remainder refinement.
1472 Value *Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1473 if (IsDiv)
1474 Q = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1475 R = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1476
1477 // Second quotient/remainder refinement.
1478 Cond = Builder.CreateICmpUGE(LHS: R, RHS: Y);
1479 Value *Res;
1480 if (IsDiv)
1481 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateAdd(LHS: Q, RHS: One), False: Q);
1482 else
1483 Res = Builder.CreateSelect(C: Cond, True: Builder.CreateSub(LHS: R, RHS: Y), False: R);
1484
1485 if (IsSigned) {
1486 Res = Builder.CreateXor(LHS: Res, RHS: Sign);
1487 Res = Builder.CreateSub(LHS: Res, RHS: Sign);
1488 Res = Builder.CreateSExtOrTrunc(V: Res, DestTy: Ty);
1489 } else {
1490 Res = Builder.CreateZExtOrTrunc(V: Res, DestTy: Ty);
1491 }
1492 return Res;
1493}
1494
1495Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1496 BinaryOperator &I, Value *Num,
1497 Value *Den) const {
1498 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1499 return nullptr; // Keep it for later optimization.
1500
1501 Instruction::BinaryOps Opc = I.getOpcode();
1502
1503 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1504 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1505
1506 int NumDivBits = getDivNumBits(I, Num, Den, AtLeast: 32, IsSigned);
1507 if (NumDivBits == -1)
1508 return nullptr;
1509
1510 Value *Narrowed = nullptr;
1511 if (NumDivBits <= 24) {
1512 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, DivBits: NumDivBits,
1513 IsDiv, IsSigned);
1514 } else if (NumDivBits <= 32) {
1515 Narrowed = expandDivRem32(Builder, I, X: Num, Y: Den);
1516 }
1517
1518 if (Narrowed) {
1519 return IsSigned ? Builder.CreateSExt(V: Narrowed, DestTy: Num->getType()) :
1520 Builder.CreateZExt(V: Narrowed, DestTy: Num->getType());
1521 }
1522
1523 return nullptr;
1524}
1525
1526void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1527 Instruction::BinaryOps Opc = I.getOpcode();
1528 // Do the general expansion.
1529 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1530 expandDivisionUpTo64Bits(Div: &I);
1531 return;
1532 }
1533
1534 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1535 expandRemainderUpTo64Bits(Rem: &I);
1536 return;
1537 }
1538
1539 llvm_unreachable("not a division");
1540}
1541
1542bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1543 if (foldBinOpIntoSelect(BO&: I))
1544 return true;
1545
1546 if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
1547 UA->isUniform(I: &I) && promoteUniformOpToI32(I))
1548 return true;
1549
1550 if (UseMul24Intrin && replaceMulWithMul24(I))
1551 return true;
1552
1553 bool Changed = false;
1554 Instruction::BinaryOps Opc = I.getOpcode();
1555 Type *Ty = I.getType();
1556 Value *NewDiv = nullptr;
1557 unsigned ScalarSize = Ty->getScalarSizeInBits();
1558
1559 SmallVector<BinaryOperator *, 8> Div64ToExpand;
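  // 64-bit divisions that are not rewritten here are collected so that, when
  // ExpandDiv64InIR is set, they can be expanded after the main rewrite (the
  // generic expansion introduces control flow).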
1560
1561 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1562 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1563 ScalarSize <= 64 &&
1564 !DisableIDivExpand) {
1565 Value *Num = I.getOperand(i_nocapture: 0);
1566 Value *Den = I.getOperand(i_nocapture: 1);
1567 IRBuilder<> Builder(&I);
1568 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1569
1570 if (auto *VT = dyn_cast<FixedVectorType>(Val: Ty)) {
1571 NewDiv = PoisonValue::get(T: VT);
1572
1573 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1574 Value *NumEltN = Builder.CreateExtractElement(Vec: Num, Idx: N);
1575 Value *DenEltN = Builder.CreateExtractElement(Vec: Den, Idx: N);
1576
1577 Value *NewElt;
1578 if (ScalarSize <= 32) {
1579 NewElt = expandDivRem32(Builder, I, X: NumEltN, Y: DenEltN);
1580 if (!NewElt)
1581 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1582 } else {
1583 // See if this 64-bit division can be shrunk to 32 or 24 bits before
1584 // producing the general expansion.
1585 NewElt = shrinkDivRem64(Builder, I, Num: NumEltN, Den: DenEltN);
1586 if (!NewElt) {
1587 // The general 64-bit expansion introduces control flow and doesn't
1588 // return the new value. Just insert a scalar copy and defer
1589 // expanding it.
1590 NewElt = Builder.CreateBinOp(Opc, LHS: NumEltN, RHS: DenEltN);
1591 Div64ToExpand.push_back(Elt: cast<BinaryOperator>(Val: NewElt));
1592 }
1593 }
1594
1595 NewDiv = Builder.CreateInsertElement(Vec: NewDiv, NewElt, Idx: N);
1596 }
1597 } else {
1598 if (ScalarSize <= 32)
1599 NewDiv = expandDivRem32(Builder, I, X: Num, Y: Den);
1600 else {
1601 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1602 if (!NewDiv)
1603 Div64ToExpand.push_back(Elt: &I);
1604 }
1605 }
1606
1607 if (NewDiv) {
1608 I.replaceAllUsesWith(V: NewDiv);
1609 I.eraseFromParent();
1610 Changed = true;
1611 }
1612 }
1613
1614 if (ExpandDiv64InIR) {
1615 // TODO: We get much worse code in specially handled constant cases.
1616 for (BinaryOperator *Div : Div64ToExpand) {
1617 expandDivRem64(I&: *Div);
1618 FlowChanged = true;
1619 Changed = true;
1620 }
1621 }
1622
1623 return Changed;
1624}
1625
1626bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1627 if (!WidenLoads)
1628 return false;
1629
1630 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1631 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1632 canWidenScalarExtLoad(I)) {
1633 IRBuilder<> Builder(&I);
1634 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1635
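    // Widen the sub-dword load to a full 32-bit load; the original narrow
    // value is recovered below with a trunc + bitcast.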
1636 Type *I32Ty = Builder.getInt32Ty();
1637 LoadInst *WidenLoad = Builder.CreateLoad(Ty: I32Ty, Ptr: I.getPointerOperand());
1638 WidenLoad->copyMetadata(SrcInst: I);
1639
1640 // If we have range metadata, we need to convert the type, and not make
1641 // assumptions about the high bits.
1642 if (auto *Range = WidenLoad->getMetadata(KindID: LLVMContext::MD_range)) {
1643 ConstantInt *Lower =
1644 mdconst::extract<ConstantInt>(MD: Range->getOperand(I: 0));
1645
1646 if (Lower->isNullValue()) {
1647 WidenLoad->setMetadata(KindID: LLVMContext::MD_range, Node: nullptr);
1648 } else {
1649 Metadata *LowAndHigh[] = {
1650 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: Lower->getValue().zext(width: 32))),
1651 // Don't make assumptions about the high bits.
1652 ConstantAsMetadata::get(C: ConstantInt::get(Ty: I32Ty, V: 0))
1653 };
1654
1655 WidenLoad->setMetadata(KindID: LLVMContext::MD_range,
1656 Node: MDNode::get(Context&: Mod->getContext(), MDs: LowAndHigh));
1657 }
1658 }
1659
1660 int TySize = Mod->getDataLayout().getTypeSizeInBits(Ty: I.getType());
1661 Type *IntNTy = Builder.getIntNTy(N: TySize);
1662 Value *ValTrunc = Builder.CreateTrunc(V: WidenLoad, DestTy: IntNTy);
1663 Value *ValOrig = Builder.CreateBitCast(V: ValTrunc, DestTy: I.getType());
1664 I.replaceAllUsesWith(V: ValOrig);
1665 I.eraseFromParent();
1666 return true;
1667 }
1668
1669 return false;
1670}
1671
1672bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
1673 bool Changed = false;
1674
1675 if (ST->has16BitInsts() && needsPromotionToI32(T: I.getOperand(i_nocapture: 0)->getType()) &&
1676 UA->isUniform(I: &I))
1677 Changed |= promoteUniformOpToI32(I);
1678
1679 return Changed;
1680}
1681
1682bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1683 Value *Cond = I.getCondition();
1684 Value *TrueVal = I.getTrueValue();
1685 Value *FalseVal = I.getFalseValue();
1686 Value *CmpVal;
1687 FCmpInst::Predicate Pred;
1688
1689 if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType())) {
1690 if (UA->isUniform(I: &I))
1691 return promoteUniformOpToI32(I);
1692 return false;
1693 }
1694
1695 // Match fract pattern with nan check.
1696 if (!match(V: Cond, P: m_FCmp(Pred, L: m_Value(V&: CmpVal), R: m_NonNaN())))
1697 return false;
1698
1699 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(Val: &I);
1700 if (!FPOp)
1701 return false;
1702
1703 IRBuilder<> Builder(&I);
1704 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1705
1706 auto *IITrue = dyn_cast<IntrinsicInst>(Val: TrueVal);
1707 auto *IIFalse = dyn_cast<IntrinsicInst>(Val: FalseVal);
1708
1709 Value *Fract = nullptr;
1710 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1711 CmpVal == matchFractPat(I&: *IIFalse)) {
1712 // isnan(x) ? x : fract(x)
1713 Fract = applyFractPat(Builder, FractArg: CmpVal);
1714 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1715 CmpVal == matchFractPat(I&: *IITrue)) {
1716 // !isnan(x) ? fract(x) : x
1717 Fract = applyFractPat(Builder, FractArg: CmpVal);
1718 } else
1719 return false;
1720
1721 Fract->takeName(V: &I);
1722 I.replaceAllUsesWith(V: Fract);
1723 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI: TLInfo);
1724 return true;
1725}
1726
1727static bool areInSameBB(const Value *A, const Value *B) {
1728 const auto *IA = dyn_cast<Instruction>(Val: A);
1729 const auto *IB = dyn_cast<Instruction>(Val: B);
1730 return IA && IB && IA->getParent() == IB->getParent();
1731}
1732
1733// Helper for breaking large PHIs that returns true when an extractelement on V
1734// is likely to be folded away by the DAG combiner.
1735static bool isInterestingPHIIncomingValue(const Value *V) {
1736 const auto *FVT = dyn_cast<FixedVectorType>(Val: V->getType());
1737 if (!FVT)
1738 return false;
1739
1740 const Value *CurVal = V;
1741
1742 // Check for insertelements, keeping track of the elements covered.
1743 BitVector EltsCovered(FVT->getNumElements());
1744 while (const auto *IE = dyn_cast<InsertElementInst>(Val: CurVal)) {
1745 const auto *Idx = dyn_cast<ConstantInt>(Val: IE->getOperand(i_nocapture: 2));
1746
1747 // A non-constant or out-of-bounds index -> folding is unlikely.
1748 // The latter is more of a sanity check because canonical IR should just
1749 // have replaced those with poison.
1750 if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
1751 return false;
1752
1753 const auto *VecSrc = IE->getOperand(i_nocapture: 0);
1754
1755 // If the vector source is another instruction, it must be in the same basic
1756 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1757 // unlikely to be able to do anything interesting here.
1758 if (isa<Instruction>(Val: VecSrc) && !areInSameBB(A: VecSrc, B: IE))
1759 return false;
1760
1761 CurVal = VecSrc;
1762 EltsCovered.set(Idx->getSExtValue());
1763
1764 // All elements covered.
1765 if (EltsCovered.all())
1766 return true;
1767 }
1768
1769 // We either didn't find a single insertelement, or the insertelement chain
1770 // ended before all elements were covered. Check for other interesting values.
1771
1772 // Constants are always interesting because we can just constant fold the
1773 // extractelements.
1774 if (isa<Constant>(Val: CurVal))
1775 return true;
1776
1777 // shufflevector is likely to be profitable if either operand is a constant,
1778 // or if either source is in the same block.
1779 // This is because shufflevector is most often lowered as a series of
1780 // insert/extract elements anyway.
1781 if (const auto *SV = dyn_cast<ShuffleVectorInst>(Val: CurVal)) {
1782 return isa<Constant>(Val: SV->getOperand(i_nocapture: 1)) ||
1783 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 0)) ||
1784 areInSameBB(A: SV, B: SV->getOperand(i_nocapture: 1));
1785 }
1786
1787 return false;
1788}
1789
1790static void collectPHINodes(const PHINode &I,
1791 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1792 const auto [It, Inserted] = SeenPHIs.insert(Ptr: &I);
1793 if (!Inserted)
1794 return;
1795
1796 for (const Value *Inc : I.incoming_values()) {
1797 if (const auto *PhiInc = dyn_cast<PHINode>(Val: Inc))
1798 collectPHINodes(I: *PhiInc, SeenPHIs);
1799 }
1800
1801 for (const User *U : I.users()) {
1802 if (const auto *PhiU = dyn_cast<PHINode>(Val: U))
1803 collectPHINodes(I: *PhiU, SeenPHIs);
1804 }
1805}
1806
1807bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1808 // Check in the cache first.
1809 if (const auto It = BreakPhiNodesCache.find(Val: &I);
1810 It != BreakPhiNodesCache.end())
1811 return It->second;
1812
1813 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1814 // recursively consider all its users and incoming values that are also PHI
1815 // nodes. We then make a decision about all of those PHIs at once. Either they
1816 // all get broken up, or none of them do. That way, we avoid cases where a
1817 // single PHI is/is not broken and we end up reforming/exploding a vector
1818 // multiple times, or even worse, doing it in a loop.
1819 SmallPtrSet<const PHINode *, 8> WorkList;
1820 collectPHINodes(I, SeenPHIs&: WorkList);
1821
1822#ifndef NDEBUG
1823 // Check that none of the PHI nodes in the worklist are in the map. If some of
1824 // them are, it means we're not good enough at collecting related PHIs.
1825 for (const PHINode *WLP : WorkList) {
1826 assert(BreakPhiNodesCache.count(WLP) == 0);
1827 }
1828#endif
1829
1830 // To consider a PHI profitable to break, we need to see some interesting
1831 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1832 // must have one to consider all PHIs breakable.
1833 //
1834 // This threshold has been determined through performance testing.
1835 //
1836 // Note that the computation below is equivalent to
1837 //
1838 // (unsigned)ceil((K / 3.0) * 2)
1839 //
1840 // It's simply written this way to avoid mixing integral/FP arithmetic.
1841 const auto Threshold = (alignTo(Value: WorkList.size() * 2, Align: 3) / 3);
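  // For example, a worklist of 5 PHIs gives alignTo(10, 3) / 3 == 4, matching
  // ceil((5 / 3.0) * 2) == 4.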
1842 unsigned NumBreakablePHIs = 0;
1843 bool CanBreak = false;
1844 for (const PHINode *Cur : WorkList) {
1845 // Don't break PHIs that have no interesting incoming values. That is, where
1846 // there is no clear opportunity to fold the "extractelement" instructions
1847 // we would add.
1848 //
1849 // Note: InstCombine does not run after this pass, so we're only interested
1850 // in the foldings that the DAG combiner can do.
1851 if (any_of(Range: Cur->incoming_values(), P: isInterestingPHIIncomingValue)) {
1852 if (++NumBreakablePHIs >= Threshold) {
1853 CanBreak = true;
1854 break;
1855 }
1856 }
1857 }
1858
1859 for (const PHINode *Cur : WorkList)
1860 BreakPhiNodesCache[Cur] = CanBreak;
1861
1862 return CanBreak;
1863}
1864
1865/// Helper class for "break large PHIs" (visitPHINode).
1866///
1867/// This represents a slice of a PHI's incoming value, which is made up of:
1868/// - The type of the slice (Ty)
1869/// - The index in the incoming value's vector where the slice starts (Idx)
1870/// - The number of elements in the slice (NumElts).
1871/// It also keeps track of the NewPHI node inserted for this particular slice.
1872///
1873/// Slice examples:
1874/// <4 x i64> -> Split into four i64 slices.
1875/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1876 /// <5 x i16> -> Split into two <2 x i16> slices + an i16 tail.
1877/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1878class VectorSlice {
1879public:
1880 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1881 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1882
1883 Type *Ty = nullptr;
1884 unsigned Idx = 0;
1885 unsigned NumElts = 0;
1886 PHINode *NewPHI = nullptr;
1887
1888 /// Slice \p Inc according to the information contained within this slice.
1889 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1890 /// pair, it returns the same Sliced value as well.
1891 ///
1892 /// Note this *intentionally* does not return the same value for, say,
1893 /// [%bb.0, %0] & [%bb.1, %0] as:
1894 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
1895 /// the value in bb.1 may not be reachable from bb.0 if it's its
1896 /// predecessor.)
1897 /// - We also want to make our extract instructions as local as possible so
1898 /// the DAG has better chances of folding them out. Duplicating them like
1899 /// that is beneficial in that regard.
1900 ///
1901 /// This is not only a minor optimization to avoid creating duplicate
1902 /// instructions, but also a requirement for correctness. It is not forbidden
1903 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1904 /// returned a new value each time, those previously identical pairs would all
1905 /// have different incoming values (from the same block) and it'd cause a "PHI
1906 /// node has multiple entries for the same basic block with different incoming
1907 /// values!" verifier error.
1908 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1909 Value *&Res = SlicedVals[{BB, Inc}];
1910 if (Res)
1911 return Res;
1912
1913 IRBuilder<> B(BB->getTerminator());
1914 if (Instruction *IncInst = dyn_cast<Instruction>(Val: Inc))
1915 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1916
1917 if (NumElts > 1) {
1918 SmallVector<int, 4> Mask;
1919 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1920 Mask.push_back(Elt: K);
1921 Res = B.CreateShuffleVector(V: Inc, Mask, Name: NewValName);
1922 } else
1923 Res = B.CreateExtractElement(Vec: Inc, Idx, Name: NewValName);
1924
1925 return Res;
1926 }
1927
1928private:
1929 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1930};
1931
1932bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1933 // Break-up fixed-vector PHIs into smaller pieces.
1934 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1935 // its elements, or into 32-bit pieces (for 8/16 bit elts).
1936 //
1937 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1938 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1939 // With large, odd-sized PHIs we may end up needing many `build_vector`
1940 // operations with most elements being "undef". This inhibits a lot of
1941 // optimization opportunities and can result in unreasonably high register
1942 // pressure and the inevitable stack spilling.
1943 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1944 return false;
1945
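  // With the default threshold of 32, a <4 x i32> PHI (128 bits) is broken up,
  // while a <2 x i16> PHI (32 bits) is left alone.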
1946 FixedVectorType *FVT = dyn_cast<FixedVectorType>(Val: I.getType());
1947 if (!FVT || FVT->getNumElements() == 1 ||
1948 DL->getTypeSizeInBits(Ty: FVT) <= BreakLargePHIsThreshold)
1949 return false;
1950
1951 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1952 return false;
1953
1954 std::vector<VectorSlice> Slices;
1955
1956 Type *EltTy = FVT->getElementType();
1957 {
1958 unsigned Idx = 0;
1959 // For 8/16-bit element types, don't scalarize fully; break the vector into as many
1960 // 32-bit slices as we can, and scalarize the tail.
1961 const unsigned EltSize = DL->getTypeSizeInBits(Ty: EltTy);
1962 const unsigned NumElts = FVT->getNumElements();
1963 if (EltSize == 8 || EltSize == 16) {
1964 const unsigned SubVecSize = (32 / EltSize);
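      // e.g. a <5 x i16> PHI gives SubVecSize == 2: two <2 x i16> slices here,
      // plus a scalar i16 tail from the loop below.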
1965 Type *SubVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: SubVecSize);
1966 for (unsigned End = alignDown(Value: NumElts, Align: SubVecSize); Idx < End;
1967 Idx += SubVecSize)
1968 Slices.emplace_back(args&: SubVecTy, args&: Idx, args: SubVecSize);
1969 }
1970
1971 // Scalarize all remaining elements.
1972 for (; Idx < NumElts; ++Idx)
1973 Slices.emplace_back(args&: EltTy, args&: Idx, args: 1);
1974 }
1975
1976 assert(Slices.size() > 1);
1977
1978 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1979 // creating the necessary instruction to extract the relevant slices of each
1980 // incoming value.
1981 IRBuilder<> B(I.getParent());
1982 B.SetCurrentDebugLocation(I.getDebugLoc());
1983
1984 unsigned IncNameSuffix = 0;
1985 for (VectorSlice &S : Slices) {
1986 // We need to reset the builder on each iteration, because getSlicedVal may
1987 // have inserted something into I's BB.
1988 B.SetInsertPoint(I.getParent()->getFirstNonPHI());
1989 S.NewPHI = B.CreatePHI(Ty: S.Ty, NumReservedValues: I.getNumIncomingValues());
1990
1991 for (const auto &[Idx, BB] : enumerate(First: I.blocks())) {
1992 S.NewPHI->addIncoming(V: S.getSlicedVal(BB, Inc: I.getIncomingValue(i: Idx),
1993 NewValName: "largephi.extractslice" +
1994 std::to_string(val: IncNameSuffix++)),
1995 BB);
1996 }
1997 }
1998
1999 // And replace this PHI with a vector of all the previous PHI values.
2000 Value *Vec = PoisonValue::get(T: FVT);
2001 unsigned NameSuffix = 0;
2002 for (VectorSlice &S : Slices) {
2003 const auto ValName = "largephi.insertslice" + std::to_string(val: NameSuffix++);
2004 if (S.NumElts > 1)
2005 Vec =
2006 B.CreateInsertVector(DstType: FVT, SrcVec: Vec, SubVec: S.NewPHI, Idx: B.getInt64(C: S.Idx), Name: ValName);
2007 else
2008 Vec = B.CreateInsertElement(Vec, NewElt: S.NewPHI, Idx: S.Idx, Name: ValName);
2009 }
2010
2011 I.replaceAllUsesWith(V: Vec);
2012 I.eraseFromParent();
2013 return true;
2014}
2015
2016bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2017 switch (I.getIntrinsicID()) {
2018 case Intrinsic::bitreverse:
2019 return visitBitreverseIntrinsicInst(I);
2020 case Intrinsic::minnum:
2021 return visitMinNum(I);
2022 case Intrinsic::sqrt:
2023 return visitSqrt(I);
2024 default:
2025 return false;
2026 }
2027}
2028
2029bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
2030 bool Changed = false;
2031
2032 if (ST->has16BitInsts() && needsPromotionToI32(T: I.getType()) &&
2033 UA->isUniform(I: &I))
2034 Changed |= promoteUniformBitreverseToI32(I);
2035
2036 return Changed;
2037}
2038
2039 /// Match the non-nan fract pattern:
2040 ///   minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2041 ///
2042 /// Only matched if fract is a useful instruction for the subtarget. Does not
2043 /// account for nan handling; a nan check on the input value is still required.
2044Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2045 if (ST->hasFractBug())
2046 return nullptr;
2047
2048 if (I.getIntrinsicID() != Intrinsic::minnum)
2049 return nullptr;
2050
2051 Type *Ty = I.getType();
2052 if (!isLegalFloatingTy(Ty: Ty->getScalarType()))
2053 return nullptr;
2054
2055 Value *Arg0 = I.getArgOperand(i: 0);
2056 Value *Arg1 = I.getArgOperand(i: 1);
2057
2058 const APFloat *C;
2059 if (!match(V: Arg1, P: m_APFloat(Res&: C)))
2060 return nullptr;
2061
2062 APFloat One(1.0);
2063 bool LosesInfo;
2064 One.convert(ToSemantics: C->getSemantics(), RM: APFloat::rmNearestTiesToEven, losesInfo: &LosesInfo);
2065
2066 // Match nextafter(1.0, -1)
2067 One.next(nextDown: true);
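  // For f32 this is 1.0 - 2^-24, the largest float strictly below 1.0.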
2068 if (One != *C)
2069 return nullptr;
2070
2071 Value *FloorSrc;
2072 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2073 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2074 return FloorSrc;
2075 return nullptr;
2076}
2077
2078Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2079 Value *FractArg) {
2080 SmallVector<Value *, 4> FractVals;
2081 extractValues(Builder, Values&: FractVals, V: FractArg);
2082
2083 SmallVector<Value *, 4> ResultVals(FractVals.size());
2084
2085 Type *Ty = FractArg->getType()->getScalarType();
2086 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2087 ResultVals[I] =
2088 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2089 }
2090
2091 return insertValues(Builder, Ty: FractArg->getType(), Values&: ResultVals);
2092}
2093
2094bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
2095 Value *FractArg = matchFractPat(I);
2096 if (!FractArg)
2097 return false;
2098
2099 // Match pattern for fract intrinsic in contexts where the nan check has been
2100 // optimized out (hoping the knowledge that the source can't be a nan wasn't lost).
2101 if (!I.hasNoNaNs() &&
2102 !isKnownNeverNaN(V: FractArg, /*Depth=*/0, SQ: SimplifyQuery(*DL, TLInfo)))
2103 return false;
2104
2105 IRBuilder<> Builder(&I);
2106 FastMathFlags FMF = I.getFastMathFlags();
2107 FMF.setNoNaNs();
2108 Builder.setFastMathFlags(FMF);
2109
2110 Value *Fract = applyFractPat(Builder, FractArg);
2111 Fract->takeName(V: &I);
2112 I.replaceAllUsesWith(V: Fract);
2113
2114 RecursivelyDeleteTriviallyDeadInstructions(V: &I, TLI: TLInfo);
2115 return true;
2116}
2117
2118static bool isOneOrNegOne(const Value *Val) {
2119 const APFloat *C;
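  // |Val| is exactly 1.0 iff the exact log2 of its magnitude is 0.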
2120 return match(V: Val, P: m_APFloat(Res&: C)) && C->getExactLog2Abs() == 0;
2121}
2122
2123// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2124bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2125 Type *Ty = Sqrt.getType()->getScalarType();
2126 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
2127 return false;
2128
2129 const FPMathOperator *FPOp = cast<const FPMathOperator>(Val: &Sqrt);
2130 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2131
2132 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2133 // of fast llvm.sqrt will give the raw instruction anyway.
2134 if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
2135 return false;
2136
2137 const float ReqdAccuracy = FPOp->getFPAccuracy();
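  // getFPAccuracy() reads the !fpmath metadata, which specifies the maximum
  // acceptable error in ULPs (0.0 when the metadata is absent).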
2138
2139 // Defer correctly rounded expansion to codegen.
2140 if (ReqdAccuracy < 1.0f)
2141 return false;
2142
2143 // FIXME: This is an ugly hack for this pass using forward iteration instead
2144 // of reverse. If it worked like a normal combiner, the rsq would form before
2145 // we saw a sqrt call.
2146 auto *FDiv =
2147 dyn_cast_or_null<FPMathOperator>(Val: Sqrt.getUniqueUndroppableUser());
2148 if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
2149 FDiv->getFPAccuracy() >= 1.0f &&
2150 canOptimizeWithRsq(SqrtOp: FPOp, DivFMF: FDiv->getFastMathFlags(), SqrtFMF) &&
2151 // TODO: We should also handle the arcp case for fdivs with a non-1 numerator
2152 isOneOrNegOne(Val: FDiv->getOperand(i: 0)))
2153 return false;
2154
2155 Value *SrcVal = Sqrt.getOperand(i_nocapture: 0);
2156 bool CanTreatAsDAZ = canIgnoreDenormalInput(V: SrcVal, CtxI: &Sqrt);
2157
2158 // The raw instruction is 1 ulp, but the correction for denormal handling
2159 // brings it to 2.
2160 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2161 return false;
2162
2163 IRBuilder<> Builder(&Sqrt);
2164 SmallVector<Value *, 4> SrcVals;
2165 extractValues(Builder, Values&: SrcVals, V: SrcVal);
2166
2167 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2168 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2169 if (CanTreatAsDAZ)
2170 ResultVals[I] = Builder.CreateCall(Callee: getSqrtF32(), Args: SrcVals[I]);
2171 else
2172 ResultVals[I] = emitSqrtIEEE2ULP(Builder, Src: SrcVals[I], FMF: SqrtFMF);
2173 }
2174
2175 Value *NewSqrt = insertValues(Builder, Ty: Sqrt.getType(), Values&: ResultVals);
2176 NewSqrt->takeName(V: &Sqrt);
2177 Sqrt.replaceAllUsesWith(V: NewSqrt);
2178 Sqrt.eraseFromParent();
2179 return true;
2180}
2181
2182bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
2183 Impl.Mod = &M;
2184 Impl.DL = &Impl.Mod->getDataLayout();
2185 Impl.SqrtF32 = nullptr;
2186 Impl.LdexpF32 = nullptr;
2187 return false;
2188}
2189
2190bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2191 if (skipFunction(F))
2192 return false;
2193
2194 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2195 if (!TPC)
2196 return false;
2197
2198 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2199 Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2200 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2201 Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2202 Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2203 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2204 Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
2205 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2206 SIModeRegisterDefaults Mode(F, *Impl.ST);
2207 Impl.HasFP32DenormalFlush =
2208 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2209 return Impl.run(F);
2210}
2211
2212PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2213 FunctionAnalysisManager &FAM) {
2214 AMDGPUCodeGenPrepareImpl Impl;
2215 Impl.Mod = F.getParent();
2216 Impl.DL = &Impl.Mod->getDataLayout();
2217 Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(IR&: F);
2218 Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
2219 Impl.AC = &FAM.getResult<AssumptionAnalysis>(IR&: F);
2220 Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(IR&: F);
2221 Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
2222 Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
2223 SIModeRegisterDefaults Mode(F, *Impl.ST);
2224 Impl.HasFP32DenormalFlush =
2225 Mode.FP32Denormals == DenormalMode::getPreserveSign();
2226 PreservedAnalyses PA = PreservedAnalyses::none();
2227 if (!Impl.FlowChanged)
2228 PA.preserveSet<CFGAnalyses>();
2229 return Impl.run(F) ? PA : PreservedAnalyses::all();
2230}
2231
2232INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2233 "AMDGPU IR optimizations", false, false)
2234INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2235INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2236INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2237INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2238 false, false)
2239
2240char AMDGPUCodeGenPrepare::ID = 0;
2241
2242FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2243 return new AMDGPUCodeGenPrepare();
2244}
2245
