LoopVectorizationLegality.cpp source code [llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp]

1	//===- LoopVectorizationLegality.cpp --------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file provides loop vectorization legality analysis. Original code
10	// resided in LoopVectorize.cpp for a long time.
11	//
12	// At this point, it is implemented as a utility class, not as an analysis
13	// pass. It should be easy to create an analysis pass around it if there
14	// is a need (but D45420 needs to happen first).
15	//
16
17	#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
18	#include "llvm/Analysis/Loads.h"
19	#include "llvm/Analysis/LoopInfo.h"
20	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
21	#include "llvm/Analysis/TargetLibraryInfo.h"
22	#include "llvm/Analysis/TargetTransformInfo.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/Analysis/VectorUtils.h"
25	#include "llvm/IR/IntrinsicInst.h"
26	#include "llvm/IR/PatternMatch.h"
27	#include "llvm/Transforms/Utils/SizeOpts.h"
28	#include "llvm/Transforms/Vectorize/LoopVectorize.h"
29
30	using namespace llvm;
31	using namespace PatternMatch;
32
33	#define LV_NAME "loop-vectorize"
34	#define DEBUG_TYPE LV_NAME
35
36	static cl::opt<bool>
37	EnableIfConversion("enable-if-conversion", cl::init(Val: true), cl::Hidden,
38	cl::desc ("Enable if-conversion during vectorization."));
39
40	static cl::opt<bool>
41	AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(Val: false), cl::Hidden,
42	cl::desc ("Enable recognition of non-constant strided "
43	"pointer induction variables."));
44
45	namespace llvm {
46	cl::opt<bool>
47	HintsAllowReordering("hints-allow-reordering", cl::init(Val: true), cl::Hidden,
48	cl::desc ("Allow enabling loop hints to reorder "
49	"FP operations during vectorization."));
50	}
51
52	// TODO: Move size-based thresholds out of legality checking, make cost based
53	// decisions instead of hard thresholds.
54	static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
55	"vectorize-scev-check-threshold", cl::init(Val: `16`), cl::Hidden,
56	cl::desc ("The maximum number of SCEV checks allowed."));
57
58	static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
59	"pragma-vectorize-scev-check-threshold", cl::init(Val: `128`), cl::Hidden,
60	cl::desc ("The maximum number of SCEV checks allowed with a "
61	"vectorize(enable) pragma"));
62
63	static cl::opt<LoopVectorizeHints::ScalableForceKind>
64	ForceScalableVectorization(
65	"scalable-vectorization", cl::init(Val: LoopVectorizeHints::SK_Unspecified),
66	cl::Hidden,
67	cl::desc ("Control whether the compiler can use scalable vectors to "
68	"vectorize a loop"),
69	cl::values(
70	clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off",
71	"Scalable vectorization is disabled."),
72	clEnumValN(
73	LoopVectorizeHints::SK_PreferScalable, "preferred",
74	"Scalable vectorization is available and favored when the "
75	"cost is inconclusive."),
76	clEnumValN(
77	LoopVectorizeHints::SK_PreferScalable, "on",
78	"Scalable vectorization is available and favored when the "
79	"cost is inconclusive.")));
80
81	/// Maximum vectorization interleave count.
82	static const unsigned MaxInterleaveFactor = `16`;
83
84	namespace llvm {
85
86	bool LoopVectorizeHints::Hint::validate(unsigned Val) {
87	switch (Kind) {
88	case HK_WIDTH:
89	return isPowerOf2_32(Value: Val) && Val <= VectorizerParams::MaxVectorWidth;
90	case HK_INTERLEAVE:
91	return isPowerOf2_32(Value: Val) && Val <= MaxInterleaveFactor;
92	case HK_FORCE:
93	return (Val <= `1`);
94	case HK_ISVECTORIZED:
95	case HK_PREDICATE:
96	case HK_SCALABLE:
97	return (Val == `0` \|\| Val == `1`);
98	}
99	return false;
100	}
101
102	LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
103	bool InterleaveOnlyWhenForced,
104	OptimizationRemarkEmitter &ORE,
105	const TargetTransformInfo *TTI)
106	: Width ("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
107	Interleave ("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
108	Force ("vectorize.enable", FK_Undefined, HK_FORCE),
109	IsVectorized ("isvectorized", `0`, HK_ISVECTORIZED),
110	Predicate ("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
111	Scalable ("vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
112	TheLoop(L), ORE(ORE) {
113	// Populate values with existing loop metadata.
114	getHintsFromMetadata();
115
116	// force-vector-interleave overrides DisableInterleaving.
117	if (VectorizerParams::isInterleaveForced())
118	Interleave.Value = VectorizerParams::VectorizationInterleave;
119
120	// If the metadata doesn't explicitly specify whether to enable scalable
121	// vectorization, then decide based on the following criteria (increasing
122	// level of priority):
123	// - Target default
124	// - Metadata width
125	// - Force option (always overrides)
126	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) {
127	if (TTI)
128	Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable
129	: SK_FixedWidthOnly;
130
131	if (Width.Value)
132	// If the width is set, but the metadata says nothing about the scalable
133	// property, then assume it concerns only a fixed-width UserVF.
134	// If width is not set, the flag takes precedence.
135	Scalable.Value = SK_FixedWidthOnly;
136	}
137
138	// If the flag is set to force any use of scalable vectors, override the loop
139	// hints.
140	if (ForceScalableVectorization.getValue() !=
141	LoopVectorizeHints::SK_Unspecified)
142	Scalable.Value = ForceScalableVectorization.getValue();
143
144	// Scalable vectorization is disabled if no preference is specified.
145	if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified)
146	Scalable.Value = SK_FixedWidthOnly;
147
148	if (IsVectorized.Value != `1`)
149	// If the vectorization width and interleaving count are both 1 then
150	// consider the loop to have been already vectorized because there's
151	// nothing more that we can do.
152	IsVectorized.Value =
153	getWidth() == ElementCount::getFixed(MinVal: `1`) && getInterleave() == `1`;
154	LLVM_DEBUG(if (InterleaveOnlyWhenForced && getInterleave() == `1`) dbgs()
155	<< "LV: Interleaving disabled by the pass manager\n");
156	}
157
158	void LoopVectorizeHints::setAlreadyVectorized() {
159	LLVMContext &Context = TheLoop->getHeader()->getContext();
160
161	MDNode *IsVectorizedMD = MDNode::get(
162	Context,
163	MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized"),
164	ConstantAsMetadata::get(C: ConstantInt::get(Context, V: APInt (`32`, `1`)))});
165	MDNode *LoopID = TheLoop->getLoopID();
166	MDNode *NewLoopID =
167	makePostTransformationMetadata(Context, OrigLoopID: LoopID,
168	RemovePrefixes: {Twine (Prefix(), "vectorize.").str(),
169	Twine (Prefix(), "interleave.").str()},
170	AddAttrs: {IsVectorizedMD});
171	TheLoop->setLoopID(NewLoopID);
172
173	// Update internal cache.
174	IsVectorized.Value = `1`;
175	}
176
177	bool LoopVectorizeHints::allowVectorization(
178	Function F, Loop L, bool VectorizeOnlyWhenForced) const {
179	if (getForce() == LoopVectorizeHints::FK_Disabled) {
180	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
181	emitRemarkWithHints();
182	return false;
183	}
184
185	if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
186	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
187	emitRemarkWithHints();
188	return false;
189	}
190
191	if (getIsVectorized() == `1`) {
192	LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
193	// FIXME: Add interleave.disable metadata. This will allow
194	// vectorize.disable to be used without disabling the pass and errors
195	// to differentiate between disabled vectorization and a width of 1.
196	ORE.emit(RemarkBuilder: [&]() {
197	return OptimizationRemarkAnalysis (vectorizeAnalysisPassName(),
198	"AllDisabled", L->getStartLoc(),
199	L->getHeader())
200	<< "loop not vectorized: vectorization and interleaving are "
201	"explicitly disabled, or the loop has already been "
202	"vectorized";
203	});
204	return false;
205	}
206
207	return true;
208	}
209
210	void LoopVectorizeHints::emitRemarkWithHints() const {
211	using namespace ore;
212
213	ORE.emit(RemarkBuilder: [&]() {
214	if (Force.Value == LoopVectorizeHints::FK_Disabled)
215	return OptimizationRemarkMissed (LV_NAME, "MissedExplicitlyDisabled",
216	TheLoop->getStartLoc(),
217	TheLoop->getHeader())
218	<< "loop not vectorized: vectorization is explicitly disabled";
219	else {
220	OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
221	TheLoop->getStartLoc(), TheLoop->getHeader());
222	R << "loop not vectorized";
223	if (Force.Value == LoopVectorizeHints::FK_Enabled) {
224	R << " (Force=" << NV ("Force", true);
225	if (Width.Value != `0`)
226	R << ", Vector Width=" << NV ("VectorWidth", getWidth());
227	if (getInterleave() != `0`)
228	R << ", Interleave Count=" << NV ("InterleaveCount", getInterleave());
229	R << ")";
230	}
231	return R;
232	}
233	});
234	}
235
236	const char LoopVectorizeHints::vectorizeAnalysisPassName() const* {
237	if (getWidth() == ElementCount::getFixed(MinVal: `1`))
238	return LV_NAME;
239	if (getForce() == LoopVectorizeHints::FK_Disabled)
240	return LV_NAME;
241	if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
242	return LV_NAME;
243	return OptimizationRemarkAnalysis::AlwaysPrint;
244	}
245
246	bool LoopVectorizeHints::allowReordering() const {
247	// Allow the vectorizer to change the order of operations if enabling
248	// loop hints are provided
249	ElementCount EC = getWidth();
250	return HintsAllowReordering &&
251	(getForce() == LoopVectorizeHints::FK_Enabled \|\|
252	EC.getKnownMinValue() > `1`);
253	}
254
255	void LoopVectorizeHints::getHintsFromMetadata() {
256	MDNode *LoopID = TheLoop->getLoopID();
257	if (!LoopID)
258	return;
259
260	// First operand should refer to the loop id itself.
261	assert(LoopID->getNumOperands() > `0` && "requires at least one operand");
262	assert(LoopID->getOperand(`0`) == LoopID && "invalid loop id");
263
264	for (unsigned i = `1`, ie = LoopID->getNumOperands(); i < ie; ++i) {
265	const MDString S = nullptr*;
266	SmallVector<Metadata *, `4`> Args;
267
268	// The expected hint is either a MDString or a MDNode with the first
269	// operand a MDString.
270	if (const MDNode *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i))) {
271	if (!MD \|\| MD->getNumOperands() == `0`)
272	continue;
273	S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
274	for (unsigned i = `1`, ie = MD->getNumOperands(); i < ie; ++i)
275	Args.push_back(Elt: MD->getOperand(I: i));
276	} else {
277	S = dyn_cast<MDString>(Val: LoopID->getOperand(I: i));
278	assert(Args.size() == `0` && "too many arguments for MDString");
279	}
280
281	if (!S)
282	continue;
283
284	// Check if the hint starts with the loop metadata prefix.
285	StringRef Name = S->getString();
286	if (Args.size() == `1`)
287	setHint(Name, Arg: Args [`0`]);
288	}
289	}
290
291	void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
292	if (!Name.starts_with(Prefix: Prefix()))
293	return;
294	Name = Name.substr(Start: Prefix().size(), N: StringRef::npos);
295
296	const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(MD&: Arg);
297	if (!C)
298	return;
299	unsigned Val = C->getZExtValue();
300
301	Hint *Hints[] = {&Width, &Interleave, &Force,
302	&IsVectorized, &Predicate, &Scalable};
303	for (auto *H : Hints) {
304	if (Name == H->Name) {
305	if (H->validate(Val))
306	H->Value = Val;
307	else
308	LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
309	break;
310	}
311	}
312	}
313
314	// Return true if the inner loop \p Lp is uniform with regard to the outer loop
315	// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
316	// executing the inner loop will execute the same iterations). This check is
317	// very constrained for now but it will be relaxed in the future. \p Lp is
318	// considered uniform if it meets all the following conditions:
319	// 1) it has a canonical IV (starting from 0 and with stride 1),
320	// 2) its latch terminator is a conditional branch and,
321	// 3) its latch condition is a compare instruction whose operands are the
322	// canonical IV and an OuterLp invariant.
323	// This check doesn't take into account the uniformity of other conditions not
324	// related to the loop latch because they don't affect the loop uniformity.
325	//
326	// NOTE: We decided to keep all these checks and its associated documentation
327	// together so that we can easily have a picture of the current supported loop
328	// nests. However, some of the current checks don't depend on \p OuterLp and
329	// would be redundantly executed for each \p Lp if we invoked this function for
330	// different candidate outer loops. This is not the case for now because we
331	// don't currently have the infrastructure to evaluate multiple candidate outer
332	// loops and \p OuterLp will be a fixed parameter while we only support explicit
333	// outer loop vectorization. It's also very likely that these checks go away
334	// before introducing the aforementioned infrastructure. However, if this is not
335	// the case, we should move the \p OuterLp independent checks to a separate
336	// function that is only executed once for each \p Lp.
337	static bool isUniformLoop(Loop Lp, Loop OuterLp) {
338	assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
339
340	// If Lp is the outer loop, it's uniform by definition.
341	if (Lp == OuterLp)
342	return true;
343	assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
344
345	// 1.
346	PHINode *IV = Lp->getCanonicalInductionVariable();
347	if (!IV) {
348	LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
349	return false;
350	}
351
352	// 2.
353	BasicBlock *Latch = Lp->getLoopLatch();
354	auto *LatchBr = dyn_cast<BranchInst>(Val: Latch->getTerminator());
355	if (!LatchBr \|\| LatchBr->isUnconditional()) {
356	LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
357	return false;
358	}
359
360	// 3.
361	auto *LatchCmp = dyn_cast<CmpInst>(Val: LatchBr->getCondition());
362	if (!LatchCmp) {
363	LLVM_DEBUG(
364	dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
365	return false;
366	}
367
368	Value *CondOp0 = LatchCmp->getOperand(i_nocapture: `0`);
369	Value *CondOp1 = LatchCmp->getOperand(i_nocapture: `1`);
370	Value *IVUpdate = IV->getIncomingValueForBlock(BB: Latch);
371	if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp1)) &&
372	!(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(V: CondOp0))) {
373	LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
374	return false;
375	}
376
377	return true;
378	}
379
380	// Return true if \p Lp and all its nested loops are uniform with regard to \p
381	// OuterLp.
382	static bool isUniformLoopNest(Loop Lp, Loop OuterLp) {
383	if (!isUniformLoop(Lp, OuterLp))
384	return false;
385
386	// Check if nested loops are uniform.
387	for (Loop SubLp : Lp)
388	if (!isUniformLoopNest(Lp: SubLp, OuterLp))
389	return false;
390
391	return true;
392	}
393
394	static Type convertPointerToIntegerType(const* DataLayout &DL, Type *Ty) {
395	if (Ty->isPointerTy())
396	return DL.getIntPtrType(Ty);
397
398	// It is possible that char's or short's overflow when we ask for the loop's
399	// trip count, work around this by changing the type size.
400	if (Ty->getScalarSizeInBits() < `32`)
401	return Type::getInt32Ty(C&: Ty->getContext());
402
403	return Ty;
404	}
405
406	static Type getWiderType(const* DataLayout &DL, Type Ty0, Type Ty1) {
407	Ty0 = convertPointerToIntegerType(DL, Ty: Ty0);
408	Ty1 = convertPointerToIntegerType(DL, Ty: Ty1);
409	if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
410	return Ty0;
411	return Ty1;
412	}
413
414	/// Check that the instruction has outside loop users and is not an
415	/// identified reduction variable.
416	static bool hasOutsideLoopUser(const Loop TheLoop, Instruction Inst,
417	SmallPtrSetImpl<Value *> &AllowedExit) {
418	// Reductions, Inductions and non-header phis are allowed to have exit users. All
419	// other instructions must not have external users.
420	if (!AllowedExit.count(Ptr: Inst))
421	// Check that all of the users of the loop are inside the BB.
422	for (User *U : Inst->users()) {
423	Instruction *UI = cast<Instruction>(Val: U);
424	// This user may be a reduction exit value.
425	if (!TheLoop->contains(Inst: UI)) {
426	LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << `'\n'`);
427	return true;
428	}
429	}
430	return false;
431	}
432
433	/// Returns true if A and B have same pointer operands or same SCEVs addresses
434	static bool storeToSameAddress(ScalarEvolution SE, StoreInst A,
435	StoreInst *B) {
436	// Compare store
437	if (A == B)
438	return true;
439
440	// Otherwise Compare pointers
441	Value *APtr = A->getPointerOperand();
442	Value *BPtr = B->getPointerOperand();
443	if (APtr == BPtr)
444	return true;
445
446	// Otherwise compare address SCEVs
447	if (SE->getSCEV(V: APtr) == SE->getSCEV(V: BPtr))
448	return true;
449
450	return false;
451	}
452
453	int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
454	Value Ptr) const* {
455	// FIXME: Currently, the set of symbolic strides is sometimes queried before
456	// it's collected. This happens from canVectorizeWithIfConvert, when the
457	// pointer is checked to reference consecutive elements suitable for a
458	// masked access.
459	const auto &Strides =
460	LAI ? LAI->getSymbolicStrides() : DenseMap<Value , const* SCEV *>();
461
462	Function *F = TheLoop->getHeader()->getParent();
463	bool OptForSize = F->hasOptSize() \|\|
464	llvm::shouldOptimizeForSize(BB: TheLoop->getHeader(), PSI, BFI,
465	QueryType: PGSOQueryType::IRPass);
466	bool CanAddPredicate = !OptForSize;
467	int Stride = getPtrStride(PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides,
468	Assume: CanAddPredicate, ShouldCheckWrap: false).value_or(u: `0`);
469	if (Stride == `1` \|\| Stride == -`1`)
470	return Stride;
471	return `0`;
472	}
473
474	bool LoopVectorizationLegality::isInvariant(Value V) const* {
475	return LAI->isInvariant(V);
476	}
477
478	namespace {
479	/// A rewriter to build the SCEVs for each of the VF lanes in the expected
480	/// vectorized loop, which can then be compared to detect their uniformity. This
481	/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
482	/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
483	/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
484	/// uniformity.
485	class SCEVAddRecForUniformityRewriter
486	: public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
487	/// Multiplier to be applied to the step of AddRecs in TheLoop.
488	unsigned StepMultiplier;
489
490	/// Offset to be added to the AddRecs in TheLoop.
491	unsigned Offset;
492
493	/// Loop for which to rewrite AddRecsFor.
494	Loop *TheLoop;
495
496	/// Is any sub-expressions not analyzable w.r.t. uniformity?
497	bool CannotAnalyze = false;
498
499	bool canAnalyze() const { return !CannotAnalyze; }
500
501	public:
502	SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
503	unsigned Offset, Loop *TheLoop)
504	: SCEVRewriteVisitor (SE), StepMultiplier(StepMultiplier), Offset(Offset),
505	TheLoop(TheLoop) {}
506
507	const SCEV visitAddRecExpr(const* SCEVAddRecExpr *Expr) {
508	assert(Expr->getLoop() == TheLoop &&
509	"addrec outside of TheLoop must be invariant and should have been "
510	"handled earlier");
511	// Build a new AddRec by multiplying the step by StepMultiplier and
512	// incrementing the start by Offset step.*
513	Type *Ty = Expr->getType();
514	auto *Step = Expr->getStepRecurrence(SE);
515	if (!SE.isLoopInvariant(S: Step, L: TheLoop)) {
516	CannotAnalyze = true;
517	return Expr;
518	}
519	auto *NewStep = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: StepMultiplier));
520	auto *ScaledOffset = SE.getMulExpr(LHS: Step, RHS: SE.getConstant(Ty, V: Offset));
521	auto *NewStart = SE.getAddExpr(LHS: Expr->getStart(), RHS: ScaledOffset);
522	return SE.getAddRecExpr(Start: NewStart, Step: NewStep, L: TheLoop, Flags: SCEV::FlagAnyWrap);
523	}
524
525	const SCEV visit(const* SCEV *S) {
526	if (CannotAnalyze \|\| SE.isLoopInvariant(S, L: TheLoop))
527	return S;
528	return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
529	}
530
531	const SCEV visitUnknown(const* SCEVUnknown *S) {
532	if (SE.isLoopInvariant(S, L: TheLoop))
533	return S;
534	// The value could vary across iterations.
535	CannotAnalyze = true;
536	return S;
537	}
538
539	const SCEV visitCouldNotCompute(const* SCEVCouldNotCompute *S) {
540	// Could not analyze the expression.
541	CannotAnalyze = true;
542	return S;
543	}
544
545	static const SCEV rewrite(const* SCEV *S, ScalarEvolution &SE,
546	unsigned StepMultiplier, unsigned Offset,
547	Loop *TheLoop) {
548	/// Bail out if the expression does not contain an UDiv expression.
549	/// Uniform values which are not loop invariant require operations to strip
550	/// out the lowest bits. For now just look for UDivs and use it to avoid
551	/// re-writing UDIV-free expressions for other lanes to limit compile time.
552	if (!SCEVExprContains(Root: S,
553	Pred: [](const SCEV S) { return* isa<SCEVUDivExpr>(Val: S); }))
554	return SE.getCouldNotCompute();
555
556	SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
557	TheLoop);
558	const SCEV *Result = Rewriter.visit(S);
559
560	if (Rewriter.canAnalyze())
561	return Result;
562	return SE.getCouldNotCompute();
563	}
564	};
565
566	} // namespace
567
568	bool LoopVectorizationLegality::isUniform(Value V, ElementCount VF) const* {
569	if (isInvariant(V))
570	return true;
571	if (VF.isScalable())
572	return false;
573	if (VF.isScalar())
574	return true;
575
576	// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
577	// never considered uniform.
578	auto *SE = PSE.getSE();
579	if (!SE->isSCEVable(Ty: V->getType()))
580	return false;
581	const SCEV *S = SE->getSCEV(V);
582
583	// Rewrite AddRecs in TheLoop to step by VF and check if the expression for
584	// lane 0 matches the expressions for all other lanes.
585	unsigned FixedVF = VF.getKnownMinValue();
586	const SCEV *FirstLaneExpr =
587	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: `0`, TheLoop);
588	if (isa<SCEVCouldNotCompute>(Val: FirstLaneExpr))
589	return false;
590
591	// Make sure the expressions for lanes FixedVF-1..1 match the expression for
592	// lane 0. We check lanes in reverse order for compile-time, as frequently
593	// checking the last lane is sufficient to rule out uniformity.
594	return all_of(Range: reverse(C: seq<unsigned>(Begin: `1`, End: FixedVF)), P: [&](unsigned I) {
595	const SCEV *IthLaneExpr =
596	SCEVAddRecForUniformityRewriter::rewrite(S, SE&: *SE, StepMultiplier: FixedVF, Offset: I, TheLoop);
597	return FirstLaneExpr == IthLaneExpr;
598	});
599	}
600
601	bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
602	ElementCount VF) const {
603	Value *Ptr = getLoadStorePointerOperand(V: &I);
604	if (!Ptr)
605	return false;
606	// Note: There's nothing inherent which prevents predicated loads and
607	// stores from being uniform. The current lowering simply doesn't handle
608	// it; in particular, the cost model distinguishes scatter/gather from
609	// scalar w/predication, and we currently rely on the scalar path.
610	return isUniform(V: Ptr, VF) && !blockNeedsPredication(BB: I.getParent());
611	}
612
613	bool LoopVectorizationLegality::canVectorizeOuterLoop() {
614	assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
615	// Store the result and return it at the end instead of exiting early, in case
616	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
617	bool Result = true;
618	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
619
620	for (BasicBlock *BB : TheLoop->blocks()) {
621	// Check whether the BB terminator is a BranchInst. Any other terminator is
622	// not supported yet.
623	auto *Br = dyn_cast<BranchInst>(Val: BB->getTerminator());
624	if (!Br) {
625	reportVectorizationFailure(DebugMsg: "Unsupported basic block terminator",
626	OREMsg: "loop control flow is not understood by vectorizer",
627	ORETag: "CFGNotUnderstood", ORE, TheLoop);
628	if (DoExtraAnalysis)
629	Result = false;
630	else
631	return false;
632	}
633
634	// Check whether the BranchInst is a supported one. Only unconditional
635	// branches, conditional branches with an outer loop invariant condition or
636	// backedges are supported.
637	// FIXME: We skip these checks when VPlan predication is enabled as we
638	// want to allow divergent branches. This whole check will be removed
639	// once VPlan predication is on by default.
640	if (Br && Br->isConditional() &&
641	!TheLoop->isLoopInvariant(V: Br->getCondition()) &&
642	!LI->isLoopHeader(BB: Br->getSuccessor(i: `0`)) &&
643	!LI->isLoopHeader(BB: Br->getSuccessor(i: `1`))) {
644	reportVectorizationFailure(DebugMsg: "Unsupported conditional branch",
645	OREMsg: "loop control flow is not understood by vectorizer",
646	ORETag: "CFGNotUnderstood", ORE, TheLoop);
647	if (DoExtraAnalysis)
648	Result = false;
649	else
650	return false;
651	}
652	}
653
654	// Check whether inner loops are uniform. At this point, we only support
655	// simple outer loops scenarios with uniform nested loops.
656	if (!isUniformLoopNest(Lp: TheLoop /loop nest/,
657	OuterLp: TheLoop /context outer loop/)) {
658	reportVectorizationFailure(DebugMsg: "Outer loop contains divergent loops",
659	OREMsg: "loop control flow is not understood by vectorizer",
660	ORETag: "CFGNotUnderstood", ORE, TheLoop);
661	if (DoExtraAnalysis)
662	Result = false;
663	else
664	return false;
665	}
666
667	// Check whether we are able to set up outer loop induction.
668	if (!setupOuterLoopInductions()) {
669	reportVectorizationFailure(DebugMsg: "Unsupported outer loop Phi(s)",
670	OREMsg: "Unsupported outer loop Phi(s)",
671	ORETag: "UnsupportedPhi", ORE, TheLoop);
672	if (DoExtraAnalysis)
673	Result = false;
674	else
675	return false;
676	}
677
678	return Result;
679	}
680
681	void LoopVectorizationLegality::addInductionPhi(
682	PHINode Phi, const* InductionDescriptor &ID,
683	SmallPtrSetImpl<Value *> &AllowedExit) {
684	Inductions [Phi] = ID;
685
686	// In case this induction also comes with casts that we know we can ignore
687	// in the vectorized loop body, record them here. All casts could be recorded
688	// here for ignoring, but suffices to record only the first (as it is the
689	// only one that may bw used outside the cast sequence).
690	const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
691	if (!Casts.empty())
692	InductionCastsToIgnore.insert(Ptr: *Casts.begin());
693
694	Type *PhiTy = Phi->getType();
695	const DataLayout &DL = Phi->getModule()->getDataLayout();
696
697	// Get the widest type.
698	if (!PhiTy->isFloatingPointTy()) {
699	if (!WidestIndTy)
700	WidestIndTy = convertPointerToIntegerType(DL, Ty: PhiTy);
701	else
702	WidestIndTy = getWiderType(DL, Ty0: PhiTy, Ty1: WidestIndTy);
703	}
704
705	// Int inductions are special because we only allow one IV.
706	if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
707	ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
708	isa<Constant>(Val: ID.getStartValue()) &&
709	cast<Constant>(Val: ID.getStartValue())->isNullValue()) {
710
711	// Use the phi node with the widest type as induction. Use the last
712	// one if there are multiple (no good reason for doing this other
713	// than it is expedient). We've checked that it begins at zero and
714	// steps by one, so this is a canonical induction variable.
715	if (!PrimaryInduction \|\| PhiTy == WidestIndTy)
716	PrimaryInduction = Phi;
717	}
718
719	// Both the PHI node itself, and the "post-increment" value feeding
720	// back into the PHI node may have external users.
721	// We can allow those uses, except if the SCEVs we have for them rely
722	// on predicates that only hold within the loop, since allowing the exit
723	// currently means re-using this SCEV outside the loop (see PR33706 for more
724	// details).
725	if (PSE.getPredicate().isAlwaysTrue()) {
726	AllowedExit.insert(Ptr: Phi);
727	AllowedExit.insert(Ptr: Phi->getIncomingValueForBlock(BB: TheLoop->getLoopLatch()));
728	}
729
730	LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
731	}
732
733	bool LoopVectorizationLegality::setupOuterLoopInductions() {
734	BasicBlock *Header = TheLoop->getHeader();
735
736	// Returns true if a given Phi is a supported induction.
737	auto isSupportedPhi = [&](PHINode &Phi) -> bool {
738	InductionDescriptor ID;
739	if (InductionDescriptor::isInductionPHI(Phi: &Phi, L: TheLoop, PSE, D&: ID) &&
740	ID.getKind() == InductionDescriptor::IK_IntInduction) {
741	addInductionPhi(Phi: &Phi, ID, AllowedExit);
742	return true;
743	} else {
744	// Bail out for any Phi in the outer loop header that is not a supported
745	// induction.
746	LLVM_DEBUG(
747	dbgs()
748	<< "LV: Found unsupported PHI for outer loop vectorization.\n");
749	return false;
750	}
751	};
752
753	if (llvm::all_of(Range: Header->phis(), P: isSupportedPhi))
754	return true;
755	else
756	return false;
757	}
758
759	/// Checks if a function is scalarizable according to the TLI, in
760	/// the sense that it should be vectorized and then expanded in
761	/// multiple scalar calls. This is represented in the
762	/// TLI via mappings that do not specify a vector name, as in the
763	/// following example:
764	///
765	/// const VecDesc VecIntrinsics[] = {
766	/// {"llvm.phx.abs.i32", "", 4}
767	/// };
768	static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
769	const StringRef ScalarName = CI.getCalledFunction()->getName();
770	bool Scalarize = TLI.isFunctionVectorizable(F: ScalarName);
771	// Check that all known VFs are not associated to a vector
772	// function, i.e. the vector name is emty.
773	if (Scalarize) {
774	ElementCount WidestFixedVF, WidestScalableVF;
775	TLI.getWidestVF(ScalarF: ScalarName, FixedVF&: WidestFixedVF, ScalableVF&: WidestScalableVF);
776	for (ElementCount VF = ElementCount::getFixed(MinVal: `2`);
777	ElementCount::isKnownLE(LHS: VF, RHS: WidestFixedVF); VF *= `2`)
778	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
779	for (ElementCount VF = ElementCount::getScalable(MinVal: `1`);
780	ElementCount::isKnownLE(LHS: VF, RHS: WidestScalableVF); VF *= `2`)
781	Scalarize &= !TLI.isFunctionVectorizable(F: ScalarName, VF);
782	assert((WidestScalableVF.isZero() \|\| !Scalarize) &&
783	"Caller may decide to scalarize a variant using a scalable VF");
784	}
785	return Scalarize;
786	}
787
788	bool LoopVectorizationLegality::canVectorizeInstrs() {
789	BasicBlock *Header = TheLoop->getHeader();
790
791	// For each block in the loop.
792	for (BasicBlock *BB : TheLoop->blocks()) {
793	// Scan the instructions in the block and look for hazards.
794	for (Instruction &I : *BB) {
795	if (auto *Phi = dyn_cast<PHINode>(Val: &I)) {
796	Type *PhiTy = Phi->getType();
797	// Check that this PHI type is allowed.
798	if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
799	!PhiTy->isPointerTy()) {
800	reportVectorizationFailure(DebugMsg: "Found a non-int non-pointer PHI",
801	OREMsg: "loop control flow is not understood by vectorizer",
802	ORETag: "CFGNotUnderstood", ORE, TheLoop);
803	return false;
804	}
805
806	// If this PHINode is not in the header block, then we know that we
807	// can convert it to select during if-conversion. No need to check if
808	// the PHIs in this block are induction or reduction variables.
809	if (BB != Header) {
810	// Non-header phi nodes that have outside uses can be vectorized. Add
811	// them to the list of allowed exits.
812	// Unsafe cyclic dependencies with header phis are identified during
813	// legalization for reduction, induction and fixed order
814	// recurrences.
815	AllowedExit.insert(Ptr: &I);
816	continue;
817	}
818
819	// We only allow if-converted PHIs with exactly two incoming values.
820	if (Phi->getNumIncomingValues() != `2`) {
821	reportVectorizationFailure(DebugMsg: "Found an invalid PHI",
822	OREMsg: "loop control flow is not understood by vectorizer",
823	ORETag: "CFGNotUnderstood", ORE, TheLoop, I: Phi);
824	return false;
825	}
826
827	RecurrenceDescriptor RedDes;
828	if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
829	DT, SE: PSE.getSE())) {
830	Requirements->addExactFPMathInst(I: RedDes.getExactFPMathInst());
831	AllowedExit.insert(Ptr: RedDes.getLoopExitInstr());
832	Reductions [Phi] = RedDes;
833	continue;
834	}
835
836	// We prevent matching non-constant strided pointer IVS to preserve
837	// historical vectorizer behavior after a generalization of the
838	// IVDescriptor code. The intent is to remove this check, but we
839	// have to fix issues around code quality for such loops first.
840	auto isDisallowedStridedPointerInduction =
841	[](const InductionDescriptor &ID) {
842	if (AllowStridedPointerIVs)
843	return false;
844	return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
845	ID.getConstIntStepValue() == nullptr;
846	};
847
848	// TODO: Instead of recording the AllowedExit, it would be good to
849	// record the complementary set: NotAllowedExit. These include (but may
850	// not be limited to):
851	// 1. Reduction phis as they represent the one-before-last value, which
852	// is not available when vectorized
853	// 2. Induction phis and increment when SCEV predicates cannot be used
854	// outside the loop - see addInductionPhi
855	// 3. Non-Phis with outside uses when SCEV predicates cannot be used
856	// outside the loop - see call to hasOutsideLoopUser in the non-phi
857	// handling below
858	// 4. FixedOrderRecurrence phis that can possibly be handled by
859	// extraction.
860	// By recording these, we can then reason about ways to vectorize each
861	// of these NotAllowedExit.
862	InductionDescriptor ID;
863	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID) &&
864	!isDisallowedStridedPointerInduction (ID)) {
865	addInductionPhi(Phi, ID, AllowedExit);
866	Requirements->addExactFPMathInst(I: ID.getExactFPMathInst());
867	continue;
868	}
869
870	if (RecurrenceDescriptor::isFixedOrderRecurrence(Phi, TheLoop, DT)) {
871	AllowedExit.insert(Ptr: Phi);
872	FixedOrderRecurrences.insert(Ptr: Phi);
873	continue;
874	}
875
876	// As a last resort, coerce the PHI to a AddRec expression
877	// and re-try classifying it a an induction PHI.
878	if (InductionDescriptor::isInductionPHI(Phi, L: TheLoop, PSE, D&: ID, Assume: true) &&
879	!isDisallowedStridedPointerInduction (ID)) {
880	addInductionPhi(Phi, ID, AllowedExit);
881	continue;
882	}
883
884	reportVectorizationFailure(DebugMsg: "Found an unidentified PHI",
885	OREMsg: "value that could not be identified as "
886	"reduction is used outside the loop",
887	ORETag: "NonReductionValueUsedOutsideLoop", ORE, TheLoop, I: Phi);
888	return false;
889	} // end of PHI handling
890
891	// We handle calls that:
892	// Are debug info intrinsics.*
893	// Have a mapping to an IR intrinsic.*
894	// Have a vector version available.*
895	auto *CI = dyn_cast<CallInst>(Val: &I);
896
897	if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
898	!isa<DbgInfoIntrinsic>(Val: CI) &&
899	!(CI->getCalledFunction() && TLI &&
900	(!VFDatabase::getMappings(CI: *CI).empty() \|\|
901	isTLIScalarize(TLI: TLI, CI: CI)))) {
902	// If the call is a recognized math libary call, it is likely that
903	// we can vectorize it given loosened floating-point constraints.
904	LibFunc Func;
905	bool IsMathLibCall =
906	TLI && CI->getCalledFunction() &&
907	CI->getType()->isFloatingPointTy() &&
908	TLI->getLibFunc(funcName: CI->getCalledFunction()->getName(), F&: Func) &&
909	TLI->hasOptimizedCodeGen(F: Func);
910
911	if (IsMathLibCall) {
912	// TODO: Ideally, we should not use clang-specific language here,
913	// but it's hard to provide meaningful yet generic advice.
914	// Also, should this be guarded by allowExtraAnalysis() and/or be part
915	// of the returned info from isFunctionVectorizable()?
916	reportVectorizationFailure(
917	DebugMsg: "Found a non-intrinsic callsite",
918	OREMsg: "library call cannot be vectorized. "
919	"Try compiling with -fno-math-errno, -ffast-math, "
920	"or similar flags",
921	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
922	} else {
923	reportVectorizationFailure(DebugMsg: "Found a non-intrinsic callsite",
924	OREMsg: "call instruction cannot be vectorized",
925	ORETag: "CantVectorizeLibcall", ORE, TheLoop, I: CI);
926	}
927	return false;
928	}
929
930	// Some intrinsics have scalar arguments and should be same in order for
931	// them to be vectorized (i.e. loop invariant).
932	if (CI) {
933	auto *SE = PSE.getSE();
934	Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
935	for (unsigned i = `0`, e = CI->arg_size(); i != e; ++i)
936	if (isVectorIntrinsicWithScalarOpAtArg(ID: IntrinID, ScalarOpdIdx: i)) {
937	if (!SE->isLoopInvariant(S: PSE.getSCEV(V: CI->getOperand(i_nocapture: i)), L: TheLoop)) {
938	reportVectorizationFailure(DebugMsg: "Found unvectorizable intrinsic",
939	OREMsg: "intrinsic instruction cannot be vectorized",
940	ORETag: "CantVectorizeIntrinsic", ORE, TheLoop, I: CI);
941	return false;
942	}
943	}
944	}
945
946	// If we found a vectorized variant of a function, note that so LV can
947	// make better decisions about maximum VF.
948	if (CI && !VFDatabase::getMappings(CI: *CI).empty())
949	VecCallVariantsFound = true;
950
951	// Check that the instruction return type is vectorizable.
952	// Also, we can't vectorize extractelement instructions.
953	if ((!VectorType::isValidElementType(ElemTy: I.getType()) &&
954	!I.getType()->isVoidTy()) \|\|
955	isa<ExtractElementInst>(Val: I)) {
956	reportVectorizationFailure(DebugMsg: "Found unvectorizable type",
957	OREMsg: "instruction return type cannot be vectorized",
958	ORETag: "CantVectorizeInstructionReturnType", ORE, TheLoop, I: &I);
959	return false;
960	}
961
962	// Check that the stored type is vectorizable.
963	if (auto *ST = dyn_cast<StoreInst>(Val: &I)) {
964	Type *T = ST->getValueOperand()->getType();
965	if (!VectorType::isValidElementType(ElemTy: T)) {
966	reportVectorizationFailure(DebugMsg: "Store instruction cannot be vectorized",
967	OREMsg: "store instruction cannot be vectorized",
968	ORETag: "CantVectorizeStore", ORE, TheLoop, I: ST);
969	return false;
970	}
971
972	// For nontemporal stores, check that a nontemporal vector version is
973	// supported on the target.
974	if (ST->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
975	// Arbitrarily try a vector of 2 elements.
976	auto VecTy = FixedVectorType::get(ElementType: T, /NumElts=/*`2`);
977	assert(VecTy && "did not find vectorized version of stored type");
978	if (!TTI->isLegalNTStore(DataType: VecTy, Alignment: ST->getAlign())) {
979	reportVectorizationFailure(
980	DebugMsg: "nontemporal store instruction cannot be vectorized",
981	OREMsg: "nontemporal store instruction cannot be vectorized",
982	ORETag: "CantVectorizeNontemporalStore", ORE, TheLoop, I: ST);
983	return false;
984	}
985	}
986
987	} else if (auto *LD = dyn_cast<LoadInst>(Val: &I)) {
988	if (LD->getMetadata(KindID: LLVMContext::MD_nontemporal)) {
989	// For nontemporal loads, check that a nontemporal vector version is
990	// supported on the target (arbitrarily try a vector of 2 elements).
991	auto VecTy = FixedVectorType::get(ElementType: I.getType(), /NumElts=/*`2`);
992	assert(VecTy && "did not find vectorized version of load type");
993	if (!TTI->isLegalNTLoad(DataType: VecTy, Alignment: LD->getAlign())) {
994	reportVectorizationFailure(
995	DebugMsg: "nontemporal load instruction cannot be vectorized",
996	OREMsg: "nontemporal load instruction cannot be vectorized",
997	ORETag: "CantVectorizeNontemporalLoad", ORE, TheLoop, I: LD);
998	return false;
999	}
1000	}
1001
1002	// FP instructions can allow unsafe algebra, thus vectorizable by
1003	// non-IEEE-754 compliant SIMD units.
1004	// This applies to floating-point math operations and calls, not memory
1005	// operations, shuffles, or casts, as they don't change precision or
1006	// semantics.
1007	} else if (I.getType()->isFloatingPointTy() && (CI \|\| I.isBinaryOp()) &&
1008	!I.isFast()) {
1009	LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
1010	Hints->setPotentiallyUnsafe();
1011	}
1012
1013	// Reduction instructions are allowed to have exit users.
1014	// All other instructions must not have external users.
1015	if (hasOutsideLoopUser(TheLoop, Inst: &I, AllowedExit)) {
1016	// We can safely vectorize loops where instructions within the loop are
1017	// used outside the loop only if the SCEV predicates within the loop is
1018	// same as outside the loop. Allowing the exit means reusing the SCEV
1019	// outside the loop.
1020	if (PSE.getPredicate().isAlwaysTrue()) {
1021	AllowedExit.insert(Ptr: &I);
1022	continue;
1023	}
1024	reportVectorizationFailure(DebugMsg: "Value cannot be used outside the loop",
1025	OREMsg: "value cannot be used outside the loop",
1026	ORETag: "ValueUsedOutsideLoop", ORE, TheLoop, I: &I);
1027	return false;
1028	}
1029	} // next instr.
1030	}
1031
1032	if (!PrimaryInduction) {
1033	if (Inductions.empty()) {
1034	reportVectorizationFailure(DebugMsg: "Did not find one integer induction var",
1035	OREMsg: "loop induction variable could not be identified",
1036	ORETag: "NoInductionVariable", ORE, TheLoop);
1037	return false;
1038	} else if (!WidestIndTy) {
1039	reportVectorizationFailure(DebugMsg: "Did not find one integer induction var",
1040	OREMsg: "integer loop induction variable could not be identified",
1041	ORETag: "NoIntegerInductionVariable", ORE, TheLoop);
1042	return false;
1043	} else {
1044	LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
1045	}
1046	}
1047
1048	// Now we know the widest induction type, check if our found induction
1049	// is the same size. If it's not, unset it here and InnerLoopVectorizer
1050	// will create another.
1051	if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
1052	PrimaryInduction = nullptr;
1053
1054	return true;
1055	}
1056
1057	bool LoopVectorizationLegality::canVectorizeMemory() {
1058	LAI = &LAIs.getInfo(L&: *TheLoop);
1059	const OptimizationRemarkAnalysis *LAR = LAI->getReport();
1060	if (LAR) {
1061	ORE->emit(RemarkBuilder: [&]() {
1062	return OptimizationRemarkAnalysis (Hints->vectorizeAnalysisPassName(),
1063	"loop not vectorized: ", *LAR);
1064	});
1065	}
1066
1067	if (!LAI->canVectorizeMemory())
1068	return false;
1069
1070	// We can vectorize stores to invariant address when final reduction value is
1071	// guaranteed to be stored at the end of the loop. Also, if decision to
1072	// vectorize loop is made, runtime checks are added so as to make sure that
1073	// invariant address won't alias with any other objects.
1074	if (!LAI->getStoresToInvariantAddresses().empty()) {
1075	// For each invariant address, check if last stored value is unconditional
1076	// and the address is not calculated inside the loop.
1077	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1078	if (!isInvariantStoreOfReduction(SI))
1079	continue;
1080
1081	if (blockNeedsPredication(BB: SI->getParent())) {
1082	reportVectorizationFailure(
1083	DebugMsg: "We don't allow storing to uniform addresses",
1084	OREMsg: "write of conditional recurring variant value to a loop "
1085	"invariant address could not be vectorized",
1086	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1087	return false;
1088	}
1089
1090	// Invariant address should be defined outside of loop. LICM pass usually
1091	// makes sure it happens, but in rare cases it does not, we do not want
1092	// to overcomplicate vectorization to support this case.
1093	if (Instruction *Ptr = dyn_cast<Instruction>(Val: SI->getPointerOperand())) {
1094	if (TheLoop->contains(Inst: Ptr)) {
1095	reportVectorizationFailure(
1096	DebugMsg: "Invariant address is calculated inside the loop",
1097	OREMsg: "write to a loop invariant address could not "
1098	"be vectorized",
1099	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1100	return false;
1101	}
1102	}
1103	}
1104
1105	if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
1106	// For each invariant address, check its last stored value is the result
1107	// of one of our reductions.
1108	//
1109	// We do not check if dependence with loads exists because they are
1110	// currently rejected earlier in LoopAccessInfo::analyzeLoop. In case this
1111	// behaviour changes we have to modify this code.
1112	ScalarEvolution *SE = PSE.getSE();
1113	SmallVector<StoreInst *, `4`> UnhandledStores;
1114	for (StoreInst *SI : LAI->getStoresToInvariantAddresses()) {
1115	if (isInvariantStoreOfReduction(SI)) {
1116	// Earlier stores to this address are effectively deadcode.
1117	// With opaque pointers it is possible for one pointer to be used with
1118	// different sizes of stored values:
1119	// store i32 0, ptr %x
1120	// store i8 0, ptr %x
1121	// The latest store doesn't complitely overwrite the first one in the
1122	// example. That is why we have to make sure that types of stored
1123	// values are same.
1124	// TODO: Check that bitwidth of unhandled store is smaller then the
1125	// one that overwrites it and add a test.
1126	erase_if(C&: UnhandledStores, P: [SE, SI](StoreInst *I) {
1127	return storeToSameAddress(SE, A: SI, B: I) &&
1128	I->getValueOperand()->getType() ==
1129	SI->getValueOperand()->getType();
1130	});
1131	continue;
1132	}
1133	UnhandledStores.push_back(Elt: SI);
1134	}
1135
1136	bool IsOK = UnhandledStores.empty();
1137	// TODO: we should also validate against InvariantMemSets.
1138	if (!IsOK) {
1139	reportVectorizationFailure(
1140	DebugMsg: "We don't allow storing to uniform addresses",
1141	OREMsg: "write to a loop invariant address could not "
1142	"be vectorized",
1143	ORETag: "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1144	return false;
1145	}
1146	}
1147	}
1148
1149	PSE.addPredicate(Pred: LAI->getPSE().getPredicate());
1150	return true;
1151	}
1152
1153	bool LoopVectorizationLegality::canVectorizeFPMath(
1154	bool EnableStrictReductions) {
1155
1156	// First check if there is any ExactFP math or if we allow reassociations
1157	if (!Requirements->getExactFPInst() \|\| Hints->allowReordering())
1158	return true;
1159
1160	// If the above is false, we have ExactFPMath & do not allow reordering.
1161	// If the EnableStrictReductions flag is set, first check if we have any
1162	// Exact FP induction vars, which we cannot vectorize.
1163	if (!EnableStrictReductions \|\|
1164	any_of(Range: getInductionVars(), P: [&](auto &Induction) -> bool {
1165	InductionDescriptor IndDesc = Induction.second;
1166	return IndDesc.getExactFPMathInst();
1167	}))
1168	return false;
1169
1170	// We can now only vectorize if all reductions with Exact FP math also
1171	// have the isOrdered flag set, which indicates that we can move the
1172	// reduction operations in-loop.
1173	return (all_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1174	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1175	return !RdxDesc.hasExactFPMath() \|\| RdxDesc.isOrdered();
1176	}));
1177	}
1178
1179	bool LoopVectorizationLegality::isInvariantStoreOfReduction(StoreInst *SI) {
1180	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1181	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1182	return RdxDesc.IntermediateStore == SI;
1183	});
1184	}
1185
1186	bool LoopVectorizationLegality::isInvariantAddressOfReduction(Value *V) {
1187	return any_of(Range: getReductionVars(), P: [&](auto &Reduction) -> bool {
1188	const RecurrenceDescriptor &RdxDesc = Reduction.second;
1189	if (!RdxDesc.IntermediateStore)
1190	return false;
1191
1192	ScalarEvolution *SE = PSE.getSE();
1193	Value *InvariantAddress = RdxDesc.IntermediateStore->getPointerOperand();
1194	return V == InvariantAddress \|\|
1195	SE->getSCEV(V) == SE->getSCEV(V: InvariantAddress);
1196	});
1197	}
1198
1199	bool LoopVectorizationLegality::isInductionPhi(const Value V) const* {
1200	Value In0 = const_cast<Value >(V);
1201	PHINode *PN = dyn_cast_or_null<PHINode>(Val: In0);
1202	if (!PN)
1203	return false;
1204
1205	return Inductions.count(Key: PN);
1206	}
1207
1208	const InductionDescriptor *
1209	LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode Phi) const* {
1210	if (!isInductionPhi(V: Phi))
1211	return nullptr;
1212	auto &ID = getInductionVars().find(Key: Phi)->second;
1213	if (ID.getKind() == InductionDescriptor::IK_IntInduction \|\|
1214	ID.getKind() == InductionDescriptor::IK_FpInduction)
1215	return &ID;
1216	return nullptr;
1217	}
1218
1219	const InductionDescriptor *
1220	LoopVectorizationLegality::getPointerInductionDescriptor(PHINode Phi) const* {
1221	if (!isInductionPhi(V: Phi))
1222	return nullptr;
1223	auto &ID = getInductionVars().find(Key: Phi)->second;
1224	if (ID.getKind() == InductionDescriptor::IK_PtrInduction)
1225	return &ID;
1226	return nullptr;
1227	}
1228
1229	bool LoopVectorizationLegality::isCastedInductionVariable(
1230	const Value V) const* {
1231	auto *Inst = dyn_cast<Instruction>(Val: V);
1232	return (Inst && InductionCastsToIgnore.count(Ptr: Inst));
1233	}
1234
1235	bool LoopVectorizationLegality::isInductionVariable(const Value V) const* {
1236	return isInductionPhi(V) \|\| isCastedInductionVariable(V);
1237	}
1238
1239	bool LoopVectorizationLegality::isFixedOrderRecurrence(
1240	const PHINode Phi) const* {
1241	return FixedOrderRecurrences.count(Ptr: Phi);
1242	}
1243
1244	bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock BB) const* {
1245	return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
1246	}
1247
1248	bool LoopVectorizationLegality::blockCanBePredicated(
1249	BasicBlock BB, SmallPtrSetImpl<Value > &SafePtrs,
1250	SmallPtrSetImpl<const Instruction > &MaskedOp) const* {
1251	for (Instruction &I : *BB) {
1252	// We can predicate blocks with calls to assume, as long as we drop them in
1253	// case we flatten the CFG via predication.
1254	if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
1255	MaskedOp.insert(Ptr: &I);
1256	continue;
1257	}
1258
1259	// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
1260	// TODO: there might be cases that it should block the vectorization. Let's
1261	// ignore those for now.
1262	if (isa<NoAliasScopeDeclInst>(Val: &I))
1263	continue;
1264
1265	// We can allow masked calls if there's at least one vector variant, even
1266	// if we end up scalarizing due to the cost model calculations.
1267	// TODO: Allow other calls if they have appropriate attributes... readonly
1268	// and argmemonly?
1269	if (CallInst *CI = dyn_cast<CallInst>(Val: &I))
1270	if (VFDatabase::hasMaskedVariant(CI: *CI)) {
1271	MaskedOp.insert(Ptr: CI);
1272	continue;
1273	}
1274
1275	// Loads are handled via masking (or speculated if safe to do so.)
1276	if (auto *LI = dyn_cast<LoadInst>(Val: &I)) {
1277	if (!SafePtrs.count(Ptr: LI->getPointerOperand()))
1278	MaskedOp.insert(Ptr: LI);
1279	continue;
1280	}
1281
1282	// Predicated store requires some form of masking:
1283	// 1) masked store HW instruction,
1284	// 2) emulation via load-blend-store (only if safe and legal to do so,
1285	// be aware on the race conditions), or
1286	// 3) element-by-element predicate check and scalar store.
1287	if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
1288	MaskedOp.insert(Ptr: SI);
1289	continue;
1290	}
1291
1292	if (I.mayReadFromMemory() \|\| I.mayWriteToMemory() \|\| I.mayThrow())
1293	return false;
1294	}
1295
1296	return true;
1297	}
1298
1299	bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1300	if (!EnableIfConversion) {
1301	reportVectorizationFailure(DebugMsg: "If-conversion is disabled",
1302	OREMsg: "if-conversion is disabled",
1303	ORETag: "IfConversionDisabled",
1304	ORE, TheLoop);
1305	return false;
1306	}
1307
1308	assert(TheLoop->getNumBlocks() > `1` && "Single block loops are vectorizable");
1309
1310	// A list of pointers which are known to be dereferenceable within scope of
1311	// the loop body for each iteration of the loop which executes. That is,
1312	// the memory pointed to can be dereferenced (with the access size implied by
1313	// the value's type) unconditionally within the loop header without
1314	// introducing a new fault.
1315	SmallPtrSet<Value *, `8`> SafePointers;
1316
1317	// Collect safe addresses.
1318	for (BasicBlock *BB : TheLoop->blocks()) {
1319	if (!blockNeedsPredication(BB)) {
1320	for (Instruction &I : *BB)
1321	if (auto *Ptr = getLoadStorePointerOperand(V: &I))
1322	SafePointers.insert(Ptr);
1323	continue;
1324	}
1325
1326	// For a block which requires predication, a address may be safe to access
1327	// in the loop w/o predication if we can prove dereferenceability facts
1328	// sufficient to ensure it'll never fault within the loop. For the moment,
1329	// we restrict this to loads; stores are more complicated due to
1330	// concurrency restrictions.
1331	ScalarEvolution &SE = *PSE.getSE();
1332	for (Instruction &I : *BB) {
1333	LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
1334	if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(LI: *LI) &&
1335	isDereferenceableAndAlignedInLoop(LI, L: TheLoop, SE, DT&: *DT, AC))
1336	SafePointers.insert(Ptr: LI->getPointerOperand());
1337	}
1338	}
1339
1340	// Collect the blocks that need predication.
1341	for (BasicBlock *BB : TheLoop->blocks()) {
1342	// We don't support switch statements inside loops.
1343	if (!isa<BranchInst>(Val: BB->getTerminator())) {
1344	reportVectorizationFailure(DebugMsg: "Loop contains a switch statement",
1345	OREMsg: "loop contains a switch statement",
1346	ORETag: "LoopContainsSwitch", ORE, TheLoop,
1347	I: BB->getTerminator());
1348	return false;
1349	}
1350
1351	// We must be able to predicate all blocks that need to be predicated.
1352	if (blockNeedsPredication(BB) &&
1353	!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp)) {
1354	reportVectorizationFailure(
1355	DebugMsg: "Control flow cannot be substituted for a select",
1356	OREMsg: "control flow cannot be substituted for a select", ORETag: "NoCFGForSelect",
1357	ORE, TheLoop, I: BB->getTerminator());
1358	return false;
1359	}
1360	}
1361
1362	// We can if-convert this loop.
1363	return true;
1364	}
1365
1366	// Helper function to canVectorizeLoopNestCFG.
1367	bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1368	bool UseVPlanNativePath) {
1369	assert((UseVPlanNativePath \|\| Lp->isInnermost()) &&
1370	"VPlan-native path is not enabled.");
1371
1372	// TODO: ORE should be improved to show more accurate information when an
1373	// outer loop can't be vectorized because a nested loop is not understood or
1374	// legal. Something like: "outer_loop_location: loop not vectorized:
1375	// (inner_loop_location) loop control flow is not understood by vectorizer".
1376
1377	// Store the result and return it at the end instead of exiting early, in case
1378	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1379	bool Result = true;
1380	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1381
1382	// We must have a loop in canonical form. Loops with indirectbr in them cannot
1383	// be canonicalized.
1384	if (!Lp->getLoopPreheader()) {
1385	reportVectorizationFailure(DebugMsg: "Loop doesn't have a legal pre-header",
1386	OREMsg: "loop control flow is not understood by vectorizer",
1387	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1388	if (DoExtraAnalysis)
1389	Result = false;
1390	else
1391	return false;
1392	}
1393
1394	// We must have a single backedge.
1395	if (Lp->getNumBackEdges() != `1`) {
1396	reportVectorizationFailure(DebugMsg: "The loop must have a single backedge",
1397	OREMsg: "loop control flow is not understood by vectorizer",
1398	ORETag: "CFGNotUnderstood", ORE, TheLoop);
1399	if (DoExtraAnalysis)
1400	Result = false;
1401	else
1402	return false;
1403	}
1404
1405	return Result;
1406	}
1407
1408	bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1409	Loop Lp, bool* UseVPlanNativePath) {
1410	// Store the result and return it at the end instead of exiting early, in case
1411	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1412	bool Result = true;
1413	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1414	if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1415	if (DoExtraAnalysis)
1416	Result = false;
1417	else
1418	return false;
1419	}
1420
1421	// Recursively check whether the loop control flow of nested loops is
1422	// understood.
1423	for (Loop SubLp : Lp)
1424	if (!canVectorizeLoopNestCFG(Lp: SubLp, UseVPlanNativePath)) {
1425	if (DoExtraAnalysis)
1426	Result = false;
1427	else
1428	return false;
1429	}
1430
1431	return Result;
1432	}
1433
1434	bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1435	// Store the result and return it at the end instead of exiting early, in case
1436	// allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1437	bool Result = true;
1438
1439	bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1440	// Check whether the loop-related control flow in the loop nest is expected by
1441	// vectorizer.
1442	if (!canVectorizeLoopNestCFG(Lp: TheLoop, UseVPlanNativePath)) {
1443	if (DoExtraAnalysis)
1444	Result = false;
1445	else
1446	return false;
1447	}
1448
1449	// We need to have a loop header.
1450	LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1451	<< `'\n'`);
1452
1453	// Specific checks for outer loops. We skip the remaining legal checks at this
1454	// point because they don't support outer loops.
1455	if (!TheLoop->isInnermost()) {
1456	assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1457
1458	if (!canVectorizeOuterLoop()) {
1459	reportVectorizationFailure(DebugMsg: "Unsupported outer loop",
1460	OREMsg: "unsupported outer loop",
1461	ORETag: "UnsupportedOuterLoop",
1462	ORE, TheLoop);
1463	// TODO: Implement DoExtraAnalysis when subsequent legal checks support
1464	// outer loops.
1465	return false;
1466	}
1467
1468	LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1469	return Result;
1470	}
1471
1472	assert(TheLoop->isInnermost() && "Inner loop expected.");
1473	// Check if we can if-convert non-single-bb loops.
1474	unsigned NumBlocks = TheLoop->getNumBlocks();
1475	if (NumBlocks != `1` && !canVectorizeWithIfConvert()) {
1476	LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1477	if (DoExtraAnalysis)
1478	Result = false;
1479	else
1480	return false;
1481	}
1482
1483	// Check if we can vectorize the instructions and CFG in this loop.
1484	if (!canVectorizeInstrs()) {
1485	LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1486	if (DoExtraAnalysis)
1487	Result = false;
1488	else
1489	return false;
1490	}
1491
1492	// Go over each instruction and look at memory deps.
1493	if (!canVectorizeMemory()) {
1494	LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1495	if (DoExtraAnalysis)
1496	Result = false;
1497	else
1498	return false;
1499	}
1500
1501	LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1502	<< (LAI->getRuntimePointerChecking()->Need
1503	? " (with a runtime bound check)"
1504	: "")
1505	<< "!\n");
1506
1507	unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1508	if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1509	SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1510
1511	if (PSE.getPredicate().getComplexity() > SCEVThreshold) {
1512	reportVectorizationFailure(DebugMsg: "Too many SCEV checks needed",
1513	OREMsg: "Too many SCEV assumptions need to be made and checked at runtime",
1514	ORETag: "TooManySCEVRunTimeChecks", ORE, TheLoop);
1515	if (DoExtraAnalysis)
1516	Result = false;
1517	else
1518	return false;
1519	}
1520
1521	// Okay! We've done all the tests. If any have failed, return false. Otherwise
1522	// we can vectorize, and at this point we don't have any other mem analysis
1523	// which may limit our maximum vectorization factor, so just return true with
1524	// no restrictions.
1525	return Result;
1526	}
1527
1528	bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
1529
1530	LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1531
1532	SmallPtrSet<const Value *, `8`> ReductionLiveOuts;
1533
1534	for (const auto &Reduction : getReductionVars())
1535	ReductionLiveOuts.insert(Ptr: Reduction.second.getLoopExitInstr());
1536
1537	// TODO: handle non-reduction outside users when tail is folded by masking.
1538	for (auto *AE : AllowedExit) {
1539	// Check that all users of allowed exit values are inside the loop or
1540	// are the live-out of a reduction.
1541	if (ReductionLiveOuts.count(Ptr: AE))
1542	continue;
1543	for (User *U : AE->users()) {
1544	Instruction *UI = cast<Instruction>(Val: U);
1545	if (TheLoop->contains(Inst: UI))
1546	continue;
1547	LLVM_DEBUG(
1548	dbgs()
1549	<< "LV: Cannot fold tail by masking, loop has an outside user for "
1550	<< *UI << "\n");
1551	return false;
1552	}
1553	}
1554
1555	for (const auto &Entry : getInductionVars()) {
1556	PHINode *OrigPhi = Entry.first;
1557	for (User *U : OrigPhi->users()) {
1558	auto *UI = cast<Instruction>(Val: U);
1559	if (!TheLoop->contains(Inst: UI)) {
1560	LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking, loop IV has an "
1561	"outside user for "
1562	<< *UI << "\n");
1563	return false;
1564	}
1565	}
1566	}
1567
1568	// The list of pointers that we can safely read and write to remains empty.
1569	SmallPtrSet<Value *, `8`> SafePointers;
1570
1571	// Collect masked ops in temporary set first to avoid partially populating
1572	// MaskedOp if a block cannot be predicated.
1573	SmallPtrSet<const Instruction *, `8`> TmpMaskedOp;
1574
1575	// Check and mark all blocks for predication, including those that ordinarily
1576	// do not need predication such as the header block.
1577	for (BasicBlock *BB : TheLoop->blocks()) {
1578	if (!blockCanBePredicated(BB, SafePtrs&: SafePointers, MaskedOp&: TmpMaskedOp)) {
1579	LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
1580	return false;
1581	}
1582	}
1583
1584	LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1585
1586	MaskedOp.insert(I: TmpMaskedOp.begin(), E: TmpMaskedOp.end());
1587	return true;
1588	}
1589
1590	} // namespace llvm
1591

source code of llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp