//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
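//
// For example (an illustrative sketch, not literal compiler output), a loop
//   for (i = 0; i < n; i++) a[i] = b[i] + 1;
// is conceptually rewritten for a vector width of 4 as
//   for (i = 0; i + 3 < n; i += 4) a[i..i+3] = b[i..i+3] + 1;
// with any remaining iterations handled by a scalar epilogue loop or by a
// predicated (tail-folded) vector body.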
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
//  Data for SIMD
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanTransforms.h"
63#include "VPlanVerifier.h"
64#include "llvm/ADT/APInt.h"
65#include "llvm/ADT/ArrayRef.h"
66#include "llvm/ADT/DenseMap.h"
67#include "llvm/ADT/DenseMapInfo.h"
68#include "llvm/ADT/Hashing.h"
69#include "llvm/ADT/MapVector.h"
70#include "llvm/ADT/STLExtras.h"
71#include "llvm/ADT/SmallPtrSet.h"
72#include "llvm/ADT/SmallSet.h"
73#include "llvm/ADT/SmallVector.h"
74#include "llvm/ADT/Statistic.h"
75#include "llvm/ADT/StringRef.h"
76#include "llvm/ADT/Twine.h"
77#include "llvm/ADT/iterator_range.h"
78#include "llvm/Analysis/AssumptionCache.h"
79#include "llvm/Analysis/BasicAliasAnalysis.h"
80#include "llvm/Analysis/BlockFrequencyInfo.h"
81#include "llvm/Analysis/CFG.h"
82#include "llvm/Analysis/CodeMetrics.h"
83#include "llvm/Analysis/DemandedBits.h"
84#include "llvm/Analysis/GlobalsModRef.h"
85#include "llvm/Analysis/LoopAccessAnalysis.h"
86#include "llvm/Analysis/LoopAnalysisManager.h"
87#include "llvm/Analysis/LoopInfo.h"
88#include "llvm/Analysis/LoopIterator.h"
89#include "llvm/Analysis/OptimizationRemarkEmitter.h"
90#include "llvm/Analysis/ProfileSummaryInfo.h"
91#include "llvm/Analysis/ScalarEvolution.h"
92#include "llvm/Analysis/ScalarEvolutionExpressions.h"
93#include "llvm/Analysis/TargetLibraryInfo.h"
94#include "llvm/Analysis/TargetTransformInfo.h"
95#include "llvm/Analysis/ValueTracking.h"
96#include "llvm/Analysis/VectorUtils.h"
97#include "llvm/IR/Attributes.h"
98#include "llvm/IR/BasicBlock.h"
99#include "llvm/IR/CFG.h"
100#include "llvm/IR/Constant.h"
101#include "llvm/IR/Constants.h"
102#include "llvm/IR/DataLayout.h"
103#include "llvm/IR/DebugInfo.h"
104#include "llvm/IR/DebugInfoMetadata.h"
105#include "llvm/IR/DebugLoc.h"
106#include "llvm/IR/DerivedTypes.h"
107#include "llvm/IR/DiagnosticInfo.h"
108#include "llvm/IR/Dominators.h"
109#include "llvm/IR/Function.h"
110#include "llvm/IR/IRBuilder.h"
111#include "llvm/IR/InstrTypes.h"
112#include "llvm/IR/Instruction.h"
113#include "llvm/IR/Instructions.h"
114#include "llvm/IR/IntrinsicInst.h"
115#include "llvm/IR/Intrinsics.h"
116#include "llvm/IR/MDBuilder.h"
117#include "llvm/IR/Metadata.h"
118#include "llvm/IR/Module.h"
119#include "llvm/IR/Operator.h"
120#include "llvm/IR/PatternMatch.h"
121#include "llvm/IR/ProfDataUtils.h"
122#include "llvm/IR/Type.h"
123#include "llvm/IR/Use.h"
124#include "llvm/IR/User.h"
125#include "llvm/IR/Value.h"
126#include "llvm/IR/ValueHandle.h"
127#include "llvm/IR/VectorBuilder.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Compiler.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/LoopSimplify.h"
140#include "llvm/Transforms/Utils/LoopUtils.h"
141#include "llvm/Transforms/Utils/LoopVersioning.h"
142#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143#include "llvm/Transforms/Utils/SizeOpts.h"
144#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145#include <algorithm>
146#include <cassert>
147#include <cmath>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <map>
153#include <memory>
154#include <string>
155#include <tuple>
156#include <utility>
157
158using namespace llvm;
159
160#define LV_NAME "loop-vectorize"
161#define DEBUG_TYPE LV_NAME
162
163#ifndef NDEBUG
164const char VerboseDebug[] = DEBUG_TYPE "-verbose";
165#endif
166
167/// @{
168/// Metadata attribute names
169const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
170const char LLVMLoopVectorizeFollowupVectorized[] =
171 "llvm.loop.vectorize.followup_vectorized";
172const char LLVMLoopVectorizeFollowupEpilogue[] =
173 "llvm.loop.vectorize.followup_epilogue";
174/// @}
175
176STATISTIC(LoopsVectorized, "Number of loops vectorized");
177STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
178STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
179
180static cl::opt<bool> EnableEpilogueVectorization(
181 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
184static cl::opt<unsigned> EpilogueVectorizationForceVF(
185 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
190static cl::opt<unsigned> EpilogueVectorizationMinVF(
191 "epilogue-vectorization-minimum-VF", cl::init(Val: 16), cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
197static cl::opt<unsigned> TinyTripCountVectorThreshold(
198 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
203static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
204 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// available choices. I.e., the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, there are different fallback strategies depending on
// these values:
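//
// For example (illustrative only): with a vectorization factor of 4 and a
// trip count of 10, a tail-folded loop runs 3 masked vector iterations with
// lane masks <1,1,1,1>, <1,1,1,1>, <1,1,0,0>, instead of 2 unmasked vector
// iterations followed by a 2-iteration scalar epilogue.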
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy
219
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
237
238static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(Val: TailFoldingStyle::None),
241 cl::values(
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 clEnumValN(
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
259static cl::opt<bool> MaximizeBandwidth(
260 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
264static cl::opt<bool> EnableInterleavedMemAccesses(
265 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
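///
/// For example (illustrative): a group built from the accesses a[3*i] and
/// a[3*i+1] has interleave factor 3 with a gap at a[3*i+2]; the combined wide
/// access also touches the gap elements, so those lanes may need to be masked
/// away.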
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc(
        "Enable vectorization on masked interleaved memory accesses in a loop"));
273
274static cl::opt<unsigned> ForceTargetNumScalarRegs(
275 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
278static cl::opt<unsigned> ForceTargetNumVectorRegs(
279 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
282static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
283 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
287static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
288 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
292static cl::opt<unsigned> ForceTargetInstructionCost(
293 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
298static cl::opt<bool> ForceTargetSupportsScalableVectors(
299 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
304static cl::opt<unsigned> SmallLoopCost(
305 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
309static cl::opt<bool> LoopVectorizeWithBlockFrequency(
310 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
316static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
317 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
322static cl::opt<unsigned> NumberOfStoresToPredicate(
323 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
326static cl::opt<bool> EnableIndVarRegisterHeur(
327 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
330static cl::opt<bool> EnableCondStoresVectorization(
331 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
334static cl::opt<unsigned> MaxNestedScalarReductionIC(
335 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
341 cl::Hidden,
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
345static cl::opt<bool> ForceOrderedReductions(
346 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
350static cl::opt<bool> PreferPredicatedReductionSelect(
351 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355namespace llvm {
356cl::opt<bool> EnableVPlanNativePath(
357 "enable-vplan-native-path", cl::Hidden,
358 cl::desc("Enable VPlan-native vectorization path with "
359 "support for outer loop vectorization."));
360}
361
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
372
373cl::opt<bool> llvm::EnableLoopInterleaving(
374 "interleave-loops", cl::init(Val: true), cl::Hidden,
375 cl::desc("Enable loop interleaving in Loop vectorization passes"));
376cl::opt<bool> llvm::EnableLoopVectorization(
377 "vectorize-loops", cl::init(Val: true), cl::Hidden,
378 cl::desc("Run the Loop vectorization passes"));
379
380static cl::opt<bool> PrintVPlansInDotFormat(
381 "vplan-print-in-dot-format", cl::Hidden,
382 cl::desc("Use dot format instead of plain text when dumping VPlans"));
383
384static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
385 "force-widen-divrem-via-safe-divisor", cl::Hidden,
386 cl::desc(
387 "Override cost based safe divisor widening for div/rem instructions"));
388
389static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
390 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
391 cl::Hidden,
392 cl::desc("Try wider VFs if they enable the use of vector variants"));
393
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403
/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
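///
/// For example (illustrative): on a typical 64-bit target x86_fp80 is
/// irregular, since it holds 80 bits of data but is allocated 128 bits
/// (96 on some 32-bit ABIs), so consecutive array elements contain padding
/// that the elements of the corresponding vector type would not.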
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
413
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
421
/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns std::nullopt if all of the above failed.
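///
/// For example (illustrative): a loop whose exact trip count is unknown but
/// whose profile data implies roughly 300 iterations returns 300, even if the
/// static upper bound on the trip count is much larger.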
static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
                                                   Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return *EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return std::nullopt;
}
445
/// Return a vector containing interleaved elements from multiple
/// smaller input vectors.
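///
/// For example (illustrative): interleaving two fixed <4 x i32> inputs
/// A = <A0,A1,A2,A3> and B = <B0,B1,B2,B3> yields
/// <A0,B0,A1,B1,A2,B2,A3,B3>, i.e. a shuffle of the concatenated vectors with
/// mask <0,4,1,5,2,6,3,7>.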
static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
                                const Twine &Name) {
  unsigned Factor = Vals.size();
  assert(Factor > 1 && "Tried to interleave invalid number of vectors");

  VectorType *VecTy = cast<VectorType>(Vals[0]->getType());
#ifndef NDEBUG
  for (Value *Val : Vals)
    assert(Val->getType() == VecTy && "Tried to interleave mismatched types");
#endif

  // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
  // must use intrinsics to interleave.
  if (VecTy->isScalableTy()) {
    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
    return Builder.CreateIntrinsic(
        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
        /*FMFSource=*/nullptr, Name);
  }

  // Fixed length. Start by concatenating all vectors into a wide vector.
  Value *WideVec = concatenateVectors(Builder, Vals);

  // Interleave the elements into the wide vector.
  const unsigned NumElts = VecTy->getElementCount().getFixedValue();
  return Builder.CreateShuffleVector(
      WideVec, createInterleaveMask(NumElts, Factor), Name);
}
476
477namespace {
478// Forward declare GeneratedRTChecks.
479class GeneratedRTChecks;
480
481using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
482} // namespace
483
484namespace llvm {
485
486AnalysisKey ShouldRunExtraVectorPasses::Key;
487
488/// InnerLoopVectorizer vectorizes loops which contain only one basic
489/// block to a specified vectorization factor (VF).
490/// This class performs the widening of scalars into vectors, or multiple
491/// scalars. This class also implements the following features:
492/// * It inserts an epilogue loop for handling loops that don't have iteration
493/// counts that are known to be a multiple of the vectorization factor.
494/// * It handles the code generation for reduction variables.
495/// * Scalarization (implementation using scalars) of un-vectorizable
496/// instructions.
497/// InnerLoopVectorizer does not perform any vectorization-legality
498/// checks, and relies on the caller to check for the different legality
499/// aspects. The InnerLoopVectorizer relies on the
500/// LoopVectorizationLegality class to provide information about the induction
501/// and reduction variables that were found to a given vectorization factor.
502class InnerLoopVectorizer {
503public:
504 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
505 LoopInfo *LI, DominatorTree *DT,
506 const TargetLibraryInfo *TLI,
507 const TargetTransformInfo *TTI, AssumptionCache *AC,
508 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
509 ElementCount MinProfitableTripCount,
510 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
511 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
512 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
514 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
515 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
516 PSI(PSI), RTChecks(RTChecks) {
517 // Query this against the original loop and save it here because the profile
518 // of the original loop header may change as the transformation happens.
519 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
520 BB: OrigLoop->getHeader(), PSI, BFI, QueryType: PGSOQueryType::IRPass);
521
522 if (MinProfitableTripCount.isZero())
523 this->MinProfitableTripCount = VecWidth;
524 else
525 this->MinProfitableTripCount = MinProfitableTripCount;
526 }
527
528 virtual ~InnerLoopVectorizer() = default;
529
  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops. \p ExpandedSCEVs is used to
  /// look up SCEV expansions for expressions needed during skeleton creation.
  virtual std::pair<BasicBlock *, Value *>
  createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
541
  /// Fix the vectorized code, taking care of header PHIs, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
544
545 // Return true if any runtime check is added.
546 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
547
548 /// A helper function to scalarize a single Instruction in the innermost loop.
549 /// Generates a sequence of scalar instances for each lane between \p MinLane
550 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
551 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
552 /// Instr's operands.
553 void scalarizeInstruction(const Instruction *Instr,
554 VPReplicateRecipe *RepRecipe,
555 const VPIteration &Instance,
556 VPTransformState &State);
557
558 /// Try to vectorize interleaved access group \p Group with the base address
559 /// given in \p Addr, optionally masking the vector operations if \p
560 /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
561 /// values in the vectorized loop.
562 void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
563 ArrayRef<VPValue *> VPDefs,
564 VPTransformState &State, VPValue *Addr,
565 ArrayRef<VPValue *> StoredValues,
566 VPValue *BlockInMask, bool NeedsMaskForGaps);
567
568 /// Fix the non-induction PHIs in \p Plan.
569 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
570
571 /// Create a new phi node for the induction variable \p OrigPhi to resume
572 /// iteration count in the scalar epilogue, from where the vectorized loop
573 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
574 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
575 /// and the resume values can come from an additional bypass block, the \p
576 /// AdditionalBypass pair provides information about the bypass block and the
577 /// end value on the edge from bypass to this loop.
578 PHINode *createInductionResumeValue(
579 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
580 ArrayRef<BasicBlock *> BypassBlocks,
581 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
582
583 /// Returns the original loop trip count.
584 Value *getTripCount() const { return TripCount; }
585
586 /// Used to set the trip count after ILV's construction and after the
587 /// preheader block has been executed. Note that this always holds the trip
588 /// count of the original loop for both main loop and epilogue vectorization.
589 void setTripCount(Value *TC) { TripCount = TC; }
590
591protected:
592 friend class LoopVectorizationPlanner;
593
  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
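  ///
  /// For example (illustrative): with UF = 2 and VF = 4, one scalarized value
  /// from the original loop is represented by 8 scalar Values, laid out as
  /// Parts[0][0..3] for the first unrolled copy and Parts[1][0..3] for the
  /// second.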
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
602
603 /// Set up the values of the IVs correctly when exiting the vector loop.
604 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
605 Value *VectorTripCount, Value *EndValue,
606 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
607 VPlan &Plan, VPTransformState &State);
608
609 /// Create the exit value of first order recurrences in the middle block and
610 /// update their users.
611 void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
612 VPTransformState &State);
613
614 /// Iteratively sink the scalarized operands of a predicated instruction into
615 /// the block that was created for it.
616 void sinkScalarOperands(Instruction *PredInst);
617
618 /// Returns (and creates if needed) the trip count of the widened loop.
619 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
620
621 /// Returns a bitcasted value to the requested vector type.
622 /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
623 Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
624 const DataLayout &DL);
625
626 /// Emit a bypass check to see if the vector trip count is zero, including if
627 /// it overflows.
628 void emitIterationCountCheck(BasicBlock *Bypass);
629
630 /// Emit a bypass check to see if all of the SCEV assumptions we've
631 /// had to make are correct. Returns the block containing the checks or
632 /// nullptr if no checks have been added.
633 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
634
635 /// Emit bypass checks to check any memory assumptions we may have made.
636 /// Returns the block containing the checks or nullptr if no checks have been
637 /// added.
638 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
639
640 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
641 /// vector loop preheader, middle block and scalar preheader.
642 void createVectorLoopSkeleton(StringRef Prefix);
643
  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
650 void createInductionResumeValues(
651 const SCEV2ValueTy &ExpandedSCEVs,
652 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
653
654 /// Complete the loop skeleton by adding debug MDs, creating appropriate
655 /// conditional branches in the middle block, preparing the builder and
656 /// running the verifier. Return the preheader of the completed vector loop.
657 BasicBlock *completeLoopSkeleton();
658
659 /// Allow subclasses to override and print debug traces before/after vplan
660 /// execution, when trace information is requested.
661 virtual void printDebugTracesAtStart(){};
662 virtual void printDebugTracesAtEnd(){};
663
664 /// The original loop.
665 Loop *OrigLoop;
666
667 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
668 /// dynamic knowledge to simplify SCEV expressions and converts them to a
669 /// more usable form.
670 PredicatedScalarEvolution &PSE;
671
672 /// Loop Info.
673 LoopInfo *LI;
674
675 /// Dominator Tree.
676 DominatorTree *DT;
677
678 /// Target Library Info.
679 const TargetLibraryInfo *TLI;
680
681 /// Target Transform Info.
682 const TargetTransformInfo *TTI;
683
684 /// Assumption Cache.
685 AssumptionCache *AC;
686
687 /// Interface to emit optimization remarks.
688 OptimizationRemarkEmitter *ORE;
689
690 /// The vectorization SIMD factor to use. Each vector will have this many
691 /// vector elements.
692 ElementCount VF;
693
694 ElementCount MinProfitableTripCount;
695
696 /// The vectorization unroll factor to use. Each scalar is vectorized to this
697 /// many different vector instructions.
698 unsigned UF;
699
700 /// The builder that we use
701 IRBuilder<> Builder;
702
703 // --- Vectorization state ---
704
705 /// The vector-loop preheader.
706 BasicBlock *LoopVectorPreHeader;
707
708 /// The scalar-loop preheader.
709 BasicBlock *LoopScalarPreHeader;
710
711 /// Middle Block between the vector and the scalar.
712 BasicBlock *LoopMiddleBlock;
713
714 /// The unique ExitBlock of the scalar loop if one exists. Note that
715 /// there can be multiple exiting edges reaching this block.
716 BasicBlock *LoopExitBlock;
717
718 /// The scalar loop body.
719 BasicBlock *LoopScalarBody;
720
721 /// A list of all bypass blocks. The first block is the entry of the loop.
722 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
723
724 /// Store instructions that were predicated.
725 SmallVector<Instruction *, 4> PredicatedInstructions;
726
  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)).
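  /// For example (illustrative): with TripCount = 17, VF = 4 and UF = 2, the
  /// widened loop covers 16 iterations' worth of work (17 - 17 % 8) and the
  /// remaining iteration runs in the scalar epilogue.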
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided
  // size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};
763
764class InnerLoopUnroller : public InnerLoopVectorizer {
765public:
766 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
767 LoopInfo *LI, DominatorTree *DT,
768 const TargetLibraryInfo *TLI,
769 const TargetTransformInfo *TTI, AssumptionCache *AC,
770 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
771 LoopVectorizationLegality *LVL,
772 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
773 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
774 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
775 ElementCount::getFixed(MinVal: 1),
776 ElementCount::getFixed(MinVal: 1), UnrollFactor, LVL, CM,
777 BFI, PSI, Check) {}
778};
779
780/// Encapsulate information regarding vectorization of a loop and its epilogue.
781/// This information is meant to be updated and used across two stages of
782/// epilogue vectorization.
783struct EpilogueLoopVectorizationInfo {
784 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
785 unsigned MainLoopUF = 0;
786 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
787 unsigned EpilogueUF = 0;
788 BasicBlock *MainLoopIterationCountCheck = nullptr;
789 BasicBlock *EpilogueIterationCountCheck = nullptr;
790 BasicBlock *SCEVSafetyCheck = nullptr;
791 BasicBlock *MemSafetyCheck = nullptr;
792 Value *TripCount = nullptr;
793 Value *VectorTripCount = nullptr;
794
795 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
796 ElementCount EVF, unsigned EUF)
797 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
798 assert(EUF == 1 &&
799 "A high UF for the epilogue loop is likely not beneficial.");
800 }
801};
802
803/// An extension of the inner loop vectorizer that creates a skeleton for a
804/// vectorized loop that has its epilogue (residual) also vectorized.
805/// The idea is to run the vplan on a given loop twice, firstly to setup the
806/// skeleton and vectorize the main loop, and secondly to complete the skeleton
807/// from the first step and vectorize the epilogue. This is achieved by
808/// deriving two concrete strategy classes from this base class and invoking
809/// them in succession from the loop vectorizer planner.
810class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
811public:
812 InnerLoopAndEpilogueVectorizer(
813 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
814 DominatorTree *DT, const TargetLibraryInfo *TLI,
815 const TargetTransformInfo *TTI, AssumptionCache *AC,
816 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
817 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
818 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
819 GeneratedRTChecks &Checks)
820 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
821 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
822 CM, BFI, PSI, Checks),
823 EPI(EPI) {}
824
825 // Override this function to handle the more complex control flow around the
826 // three loops.
827 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
828 const SCEV2ValueTy &ExpandedSCEVs) final {
829 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
830 }
831
832 /// The interface for creating a vectorized skeleton using one of two
833 /// different strategies, each corresponding to one execution of the vplan
834 /// as described above.
835 virtual std::pair<BasicBlock *, Value *>
836 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
837
838 /// Holds and updates state information required to vectorize the main loop
839 /// and its epilogue in two separate passes. This setup helps us avoid
840 /// regenerating and recomputing runtime safety checks. It also helps us to
841 /// shorten the iteration-count-check path length for the cases where the
842 /// iteration count of the loop is so small that the main vector loop is
843 /// completely skipped.
844 EpilogueLoopVectorizationInfo &EPI;
845};
846
847/// A specialized derived class of inner loop vectorizer that performs
848/// vectorization of *main* loops in the process of vectorizing loops and their
849/// epilogues.
850class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
851public:
852 EpilogueVectorizerMainLoop(
853 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
854 DominatorTree *DT, const TargetLibraryInfo *TLI,
855 const TargetTransformInfo *TTI, AssumptionCache *AC,
856 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
857 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
858 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
859 GeneratedRTChecks &Check)
860 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
861 EPI, LVL, CM, BFI, PSI, Check) {}
862 /// Implements the interface for creating a vectorized skeleton using the
863 /// *main loop* strategy (ie the first pass of vplan execution).
864 std::pair<BasicBlock *, Value *>
865 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
866
867protected:
868 /// Emits an iteration count bypass check once for the main loop (when \p
869 /// ForEpilogue is false) and once for the epilogue loop (when \p
870 /// ForEpilogue is true).
871 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
872 void printDebugTracesAtStart() override;
873 void printDebugTracesAtEnd() override;
874};
875
876// A specialized derived class of inner loop vectorizer that performs
877// vectorization of *epilogue* loops in the process of vectorizing loops and
878// their epilogues.
879class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
880public:
881 EpilogueVectorizerEpilogueLoop(
882 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
883 DominatorTree *DT, const TargetLibraryInfo *TLI,
884 const TargetTransformInfo *TTI, AssumptionCache *AC,
885 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
886 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
887 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
888 GeneratedRTChecks &Checks)
889 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
890 EPI, LVL, CM, BFI, PSI, Checks) {
891 TripCount = EPI.TripCount;
892 }
893 /// Implements the interface for creating a vectorized skeleton using the
894 /// *epilogue loop* strategy (ie the second pass of vplan execution).
895 std::pair<BasicBlock *, Value *>
896 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
897
898protected:
899 /// Emits an iteration count bypass check after the main vector loop has
900 /// finished to see if there are any iterations left to execute by either
901 /// the vector epilogue or the scalar epilogue.
902 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
903 BasicBlock *Bypass,
904 BasicBlock *Insert);
905 void printDebugTracesAtStart() override;
906 void printDebugTracesAtEnd() override;
907};
908} // end namespace llvm
909
/// Look for a meaningful debug location on the instruction or its
/// operands.
912static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
913 if (!I)
914 return DebugLoc();
915
916 DebugLoc Empty;
917 if (I->getDebugLoc() != Empty)
918 return I->getDebugLoc();
919
920 for (Use &Op : I->operands()) {
921 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
922 if (OpInst->getDebugLoc() != Empty)
923 return OpInst->getDebugLoc();
924 }
925
926 return I->getDebugLoc();
927}
928
929/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
930/// is passed, the message relates to that particular instruction.
931#ifndef NDEBUG
932static void debugVectorizationMessage(const StringRef Prefix,
933 const StringRef DebugMsg,
934 Instruction *I) {
935 dbgs() << "LV: " << Prefix << DebugMsg;
936 if (I != nullptr)
937 dbgs() << " " << *I;
938 else
939 dbgs() << '.';
940 dbgs() << '\n';
941}
942#endif
943
944/// Create an analysis remark that explains why vectorization failed
945///
946/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
947/// RemarkName is the identifier for the remark. If \p I is passed it is an
948/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
949/// the location of the remark. \return the remark object that can be
950/// streamed to.
951static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
952 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
953 Value *CodeRegion = TheLoop->getHeader();
954 DebugLoc DL = TheLoop->getStartLoc();
955
  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }
963
964 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
965}
966
namespace llvm {

/// Return a value for Step multiplied by VF.
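/// For example (illustrative): for a scalable VF of <vscale x 4> and Step = 2
/// this emits the runtime value vscale * 8, while the same Step with a fixed
/// VF of 4 folds to the constant 8.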
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  return B.CreateElementCount(Ty, VF.multiplyCoefficientBy(Step));
}

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, VF);
}

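/// Build a SCEV for the trip count of \p OrigLoop: the backedge-taken count
/// plus one. For example (illustrative), a loop running i = 0 .. n-1 has a
/// backedge-taken count of n-1 and a trip count of n.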
const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
                                Loop *OrigLoop) {
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");

  ScalarEvolution &SE = *PSE.getSE();
  return SE.getTripCountFromExitCount(BackedgeTakenCount, IdxTy, OrigLoop);
}
989
990void reportVectorizationFailure(const StringRef DebugMsg,
991 const StringRef OREMsg, const StringRef ORETag,
992 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
993 Instruction *I) {
994 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
995 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
996 ORE->emit(
997 OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
998 << "loop not vectorized: " << OREMsg);
999}
1000
1001void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1002 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1003 Instruction *I) {
1004 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1005 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1006 ORE->emit(
1007 OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
1008 << Msg);
1009}
1010
1011/// Report successful vectorization of the loop. In case an outer loop is
1012/// vectorized, prepend "outer" to the vectorization remark.
1013static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1014 VectorizationFactor VF, unsigned IC) {
1015 LLVM_DEBUG(debugVectorizationMessage(
1016 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
1017 nullptr));
1018 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
1019 ORE->emit(RemarkBuilder: [&]() {
1020 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
1021 TheLoop->getHeader())
1022 << "vectorized " << LoopType << "loop (vectorization width: "
1023 << ore::NV("VectorizationFactor", VF.Width)
1024 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
1025 });
1026}
1027
1028} // end namespace llvm
1029
1030#ifndef NDEBUG
1031/// \return string containing a file name and a line # for the given loop.
1032static std::string getDebugLocString(const Loop *L) {
1033 std::string Result;
1034 if (L) {
1035 raw_string_ostream OS(Result);
1036 if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1037 LoopDbgLoc.print(OS);
1038 else
1039 // Just print the module name.
1040 OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1041 OS.flush();
1042 }
1043 return Result;
1044}
1045#endif
1046
1047namespace llvm {
1048
// Strategies for how the loop vectorization cost model may lower the scalar
// epilogue loop.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1071
1072using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073
1074/// LoopVectorizationCostModel - estimates the expected speedups due to
1075/// vectorization.
1076/// In many cases vectorization is not profitable. This can happen because of
1077/// a number of reasons. In this class we mainly attempt to predict the
1078/// expected speedup/slowdowns due to the supported instruction set. We use the
1079/// TargetTransformInfo to query the different backends for the cost of
1080/// different operations.
1081class LoopVectorizationCostModel {
1082public:
1083 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1084 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1085 LoopVectorizationLegality *Legal,
1086 const TargetTransformInfo &TTI,
1087 const TargetLibraryInfo *TLI, DemandedBits *DB,
1088 AssumptionCache *AC,
1089 OptimizationRemarkEmitter *ORE, const Function *F,
1090 const LoopVectorizeHints *Hints,
1091 InterleavedAccessInfo &IAI)
1092 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1093 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1094 Hints(Hints), InterleaveInfo(IAI) {}
1095
1096 /// \return An upper bound for the vectorization factors (both fixed and
1097 /// scalable). If the factors are 0, vectorization and interleaving should be
1098 /// avoided up front.
1099 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1100
1101 /// \return True if runtime checks are required for vectorization, and false
1102 /// otherwise.
1103 bool runtimeChecksRequired();
1104
1105 /// Setup cost-based decisions for user vectorization factor.
1106 /// \return true if the UserVF is a feasible VF to be chosen.
1107 bool selectUserVectorizationFactor(ElementCount UserVF) {
1108 collectUniformsAndScalars(VF: UserVF);
1109 collectInstsToScalarize(VF: UserVF);
1110 return expectedCost(VF: UserVF).first.isValid();
1111 }
1112
1113 /// \return The size (in bits) of the smallest and widest types in the code
1114 /// that needs to be vectorized. We ignore values that remain scalar such as
1115 /// 64 bit loop indices.
1116 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1117
1118 /// \return The desired interleave count.
1119 /// If interleave count has been specified by metadata it will be returned.
1120 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1121 /// are the selected vectorization factor and the cost of the selected VF.
1122 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1123
  /// A memory access instruction may be vectorized in more than one way; the
  /// form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);
1132
1133 /// A call may be vectorized in different ways depending on whether we have
1134 /// vectorized variants available and whether the target supports masking.
1135 /// This function analyzes all calls in the function at the supplied VF,
1136 /// makes a decision based on the costs of available options, and stores that
1137 /// decision in a map for use in planning and plan execution.
1138 void setVectorizedCallDecision(ElementCount VF);
1139
1140 /// A struct that represents some properties of the register usage
1141 /// of a loop.
1142 struct RegisterUsage {
1143 /// Holds the number of loop invariant values that are used in the loop.
1144 /// The key is ClassID of target-provided register class.
1145 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1146 /// Holds the maximum number of concurrent live intervals in the loop.
1147 /// The key is ClassID of target-provided register class.
1148 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1149 };
1150
1151 /// \return Returns information about the register usages of the loop for the
1152 /// given vectorization factors.
1153 SmallVector<RegisterUsage, 8>
1154 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1155
1156 /// Collect values we want to ignore in the cost model.
1157 void collectValuesToIgnore();
1158
1159 /// Collect all element types in the loop for which widening is needed.
1160 void collectElementTypesForWidening();
1161
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();
1165
1166 /// Returns true if we should use strict in-order reductions for the given
1167 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1168 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1169 /// of FP operations.
1170 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1171 return !Hints->allowReordering() && RdxDesc.isOrdered();
1172 }
1173
1174 /// \returns The smallest bitwidth each instruction can be represented with.
1175 /// The vector equivalents of these instructions should be truncated to this
1176 /// type.
1177 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1178 return MinBWs;
1179 }
1180
1181 /// \returns True if it is more profitable to scalarize instruction \p I for
1182 /// vectorization factor \p VF.
1183 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1184 assert(VF.isVector() &&
1185 "Profitable to scalarize relevant only for VF > 1.");
1186 assert(
1187 TheLoop->isInnermost() &&
1188 "cost-model should not be used for outer loops (in VPlan-native path)");
1189
1190 auto Scalars = InstsToScalarize.find(Val: VF);
1191 assert(Scalars != InstsToScalarize.end() &&
1192 "VF not yet analyzed for scalarization profitability");
1193 return Scalars->second.contains(Val: I);
1194 }
1195
1196 /// Returns true if \p I is known to be uniform after vectorization.
1197 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1198 assert(
1199 TheLoop->isInnermost() &&
1200 "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
    if (isa<PseudoProbeInst>(I))
      return false;
1206
1207 if (VF.isScalar())
1208 return true;
1209
1210 auto UniformsPerVF = Uniforms.find(Val: VF);
1211 assert(UniformsPerVF != Uniforms.end() &&
1212 "VF not yet analyzed for uniformity");
1213 return UniformsPerVF->second.count(Ptr: I);
1214 }
1215
1216 /// Returns true if \p I is known to be scalar after vectorization.
1217 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1218 assert(
1219 TheLoop->isInnermost() &&
1220 "cost-model should not be used for outer loops (in VPlan-native path)");
1221 if (VF.isScalar())
1222 return true;
1223
1224 auto ScalarsPerVF = Scalars.find(Val: VF);
1225 assert(ScalarsPerVF != Scalars.end() &&
1226 "Scalar values are not calculated for VF");
1227 return ScalarsPerVF->second.count(Ptr: I);
1228 }
1229
1230 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1231 /// for vectorization factor \p VF.
1232 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1233 return VF.isVector() && MinBWs.contains(Key: I) &&
1234 !isProfitableToScalarize(I, VF) &&
1235 !isScalarAfterVectorization(I, VF);
1236 }
1237
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,
    CM_IntrinsicCall
  };
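  // For example (illustrative): a unit-stride access usually maps to CM_Widen,
  // a unit-stride access iterating backwards to CM_Widen_Reverse, a member of
  // an interleaved access group to CM_Interleave, and a non-consecutive access
  // to CM_GatherScatter or CM_Scalarize, whichever the cost model finds
  // cheaper.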

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
  }
1257
1258 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1259 /// interleaving group \p Grp and vector width \p VF.
1260 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1261 ElementCount VF, InstWidening W,
1262 InstructionCost Cost) {
1263 assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
1266 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1267 if (auto *I = Grp->getMember(Index: i)) {
1268 if (Grp->getInsertPos() == I)
1269 WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost);
1270 else
1271 WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y: 0);
1272 }
1273 }
1274 }
1275
1276 /// Return the cost model decision for the given instruction \p I and vector
1277 /// width \p VF. Return CM_Unknown if this instruction did not pass
1278 /// through the cost modeling.
1279 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1280 assert(VF.isVector() && "Expected VF to be a vector VF");
1281 assert(
1282 TheLoop->isInnermost() &&
1283 "cost-model should not be used for outer loops (in VPlan-native path)");
1284
1285 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF);
1286 auto Itr = WideningDecisions.find(Val: InstOnVF);
1287 if (Itr == WideningDecisions.end())
1288 return CM_Unknown;
1289 return Itr->second.first;
1290 }
1291
1292 /// Return the vectorization cost for the given instruction \p I and vector
1293 /// width \p VF.
1294 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1295 assert(VF.isVector() && "Expected VF >=2");
1296 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF);
1297 assert(WideningDecisions.contains(InstOnVF) &&
1298 "The cost is not calculated");
1299 return WideningDecisions[InstOnVF].second;
1300 }
1301
1302 struct CallWideningDecision {
1303 InstWidening Kind;
1304 Function *Variant;
1305 Intrinsic::ID IID;
1306 std::optional<unsigned> MaskPos;
1307 InstructionCost Cost;
1308 };
1309
1310 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1311 Function *Variant, Intrinsic::ID IID,
1312 std::optional<unsigned> MaskPos,
1313 InstructionCost Cost) {
1314 assert(!VF.isScalar() && "Expected vector VF");
1315 CallWideningDecisions[std::make_pair(x&: CI, y&: VF)] = {.Kind: Kind, .Variant: Variant, .IID: IID,
1316 .MaskPos: MaskPos, .Cost: Cost};
1317 }
1318
1319 CallWideningDecision getCallWideningDecision(CallInst *CI,
1320 ElementCount VF) const {
1321 assert(!VF.isScalar() && "Expected vector VF");
1322 return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF));
1323 }
1324
1325 /// Return True if instruction \p I is an optimizable truncate whose operand
1326 /// is an induction variable. Such a truncate will be removed by adding a new
1327 /// induction variable with the destination type.
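  /// For example (illustrative only), given an i64 induction %iv and
  ///   %t = trunc i64 %iv to i32
  /// the truncate can be removed by introducing a second, i32-typed induction
  /// variable that produces the values of %t directly.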
1328 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1329 // If the instruction is not a truncate, return false.
1330 auto *Trunc = dyn_cast<TruncInst>(Val: I);
1331 if (!Trunc)
1332 return false;
1333
1334 // Get the source and destination types of the truncate.
1335 Type *SrcTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getSrcTy(), EC: VF);
1336 Type *DestTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getDestTy(), EC: VF);
1337
1338 // If the truncate is free for the given types, return false. Replacing a
1339 // free truncate with an induction variable would add an induction variable
1340 // update instruction to each iteration of the loop. We exclude from this
1341 // check the primary induction variable since it will need an update
1342 // instruction regardless.
1343 Value *Op = Trunc->getOperand(i_nocapture: 0);
1344 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
1345 return false;
1346
1347 // If the truncated value is not an induction variable, return false.
1348 return Legal->isInductionPhi(V: Op);
1349 }
1350
1351 /// Collects the instructions to scalarize for each predicated instruction in
1352 /// the loop.
1353 void collectInstsToScalarize(ElementCount VF);
1354
1355 /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions that may
  /// be vectorized as interleave, gather-scatter or scalarized.
1358 /// Also make a decision on what to do about call instructions in the loop
1359 /// at that VF -- scalarize, call a known vector routine, or call a
1360 /// vector intrinsic.
1361 void collectUniformsAndScalars(ElementCount VF) {
1362 // Do the analysis once.
1363 if (VF.isScalar() || Uniforms.contains(Val: VF))
1364 return;
1365 setCostBasedWideningDecision(VF);
1366 setVectorizedCallDecision(VF);
1367 collectLoopUniforms(VF);
1368 collectLoopScalars(VF);
1369 }
1370
1371 /// Returns true if the target machine supports masked store operation
1372 /// for the given \p DataType and kind of access to \p Ptr.
1373 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1374 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1375 TTI.isLegalMaskedStore(DataType, Alignment);
1376 }
1377
1378 /// Returns true if the target machine supports masked load operation
1379 /// for the given \p DataType and kind of access to \p Ptr.
1380 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1381 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1382 TTI.isLegalMaskedLoad(DataType, Alignment);
1383 }
1384
1385 /// Returns true if the target machine can represent \p V as a masked gather
1386 /// or scatter operation.
1387 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1388 bool LI = isa<LoadInst>(Val: V);
1389 bool SI = isa<StoreInst>(Val: V);
1390 if (!LI && !SI)
1391 return false;
1392 auto *Ty = getLoadStoreType(I: V);
1393 Align Align = getLoadStoreAlignment(I: V);
1394 if (VF.isVector())
1395 Ty = VectorType::get(ElementType: Ty, EC: VF);
1396 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1397 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1398 }
1399
1400 /// Returns true if the target machine supports all of the reduction
1401 /// variables found for the given VF.
1402 bool canVectorizeReductions(ElementCount VF) const {
1403 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1404 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1405 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1406 }));
1407 }
1408
1409 /// Given costs for both strategies, return true if the scalar predication
1410 /// lowering should be used for div/rem. This incorporates an override
1411 /// option so it is not simply a cost comparison.
1412 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1413 InstructionCost SafeDivisorCost) const {
1414 switch (ForceSafeDivisor) {
1415 case cl::BOU_UNSET:
1416 return ScalarCost < SafeDivisorCost;
1417 case cl::BOU_TRUE:
1418 return false;
1419 case cl::BOU_FALSE:
1420 return true;
1421 };
1422 llvm_unreachable("impossible case value");
1423 }
1424
1425 /// Returns true if \p I is an instruction which requires predication and
1426 /// for which our chosen predication strategy is scalarization (i.e. we
1427 /// don't have an alternate strategy such as masking available).
1428 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1429 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1430
1431 /// Returns true if \p I is an instruction that needs to be predicated
1432 /// at runtime. The result is independent of the predication mechanism.
1433 /// Superset of instructions that return true for isScalarWithPredication.
1434 bool isPredicatedInst(Instruction *I) const;
1435
1436 /// Return the costs for our two available strategies for lowering a
1437 /// div/rem operation which requires speculating at least one lane.
1438 /// First result is for scalarization (will be invalid for scalable
1439 /// vectors); second is for the safe-divisor strategy.
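  /// As an illustration (not from the original source), for a predicated
  ///   %q = udiv i32 %a, %b
  /// the two strategies are roughly:
  ///   - scalarization: branch around a scalar udiv for each active lane;
  ///   - safe divisor:  %safe  = select <VF x i1> %mask, %b.vec, splat(1)
  ///                    %q.vec = udiv <VF x i32> %a.vec, %safe
  ///     with only the lanes enabled by %mask being used afterwards.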
1440 std::pair<InstructionCost, InstructionCost>
1441 getDivRemSpeculationCost(Instruction *I,
1442 ElementCount VF) const;
1443
1444 /// Returns true if \p I is a memory instruction with consecutive memory
1445 /// access that can be widened.
1446 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1447
1448 /// Returns true if \p I is a memory instruction in an interleaved-group
1449 /// of memory accesses that can be vectorized with wide vector loads/stores
1450 /// and shuffles.
1451 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF);
1452
1453 /// Check if \p Instr belongs to any interleaved access group.
1454 bool isAccessInterleaved(Instruction *Instr) {
1455 return InterleaveInfo.isInterleaved(Instr);
1456 }
1457
1458 /// Get the interleaved access group that \p Instr belongs to.
1459 const InterleaveGroup<Instruction> *
1460 getInterleavedAccessGroup(Instruction *Instr) {
1461 return InterleaveInfo.getInterleaveGroup(Instr);
1462 }
1463
1464 /// Returns true if we're required to use a scalar epilogue for at least
1465 /// the final iteration of the original loop.
1466 bool requiresScalarEpilogue(bool IsVectorizing) const {
1467 if (!isScalarEpilogueAllowed())
1468 return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1471 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1472 return true;
1473 return IsVectorizing && InterleaveInfo.requiresScalarEpilogue();
1474 }
1475
1476 /// Returns true if we're required to use a scalar epilogue for at least
1477 /// the final iteration of the original loop for all VFs in \p Range.
1478 /// A scalar epilogue must either be required for all VFs in \p Range or for
1479 /// none.
1480 bool requiresScalarEpilogue(VFRange Range) const {
1481 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1482 return requiresScalarEpilogue(IsVectorizing: VF.isVector());
1483 };
1484 bool IsRequired = all_of(Range, P: RequiresScalarEpilogue);
1485 assert(
1486 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1487 "all VFs in range must agree on whether a scalar epilogue is required");
1488 return IsRequired;
1489 }
1490
1491 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1492 /// loop hint annotation.
1493 bool isScalarEpilogueAllowed() const {
1494 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1495 }
1496
1497 /// Returns the TailFoldingStyle that is best for the current loop.
1498 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1499 if (!ChosenTailFoldingStyle)
1500 return TailFoldingStyle::None;
1501 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1502 : ChosenTailFoldingStyle->second;
1503 }
1504
  /// Selects and saves the TailFoldingStyle for two cases: whether or not the
  /// IV update may overflow.
  /// \param IsScalableVF true if scalable vector factors are enabled.
  /// \param UserIC User-specified interleave count.
1509 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1510 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1511 if (!Legal->prepareToFoldTailByMasking()) {
1512 ChosenTailFoldingStyle =
1513 std::make_pair(x: TailFoldingStyle::None, y: TailFoldingStyle::None);
1514 return;
1515 }
1516
1517 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1518 ChosenTailFoldingStyle = std::make_pair(
1519 x: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1520 y: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1521 return;
1522 }
1523
1524 // Set styles when forced.
1525 ChosenTailFoldingStyle = std::make_pair(x&: ForceTailFoldingStyle.getValue(),
1526 y&: ForceTailFoldingStyle.getValue());
1527 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1528 return;
1529 // Override forced styles if needed.
1530 // FIXME: use actual opcode/data type for analysis here.
1531 // FIXME: Investigate opportunity for fixed vector factor.
1532 bool EVLIsLegal =
1533 IsScalableVF && UserIC <= 1 &&
1534 TTI.hasActiveVectorLength(Opcode: 0, DataType: nullptr, Alignment: Align()) &&
1535 !EnableVPlanNativePath &&
1536 // FIXME: implement support for max safe dependency distance.
1537 Legal->isSafeForAnyVectorWidth() &&
1538 // FIXME: remove this once reductions are supported.
1539 Legal->getReductionVars().empty();
1540 if (!EVLIsLegal) {
      // If for some reason EVL mode is unsupported, fall back to
1542 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1543 // in a generic way.
1544 ChosenTailFoldingStyle =
1545 std::make_pair(x: TailFoldingStyle::DataWithoutLaneMask,
1546 y: TailFoldingStyle::DataWithoutLaneMask);
1547 LLVM_DEBUG(
1548 dbgs()
1549 << "LV: Preference for VP intrinsics indicated. Will "
1550 "not try to generate VP Intrinsics "
1551 << (UserIC > 1
1552 ? "since interleave count specified is greater than 1.\n"
1553 : "due to non-interleaving reasons.\n"));
1554 }
1555 }
1556
  /// Returns true if all loop blocks should be masked to fold the loop tail.
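  /// Roughly (illustrative sketch): with trip count N and VF = 4, folding the
  /// tail by masking executes ceil(N / 4) vector iterations, predicating each
  /// memory access with a lane mask such as
  ///   icmp ule <4 x i64> %iv.lanes, splat(N - 1)
  /// so that no scalar remainder loop is needed.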
1558 bool foldTailByMasking() const {
1559 // TODO: check if it is possible to check for None style independent of
1560 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1561 return getTailFoldingStyle() != TailFoldingStyle::None;
1562 }
1563
  /// Returns true if the instructions in this block require predication
1565 /// for any reason, e.g. because tail folding now requires a predicate
1566 /// or because the block in the original loop was predicated.
1567 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1568 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1569 }
1570
1571 /// Returns true if VP intrinsics with explicit vector length support should
1572 /// be generated in the tail folded loop.
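  /// With EVL-based folding (illustrative sketch), the lane mask is replaced
  /// by an explicit vector length computed per iteration, e.g.
  ///   %evl = call i32 @llvm.experimental.get.vector.length.i64(
  ///              i64 %remaining, i32 4, i1 true)
  ///   %v   = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  ///              ptr %p, <vscale x 4 x i1> splat (i1 true), i32 %evl)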
1573 bool foldTailWithEVL() const {
1574 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL &&
1575 // FIXME: remove this once vp_reverse is supported.
1576 none_of(
1577 Range: WideningDecisions,
1578 P: [](const std::pair<std::pair<Instruction *, ElementCount>,
1579 std::pair<InstWidening, InstructionCost>>
1580 &Data) { return Data.second.first == CM_Widen_Reverse; });
1581 }
1582
1583 /// Returns true if the Phi is part of an inloop reduction.
1584 bool isInLoopReduction(PHINode *Phi) const {
1585 return InLoopReductions.contains(Ptr: Phi);
1586 }
1587
1588 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1589 /// with factor VF. Return the cost of the instruction, including
1590 /// scalarization overhead if it's needed.
1591 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1592
1593 /// Estimate cost of a call instruction CI if it were vectorized with factor
1594 /// VF. Return the cost of the instruction, including scalarization overhead
1595 /// if it's needed.
1596 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1597
1598 /// Invalidates decisions already taken by the cost model.
1599 void invalidateCostModelingDecisions() {
1600 WideningDecisions.clear();
1601 CallWideningDecisions.clear();
1602 Uniforms.clear();
1603 Scalars.clear();
1604 }
1605
1606 /// The vectorization cost is a combination of the cost itself and a boolean
1607 /// indicating whether any of the contributing operations will actually
1608 /// operate on vector values after type legalization in the backend. If this
1609 /// latter value is false, then all operations will be scalarized (i.e. no
1610 /// vectorization has actually taken place).
1611 using VectorizationCostTy = std::pair<InstructionCost, bool>;
1612
1613 /// Returns the expected execution cost. The unit of the cost does
1614 /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor. If \p Invalid is not nullptr, this function
1617 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1618 /// each instruction that has an Invalid cost for the given VF.
1619 VectorizationCostTy
1620 expectedCost(ElementCount VF,
1621 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1622
1623 bool hasPredStores() const { return NumPredStores > 0; }
1624
1625 /// Returns true if epilogue vectorization is considered profitable, and
1626 /// false otherwise.
1627 /// \p VF is the vectorization factor chosen for the original loop.
1628 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1629
1630private:
1631 unsigned NumPredStores = 0;
1632
1633 /// \return An upper bound for the vectorization factors for both
1634 /// fixed and scalable vectorization, where the minimum-known number of
1635 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1636 /// disabled or unsupported, then the scalable part will be equal to
1637 /// ElementCount::getScalable(0).
1638 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1639 ElementCount UserVF,
1640 bool FoldTailByMasking);
1641
  /// \return the maximized element count based on the target's vector
1643 /// registers and the loop trip-count, but limited to a maximum safe VF.
1644 /// This is a helper function of computeFeasibleMaxVF.
1645 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1646 unsigned SmallestType,
1647 unsigned WidestType,
1648 ElementCount MaxSafeVF,
1649 bool FoldTailByMasking);
1650
1651 /// \return the maximum legal scalable VF, based on the safe max number
1652 /// of elements.
1653 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1654
1655 /// Returns the execution time cost of an instruction for a given vector
1656 /// width. Vector width of one means scalar.
1657 VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1658
1659 /// The cost-computation logic from getInstructionCost which provides
1660 /// the vector type as an output parameter.
1661 InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1662 Type *&VectorTy);
1663
1664 /// Return the cost of instructions in an inloop reduction pattern, if I is
1665 /// part of that pattern.
1666 std::optional<InstructionCost>
1667 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1668 TTI::TargetCostKind CostKind) const;
1669
1670 /// Calculate vectorization cost of memory instruction \p I.
1671 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1672
1673 /// The cost computation for scalarized memory instruction.
1674 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1675
  /// The cost computation for an interleaving group of memory instructions.
1677 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1678
1679 /// The cost computation for Gather/Scatter instruction.
1680 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1681
1682 /// The cost computation for widening instruction \p I with consecutive
1683 /// memory access.
1684 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1685
  /// The cost computation for a Load/Store instruction \p I with a uniform
  /// pointer:
1687 /// Load: scalar load + broadcast.
1688 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1689 /// element)
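  /// For example (illustrative only), a load from a loop-invariant address is
  /// costed roughly as
  ///   %s = load i32, ptr %p                      ; one scalar load
  ///   %v = insertelement + shufflevector splat of %s to <VF x i32>
  /// rather than as a gather of VF identical addresses.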
1690 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1691
1692 /// Estimate the overhead of scalarizing an instruction. This is a
1693 /// convenience wrapper for the type-based getScalarizationOverhead API.
1694 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1695 TTI::TargetCostKind CostKind) const;
1696
1697 /// Returns true if an artificially high cost for emulated masked memrefs
1698 /// should be used.
1699 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1700
1701 /// Map of scalar integer values to the smallest bitwidth they can be legally
1702 /// represented as. The vector equivalents of these values should be truncated
1703 /// to this type.
1704 MapVector<Instruction *, uint64_t> MinBWs;
1705
1706 /// A type representing the costs for instructions if they were to be
1707 /// scalarized rather than vectorized. The entries are Instruction-Cost
1708 /// pairs.
1709 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1710
  /// The BasicBlocks that are known to be present after vectorization as
  /// predicated blocks, collected per VF.
1713 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1714 PredicatedBBsAfterVectorization;
1715
1716 /// Records whether it is allowed to have the original scalar loop execute at
1717 /// least once. This may be needed as a fallback loop in case runtime
1718 /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or not a multiple of the VF,
1720 /// or as a peel-loop to handle gaps in interleave-groups.
1721 /// Under optsize and when the trip count is very small we don't allow any
1722 /// iterations to execute in the scalar loop.
1723 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1724
  /// The finally chosen tail folding style. The first element is used if the
  /// IV update may overflow, the second if it does not.
1727 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1728 ChosenTailFoldingStyle;
1729
1730 /// A map holding scalar costs for different vectorization factors. The
1731 /// presence of a cost for an instruction in the mapping indicates that the
1732 /// instruction will be scalarized when vectorizing with the associated
1733 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1734 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1735
1736 /// Holds the instructions known to be uniform after vectorization.
1737 /// The data is collected per VF.
1738 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1739
1740 /// Holds the instructions known to be scalar after vectorization.
1741 /// The data is collected per VF.
1742 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1743
1744 /// Holds the instructions (address computations) that are forced to be
1745 /// scalarized.
1746 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1747
1748 /// PHINodes of the reductions that should be expanded in-loop.
1749 SmallPtrSet<PHINode *, 4> InLoopReductions;
1750
1751 /// A Map of inloop reduction operations and their immediate chain operand.
1752 /// FIXME: This can be removed once reductions can be costed correctly in
1753 /// VPlan. This was added to allow quick lookup of the inloop operations.
1754 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1755
1756 /// Returns the expected difference in cost from scalarizing the expression
1757 /// feeding a predicated instruction \p PredInst. The instructions to
1758 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1759 /// non-negative return value implies the expression will be scalarized.
1760 /// Currently, only single-use chains are considered for scalarization.
1761 InstructionCost computePredInstDiscount(Instruction *PredInst,
1762 ScalarCostsTy &ScalarCosts,
1763 ElementCount VF);
1764
1765 /// Collect the instructions that are uniform after vectorization. An
1766 /// instruction is uniform if we represent it with a single scalar value in
1767 /// the vectorized loop corresponding to each vector iteration. Examples of
1768 /// uniform instructions include pointer operands of consecutive or
1769 /// interleaved memory accesses. Note that although uniformity implies an
1770 /// instruction will be scalar, the reverse is not true. In general, a
1771 /// scalarized instruction will be represented by VF scalar values in the
1772 /// vectorized loop, each corresponding to an iteration of the original
1773 /// scalar loop.
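  /// For example (illustrative only), in
  ///   %gep = getelementptr inbounds i32, ptr %base, i64 %iv
  ///   %v   = load i32, ptr %gep
  /// a consecutive %gep is uniform: the widened load needs only one scalar
  /// pointer per vector iteration, whereas a scalarized instruction would be
  /// replicated VF times.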
1774 void collectLoopUniforms(ElementCount VF);
1775
1776 /// Collect the instructions that are scalar after vectorization. An
1777 /// instruction is scalar if it is known to be uniform or will be scalarized
1778 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1779 /// to the list if they are used by a load/store instruction that is marked as
1780 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1781 /// VF values in the vectorized loop, each corresponding to an iteration of
1782 /// the original scalar loop.
1783 void collectLoopScalars(ElementCount VF);
1784
1785 /// Keeps cost model vectorization decision and cost for instructions.
1786 /// Right now it is used for memory instructions only.
1787 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1788 std::pair<InstWidening, InstructionCost>>;
1789
1790 DecisionList WideningDecisions;
1791
1792 using CallDecisionList =
1793 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1794
1795 CallDecisionList CallWideningDecisions;
1796
1797 /// Returns true if \p V is expected to be vectorized and it needs to be
1798 /// extracted.
1799 bool needsExtract(Value *V, ElementCount VF) const {
1800 Instruction *I = dyn_cast<Instruction>(Val: V);
1801 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1802 TheLoop->isLoopInvariant(V: I))
1803 return false;
1804
1805 // Assume we can vectorize V (and hence we need extraction) if the
1806 // scalars are not computed yet. This can happen, because it is called
1807 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1808 // the scalars are collected. That should be a safe assumption in most
1809 // cases, because we check if the operands have vectorizable types
1810 // beforehand in LoopVectorizationLegality.
1811 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1812 };
1813
1814 /// Returns a range containing only operands needing to be extracted.
1815 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1816 ElementCount VF) const {
1817 return SmallVector<Value *, 4>(make_filter_range(
1818 Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1819 }
1820
1821public:
1822 /// The loop that we evaluate.
1823 Loop *TheLoop;
1824
1825 /// Predicated scalar evolution analysis.
1826 PredicatedScalarEvolution &PSE;
1827
1828 /// Loop Info analysis.
1829 LoopInfo *LI;
1830
1831 /// Vectorization legality.
1832 LoopVectorizationLegality *Legal;
1833
1834 /// Vector target information.
1835 const TargetTransformInfo &TTI;
1836
1837 /// Target Library Info.
1838 const TargetLibraryInfo *TLI;
1839
1840 /// Demanded bits analysis.
1841 DemandedBits *DB;
1842
1843 /// Assumption cache.
1844 AssumptionCache *AC;
1845
1846 /// Interface to emit optimization remarks.
1847 OptimizationRemarkEmitter *ORE;
1848
1849 const Function *TheFunction;
1850
1851 /// Loop Vectorize Hint.
1852 const LoopVectorizeHints *Hints;
1853
1854 /// The interleave access information contains groups of interleaved accesses
1855 /// with the same stride and close to each other.
1856 InterleavedAccessInfo &InterleaveInfo;
1857
1858 /// Values to ignore in the cost model.
1859 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1860
1861 /// Values to ignore in the cost model when VF > 1.
1862 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1863
1864 /// All element types found in the loop.
1865 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1866};
1867} // end namespace llvm
1868
1869namespace {
1870/// Helper struct to manage generating runtime checks for vectorization.
1871///
/// The runtime checks are created up-front in temporary blocks so that their
/// cost can be estimated accurately; they are kept un-linked from the existing
/// IR. After deciding to vectorize, the checks are moved back into the IR. If
/// we decide not to vectorize, the temporary blocks are removed completely.
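///
/// A simplified usage sketch (the actual driver logic lives further down in
/// this file):
///   GeneratedRTChecks Checks(SE, DT, LI, TTI, DL, AddBranchWeights);
///   Checks.Create(L, LAI, UnionPred, VF, IC);
///   InstructionCost RTCost = Checks.getCost(); // feeds the cost model
///   // Only if we decide to vectorize:
///   Checks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
///   Checks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
///   // Otherwise the destructor removes the temporary check blocks.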
1876class GeneratedRTChecks {
1877 /// Basic block which contains the generated SCEV checks, if any.
1878 BasicBlock *SCEVCheckBlock = nullptr;
1879
1880 /// The value representing the result of the generated SCEV checks. If it is
1881 /// nullptr, either no SCEV checks have been generated or they have been used.
1882 Value *SCEVCheckCond = nullptr;
1883
1884 /// Basic block which contains the generated memory runtime checks, if any.
1885 BasicBlock *MemCheckBlock = nullptr;
1886
1887 /// The value representing the result of the generated memory runtime checks.
1888 /// If it is nullptr, either no memory runtime checks have been generated or
1889 /// they have been used.
1890 Value *MemRuntimeCheckCond = nullptr;
1891
1892 DominatorTree *DT;
1893 LoopInfo *LI;
1894 TargetTransformInfo *TTI;
1895
1896 SCEVExpander SCEVExp;
1897 SCEVExpander MemCheckExp;
1898
1899 bool CostTooHigh = false;
1900 const bool AddBranchWeights;
1901
1902 Loop *OuterLoop = nullptr;
1903
1904public:
1905 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1906 TargetTransformInfo *TTI, const DataLayout &DL,
1907 bool AddBranchWeights)
1908 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1909 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1910
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1916 void Create(Loop *L, const LoopAccessInfo &LAI,
1917 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1918
1919 // Hard cutoff to limit compile-time increase in case a very large number of
1920 // runtime checks needs to be generated.
1921 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1922 // profile info.
1923 CostTooHigh =
1924 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1925 if (CostTooHigh)
1926 return;
1927
1928 BasicBlock *LoopHeader = L->getHeader();
1929 BasicBlock *Preheader = L->getLoopPreheader();
1930
1931 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1932 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1933 // may be used by SCEVExpander. The blocks will be un-linked from their
1934 // predecessors and removed from LI & DT at the end of the function.
1935 if (!UnionPred.isAlwaysTrue()) {
1936 SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
1937 MSSAU: nullptr, BBName: "vector.scevcheck");
1938
1939 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1940 Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
1941 }
1942
1943 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1944 if (RtPtrChecking.Need) {
1945 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1946 MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
1947 BBName: "vector.memcheck");
1948
1949 auto DiffChecks = RtPtrChecking.getDiffChecks();
1950 if (DiffChecks) {
1951 Value *RuntimeVF = nullptr;
1952 MemRuntimeCheckCond = addDiffRuntimeChecks(
1953 Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
1954 GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1955 if (!RuntimeVF)
1956 RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
1957 return RuntimeVF;
1958 },
1959 IC);
1960 } else {
1961 MemRuntimeCheckCond = addRuntimeChecks(
1962 Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
1963 Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
1964 }
1965 assert(MemRuntimeCheckCond &&
1966 "no RT checks generated although RtPtrChecking "
1967 "claimed checks are required");
1968 }
1969
1970 if (!MemCheckBlock && !SCEVCheckBlock)
1971 return;
1972
1973 // Unhook the temporary block with the checks, update various places
1974 // accordingly.
1975 if (SCEVCheckBlock)
1976 SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
1977 if (MemCheckBlock)
1978 MemCheckBlock->replaceAllUsesWith(V: Preheader);
1979
1980 if (SCEVCheckBlock) {
1981 SCEVCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator());
1982 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1983 Preheader->getTerminator()->eraseFromParent();
1984 }
1985 if (MemCheckBlock) {
1986 MemCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator());
1987 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1988 Preheader->getTerminator()->eraseFromParent();
1989 }
1990
1991 DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
1992 if (MemCheckBlock) {
1993 DT->eraseNode(BB: MemCheckBlock);
1994 LI->removeBlock(BB: MemCheckBlock);
1995 }
1996 if (SCEVCheckBlock) {
1997 DT->eraseNode(BB: SCEVCheckBlock);
1998 LI->removeBlock(BB: SCEVCheckBlock);
1999 }
2000
2001 // Outer loop is used as part of the later cost calculations.
2002 OuterLoop = L->getParentLoop();
2003 }
2004
2005 InstructionCost getCost() {
2006 if (SCEVCheckBlock || MemCheckBlock)
2007 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
2008
2009 if (CostTooHigh) {
2010 InstructionCost Cost;
2011 Cost.setInvalid();
2012 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
2013 return Cost;
2014 }
2015
2016 InstructionCost RTCheckCost = 0;
2017 if (SCEVCheckBlock)
2018 for (Instruction &I : *SCEVCheckBlock) {
2019 if (SCEVCheckBlock->getTerminator() == &I)
2020 continue;
2021 InstructionCost C =
2022 TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
2023 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2024 RTCheckCost += C;
2025 }
2026 if (MemCheckBlock) {
2027 InstructionCost MemCheckCost = 0;
2028 for (Instruction &I : *MemCheckBlock) {
2029 if (MemCheckBlock->getTerminator() == &I)
2030 continue;
2031 InstructionCost C =
2032 TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
2033 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2034 MemCheckCost += C;
2035 }
2036
      // If the runtime memory checks are being created inside an outer loop,
      // we should find out if these checks are outer-loop invariant. If so,
      // the checks will likely be hoisted out, and the effective cost will be
      // reduced according to the outer loop trip count.
2041 if (OuterLoop) {
2042 ScalarEvolution *SE = MemCheckExp.getSE();
2043 // TODO: If profitable, we could refine this further by analysing every
2044 // individual memory check, since there could be a mixture of loop
2045 // variant and invariant checks that mean the final condition is
2046 // variant.
2047 const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
2048 if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
2049 // It seems reasonable to assume that we can reduce the effective
2050 // cost of the checks even when we know nothing about the trip
2051 // count. Assume that the outer loop executes at least twice.
2052 unsigned BestTripCount = 2;
2053
2054 // If exact trip count is known use that.
2055 if (unsigned SmallTC = SE->getSmallConstantTripCount(L: OuterLoop))
2056 BestTripCount = SmallTC;
2057 else if (LoopVectorizeWithBlockFrequency) {
2058 // Else use profile data if available.
2059 if (auto EstimatedTC = getLoopEstimatedTripCount(L: OuterLoop))
2060 BestTripCount = *EstimatedTC;
2061 }
2062
2063 BestTripCount = std::max(a: BestTripCount, b: 1U);
2064 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2065
2066 // Let's ensure the cost is always at least 1.
2067 NewMemCheckCost = std::max(a: *NewMemCheckCost.getValue(),
2068 b: (InstructionCost::CostType)1);
2069
2070 if (BestTripCount > 1)
2071 LLVM_DEBUG(dbgs()
2072 << "We expect runtime memory checks to be hoisted "
2073 << "out of the outer loop. Cost reduced from "
2074 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2075
2076 MemCheckCost = NewMemCheckCost;
2077 }
2078 }
2079
2080 RTCheckCost += MemCheckCost;
2081 }
2082
2083 if (SCEVCheckBlock || MemCheckBlock)
2084 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2085 << "\n");
2086
2087 return RTCheckCost;
2088 }
2089
2090 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2091 /// unused.
2092 ~GeneratedRTChecks() {
2093 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2094 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2095 if (!SCEVCheckCond)
2096 SCEVCleaner.markResultUsed();
2097
2098 if (!MemRuntimeCheckCond)
2099 MemCheckCleaner.markResultUsed();
2100
2101 if (MemRuntimeCheckCond) {
2102 auto &SE = *MemCheckExp.getSE();
2103 // Memory runtime check generation creates compares that use expanded
2104 // values. Remove them before running the SCEVExpanderCleaners.
2105 for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
2106 if (MemCheckExp.isInsertedInstruction(I: &I))
2107 continue;
2108 SE.forgetValue(V: &I);
2109 I.eraseFromParent();
2110 }
2111 }
2112 MemCheckCleaner.cleanup();
2113 SCEVCleaner.cleanup();
2114
2115 if (SCEVCheckCond)
2116 SCEVCheckBlock->eraseFromParent();
2117 if (MemRuntimeCheckCond)
2118 MemCheckBlock->eraseFromParent();
2119 }
2120
2121 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2122 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2123 /// depending on the generated condition.
2124 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2125 BasicBlock *LoopVectorPreHeader,
2126 BasicBlock *LoopExitBlock) {
2127 if (!SCEVCheckCond)
2128 return nullptr;
2129
2130 Value *Cond = SCEVCheckCond;
2131 // Mark the check as used, to prevent it from being removed during cleanup.
2132 SCEVCheckCond = nullptr;
2133 if (auto *C = dyn_cast<ConstantInt>(Val: Cond))
2134 if (C->isZero())
2135 return nullptr;
2136
2137 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2138
2139 BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertAtEnd: SCEVCheckBlock);
2140 // Create new preheader for vector loop.
2141 if (OuterLoop)
2142 OuterLoop->addBasicBlockToLoop(NewBB: SCEVCheckBlock, LI&: *LI);
2143
2144 SCEVCheckBlock->getTerminator()->eraseFromParent();
2145 SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2146 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2147 NewBB: SCEVCheckBlock);
2148
2149 DT->addNewBlock(BB: SCEVCheckBlock, DomBB: Pred);
2150 DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: SCEVCheckBlock);
2151
2152 BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond);
2153 if (AddBranchWeights)
2154 setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights);
2155 ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI);
2156 return SCEVCheckBlock;
2157 }
2158
2159 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2160 /// the branches to branch to the vector preheader or \p Bypass, depending on
2161 /// the generated condition.
2162 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2163 BasicBlock *LoopVectorPreHeader) {
2164 // Check if we generated code that checks in runtime if arrays overlap.
2165 if (!MemRuntimeCheckCond)
2166 return nullptr;
2167
2168 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2169 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2170 NewBB: MemCheckBlock);
2171
2172 DT->addNewBlock(BB: MemCheckBlock, DomBB: Pred);
2173 DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: MemCheckBlock);
2174 MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2175
2176 if (OuterLoop)
2177 OuterLoop->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI);
2178
2179 BranchInst &BI =
2180 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond);
2181 if (AddBranchWeights) {
2182 setBranchWeights(I&: BI, Weights: MemCheckBypassWeights);
2183 }
2184 ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI);
2185 MemCheckBlock->getTerminator()->setDebugLoc(
2186 Pred->getTerminator()->getDebugLoc());
2187
2188 // Mark the check as used, to prevent it from being removed during cleanup.
2189 MemRuntimeCheckCond = nullptr;
2190 return MemCheckBlock;
2191 }
2192};
2193} // namespace
2194
2195static bool useActiveLaneMask(TailFoldingStyle Style) {
2196 return Style == TailFoldingStyle::Data ||
2197 Style == TailFoldingStyle::DataAndControlFlow ||
2198 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2199}
2200
2201static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2202 return Style == TailFoldingStyle::DataAndControlFlow ||
2203 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2204}
2205
2206// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2207// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
// vector length information is not provided, vectorization is not considered
2210// explicit. Interleave hints are not allowed either. These limitations will be
2211// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2213// vectorize' semantics. This pragma provides *auto-vectorization hints*
2214// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2215// provides *explicit vectorization hints* (LV can bypass legal checks and
2216// assume that vectorization is legal). However, both hints are implemented
2217// using the same metadata (llvm.loop.vectorize, processed by
2218// LoopVectorizeHints). This will be fixed in the future when the native IR
2219// representation for pragma 'omp simd' is introduced.
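//
// For example (illustrative only), an outer loop annotated with
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] = 0;
// is treated as explicitly vectorizable with VF = 4, whereas the same loop
// without a vectorize_width clause is ignored by this path.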
2220static bool isExplicitVecOuterLoop(Loop *OuterLp,
2221 OptimizationRemarkEmitter *ORE) {
2222 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2223 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2224
2225 // Only outer loops with an explicit vectorization hint are supported.
2226 // Unannotated outer loops are ignored.
2227 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2228 return false;
2229
2230 Function *Fn = OuterLp->getHeader()->getParent();
2231 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2232 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2233 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2234 return false;
2235 }
2236
2237 if (Hints.getInterleave() > 1) {
2238 // TODO: Interleave support is future work.
2239 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2240 "outer loops.\n");
2241 Hints.emitRemarkWithHints();
2242 return false;
2243 }
2244
2245 return true;
2246}
2247
2248static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2249 OptimizationRemarkEmitter *ORE,
2250 SmallVectorImpl<Loop *> &V) {
2251 // Collect inner loops and outer loops without irreducible control flow. For
2252 // now, only collect outer loops that have explicit vectorization hints. If we
2253 // are stress testing the VPlan H-CFG construction, we collect the outermost
2254 // loop of every loop nest.
2255 if (L.isInnermost() || VPlanBuildStressTest ||
2256 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2257 LoopBlocksRPO RPOT(&L);
2258 RPOT.perform(LI);
2259 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2260 V.push_back(Elt: &L);
2261 // TODO: Collect inner loops inside marked outer loops in case
2262 // vectorization fails for the outer loop. Do not invoke
2263 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2264 // already known to be reducible. We can use an inherited attribute for
2265 // that.
2266 return;
2267 }
2268 }
2269 for (Loop *InnerL : L)
2270 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2271}
2272
2273//===----------------------------------------------------------------------===//
2274// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2275// LoopVectorizationCostModel and LoopVectorizationPlanner.
2276//===----------------------------------------------------------------------===//
2277
2278/// Compute the transformed value of Index at offset StartValue using step
2279/// StepValue.
2280/// For integer induction, returns StartValue + Index * StepValue.
2281/// For pointer induction, returns StartValue[Index * StepValue].
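/// For example (illustrative only), for an integer induction with Index %i,
/// StartValue %start and Step %step this emits
///   %offset = mul i64 %i, %step
///   %result = add i64 %start, %offset
/// and for a pointer induction a getelementptr of %start by %offset.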
2282/// FIXME: The newly created binary instructions should contain nsw/nuw
2283/// flags, which can be found from the original scalar operations.
2284static Value *
2285emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2286 Value *Step,
2287 InductionDescriptor::InductionKind InductionKind,
2288 const BinaryOperator *InductionBinOp) {
2289 Type *StepTy = Step->getType();
2290 Value *CastedIndex = StepTy->isIntegerTy()
2291 ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
2292 : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
2293 if (CastedIndex != Index) {
2294 CastedIndex->setName(CastedIndex->getName() + ".cast");
2295 Index = CastedIndex;
2296 }
2297
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
2303 // cases only.
2304 auto CreateAdd = [&B](Value *X, Value *Y) {
2305 assert(X->getType() == Y->getType() && "Types don't match!");
2306 if (auto *CX = dyn_cast<ConstantInt>(Val: X))
2307 if (CX->isZero())
2308 return Y;
2309 if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
2310 if (CY->isZero())
2311 return X;
2312 return B.CreateAdd(LHS: X, RHS: Y);
2313 };
2314
2315 // We allow X to be a vector type, in which case Y will potentially be
2316 // splatted into a vector with the same element count.
2317 auto CreateMul = [&B](Value *X, Value *Y) {
2318 assert(X->getType()->getScalarType() == Y->getType() &&
2319 "Types don't match!");
2320 if (auto *CX = dyn_cast<ConstantInt>(Val: X))
2321 if (CX->isOne())
2322 return Y;
2323 if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
2324 if (CY->isOne())
2325 return X;
2326 VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
2327 if (XVTy && !isa<VectorType>(Val: Y->getType()))
2328 Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
2329 return B.CreateMul(LHS: X, RHS: Y);
2330 };
2331
2332 switch (InductionKind) {
2333 case InductionDescriptor::IK_IntInduction: {
2334 assert(!isa<VectorType>(Index->getType()) &&
2335 "Vector indices not supported for integer inductions yet");
2336 assert(Index->getType() == StartValue->getType() &&
2337 "Index type does not match StartValue type");
2338 if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
2339 return B.CreateSub(LHS: StartValue, RHS: Index);
2340 auto *Offset = CreateMul(Index, Step);
2341 return CreateAdd(StartValue, Offset);
2342 }
2343 case InductionDescriptor::IK_PtrInduction:
2344 return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
2345 case InductionDescriptor::IK_FpInduction: {
2346 assert(!isa<VectorType>(Index->getType()) &&
2347 "Vector indices not supported for FP inductions yet");
2348 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2349 assert(InductionBinOp &&
2350 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2351 InductionBinOp->getOpcode() == Instruction::FSub) &&
2352 "Original bin op should be defined for FP induction");
2353
2354 Value *MulExp = B.CreateFMul(L: Step, R: Index);
2355 return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
2356 Name: "induction");
2357 }
2358 case InductionDescriptor::IK_NoInduction:
2359 return nullptr;
2360 }
2361 llvm_unreachable("invalid enum");
2362}
2363
2364std::optional<unsigned> getMaxVScale(const Function &F,
2365 const TargetTransformInfo &TTI) {
2366 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2367 return MaxVScale;
2368
2369 if (F.hasFnAttribute(Attribute::VScaleRange))
2370 return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
2371
2372 return std::nullopt;
2373}
2374
/// For the given VF and UF and maximum trip count computed for the loop,
/// return whether the induction variable might overflow in the vectorized
/// loop. If not, then we know a runtime overflow check always evaluates to
/// false and can be removed.
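/// For example (illustrative only): with an i8 widest induction type the
/// largest representable trip count is 255. If the maximum trip count is
/// known to be 240 and VF * UF = 8, then 240 + 8 still fits in i8, so the
/// overflow check can be dropped.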
2379static bool isIndvarOverflowCheckKnownFalse(
2380 const LoopVectorizationCostModel *Cost,
2381 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2382 // Always be conservative if we don't know the exact unroll factor.
2383 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2384
2385 Type *IdxTy = Cost->Legal->getWidestInductionType();
2386 APInt MaxUIntTripCount = cast<IntegerType>(Val: IdxTy)->getMask();
2387
  // The runtime overflow check is known false iff the (max) trip-count
2389 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2390 // the vector loop induction variable.
2391 if (unsigned TC =
2392 Cost->PSE.getSE()->getSmallConstantMaxTripCount(L: Cost->TheLoop)) {
2393 uint64_t MaxVF = VF.getKnownMinValue();
2394 if (VF.isScalable()) {
2395 std::optional<unsigned> MaxVScale =
2396 getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
2397 if (!MaxVScale)
2398 return false;
2399 MaxVF *= *MaxVScale;
2400 }
2401
2402 return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
2403 }
2404
2405 return false;
2406}
2407
2408// Return whether we allow using masked interleave-groups (for dealing with
2409// strided loads/stores that reside in predicated blocks, or for dealing
2410// with gaps).
2411static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2412 // If an override option has been passed in for interleaved accesses, use it.
2413 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2414 return EnableMaskedInterleavedMemAccesses;
2415
2416 return TTI.enableMaskedInterleavedAccessVectorization();
2417}
2418
2419// Try to vectorize the interleave group that \p Instr belongs to.
2420//
2421// E.g. Translate following interleaved load group (factor = 3):
2422// for (i = 0; i < N; i+=3) {
2423// R = Pic[i]; // Member of index 0
2424// G = Pic[i+1]; // Member of index 1
2425// B = Pic[i+2]; // Member of index 2
2426// ... // do something to R, G, B
2427// }
2428// To:
2429// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
2430// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
2431// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
2432// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
2433//
2434// Or translate following interleaved store group (factor = 3):
2435// for (i = 0; i < N; i+=3) {
2436// ... do something to R, G, B
2437// Pic[i] = R; // Member of index 0
2438// Pic[i+1] = G; // Member of index 1
2439// Pic[i+2] = B; // Member of index 2
2440// }
2441// To:
2442// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2443// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2444// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2445// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
2446// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
2447void InnerLoopVectorizer::vectorizeInterleaveGroup(
2448 const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2449 VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2450 VPValue *BlockInMask, bool NeedsMaskForGaps) {
2451 Instruction *Instr = Group->getInsertPos();
2452 const DataLayout &DL = Instr->getModule()->getDataLayout();
2453
2454 // Prepare for the vector type of the interleaved load/store.
2455 Type *ScalarTy = getLoadStoreType(I: Instr);
2456 unsigned InterleaveFactor = Group->getFactor();
2457 auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: VF * InterleaveFactor);
2458
2459 // Prepare for the new pointers.
2460 SmallVector<Value *, 2> AddrParts;
2461 unsigned Index = Group->getIndex(Instr);
2462
2463 // TODO: extend the masked interleaved-group support to reversed access.
2464 assert((!BlockInMask || !Group->isReverse()) &&
2465 "Reversed masked interleave-group not supported.");
2466
2467 Value *Idx;
2468 // If the group is reverse, adjust the index to refer to the last vector lane
2469 // instead of the first. We adjust the index from the first vector lane,
2470 // rather than directly getting the pointer for lane VF - 1, because the
2471 // pointer operand of the interleaved access is supposed to be uniform. For
2472 // uniform instructions, we're only required to generate a value for the
2473 // first vector lane in each unroll iteration.
2474 if (Group->isReverse()) {
2475 Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF);
2476 Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: Builder.getInt32(C: 1));
2477 Idx = Builder.CreateMul(LHS: Idx, RHS: Builder.getInt32(C: Group->getFactor()));
2478 Idx = Builder.CreateAdd(LHS: Idx, RHS: Builder.getInt32(C: Index));
2479 Idx = Builder.CreateNeg(V: Idx);
2480 } else
2481 Idx = Builder.getInt32(C: -Index);
2482
2483 for (unsigned Part = 0; Part < UF; Part++) {
2484 Value *AddrPart = State.get(Def: Addr, Instance: VPIteration(Part, 0));
2485 if (auto *I = dyn_cast<Instruction>(Val: AddrPart))
2486 State.setDebugLocFrom(I->getDebugLoc());
2487
    // Note that the current instruction could be a member at any index. We
    // need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2499
2500 bool InBounds = false;
2501 if (auto *gep = dyn_cast<GetElementPtrInst>(Val: AddrPart->stripPointerCasts()))
2502 InBounds = gep->isInBounds();
2503 AddrPart = Builder.CreateGEP(Ty: ScalarTy, Ptr: AddrPart, IdxList: Idx, Name: "", IsInBounds: InBounds);
2504 AddrParts.push_back(Elt: AddrPart);
2505 }
2506
2507 State.setDebugLocFrom(Instr->getDebugLoc());
2508 Value *PoisonVec = PoisonValue::get(T: VecTy);
2509
2510 auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
2511 unsigned Part, Value *MaskForGaps) -> Value * {
2512 if (VF.isScalable()) {
2513 assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
2514 assert(InterleaveFactor == 2 &&
2515 "Unsupported deinterleave factor for scalable vectors");
2516 auto *BlockInMaskPart = State.get(Def: BlockInMask, Part);
2517 SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
2518 auto *MaskTy =
2519 VectorType::get(ElementType: Builder.getInt1Ty(), NumElements: VF.getKnownMinValue() * 2, Scalable: true);
2520 return Builder.CreateIntrinsic(
2521 MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
2522 /*FMFSource=*/nullptr, "interleaved.mask");
2523 }
2524
2525 if (!BlockInMask)
2526 return MaskForGaps;
2527
2528 Value *BlockInMaskPart = State.get(Def: BlockInMask, Part);
2529 Value *ShuffledMask = Builder.CreateShuffleVector(
2530 V: BlockInMaskPart,
2531 Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: VF.getKnownMinValue()),
2532 Name: "interleaved.mask");
2533 return MaskForGaps ? Builder.CreateBinOp(Opc: Instruction::And, LHS: ShuffledMask,
2534 RHS: MaskForGaps)
2535 : ShuffledMask;
2536 };
2537
2538 // Vectorize the interleaved load group.
2539 if (isa<LoadInst>(Val: Instr)) {
2540 Value *MaskForGaps = nullptr;
2541 if (NeedsMaskForGaps) {
2542 MaskForGaps =
2543 createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
2544 assert(MaskForGaps && "Mask for Gaps is required but it is null");
2545 }
2546
2547 // For each unroll part, create a wide load for the group.
2548 SmallVector<Value *, 2> NewLoads;
2549 for (unsigned Part = 0; Part < UF; Part++) {
2550 Instruction *NewLoad;
2551 if (BlockInMask || MaskForGaps) {
2552 assert(useMaskedInterleavedAccesses(*TTI) &&
2553 "masked interleaved groups are not allowed.");
2554 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2555 NewLoad =
2556 Builder.CreateMaskedLoad(Ty: VecTy, Ptr: AddrParts[Part], Alignment: Group->getAlign(),
2557 Mask: GroupMask, PassThru: PoisonVec, Name: "wide.masked.vec");
2558 }
2559 else
2560 NewLoad = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: AddrParts[Part],
2561 Align: Group->getAlign(), Name: "wide.vec");
2562 Group->addMetadata(NewInst: NewLoad);
2563 NewLoads.push_back(Elt: NewLoad);
2564 }
2565
2566 if (VecTy->isScalableTy()) {
2567 assert(InterleaveFactor == 2 &&
2568 "Unsupported deinterleave factor for scalable vectors");
2569
2570 for (unsigned Part = 0; Part < UF; ++Part) {
2571 // Scalable vectors cannot use arbitrary shufflevectors (only splats),
2572 // so must use intrinsics to deinterleave.
2573 Value *DI = Builder.CreateIntrinsic(
2574 Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
2575 /*FMFSource=*/nullptr, "strided.vec");
2576 unsigned J = 0;
2577 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2578 Instruction *Member = Group->getMember(Index: I);
2579
2580 if (!Member)
2581 continue;
2582
2583 Value *StridedVec = Builder.CreateExtractValue(Agg: DI, Idxs: I);
          // If this member has a different type, cast the result type.
2585 if (Member->getType() != ScalarTy) {
2586 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
2587 StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
2588 }
2589
2590 if (Group->isReverse())
2591 StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
2592
2593 State.set(Def: VPDefs[J], V: StridedVec, Part);
2594 ++J;
2595 }
2596 }
2597
2598 return;
2599 }
2600
2601 // For each member in the group, shuffle out the appropriate data from the
2602 // wide loads.
2603 unsigned J = 0;
2604 for (unsigned I = 0; I < InterleaveFactor; ++I) {
2605 Instruction *Member = Group->getMember(Index: I);
2606
2607 // Skip the gaps in the group.
2608 if (!Member)
2609 continue;
2610
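      // E.g. for member I == 1 in a group with InterleaveFactor == 2 and
      // VF == 4, the stride mask is <1, 3, 5, 7>, selecting this member's
      // elements out of the wide load <a0, b0, a1, b1, a2, b2, a3, b3>.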
2611 auto StrideMask =
2612 createStrideMask(Start: I, Stride: InterleaveFactor, VF: VF.getKnownMinValue());
2613 for (unsigned Part = 0; Part < UF; Part++) {
2614 Value *StridedVec = Builder.CreateShuffleVector(
2615 V: NewLoads[Part], Mask: StrideMask, Name: "strided.vec");
2616
2617 // If this member has a different type, cast the result to that type.
2618 if (Member->getType() != ScalarTy) {
2619 assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2620 VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
2621 StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
2622 }
2623
2624 if (Group->isReverse())
2625 StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse");
2626
2627 State.set(Def: VPDefs[J], V: StridedVec, Part);
2628 }
2629 ++J;
2630 }
2631 return;
2632 }
2633
2634 // The sub-vector type for the current instruction.
2635 auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: VF);
2636
2637 // Vectorize the interleaved store group.
2638 Value *MaskForGaps =
2639 createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
2640 assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2641 "masked interleaved groups are not allowed.");
2642 assert((!MaskForGaps || !VF.isScalable()) &&
2643 "masking gaps for scalable vectors is not yet supported.");
2644 for (unsigned Part = 0; Part < UF; Part++) {
2645 // Collect the stored vector from each member.
2646 SmallVector<Value *, 4> StoredVecs;
2647 unsigned StoredIdx = 0;
2648 for (unsigned i = 0; i < InterleaveFactor; i++) {
2649 assert((Group->getMember(i) || MaskForGaps) &&
2650 "Fail to get a member from an interleaved store group");
2651 Instruction *Member = Group->getMember(Index: i);
2652
2653 // Skip the gaps in the group.
2654 if (!Member) {
2655 Value *Undef = PoisonValue::get(T: SubVT);
2656 StoredVecs.push_back(Elt: Undef);
2657 continue;
2658 }
2659
2660 Value *StoredVec = State.get(Def: StoredValues[StoredIdx], Part);
2661 ++StoredIdx;
2662
2663 if (Group->isReverse())
2664 StoredVec = Builder.CreateVectorReverse(V: StoredVec, Name: "reverse");
2665
2666 // If this member has a different type, cast it to a unified type.
2667
2668 if (StoredVec->getType() != SubVT)
2669 StoredVec = createBitOrPointerCast(V: StoredVec, DstVTy: SubVT, DL);
2670
2671 StoredVecs.push_back(Elt: StoredVec);
2672 }
2673
2674 // Interleave all the smaller vectors into one wider vector.
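    // E.g. for InterleaveFactor == 2 and VF == 4, <a0, a1, a2, a3> and
    // <b0, b1, b2, b3> are combined into <a0, b0, a1, b1, a2, b2, a3, b3>.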
2675 Value *IVec = interleaveVectors(Builder, Vals: StoredVecs, Name: "interleaved.vec");
2676 Instruction *NewStoreInstr;
2677 if (BlockInMask || MaskForGaps) {
2678 Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
2679 NewStoreInstr = Builder.CreateMaskedStore(Val: IVec, Ptr: AddrParts[Part],
2680 Alignment: Group->getAlign(), Mask: GroupMask);
2681 } else
2682 NewStoreInstr =
2683 Builder.CreateAlignedStore(Val: IVec, Ptr: AddrParts[Part], Align: Group->getAlign());
2684
2685 Group->addMetadata(NewInst: NewStoreInstr);
2686 }
2687}
2688
2689void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2690 VPReplicateRecipe *RepRecipe,
2691 const VPIteration &Instance,
2692 VPTransformState &State) {
2693 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2694
2695 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2696 // the first lane and part.
2697 if (isa<NoAliasScopeDeclInst>(Val: Instr))
2698 if (!Instance.isFirstIteration())
2699 return;
2700
2701 // Does this instruction return a value?
2702 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2703
2704 Instruction *Cloned = Instr->clone();
2705 if (!IsVoidRetTy) {
2706 Cloned->setName(Instr->getName() + ".cloned");
2707#if !defined(NDEBUG)
2708 // Verify that VPlan type inference results agree with the type of the
2709 // generated values.
2710 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2711 "inferred type and type from generated instructions do not match");
2712#endif
2713 }
2714
2715 RepRecipe->setFlags(Cloned);
2716
2717 if (auto DL = Instr->getDebugLoc())
2718 State.setDebugLocFrom(DL);
2719
2720 // Replace the operands of the cloned instructions with their scalar
2721 // equivalents in the new loop.
2722 for (const auto &I : enumerate(First: RepRecipe->operands())) {
2723 auto InputInstance = Instance;
2724 VPValue *Operand = I.value();
2725 if (vputils::isUniformAfterVectorization(VPV: Operand))
2726 InputInstance.Lane = VPLane::getFirstLane();
2727 Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Instance: InputInstance));
2728 }
2729 State.addNewMetadata(To: Cloned, Orig: Instr);
2730
2731 // Place the cloned scalar in the new loop.
2732 State.Builder.Insert(I: Cloned);
2733
2734 State.set(Def: RepRecipe, V: Cloned, Instance);
2735
2736 // If we just cloned a new assumption, add it the assumption cache.
2737 if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
2738 AC->registerAssumption(CI: II);
2739
2740 // End if-block.
2741 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2742 if (IfPredicateInstr)
2743 PredicatedInstructions.push_back(Elt: Cloned);
2744}
2745
2746Value *
2747InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2748 if (VectorTripCount)
2749 return VectorTripCount;
2750
2751 Value *TC = getTripCount();
2752 IRBuilder<> Builder(InsertBlock->getTerminator());
2753
2754 Type *Ty = TC->getType();
2755 // This is where we can make the step a runtime constant.
2756 Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);
2757
2758 // If the tail is to be folded by masking, round the number of iterations N
2759 // up to a multiple of Step instead of rounding down. This is done by first
2760 // adding Step-1 and then rounding down. Note that it's ok if this addition
2761 // overflows: the vector induction variable will eventually wrap to zero given
2762 // that it starts at zero and its Step is a power of two; the loop will then
2763 // exit, with the last early-exit vector comparison also producing all-true.
2764 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2765 // is accounted for in emitIterationCountCheck that adds an overflow check.
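  // For example (fixed-width), with a trip count of 10 and VF * UF == 4, TC
  // is first rounded up to 10 + 3 = 13 and n.vec becomes 13 - (13 % 4) = 12,
  // so three masked vector iterations cover all 10 scalar iterations.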
2766 if (Cost->foldTailByMasking()) {
2767 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2768 "VF*UF must be a power of 2 when folding tail by masking");
2769 Value *NumLanes = getRuntimeVF(B&: Builder, Ty, VF: VF * UF);
2770 TC = Builder.CreateAdd(
2771 LHS: TC, RHS: Builder.CreateSub(LHS: NumLanes, RHS: ConstantInt::get(Ty, V: 1)), Name: "n.rnd.up");
2772 }
2773
2774 // Now we need to generate the expression for the part of the loop that the
2775 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2776 // iterations are not required for correctness, or N - Step, otherwise. Step
2777 // is equal to the vectorization factor (number of SIMD elements) times the
2778 // unroll factor (number of SIMD instructions).
2779 Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf");
2780
2781 // There are cases where we *must* run at least one iteration in the remainder
2782 // loop. See the cost model for when this can happen. If the step evenly
2783 // divides the trip count, we set the remainder to be equal to the step. If
2784 // the step does not evenly divide the trip count, no adjustment is necessary
2785 // since there will already be scalar iterations. Note that the minimum
2786 // iterations check ensures that N >= Step.
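  // E.g. with a trip count of 12 and Step == 4, R would be 0; forcing R = 4
  // yields n.vec = 8, so the required scalar epilogue still executes the last
  // four iterations.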
2787 if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
2788 auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
2789 R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
2790 }
2791
2792 VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec");
2793
2794 return VectorTripCount;
2795}
2796
2797Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2798 const DataLayout &DL) {
2799 // Verify that V is a vector type with the same number of elements as DstVTy.
2800 auto *DstFVTy = cast<VectorType>(Val: DstVTy);
2801 auto VF = DstFVTy->getElementCount();
2802 auto *SrcVecTy = cast<VectorType>(Val: V->getType());
2803 assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match");
2804 Type *SrcElemTy = SrcVecTy->getElementType();
2805 Type *DstElemTy = DstFVTy->getElementType();
2806 assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2807 "Vector elements must have same size");
2808
2809 // Do a direct cast if element types are castable.
2810 if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) {
2811 return Builder.CreateBitOrPointerCast(V, DestTy: DstFVTy);
2812 }
2813 // V cannot be directly cast to the desired vector type. This may happen
2814 // when V is a floating point vector but DstVTy is a vector of pointers, or
2815 // vice-versa. Handle this with a two-step bitcast through an intermediate
2816 // integer type, i.e. Ptr <-> Int <-> Float.
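  // E.g. (illustrative) casting <4 x double> to <4 x ptr> on a target with
  // 64-bit pointers goes through <4 x i64>: double -> i64 -> ptr.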
2817 assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2818 "Only one type should be a pointer type");
2819 assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2820 "Only one type should be a floating point type");
2821 Type *IntTy =
2822 IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy));
2823 auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF);
2824 Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy);
2825 return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstFVTy);
2826}
2827
2828void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2829 Value *Count = getTripCount();
2830 // Reuse existing vector loop preheader for TC checks.
2831 // Note that new preheader block is generated for vector loop.
2832 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2833 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2834
2835 // Generate code to check if the loop's trip count is less than VF * UF, or
2836 // equal to it in case a scalar epilogue is required; this implies that the
2837 // vector trip count is zero. This check also covers the case where adding one
2838 // to the backedge-taken count overflowed, leading to an incorrect trip count
2839 // of zero. In this case we will also jump to the scalar loop.
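  // E.g. for VF = 4 and UF = 2 with no tail folding (and ignoring
  // MinProfitableTripCount), the bypass condition is 'Count <= 8' when a
  // scalar epilogue is required and 'Count < 8' otherwise.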
2840 auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
2841 : ICmpInst::ICMP_ULT;
2842
2843 // If tail is to be folded, vector loop takes care of all iterations.
2844 Type *CountTy = Count->getType();
2845 Value *CheckMinIters = Builder.getFalse();
2846 auto CreateStep = [&]() -> Value * {
2847 // Create step with max(MinProTripCount, UF * VF).
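    // E.g. (illustrative) for VF = vscale x 2, UF = 2 and a fixed minimum
    // profitable trip count of 16, this returns umax(16, 4 * vscale).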
2848 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2849 return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);
2850
2851 Value *MinProfTC =
2852 createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
2853 if (!VF.isScalable())
2854 return MinProfTC;
2855 return Builder.CreateBinaryIntrinsic(
2856 Intrinsic::ID: umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
2857 };
2858
2859 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2860 if (Style == TailFoldingStyle::None)
2861 CheckMinIters =
2862 Builder.CreateICmp(P, LHS: Count, RHS: CreateStep(), Name: "min.iters.check");
2863 else if (VF.isScalable() &&
2864 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2865 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2866 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2867 // an overflow to zero when updating induction variables and so an
2868 // additional overflow check is required before entering the vector loop.
2869
2870 // Get the maximum unsigned value for the type.
2871 Value *MaxUIntTripCount =
2872 ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
2873 Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);
2874
2875 // Don't execute the vector loop if (UMax - n) < (VF * UF).
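  // E.g. for an i32 trip count with n == UINT32_MAX - 2 and VF * UF == 8,
  // UMax - n == 2 < 8, so the vector loop is bypassed and the scalar loop
  // handles all iterations.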
2876 CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
2877 }
2878
2879 // Create new preheader for vector loop.
2880 LoopVectorPreHeader =
2881 SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), DT, LI, MSSAU: nullptr,
2882 BBName: "vector.ph");
2883
2884 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2885 DT->getNode(Bypass)->getIDom()) &&
2886 "TC check is expected to dominate Bypass");
2887
2888 // Update dominator for Bypass & LoopExit (if needed).
2889 DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
2890 if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
2891 // If there is an epilogue which must run, there's no edge from the
2892 // middle block to exit blocks and thus no need to update the immediate
2893 // dominator of the exit blocks.
2894 DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);
2895
2896 BranchInst &BI =
2897 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
2898 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
2899 setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
2900 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
2901 LoopBypassBlocks.push_back(Elt: TCCheckBlock);
2902}
2903
2904BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2905 BasicBlock *const SCEVCheckBlock =
2906 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2907 if (!SCEVCheckBlock)
2908 return nullptr;
2909
2910 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2911 (OptForSizeBasedOnProfile &&
2912 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2913 "Cannot SCEV check stride or overflow when optimizing for size");
2914
2916 // Update dominator only if this is first RT check.
2917 if (LoopBypassBlocks.empty()) {
2918 DT->changeImmediateDominator(BB: Bypass, NewBB: SCEVCheckBlock);
2919 if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
2920 // If there is an epilogue which must run, there's no edge from the
2921 // middle block to exit blocks and thus no need to update the immediate
2922 // dominator of the exit blocks.
2923 DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: SCEVCheckBlock);
2924 }
2925
2926 LoopBypassBlocks.push_back(Elt: SCEVCheckBlock);
2927 AddedSafetyChecks = true;
2928 return SCEVCheckBlock;
2929}
2930
2931BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2932 // VPlan-native path does not do any analysis for runtime checks currently.
2933 if (EnableVPlanNativePath)
2934 return nullptr;
2935
2936 BasicBlock *const MemCheckBlock =
2937 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2938
2939 // Check if we generated code that checks at runtime whether arrays overlap.
2940 // We put the checks into a separate block to make the more common case of
2941 // few elements faster.
2942 if (!MemCheckBlock)
2943 return nullptr;
2944
2945 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2946 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2947 "Cannot emit memory checks when optimizing for size, unless forced "
2948 "to vectorize.");
2949 ORE->emit(RemarkBuilder: [&]() {
2950 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2951 OrigLoop->getStartLoc(),
2952 OrigLoop->getHeader())
2953 << "Code-size may be reduced by not forcing "
2954 "vectorization, or by source-code modifications "
2955 "eliminating the need for runtime checks "
2956 "(e.g., adding 'restrict').";
2957 });
2958 }
2959
2960 LoopBypassBlocks.push_back(Elt: MemCheckBlock);
2961
2962 AddedSafetyChecks = true;
2963
2964 return MemCheckBlock;
2965}
2966
2967void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2968 LoopScalarBody = OrigLoop->getHeader();
2969 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2970 assert(LoopVectorPreHeader && "Invalid loop structure");
2971 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2972 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2973 "multiple exit loop without required epilogue?");
2974
2975 LoopMiddleBlock =
2976 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
2977 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "middle.block");
2978 LoopScalarPreHeader =
2979 SplitBlock(Old: LoopMiddleBlock, SplitPt: LoopMiddleBlock->getTerminator(), DT, LI,
2980 MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph");
2981
2982 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
2983
2984 // Set up the middle block terminator. Two cases:
2985 // 1) If we know that we must execute the scalar epilogue, emit an
2986 // unconditional branch.
2987 // 2) Otherwise, we must have a single unique exit block (due to how we
2988 // implement the multiple exit case). In this case, set up a conditional
2989 // branch from the middle block to the loop scalar preheader, and the
2990 // exit block. completeLoopSkeleton will update the condition to use an
2991 // iteration check, if required to decide whether to execute the remainder.
2992 BranchInst *BrInst =
2993 Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())
2994 ? BranchInst::Create(IfTrue: LoopScalarPreHeader)
2995 : BranchInst::Create(IfTrue: LoopExitBlock, IfFalse: LoopScalarPreHeader,
2996 Cond: Builder.getTrue());
2997 BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
2998 ReplaceInstWithInst(From: LoopMiddleBlock->getTerminator(), To: BrInst);
2999
3000 // Update dominator for loop exit. During skeleton creation, only the vector
3001 // pre-header and the middle block are created. The vector loop is entirely
3002 // created during VPlan execution.
3003 if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
3004 // If there is an epilogue which must run, there's no edge from the
3005 // middle block to exit blocks and thus no need to update the immediate
3006 // dominator of the exit blocks.
3007 DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: LoopMiddleBlock);
3008}
3009
3010PHINode *InnerLoopVectorizer::createInductionResumeValue(
3011 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
3012 ArrayRef<BasicBlock *> BypassBlocks,
3013 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3014 Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
3015 assert(VectorTripCount && "Expected valid arguments");
3016
3017 Instruction *OldInduction = Legal->getPrimaryInduction();
3018 Value *&EndValue = IVEndValues[OrigPhi];
3019 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3020 if (OrigPhi == OldInduction) {
3021 // We know what the end value is.
3022 EndValue = VectorTripCount;
3023 } else {
3024 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3025
3026 // Fast-math-flags propagate from the original induction instruction.
3027 if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
3028 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3029
3030 EndValue = emitTransformedIndex(B, Index: VectorTripCount, StartValue: II.getStartValue(),
3031 Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
3032 EndValue->setName("ind.end");
3033
3034 // Compute the end value for the additional bypass (if applicable).
3035 if (AdditionalBypass.first) {
3036 B.SetInsertPoint(TheBB: AdditionalBypass.first,
3037 IP: AdditionalBypass.first->getFirstInsertionPt());
3038 EndValueFromAdditionalBypass =
3039 emitTransformedIndex(B, Index: AdditionalBypass.second, StartValue: II.getStartValue(),
3040 Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
3041 EndValueFromAdditionalBypass->setName("ind.end");
3042 }
3043 }
3044
3045 // Create phi nodes to merge from the backedge-taken check block.
3046 PHINode *BCResumeVal =
3047 PHINode::Create(Ty: OrigPhi->getType(), NumReservedValues: 3, NameStr: "bc.resume.val",
3048 InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator());
3049 // Copy original phi DL over to the new one.
3050 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3051
3052 // The new PHI merges the original incoming value, in case of a bypass,
3053 // or the value at the end of the vectorized loop.
3054 BCResumeVal->addIncoming(V: EndValue, BB: LoopMiddleBlock);
3055
3056 // Fix the scalar body counter (PHI node).
3057 // The old induction's phi node in the scalar body needs the truncated
3058 // value.
3059 for (BasicBlock *BB : BypassBlocks)
3060 BCResumeVal->addIncoming(V: II.getStartValue(), BB);
3061
3062 if (AdditionalBypass.first)
3063 BCResumeVal->setIncomingValueForBlock(BB: AdditionalBypass.first,
3064 V: EndValueFromAdditionalBypass);
3065 return BCResumeVal;
3066}
3067
3068/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
3069/// expansion results.
3070static Value *getExpandedStep(const InductionDescriptor &ID,
3071 const SCEV2ValueTy &ExpandedSCEVs) {
3072 const SCEV *Step = ID.getStep();
3073 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
3074 return C->getValue();
3075 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
3076 return U->getValue();
3077 auto I = ExpandedSCEVs.find(Val: Step);
3078 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
3079 return I->second;
3080}
3081
3082void InnerLoopVectorizer::createInductionResumeValues(
3083 const SCEV2ValueTy &ExpandedSCEVs,
3084 std::pair<BasicBlock *, Value *> AdditionalBypass) {
3085 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3086 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3087 "Inconsistent information about additional bypass.");
3088 // We are going to resume the execution of the scalar loop.
3089 // Go over all of the induction variables that we found and fix the
3090 // PHIs that are left in the scalar version of the loop.
3091 // The starting values of PHI nodes depend on the counter of the last
3092 // iteration in the vectorized loop.
3093 // If we come from a bypass edge then we need to start from the original
3094 // start value.
3095 for (const auto &InductionEntry : Legal->getInductionVars()) {
3096 PHINode *OrigPhi = InductionEntry.first;
3097 const InductionDescriptor &II = InductionEntry.second;
3098 PHINode *BCResumeVal = createInductionResumeValue(
3099 OrigPhi, II, Step: getExpandedStep(ID: II, ExpandedSCEVs), BypassBlocks: LoopBypassBlocks,
3100 AdditionalBypass);
3101 OrigPhi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: BCResumeVal);
3102 }
3103}
3104
3105BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3106 // The trip counts should be cached by now.
3107 Value *Count = getTripCount();
3108 Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
3109
3110 auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3111
3112 // Add a check in the middle block to see if we have completed
3113 // all of the iterations in the first vector loop. Three cases:
3114 // 1) If we require a scalar epilogue, there is no conditional branch as
3115 // we unconditionally branch to the scalar preheader. Do nothing.
3116 // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3117 // Thus if tail is to be folded, we know we don't need to run the
3118 // remainder and we can use the previous value for the condition (true).
3119 // 3) Otherwise, construct a runtime check.
3120 if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) &&
3121 !Cost->foldTailByMasking()) {
3122 // Here we use the same DebugLoc as the scalar loop latch terminator instead
3123 // of the corresponding compare because they may have ended up with
3124 // different line numbers and we want to avoid awkward line stepping while
3125 // debugging, e.g. if the compare has a line number from inside the loop.
3126 // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
3127 // operands. Perform simplification directly on VPlan once the branch is
3128 // modeled there.
3129 IRBuilder<> B(LoopMiddleBlock->getTerminator());
3130 B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
3131 Value *CmpN = B.CreateICmpEQ(LHS: Count, RHS: VectorTripCount, Name: "cmp.n");
3132 BranchInst &BI = *cast<BranchInst>(Val: LoopMiddleBlock->getTerminator());
3133 BI.setCondition(CmpN);
3134 if (hasBranchWeightMD(I: *ScalarLatchTerm)) {
3135 // Assume that `Count % VectorTripCount` is equally distributed.
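    // E.g. for VF = 4 and UF = 2 the weights are {1, 7}: the scalar
    // remainder is expected to be skipped in roughly one out of eight cases.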
3136 unsigned TripCount = UF * VF.getKnownMinValue();
3137 assert(TripCount > 0 && "trip count should not be zero");
3138 const uint32_t Weights[] = {1, TripCount - 1};
3139 setBranchWeights(I&: BI, Weights);
3140 }
3141 }
3142
3143#ifdef EXPENSIVE_CHECKS
3144 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3145#endif
3146
3147 return LoopVectorPreHeader;
3148}
3149
3150std::pair<BasicBlock *, Value *>
3151InnerLoopVectorizer::createVectorizedLoopSkeleton(
3152 const SCEV2ValueTy &ExpandedSCEVs) {
3153 /*
3154 In this function we generate a new loop. The new loop will contain
3155 the vectorized instructions while the old loop will continue to run the
3156 scalar remainder.
3157
3158 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3159 / | preheader are expanded here. Eventually all required SCEV
3160 / | expansion should happen here.
3161 / v
3162 | [ ] <-- vector loop bypass (may consist of multiple blocks).
3163 | / |
3164 | / v
3165 || [ ] <-- vector pre header.
3166 |/ |
3167 | v
3168 | [ ] \
3169 | [ ]_| <-- vector loop (created during VPlan execution).
3170 | |
3171 | v
3172 \ -[ ] <--- middle-block.
3173 \/ |
3174 /\ v
3175 | ->[ ] <--- new preheader.
3176 | |
3177 (opt) v <-- edge from middle to exit iff epilogue is not required.
3178 | [ ] \
3179 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3180 \ |
3181 \ v
3182 >[ ] <-- exit block(s).
3183 ...
3184 */
3185
3186 // Create an empty vector loop, and prepare basic blocks for the runtime
3187 // checks.
3188 createVectorLoopSkeleton(Prefix: "");
3189
3190 // Now, compare the new count to zero. If it is zero skip the vector loop and
3191 // jump to the scalar loop. This check also covers the case where the
3192 // backedge-taken count is uint##_max: adding one to it will overflow leading
3193 // to an incorrect trip count of zero. In this (rare) case we will also jump
3194 // to the scalar loop.
3195 emitIterationCountCheck(Bypass: LoopScalarPreHeader);
3196
3197 // Generate the code to check any assumptions that we've made for SCEV
3198 // expressions.
3199 emitSCEVChecks(Bypass: LoopScalarPreHeader);
3200
3201 // Generate the code that checks at runtime whether arrays overlap. We put
3202 // the checks into a separate block to make the more common case of few
3203 // elements faster.
3204 emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
3205
3206 // Emit phis for the new starting index of the scalar loop.
3207 createInductionResumeValues(ExpandedSCEVs);
3208
3209 return {completeLoopSkeleton(), nullptr};
3210}
3211
3212// Fix up external users of the induction variable. At this point, we are
3213// in LCSSA form, with all external PHIs that use the IV having one input value,
3214// coming from the remainder loop. We need those PHIs to also have a correct
3215// value for the IV when arriving directly from the middle block.
3216void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3217 const InductionDescriptor &II,
3218 Value *VectorTripCount, Value *EndValue,
3219 BasicBlock *MiddleBlock,
3220 BasicBlock *VectorHeader, VPlan &Plan,
3221 VPTransformState &State) {
3222 // There are two kinds of external IV usages - those that use the value
3223 // computed in the last iteration (the PHI) and those that use the penultimate
3224 // value (the value that feeds into the phi from the loop latch).
3225 // We allow both, but they, obviously, have different values.
3226
3227 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3228
3229 DenseMap<Value *, Value *> MissingVals;
3230
3231 // An external user of the last iteration's value should see the value that
3232 // the remainder loop uses to initialize its own IV.
3233 Value *PostInc = OrigPhi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
3234 for (User *U : PostInc->users()) {
3235 Instruction *UI = cast<Instruction>(Val: U);
3236 if (!OrigLoop->contains(Inst: UI)) {
3237 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3238 MissingVals[UI] = EndValue;
3239 }
3240 }
3241
3242 // An external user of the penultimate value needs to see EndValue - Step.
3243 // The simplest way to get this is to recompute it from the constituent SCEVs,
3244 // that is Start + (Step * (CRD - 1)).
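  // E.g. for an integer IV stepping by 2 with vector trip count VTC, the
  // escape value materialized below is Start + 2 * (VTC - 1).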
3245 for (User *U : OrigPhi->users()) {
3246 auto *UI = cast<Instruction>(Val: U);
3247 if (!OrigLoop->contains(Inst: UI)) {
3248 assert(isa<PHINode>(UI) && "Expected LCSSA form");
3249 IRBuilder<> B(MiddleBlock->getTerminator());
3250
3251 // Fast-math-flags propagate from the original induction instruction.
3252 if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
3253 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3254
3255 Value *CountMinusOne = B.CreateSub(
3256 LHS: VectorTripCount, RHS: ConstantInt::get(Ty: VectorTripCount->getType(), V: 1));
3257 CountMinusOne->setName("cmo");
3258
3259 VPValue *StepVPV = Plan.getSCEVExpansion(S: II.getStep());
3260 assert(StepVPV && "step must have been expanded during VPlan execution");
3261 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3262 : State.get(Def: StepVPV, Instance: {0, 0});
3263 Value *Escape =
3264 emitTransformedIndex(B, Index: CountMinusOne, StartValue: II.getStartValue(), Step,
3265 InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
3266 Escape->setName("ind.escape");
3267 MissingVals[UI] = Escape;
3268 }
3269 }
3270
3271 for (auto &I : MissingVals) {
3272 PHINode *PHI = cast<PHINode>(Val: I.first);
3273 // One corner case we have to handle is two IVs "chasing" each other,
3274 // that is %IV2 = phi [...], [ %IV1, %latch ]
3275 // In this case, if IV1 has an external use, we need to avoid adding both
3276 // "last value of IV1" and "penultimate value of IV2". So, verify that we
3277 // don't already have an incoming value for the middle block.
3278 if (PHI->getBasicBlockIndex(BB: MiddleBlock) == -1) {
3279 PHI->addIncoming(V: I.second, BB: MiddleBlock);
3280 Plan.removeLiveOut(PN: PHI);
3281 }
3282 }
3283}
3284
3285namespace {
3286
3287struct CSEDenseMapInfo {
3288 static bool canHandle(const Instruction *I) {
3289 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
3290 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
3291 }
3292
3293 static inline Instruction *getEmptyKey() {
3294 return DenseMapInfo<Instruction *>::getEmptyKey();
3295 }
3296
3297 static inline Instruction *getTombstoneKey() {
3298 return DenseMapInfo<Instruction *>::getTombstoneKey();
3299 }
3300
3301 static unsigned getHashValue(const Instruction *I) {
3302 assert(canHandle(I) && "Unknown instruction!");
3303 return hash_combine(args: I->getOpcode(), args: hash_combine_range(first: I->value_op_begin(),
3304 last: I->value_op_end()));
3305 }
3306
3307 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3308 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3309 LHS == getTombstoneKey() || RHS == getTombstoneKey())
3310 return LHS == RHS;
3311 return LHS->isIdenticalTo(I: RHS);
3312 }
3313};
3314
3315} // end anonymous namespace
3316
3317 /// Perform CSE of induction variable instructions.
3318 static void cse(BasicBlock *BB) {
3319 // Perform simple CSE.
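  // E.g. two identical 'extractelement' instructions created while widening
  // inductions hash to the same key; the second one is replaced by the first
  // and erased.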
3320 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3321 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
3322 if (!CSEDenseMapInfo::canHandle(I: &In))
3323 continue;
3324
3325 // Check if we can replace this instruction with any of the
3326 // visited instructions.
3327 if (Instruction *V = CSEMap.lookup(Val: &In)) {
3328 In.replaceAllUsesWith(V);
3329 In.eraseFromParent();
3330 continue;
3331 }
3332
3333 CSEMap[&In] = &In;
3334 }
3335}
3336
3337InstructionCost
3338LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3339 ElementCount VF) const {
3340 // We only need to calculate a cost if the VF is scalar; for actual vectors
3341 // we should already have a pre-calculated cost at each VF.
3342 if (!VF.isScalar())
3343 return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)).Cost;
3344
3345 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3346 Type *RetTy = CI->getType();
3347 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
3348 if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy, CostKind))
3349 return *RedCost;
3350
3351 SmallVector<Type *, 4> Tys;
3352 for (auto &ArgOp : CI->args())
3353 Tys.push_back(Elt: ArgOp->getType());
3354
3355 InstructionCost ScalarCallCost =
3356 TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);
3357
3358 // If this is an intrinsic we may have a lower cost for it.
3359 if (getVectorIntrinsicIDForCall(CI, TLI)) {
3360 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
3361 return std::min(a: ScalarCallCost, b: IntrinsicCost);
3362 }
3363 return ScalarCallCost;
3364}
3365
3366static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3367 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3368 return Elt;
3369 return VectorType::get(ElementType: Elt, EC: VF);
3370}
3371
3372InstructionCost
3373LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3374 ElementCount VF) const {
3375 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3376 assert(ID && "Expected intrinsic call!");
3377 Type *RetTy = MaybeVectorizeType(Elt: CI->getType(), VF);
3378 FastMathFlags FMF;
3379 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
3380 FMF = FPMO->getFastMathFlags();
3381
3382 SmallVector<const Value *> Arguments(CI->args());
3383 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3384 SmallVector<Type *> ParamTys;
3385 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
3386 result: std::back_inserter(x&: ParamTys),
3387 unary_op: [&](Type *Ty) { return MaybeVectorizeType(Elt: Ty, VF); });
3388
3389 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3390 dyn_cast<IntrinsicInst>(Val: CI));
3391 return TTI.getIntrinsicInstrCost(ICA: CostAttrs,
3392 CostKind: TargetTransformInfo::TCK_RecipThroughput);
3393}
3394
3395static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3396 auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType());
3397 auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType());
3398 return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3399}
3400
3401static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3402 auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType());
3403 auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType());
3404 return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3405}
3406
3407void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3408 VPlan &Plan) {
3409 // Fix widened non-induction PHIs by setting up the PHI operands.
3410 if (EnableVPlanNativePath)
3411 fixNonInductionPHIs(Plan, State);
3412
3413 // At this point every instruction in the original loop is widened to a
3414 // vector form. Now we need to fix the recurrences in the loop. These PHI
3415 // nodes are currently empty because we did not want to introduce cycles.
3416 // This is the second stage of vectorizing recurrences. Note that fixing
3417 // reduction phis is already modeled in VPlan.
3418 // TODO: Also model fixing fixed-order recurrence phis in VPlan.
3419 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
3420 VPBasicBlock *HeaderVPBB = VectorRegion->getEntryBasicBlock();
3421 for (VPRecipeBase &R : HeaderVPBB->phis()) {
3422 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R))
3423 fixFixedOrderRecurrence(PhiR: FOR, State);
3424 }
3425
3426 // Forget the original basic block.
3427 PSE.getSE()->forgetLoop(L: OrigLoop);
3428 PSE.getSE()->forgetBlockAndLoopDispositions();
3429
3430 // After vectorization, the exit blocks of the original loop will have
3431 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
3432 // looked through single-entry phis.
3433 SmallVector<BasicBlock *> ExitBlocks;
3434 OrigLoop->getExitBlocks(ExitBlocks);
3435 for (BasicBlock *Exit : ExitBlocks)
3436 for (PHINode &PN : Exit->phis())
3437 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN);
3438
3439 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
3440 Loop *VectorLoop = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[LatchVPBB]);
3441 if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
3442 // No edge from the middle block to the unique exit block has been inserted
3443 // and there is nothing to fix from vector loop; phis should have incoming
3444 // from scalar loop only.
3445 } else {
3446 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
3447 // the cost model.
3448
3449 // If we inserted an edge from the middle block to the unique exit block,
3450 // update uses outside the loop (phis) to account for the newly inserted
3451 // edge.
3452
3453 // Fix-up external users of the induction variables.
3454 for (const auto &Entry : Legal->getInductionVars())
3455 fixupIVUsers(OrigPhi: Entry.first, II: Entry.second,
3456 VectorTripCount: getOrCreateVectorTripCount(InsertBlock: VectorLoop->getLoopPreheader()),
3457 EndValue: IVEndValues[Entry.first], MiddleBlock: LoopMiddleBlock,
3458 VectorHeader: VectorLoop->getHeader(), Plan, State);
3459 }
3460
3461 // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3462 // in the exit block, so update the builder.
3463 State.Builder.SetInsertPoint(TheBB: State.CFG.ExitBB,
3464 IP: State.CFG.ExitBB->getFirstNonPHIIt());
3465 for (const auto &KV : Plan.getLiveOuts())
3466 KV.second->fixPhi(Plan, State);
3467
3468 for (Instruction *PI : PredicatedInstructions)
3469 sinkScalarOperands(PredInst: &*PI);
3470
3471 // Remove redundant induction instructions.
3472 cse(BB: VectorLoop->getHeader());
3473
3474 // Set/update profile weights for the vector and remainder loops as original
3475 // loop iterations are now distributed among them. Note that original loop
3476 // represented by LoopScalarBody becomes remainder loop after vectorization.
3477 //
3478 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3479 // end up with a slightly less precise result, but that should be OK since
3480 // the profile is not inherently precise anyway. Note also that a possible
3481 // bypass of the vector code caused by legality checks is ignored,
3482 // optimistically assigning all the weight to the vector loop.
3483 //
3484 // For scalable vectorization we can't know at compile time how many
3485 // iterations of the loop are handled in one vector iteration, so instead
3486 // assume a pessimistic vscale of '1'.
3487 setProfileInfoAfterUnrolling(OrigLoop: LI->getLoopFor(BB: LoopScalarBody), UnrolledLoop: VectorLoop,
3488 RemainderLoop: LI->getLoopFor(BB: LoopScalarBody),
3489 UF: VF.getKnownMinValue() * UF);
3490}
3491
3492void InnerLoopVectorizer::fixFixedOrderRecurrence(
3493 VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3494 // This is the second phase of vectorizing first-order recurrences. An
3495 // overview of the transformation is described below. Suppose we have the
3496 // following loop.
3497 //
3498 // for (int i = 0; i < n; ++i)
3499 // b[i] = a[i] - a[i - 1];
3500 //
3501 // There is a first-order recurrence on "a". For this loop, the shorthand
3502 // scalar IR looks like:
3503 //
3504 // scalar.ph:
3505 // s_init = a[-1]
3506 // br scalar.body
3507 //
3508 // scalar.body:
3509 // i = phi [0, scalar.ph], [i+1, scalar.body]
3510 // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3511 // s2 = a[i]
3512 // b[i] = s2 - s1
3513 // br cond, scalar.body, ...
3514 //
3515 // In this example, s1 is a recurrence because its value depends on the
3516 // previous iteration. In the first phase of vectorization, we created a
3517 // vector phi v1 for s1. We now complete the vectorization and produce the
3518 // shorthand vector IR shown below (for VF = 4, UF = 1).
3519 //
3520 // vector.ph:
3521 // v_init = vector(..., ..., ..., a[-1])
3522 // br vector.body
3523 //
3524 // vector.body
3525 // i = phi [0, vector.ph], [i+4, vector.body]
3526 // v1 = phi [v_init, vector.ph], [v2, vector.body]
3527 // v2 = a[i, i+1, i+2, i+3];
3528 // v3 = vector(v1(3), v2(0, 1, 2))
3529 // b[i, i+1, i+2, i+3] = v2 - v3
3530 // br cond, vector.body, middle.block
3531 //
3532 // middle.block:
3533 // x = v2(3)
3534 // br scalar.ph
3535 //
3536 // scalar.ph:
3537 // s_init = phi [x, middle.block], [a[-1], otherwise]
3538 // br scalar.body
3539 //
3540 // After execution completes the vector loop, we extract the next value of
3541 // the recurrence (x) to use as the initial value in the scalar loop.
3542
3543 // Extract the last vector element in the middle block. This will be the
3544 // initial value for the recurrence when jumping to the scalar loop.
3545 VPValue *PreviousDef = PhiR->getBackedgeValue();
3546 Value *Incoming = State.get(Def: PreviousDef, Part: UF - 1);
3547 auto *ExtractForScalar = Incoming;
3548 auto *IdxTy = Builder.getInt32Ty();
3549 Value *RuntimeVF = nullptr;
3550 if (VF.isVector()) {
3551 auto *One = ConstantInt::get(Ty: IdxTy, V: 1);
3552 Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3553 RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF);
3554 auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One);
3555 ExtractForScalar =
3556 Builder.CreateExtractElement(Vec: Incoming, Idx: LastIdx, Name: "vector.recur.extract");
3557 }
3558
3559 auto RecurSplice = cast<VPInstruction>(Val: *PhiR->user_begin());
3560 assert(PhiR->getNumUsers() == 1 &&
3561 RecurSplice->getOpcode() ==
3562 VPInstruction::FirstOrderRecurrenceSplice &&
3563 "recurrence phi must have a single user: FirstOrderRecurrenceSplice");
3564 SmallVector<VPLiveOut *> LiveOuts;
3565 for (VPUser *U : RecurSplice->users())
3566 if (auto *LiveOut = dyn_cast<VPLiveOut>(Val: U))
3567 LiveOuts.push_back(Elt: LiveOut);
3568
3569 if (!LiveOuts.empty()) {
3570 // Extract the second-to-last element in the middle block if the
3571 // Phi is used outside the loop. We need to extract the phi itself
3572 // and not the last element (the phi update in the current iteration). This
3573 // will be the value when jumping to the exit block from the
3574 // LoopMiddleBlock, when the scalar loop is not run at all.
3575 Value *ExtractForPhiUsedOutsideLoop = nullptr;
3576 if (VF.isVector()) {
3577 auto *Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: 2));
3578 ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3579 Vec: Incoming, Idx, Name: "vector.recur.extract.for.phi");
3580 } else {
3581 assert(UF > 1 && "VF and UF cannot both be 1");
3582 // When the loop is unrolled without vectorizing, initialize
3583 // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the
3584 // last part of `Incoming`. This is analogous to the vectorized case above:
3585 // extracting the second-to-last element when VF > 1.
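      // E.g. with VF = 1 and UF = 4, part UF - 2 == 2 holds the penultimate
      // scalar value of the recurrence.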
3586 ExtractForPhiUsedOutsideLoop = State.get(Def: PreviousDef, Part: UF - 2);
3587 }
3588
3589 for (VPLiveOut *LiveOut : LiveOuts) {
3590 assert(!Cost->requiresScalarEpilogue(VF.isVector()));
3591 PHINode *LCSSAPhi = LiveOut->getPhi();
3592 LCSSAPhi->addIncoming(V: ExtractForPhiUsedOutsideLoop, BB: LoopMiddleBlock);
3593 State.Plan->removeLiveOut(PN: LCSSAPhi);
3594 }
3595 }
3596
3597 // Fix the initial value of the original recurrence in the scalar loop.
3598 Builder.SetInsertPoint(TheBB: LoopScalarPreHeader, IP: LoopScalarPreHeader->begin());
3599 PHINode *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
3600 auto *Start = Builder.CreatePHI(Ty: Phi->getType(), NumReservedValues: 2, Name: "scalar.recur.init");
3601 auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3602 for (auto *BB : predecessors(BB: LoopScalarPreHeader)) {
3603 auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3604 Start->addIncoming(V: Incoming, BB);
3605 }
3606
3607 Phi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: Start);
3608 Phi->setName("scalar.recur");
3609}
3610
3611void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3612 // The basic block and loop containing the predicated instruction.
3613 auto *PredBB = PredInst->getParent();
3614 auto *VectorLoop = LI->getLoopFor(BB: PredBB);
3615
3616 // Initialize a worklist with the operands of the predicated instruction.
3617 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3618
3619 // Holds instructions that we need to analyze again. An instruction may be
3620 // reanalyzed if we don't yet know if we can sink it or not.
3621 SmallVector<Instruction *, 8> InstsToReanalyze;
3622
3623 // Returns true if a given use occurs in the predicated block. Phi nodes use
3624 // their operands in their corresponding predecessor blocks.
3625 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3626 auto *I = cast<Instruction>(Val: U.getUser());
3627 BasicBlock *BB = I->getParent();
3628 if (auto *Phi = dyn_cast<PHINode>(Val: I))
3629 BB = Phi->getIncomingBlock(
3630 i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo()));
3631 return BB == PredBB;
3632 };
3633
3634 // Iteratively sink the scalarized operands of the predicated instruction
3635 // into the block we created for it. When an instruction is sunk, its
3636 // operands are then added to the worklist. The algorithm ends once a full
3637 // pass over the worklist fails to sink a single instruction.
3638 bool Changed;
3639 do {
3640 // Add the instructions that need to be reanalyzed to the worklist, and
3641 // reset the changed indicator.
3642 Worklist.insert(Start: InstsToReanalyze.begin(), End: InstsToReanalyze.end());
3643 InstsToReanalyze.clear();
3644 Changed = false;
3645
3646 while (!Worklist.empty()) {
3647 auto *I = dyn_cast<Instruction>(Val: Worklist.pop_back_val());
3648
3649 // We can't sink an instruction if it is a phi node, is not in the loop,
3650 // may have side effects or may read from memory.
3651 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3652 if (!I || isa<PHINode>(Val: I) || !VectorLoop->contains(Inst: I) ||
3653 I->mayHaveSideEffects() || I->mayReadFromMemory())
3654 continue;
3655
3656 // If the instruction is already in PredBB, check if we can sink its
3657 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3658 // sinking the scalar instruction I, hence it appears in PredBB; but it
3659 // may have failed to sink I's operands (recursively), which we try
3660 // (again) here.
3661 if (I->getParent() == PredBB) {
3662 Worklist.insert(Start: I->op_begin(), End: I->op_end());
3663 continue;
3664 }
3665
3666 // It's legal to sink the instruction if all its uses occur in the
3667 // predicated block. Otherwise, there's nothing to do yet, and we may
3668 // need to reanalyze the instruction.
3669 if (!llvm::all_of(Range: I->uses(), P: isBlockOfUsePredicated)) {
3670 InstsToReanalyze.push_back(Elt: I);
3671 continue;
3672 }
3673
3674 // Move the instruction to the beginning of the predicated block, and add
3675 // its operands to the worklist.
3676 I->moveBefore(MovePos: &*PredBB->getFirstInsertionPt());
3677 Worklist.insert(Start: I->op_begin(), End: I->op_end());
3678
3679 // The sinking may have enabled other instructions to be sunk, so we will
3680 // need to iterate.
3681 Changed = true;
3682 }
3683 } while (Changed);
3684}
3685
3686void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3687 VPTransformState &State) {
3688 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
3689 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
3690 for (VPRecipeBase &P : VPBB->phis()) {
3691 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
3692 if (!VPPhi)
3693 continue;
3694 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi, Part: 0));
3695 // Make sure the builder has a valid insert point.
3696 Builder.SetInsertPoint(NewPhi);
3697 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3698 VPValue *Inc = VPPhi->getIncomingValue(I: i);
3699 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(I: i);
3700 NewPhi->addIncoming(V: State.get(Def: Inc, Part: 0), BB: State.CFG.VPBB2IRBB[VPBB]);
3701 }
3702 }
3703 }
3704}
3705
3706void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3707 // We should not collect Scalars more than once per VF. Right now, this
3708 // function is called from collectUniformsAndScalars(), which already does
3709 // this check. Collecting Scalars for VF=1 does not make any sense.
3710 assert(VF.isVector() && !Scalars.contains(VF) &&
3711 "This function should not be visited twice for the same VF");
3712
3713 // This avoids any chances of creating a REPLICATE recipe during planning
3714 // since that would result in generation of scalarized code during execution,
3715 // which is not supported for scalable vectors.
3716 if (VF.isScalable()) {
3717 Scalars[VF].insert(I: Uniforms[VF].begin(), E: Uniforms[VF].end());
3718 return;
3719 }
3720
3721 SmallSetVector<Instruction *, 8> Worklist;
3722
3723 // These sets are used to seed the analysis with pointers used by memory
3724 // accesses that will remain scalar.
3725 SmallSetVector<Instruction *, 8> ScalarPtrs;
3726 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3727 auto *Latch = TheLoop->getLoopLatch();
3728
3729 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3730 // The pointer operands of loads and stores will be scalar as long as the
3731 // memory access is not a gather or scatter operation. The value operand of a
3732 // store will remain scalar if the store is scalarized.
3733 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3734 InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
3735 assert(WideningDecision != CM_Unknown &&
3736 "Widening decision should be ready at this moment");
3737 if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
3738 if (Ptr == Store->getValueOperand())
3739 return WideningDecision == CM_Scalarize;
3740 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3741 "Ptr is neither a value or pointer operand");
3742 return WideningDecision != CM_GatherScatter;
3743 };
3744
3745 // A helper that returns true if the given value is a bitcast or
3746 // getelementptr instruction contained in the loop.
3747 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3748 return ((isa<BitCastInst>(Val: V) && V->getType()->isPointerTy()) ||
3749 isa<GetElementPtrInst>(Val: V)) &&
3750 !TheLoop->isLoopInvariant(V);
3751 };
3752
3753 // A helper that evaluates a memory access's use of a pointer. If the use will
3754 // be a scalar use and the pointer is only used by memory accesses, we place
3755 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3756 // PossibleNonScalarPtrs.
3757 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3758 // We only care about bitcast and getelementptr instructions contained in
3759 // the loop.
3760 if (!isLoopVaryingBitCastOrGEP(Ptr))
3761 return;
3762
3763 // If the pointer has already been identified as scalar (e.g., if it was
3764 // also identified as uniform), there's nothing to do.
3765 auto *I = cast<Instruction>(Val: Ptr);
3766 if (Worklist.count(key: I))
3767 return;
3768
3769 // If the use of the pointer will be a scalar use, and all users of the
3770 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3771 // place the pointer in PossibleNonScalarPtrs.
3772 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(Range: I->users(), P: [&](User *U) {
3773 return isa<LoadInst>(Val: U) || isa<StoreInst>(Val: U);
3774 }))
3775 ScalarPtrs.insert(X: I);
3776 else
3777 PossibleNonScalarPtrs.insert(Ptr: I);
3778 };
3779
3780 // We seed the scalars analysis with two classes of instructions: (1)
3781 // instructions marked uniform-after-vectorization and (2) bitcast,
3782 // getelementptr and (pointer) phi instructions used by memory accesses
3783 // requiring a scalar use.
3784 //
3785 // (1) Add to the worklist all instructions that have been identified as
3786 // uniform-after-vectorization.
3787 Worklist.insert(Start: Uniforms[VF].begin(), End: Uniforms[VF].end());
3788
3789 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3790 // memory accesses requiring a scalar use. The pointer operands of loads and
3791 // stores will be scalar as long as the memory access is not a gather or
3792 // scatter operation. The value operand of a store will remain scalar if the
3793 // store is scalarized.
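  // E.g. for a consecutive access pattern like 'a[i] = b[i]', the GEPs
  // feeding the load and the store stay scalar (each part becomes a single
  // wide load/store), whereas a gather/scatter would keep a vector of
  // pointers.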
3794 for (auto *BB : TheLoop->blocks())
3795 for (auto &I : *BB) {
3796 if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
3797 evaluatePtrUse(Load, Load->getPointerOperand());
3798 } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
3799 evaluatePtrUse(Store, Store->getPointerOperand());
3800 evaluatePtrUse(Store, Store->getValueOperand());
3801 }
3802 }
3803 for (auto *I : ScalarPtrs)
3804 if (!PossibleNonScalarPtrs.count(Ptr: I)) {
3805 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3806 Worklist.insert(X: I);
3807 }
3808
3809 // Insert the forced scalars.
3810 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3811 // induction variable when the PHI user is scalarized.
3812 auto ForcedScalar = ForcedScalars.find(Val: VF);
3813 if (ForcedScalar != ForcedScalars.end())
3814 for (auto *I : ForcedScalar->second) {
3815 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3816 Worklist.insert(X: I);
3817 }
3818
3819 // Expand the worklist by looking through any bitcasts and getelementptr
3820 // instructions we've already identified as scalar. This is similar to the
3821 // expansion step in collectLoopUniforms(); however, here we're only
3822 // expanding to include additional bitcasts and getelementptr instructions.
3823 unsigned Idx = 0;
3824 while (Idx != Worklist.size()) {
3825 Instruction *Dst = Worklist[Idx++];
3826 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(i: 0)))
3827 continue;
3828 auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
3829 if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
3830 auto *J = cast<Instruction>(Val: U);
3831 return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
3832 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
3833 isScalarUse(J, Src));
3834 })) {
3835 Worklist.insert(X: Src);
3836 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3837 }
3838 }
3839
3840 // An induction variable will remain scalar if all users of the induction
3841 // variable and induction variable update remain scalar.
3842 for (const auto &Induction : Legal->getInductionVars()) {
3843 auto *Ind = Induction.first;
3844 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
3845
3846 // If tail-folding is applied, the primary induction variable will be used
3847 // to feed a vector compare.
3848 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3849 continue;
3850
3851 // Returns true if \p Indvar is a pointer induction that is used directly by
3852 // load/store instruction \p I.
3853 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3854 Instruction *I) {
3855 return Induction.second.getKind() ==
3856 InductionDescriptor::IK_PtrInduction &&
3857 (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
3858 Indvar == getLoadStorePointerOperand(V: I) && isScalarUse(I, Indvar);
3859 };
3860
3861 // Determine if all users of the induction variable are scalar after
3862 // vectorization.
3863 auto ScalarInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
3864 auto *I = cast<Instruction>(Val: U);
3865 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3866 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3867 });
3868 if (!ScalarInd)
3869 continue;
3870
3871 // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable nor its update should be marked scalar after
3873 // vectorization.
3874 auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
3875 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
3876 continue;
3877
3878 // Determine if all users of the induction variable update instruction are
3879 // scalar after vectorization.
3880 auto ScalarIndUpdate =
3881 llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3882 auto *I = cast<Instruction>(Val: U);
3883 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3884 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3885 });
3886 if (!ScalarIndUpdate)
3887 continue;
3888
3889 // The induction variable and its update instruction will remain scalar.
3890 Worklist.insert(X: Ind);
3891 Worklist.insert(X: IndUpdate);
3892 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3893 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3894 << "\n");
3895 }
3896
3897 Scalars[VF].insert(I: Worklist.begin(), E: Worklist.end());
3898}
3899
3900bool LoopVectorizationCostModel::isScalarWithPredication(
3901 Instruction *I, ElementCount VF) const {
3902 if (!isPredicatedInst(I))
3903 return false;
3904
  // Do we have a non-scalar lowering for this predicated
  // instruction? If not, it is scalar with predication.
3907 switch(I->getOpcode()) {
3908 default:
3909 return true;
3910 case Instruction::Call:
3911 if (VF.isScalar())
3912 return true;
3913 return CallWideningDecisions.at(Val: std::make_pair(x: cast<CallInst>(Val: I), y&: VF))
3914 .Kind == CM_Scalarize;
3915 case Instruction::Load:
3916 case Instruction::Store: {
3917 auto *Ptr = getLoadStorePointerOperand(V: I);
3918 auto *Ty = getLoadStoreType(I);
3919 Type *VTy = Ty;
3920 if (VF.isVector())
3921 VTy = VectorType::get(ElementType: Ty, EC: VF);
3922 const Align Alignment = getLoadStoreAlignment(I);
3923 return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment) ||
3924 TTI.isLegalMaskedGather(DataType: VTy, Alignment))
3925 : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment) ||
3926 TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
3927 }
3928 case Instruction::UDiv:
3929 case Instruction::SDiv:
3930 case Instruction::SRem:
3931 case Instruction::URem: {
3932 // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost-based decision here will always select safe-divisor for
3934 // scalable vectors as scalarization isn't legal.
3935 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3936 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3937 }
3938 }
3939}
3940
3941bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3942 if (!blockNeedsPredicationForAnyReason(BB: I->getParent()))
3943 return false;
3944
3945 // Can we prove this instruction is safe to unconditionally execute?
3946 // If not, we must use some form of predication.
3947 switch(I->getOpcode()) {
3948 default:
3949 return false;
3950 case Instruction::Load:
3951 case Instruction::Store: {
3952 if (!Legal->isMaskRequired(I))
3953 return false;
3954 // When we know the load's address is loop invariant and the instruction
3955 // in the original scalar loop was unconditionally executed then we
3956 // don't need to mark it as a predicated instruction. Tail folding may
3957 // introduce additional predication, but we're guaranteed to always have
3958 // at least one active lane. We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding. For stores, we need to prove both
    // speculation safety (which follows from the same argument as loads) and
    // that the value being stored is correct. The easiest form of the latter
    // is to require that all values stored are the same.
3963 if (Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
3964 (isa<LoadInst>(Val: I) ||
3965 (isa<StoreInst>(Val: I) &&
3966 TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()))) &&
3967 !Legal->blockNeedsPredication(BB: I->getParent()))
3968 return false;
3969 return true;
3970 }
3971 case Instruction::UDiv:
3972 case Instruction::SDiv:
3973 case Instruction::SRem:
3974 case Instruction::URem:
3975 // TODO: We can use the loop-preheader as context point here and get
3976 // context sensitive reasoning
3977 return !isSafeToSpeculativelyExecute(I);
3978 case Instruction::Call:
3979 return Legal->isMaskRequired(I);
3980 }
3981}
3982
3983std::pair<InstructionCost, InstructionCost>
3984LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3985 ElementCount VF) const {
3986 assert(I->getOpcode() == Instruction::UDiv ||
3987 I->getOpcode() == Instruction::SDiv ||
3988 I->getOpcode() == Instruction::SRem ||
3989 I->getOpcode() == Instruction::URem);
3990 assert(!isSafeToSpeculativelyExecute(I));
3991
3992 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3993
3994 // Scalarization isn't legal for scalable vector types
3995 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3996 if (!VF.isScalable()) {
3997 // Get the scalarization cost and scale this amount by the probability of
3998 // executing the predicated block. If the instruction is not predicated,
3999 // we fall through to the next case.
4000 ScalarizationCost = 0;
4001
4002 // These instructions have a non-void type, so account for the phi nodes
4003 // that we will create. This cost is likely to be zero. The phi node
4004 // cost, if any, should be scaled by the block probability because it
4005 // models a copy at the end of each predicated block.
4006 ScalarizationCost += VF.getKnownMinValue() *
4007 TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
4008
4009 // The cost of the non-predicated instruction.
4010 ScalarizationCost += VF.getKnownMinValue() *
4011 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);
4012
4013 // The cost of insertelement and extractelement instructions needed for
4014 // scalarization.
4015 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
4016
4017 // Scale the cost by the probability of executing the predicated blocks.
4018 // This assumes the predicated block for each vector lane is equally
4019 // likely.
4020 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
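    // For example (purely illustrative costs): with VF = 4, a PHI cost of 0,
    // a scalar divide cost of 20 and an insert/extract overhead of 8, the sum
    // is 4*0 + 4*20 + 8 = 88; assuming a reciprocal predicated-block
    // probability of 2, the resulting ScalarizationCost is 44.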
4021 }
4022 InstructionCost SafeDivisorCost = 0;
4023
4024 auto *VecTy = ToVectorTy(Scalar: I->getType(), EC: VF);
4025
4026 // The cost of the select guard to ensure all lanes are well defined
4027 // after we speculate above any internal control flow.
4028 SafeDivisorCost += TTI.getCmpSelInstrCost(
4029 Opcode: Instruction::Select, ValTy: VecTy,
4030 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
4031 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
4032
4033 // Certain instructions can be cheaper to vectorize if they have a constant
4034 // second vector operand. One example of this are shifts on x86.
4035 Value *Op2 = I->getOperand(i: 1);
4036 auto Op2Info = TTI.getOperandInfo(V: Op2);
4037 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
4038 Legal->isInvariant(V: Op2))
4039 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
4040
4041 SmallVector<const Value *, 4> Operands(I->operand_values());
4042 SafeDivisorCost += TTI.getArithmeticInstrCost(
      I->getOpcode(), VecTy, CostKind,
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      Op2Info, Operands, I);
4046 return {ScalarizationCost, SafeDivisorCost};
4047}
4048
4049bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4050 Instruction *I, ElementCount VF) {
4051 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4052 assert(getWideningDecision(I, VF) == CM_Unknown &&
4053 "Decision should not be set yet.");
4054 auto *Group = getInterleavedAccessGroup(Instr: I);
4055 assert(Group && "Must have a group.");
4056
  // If the instruction's allocated size doesn't equal its type size, it
4058 // requires padding and will be scalarized.
4059 auto &DL = I->getModule()->getDataLayout();
4060 auto *ScalarTy = getLoadStoreType(I);
4061 if (hasIrregularType(Ty: ScalarTy, DL))
4062 return false;
4063
4064 // If the group involves a non-integral pointer, we may not be able to
4065 // losslessly cast all values to a common type.
4066 unsigned InterleaveFactor = Group->getFactor();
4067 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
4068 for (unsigned i = 0; i < InterleaveFactor; i++) {
4069 Instruction *Member = Group->getMember(Index: i);
4070 if (!Member)
4071 continue;
4072 auto *MemberTy = getLoadStoreType(I: Member);
4073 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
4074 // Don't coerce non-integral pointers to integers or vice versa.
4075 if (MemberNI != ScalarNI) {
4076 // TODO: Consider adding special nullptr value case here
4077 return false;
4078 } else if (MemberNI && ScalarNI &&
4079 ScalarTy->getPointerAddressSpace() !=
4080 MemberTy->getPointerAddressSpace()) {
4081 return false;
4082 }
4083 }
4084
4085 // Check if masking is required.
4086 // A Group may need masking for one of two reasons: it resides in a block that
4087 // needs predication, or it was decided to use masking to deal with gaps
4088 // (either a gap at the end of a load-access that may result in a speculative
4089 // load, or any gaps in a store-access).
4090 bool PredicatedAccessRequiresMasking =
4091 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
4092 Legal->isMaskRequired(I);
4093 bool LoadAccessWithGapsRequiresEpilogMasking =
4094 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
4095 !isScalarEpilogueAllowed();
4096 bool StoreAccessWithGapsRequiresMasking =
4097 isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor());
4098 if (!PredicatedAccessRequiresMasking &&
4099 !LoadAccessWithGapsRequiresEpilogMasking &&
4100 !StoreAccessWithGapsRequiresMasking)
4101 return true;
4102
4103 // If masked interleaving is required, we expect that the user/target had
4104 // enabled it, because otherwise it either wouldn't have been created or
4105 // it should have been invalidated by the CostModel.
4106 assert(useMaskedInterleavedAccesses(TTI) &&
4107 "Masked interleave-groups for predicated accesses are not enabled.");
4108
4109 if (Group->isReverse())
4110 return false;
4111
4112 auto *Ty = getLoadStoreType(I);
4113 const Align Alignment = getLoadStoreAlignment(I);
4114 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment)
4115 : TTI.isLegalMaskedStore(DataType: Ty, Alignment);
4116}
4117
4118bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4119 Instruction *I, ElementCount VF) {
4120 // Get and ensure we have a valid memory instruction.
4121 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4122
4123 auto *Ptr = getLoadStorePointerOperand(V: I);
4124 auto *ScalarTy = getLoadStoreType(I);
4125
  // In order to be widened, the pointer must first of all be consecutive.
4127 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
4128 return false;
4129
4130 // If the instruction is a store located in a predicated block, it will be
4131 // scalarized.
4132 if (isScalarWithPredication(I, VF))
4133 return false;
4134
  // If the instruction's allocated size doesn't equal its type size, it
4136 // requires padding and will be scalarized.
4137 auto &DL = I->getModule()->getDataLayout();
4138 if (hasIrregularType(Ty: ScalarTy, DL))
4139 return false;
4140
4141 return true;
4142}
4143
4144void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4145 // We should not collect Uniforms more than once per VF. Right now,
4146 // this function is called from collectUniformsAndScalars(), which
4147 // already does this check. Collecting Uniforms for VF=1 does not make any
4148 // sense.
4149
4150 assert(VF.isVector() && !Uniforms.contains(VF) &&
4151 "This function should not be visited twice for the same VF");
4152
  // Initialize the entry for this VF. Even if no uniform values are found, the
  // (possibly empty) entry ensures we will not analyze this VF again:
  // Uniforms.count(VF) will return 1.
4155 Uniforms[VF].clear();
4156
4157 // We now know that the loop is vectorizable!
4158 // Collect instructions inside the loop that will remain uniform after
4159 // vectorization.
4160
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4163 auto isOutOfScope = [&](Value *V) -> bool {
4164 Instruction *I = dyn_cast<Instruction>(Val: V);
4165 return (!I || !TheLoop->contains(Inst: I));
4166 };
4167
4168 // Worklist containing uniform instructions demanding lane 0.
4169 SetVector<Instruction *> Worklist;
4170 BasicBlock *Latch = TheLoop->getLoopLatch();
4171
4172 // Add uniform instructions demanding lane 0 to the worklist. Instructions
4173 // that are scalar with predication must not be considered uniform after
4174 // vectorization, because that would create an erroneous replicating region
4175 // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
4177 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4178 if (isOutOfScope(I)) {
4179 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4180 << *I << "\n");
4181 return;
4182 }
4183 if (isScalarWithPredication(I, VF)) {
4184 LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4185 << *I << "\n");
4186 return;
4187 }
4188 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4189 Worklist.insert(X: I);
4190 };
4191
4192 // Start with the conditional branch. If the branch condition is an
4193 // instruction contained in the loop that is only used by the branch, it is
4194 // uniform.
4195 auto *Cmp = dyn_cast<Instruction>(Val: Latch->getTerminator()->getOperand(i: 0));
4196 if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
4197 addToWorklistIfAllowed(Cmp);
4198
4199 auto PrevVF = VF.divideCoefficientBy(RHS: 2);
4200 // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
4202 auto isUniformMemOpUse = [&](Instruction *I) {
4203 // If the value was already known to not be uniform for the previous
4204 // (smaller VF), it cannot be uniform for the larger VF.
4205 if (PrevVF.isVector()) {
4206 auto Iter = Uniforms.find(Val: PrevVF);
4207 if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
4208 return false;
4209 }
4210 if (!Legal->isUniformMemOp(I&: *I, VF))
4211 return false;
4212 if (isa<LoadInst>(Val: I))
4213 // Loading the same address always produces the same result - at least
4214 // assuming aliasing and ordering which have already been checked.
4215 return true;
4216 // Storing the same value on every iteration.
4217 return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
4218 };
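  // As an illustrative example, isUniformMemOpUse above holds for the load in
  //   for (i = 0; i < n; ++i) sum += *p;   // p loop-invariant
  // Every lane reads the same address, so executing a single scalar load per
  // vector iteration is sufficient.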
4219
4220 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4221 InstWidening WideningDecision = getWideningDecision(I, VF);
4222 assert(WideningDecision != CM_Unknown &&
4223 "Widening decision should be ready at this moment");
4224
4225 if (isUniformMemOpUse(I))
4226 return true;
4227
4228 return (WideningDecision == CM_Widen ||
4229 WideningDecision == CM_Widen_Reverse ||
4230 WideningDecision == CM_Interleave);
4231 };
4232
4233 // Returns true if Ptr is the pointer operand of a memory access instruction
4234 // I, I is known to not require scalarization, and the pointer is not also
4235 // stored.
4236 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4237 if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
4238 return false;
4239 return getLoadStorePointerOperand(V: I) == Ptr &&
4240 (isUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
4241 };
4242
4243 // Holds a list of values which are known to have at least one uniform use.
4244 // Note that there may be other uses which aren't uniform. A "uniform use"
4245 // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
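  // For example, the pointer operand of a widened consecutive load is such a
  // use: only lane 0 of the address is needed to form the wide load, even
  // though the per-lane addresses would all differ.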
4248 SetVector<Value *> HasUniformUse;
4249
4250 // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) uses which demand only lane 0 of their operand.
4252 for (auto *BB : TheLoop->blocks())
4253 for (auto &I : *BB) {
4254 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
4255 switch (II->getIntrinsicID()) {
4256 case Intrinsic::sideeffect:
4257 case Intrinsic::experimental_noalias_scope_decl:
4258 case Intrinsic::assume:
4259 case Intrinsic::lifetime_start:
4260 case Intrinsic::lifetime_end:
4261 if (TheLoop->hasLoopInvariantOperands(I: &I))
4262 addToWorklistIfAllowed(&I);
4263 break;
4264 default:
4265 break;
4266 }
4267 }
4268
4269 // ExtractValue instructions must be uniform, because the operands are
4270 // known to be loop-invariant.
4271 if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
4272 assert(isOutOfScope(EVI->getAggregateOperand()) &&
4273 "Expected aggregate value to be loop invariant");
4274 addToWorklistIfAllowed(EVI);
4275 continue;
4276 }
4277
4278 // If there's no pointer operand, there's nothing to do.
4279 auto *Ptr = getLoadStorePointerOperand(V: &I);
4280 if (!Ptr)
4281 continue;
4282
4283 if (isUniformMemOpUse(&I))
4284 addToWorklistIfAllowed(&I);
4285
4286 if (isVectorizedMemAccessUse(&I, Ptr))
4287 HasUniformUse.insert(X: Ptr);
4288 }
4289
4290 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4291 // demanding) users. Since loops are assumed to be in LCSSA form, this
4292 // disallows uses outside the loop as well.
4293 for (auto *V : HasUniformUse) {
4294 if (isOutOfScope(V))
4295 continue;
4296 auto *I = cast<Instruction>(Val: V);
4297 auto UsersAreMemAccesses =
4298 llvm::all_of(Range: I->users(), P: [&](User *U) -> bool {
4299 return isVectorizedMemAccessUse(cast<Instruction>(Val: U), V);
4300 });
4301 if (UsersAreMemAccesses)
4302 addToWorklistIfAllowed(I);
4303 }
4304
  // Expand Worklist in topological order: whenever a new instruction is added,
  // its users should already be inside Worklist. This ensures that a uniform
  // instruction will only be used by uniform instructions.
4308 unsigned idx = 0;
4309 while (idx != Worklist.size()) {
4310 Instruction *I = Worklist[idx++];
4311
4312 for (auto *OV : I->operand_values()) {
4313 // isOutOfScope operands cannot be uniform instructions.
4314 if (isOutOfScope(OV))
4315 continue;
      // Fixed-order recurrence phis should typically be considered
      // non-uniform.
4318 auto *OP = dyn_cast<PHINode>(Val: OV);
4319 if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
4320 continue;
4321 // If all the users of the operand are uniform, then add the
4322 // operand into the uniform worklist.
4323 auto *OI = cast<Instruction>(Val: OV);
4324 if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
4325 auto *J = cast<Instruction>(Val: U);
4326 return Worklist.count(key: J) || isVectorizedMemAccessUse(J, OI);
4327 }))
4328 addToWorklistIfAllowed(OI);
4329 }
4330 }
4331
4332 // For an instruction to be added into Worklist above, all its users inside
4333 // the loop should also be in Worklist. However, this condition cannot be
4334 // true for phi nodes that form a cyclic dependence. We must process phi
4335 // nodes separately. An induction variable will remain uniform if all users
4336 // of the induction variable and induction variable update remain uniform.
4337 // The code below handles both pointer and non-pointer induction variables.
4338 for (const auto &Induction : Legal->getInductionVars()) {
4339 auto *Ind = Induction.first;
4340 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
4341
4342 // Determine if all users of the induction variable are uniform after
4343 // vectorization.
4344 auto UniformInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
4345 auto *I = cast<Instruction>(Val: U);
4346 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
4347 isVectorizedMemAccessUse(I, Ind);
4348 });
4349 if (!UniformInd)
4350 continue;
4351
4352 // Determine if all users of the induction variable update instruction are
4353 // uniform after vectorization.
4354 auto UniformIndUpdate =
4355 llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
4356 auto *I = cast<Instruction>(Val: U);
4357 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
4358 isVectorizedMemAccessUse(I, IndUpdate);
4359 });
4360 if (!UniformIndUpdate)
4361 continue;
4362
4363 // The induction variable and its update instruction will remain uniform.
4364 addToWorklistIfAllowed(Ind);
4365 addToWorklistIfAllowed(IndUpdate);
4366 }
4367
4368 Uniforms[VF].insert(I: Worklist.begin(), E: Worklist.end());
4369}
4370
4371bool LoopVectorizationCostModel::runtimeChecksRequired() {
4372 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4373
4374 if (Legal->getRuntimePointerChecking()->Need) {
4375 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
4376 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
4377 "loop with '#pragma clang loop vectorize(enable)' when "
4378 "compiling with -Os/-Oz",
4379 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
4380 return true;
4381 }
4382
4383 if (!PSE.getPredicate().isAlwaysTrue()) {
4384 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
4385 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
4386 "loop with '#pragma clang loop vectorize(enable)' when "
4387 "compiling with -Os/-Oz",
4388 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
4389 return true;
4390 }
4391
4392 // FIXME: Avoid specializing for stride==1 instead of bailing out.
4393 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4394 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
4395 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
4396 "this loop without such check by compiling with -Os/-Oz",
4397 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
4398 return true;
4399 }
4400
4401 return false;
4402}
4403
4404ElementCount
4405LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4406 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4407 return ElementCount::getScalable(MinVal: 0);
4408
4409 if (Hints->isScalableVectorizationDisabled()) {
4410 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
4411 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
4412 return ElementCount::getScalable(MinVal: 0);
4413 }
4414
4415 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4416
4417 auto MaxScalableVF = ElementCount::getScalable(
4418 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
4419
4420 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4421 // FIXME: While for scalable vectors this is currently sufficient, this should
4422 // be replaced by a more detailed mechanism that filters out specific VFs,
4423 // instead of invalidating vectorization for a whole set of VFs based on the
4424 // MaxVF.
4425
4426 // Disable scalable vectorization if the loop contains unsupported reductions.
4427 if (!canVectorizeReductions(VF: MaxScalableVF)) {
4428 reportVectorizationInfo(
4429 Msg: "Scalable vectorization not supported for the reduction "
4430 "operations found in this loop.",
4431 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
4432 return ElementCount::getScalable(MinVal: 0);
4433 }
4434
4435 // Disable scalable vectorization if the loop contains any instructions
4436 // with element types not supported for scalable vectors.
4437 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
4438 return !Ty->isVoidTy() &&
4439 !this->TTI.isElementTypeLegalForScalableVector(Ty);
4440 })) {
4441 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
4442 "for all element types found in this loop.",
4443 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
4444 return ElementCount::getScalable(MinVal: 0);
4445 }
4446
4447 if (Legal->isSafeForAnyVectorWidth())
4448 return MaxScalableVF;
4449
4450 // Limit MaxScalableVF by the maximum safe dependence distance.
4451 if (std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI))
4452 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
4453 else
4454 MaxScalableVF = ElementCount::getScalable(MinVal: 0);
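  // For example (illustrative values): with MaxSafeElements == 32 and a
  // maximum vscale of 16, the largest safe scalable VF computed above is
  // <vscale x 2>.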
4455
4456 if (!MaxScalableVF)
4457 reportVectorizationInfo(
4458 Msg: "Max legal vector width too small, scalable vectorization "
4459 "unfeasible.",
4460 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
4461
4462 return MaxScalableVF;
4463}
4464
4465FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4466 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4467 MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
4468 unsigned SmallestType, WidestType;
4469 std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();
4470
4471 // Get the maximum safe dependence distance in bits computed by LAA.
4472 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4474 // dependence distance).
4475 unsigned MaxSafeElements =
4476 llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
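  // For example (illustrative values), if LAA reports a maximum safe width of
  // 200 bits and the widest type is 32 bits, then
  // MaxSafeElements = bit_floor(200 / 32) = bit_floor(6) = 4.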
4477
4478 auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElements);
4479 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4480
4481 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4482 << ".\n");
4483 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4484 << ".\n");
4485
  // First analyze the UserVF, falling back if the UserVF should be ignored.
4487 if (UserVF) {
4488 auto MaxSafeUserVF =
4489 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4490
4491 if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
4492 // If `VF=vscale x N` is safe, then so is `VF=N`
4493 if (UserVF.isScalable())
4494 return FixedScalableVFPair(
4495 ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
4496 else
4497 return UserVF;
4498 }
4499
4500 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4501
4502 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4503 // is better to ignore the hint and let the compiler choose a suitable VF.
4504 if (!UserVF.isScalable()) {
4505 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4506 << " is unsafe, clamping to max safe VF="
4507 << MaxSafeFixedVF << ".\n");
4508 ORE->emit(RemarkBuilder: [&]() {
4509 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4510 TheLoop->getStartLoc(),
4511 TheLoop->getHeader())
4512 << "User-specified vectorization factor "
4513 << ore::NV("UserVectorizationFactor", UserVF)
4514 << " is unsafe, clamping to maximum safe vectorization factor "
4515 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4516 });
4517 return MaxSafeFixedVF;
4518 }
4519
4520 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4521 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4522 << " is ignored because scalable vectors are not "
4523 "available.\n");
4524 ORE->emit(RemarkBuilder: [&]() {
4525 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4526 TheLoop->getStartLoc(),
4527 TheLoop->getHeader())
4528 << "User-specified vectorization factor "
4529 << ore::NV("UserVectorizationFactor", UserVF)
4530 << " is ignored because the target does not support scalable "
4531 "vectors. The compiler will pick a more suitable value.";
4532 });
4533 } else {
4534 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4535 << " is unsafe. Ignoring scalable UserVF.\n");
4536 ORE->emit(RemarkBuilder: [&]() {
4537 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4538 TheLoop->getStartLoc(),
4539 TheLoop->getHeader())
4540 << "User-specified vectorization factor "
4541 << ore::NV("UserVectorizationFactor", UserVF)
4542 << " is unsafe. Ignoring the hint to let the compiler pick a "
4543 "more suitable value.";
4544 });
4545 }
4546 }
4547
4548 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4549 << " / " << WidestType << " bits.\n");
4550
4551 FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
4552 ElementCount::getScalable(MinVal: 0));
4553 if (auto MaxVF =
4554 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4555 MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
4556 Result.FixedVF = MaxVF;
4557
4558 if (auto MaxVF =
4559 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
4560 MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
4561 if (MaxVF.isScalable()) {
4562 Result.ScalableVF = MaxVF;
4563 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4564 << "\n");
4565 }
4566
4567 return Result;
4568}
4569
4570FixedScalableVFPair
4571LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4572 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
4575 reportVectorizationFailure(
4576 DebugMsg: "Not inserting runtime ptr check for divergent target",
4577 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
4578 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4579 return FixedScalableVFPair::getNone();
4580 }
4581
4582 unsigned TC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop);
4583 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(L: TheLoop);
4584 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4585 if (TC == 1) {
4586 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
4587 OREMsg: "loop trip count is one, irrelevant for vectorization",
4588 ORETag: "SingleIterationLoop", ORE, TheLoop);
4589 return FixedScalableVFPair::getNone();
4590 }
4591
4592 switch (ScalarEpilogueStatus) {
4593 case CM_ScalarEpilogueAllowed:
4594 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
4595 case CM_ScalarEpilogueNotAllowedUsePredicate:
4596 [[fallthrough]];
4597 case CM_ScalarEpilogueNotNeededUsePredicate:
4598 LLVM_DEBUG(
4599 dbgs() << "LV: vector predicate hint/switch found.\n"
4600 << "LV: Not allowing scalar epilogue, creating predicated "
4601 << "vector loop.\n");
4602 break;
4603 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4604 // fallthrough as a special case of OptForSize
4605 case CM_ScalarEpilogueNotAllowedOptSize:
4606 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4607 LLVM_DEBUG(
4608 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4609 else
4610 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4611 << "count.\n");
4612
4613 // Bail if runtime checks are required, which are not good when optimising
4614 // for size.
4615 if (runtimeChecksRequired())
4616 return FixedScalableVFPair::getNone();
4617
4618 break;
4619 }
4620
  // The only loops we can vectorize without a scalar epilogue are loops with
4622 // a bottom-test and a single exiting block. We'd have to handle the fact
4623 // that not every instruction executes on the last iteration. This will
4624 // require a lane mask which varies through the vector loop body. (TODO)
4625 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4626 // If there was a tail-folding hint/switch, but we can't fold the tail by
4627 // masking, fallback to a vectorization with a scalar epilogue.
4628 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4629 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4630 "scalar epilogue instead.\n");
4631 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4632 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
4633 }
4634 return FixedScalableVFPair::getNone();
4635 }
4636
4637 // Now try the tail folding
4638
4639 // Invalidate interleave groups that require an epilogue if we can't mask
4640 // the interleave-group.
4641 if (!useMaskedInterleavedAccesses(TTI)) {
4642 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4643 "No decisions should have been taken at this point");
4644 // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
4646 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4647 }
4648
4649 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true);
4650
4651 // Avoid tail folding if the trip count is known to be a multiple of any VF
4652 // we choose.
4653 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4654 MaxFactors.FixedVF.getFixedValue();
4655 if (MaxFactors.ScalableVF) {
4656 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
4657 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4658 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4659 a: *MaxPowerOf2RuntimeVF,
4660 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4661 } else
4662 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4663 }
4664
4665 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4666 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4667 "MaxFixedVF must be a power of 2");
4668 unsigned MaxVFtimesIC =
4669 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4670 ScalarEvolution *SE = PSE.getSE();
4671 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4672 const SCEV *ExitCount = SE->getAddExpr(
4673 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
4674 const SCEV *Rem = SE->getURemExpr(
4675 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
4676 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
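    // For example (illustrative values), with a known trip count of 64, a
    // maximum power-of-two VF of 8 and UserIC == 2, MaxVFtimesIC is 16 and
    // 64 urem 16 == 0, so no tail remains and tail folding can be skipped.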
4677 if (Rem->isZero()) {
4678 // Accept MaxFixedVF if we do not have a tail.
4679 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4680 return MaxFactors;
4681 }
4682 }
4683
4684 // If we don't know the precise trip count, or if the trip count that we
4685 // found modulo the vectorization factor is not zero, try to fold the tail
4686 // by masking.
4687 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4688 setTailFoldingStyles(IsScalableVF: MaxFactors.ScalableVF.isScalable(), UserIC);
4689 if (foldTailByMasking()) {
4690 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4691 LLVM_DEBUG(
4692 dbgs()
4693 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4694 "try to generate VP Intrinsics with scalable vector "
4695 "factors only.\n");
4696 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4697 // for now.
4698 // TODO: extend it for fixed vectors, if required.
4699 assert(MaxFactors.ScalableVF.isScalable() &&
4700 "Expected scalable vector factor.");
4701
4702 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
4703 }
4704 return MaxFactors;
4705 }
4706
4707 // If there was a tail-folding hint/switch, but we can't fold the tail by
4708 // masking, fallback to a vectorization with a scalar epilogue.
4709 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4710 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4711 "scalar epilogue instead.\n");
4712 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4713 return MaxFactors;
4714 }
4715
4716 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4717 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4718 return FixedScalableVFPair::getNone();
4719 }
4720
4721 if (TC == 0) {
4722 reportVectorizationFailure(
4723 DebugMsg: "Unable to calculate the loop count due to complex control flow",
4724 OREMsg: "unable to calculate the loop count due to complex control flow",
4725 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
4726 return FixedScalableVFPair::getNone();
4727 }
4728
4729 reportVectorizationFailure(
4730 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
4731 OREMsg: "cannot optimize for size and vectorize at the same time. "
4732 "Enable vectorization of this loop with '#pragma clang loop "
4733 "vectorize(enable)' when compiling with -Os/-Oz",
4734 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
4735 return FixedScalableVFPair::getNone();
4736}
4737
4738ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4739 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4740 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4741 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4742 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4743 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4744 : TargetTransformInfo::RGK_FixedWidthVector);
4745
4746 // Convenience function to return the minimum of two ElementCounts.
4747 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4748 assert((LHS.isScalable() == RHS.isScalable()) &&
4749 "Scalable flags must match");
4750 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4751 };
4752
4753 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
4755 auto MaxVectorElementCount = ElementCount::get(
4756 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
4757 Scalable: ComputeScalableMaxVF);
4758 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4759 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4760 << (MaxVectorElementCount * WidestType) << " bits.\n");
4761
4762 if (!MaxVectorElementCount) {
4763 LLVM_DEBUG(dbgs() << "LV: The target has no "
4764 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4765 << " vector registers.\n");
4766 return ElementCount::getFixed(MinVal: 1);
4767 }
4768
4769 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4770 if (MaxVectorElementCount.isScalable() &&
4771 TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
4772 auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
4773 auto Min = Attr.getVScaleRangeMin();
4774 WidestRegisterMinEC *= Min;
4775 }
4776
4777 // When a scalar epilogue is required, at least one iteration of the scalar
4778 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4779 // max VF that results in a dead vector loop.
4780 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
4781 MaxTripCount -= 1;
4782
4783 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4784 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
    // If the upper bound on the loop trip count (TC) is known at compile time,
    // there is no point in choosing a VF greater than TC (as done in the loop
    // below). Select the
4787 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
4788 // scalable, we only fall back on a fixed VF when the TC is less than or
4789 // equal to the known number of lanes.
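    // For example (illustrative values): with MaxTripCount == 13, a 16-lane
    // widest register and no tail folding, the chosen VF is clamped to
    // bit_floor(13) == 8.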
4790 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
4791 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4792 "exceeding the constant trip count: "
4793 << ClampedUpperTripCount << "\n");
4794 return ElementCount::get(
4795 MinVal: ClampedUpperTripCount,
4796 Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4797 }
4798
4799 TargetTransformInfo::RegisterKind RegKind =
4800 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4801 : TargetTransformInfo::RGK_FixedWidthVector;
4802 ElementCount MaxVF = MaxVectorElementCount;
4803 if (MaximizeBandwidth ||
4804 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4805 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
4806 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4807 auto MaxVectorElementCountMaxBW = ElementCount::get(
4808 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
4809 Scalable: ComputeScalableMaxVF);
4810 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4811
4812 // Collect all viable vectorization factors larger than the default MaxVF
4813 // (i.e. MaxVectorElementCount).
4814 SmallVector<ElementCount, 8> VFs;
4815 for (ElementCount VS = MaxVectorElementCount * 2;
4816 ElementCount::isKnownLE(LHS: VS, RHS: MaxVectorElementCountMaxBW); VS *= 2)
4817 VFs.push_back(Elt: VS);
4818
4819 // For each VF calculate its register usage.
4820 auto RUs = calculateRegisterUsage(VFs);
4821
4822 // Select the largest VF which doesn't require more registers than existing
4823 // ones.
4824 for (int i = RUs.size() - 1; i >= 0; --i) {
4825 bool Selected = true;
4826 for (auto &pair : RUs[i].MaxLocalUsers) {
4827 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first);
4828 if (pair.second > TargetNumRegisters)
4829 Selected = false;
4830 }
4831 if (Selected) {
4832 MaxVF = VFs[i];
4833 break;
4834 }
4835 }
4836 if (ElementCount MinVF =
4837 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
4838 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
4839 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4840 << ") with target's minimum: " << MinVF << '\n');
4841 MaxVF = MinVF;
4842 }
4843 }
4844
4845 // Invalidate any widening decisions we might have made, in case the loop
    // requires predication (decided later), but we have already made some
4847 // load/store widening decisions.
4848 invalidateCostModelingDecisions();
4849 }
4850 return MaxVF;
4851}
4852
/// Convenience function that returns the value of vscale_range if
/// vscale_range.min == vscale_range.max, and otherwise returns the value
/// returned by the corresponding TTI method.
4856static std::optional<unsigned>
4857getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4858 const Function *Fn = L->getHeader()->getParent();
4859 if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4860 auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4861 auto Min = Attr.getVScaleRangeMin();
4862 auto Max = Attr.getVScaleRangeMax();
4863 if (Max && Min == Max)
4864 return Max;
4865 }
4866
4867 return TTI.getVScaleForTuning();
4868}
4869
4870bool LoopVectorizationPlanner::isMoreProfitable(
4871 const VectorizationFactor &A, const VectorizationFactor &B) const {
4872 InstructionCost CostA = A.Cost;
4873 InstructionCost CostB = B.Cost;
4874
4875 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(L: OrigLoop);
4876
4877 if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4878 // If the trip count is a known (possibly small) constant, the trip count
4879 // will be rounded up to an integer number of iterations under
4880 // FoldTailByMasking. The total cost in that case will be
4881 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4882 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4883 // some extra overheads, but for the purpose of comparing the costs of
4884 // different VFs we can use this to compare the total loop-body cost
4885 // expected after vectorization.
4886 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4887 InstructionCost VectorCost,
4888 InstructionCost ScalarCost) {
4889 return CM.foldTailByMasking() ? VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF)
4890 : VectorCost * (MaxTripCount / VF) +
4891 ScalarCost * (MaxTripCount % VF);
4892 };
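    // For example (illustrative costs), with MaxTripCount = 10, VF = 4,
    // VectorCost = 20 and ScalarCost = 8: folding the tail gives
    // 20 * ceil(10/4) = 60, while using a scalar epilogue gives
    // 20 * (10/4) + 8 * (10%4) = 40 + 16 = 56.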
4893 auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4894 auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4895
4896 return RTCostA < RTCostB;
4897 }
4898
4899 // Improve estimate for the vector width if it is scalable.
4900 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4901 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4902 if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) {
4903 if (A.Width.isScalable())
4904 EstimatedWidthA *= *VScale;
4905 if (B.Width.isScalable())
4906 EstimatedWidthB *= *VScale;
4907 }
4908
4909 // Assume vscale may be larger than 1 (or the value being tuned for),
4910 // so that scalable vectorization is slightly favorable over fixed-width
4911 // vectorization.
4912 if (A.Width.isScalable() && !B.Width.isScalable())
4913 return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4914
4915 // To avoid the need for FP division:
4916 // (CostA / A.Width) < (CostB / B.Width)
4917 // <=> (CostA * B.Width) < (CostB * A.Width)
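  // For example, CostA = 8 at estimated width 4 versus CostB = 12 at
  // estimated width 8 compares 8*8 = 64 against 12*4 = 48; 64 < 48 is false,
  // so B (the cheaper cost per lane) is preferred.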
4918 return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4919}
4920
4921static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4922 OptimizationRemarkEmitter *ORE,
4923 Loop *TheLoop) {
4924 if (InvalidCosts.empty())
4925 return;
4926
4927 // Emit a report of VFs with invalid costs in the loop.
4928
4929 // Group the remarks per instruction, keeping the instruction order from
4930 // InvalidCosts.
4931 std::map<Instruction *, unsigned> Numbering;
4932 unsigned I = 0;
4933 for (auto &Pair : InvalidCosts)
4934 if (!Numbering.count(x: Pair.first))
4935 Numbering[Pair.first] = I++;
4936
4937 // Sort the list, first on instruction(number) then on VF.
4938 sort(C&: InvalidCosts, Comp: [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4939 if (Numbering[A.first] != Numbering[B.first])
4940 return Numbering[A.first] < Numbering[B.first];
4941 ElementCountComparator ECC;
4942 return ECC(A.second, B.second);
4943 });
4944
4945 // For a list of ordered instruction-vf pairs:
4946 // [(load, vf1), (load, vf2), (store, vf1)]
4947 // Group the instructions together to emit separate remarks for:
4948 // load (vf1, vf2)
4949 // store (vf1)
4950 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4951 auto Subset = ArrayRef<InstructionVFPair>();
4952 do {
4953 if (Subset.empty())
4954 Subset = Tail.take_front(N: 1);
4955
4956 Instruction *I = Subset.front().first;
4957
4958 // If the next instruction is different, or if there are no other pairs,
4959 // emit a remark for the collated subset. e.g.
    // [(load, vf1), (load, vf2)]
    // to emit:
    // remark: invalid costs for 'load' at VF=(vf1, vf2)
4963 if (Subset == Tail || Tail[Subset.size()].first != I) {
4964 std::string OutString;
4965 raw_string_ostream OS(OutString);
4966 assert(!Subset.empty() && "Unexpected empty range");
4967 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4968 for (const auto &Pair : Subset)
4969 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4970 OS << "):";
4971 if (auto *CI = dyn_cast<CallInst>(Val: I))
4972 OS << " call to " << CI->getCalledFunction()->getName();
4973 else
4974 OS << " " << I->getOpcodeName();
4975 OS.flush();
4976 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop, I);
4977 Tail = Tail.drop_front(N: Subset.size());
4978 Subset = {};
4979 } else
4980 // Grow the subset by one element
4981 Subset = Tail.take_front(N: Subset.size() + 1);
4982 } while (!Tail.empty());
4983}
4984
4985VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
4986 const ElementCountSet &VFCandidates) {
4987 InstructionCost ExpectedCost =
4988 CM.expectedCost(VF: ElementCount::getFixed(MinVal: 1)).first;
4989 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4990 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4991 assert(VFCandidates.count(ElementCount::getFixed(1)) &&
4992 "Expected Scalar VF to be a candidate");
4993
4994 const VectorizationFactor ScalarCost(ElementCount::getFixed(MinVal: 1), ExpectedCost,
4995 ExpectedCost);
4996 VectorizationFactor ChosenFactor = ScalarCost;
4997
4998 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4999 if (ForceVectorization && VFCandidates.size() > 1) {
5000 // Ignore scalar width, because the user explicitly wants vectorization.
5001 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5002 // evaluation.
5003 ChosenFactor.Cost = InstructionCost::getMax();
5004 }
5005
5006 SmallVector<InstructionVFPair> InvalidCosts;
5007 for (const auto &i : VFCandidates) {
5008 // The cost for scalar VF=1 is already calculated, so ignore it.
5009 if (i.isScalar())
5010 continue;
5011
5012 LoopVectorizationCostModel::VectorizationCostTy C =
5013 CM.expectedCost(VF: i, Invalid: &InvalidCosts);
5014 VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5015
5016#ifndef NDEBUG
5017 unsigned AssumedMinimumVscale =
5018 getVScaleForTuning(L: OrigLoop, TTI).value_or(u: 1);
5019 unsigned Width =
5020 Candidate.Width.isScalable()
5021 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5022 : Candidate.Width.getFixedValue();
5023 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5024 << " costs: " << (Candidate.Cost / Width));
5025 if (i.isScalable())
5026 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5027 << AssumedMinimumVscale << ")");
5028 LLVM_DEBUG(dbgs() << ".\n");
5029#endif
5030
5031 if (!C.second && !ForceVectorization) {
5032 LLVM_DEBUG(
5033 dbgs() << "LV: Not considering vector loop of width " << i
5034 << " because it will not generate any vector instructions.\n");
5035 continue;
5036 }
5037
    // If profitable, add it to the ProfitableVFs list.
5039 if (isMoreProfitable(A: Candidate, B: ScalarCost))
5040 ProfitableVFs.push_back(Elt: Candidate);
5041
5042 if (isMoreProfitable(A: Candidate, B: ChosenFactor))
5043 ChosenFactor = Candidate;
5044 }
5045
5046 emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop: OrigLoop);
5047
5048 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
5049 reportVectorizationFailure(
5050 DebugMsg: "There are conditional stores.",
5051 OREMsg: "store that is conditionally executed prevents vectorization",
5052 ORETag: "ConditionalStore", ORE, TheLoop: OrigLoop);
5053 ChosenFactor = ScalarCost;
5054 }
5055
5056 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5057 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5058 << "LV: Vectorization seems to be not beneficial, "
5059 << "but was forced by a user.\n");
5060 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5061 return ChosenFactor;
5062}
5063
5064bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
5065 ElementCount VF) const {
5066 // Cross iteration phis such as reductions need special handling and are
5067 // currently unsupported.
5068 if (any_of(Range: OrigLoop->getHeader()->phis(),
5069 P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); }))
5070 return false;
5071
5072 // Phis with uses outside of the loop require special handling and are
5073 // currently unsupported.
5074 for (const auto &Entry : Legal->getInductionVars()) {
5075 // Look for uses of the value of the induction at the last iteration.
5076 Value *PostInc =
5077 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
5078 for (User *U : PostInc->users())
5079 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
5080 return false;
5081 // Look for uses of penultimate value of the induction.
5082 for (User *U : Entry.first->users())
5083 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
5084 return false;
5085 }
5086
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5090 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
5091 return false;
5092
5093 return true;
5094}
5095
5096bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5097 const ElementCount VF) const {
5098 // FIXME: We need a much better cost-model to take different parameters such
5099 // as register pressure, code size increase and cost of extra branches into
5100 // account. For now we apply a very crude heuristic and only consider loops
5101 // with vectorization factors larger than a certain value.
5102
5103 // Allow the target to opt out entirely.
5104 if (!TTI.preferEpilogueVectorization())
5105 return false;
5106
5107 // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5109 if (TTI.getMaxInterleaveFactor(VF) <= 1)
5110 return false;
5111
5112 unsigned Multiplier = 1;
5113 if (VF.isScalable())
5114 Multiplier = getVScaleForTuning(L: TheLoop, TTI).value_or(u: 1);
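  // For example, with VF = vscale x 4 and a tuning vscale of 2, the check
  // below compares an effective width of 8 lanes against
  // EpilogueVectorizationMinVF.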
5115 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
5116 return true;
5117 return false;
5118}
5119
5120VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
5121 const ElementCount MainLoopVF, unsigned IC) {
5122 VectorizationFactor Result = VectorizationFactor::Disabled();
5123 if (!EnableEpilogueVectorization) {
5124 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
5125 return Result;
5126 }
5127
5128 if (!CM.isScalarEpilogueAllowed()) {
5129 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
5130 "epilogue is allowed.\n");
5131 return Result;
5132 }
5133
5134 // Not really a cost consideration, but check for unsupported cases here to
5135 // simplify the logic.
5136 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
5137 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
5138 "is not a supported candidate.\n");
5139 return Result;
5140 }
5141
5142 if (EpilogueVectorizationForceVF > 1) {
5143 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
5144 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
5145 if (hasPlanWithVF(VF: ForcedEC))
5146 return {ForcedEC, 0, 0};
5147 else {
5148 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
5149 "viable.\n");
5150 return Result;
5151 }
5152 }
5153
5154 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
5155 OrigLoop->getHeader()->getParent()->hasMinSize()) {
5156 LLVM_DEBUG(
5157 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
5158 return Result;
5159 }
5160
5161 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF)) {
5162 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5163 "this loop\n");
5164 return Result;
5165 }
5166
5167 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5168 // the main loop handles 8 lanes per iteration. We could still benefit from
5169 // vectorizing the epilogue loop with VF=4.
5170 ElementCount EstimatedRuntimeVF = MainLoopVF;
5171 if (MainLoopVF.isScalable()) {
5172 EstimatedRuntimeVF = ElementCount::getFixed(MinVal: MainLoopVF.getKnownMinValue());
5173 if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI))
5174 EstimatedRuntimeVF *= *VScale;
5175 }
5176
5177 ScalarEvolution &SE = *PSE.getSE();
5178 Type *TCType = Legal->getWidestInductionType();
5179 const SCEV *RemainingIterations = nullptr;
5180 for (auto &NextVF : ProfitableVFs) {
5181 // Skip candidate VFs without a corresponding VPlan.
5182 if (!hasPlanWithVF(VF: NextVF.Width))
5183 continue;
5184
    // Skip candidate VFs with widths >= the estimated runtime VF (scalable
5186 // vectors) or the VF of the main loop (fixed vectors).
5187 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5188 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
5189 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF))
5190 continue;
5191
5192 // If NextVF is greater than the number of remaining iterations, the
5193 // epilogue loop would be dead. Skip such factors.
5194 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
5195 // TODO: extend to support scalable VFs.
5196 if (!RemainingIterations) {
5197 const SCEV *TC = createTripCountSCEV(IdxTy: TCType, PSE, OrigLoop);
5198 RemainingIterations = SE.getURemExpr(
5199 LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getKnownMinValue() * IC));
5200 }
5201 if (SE.isKnownPredicate(
5202 Pred: CmpInst::ICMP_UGT,
5203 LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getKnownMinValue()),
5204 RHS: RemainingIterations))
5205 continue;
5206 }
5207
5208 if (Result.Width.isScalar() || isMoreProfitable(A: NextVF, B: Result))
5209 Result = NextVF;
5210 }
5211
5212 if (Result != VectorizationFactor::Disabled())
5213 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5214 << Result.Width << "\n");
5215 return Result;
5216}
5217
5218std::pair<unsigned, unsigned>
5219LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5220 unsigned MinWidth = -1U;
5221 unsigned MaxWidth = 8;
5222 const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5223 // For in-loop reductions, no element types are added to ElementTypesInLoop
5224 // if there are no loads/stores in the loop. In this case, check through the
5225 // reduction variables to determine the maximum width.
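  // For instance (illustrative): a loop whose only memory-free recurrence is an
  // i64 add reduction whose inputs are truncated to i32 yields MaxWidth = 32
  // here, since the cast width (32) is smaller than the recurrence type (64).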
5226 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5227 // Reset MaxWidth so that we can find the smallest type used by recurrences
5228 // in the loop.
5229 MaxWidth = -1U;
5230 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
5231 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5232 // When finding the min width used by the recurrence we need to account
5233 // for casts on the input operands of the recurrence.
5234 MaxWidth = std::min<unsigned>(
5235 a: MaxWidth, b: std::min<unsigned>(
5236 a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5237 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5238 }
5239 } else {
5240 for (Type *T : ElementTypesInLoop) {
5241 MinWidth = std::min<unsigned>(
5242 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
5243 MaxWidth = std::max<unsigned>(
5244 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
5245 }
5246 }
5247 return {MinWidth, MaxWidth};
5248}
5249
5250void LoopVectorizationCostModel::collectElementTypesForWidening() {
5251 ElementTypesInLoop.clear();
5252 // For each block.
5253 for (BasicBlock *BB : TheLoop->blocks()) {
5254 // For each instruction in the loop.
5255 for (Instruction &I : BB->instructionsWithoutDebug()) {
5256 Type *T = I.getType();
5257
5258 // Skip ignored values.
5259 if (ValuesToIgnore.count(Ptr: &I))
5260 continue;
5261
5262 // Only examine Loads, Stores and PHINodes.
5263 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
5264 continue;
5265
5266 // Examine PHI nodes that are reduction variables. Update the type to
5267 // account for the recurrence type.
5268 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
5269 if (!Legal->isReductionVariable(PN))
5270 continue;
5271 const RecurrenceDescriptor &RdxDesc =
5272 Legal->getReductionVars().find(Key: PN)->second;
5273 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5274 TTI.preferInLoopReduction(Opcode: RdxDesc.getOpcode(),
5275 Ty: RdxDesc.getRecurrenceType(),
5276 Flags: TargetTransformInfo::ReductionFlags()))
5277 continue;
5278 T = RdxDesc.getRecurrenceType();
5279 }
5280
5281 // Examine the stored values.
5282 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
5283 T = ST->getValueOperand()->getType();
5284
5285 assert(T->isSized() &&
5286 "Expected the load/store/recurrence type to be sized");
5287
5288 ElementTypesInLoop.insert(Ptr: T);
5289 }
5290 }
5291}
5292
5293unsigned
5294LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5295 InstructionCost LoopCost) {
5296 // -- The interleave heuristics --
5297 // We interleave the loop in order to expose ILP and reduce the loop overhead.
5298 // There are many micro-architectural considerations that we can't predict
5299 // at this level. For example, frontend pressure (on decode or fetch) due to
5300 // code size, or the number and capabilities of the execution ports.
5301 //
5302 // We use the following heuristics to select the interleave count:
5303 // 1. If the code has reductions, then we interleave to break the cross
5304 // iteration dependency.
5305 // 2. If the loop is really small, then we interleave to reduce the loop
5306 // overhead.
5307 // 3. We don't interleave if we think that we will spill registers to memory
5308 // due to the increased register pressure.
5309
5310 if (!isScalarEpilogueAllowed())
5311 return 1;
5312
5313 // Do not interleave if EVL is preferred and no User IC is specified.
5314 if (foldTailWithEVL()) {
5315 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
5316 "Unroll factor forced to be 1.\n");
5317 return 1;
5318 }
5319
5320  // A maximum safe dependence distance already limits how far we may widen,
5320  // and interleaving would effectively exceed it, so do not interleave.
5321 if (!Legal->isSafeForAnyVectorWidth())
5322 return 1;
5323
5324 auto BestKnownTC = getSmallBestKnownTC(SE&: *PSE.getSE(), L: TheLoop);
5325 const bool HasReductions = !Legal->getReductionVars().empty();
5326
5327 // If we did not calculate the cost for VF (because the user selected the VF)
5328 // then we calculate the cost of VF here.
5329 if (LoopCost == 0) {
5330 LoopCost = expectedCost(VF).first;
5331 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
5332
5333 // Loop body is free and there is no need for interleaving.
5334 if (LoopCost == 0)
5335 return 1;
5336 }
5337
5338 RegisterUsage R = calculateRegisterUsage(VFs: {VF})[0];
5339  // We divide by these counts below, so clamp each register class's usage to
5340  // at least one; this also avoids dividing by zero.
5341 for (auto& pair : R.MaxLocalUsers) {
5342 pair.second = std::max(a: pair.second, b: 1U);
5343 }
5344
5345 // We calculate the interleave count using the following formula.
5346 // Subtract the number of loop invariants from the number of available
5347 // registers. These registers are used by all of the interleaved instances.
5348 // Next, divide the remaining registers by the number of registers that is
5349 // required by the loop, in order to estimate how many parallel instances
5350  // fit without causing spills. The result is rounded down, if necessary, to a
5351  // power of two. We want a power-of-two interleave count to simplify any
5352  // addressing operations and alignment considerations.
5353  // A power-of-two interleave count also ensures that the induction variable of
5354  // the vector loop wraps to zero when the tail is folded by masking; this
5355  // currently happens when optimizing for size, in which case IC is set to 1 above.
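  // As an illustrative example: with 32 available registers in a class, 4 of
  // them tied up by loop-invariant values and a maximum of 7 live values per
  // instance, roughly bit_floor((32 - 4) / 7) = 4 interleaved copies are
  // estimated to fit without spilling (the induction-variable heuristic below
  // adjusts this slightly).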
5356 unsigned IC = UINT_MAX;
5357
5358 for (auto& pair : R.MaxLocalUsers) {
5359 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first);
5360 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5361 << " registers of "
5362 << TTI.getRegisterClassName(pair.first) << " register class\n");
5363 if (VF.isScalar()) {
5364 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5365 TargetNumRegisters = ForceTargetNumScalarRegs;
5366 } else {
5367 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5368 TargetNumRegisters = ForceTargetNumVectorRegs;
5369 }
5370 unsigned MaxLocalUsers = pair.second;
5371 unsigned LoopInvariantRegs = 0;
5372 if (R.LoopInvariantRegs.find(Key: pair.first) != R.LoopInvariantRegs.end())
5373 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5374
5375 unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
5376 MaxLocalUsers);
5377 // Don't count the induction variable as interleaved.
5378 if (EnableIndVarRegisterHeur) {
5379 TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
5380 std::max(a: 1U, b: (MaxLocalUsers - 1)));
5381 }
5382
5383 IC = std::min(a: IC, b: TmpIC);
5384 }
5385
5386 // Clamp the interleave ranges to reasonable counts.
5387 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5388
5389 // Check if the user has overridden the max.
5390 if (VF.isScalar()) {
5391 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5392 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5393 } else {
5394 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5395 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5396 }
5397
5398 unsigned EstimatedVF = VF.getKnownMinValue();
5399 if (VF.isScalable()) {
5400 if (std::optional<unsigned> VScale = getVScaleForTuning(L: TheLoop, TTI))
5401 EstimatedVF *= *VScale;
5402 }
5403 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
5404
5405 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop);
5406 if (KnownTC > 0) {
5407    // When a scalar epilogue is required, at least one iteration must remain
5408    // scalar, so one fewer iteration is available for the interleaved vector loop.
5409 unsigned AvailableTC =
5410 requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? KnownTC - 1 : KnownTC;
5411
5412 // If trip count is known we select between two prospective ICs, where
5413 // 1) the aggressive IC is capped by the trip count divided by VF
5414 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
5415 // The final IC is selected in a way that the epilogue loop trip count is
5416 // minimized while maximizing the IC itself, so that we either run the
5417 // vector loop at least once if it generates a small epilogue loop, or else
5418 // we run the vector loop at least twice.
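    // Illustrative example: with AvailableTC = 40, EstimatedVF = 8 and a target
    // maximum of 8, the aggressive bound is bit_floor(40 / 8) = 4 and the
    // conservative bound is bit_floor(40 / 16) = 2; both leave a scalar tail of
    // 8 iterations, so the larger count (4) is chosen below.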
5419
5420 unsigned InterleaveCountUB = bit_floor(
5421 Value: std::max(a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
5422 unsigned InterleaveCountLB = bit_floor(Value: std::max(
5423 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
5424 MaxInterleaveCount = InterleaveCountLB;
5425
5426 if (InterleaveCountUB != InterleaveCountLB) {
5427 unsigned TailTripCountUB =
5428 (AvailableTC % (EstimatedVF * InterleaveCountUB));
5429 unsigned TailTripCountLB =
5430 (AvailableTC % (EstimatedVF * InterleaveCountLB));
5431      // If both produce the same scalar tail, prefer the larger IC so the same
5432      // work is done in fewer vector loop iterations.
5433 if (TailTripCountUB == TailTripCountLB)
5434 MaxInterleaveCount = InterleaveCountUB;
5435 }
5436 } else if (BestKnownTC && *BestKnownTC > 0) {
5437    // When a scalar epilogue is required, at least one iteration must remain
5438    // scalar, so one fewer iteration is available for the interleaved vector loop.
5439 unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector())
5440 ? (*BestKnownTC) - 1
5441 : *BestKnownTC;
5442
5443    // If the trip count is only an estimated compile-time constant, cap the IC
5444    // at the trip count divided by (VF * 2) so that the vector loop runs at
5445    // least twice; this keeps interleaving profitable when an epilogue loop is
5446    // present. Since the exact trip count is not known, we choose a conservative
5447    // IC estimate.
5448 MaxInterleaveCount = bit_floor(Value: std::max(
5449 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
5450 }
5451
5452 assert(MaxInterleaveCount > 0 &&
5453 "Maximum interleave count must be greater than 0");
5454
5455  // Clamp the calculated IC to be between 1 and the maximum interleave count
5456  // that the target and the trip count allow.
5457 if (IC > MaxInterleaveCount)
5458 IC = MaxInterleaveCount;
5459 else
5460 // Make sure IC is greater than 0.
5461 IC = std::max(a: 1u, b: IC);
5462
5463 assert(IC > 0 && "Interleave count must be greater than 0.");
5464
5465 // Interleave if we vectorized this loop and there is a reduction that could
5466 // benefit from interleaving.
5467 if (VF.isVector() && HasReductions) {
5468 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5469 return IC;
5470 }
5471
5472 // For any scalar loop that either requires runtime checks or predication we
5473 // are better off leaving this to the unroller. Note that if we've already
5474 // vectorized the loop we will have done the runtime check and so interleaving
5475 // won't require further checks.
5476 bool ScalarInterleavingRequiresPredication =
5477 (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) {
5478 return Legal->blockNeedsPredication(BB);
5479 }));
5480 bool ScalarInterleavingRequiresRuntimePointerCheck =
5481 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5482
5483 // We want to interleave small loops in order to reduce the loop overhead and
5484 // potentially expose ILP opportunities.
5485 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5486 << "LV: IC is " << IC << '\n'
5487 << "LV: VF is " << VF << '\n');
5488 const bool AggressivelyInterleaveReductions =
5489 TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
5490 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5491 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5492    // Assume the loop overhead costs 1, and use the cost model's estimate of the
5493    // loop body to interleave until that overhead amounts to roughly 5% of the
5494    // total cost of the loop.
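    // For example (assuming the default SmallLoopCost threshold of 20): a loop
    // body costing 6 gives SmallIC = min(IC, bit_floor(20 / 6)) = min(IC, 2).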
5495 unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
5496 Value: SmallLoopCost / *LoopCost.getValue()));
5497
5498 // Interleave until store/load ports (estimated by max interleave count) are
5499 // saturated.
5500 unsigned NumStores = Legal->getNumStores();
5501 unsigned NumLoads = Legal->getNumLoads();
5502 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5503 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5504
5505 // There is little point in interleaving for reductions containing selects
5506 // and compares when VF=1 since it may just create more overhead than it's
5507 // worth for loops with small trip counts. This is because we still have to
5508 // do the final reduction after the loop.
5509 bool HasSelectCmpReductions =
5510 HasReductions &&
5511 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
5512 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5513 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5514 Kind: RdxDesc.getRecurrenceKind());
5515 });
5516 if (HasSelectCmpReductions) {
5517 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5518 return 1;
5519 }
5520
5521 // If we have a scalar reduction (vector reductions are already dealt with
5522 // by this point), we can increase the critical path length if the loop
5523 // we're interleaving is inside another loop. For tree-wise reductions
5524 // set the limit to 2, and for ordered reductions it's best to disable
5525 // interleaving entirely.
5526 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5527 bool HasOrderedReductions =
5528 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
5529 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5530 return RdxDesc.isOrdered();
5531 });
5532 if (HasOrderedReductions) {
5533 LLVM_DEBUG(
5534 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5535 return 1;
5536 }
5537
5538 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5539 SmallIC = std::min(a: SmallIC, b: F);
5540 StoresIC = std::min(a: StoresIC, b: F);
5541 LoadsIC = std::min(a: LoadsIC, b: F);
5542 }
5543
5544 if (EnableLoadStoreRuntimeInterleave &&
5545 std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
5546 LLVM_DEBUG(
5547 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5548 return std::max(a: StoresIC, b: LoadsIC);
5549 }
5550
5551 // If there are scalar reductions and TTI has enabled aggressive
5552 // interleaving for reductions, we will interleave to expose ILP.
5553 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5554 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5555      // Interleave by no less than SmallIC but not as aggressively as the normal
5556      // IC, to handle the rare situation where resources are too limited.
5557 return std::max(a: IC / 2, b: SmallIC);
5558 } else {
5559 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5560 return SmallIC;
5561 }
5562 }
5563
5564 // Interleave if this is a large loop (small loops are already dealt with by
5565 // this point) that could benefit from interleaving.
5566 if (AggressivelyInterleaveReductions) {
5567 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5568 return IC;
5569 }
5570
5571 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5572 return 1;
5573}
5574
5575SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5576LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5577 // This function calculates the register usage by measuring the highest number
5578 // of values that are alive at a single location. Obviously, this is a very
5579  // rough estimation. We scan the loop in topological order and
5580 // assign a number to each instruction. We use RPO to ensure that defs are
5581 // met before their users. We assume that each instruction that has in-loop
5582 // users starts an interval. We record every time that an in-loop value is
5583 // used, so we have a list of the first and last occurrences of each
5584  // instruction. Next, we transpose this data structure into a multi-map that
5585  // holds the list of intervals that *end* at a specific location. This
5586  // multi-map allows us to process everything in a single linear scan: we record
5587  // each time a new interval starts by placing it in a set, and remove an
5588  // interval from the set when we reach the location where it ends.
5589 // The max register usage is the maximum size of the set.
5590 // We also search for instructions that are defined outside the loop, but are
5591 // used inside the loop. We need this number separately from the max-interval
5592 // usage number because when we unroll, loop-invariant values do not take
5593  // more registers.
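  // Rough illustration: in a chain such as  %a = load; %b = add %a, 1;
  // store %b  each value dies right after its single use, so few intervals
  // overlap and the estimate stays small, whereas a loop that keeps many
  // independent loads alive until a final combine sees all of their intervals
  // open at once and reports correspondingly higher register usage.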
5594 LoopBlocksDFS DFS(TheLoop);
5595 DFS.perform(LI);
5596
5597 RegisterUsage RU;
5598
5599 // Each 'key' in the map opens a new interval. The values
5600 // of the map are the index of the 'last seen' usage of the
5601 // instruction that is the key.
5602 using IntervalMap = DenseMap<Instruction *, unsigned>;
5603
5604 // Maps instruction to its index.
5605 SmallVector<Instruction *, 64> IdxToInstr;
5606 // Marks the end of each interval.
5607 IntervalMap EndPoint;
5608  // Saves the set of instructions whose values are used inside the loop.
5609 SmallPtrSet<Instruction *, 8> Ends;
5610 // Saves the list of values that are used in the loop but are defined outside
5611 // the loop (not including non-instruction values such as arguments and
5612 // constants).
5613 SmallSetVector<Instruction *, 8> LoopInvariants;
5614
5615 for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
5616 for (Instruction &I : BB->instructionsWithoutDebug()) {
5617 IdxToInstr.push_back(Elt: &I);
5618
5619 // Save the end location of each USE.
5620 for (Value *U : I.operands()) {
5621 auto *Instr = dyn_cast<Instruction>(Val: U);
5622
5623 // Ignore non-instruction values such as arguments, constants, etc.
5624 // FIXME: Might need some motivation why these values are ignored. If
5625 // for example an argument is used inside the loop it will increase the
5626        // register pressure (so shouldn't we add it to LoopInvariants?).
5627 if (!Instr)
5628 continue;
5629
5630 // If this instruction is outside the loop then record it and continue.
5631 if (!TheLoop->contains(Inst: Instr)) {
5632 LoopInvariants.insert(X: Instr);
5633 continue;
5634 }
5635
5636 // Overwrite previous end points.
5637 EndPoint[Instr] = IdxToInstr.size();
5638 Ends.insert(Ptr: Instr);
5639 }
5640 }
5641 }
5642
5643 // Saves the list of intervals that end with the index in 'key'.
5644 using InstrList = SmallVector<Instruction *, 2>;
5645 DenseMap<unsigned, InstrList> TransposeEnds;
5646
5647 // Transpose the EndPoints to a list of values that end at each index.
5648 for (auto &Interval : EndPoint)
5649 TransposeEnds[Interval.second].push_back(Elt: Interval.first);
5650
5651 SmallPtrSet<Instruction *, 8> OpenIntervals;
5652 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5653 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5654
5655 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5656
5657 const auto &TTICapture = TTI;
5658 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5659 if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty))
5660 return 0;
5661 return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF));
5662 };
5663
5664 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5665 Instruction *I = IdxToInstr[i];
5666
5667 // Remove all of the instructions that end at this location.
5668 InstrList &List = TransposeEnds[i];
5669 for (Instruction *ToRemove : List)
5670 OpenIntervals.erase(Ptr: ToRemove);
5671
5672 // Ignore instructions that are never used within the loop.
5673 if (!Ends.count(Ptr: I))
5674 continue;
5675
5676 // Skip ignored values.
5677 if (ValuesToIgnore.count(Ptr: I))
5678 continue;
5679
5680 collectInLoopReductions();
5681
5682 // For each VF find the maximum usage of registers.
5683 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5684 // Count the number of registers used, per register class, given all open
5685 // intervals.
5686 // Note that elements in this SmallMapVector will be default constructed
5687 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5688 // there is no previous entry for ClassID.
5689 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5690
5691 if (VFs[j].isScalar()) {
5692 for (auto *Inst : OpenIntervals) {
5693 unsigned ClassID =
5694 TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
5695 // FIXME: The target might use more than one register for the type
5696 // even in the scalar case.
5697 RegUsage[ClassID] += 1;
5698 }
5699 } else {
5700 collectUniformsAndScalars(VF: VFs[j]);
5701 for (auto *Inst : OpenIntervals) {
5702 // Skip ignored values for VF > 1.
5703 if (VecValuesToIgnore.count(Ptr: Inst))
5704 continue;
5705 if (isScalarAfterVectorization(I: Inst, VF: VFs[j])) {
5706 unsigned ClassID =
5707 TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
5708 // FIXME: The target might use more than one register for the type
5709 // even in the scalar case.
5710 RegUsage[ClassID] += 1;
5711 } else {
5712 unsigned ClassID =
5713 TTI.getRegisterClassForType(Vector: true, Ty: Inst->getType());
5714 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5715 }
5716 }
5717 }
5718
5719 for (auto& pair : RegUsage) {
5720 auto &Entry = MaxUsages[j][pair.first];
5721 Entry = std::max(a: Entry, b: pair.second);
5722 }
5723 }
5724
5725 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5726 << OpenIntervals.size() << '\n');
5727
5728 // Add the current instruction to the list of open intervals.
5729 OpenIntervals.insert(Ptr: I);
5730 }
5731
5732 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5733 // Note that elements in this SmallMapVector will be default constructed
5734 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5735 // there is no previous entry for ClassID.
5736 SmallMapVector<unsigned, unsigned, 4> Invariant;
5737
5738 for (auto *Inst : LoopInvariants) {
5739 // FIXME: The target might use more than one register for the type
5740 // even in the scalar case.
5741 bool IsScalar = all_of(Range: Inst->users(), P: [&](User *U) {
5742 auto *I = cast<Instruction>(Val: U);
5743 return TheLoop != LI->getLoopFor(BB: I->getParent()) ||
5744 isScalarAfterVectorization(I, VF: VFs[i]);
5745 });
5746
5747 ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[i];
5748 unsigned ClassID =
5749 TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: Inst->getType());
5750 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5751 }
5752
5753 LLVM_DEBUG({
5754 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5755 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5756 << " item\n";
5757 for (const auto &pair : MaxUsages[i]) {
5758 dbgs() << "LV(REG): RegisterClass: "
5759 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5760 << " registers\n";
5761 }
5762 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5763 << " item\n";
5764 for (const auto &pair : Invariant) {
5765 dbgs() << "LV(REG): RegisterClass: "
5766 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5767 << " registers\n";
5768 }
5769 });
5770
5771 RU.LoopInvariantRegs = Invariant;
5772 RU.MaxLocalUsers = MaxUsages[i];
5773 RUs[i] = RU;
5774 }
5775
5776 return RUs;
5777}
5778
5779bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5780 ElementCount VF) {
5781 // TODO: Cost model for emulated masked load/store is completely
5782 // broken. This hack guides the cost model to use an artificially
5783 // high enough value to practically disable vectorization with such
5784 // operations, except where previously deployed legality hack allowed
5785  // operations, except where the previously deployed legality hack allowed
5786  // using very low cost values. This is to avoid regressions coming simply
5787  // from moving "masked load/store" check from legality to cost model.
5788  // Masked Load/Gather emulation was previously never allowed, and only a
5788  // limited amount of Masked Store/Scatter emulation was allowed.
5789 assert((isPredicatedInst(I)) &&
5790 "Expecting a scalar emulated instruction");
5791 return isa<LoadInst>(Val: I) ||
5792 (isa<StoreInst>(Val: I) &&
5793 NumPredStores > NumberOfStoresToPredicate);
5794}
5795
5796void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5797 // If we aren't vectorizing the loop, or if we've already collected the
5798 // instructions to scalarize, there's nothing to do. Collection may already
5799 // have occurred if we have a user-selected VF and are now computing the
5800 // expected cost for interleaving.
5801 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(Val: VF))
5802 return;
5803
5804  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5805 // not profitable to scalarize any instructions, the presence of VF in the
5806 // map will indicate that we've analyzed it already.
5807 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5808
5809 PredicatedBBsAfterVectorization[VF].clear();
5810
5811 // Find all the instructions that are scalar with predication in the loop and
5812 // determine if it would be better to not if-convert the blocks they are in.
5813 // If so, we also record the instructions to scalarize.
5814 for (BasicBlock *BB : TheLoop->blocks()) {
5815 if (!blockNeedsPredicationForAnyReason(BB))
5816 continue;
5817 for (Instruction &I : *BB)
5818 if (isScalarWithPredication(I: &I, VF)) {
5819 ScalarCostsTy ScalarCosts;
5820        // Do not apply the discount if the VF is scalable, because that would
5821        // lead to invalid scalarization costs.
5822        // Also skip the discount logic if the hacked (artificially high) cost is
5823        // needed for emulated masked memrefs.
5824 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5825 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5826 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0)
5827 ScalarCostsVF.insert(I: ScalarCosts.begin(), E: ScalarCosts.end());
5828 // Remember that BB will remain after vectorization.
5829 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5830 }
5831 }
5832}
5833
5834InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5835 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5836 assert(!isUniformAfterVectorization(PredInst, VF) &&
5837 "Instruction marked uniform-after-vectorization will be predicated");
5838
5839 // Initialize the discount to zero, meaning that the scalar version and the
5840 // vector version cost the same.
5841 InstructionCost Discount = 0;
5842
5843 // Holds instructions to analyze. The instructions we visit are mapped in
5844 // ScalarCosts. Those instructions are the ones that would be scalarized if
5845 // we find that the scalar version costs less.
5846 SmallVector<Instruction *, 8> Worklist;
5847
5848 // Returns true if the given instruction can be scalarized.
5849 auto canBeScalarized = [&](Instruction *I) -> bool {
5850 // We only attempt to scalarize instructions forming a single-use chain
5851 // from the original predicated block that would otherwise be vectorized.
5852 // Although not strictly necessary, we give up on instructions we know will
5853 // already be scalar to avoid traversing chains that are unlikely to be
5854 // beneficial.
5855 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5856 isScalarAfterVectorization(I, VF))
5857 return false;
5858
5859 // If the instruction is scalar with predication, it will be analyzed
5860 // separately. We ignore it within the context of PredInst.
5861 if (isScalarWithPredication(I, VF))
5862 return false;
5863
5864 // If any of the instruction's operands are uniform after vectorization,
5865 // the instruction cannot be scalarized. This prevents, for example, a
5866 // masked load from being scalarized.
5867 //
5868 // We assume we will only emit a value for lane zero of an instruction
5869 // marked uniform after vectorization, rather than VF identical values.
5870 // Thus, if we scalarize an instruction that uses a uniform, we would
5871 // create uses of values corresponding to the lanes we aren't emitting code
5872 // for. This behavior can be changed by allowing getScalarValue to clone
5873 // the lane zero values for uniforms rather than asserting.
5874 for (Use &U : I->operands())
5875 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
5876 if (isUniformAfterVectorization(I: J, VF))
5877 return false;
5878
5879 // Otherwise, we can scalarize the instruction.
5880 return true;
5881 };
5882
5883 // Compute the expected cost discount from scalarizing the entire expression
5884 // feeding the predicated instruction. We currently only consider expressions
5885 // that are single-use instruction chains.
5886 Worklist.push_back(Elt: PredInst);
5887 while (!Worklist.empty()) {
5888 Instruction *I = Worklist.pop_back_val();
5889
5890 // If we've already analyzed the instruction, there's nothing to do.
5891 if (ScalarCosts.contains(Val: I))
5892 continue;
5893
5894 // Compute the cost of the vector instruction. Note that this cost already
5895 // includes the scalarization overhead of the predicated instruction.
5896 InstructionCost VectorCost = getInstructionCost(I, VF).first;
5897
5898 // Compute the cost of the scalarized instruction. This cost is the cost of
5899 // the instruction as if it wasn't if-converted and instead remained in the
5900 // predicated block. We will scale this cost by block probability after
5901 // computing the scalarization overhead.
5902 InstructionCost ScalarCost =
5903 VF.getFixedValue() *
5904 getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first;
5905
5906 // Compute the scalarization overhead of needed insertelement instructions
5907 // and phi nodes.
5908 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5909 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5910 ScalarCost += TTI.getScalarizationOverhead(
5911 Ty: cast<VectorType>(Val: ToVectorTy(Scalar: I->getType(), EC: VF)),
5912 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ true,
5913 /*Extract*/ false, CostKind);
5914 ScalarCost +=
5915 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
5916 }
5917
5918 // Compute the scalarization overhead of needed extractelement
5919 // instructions. For each of the instruction's operands, if the operand can
5920 // be scalarized, add it to the worklist; otherwise, account for the
5921 // overhead.
5922 for (Use &U : I->operands())
5923 if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
5924 assert(VectorType::isValidElementType(J->getType()) &&
5925 "Instruction has non-scalar type");
5926 if (canBeScalarized(J))
5927 Worklist.push_back(Elt: J);
5928 else if (needsExtract(V: J, VF)) {
5929 ScalarCost += TTI.getScalarizationOverhead(
5930 Ty: cast<VectorType>(Val: ToVectorTy(Scalar: J->getType(), EC: VF)),
5931 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
5932 /*Extract*/ true, CostKind);
5933 }
5934 }
5935
5936 // Scale the total scalar cost by block probability.
5937 ScalarCost /= getReciprocalPredBlockProb();
5938
5939 // Compute the discount. A non-negative discount means the vector version
5940 // of the instruction costs more, and scalarizing would be beneficial.
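    // Illustrative numbers (ignoring insert/extract overhead): at VF = 4, a
    // vector cost of 10 against a per-lane scalar cost of 4 gives an unscaled
    // scalar cost of 16, which the reciprocal block probability (2 by default)
    // scales to 8; the discount grows by 10 - 8 = 2, favouring scalarization.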
5941 Discount += VectorCost - ScalarCost;
5942 ScalarCosts[I] = ScalarCost;
5943 }
5944
5945 return Discount;
5946}
5947
5948LoopVectorizationCostModel::VectorizationCostTy
5949LoopVectorizationCostModel::expectedCost(
5950 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5951 VectorizationCostTy Cost;
5952
5953 // For each block.
5954 for (BasicBlock *BB : TheLoop->blocks()) {
5955 VectorizationCostTy BlockCost;
5956
5957 // For each instruction in the old loop.
5958 for (Instruction &I : BB->instructionsWithoutDebug()) {
5959 // Skip ignored values.
5960 if (ValuesToIgnore.count(Ptr: &I) ||
5961 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5962 continue;
5963
5964 VectorizationCostTy C = getInstructionCost(I: &I, VF);
5965
5966 // Check if we should override the cost.
5967 if (C.first.isValid() &&
5968 ForceTargetInstructionCost.getNumOccurrences() > 0)
5969 C.first = InstructionCost(ForceTargetInstructionCost);
5970
5971 // Keep a list of instructions with invalid costs.
5972 if (Invalid && !C.first.isValid())
5973 Invalid->emplace_back(Args: &I, Args&: VF);
5974
5975 BlockCost.first += C.first;
5976 BlockCost.second |= C.second;
5977 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5978 << " for VF " << VF << " For instruction: " << I
5979 << '\n');
5980 }
5981
5982 // If we are vectorizing a predicated block, it will have been
5983 // if-converted. This means that the block's instructions (aside from
5984 // stores and instructions that may divide by zero) will now be
5985    // unconditionally executed. In the scalar case, the predicated block may not
5986    // always execute, so scale the block's cost by the probability of executing
5987    // it. blockNeedsPredication from Legal is used here so as not to include
5988    // all blocks of tail-folded loops.
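    // For example, a predicated block with scalar cost 12 contributes
    // 12 / 2 = 6 to the scalar loop cost, using the default reciprocal
    // predicated-block probability of 2.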
5989 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5990 BlockCost.first /= getReciprocalPredBlockProb();
5991
5992 Cost.first += BlockCost.first;
5993 Cost.second |= BlockCost.second;
5994 }
5995
5996 return Cost;
5997}
5998
5999/// Gets the address access SCEV after verifying that the access pattern is loop
6000/// invariant except for the induction variable dependence.
6001///
6002/// This SCEV can be sent to the Target in order to estimate the address
6003/// calculation cost.
6004static const SCEV *getAddressAccessSCEV(
6005 Value *Ptr,
6006 LoopVectorizationLegality *Legal,
6007 PredicatedScalarEvolution &PSE,
6008 const Loop *TheLoop) {
6009
6010 auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr);
6011 if (!Gep)
6012 return nullptr;
6013
6014 // We are looking for a gep with all loop invariant indices except for one
6015 // which should be an induction variable.
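  // For example, a GEP such as  getelementptr [1024 x i32], ptr %A, i64 0,
  // i64 %iv  qualifies (one induction index, the rest loop invariant), whereas
  // a GEP whose index is itself loaded inside the loop does not.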
6016 auto SE = PSE.getSE();
6017 unsigned NumOperands = Gep->getNumOperands();
6018 for (unsigned i = 1; i < NumOperands; ++i) {
6019 Value *Opd = Gep->getOperand(i_nocapture: i);
6020 if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) &&
6021 !Legal->isInductionVariable(V: Opd))
6022 return nullptr;
6023 }
6024
6025  // Now we know we have a GEP of the form (ptr, %inv, %ind, %inv). Return the Ptr SCEV.
6026 return PSE.getSCEV(V: Ptr);
6027}
6028
6029InstructionCost
6030LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6031 ElementCount VF) {
6032 assert(VF.isVector() &&
6033 "Scalarization cost of instruction implies vectorization.");
6034 if (VF.isScalable())
6035 return InstructionCost::getInvalid();
6036
6037 Type *ValTy = getLoadStoreType(I);
6038 auto SE = PSE.getSE();
6039
6040 unsigned AS = getLoadStoreAddressSpace(I);
6041 Value *Ptr = getLoadStorePointerOperand(V: I);
6042 Type *PtrTy = ToVectorTy(Scalar: Ptr->getType(), EC: VF);
6043 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6044 // that it is being called from this specific place.
6045
6046  // Figure out whether the access is strided and get the stride value if it is
6047  // known at compile time.
6048 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6049
6050 // Get the cost of the scalar memory instruction and address computation.
6051 InstructionCost Cost =
6052 VF.getKnownMinValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);
6053
6054 // Don't pass *I here, since it is scalar but will actually be part of a
6055 // vectorized loop where the user of it is a vectorized instruction.
6056 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6057 const Align Alignment = getLoadStoreAlignment(I);
6058 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
6059 Src: ValTy->getScalarType(),
6060 Alignment, AddressSpace: AS, CostKind);
6061
6062 // Get the overhead of the extractelement and insertelement instructions
6063 // we might create due to scalarization.
6064 Cost += getScalarizationOverhead(I, VF, CostKind);
6065
6066 // If we have a predicated load/store, it will need extra i1 extracts and
6067 // conditional branches, but may not be executed for each vector lane. Scale
6068 // the cost by the probability of executing the predicated block.
6069 if (isPredicatedInst(I)) {
6070 Cost /= getReciprocalPredBlockProb();
6071
6072 // Add the cost of an i1 extract and a branch
6073 auto *Vec_i1Ty =
6074 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
6075 Cost += TTI.getScalarizationOverhead(
6076 Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
6077 /*Insert=*/false, /*Extract=*/true, CostKind);
6078 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
6079
6080 if (useEmulatedMaskMemRefHack(I, VF))
6081 // Artificially setting to a high enough value to practically disable
6082 // vectorization with such operations.
6083 Cost = 3000000;
6084 }
6085
6086 return Cost;
6087}
6088
6089InstructionCost
6090LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6091 ElementCount VF) {
6092 Type *ValTy = getLoadStoreType(I);
6093 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
6094 Value *Ptr = getLoadStorePointerOperand(V: I);
6095 unsigned AS = getLoadStoreAddressSpace(I);
6096 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
6097 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6098
6099 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6100 "Stride should be 1 or -1 for consecutive memory access");
6101 const Align Alignment = getLoadStoreAlignment(I);
6102 InstructionCost Cost = 0;
6103 if (Legal->isMaskRequired(I)) {
6104 Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
6105 CostKind);
6106 } else {
6107 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
6108 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
6109 CostKind, OpdInfo: OpInfo, I);
6110 }
6111
6112 bool Reverse = ConsecutiveStride < 0;
6113 if (Reverse)
6114 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
6115 Mask: std::nullopt, CostKind, Index: 0);
6116 return Cost;
6117}
6118
6119InstructionCost
6120LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6121 ElementCount VF) {
6122 assert(Legal->isUniformMemOp(*I, VF));
6123
6124 Type *ValTy = getLoadStoreType(I);
6125 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
6126 const Align Alignment = getLoadStoreAlignment(I);
6127 unsigned AS = getLoadStoreAddressSpace(I);
6128 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
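  // A uniform load is costed as a scalar address computation plus a scalar load
  // plus a broadcast of the loaded value; a uniform store as a scalar address
  // computation plus a scalar store plus, unless the stored value is
  // loop-invariant, an extract of the last vector lane.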
6129 if (isa<LoadInst>(Val: I)) {
6130 return TTI.getAddressComputationCost(Ty: ValTy) +
6131 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
6132 CostKind) +
6133 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VectorTy);
6134 }
6135 StoreInst *SI = cast<StoreInst>(Val: I);
6136
6137 bool isLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
6138 return TTI.getAddressComputationCost(Ty: ValTy) +
6139 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
6140 CostKind) +
6141 (isLoopInvariantStoreValue
6142 ? 0
6143 : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy,
6144 CostKind, Index: VF.getKnownMinValue() - 1));
6145}
6146
6147InstructionCost
6148LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6149 ElementCount VF) {
6150 Type *ValTy = getLoadStoreType(I);
6151 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
6152 const Align Alignment = getLoadStoreAlignment(I);
6153 const Value *Ptr = getLoadStorePointerOperand(V: I);
6154
6155 return TTI.getAddressComputationCost(Ty: VectorTy) +
6156 TTI.getGatherScatterOpCost(
6157 Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, VariableMask: Legal->isMaskRequired(I), Alignment,
6158 CostKind: TargetTransformInfo::TCK_RecipThroughput, I);
6159}
6160
6161InstructionCost
6162LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6163 ElementCount VF) {
6164 Type *ValTy = getLoadStoreType(I);
6165 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
6166 unsigned AS = getLoadStoreAddressSpace(I);
6167 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6168
6169 auto Group = getInterleavedAccessGroup(Instr: I);
6170 assert(Group && "Fail to get an interleaved access group.");
6171
6172 unsigned InterleaveFactor = Group->getFactor();
6173 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
6174
6175 // Holds the indices of existing members in the interleaved group.
6176 SmallVector<unsigned, 4> Indices;
6177 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6178 if (Group->getMember(Index: IF))
6179 Indices.push_back(Elt: IF);
6180
6181 // Calculate the cost of the whole interleaved group.
6182 bool UseMaskForGaps =
6183 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6184 (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
6185 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6186 Opcode: I->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, Alignment: Group->getAlign(),
6187 AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), UseMaskForGaps);
6188
6189 if (Group->isReverse()) {
6190 // TODO: Add support for reversed masked interleaved access.
6191 assert(!Legal->isMaskRequired(I) &&
6192 "Reverse masked interleaved access not supported.");
6193 Cost += Group->getNumMembers() *
6194 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
6195 Mask: std::nullopt, CostKind, Index: 0);
6196 }
6197 return Cost;
6198}
6199
6200std::optional<InstructionCost>
6201LoopVectorizationCostModel::getReductionPatternCost(
6202 Instruction *I, ElementCount VF, Type *Ty,
6203 TTI::TargetCostKind CostKind) const {
6204 using namespace llvm::PatternMatch;
6205  // Early exit when there are no in-loop reductions.
6206 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
6207 return std::nullopt;
6208 auto *VectorTy = cast<VectorType>(Val: Ty);
6209
6210  // We are looking for one of the following patterns, finding the minimal acceptable cost:
6211 // reduce(mul(ext(A), ext(B))) or
6212 // reduce(mul(A, B)) or
6213 // reduce(ext(A)) or
6214 // reduce(A).
6215 // The basic idea is that we walk down the tree to do that, finding the root
6216 // reduction instruction in InLoopReductionImmediateChains. From there we find
6217 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6218 // of the components. If the reduction cost is lower then we return it for the
6219 // reduction instruction and 0 for the other instructions in the pattern. If
6220  // it is not, we return an invalid cost specifying that the original cost
6221  // model should be used.
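  // For instance, a chain  %m = mul(sext(%a), sext(%b)); %r = add %acc, %m
  // feeding an in-loop add reduction may be costed as a single extending
  // multiply-accumulate reduction via getMulAccReductionCost; if that combined
  // cost beats the sum of the individual ext/mul costs plus the base reduction
  // cost, the reduction instruction carries it and the ext/mul instructions are
  // reported as free (cost 0).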
6222 Instruction *RetI = I;
6223 if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
6224 if (!RetI->hasOneUser())
6225 return std::nullopt;
6226 RetI = RetI->user_back();
6227 }
6228
6229 if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
6230 RetI->user_back()->getOpcode() == Instruction::Add) {
6231 RetI = RetI->user_back();
6232 }
6233
6234  // Test if the found instruction is a reduction; if not, bail out so that the
6235  // caller falls back to the original cost modelling.
6236 if (!InLoopReductionImmediateChains.count(Val: RetI))
6237 return std::nullopt;
6238
6239 // Find the reduction this chain is a part of and calculate the basic cost of
6240 // the reduction on its own.
6241 Instruction *LastChain = InLoopReductionImmediateChains.at(Val: RetI);
6242 Instruction *ReductionPhi = LastChain;
6243 while (!isa<PHINode>(Val: ReductionPhi))
6244 ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);
6245
6246 const RecurrenceDescriptor &RdxDesc =
6247 Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second;
6248
6249 InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6250 Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
6251
6252 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6253 // normal fmul instruction to the cost of the fadd reduction.
6254 if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6255 BaseCost +=
6256 TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);
6257
6258 // If we're using ordered reductions then we can just return the base cost
6259 // here, since getArithmeticReductionCost calculates the full ordered
6260 // reduction cost when FP reassociation is not allowed.
6261 if (useOrderedReductions(RdxDesc))
6262 return BaseCost;
6263
6264 // Get the operand that was not the reduction chain and match it to one of the
6265 // patterns, returning the better cost if it is found.
6266 Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
6267 ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
6268 : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));
6269
6270 VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);
6271
6272 Instruction *Op0, *Op1;
6273 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6274 match(V: RedOp,
6275 P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
6276 match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
6277 Op0->getOpcode() == Op1->getOpcode() &&
6278 Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
6279 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
6280 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6281
6282 // Matched reduce.add(ext(mul(ext(A), ext(B)))
6283 // Note that the extend opcodes need to all match, or if A==B they will have
6284 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6285 // which is equally fine.
6286 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
6287 auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
6288 auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);
6289
6290 InstructionCost ExtCost =
6291 TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
6292 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
6293 InstructionCost MulCost =
6294 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
6295 InstructionCost Ext2Cost =
6296 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
6297 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
6298
6299 InstructionCost RedCost = TTI.getMulAccReductionCost(
6300 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
6301
6302 if (RedCost.isValid() &&
6303 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6304 return I == RetI ? RedCost : 0;
6305 } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
6306 !TheLoop->isLoopInvariant(V: RedOp)) {
6307 // Matched reduce(ext(A))
6308 bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
6309 auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
6310 InstructionCost RedCost = TTI.getExtendedReductionCost(
6311 Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
6312 FMF: RdxDesc.getFastMathFlags(), CostKind);
6313
6314 InstructionCost ExtCost =
6315 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
6316 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
6317 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6318 return I == RetI ? RedCost : 0;
6319 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
6320 match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
6321 if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
6322 Op0->getOpcode() == Op1->getOpcode() &&
6323 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
6324 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
6325 Type *Op0Ty = Op0->getOperand(i: 0)->getType();
6326 Type *Op1Ty = Op1->getOperand(i: 0)->getType();
6327 Type *LargestOpTy =
6328 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6329 : Op0Ty;
6330 auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);
6331
6332 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
6333 // different sizes. We take the largest type as the ext to reduce, and add
6334 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6335 InstructionCost ExtCost0 = TTI.getCastInstrCost(
6336 Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
6337 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
6338 InstructionCost ExtCost1 = TTI.getCastInstrCost(
6339 Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
6340 CCH: TTI::CastContextHint::None, CostKind, I: Op1);
6341 InstructionCost MulCost =
6342 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6343
6344 InstructionCost RedCost = TTI.getMulAccReductionCost(
6345 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
6346 InstructionCost ExtraExtCost = 0;
6347 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6348 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6349 ExtraExtCost = TTI.getCastInstrCost(
6350 Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
6351 Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
6352 CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
6353 }
6354
6355 if (RedCost.isValid() &&
6356 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6357 return I == RetI ? RedCost : 0;
6358 } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
6359 // Matched reduce.add(mul())
6360 InstructionCost MulCost =
6361 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6362
6363 InstructionCost RedCost = TTI.getMulAccReductionCost(
6364 IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind);
6365
6366 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6367 return I == RetI ? RedCost : 0;
6368 }
6369 }
6370
6371 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
6372}
6373
6374InstructionCost
6375LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6376 ElementCount VF) {
6377  // Calculate the scalar cost only. The vectorization cost should already have
6378  // been computed at this point.
6379 if (VF.isScalar()) {
6380 Type *ValTy = getLoadStoreType(I);
6381 const Align Alignment = getLoadStoreAlignment(I);
6382 unsigned AS = getLoadStoreAddressSpace(I);
6383
6384 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
6385 return TTI.getAddressComputationCost(Ty: ValTy) +
6386 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS,
6387 CostKind: TTI::TCK_RecipThroughput, OpdInfo: OpInfo, I);
6388 }
6389 return getWideningCost(I, VF);
6390}
6391
6392LoopVectorizationCostModel::VectorizationCostTy
6393LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6394 ElementCount VF) {
6395 // If we know that this instruction will remain uniform, check the cost of
6396 // the scalar version.
6397 if (isUniformAfterVectorization(I, VF))
6398 VF = ElementCount::getFixed(MinVal: 1);
6399
6400 if (VF.isVector() && isProfitableToScalarize(I, VF))
6401 return VectorizationCostTy(InstsToScalarize[VF][I], false);
6402
6403 // Forced scalars do not have any scalarization overhead.
6404 auto ForcedScalar = ForcedScalars.find(Val: VF);
6405 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6406 auto InstSet = ForcedScalar->second;
6407 if (InstSet.count(Ptr: I))
6408 return VectorizationCostTy(
6409 (getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first *
6410 VF.getKnownMinValue()),
6411 false);
6412 }
6413
6414 Type *VectorTy;
6415 InstructionCost C = getInstructionCost(I, VF, VectorTy);
6416
6417 bool TypeNotScalarized = false;
6418 if (VF.isVector() && VectorTy->isVectorTy()) {
6419 if (unsigned NumParts = TTI.getNumberOfParts(Tp: VectorTy)) {
6420 if (VF.isScalable())
6421 // <vscale x 1 x iN> is assumed to be profitable over iN because
6422 // scalable registers are a distinct register class from scalar ones.
6423 // If we ever find a target which wants to lower scalable vectors
6424 // back to scalars, we'll need to update this code to explicitly
6425 // ask TTI about the register class uses for each part.
6426 TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6427 else
6428 TypeNotScalarized = NumParts < VF.getKnownMinValue();
6429 } else
6430 C = InstructionCost::getInvalid();
6431 }
6432 return VectorizationCostTy(C, TypeNotScalarized);
6433}
6434
6435InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
6436 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6437
6438 // There is no mechanism yet to create a scalable scalarization loop,
6439 // so this is currently Invalid.
6440 if (VF.isScalable())
6441 return InstructionCost::getInvalid();
6442
6443 if (VF.isScalar())
6444 return 0;
6445
6446 InstructionCost Cost = 0;
6447 Type *RetTy = ToVectorTy(Scalar: I->getType(), EC: VF);
6448 if (!RetTy->isVoidTy() &&
6449 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore()))
6450 Cost += TTI.getScalarizationOverhead(
6451 Ty: cast<VectorType>(Val: RetTy), DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
6452 /*Insert*/ true,
6453 /*Extract*/ false, CostKind);
6454
6455 // Some targets keep addresses scalar.
6456 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
6457 return Cost;
6458
6459 // Some targets support efficient element stores.
6460 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
6461 return Cost;
6462
6463 // Collect operands to consider.
6464 CallInst *CI = dyn_cast<CallInst>(Val: I);
6465 Instruction::op_range Ops = CI ? CI->args() : I->operands();
6466
6467 // Skip operands that do not require extraction/scalarization and do not incur
6468 // any overhead.
6469 SmallVector<Type *> Tys;
6470 for (auto *V : filterExtractingOperands(Ops, VF))
6471 Tys.push_back(Elt: MaybeVectorizeType(Elt: V->getType(), VF));
6472 return Cost + TTI.getOperandsScalarizationOverhead(
6473 Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
6474}
6475
6476void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6477 if (VF.isScalar())
6478 return;
6479 NumPredStores = 0;
6480 for (BasicBlock *BB : TheLoop->blocks()) {
6481 // For each instruction in the old loop.
6482 for (Instruction &I : *BB) {
6483 Value *Ptr = getLoadStorePointerOperand(V: &I);
6484 if (!Ptr)
6485 continue;
6486
6487 // TODO: We should generate better code and update the cost model for
6488 // predicated uniform stores. Today they are treated as any other
6489 // predicated store (see added test cases in
6490 // invariant-store-vectorization.ll).
6491 if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
6492 NumPredStores++;
6493
6494 if (Legal->isUniformMemOp(I, VF)) {
6495 auto isLegalToScalarize = [&]() {
6496 if (!VF.isScalable())
6497 // Scalarization of fixed length vectors "just works".
6498 return true;
6499
6500 // We have dedicated lowering for unpredicated uniform loads and
6501 // stores. Note that even with tail folding we know that at least
6502 // one lane is active (i.e. generalized predication is not possible
6503 // here), and the logic below depends on this fact.
6504 if (!foldTailByMasking())
6505 return true;
6506
6507 // For scalable vectors, a uniform memop load is always
6508 // uniform-by-parts and we know how to scalarize that.
6509 if (isa<LoadInst>(Val: I))
6510 return true;
6511
        // A uniform store isn't necessarily uniform-by-parts,
        // so we can't assume scalarization.
6514 auto &SI = cast<StoreInst>(Val&: I);
6515 return TheLoop->isLoopInvariant(V: SI.getValueOperand());
6516 };
6517
6518 const InstructionCost GatherScatterCost =
6519 isLegalGatherOrScatter(V: &I, VF) ?
6520 getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();
6521
6522 // Load: Scalar load + broadcast
6523 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6524 // FIXME: This cost is a significant under-estimate for tail folded
6525 // memory ops.
6526 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6527 getUniformMemOpCost(I: &I, VF) : InstructionCost::getInvalid();
6528
        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get an
        // invalid cost, which signals a failure and a vectorization abort.
6532 if (GatherScatterCost < ScalarizationCost)
6533 setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
6534 else
6535 setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
6536 continue;
6537 }
6538
6539 // We assume that widening is the best solution when possible.
6540 if (memoryInstructionCanBeWidened(I: &I, VF)) {
6541 InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
6542 int ConsecutiveStride = Legal->isConsecutivePtr(
6543 AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
6544 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6545 "Expected consecutive stride.");
6546 InstWidening Decision =
6547 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6548 setWideningDecision(I: &I, VF, W: Decision, Cost);
6549 continue;
6550 }
6551
6552 // Choose between Interleaving, Gather/Scatter or Scalarization.
6553 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6554 unsigned NumAccesses = 1;
6555 if (isAccessInterleaved(Instr: &I)) {
6556 auto Group = getInterleavedAccessGroup(Instr: &I);
6557 assert(Group && "Fail to get an interleaved access group.");
6558
6559 // Make one decision for the whole group.
6560 if (getWideningDecision(I: &I, VF) != CM_Unknown)
6561 continue;
6562
6563 NumAccesses = Group->getNumMembers();
6564 if (interleavedAccessCanBeWidened(I: &I, VF))
6565 InterleaveCost = getInterleaveGroupCost(I: &I, VF);
6566 }
6567
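      // The interleave cost above covers the whole group, so scale the
      // per-access gather/scatter and scalarization costs by the number of
      // group members to make the comparison meaningful.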
6568 InstructionCost GatherScatterCost =
6569 isLegalGatherOrScatter(V: &I, VF)
6570 ? getGatherScatterCost(I: &I, VF) * NumAccesses
6571 : InstructionCost::getInvalid();
6572
6573 InstructionCost ScalarizationCost =
6574 getMemInstScalarizationCost(I: &I, VF) * NumAccesses;
6575
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
6578 InstructionCost Cost;
6579 InstWidening Decision;
6580 if (InterleaveCost <= GatherScatterCost &&
6581 InterleaveCost < ScalarizationCost) {
6582 Decision = CM_Interleave;
6583 Cost = InterleaveCost;
6584 } else if (GatherScatterCost < ScalarizationCost) {
6585 Decision = CM_GatherScatter;
6586 Cost = GatherScatterCost;
6587 } else {
6588 Decision = CM_Scalarize;
6589 Cost = ScalarizationCost;
6590 }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group also receives the cost,
      // but the cost will actually be assigned to one instruction.
6594 if (auto Group = getInterleavedAccessGroup(Instr: &I))
6595 setWideningDecision(Grp: Group, VF, W: Decision, Cost);
6596 else
6597 setWideningDecision(I: &I, VF, W: Decision, Cost);
6598 }
6599 }
6600
  // Make sure that any load of an address and any other address computation
6602 // remains scalar unless there is gather/scatter support. This avoids
6603 // inevitable extracts into address registers, and also has the benefit of
6604 // activating LSR more, since that pass can't optimize vectorized
6605 // addresses.
6606 if (TTI.prefersVectorizedAddressing())
6607 return;
6608
6609 // Start with all scalar pointer uses.
6610 SmallPtrSet<Instruction *, 8> AddrDefs;
6611 for (BasicBlock *BB : TheLoop->blocks())
6612 for (Instruction &I : *BB) {
6613 Instruction *PtrDef =
6614 dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
6615 if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
6616 getWideningDecision(I: &I, VF) != CM_GatherScatter)
6617 AddrDefs.insert(Ptr: PtrDef);
6618 }
6619
6620 // Add all instructions used to generate the addresses.
6621 SmallVector<Instruction *, 4> Worklist;
6622 append_range(C&: Worklist, R&: AddrDefs);
6623 while (!Worklist.empty()) {
6624 Instruction *I = Worklist.pop_back_val();
6625 for (auto &Op : I->operands())
6626 if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
6627 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
6628 AddrDefs.insert(Ptr: InstOp).second)
6629 Worklist.push_back(Elt: InstOp);
6630 }
6631
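  // Force every address-computation instruction to remain scalar: loads that
  // feed addresses are re-marked for scalarization with a per-lane scalar
  // cost, and all other instructions are added to ForcedScalars.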
6632 for (auto *I : AddrDefs) {
6633 if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that would require determining whether
      // the loaded register is involved in an address computation, the
      // decision is instead changed here, where we know this is the case.
6638 InstWidening Decision = getWideningDecision(I, VF);
6639 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of an address.
6641 setWideningDecision(
6642 I, VF, W: CM_Scalarize,
6643 Cost: (VF.getKnownMinValue() *
6644 getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
6645 else if (auto Group = getInterleavedAccessGroup(Instr: I)) {
6646 // Scalarize an interleave group of address loads.
6647 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6648 if (Instruction *Member = Group->getMember(Index: I))
6649 setWideningDecision(
6650 I: Member, VF, W: CM_Scalarize,
6651 Cost: (VF.getKnownMinValue() *
6652 getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1))));
6653 }
6654 }
6655 } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
6658 ForcedScalars[VF].insert(Ptr: I);
6659 }
6660}
6661
6662void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6663 assert(!VF.isScalar() &&
6664 "Trying to set a vectorization decision for a scalar VF");
6665
6666 for (BasicBlock *BB : TheLoop->blocks()) {
6667 // For each instruction in the old loop.
6668 for (Instruction &I : *BB) {
6669 CallInst *CI = dyn_cast<CallInst>(Val: &I);
6670
6671 if (!CI)
6672 continue;
6673
6674 InstructionCost ScalarCost = InstructionCost::getInvalid();
6675 InstructionCost VectorCost = InstructionCost::getInvalid();
6676 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6677 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6678
6679 Function *ScalarFunc = CI->getCalledFunction();
6680 Type *ScalarRetTy = CI->getType();
6681 SmallVector<Type *, 4> Tys, ScalarTys;
6682 bool MaskRequired = Legal->isMaskRequired(I: CI);
6683 for (auto &ArgOp : CI->args())
6684 ScalarTys.push_back(Elt: ArgOp->getType());
6685
6686 // Compute corresponding vector type for return value and arguments.
6687 Type *RetTy = ToVectorTy(Scalar: ScalarRetTy, EC: VF);
6688 for (Type *ScalarTy : ScalarTys)
6689 Tys.push_back(Elt: ToVectorTy(Scalar: ScalarTy, EC: VF));
6690
6691 // An in-loop reduction using an fmuladd intrinsic is a special case;
6692 // we don't want the normal cost for that intrinsic.
6693 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
6694 if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy, CostKind)) {
6695 setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
6696 IID: getVectorIntrinsicIDForCall(CI, TLI),
6697 MaskPos: std::nullopt, Cost: *RedCost);
6698 continue;
6699 }
6700
6701 // Estimate cost of scalarized vector call. The source operands are
6702 // assumed to be vectors, so we need to extract individual elements from
6703 // there, execute VF scalar calls, and then gather the result into the
6704 // vector return value.
6705 InstructionCost ScalarCallCost =
6706 TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);
6707
6708 // Compute costs of unpacking argument values for the scalar calls and
6709 // packing the return values to a vector.
6710 InstructionCost ScalarizationCost =
6711 getScalarizationOverhead(I: CI, VF, CostKind);
6712
6713 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
6714
6715 // Find the cost of vectorizing the call, if we can find a suitable
6716 // vector variant of the function.
6717 bool UsesMask = false;
6718 VFInfo FuncInfo;
6719 Function *VecFunc = nullptr;
6720 // Search through any available variants for one we can use at this VF.
6721 for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
6722 // Must match requested VF.
6723 if (Info.Shape.VF != VF)
6724 continue;
6725
6726 // Must take a mask argument if one is required
6727 if (MaskRequired && !Info.isMasked())
6728 continue;
6729
6730 // Check that all parameter kinds are supported
6731 bool ParamsOk = true;
6732 for (VFParameter Param : Info.Shape.Parameters) {
6733 switch (Param.ParamKind) {
6734 case VFParamKind::Vector:
6735 break;
6736 case VFParamKind::OMP_Uniform: {
6737 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6738 // Make sure the scalar parameter in the loop is invariant.
6739 if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
6740 L: TheLoop))
6741 ParamsOk = false;
6742 break;
6743 }
6744 case VFParamKind::OMP_Linear: {
6745 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6746 // Find the stride for the scalar parameter in this loop and see if
6747 // it matches the stride for the variant.
6748 // TODO: do we need to figure out the cost of an extract to get the
6749 // first lane? Or do we hope that it will be folded away?
6750 ScalarEvolution *SE = PSE.getSE();
6751 const auto *SAR =
6752 dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));
6753
6754 if (!SAR || SAR->getLoop() != TheLoop) {
6755 ParamsOk = false;
6756 break;
6757 }
6758
6759 const SCEVConstant *Step =
6760 dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));
6761
6762 if (!Step ||
6763 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6764 ParamsOk = false;
6765
6766 break;
6767 }
6768 case VFParamKind::GlobalPredicate:
6769 UsesMask = true;
6770 break;
6771 default:
6772 ParamsOk = false;
6773 break;
6774 }
6775 }
6776
6777 if (!ParamsOk)
6778 continue;
6779
6780 // Found a suitable candidate, stop here.
6781 VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
6782 FuncInfo = Info;
6783 break;
6784 }
6785
6786 // Add in the cost of synthesizing a mask if one wasn't required.
6787 InstructionCost MaskCost = 0;
6788 if (VecFunc && UsesMask && !MaskRequired)
6789 MaskCost = TTI.getShuffleCost(
6790 Kind: TargetTransformInfo::SK_Broadcast,
6791 Tp: VectorType::get(ElementType: IntegerType::getInt1Ty(
6792 C&: VecFunc->getFunctionType()->getContext()),
6793 EC: VF));
6794
6795 if (TLI && VecFunc && !CI->isNoBuiltin())
6796 VectorCost =
6797 TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind) + MaskCost;
6798
6799 // Find the cost of an intrinsic; some targets may have instructions that
6800 // perform the operation without needing an actual call.
6801 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6802 if (IID != Intrinsic::not_intrinsic)
6803 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6804
6805 InstructionCost Cost = ScalarCost;
6806 InstWidening Decision = CM_Scalarize;
6807
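      // On a tie, prefer the vector call over scalarization and the intrinsic
      // over both.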
6808 if (VectorCost <= Cost) {
6809 Cost = VectorCost;
6810 Decision = CM_VectorCall;
6811 }
6812
6813 if (IntrinsicCost <= Cost) {
6814 Cost = IntrinsicCost;
6815 Decision = CM_IntrinsicCall;
6816 }
6817
6818 setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
6819 MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
6820 }
6821 }
6822}
6823
6824InstructionCost
6825LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6826 Type *&VectorTy) {
6827 Type *RetTy = I->getType();
6828 if (canTruncateToMinimalBitwidth(I, VF))
6829 RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
6830 auto SE = PSE.getSE();
6831 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6832
6833 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6834 ElementCount VF) -> bool {
6835 if (VF.isScalar())
6836 return true;
6837
6838 auto Scalarized = InstsToScalarize.find(Val: VF);
6839 assert(Scalarized != InstsToScalarize.end() &&
6840 "VF not yet analyzed for scalarization profitability");
6841 return !Scalarized->second.count(Val: I) &&
6842 llvm::all_of(Range: I->users(), P: [&](User *U) {
6843 auto *UI = cast<Instruction>(Val: U);
6844 return !Scalarized->second.count(Val: UI);
6845 });
6846 };
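  // The lambda is only used in the assert below; the cast avoids an unused
  // variable warning in builds without asserts.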
6847 (void) hasSingleCopyAfterVectorization;
6848
6849 if (isScalarAfterVectorization(I, VF)) {
6850 // With the exception of GEPs and PHIs, after scalarization there should
6851 // only be one copy of the instruction generated in the loop. This is
6852 // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
6855 assert(I->getOpcode() == Instruction::GetElementPtr ||
6856 I->getOpcode() == Instruction::PHI ||
6857 (I->getOpcode() == Instruction::BitCast &&
6858 I->getType()->isPointerTy()) ||
6859 hasSingleCopyAfterVectorization(I, VF));
6860 VectorTy = RetTy;
6861 } else
6862 VectorTy = ToVectorTy(Scalar: RetTy, EC: VF);
6863
6864 // TODO: We need to estimate the cost of intrinsic calls.
6865 switch (I->getOpcode()) {
6866 case Instruction::GetElementPtr:
6867 // We mark this instruction as zero-cost because the cost of GEPs in
6868 // vectorized code depends on whether the corresponding memory instruction
6869 // is scalarized or not. Therefore, we handle GEPs with the memory
6870 // instruction cost.
6871 return 0;
6872 case Instruction::Br: {
6873 // In cases of scalarized and predicated instructions, there will be VF
6874 // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6876 bool ScalarPredicatedBB = false;
6877 BranchInst *BI = cast<BranchInst>(Val: I);
6878 if (VF.isVector() && BI->isConditional() &&
6879 (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
6880 PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))))
6881 ScalarPredicatedBB = true;
6882
6883 if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
6885 if (VF.isScalable())
6886 return InstructionCost::getInvalid();
6887 // Return cost for branches around scalarized and predicated blocks.
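      // This amounts to one extract of the i1 compare plus one branch per
      // lane.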
6888 auto *Vec_i1Ty =
6889 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
6890 return (
6891 TTI.getScalarizationOverhead(
6892 Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
6893 /*Insert*/ false, /*Extract*/ true, CostKind) +
6894 (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
6895 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6896 // The back-edge branch will remain, as will all scalar branches.
6897 return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
6898 else
6899 // This branch will be eliminated by if-conversion.
6900 return 0;
6901 // Note: We currently assume zero cost for an unconditional branch inside
6902 // a predicated block since it will become a fall-through, although we
6903 // may decide in the future to call TTI for all branches.
6904 }
6905 case Instruction::PHI: {
6906 auto *Phi = cast<PHINode>(Val: I);
6907
6908 // First-order recurrences are replaced by vector shuffles inside the loop.
6909 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6910 SmallVector<int> Mask(VF.getKnownMinValue());
6911 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
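      // E.g. for VF = 4 the splice mask is <3, 4, 5, 6>: the last element of
      // the previous vector followed by the first three of the current one.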
6912 return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
6913 Tp: cast<VectorType>(Val: VectorTy), Mask, CostKind,
6914 Index: VF.getKnownMinValue() - 1);
6915 }
6916
6917 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6918 // converted into select instructions. We require N - 1 selects per phi
6919 // node, where N is the number of incoming values.
6920 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6921 return (Phi->getNumIncomingValues() - 1) *
6922 TTI.getCmpSelInstrCost(
6923 Opcode: Instruction::Select, ValTy: ToVectorTy(Scalar: Phi->getType(), EC: VF),
6924 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
6925 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
6926
6927 return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
6928 }
6929 case Instruction::UDiv:
6930 case Instruction::SDiv:
6931 case Instruction::URem:
6932 case Instruction::SRem:
6933 if (VF.isVector() && isPredicatedInst(I)) {
6934 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6935 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6936 ScalarCost : SafeDivisorCost;
6937 }
6938 // We've proven all lanes safe to speculate, fall through.
6939 [[fallthrough]];
6940 case Instruction::Add:
6941 case Instruction::FAdd:
6942 case Instruction::Sub:
6943 case Instruction::FSub:
6944 case Instruction::Mul:
6945 case Instruction::FMul:
6946 case Instruction::FDiv:
6947 case Instruction::FRem:
6948 case Instruction::Shl:
6949 case Instruction::LShr:
6950 case Instruction::AShr:
6951 case Instruction::And:
6952 case Instruction::Or:
6953 case Instruction::Xor: {
6954 // If we're speculating on the stride being 1, the multiplication may
6955 // fold away. We can generalize this for all operations using the notion
6956 // of neutral elements. (TODO)
6957 if (I->getOpcode() == Instruction::Mul &&
6958 (PSE.getSCEV(V: I->getOperand(i: 0))->isOne() ||
6959 PSE.getSCEV(V: I->getOperand(i: 1))->isOne()))
6960 return 0;
6961
6962 // Detect reduction patterns
6963 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
6964 return *RedCost;
6965
6966 // Certain instructions can be cheaper to vectorize if they have a constant
6967 // second vector operand. One example of this are shifts on x86.
6968 Value *Op2 = I->getOperand(i: 1);
6969 auto Op2Info = TTI.getOperandInfo(V: Op2);
6970 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6971 Legal->isInvariant(V: Op2))
6972 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6973
6974 SmallVector<const Value *, 4> Operands(I->operand_values());
6975 return TTI.getArithmeticInstrCost(
6976 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6977 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6978 Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
6979 }
6980 case Instruction::FNeg: {
6981 return TTI.getArithmeticInstrCost(
6982 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6983 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6984 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6985 Args: I->getOperand(i: 0), CxtI: I);
6986 }
6987 case Instruction::Select: {
6988 SelectInst *SI = cast<SelectInst>(Val: I);
6989 const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
6990 bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));
6991
6992 const Value *Op0, *Op1;
6993 using namespace llvm::PatternMatch;
6994 if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
6995 match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
6996 // select x, y, false --> x & y
6997 // select x, true, y --> x | y
6998 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
6999 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
7000 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7001 Op1->getType()->getScalarSizeInBits() == 1);
7002
7003 SmallVector<const Value *, 2> Operands{Op0, Op1};
7004 return TTI.getArithmeticInstrCost(
7005 Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
7006 CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
7007 }
7008
7009 Type *CondTy = SI->getCondition()->getType();
7010 if (!ScalarCond)
7011 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
7012
7013 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7014 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
7015 Pred = Cmp->getPredicate();
7016 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
7017 CostKind, I);
7018 }
7019 case Instruction::ICmp:
7020 case Instruction::FCmp: {
7021 Type *ValTy = I->getOperand(i: 0)->getType();
7022 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
7023 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
7024 ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
7025 VectorTy = ToVectorTy(Scalar: ValTy, EC: VF);
7026 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: nullptr,
7027 VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
7028 I);
7029 }
7030 case Instruction::Store:
7031 case Instruction::Load: {
7032 ElementCount Width = VF;
7033 if (Width.isVector()) {
7034 InstWidening Decision = getWideningDecision(I, VF: Width);
7035 assert(Decision != CM_Unknown &&
7036 "CM decision should be taken at this point");
7037 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7038 return InstructionCost::getInvalid();
7039 if (Decision == CM_Scalarize)
7040 Width = ElementCount::getFixed(MinVal: 1);
7041 }
7042 VectorTy = ToVectorTy(Scalar: getLoadStoreType(I), EC: Width);
7043 return getMemoryInstructionCost(I, VF);
7044 }
7045 case Instruction::BitCast:
7046 if (I->getType()->isPointerTy())
7047 return 0;
7048 [[fallthrough]];
7049 case Instruction::ZExt:
7050 case Instruction::SExt:
7051 case Instruction::FPToUI:
7052 case Instruction::FPToSI:
7053 case Instruction::FPExt:
7054 case Instruction::PtrToInt:
7055 case Instruction::IntToPtr:
7056 case Instruction::SIToFP:
7057 case Instruction::UIToFP:
7058 case Instruction::Trunc:
7059 case Instruction::FPTrunc: {
7060 // Computes the CastContextHint from a Load/Store instruction.
7061 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7062 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7063 "Expected a load or a store!");
7064
7065 if (VF.isScalar() || !TheLoop->contains(Inst: I))
7066 return TTI::CastContextHint::Normal;
7067
7068 switch (getWideningDecision(I, VF)) {
7069 case LoopVectorizationCostModel::CM_GatherScatter:
7070 return TTI::CastContextHint::GatherScatter;
7071 case LoopVectorizationCostModel::CM_Interleave:
7072 return TTI::CastContextHint::Interleave;
7073 case LoopVectorizationCostModel::CM_Scalarize:
7074 case LoopVectorizationCostModel::CM_Widen:
7075 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7076 : TTI::CastContextHint::Normal;
7077 case LoopVectorizationCostModel::CM_Widen_Reverse:
7078 return TTI::CastContextHint::Reversed;
7079 case LoopVectorizationCostModel::CM_Unknown:
7080 llvm_unreachable("Instr did not go through cost modelling?");
7081 case LoopVectorizationCostModel::CM_VectorCall:
7082 case LoopVectorizationCostModel::CM_IntrinsicCall:
7083 llvm_unreachable_internal(msg: "Instr has invalid widening decision");
7084 }
7085
7086 llvm_unreachable("Unhandled case!");
7087 };
7088
7089 unsigned Opcode = I->getOpcode();
7090 TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7092 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7093 if (I->hasOneUse())
7094 if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
7095 CCH = ComputeCCH(Store);
7096 }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7098 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7099 Opcode == Instruction::FPExt) {
7100 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
7101 CCH = ComputeCCH(Load);
7102 }
7103
7104 // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as that of the
    // scalar operation.
7107 if (isOptimizableIVTruncate(I, VF)) {
7108 auto *Trunc = cast<TruncInst>(Val: I);
7109 return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
7110 Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
7111 }
7112
7113 // Detect reduction patterns
7114 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
7115 return *RedCost;
7116
7117 Type *SrcScalarTy = I->getOperand(i: 0)->getType();
7118 Type *SrcVecTy =
7119 VectorTy->isVectorTy() ? ToVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
7120 if (canTruncateToMinimalBitwidth(I, VF)) {
7121 // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7123 // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7124 //
7125 // Calculate the modified src and dest types.
7126 Type *MinVecTy = VectorTy;
7127 if (Opcode == Instruction::Trunc) {
7128 SrcVecTy = smallestIntegerVectorType(T1: SrcVecTy, T2: MinVecTy);
7129 VectorTy =
7130 largestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
7131 } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7132 // Leave SrcVecTy unchanged - we only shrink the destination element
7133 // type.
7134 VectorTy =
7135 smallestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
7136 }
7137 }
7138
7139 return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
7140 }
7141 case Instruction::Call:
7142 return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
7143 case Instruction::ExtractValue:
7144 return TTI.getInstructionCost(U: I, CostKind: TTI::TCK_RecipThroughput);
7145 case Instruction::Alloca:
    // We cannot easily widen an alloca to a scalable alloca, as
7147 // the result would need to be a vector of pointers.
7148 if (VF.isScalable())
7149 return InstructionCost::getInvalid();
7150 [[fallthrough]];
7151 default:
7152 // This opcode is unknown. Assume that it is the same as 'mul'.
7153 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
7154 } // end of switch.
7155}
7156
7157void LoopVectorizationCostModel::collectValuesToIgnore() {
7158 // Ignore ephemeral values.
7159 CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);
7160
7161 // Find all stores to invariant variables. Since they are going to sink
  // outside the loop, we do not need to calculate a cost for them.
7163 for (BasicBlock *BB : TheLoop->blocks())
7164 for (Instruction &I : *BB) {
7165 StoreInst *SI;
7166 if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
7167 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand()))
7168 ValuesToIgnore.insert(Ptr: &I);
7169 }
7170
7171 // Ignore type-promoting instructions we identified during reduction
7172 // detection.
7173 for (const auto &Reduction : Legal->getReductionVars()) {
7174 const RecurrenceDescriptor &RedDes = Reduction.second;
7175 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7176 VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end());
7177 }
7178 // Ignore type-casting instructions we identified during induction
7179 // detection.
7180 for (const auto &Induction : Legal->getInductionVars()) {
7181 const InductionDescriptor &IndDes = Induction.second;
7182 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7183 VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end());
7184 }
7185}
7186
7187void LoopVectorizationCostModel::collectInLoopReductions() {
7188 for (const auto &Reduction : Legal->getReductionVars()) {
7189 PHINode *Phi = Reduction.first;
7190 const RecurrenceDescriptor &RdxDesc = Reduction.second;
7191
7192 // We don't collect reductions that are type promoted (yet).
7193 if (RdxDesc.getRecurrenceType() != Phi->getType())
7194 continue;
7195
7196 // If the target would prefer this reduction to happen "in-loop", then we
7197 // want to record it as such.
7198 unsigned Opcode = RdxDesc.getOpcode();
7199 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7200 !TTI.preferInLoopReduction(Opcode, Ty: Phi->getType(),
7201 Flags: TargetTransformInfo::ReductionFlags()))
7202 continue;
7203
7204 // Check that we can correctly put the reductions into the loop, by
7205 // finding the chain of operations that leads from the phi to the loop
7206 // exit value.
7207 SmallVector<Instruction *, 4> ReductionOperations =
7208 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
7209 bool InLoop = !ReductionOperations.empty();
7210
7211 if (InLoop) {
7212 InLoopReductions.insert(Ptr: Phi);
7213 // Add the elements to InLoopReductionImmediateChains for cost modelling.
7214 Instruction *LastChain = Phi;
7215 for (auto *I : ReductionOperations) {
7216 InLoopReductionImmediateChains[I] = LastChain;
7217 LastChain = I;
7218 }
7219 }
7220 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7221 << " reduction for phi: " << *Phi << "\n");
7222 }
7223}
7224
7225VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
7226 DebugLoc DL, const Twine &Name) {
7227 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
7228 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
7229 return tryInsertInstruction(
7230 VPI: new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
7231}
7232
7233// This function will select a scalable VF if the target supports scalable
7234// vectors and a fixed one otherwise.
7235// TODO: we could return a pair of values that specify the max VF and
7236// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7238// doesn't have a cost model that can choose which plan to execute if
7239// more than one is generated.
7240static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
7241 LoopVectorizationCostModel &CM) {
7242 unsigned WidestType;
7243 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
7244
7245 TargetTransformInfo::RegisterKind RegKind =
7246 TTI.enableScalableVectorization()
7247 ? TargetTransformInfo::RGK_ScalableVector
7248 : TargetTransformInfo::RGK_FixedWidthVector;
7249
7250 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
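  // Fit as many elements of the widest scalar type as possible into a single
  // (possibly scalable) vector register.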
7251 unsigned N = RegSize.getKnownMinValue() / WidestType;
7252 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
7253}
7254
7255VectorizationFactor
7256LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7257 ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable.
7260 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7261 // the vectorization pipeline.
7262 if (!OrigLoop->isInnermost()) {
7263 // If the user doesn't provide a vectorization factor, determine a
7264 // reasonable one.
7265 if (UserVF.isZero()) {
7266 VF = determineVPlanVF(TTI, CM);
7267 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7268
7269 // Make sure we have a VF > 1 for stress testing.
7270 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7271 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7272 << "overriding computed VF.\n");
7273 VF = ElementCount::getFixed(MinVal: 4);
7274 }
7275 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
7276 !ForceTargetSupportsScalableVectors) {
7277 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
7278 << "not supported by the target.\n");
7279 reportVectorizationFailure(
7280 DebugMsg: "Scalable vectorization requested but not supported by the target",
7281 OREMsg: "the scalable user-specified vectorization width for outer-loop "
7282 "vectorization cannot be used because the target does not support "
7283 "scalable vectors.",
7284 ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
7285 return VectorizationFactor::Disabled();
7286 }
7287 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7288 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7289 "VF needs to be a power of two");
7290 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7291 << "VF " << VF << " to build VPlans.\n");
7292 buildVPlans(MinVF: VF, MaxVF: VF);
7293
7294 // For VPlan build stress testing, we bail out after VPlan construction.
7295 if (VPlanBuildStressTest)
7296 return VectorizationFactor::Disabled();
7297
7298 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7299 }
7300
7301 LLVM_DEBUG(
7302 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7303 "VPlan-native path.\n");
7304 return VectorizationFactor::Disabled();
7305}
7306
7307std::optional<VectorizationFactor>
7308LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7309 assert(OrigLoop->isInnermost() && "Inner loop expected.");
7310 CM.collectValuesToIgnore();
7311 CM.collectElementTypesForWidening();
7312
7313 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7315 return std::nullopt;
7316
  // Invalidate interleave groups if all loop blocks will be predicated.
7318 if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
7319 !useMaskedInterleavedAccesses(TTI)) {
7320 LLVM_DEBUG(
7321 dbgs()
7322 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7323 "which requires masked-interleaved support.\n");
7324 if (CM.InterleaveInfo.invalidateGroups())
7325 // Invalidating interleave groups also requires invalidating all decisions
7326 // based on them, which includes widening decisions and uniform and scalar
7327 // values.
7328 CM.invalidateCostModelingDecisions();
7329 }
7330
7331 ElementCount MaxUserVF =
7332 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7333 bool UserVFIsLegal = ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF);
7334 if (!UserVF.isZero() && UserVFIsLegal) {
7335 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7336 "VF needs to be a power of two");
7337 // Collect the instructions (and their associated costs) that will be more
7338 // profitable to scalarize.
7339 CM.collectInLoopReductions();
7340 if (CM.selectUserVectorizationFactor(UserVF)) {
7341 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7342 buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
7343 if (!hasPlanWithVF(VF: UserVF)) {
7344 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
7345 << ".\n");
7346 return std::nullopt;
7347 }
7348
7349 LLVM_DEBUG(printPlans(dbgs()));
7350 return {{UserVF, 0, 0}};
7351 } else
7352 reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
7353 ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
7354 }
7355
7356 // Populate the set of Vectorization Factor Candidates.
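  // Candidates are the powers of two from 1 (scalar) up to the maximum
  // feasible fixed-width and scalable factors computed above.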
7357 ElementCountSet VFCandidates;
7358 for (auto VF = ElementCount::getFixed(MinVal: 1);
7359 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
7360 VFCandidates.insert(V: VF);
7361 for (auto VF = ElementCount::getScalable(MinVal: 1);
7362 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
7363 VFCandidates.insert(V: VF);
7364
7365 CM.collectInLoopReductions();
7366 for (const auto &VF : VFCandidates) {
7367 // Collect Uniform and Scalar instructions after vectorization with VF.
7368 CM.collectUniformsAndScalars(VF);
7369
7370 // Collect the instructions (and their associated costs) that will be more
7371 // profitable to scalarize.
7372 if (VF.isVector())
7373 CM.collectInstsToScalarize(VF);
7374 }
7375
7376 buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
7377 buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);
7378
7379 LLVM_DEBUG(printPlans(dbgs()));
7380 if (!MaxFactors.hasVector())
7381 return VectorizationFactor::Disabled();
7382
7383 // Select the optimal vectorization factor.
7384 VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
7386 if (!hasPlanWithVF(VF: VF.Width)) {
7387 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
7388 << ".\n");
7389 return std::nullopt;
7390 }
7391 return VF;
7392}
7393
7394VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7395 assert(count_if(VPlans,
7396 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7397 1 &&
7398 "Best VF has not a single VPlan.");
7399
7400 for (const VPlanPtr &Plan : VPlans) {
7401 if (Plan->hasVF(VF))
7402 return *Plan.get();
7403 }
7404 llvm_unreachable("No plan found!");
7405}
7406
7407static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7408 SmallVector<Metadata *, 4> MDs;
7409 // Reserve first location for self reference to the LoopID metadata node.
7410 MDs.push_back(Elt: nullptr);
7411 bool IsUnrollMetadata = false;
7412 MDNode *LoopID = L->getLoopID();
7413 if (LoopID) {
7414 // First find existing loop unrolling disable metadata.
7415 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7416 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i));
7417 if (MD) {
7418 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
7419 IsUnrollMetadata =
7420 S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
7421 }
7422 MDs.push_back(Elt: LoopID->getOperand(I: i));
7423 }
7424 }
7425
7426 if (!IsUnrollMetadata) {
7427 // Add runtime unroll disable metadata.
7428 LLVMContext &Context = L->getHeader()->getContext();
7429 SmallVector<Metadata *, 1> DisableOperands;
7430 DisableOperands.push_back(
7431 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
7432 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
7433 MDs.push_back(Elt: DisableNode);
7434 MDNode *NewLoopID = MDNode::get(Context, MDs);
7435 // Set operand 0 to refer to the loop id itself.
7436 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
7437 L->setLoopID(NewLoopID);
7438 }
7439}
7440
// Check if \p RedResult is a ComputeReductionResult instruction, and if it
// is, create a merge phi node for it and add it to \p ReductionResumeValues.
7443static void createAndCollectMergePhiForReduction(
7444 VPInstruction *RedResult,
7445 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7446 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) {
7447 if (!RedResult ||
7448 RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7449 return;
7450
7451 auto *PhiR = cast<VPReductionPHIRecipe>(Val: RedResult->getOperand(N: 0));
7452 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7453
7454 TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
7455 Value *FinalValue =
7456 State.get(Def: RedResult, Instance: VPIteration(State.UF - 1, VPLane::getFirstLane()));
7457 auto *ResumePhi =
7458 dyn_cast<PHINode>(Val: PhiR->getStartValue()->getUnderlyingValue());
7459
7460 // TODO: bc.merge.rdx should not be created here, instead it should be
7461 // modeled in VPlan.
7462 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7463 // Create a phi node that merges control-flow from the backedge-taken check
7464 // block and the middle block.
7465 auto *BCBlockPhi =
7466 PHINode::Create(Ty: FinalValue->getType(), NumReservedValues: 2, NameStr: "bc.merge.rdx",
7467 InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator());
7468
7469 // If we are fixing reductions in the epilogue loop then we should already
7470 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7471 // we carry over the incoming values correctly.
7472 for (auto *Incoming : predecessors(BB: LoopScalarPreHeader)) {
7473 if (Incoming == LoopMiddleBlock)
7474 BCBlockPhi->addIncoming(V: FinalValue, BB: Incoming);
7475 else if (ResumePhi && is_contained(Range: ResumePhi->blocks(), Element: Incoming))
7476 BCBlockPhi->addIncoming(V: ResumePhi->getIncomingValueForBlock(BB: Incoming),
7477 BB: Incoming);
7478 else
7479 BCBlockPhi->addIncoming(V: ReductionStartValue, BB: Incoming);
7480 }
7481
7482 auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
7483 // TODO: This fixup should instead be modeled in VPlan.
7484 // Fix the scalar loop reduction variable with the incoming reduction sum
7485 // from the vector body and from the backedge value.
7486 int IncomingEdgeBlockIdx =
7487 OrigPhi->getBasicBlockIndex(BB: OrigLoop->getLoopLatch());
7488 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7489 // Pick the other block.
7490 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7491 OrigPhi->setIncomingValue(i: SelfEdgeBlockIdx, V: BCBlockPhi);
7492 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7493 OrigPhi->setIncomingValue(i: IncomingEdgeBlockIdx, V: LoopExitInst);
7494
7495 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7496}
7497
7498std::pair<DenseMap<const SCEV *, Value *>,
7499 DenseMap<const RecurrenceDescriptor *, Value *>>
7500LoopVectorizationPlanner::executePlan(
7501 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7502 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7503 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7504 assert(BestVPlan.hasVF(BestVF) &&
7505 "Trying to execute plan with unsupported VF");
7506 assert(BestVPlan.hasUF(BestUF) &&
7507 "Trying to execute plan with unsupported UF");
7508 assert(
7509 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7510 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7511
7512 if (!IsEpilogueVectorization)
7513 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7514
7515 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7516 << ", UF=" << BestUF << '\n');
7517 BestVPlan.setName("Final VPlan");
7518 LLVM_DEBUG(BestVPlan.dump());
7519
7520 // Perform the actual loop transformation.
7521 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7522 OrigLoop->getHeader()->getContext());
7523
7524 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7525 // before making any changes to the CFG.
7526 if (!BestVPlan.getPreheader()->empty()) {
7527 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7528 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7529 BestVPlan.getPreheader()->execute(State: &State);
7530 }
7531 if (!ILV.getTripCount())
7532 ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Instance: {0, 0}));
7533 else
7534 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7535 "count during epilogue vectorization");
7536
7537 // 1. Set up the skeleton for vectorization, including vector pre-header and
7538 // middle block. The vector loop is created during VPlan execution.
7539 Value *CanonicalIVStartValue;
7540 std::tie(args&: State.CFG.PrevBB, args&: CanonicalIVStartValue) =
7541 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs: ExpandedSCEVs ? *ExpandedSCEVs
7542 : State.ExpandedSCEVs);
7543
7544 // Only use noalias metadata when using memory checks guaranteeing no overlap
7545 // across all iterations.
7546 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7547 std::unique_ptr<LoopVersioning> LVer = nullptr;
7548 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7549 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7550
7551 // We currently don't use LoopVersioning for the actual loop cloning but we
7552 // still use it to add the noalias metadata.
7553 // TODO: Find a better way to re-use LoopVersioning functionality to add
7554 // metadata.
7555 LVer = std::make_unique<LoopVersioning>(
7556 args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop, args&: LI, args&: DT,
7557 args: PSE.getSE());
7558 State.LVer = &*LVer;
7559 State.LVer->prepareNoAliasMetadata();
7560 }
7561
7562 ILV.printDebugTracesAtStart();
7563
7564 //===------------------------------------------------===//
7565 //
  // Notice: any optimization or new instruction that goes
7567 // into the code below should also be implemented in
7568 // the cost-model.
7569 //
7570 //===------------------------------------------------===//
7571
7572 // 2. Copy and widen instructions from the old loop into the new loop.
7573 BestVPlan.prepareToExecute(TripCount: ILV.getTripCount(),
7574 VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: nullptr),
7575 CanonicalIVStartValue, State);
7576
7577 BestVPlan.execute(State: &State);
7578
7579 // 2.5 Collect reduction resume values.
7580 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7581 auto *ExitVPBB =
7582 cast<VPBasicBlock>(Val: BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7583 for (VPRecipeBase &R : *ExitVPBB) {
7584 createAndCollectMergePhiForReduction(RedResult: dyn_cast<VPInstruction>(Val: &R),
7585 ReductionResumeValues, State, OrigLoop,
7586 LoopMiddleBlock: State.CFG.VPBB2IRBB[ExitVPBB]);
7587 }
7588
7589 // 2.6. Maintain Loop Hints
7590 // Keep all loop hints from the original loop on the vector loop (we'll
7591 // replace the vectorizer-specific hints below).
7592 MDNode *OrigLoopID = OrigLoop->getLoopID();
7593
7594 std::optional<MDNode *> VectorizedLoopID =
7595 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
7596 LLVMLoopVectorizeFollowupVectorized});
7597
7598 VPBasicBlock *HeaderVPBB =
7599 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7600 Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]);
7601 if (VectorizedLoopID)
7602 L->setLoopID(*VectorizedLoopID);
7603 else {
7604 // Keep all loop hints from the original loop on the vector loop (we'll
7605 // replace the vectorizer-specific hints below).
7606 if (MDNode *LID = OrigLoop->getLoopID())
7607 L->setLoopID(LID);
7608
7609 LoopVectorizeHints Hints(L, true, *ORE);
7610 Hints.setAlreadyVectorized();
7611 }
7612 TargetTransformInfo::UnrollingPreferences UP;
7613 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7614 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7615 AddRuntimeUnrollDisableMetaData(L);
7616
7617 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7618 // predication, updating analyses.
7619 ILV.fixVectorizedLoop(State, Plan&: BestVPlan);
7620
7621 ILV.printDebugTracesAtEnd();
7622
7623 return {State.ExpandedSCEVs, ReductionResumeValues};
7624}
7625
7626#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7627void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7628 for (const auto &Plan : VPlans)
7629 if (PrintVPlansInDotFormat)
7630 Plan->printDOT(O);
7631 else
7632 Plan->print(O);
7633}
7634#endif
7635
7636//===--------------------------------------------------------------------===//
7637// EpilogueVectorizerMainLoop
7638//===--------------------------------------------------------------------===//
7639
7640/// This function is partially responsible for generating the control flow
7641/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7642std::pair<BasicBlock *, Value *>
7643EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7644 const SCEV2ValueTy &ExpandedSCEVs) {
7645 createVectorLoopSkeleton(Prefix: "");
7646
7647 // Generate the code to check the minimum iteration count of the vector
7648 // epilogue (see below).
7649 EPI.EpilogueIterationCountCheck =
7650 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
7651 EPI.EpilogueIterationCountCheck->setName("iter.check");
7652
7653 // Generate the code to check any assumptions that we've made for SCEV
7654 // expressions.
7655 EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader);
7656
7657 // Generate the code that checks at runtime if arrays overlap. We put the
7658 // checks into a separate block to make the more common case of few elements
7659 // faster.
7660 EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
7661
7662 // Generate the iteration count check for the main loop, *after* the check
7663 // for the epilogue loop, so that the path-length is shorter for the case
7664 // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for by the gain from vectorizing the larger
7666 // trip count. Note: the branch will get updated later on when we vectorize
7667 // the epilogue.
7668 EPI.MainLoopIterationCountCheck =
7669 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);
7670
7671 // Generate the induction variable.
7672 EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
7673
  // Skip creating induction resume values here; they are created in the
  // second pass, for the scalar loop. The induction resume values for the
7676 // inductions in the epilogue loop are created before executing the plan for
7677 // the epilogue loop.
7678
7679 return {completeLoopSkeleton(), nullptr};
7680}
7681
7682void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7683 LLVM_DEBUG({
7684 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7685 << "Main Loop VF:" << EPI.MainLoopVF
7686 << ", Main Loop UF:" << EPI.MainLoopUF
7687 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7688 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7689 });
7690}
7691
7692void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7693 DEBUG_WITH_TYPE(VerboseDebug, {
7694 dbgs() << "intermediate fn:\n"
7695 << *OrigLoop->getHeader()->getParent() << "\n";
7696 });
7697}
7698
7699BasicBlock *
7700EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7701 bool ForEpilogue) {
7702 assert(Bypass && "Expected valid bypass basic block.");
7703 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7704 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7705 Value *Count = getTripCount();
7706 // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7708 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7709 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7710
  // Generate code to check if the loop's trip count is less than VF * UF of
  // the main vector loop.
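  // When a scalar epilogue is required, ULE is used so that the vector loop
  // is only entered when at least one iteration is left over for the scalar
  // remainder; otherwise ULT suffices.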
7713 auto P = Cost->requiresScalarEpilogue(IsVectorizing: ForEpilogue ? EPI.EpilogueVF.isVector()
7714 : VF.isVector())
7715 ? ICmpInst::ICMP_ULE
7716 : ICmpInst::ICMP_ULT;
7717
7718 Value *CheckMinIters = Builder.CreateICmp(
7719 P, LHS: Count, RHS: createStepForVF(B&: Builder, Ty: Count->getType(), VF: VFactor, Step: UFactor),
7720 Name: "min.iters.check");
7721
7722 if (!ForEpilogue)
7723 TCCheckBlock->setName("vector.main.loop.iter.check");
7724
7725 // Create new preheader for vector loop.
7726 LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
7727 DT, LI, MSSAU: nullptr, BBName: "vector.ph");
7728
7729 if (ForEpilogue) {
7730 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7731 DT->getNode(Bypass)->getIDom()) &&
7732 "TC check is expected to dominate Bypass");
7733
7734 // Update dominator for Bypass & LoopExit.
7735 DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
7736 if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
7737 // For loops with multiple exits, there's no edge from the middle block
7738 // to exit blocks (as the epilogue must run) and thus no need to update
7739 // the immediate dominator of the exit blocks.
7740 DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);
7741
7742 LoopBypassBlocks.push_back(Elt: TCCheckBlock);
7743
7744 // Save the trip count so we don't have to regenerate it in the
7745 // vec.epilog.iter.check. This is safe to do because the trip count
7746 // generated here dominates the vector epilog iter check.
7747 EPI.TripCount = Count;
7748 }
7749
7750 BranchInst &BI =
7751 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7752 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
7753 setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
7754 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
7755
7756 return TCCheckBlock;
7757}
7758
7759//===--------------------------------------------------------------------===//
7760// EpilogueVectorizerEpilogueLoop
7761//===--------------------------------------------------------------------===//
7762
7763/// This function is partially responsible for generating the control flow
7764/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7765std::pair<BasicBlock *, Value *>
7766EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7767 const SCEV2ValueTy &ExpandedSCEVs) {
7768 createVectorLoopSkeleton(Prefix: "vec.epilog.");
7769
7770 // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7772 BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7773 VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7774 LoopVectorPreHeader =
7775 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
7776 LI, MSSAU: nullptr, BBName: "vec.epilog.ph");
7777 emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
7778 Insert: VecEpilogueIterationCountCheck);
7779
7780 // Adjust the control flow taking the state info from the main loop
7781 // vectorization into account.
7782 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7783 "expected this to be saved from the previous pass.");
7784 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7785 From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);
7786
7787 DT->changeImmediateDominator(BB: LoopVectorPreHeader,
7788 NewBB: EPI.MainLoopIterationCountCheck);
7789
7790 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7791 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7792
7793 if (EPI.SCEVSafetyCheck)
7794 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7795 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7796 if (EPI.MemSafetyCheck)
7797 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7798 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7799
7800 DT->changeImmediateDominator(
7801 BB: VecEpilogueIterationCountCheck,
7802 NewBB: VecEpilogueIterationCountCheck->getSinglePredecessor());
7803
7804 DT->changeImmediateDominator(BB: LoopScalarPreHeader,
7805 NewBB: EPI.EpilogueIterationCountCheck);
7806 if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
7807 // If there is an epilogue which must run, there's no edge from the
7808 // middle block to exit blocks and thus no need to update the immediate
7809 // dominator of the exit blocks.
7810 DT->changeImmediateDominator(BB: LoopExitBlock,
7811 NewBB: EPI.EpilogueIterationCountCheck);
7812
7813 // Keep track of bypass blocks, as they feed start values to the induction and
7814 // reduction phis in the scalar loop preheader.
7815 if (EPI.SCEVSafetyCheck)
7816 LoopBypassBlocks.push_back(Elt: EPI.SCEVSafetyCheck);
7817 if (EPI.MemSafetyCheck)
7818 LoopBypassBlocks.push_back(Elt: EPI.MemSafetyCheck);
7819 LoopBypassBlocks.push_back(Elt: EPI.EpilogueIterationCountCheck);
7820
7821 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7822 // reductions which merge control-flow from the latch block and the middle
7823 // block. Update the incoming values here and move the Phi into the preheader.
7824 SmallVector<PHINode *, 4> PhisInBlock;
7825 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7826 PhisInBlock.push_back(Elt: &Phi);
7827
7828 for (PHINode *Phi : PhisInBlock) {
7829 Phi->moveBefore(MovePos: LoopVectorPreHeader->getFirstNonPHI());
7830 Phi->replaceIncomingBlockWith(
7831 Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
7832 New: VecEpilogueIterationCountCheck);
7833
7834 // If the phi doesn't have an incoming value from the
7835 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7836 // value and also those from other check blocks. This is needed for
7837 // reduction phis only.
7838 if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
7839 return EPI.EpilogueIterationCountCheck == IncB;
7840 }))
7841 continue;
7842 Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
7843 if (EPI.SCEVSafetyCheck)
7844 Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck);
7845 if (EPI.MemSafetyCheck)
7846 Phi->removeIncomingValue(BB: EPI.MemSafetyCheck);
7847 }
7848
7849 // Generate a resume induction for the vector epilogue and put it in the
7850 // vector epilogue preheader
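  // As an illustration of the two incoming values set up below: when control
  // reaches the epilogue after the main vector loop has executed, the epilogue
  // induction resumes at the main loop's vector trip count; when the main
  // vector loop was bypassed at its own iteration-count check, it resumes at 0.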
7851 Type *IdxTy = Legal->getWidestInductionType();
7852 PHINode *EPResumeVal = PHINode::Create(Ty: IdxTy, NumReservedValues: 2, NameStr: "vec.epilog.resume.val");
7853 EPResumeVal->insertBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
7854 EPResumeVal->addIncoming(V: EPI.VectorTripCount, BB: VecEpilogueIterationCountCheck);
7855 EPResumeVal->addIncoming(V: ConstantInt::get(Ty: IdxTy, V: 0),
7856 BB: EPI.MainLoopIterationCountCheck);
7857
7858 // Generate induction resume values. These variables save the new starting
7859 // indexes for the scalar loop. They are used to test if there are any tail
7860 // iterations left once the vector loop has completed.
7861 // Note that when the vectorized epilogue is skipped due to its iteration
7862 // count check, the resume value for the induction variable comes from
7863 // the trip count of the main vector loop, hence passing the AdditionalBypass
7864 // argument.
7865 createInductionResumeValues(ExpandedSCEVs,
7866 AdditionalBypass: {VecEpilogueIterationCountCheck,
7867 EPI.VectorTripCount} /* AdditionalBypass */);
7868
7869 return {completeLoopSkeleton(), EPResumeVal};
7870}
7871
7872BasicBlock *
7873EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7874 BasicBlock *Bypass, BasicBlock *Insert) {
7875
7876 assert(EPI.TripCount &&
7877 "Expected trip count to have been saved in the first pass.");
7878 assert(
7879 (!isa<Instruction>(EPI.TripCount) ||
7880 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7881 "saved trip count does not dominate insertion point.");
7882 Value *TC = EPI.TripCount;
7883 IRBuilder<> Builder(Insert->getTerminator());
7884 Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining");
7885
7886 // Generate code to check if the remaining iteration count is less than
7887 // VF * UF of the vector epilogue loop.
7888 auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())
7889 ? ICmpInst::ICMP_ULE
7890 : ICmpInst::ICMP_ULT;
7891
7892 Value *CheckMinIters =
7893 Builder.CreateICmp(P, LHS: Count,
7894 RHS: createStepForVF(B&: Builder, Ty: Count->getType(),
7895 VF: EPI.EpilogueVF, Step: EPI.EpilogueUF),
7896 Name: "min.epilog.iters.check");
7897
7898 BranchInst &BI =
7899 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7900 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
7901 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7902 unsigned EpilogueLoopStep =
7903 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7904 // We assume the remaining `Count` is uniformly distributed in
7905 // [0, MainLoopStep), so the probability of `Count < EpilogueLoopStep`
7906 // is min(MainLoopStep, EpilogueLoopStep) / MainLoopStep.
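    // As a purely illustrative example (numbers not taken from any target):
    // with a main loop of VF=8, UF=2 (MainLoopStep = 16) and an epilogue of
    // VF=4, UF=1 (EpilogueLoopStep = 4), EstimatedSkipCount = min(16, 4) = 4
    // and the weights below become {4, 12}, i.e. a 4/16 chance of taking the
    // bypass to the scalar loop.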
7908 unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
7909 const uint32_t Weights[] = {EstimatedSkipCount,
7910 MainLoopStep - EstimatedSkipCount};
7911 setBranchWeights(I&: BI, Weights);
7912 }
7913 ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI);
7914
7915 LoopBypassBlocks.push_back(Elt: Insert);
7916 return Insert;
7917}
7918
7919void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7920 LLVM_DEBUG({
7921 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7922 << "Epilogue Loop VF:" << EPI.EpilogueVF
7923 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7924 });
7925}
7926
7927void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7928 DEBUG_WITH_TYPE(VerboseDebug, {
7929 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7930 });
7931}
7932
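// Illustrative sketch of how getDecisionAndClampRange behaves (VF values are
// hypothetical): given Range = [4, 32) and a Predicate that holds for VF=4 and
// VF=8 but not for VF=16, the range is clamped to [4, 16) and the predicate's
// value at the range start (true) is returned.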
7933bool LoopVectorizationPlanner::getDecisionAndClampRange(
7934 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7935 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7936 bool PredicateAtRangeStart = Predicate(Range.Start);
7937
7938 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7939 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7940 Range.End = TmpVF;
7941 break;
7942 }
7943
7944 return PredicateAtRangeStart;
7945}
7946
7947/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7948/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7949/// of VF's starting at a given VF and extending it as much as possible. Each
7950/// vectorization decision can potentially shorten this sub-range during
7951/// buildVPlan().
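/// For illustration (the decisions are hypothetical): with \p MinVF = 2 and
/// \p MaxVF = 8, the first sub-range is [2, 16); if some widening decision
/// changes at VF = 8, buildVPlan() clamps the sub-range so its VPlan covers
/// {2, 4}, and the next iteration starts a new sub-range (and VPlan) at VF = 8.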
7952void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7953 ElementCount MaxVF) {
7954 auto MaxVFTimes2 = MaxVF * 2;
7955 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
7956 VFRange SubRange = {VF, MaxVFTimes2};
7957 VPlans.push_back(Elt: buildVPlan(Range&: SubRange));
7958 VF = SubRange.End;
7959 }
7960}
7961
7962iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7963VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
7964 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7965 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
7966 if (auto *R = Ingredient2Recipe.lookup(Val: I))
7967 return R->getVPSingleValue();
7968 }
7969 return Plan.getOrAddLiveIn(V: Op);
7970 };
7971 return map_range(C&: Operands, F: Fn);
7972}
7973
7974VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
7975 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7976
7977 // Look for cached value.
7978 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7979 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
7980 if (ECEntryIt != EdgeMaskCache.end())
7981 return ECEntryIt->second;
7982
7983 VPValue *SrcMask = getBlockInMask(BB: Src);
7984
7985 // The terminator has to be a branch inst!
7986 BranchInst *BI = dyn_cast<BranchInst>(Val: Src->getTerminator());
7987 assert(BI && "Unexpected terminator found");
7988
7989 if (!BI->isConditional() || BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1))
7990 return EdgeMaskCache[Edge] = SrcMask;
7991
7992 // If source is an exiting block, we know the exit edge is dynamically dead
7993 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7994 // adding uses of an otherwise potentially dead instruction.
7995 if (OrigLoop->isLoopExiting(BB: Src))
7996 return EdgeMaskCache[Edge] = SrcMask;
7997
7998 VPValue *EdgeMask = getVPValueOrAddLiveIn(V: BI->getCondition(), Plan);
7999 assert(EdgeMask && "No Edge Mask found for condition");
8000
8001 if (BI->getSuccessor(i: 0) != Dst)
8002 EdgeMask = Builder.createNot(Operand: EdgeMask, DL: BI->getDebugLoc());
8003
8004 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8005 // The condition is 'SrcMask && EdgeMask', which is equivalent to
8006 // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8007 // The select version does not introduce new UB if SrcMask is false and
8008 // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8009 VPValue *False = Plan.getOrAddLiveIn(
8010 V: ConstantInt::getFalse(Ty: BI->getCondition()->getType()));
8011 EdgeMask =
8012 Builder.createSelect(Cond: SrcMask, TrueVal: EdgeMask, FalseVal: False, DL: BI->getDebugLoc());
8013 }
8014
8015 return EdgeMaskCache[Edge] = EdgeMask;
8016}
8017
8018VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
8019 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8020
8021 // Look for cached value.
8022 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8023 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
8024 assert(ECEntryIt != EdgeMaskCache.end() &&
8025 "looking up mask for edge which has not been created");
8026 return ECEntryIt->second;
8027}
8028
8029void VPRecipeBuilder::createHeaderMask() {
8030 BasicBlock *Header = OrigLoop->getHeader();
8031
8032 // When not folding the tail, use nullptr to model all-true mask.
8033 if (!CM.foldTailByMasking()) {
8034 BlockMaskCache[Header] = nullptr;
8035 return;
8036 }
8037
8038 // Introduce the early-exit compare IV <= BTC to form the header block mask.
8039 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8040 // constructing the desired canonical IV in the header block as its first
8041 // non-phi instruction.
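  // As a conceptual example: with an 8-bit counter and a trip count of 256, TC
  // wraps to 0 (so 'IV < TC' would never hold), while BTC is 255 and
  // 'IV <= BTC' still forms the correct mask.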
8042
8043 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
8044 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8045 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
8046 HeaderVPBB->insert(Recipe: IV, InsertPt: NewInsertionPoint);
8047
8048 VPBuilder::InsertPointGuard Guard(Builder);
8049 Builder.setInsertPoint(TheBB: HeaderVPBB, IP: NewInsertionPoint);
8050 VPValue *BlockMask = nullptr;
8051 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8052 BlockMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC);
8053 BlockMaskCache[Header] = BlockMask;
8054}
8055
8056VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
8057 // Return the cached value.
8058 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(Val: BB);
8059 assert(BCEntryIt != BlockMaskCache.end() &&
8060 "Trying to access mask for block without one.");
8061 return BCEntryIt->second;
8062}
8063
8064void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
8065 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8066 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
8067 assert(OrigLoop->getHeader() != BB &&
8068 "Loop header must have cached block mask");
8069
8070 // All-one mask is modelled as no-mask following the convention for masked
8071 // load/store/gather/scatter. Initialize BlockMask to no-mask.
8072 VPValue *BlockMask = nullptr;
8073 // This is the block mask. We OR all incoming edges.
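  // For instance, a block with two predecessors P0 and P1 receives the mask
  // EdgeMask(P0 -> BB) | EdgeMask(P1 -> BB); if any incoming edge mask is null
  // (i.e. all-true), the block mask is set to null as well.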
8074 for (auto *Predecessor : predecessors(BB)) {
8075 VPValue *EdgeMask = createEdgeMask(Src: Predecessor, Dst: BB);
8076 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
8077 BlockMaskCache[BB] = EdgeMask;
8078 return;
8079 }
8080
8081 if (!BlockMask) { // BlockMask has its initialized nullptr value.
8082 BlockMask = EdgeMask;
8083 continue;
8084 }
8085
8086 BlockMask = Builder.createOr(LHS: BlockMask, RHS: EdgeMask, DL: {});
8087 }
8088
8089 BlockMaskCache[BB] = BlockMask;
8090}
8091
8092VPWidenMemoryRecipe *
8093VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
8094 VFRange &Range) {
8095 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8096 "Must be called with either a load or store");
8097
8098 auto willWiden = [&](ElementCount VF) -> bool {
8099 LoopVectorizationCostModel::InstWidening Decision =
8100 CM.getWideningDecision(I, VF);
8101 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8102 "CM decision should be taken at this point.");
8103 if (Decision == LoopVectorizationCostModel::CM_Interleave)
8104 return true;
8105 if (CM.isScalarAfterVectorization(I, VF) ||
8106 CM.isProfitableToScalarize(I, VF))
8107 return false;
8108 return Decision != LoopVectorizationCostModel::CM_Scalarize;
8109 };
8110
8111 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: willWiden, Range))
8112 return nullptr;
8113
8114 VPValue *Mask = nullptr;
8115 if (Legal->isMaskRequired(I))
8116 Mask = getBlockInMask(BB: I->getParent());
8117
8118 // Determine if the pointer operand of the access is either consecutive or
8119 // reverse consecutive.
8120 LoopVectorizationCostModel::InstWidening Decision =
8121 CM.getWideningDecision(I, VF: Range.Start);
8122 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8123 bool Consecutive =
8124 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8125
8126 VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
8127 if (Consecutive) {
8128 auto *GEP = dyn_cast<GetElementPtrInst>(
8129 Val: Ptr->getUnderlyingValue()->stripPointerCasts());
8130 auto *VectorPtr = new VPVectorPointerRecipe(
8131 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
8132 I->getDebugLoc());
8133 Builder.getInsertBlock()->appendRecipe(Recipe: VectorPtr);
8134 Ptr = VectorPtr;
8135 }
8136 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
8137 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
8138 I->getDebugLoc());
8139
8140 StoreInst *Store = cast<StoreInst>(Val: I);
8141 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
8142 Reverse, I->getDebugLoc());
8143}
8144
8145/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8146/// insert a recipe to expand the step for the induction recipe.
8147static VPWidenIntOrFpInductionRecipe *
8148createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
8149 VPValue *Start, const InductionDescriptor &IndDesc,
8150 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
8151 VFRange &Range) {
8152 assert(IndDesc.getStartValue() ==
8153 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8154 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8155 "step must be loop invariant");
8156
8157 VPValue *Step =
8158 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
8159 if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
8160 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
8161 }
8162 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8163 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
8164}
8165
8166VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8167 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
8168
8169 // Check if this is an integer or fp induction. If so, build the recipe that
8170 // produces its scalar and vector values.
8171 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8172 return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
8173 SE&: *PSE.getSE(), OrigLoop&: *OrigLoop, Range);
8174
8175 // Check if this is pointer induction. If so, build the recipe for it.
8176 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
8177 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
8178 SE&: *PSE.getSE());
8179 return new VPWidenPointerInductionRecipe(
8180 Phi, Operands[0], Step, *II,
8181 LoopVectorizationPlanner::getDecisionAndClampRange(
8182 Predicate: [&](ElementCount VF) {
8183 return CM.isScalarAfterVectorization(I: Phi, VF);
8184 },
8185 Range));
8186 }
8187 return nullptr;
8188}
8189
8190VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8191 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
8192 // Optimize the special case where the source is a constant integer
8193 // induction variable. Notice that we can only optimize the 'trunc' case
8194 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8195 // (c) other casts depend on pointer size.
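  // For example (illustrative only), a 'trunc i64 %iv to i32' of a 64-bit
  // induction can be widened as an induction computed directly in i32 instead
  // of producing the wide induction followed by a truncate.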
8196
8197 // Determine whether \p K is a truncation based on an induction variable that
8198 // can be optimized.
8199 auto isOptimizableIVTruncate =
8200 [&](Instruction *K) -> std::function<bool(ElementCount)> {
8201 return [=](ElementCount VF) -> bool {
8202 return CM.isOptimizableIVTruncate(I: K, VF);
8203 };
8204 };
8205
8206 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8207 Predicate: isOptimizableIVTruncate(I), Range)) {
8208
8209 auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
8210 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8211 VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
8212 return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
8213 OrigLoop&: *OrigLoop, Range);
8214 }
8215 return nullptr;
8216}
8217
8218VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
8219 ArrayRef<VPValue *> Operands) {
8220 unsigned NumIncoming = Phi->getNumIncomingValues();
8221
8222 // We know that all PHIs in non-header blocks are converted into selects, so
8223 // we don't have to worry about the insertion order and we can just use the
8224 // builder. At this point we generate the predication tree. There may be
8225 // duplications since this is a simple recursive scan, but future
8226 // optimizations will clean it up.
8227 // TODO: At the moment the first mask is always skipped, but it would be
8228 // better to skip the most expensive mask.
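  // As a sketch of the operand list built below: a phi with incoming values
  // (v0, v1, v2) and edge masks (m0, m1, m2) yields a blend with operands
  // {v0, v1, m1, v2, m2}; m0 is the mask that gets skipped.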
8229 SmallVector<VPValue *, 2> OperandsWithMask;
8230
8231 for (unsigned In = 0; In < NumIncoming; In++) {
8232 OperandsWithMask.push_back(Elt: Operands[In]);
8233 VPValue *EdgeMask =
8234 getEdgeMask(Src: Phi->getIncomingBlock(i: In), Dst: Phi->getParent());
8235 if (!EdgeMask) {
8236 assert(In == 0 && "Both null and non-null edge masks found");
8237 assert(all_equal(Operands) &&
8238 "Distinct incoming values with one having a full mask");
8239 break;
8240 }
8241 if (In == 0)
8242 continue;
8243 OperandsWithMask.push_back(Elt: EdgeMask);
8244 }
8245 return new VPBlendRecipe(Phi, OperandsWithMask);
8246}
8247
8248VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8249 ArrayRef<VPValue *> Operands,
8250 VFRange &Range) {
8251 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8252 Predicate: [this, CI](ElementCount VF) {
8253 return CM.isScalarWithPredication(I: CI, VF);
8254 },
8255 Range);
8256
8257 if (IsPredicated)
8258 return nullptr;
8259
8260 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8261 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8262 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8263 ID == Intrinsic::pseudoprobe ||
8264 ID == Intrinsic::experimental_noalias_scope_decl))
8265 return nullptr;
8266
8267 SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));
8268
8269 // Is it beneficial to perform intrinsic call compared to lib call?
8270 bool ShouldUseVectorIntrinsic =
8271 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8272 Predicate: [&](ElementCount VF) -> bool {
8273 return CM.getCallWideningDecision(CI, VF).Kind ==
8274 LoopVectorizationCostModel::CM_IntrinsicCall;
8275 },
8276 Range);
8277 if (ShouldUseVectorIntrinsic)
8278 return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()), ID,
8279 CI->getDebugLoc());
8280
8281 Function *Variant = nullptr;
8282 std::optional<unsigned> MaskPos;
8283 // Is it better to call a vectorized version of the function than to scalarize
8284 // the call?
8285 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8286 Predicate: [&](ElementCount VF) -> bool {
8287 // The following case may be scalarized depending on the VF.
8288 // The flag shows whether we can use a usual call for the vectorized
8289 // version of the instruction.
8290
8291 // If we've found a variant at a previous VF, then stop looking. A
8292 // vectorized variant of a function expects input in a certain shape
8293 // -- basically the number of input registers, the number of lanes
8294 // per register, and whether there's a mask required.
8295 // We store a pointer to the variant in the VPWidenCallRecipe, so
8296 // once we have an appropriate variant it's only valid for that VF.
8297 // This will force a different vplan to be generated for each VF that
8298 // finds a valid variant.
8299 if (Variant)
8300 return false;
8301 LoopVectorizationCostModel::CallWideningDecision Decision =
8302 CM.getCallWideningDecision(CI, VF);
8303 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8304 Variant = Decision.Variant;
8305 MaskPos = Decision.MaskPos;
8306 return true;
8307 }
8308
8309 return false;
8310 },
8311 Range);
8312 if (ShouldUseVectorCall) {
8313 if (MaskPos.has_value()) {
8314 // We have 2 cases that would require a mask:
8315 // 1) The block needs to be predicated, either due to a conditional
8316 // in the scalar loop or use of an active lane mask with
8317 // tail-folding, and we use the appropriate mask for the block.
8318 // 2) No mask is required for the block, but the only available
8319 // vector variant at this VF requires a mask, so we synthesize an
8320 // all-true mask.
8321 VPValue *Mask = nullptr;
8322 if (Legal->isMaskRequired(I: CI))
8323 Mask = getBlockInMask(BB: CI->getParent());
8324 else
8325 Mask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(
8326 Ty: IntegerType::getInt1Ty(C&: Variant->getFunctionType()->getContext())));
8327
8328 Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
8329 }
8330
8331 return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()),
8332 Intrinsic::not_intrinsic, CI->getDebugLoc(),
8333 Variant);
8334 }
8335
8336 return nullptr;
8337}
8338
8339bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8340 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8341 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8342 // Instruction should be widened, unless it is scalar after vectorization,
8343 // scalarization is profitable or it is predicated.
8344 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8345 return CM.isScalarAfterVectorization(I, VF) ||
8346 CM.isProfitableToScalarize(I, VF) ||
8347 CM.isScalarWithPredication(I, VF);
8348 };
8349 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
8350 Range);
8351}
8352
8353VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8354 ArrayRef<VPValue *> Operands,
8355 VPBasicBlock *VPBB) {
8356 switch (I->getOpcode()) {
8357 default:
8358 return nullptr;
8359 case Instruction::SDiv:
8360 case Instruction::UDiv:
8361 case Instruction::SRem:
8362 case Instruction::URem: {
8363 // If not provably safe, use a select to form a safe divisor before widening the
8364 // div/rem operation itself. Otherwise fall through to general handling below.
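    // Sketch: for a predicated 'udiv %a, %b' under block mask %m, this emits
    // '%safe.rhs = select %m, %b, 1' and widens 'udiv %a, %safe.rhs' instead,
    // so masked-off lanes never divide by a potentially zero divisor.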
8365 if (CM.isPredicatedInst(I)) {
8366 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8367 VPValue *Mask = getBlockInMask(BB: I->getParent());
8368 VPValue *One =
8369 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
8370 auto *SafeRHS =
8371 new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
8372 I->getDebugLoc());
8373 VPBB->appendRecipe(Recipe: SafeRHS);
8374 Ops[1] = SafeRHS;
8375 return new VPWidenRecipe(*I, make_range(x: Ops.begin(), y: Ops.end()));
8376 }
8377 [[fallthrough]];
8378 }
8379 case Instruction::Add:
8380 case Instruction::And:
8381 case Instruction::AShr:
8382 case Instruction::FAdd:
8383 case Instruction::FCmp:
8384 case Instruction::FDiv:
8385 case Instruction::FMul:
8386 case Instruction::FNeg:
8387 case Instruction::FRem:
8388 case Instruction::FSub:
8389 case Instruction::ICmp:
8390 case Instruction::LShr:
8391 case Instruction::Mul:
8392 case Instruction::Or:
8393 case Instruction::Select:
8394 case Instruction::Shl:
8395 case Instruction::Sub:
8396 case Instruction::Xor:
8397 case Instruction::Freeze:
8398 return new VPWidenRecipe(*I, make_range(x: Operands.begin(), y: Operands.end()));
8399 };
8400}
8401
8402void VPRecipeBuilder::fixHeaderPhis() {
8403 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8404 for (VPHeaderPHIRecipe *R : PhisToFix) {
8405 auto *PN = cast<PHINode>(Val: R->getUnderlyingValue());
8406 VPRecipeBase *IncR =
8407 getRecipe(I: cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: OrigLatch)));
8408 R->addOperand(Operand: IncR->getVPSingleValue());
8409 }
8410}
8411
8412VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8413 VFRange &Range) {
8414 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8415 Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8416 Range);
8417
8418 bool IsPredicated = CM.isPredicatedInst(I);
8419
8420 // Even if the instruction is not marked as uniform, there are certain
8421 // intrinsic calls that can be effectively treated as such, so we check for
8422 // them here. Conservatively, we only do this for scalable vectors, since
8423 // for fixed-width VFs we can always fall back on full scalarization.
8424 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
8425 switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
8426 case Intrinsic::assume:
8427 case Intrinsic::lifetime_start:
8428 case Intrinsic::lifetime_end:
8429 // For scalable vectors, if one of the operands is variant then we still
8430 // want to mark the call as uniform, which will generate one instruction for
8431 // just the first lane of the vector. We can't scalarize the call in the same
8432 // way as for fixed-width vectors because we don't know how many lanes
8433 // there are.
8434 //
8435 // The reasons for doing it this way for scalable vectors are:
8436 // 1. For the assume intrinsic, generating the instruction for the first
8437 // lane is still better than not generating any at all. For
8438 // example, the input may be a splat across all lanes.
8439 // 2. For the lifetime start/end intrinsics the pointer operand only
8440 // does anything useful when the input comes from a stack object,
8441 // which suggests it should always be uniform. For non-stack objects
8442 // the effect is to poison the object, which still allows us to
8443 // remove the call.
8444 IsUniform = true;
8445 break;
8446 default:
8447 break;
8448 }
8449 }
8450 VPValue *BlockInMask = nullptr;
8451 if (!IsPredicated) {
8452 // Finalize the recipe for Instr, first if it is not predicated.
8453 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8454 } else {
8455 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8456 // Instructions marked for predication are replicated and a mask operand is
8457 // added initially. Masked replicate recipes will later be placed under an
8458 // if-then construct to prevent side-effects. Generate recipes to compute
8459 // the block mask for this region.
8460 BlockInMask = getBlockInMask(BB: I->getParent());
8461 }
8462
8463 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(Operands: I->operands()),
8464 IsUniform, BlockInMask);
8465 return Recipe;
8466}
8467
8468VPRecipeBase *
8469VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8470 ArrayRef<VPValue *> Operands,
8471 VFRange &Range, VPBasicBlock *VPBB) {
8472 // First, check for specific widening recipes that deal with inductions, Phi
8473 // nodes, calls and memory operations.
8474 VPRecipeBase *Recipe;
8475 if (auto Phi = dyn_cast<PHINode>(Val: Instr)) {
8476 if (Phi->getParent() != OrigLoop->getHeader())
8477 return tryToBlend(Phi, Operands);
8478
8479 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8480 return Recipe;
8481
8482 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8483 assert((Legal->isReductionVariable(Phi) ||
8484 Legal->isFixedOrderRecurrence(Phi)) &&
8485 "can only widen reductions and fixed-order recurrences here");
8486 VPValue *StartV = Operands[0];
8487 if (Legal->isReductionVariable(PN: Phi)) {
8488 const RecurrenceDescriptor &RdxDesc =
8489 Legal->getReductionVars().find(Key: Phi)->second;
8490 assert(RdxDesc.getRecurrenceStartValue() ==
8491 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8492 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8493 CM.isInLoopReduction(Phi),
8494 CM.useOrderedReductions(RdxDesc));
8495 } else {
8496 // TODO: Currently fixed-order recurrences are modeled as chains of
8497 // first-order recurrences. If there are no users of the intermediate
8498 // recurrences in the chain, the fixed order recurrence should be modeled
8499 // directly, enabling more efficient codegen.
8500 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8501 }
8502
8503 PhisToFix.push_back(Elt: PhiRecipe);
8504 return PhiRecipe;
8505 }
8506
8507 if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
8508 I: cast<TruncInst>(Val: Instr), Operands, Range)))
8509 return Recipe;
8510
8511 // All widen recipes below deal only with VF > 1.
8512 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8513 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
8514 return nullptr;
8515
8516 if (auto *CI = dyn_cast<CallInst>(Val: Instr))
8517 return tryToWidenCall(CI, Operands, Range);
8518
8519 if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
8520 return tryToWidenMemory(I: Instr, Operands, Range);
8521
8522 if (!shouldWiden(I: Instr, Range))
8523 return nullptr;
8524
8525 if (auto GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
8526 return new VPWidenGEPRecipe(GEP,
8527 make_range(x: Operands.begin(), y: Operands.end()));
8528
8529 if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
8530 return new VPWidenSelectRecipe(
8531 *SI, make_range(x: Operands.begin(), y: Operands.end()));
8532 }
8533
8534 if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
8535 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8536 *CI);
8537 }
8538
8539 return tryToWiden(I: Instr, Operands, VPBB);
8540}
8541
8542void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8543 ElementCount MaxVF) {
8544 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8545
8546 auto MaxVFTimes2 = MaxVF * 2;
8547 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
8548 VFRange SubRange = {VF, MaxVFTimes2};
8549 if (auto Plan = tryToBuildVPlanWithVPRecipes(Range&: SubRange)) {
8550 // Now optimize the initial VPlan.
8551 if (!Plan->hasVF(VF: ElementCount::getFixed(MinVal: 1)))
8552 VPlanTransforms::truncateToMinimalBitwidths(
8553 Plan&: *Plan, MinBWs: CM.getMinimalBitwidths(), Ctx&: PSE.getSE()->getContext());
8554 VPlanTransforms::optimize(Plan&: *Plan, SE&: *PSE.getSE());
8555 // TODO: try to put it close to addActiveLaneMask().
8556 if (CM.foldTailWithEVL())
8557 VPlanTransforms::addExplicitVectorLength(Plan&: *Plan);
8558 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8559 VPlans.push_back(Elt: std::move(Plan));
8560 }
8561 VF = SubRange.End;
8562 }
8563}
8564
8565// Add the necessary canonical IV and branch recipes required to control the
8566// loop.
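// Roughly, in pseudo-VPlan notation (a sketch, not exact printer output):
//   vector.body:  EMIT vp<%cIV>      = CANONICAL-INDUCTION ir<0>, vp<%cIV.next>
//   ...
//   vector.latch: EMIT vp<%cIV.next> = add vp<%cIV>, vp<%VFxUF>
//                 EMIT branch-on-count vp<%cIV.next>, vp<%vector.trip.count>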
8567static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8568 DebugLoc DL) {
8569 Value *StartIdx = ConstantInt::get(Ty: IdxTy, V: 0);
8570 auto *StartV = Plan.getOrAddLiveIn(V: StartIdx);
8571
8572 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8573 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8574 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8575 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8576 Header->insert(Recipe: CanonicalIVPHI, InsertPt: Header->begin());
8577
8578 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8579 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8580 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8581 Opcode: Instruction::Add, Operands: {CanonicalIVPHI, &Plan.getVFxUF()}, WrapFlags: {HasNUW, false}, DL,
8582 Name: "index.next");
8583 CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement);
8584
8585 // Add the BranchOnCount VPInstruction to the latch.
8586 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount,
8587 Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8588}
8589
8590// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8591// original exit block.
8592static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8593 VPRecipeBuilder &Builder, VPlan &Plan) {
8594 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8595 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8596 // Only handle single-exit loops with unique exit blocks for now.
8597 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8598 return;
8599
8600 // Introduce VPUsers modeling the exit values.
8601 for (PHINode &ExitPhi : ExitBB->phis()) {
8602 Value *IncomingValue =
8603 ExitPhi.getIncomingValueForBlock(BB: ExitingBB);
8604 VPValue *V = Builder.getVPValueOrAddLiveIn(V: IncomingValue, Plan);
8605 Plan.addLiveOut(PN: &ExitPhi, V);
8606 }
8607}
8608
8609VPlanPtr
8610LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8611
8612 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8613
8614 // ---------------------------------------------------------------------------
8615 // Build initial VPlan: Scan the body of the loop in a topological order to
8616 // visit each basic block after having visited its predecessor basic blocks.
8617 // ---------------------------------------------------------------------------
8618
8619 // Create initial VPlan skeleton, having a basic block for the pre-header
8620 // which contains SCEV expansions that need to happen before the CFG is
8621 // modified; a basic block for the vector pre-header, followed by a region for
8622 // the vector loop, followed by the middle basic block. The skeleton vector
8623 // loop region contains a header and latch basic blocks.
8624 VPlanPtr Plan = VPlan::createInitialVPlan(
8625 TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop),
8626 PSE&: *PSE.getSE());
8627 VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8628 VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8629 VPBlockUtils::insertBlockAfter(NewBlock: LatchVPBB, BlockPtr: HeaderVPBB);
8630 Plan->getVectorLoopRegion()->setEntry(HeaderVPBB);
8631 Plan->getVectorLoopRegion()->setExiting(LatchVPBB);
8632
8633 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8634 // so it is better for this function to be conservative rather than to
8635 // split it up into different VPlans.
8636 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8637 bool IVUpdateMayOverflow = false;
8638 for (ElementCount VF : Range)
8639 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8640
8641 DebugLoc DL = getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction());
8642 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8643 // When not folding the tail, we know that the induction increment will not
8644 // overflow.
8645 bool HasNUW = Style == TailFoldingStyle::None;
8646 addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, DL);
8647
8648 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8649
8650 // ---------------------------------------------------------------------------
8651 // Pre-construction: record ingredients whose recipes we'll need to further
8652 // process after constructing the initial VPlan.
8653 // ---------------------------------------------------------------------------
8654
8655 // For each interleave group which is relevant for this (possibly trimmed)
8656 // Range, add it to the set of groups to be later applied to the VPlan and add
8657 // placeholders for its members' Recipes which we'll be replacing with a
8658 // single VPInterleaveRecipe.
8659 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8660 auto applyIG = [IG, this](ElementCount VF) -> bool {
8661 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8662 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8663 LoopVectorizationCostModel::CM_Interleave);
8664 // For scalable vectors, the only interleave factor currently supported
8665 // is 2 since we require the (de)interleave2 intrinsics instead of
8666 // shufflevectors.
8667 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8668 "Unsupported interleave factor for scalable vectors");
8669 return Result;
8670 };
8671 if (!getDecisionAndClampRange(Predicate: applyIG, Range))
8672 continue;
8673 InterleaveGroups.insert(Ptr: IG);
8674 }
8675
8676 // ---------------------------------------------------------------------------
8677 // Construct recipes for the instructions in the loop
8678 // ---------------------------------------------------------------------------
8679
8680 // Scan the body of the loop in a topological order to visit each basic block
8681 // after having visited its predecessor basic blocks.
8682 LoopBlocksDFS DFS(OrigLoop);
8683 DFS.perform(LI);
8684
8685 VPBasicBlock *VPBB = HeaderVPBB;
8686 BasicBlock *HeaderBB = OrigLoop->getHeader();
8687 bool NeedsMasks =
8688 CM.foldTailByMasking() ||
8689 any_of(Range: OrigLoop->blocks(), P: [this, HeaderBB](BasicBlock *BB) {
8690 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8691 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8692 });
8693 for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
8694 // Relevant instructions from basic block BB will be grouped into VPRecipe
8695 // ingredients and fill a new VPBasicBlock.
8696 if (VPBB != HeaderVPBB)
8697 VPBB->setName(BB->getName());
8698 Builder.setInsertPoint(VPBB);
8699
8700 if (VPBB == HeaderVPBB)
8701 RecipeBuilder.createHeaderMask();
8702 else if (NeedsMasks)
8703 RecipeBuilder.createBlockInMask(BB);
8704
8705 // Introduce each ingredient into VPlan.
8706 // TODO: Model and preserve debug intrinsics in VPlan.
8707 for (Instruction &I : drop_end(RangeOrContainer: BB->instructionsWithoutDebug(SkipPseudoOp: false))) {
8708 Instruction *Instr = &I;
8709 SmallVector<VPValue *, 4> Operands;
8710 auto *Phi = dyn_cast<PHINode>(Val: Instr);
8711 if (Phi && Phi->getParent() == HeaderBB) {
8712 Operands.push_back(Elt: Plan->getOrAddLiveIn(
8713 V: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopPreheader())));
8714 } else {
8715 auto OpRange = RecipeBuilder.mapToVPValues(Operands: Instr->operands());
8716 Operands = {OpRange.begin(), OpRange.end()};
8717 }
8718
8719 // Invariant stores inside loop will be deleted and a single store
8720 // with the final reduction value will be added to the exit block
8721 StoreInst *SI;
8722 if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
8723 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand()))
8724 continue;
8725
8726 VPRecipeBase *Recipe =
8727 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8728 if (!Recipe)
8729 Recipe = RecipeBuilder.handleReplication(I: Instr, Range);
8730
8731 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8732 if (isa<VPHeaderPHIRecipe>(Val: Recipe)) {
8733 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8734 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8735 // recipes and need to be moved to the phi section of HeaderVPBB:
8736 // * tail-folding (non-phi recipes computing the header mask are
8737 // introduced earlier than regular header phi recipes, and should appear
8738 // after them)
8739 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8740
8741 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8742 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8743 "unexpected recipe needs moving");
8744 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8745 } else
8746 VPBB->appendRecipe(Recipe);
8747 }
8748
8749 VPBlockUtils::insertBlockAfter(NewBlock: new VPBasicBlock(), BlockPtr: VPBB);
8750 VPBB = cast<VPBasicBlock>(Val: VPBB->getSingleSuccessor());
8751 }
8752
8753 // After here, VPBB should not be used.
8754 VPBB = nullptr;
8755
8756 if (CM.requiresScalarEpilogue(Range)) {
8757 // No edge from the middle block to the unique exit block has been inserted
8758 // and there is nothing to fix from vector loop; phis should have incoming
8759 // from scalar loop only.
8760 } else
8761 addUsersInExitBlock(HeaderVPBB, OrigLoop, Builder&: RecipeBuilder, Plan&: *Plan);
8762
8763 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8764 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8765 "entry block must be set to a VPRegionBlock having a non-empty entry "
8766 "VPBasicBlock");
8767 RecipeBuilder.fixHeaderPhis();
8768
8769 // ---------------------------------------------------------------------------
8770 // Transform initial VPlan: Apply previously taken decisions, in order, to
8771 // bring the VPlan to its final state.
8772 // ---------------------------------------------------------------------------
8773
8774 // Adjust the recipes for any inloop reductions.
8775 adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, MinVF: Range.Start);
8776
8777 // Interleave memory: for each Interleave Group we marked earlier as relevant
8778 // for this VPlan, replace the Recipes widening its memory instructions with a
8779 // single VPInterleaveRecipe at its insertion point.
8780 for (const auto *IG : InterleaveGroups) {
8781 auto *Recipe =
8782 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getInsertPos()));
8783 SmallVector<VPValue *, 4> StoredValues;
8784 for (unsigned i = 0; i < IG->getFactor(); ++i)
8785 if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) {
8786 auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI));
8787 StoredValues.push_back(Elt: StoreR->getStoredValue());
8788 }
8789
8790 bool NeedsMaskForGaps =
8791 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8792 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8793 Recipe->getMask(), NeedsMaskForGaps);
8794 VPIG->insertBefore(InsertPos: Recipe);
8795 unsigned J = 0;
8796 for (unsigned i = 0; i < IG->getFactor(); ++i)
8797 if (Instruction *Member = IG->getMember(Index: i)) {
8798 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
8799 if (!Member->getType()->isVoidTy()) {
8800 VPValue *OriginalV = MemberR->getVPSingleValue();
8801 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
8802 J++;
8803 }
8804 MemberR->eraseFromParent();
8805 }
8806 }
8807
8808 for (ElementCount VF : Range)
8809 Plan->addVF(VF);
8810 Plan->setName("Initial VPlan");
8811
8812 // Replace VPValues for known constant strides guaranteed by predicated
8813 // scalar evolution.
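  // For example, if runtime versioning guarantees that a symbolic stride
  // %stride equals 1, the live-in VPValue for %stride is replaced below with
  // the constant 1, letting later transforms treat the access as consecutive.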
8814 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8815 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
8816 auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV));
8817 // Only handle constant strides for now.
8818 if (!ScevStride)
8819 continue;
8820 Constant *CI = ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt());
8821
8822 auto *ConstVPV = Plan->getOrAddLiveIn(V: CI);
8823 // The versioned value may not be used in the loop directly, so just add a
8824 // new live-in in those cases.
8825 Plan->getOrAddLiveIn(V: StrideV)->replaceAllUsesWith(New: ConstVPV);
8826 }
8827
8828 VPlanTransforms::dropPoisonGeneratingRecipes(Plan&: *Plan, BlockNeedsPredication: [this](BasicBlock *BB) {
8829 return Legal->blockNeedsPredication(BB);
8830 });
8831
8832 // Sink users of fixed-order recurrence past the recipe defining the previous
8833 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8834 if (!VPlanTransforms::adjustFixedOrderRecurrences(Plan&: *Plan, Builder))
8835 return nullptr;
8836
8837 if (useActiveLaneMask(Style)) {
8838 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8839 // TailFoldingStyle is visible there.
8840 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8841 bool WithoutRuntimeCheck =
8842 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8843 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8844 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8845 }
8846 return Plan;
8847}
8848
8849VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8850 // Outer loop handling: outer loops may require CFG and instruction-level
8851 // transformations before even evaluating whether vectorization is profitable.
8852 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8853 // the vectorization pipeline.
8854 assert(!OrigLoop->isInnermost());
8855 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8856
8857 // Create new empty VPlan
8858 auto Plan = VPlan::createInitialVPlan(
8859 TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop),
8860 PSE&: *PSE.getSE());
8861
8862 // Build hierarchical CFG
8863 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8864 HCFGBuilder.buildHierarchicalCFG();
8865
8866 for (ElementCount VF : Range)
8867 Plan->addVF(VF);
8868
8869 VPlanTransforms::VPInstructionsToVPRecipes(
8870 Plan,
8871 GetIntOrFpInductionDescriptor: [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(Phi: P); },
8872 SE&: *PSE.getSE(), TLI: *TLI);
8873
8874 // Remove the existing terminator of the exiting block of the top-most region.
8875 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8876 auto *Term =
8877 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8878 Term->eraseFromParent();
8879
8880 // Tail folding is not supported for outer loops, so the induction increment
8881 // is guaranteed to not wrap.
8882 bool HasNUW = true;
8883 addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW,
8884 DL: DebugLoc());
8885 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8886 return Plan;
8887}
8888
8889// Adjust the recipes for reductions. For in-loop reductions the chain of
8890// instructions leading from the loop exit instr to the phi need to be converted
8891// to reductions, with one operand being vector and the other being the scalar
8892// reduction chain. For other reductions, a select is introduced between the phi
8893// and live-out recipes when folding the tail.
8894//
8895// A ComputeReductionResult recipe is added to the middle block, also for
8896// in-loop reductions which compute their result in-loop, because generating
8897// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8898void LoopVectorizationPlanner::adjustRecipesForReductions(
8899 VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
8900 ElementCount MinVF) {
8901 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8902 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8903 // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8904 // sunk outside of the loop keep the same order as they had in the
8905 // original loop.
8906 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8907 for (VPRecipeBase &R : Header->phis()) {
8908 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R))
8909 ReductionPHIList.emplace_back(Args&: ReductionPhi);
8910 }
8911 bool HasIntermediateStore = false;
8912 stable_sort(Range&: ReductionPHIList,
8913 C: [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8914 const VPReductionPHIRecipe *R2) {
8915 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8916 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8917 HasIntermediateStore |= IS1 || IS2;
8918
8919 // If neither of the recipes has an intermediate store, keep the
8920 // order the same.
8921 if (!IS1 && !IS2)
8922 return false;
8923
8924 // If only one of the recipes has an intermediate store, then
8925 // move it towards the beginning of the list.
8926 if (IS1 && !IS2)
8927 return true;
8928
8929 if (!IS1 && IS2)
8930 return false;
8931
8932 // If both recipes have an intermediate store, then the recipe
8933 // with the later store should be processed earlier. So it
8934 // should go to the beginning of the list.
8935 return DT->dominates(Def: IS2, User: IS1);
8936 });
8937
8938 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8939 for (VPRecipeBase *R : ReductionPHIList)
8940 R->moveBefore(BB&: *Header, I: Header->getFirstNonPhi());
8941
8942 for (VPRecipeBase &R : Header->phis()) {
8943 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
8944 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8945 continue;
8946
8947 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8948 RecurKind Kind = RdxDesc.getRecurrenceKind();
8949 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8950 "AnyOf reductions are not allowed for in-loop reductions");
8951
8952 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8953 SetVector<VPSingleDefRecipe *> Worklist;
8954 Worklist.insert(X: PhiR);
8955 for (unsigned I = 0; I != Worklist.size(); ++I) {
8956 VPSingleDefRecipe *Cur = Worklist[I];
8957 for (VPUser *U : Cur->users()) {
8958 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(Val: U);
8959 if (!UserRecipe) {
8960 assert(isa<VPLiveOut>(U) &&
8961 "U must either be a VPSingleDef or VPLiveOut");
8962 continue;
8963 }
8964 Worklist.insert(X: UserRecipe);
8965 }
8966 }
8967
8968 // Visit operation "Links" along the reduction chain top-down starting from
8969 // the phi until LoopExitValue. We keep track of the previous item
8970 // (PreviousLink) to tell which of the two operands of a Link will remain
8971 // scalar and which will be reduced. For minmax by select(cmp), Link will be
8972 // the select instructions. Blend recipes of in-loop reduction phi's will
8973 // get folded to their non-phi operand, as the reduction recipe handles the
8974 // condition directly.
8975 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8976 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8977 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8978
8979 // Index of the first operand which holds a non-mask vector operand.
8980 unsigned IndexOfFirstOperand;
8981 // Recognize a call to the llvm.fmuladd intrinsic.
8982 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8983 VPValue *VecOp;
8984 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8985 if (IsFMulAdd) {
8986 assert(
8987 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8988 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8989 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8990 isa<VPWidenCallRecipe>(CurrentLink)) &&
8991 CurrentLink->getOperand(2) == PreviousLink &&
8992 "expected a call where the previous link is the added operand");
8993
8994 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8995 // need to create an fmul recipe (multiplying the first two operands of
8996 // the fmuladd together) to use as the vector operand for the fadd
8997 // reduction.
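        // Sketch: 'p.next = call @llvm.fmuladd(a, b, p)' becomes an
        // 'fmul a, b' recipe whose result is used as the vector operand of the
        // fadd reduction that combines it with the chain value 'p'.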
8998 VPInstruction *FMulRecipe = new VPInstruction(
8999 Instruction::FMul,
9000 {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
9001 CurrentLinkI->getFastMathFlags());
9002 LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
9003 VecOp = FMulRecipe;
9004 } else {
9005 auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink);
9006 if (PhiR->isInLoop() && Blend) {
9007 assert(Blend->getNumIncomingValues() == 2 &&
9008 "Blend must have 2 incoming values");
9009 if (Blend->getIncomingValue(Idx: 0) == PhiR)
9010 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1));
9011 else {
9012 assert(Blend->getIncomingValue(1) == PhiR &&
9013 "PhiR must be an operand of the blend");
9014 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0));
9015 }
9016 continue;
9017 }
9018
9019 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9020 if (isa<VPWidenRecipe>(Val: CurrentLink)) {
9021 assert(isa<CmpInst>(CurrentLinkI) &&
9022 "need to have the compare of the select");
9023 continue;
9024 }
9025 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9026 "must be a select recipe");
9027 IndexOfFirstOperand = 1;
9028 } else {
9029 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9030 "Expected to replace a VPWidenSC");
9031 IndexOfFirstOperand = 0;
9032 }
9033 // Note that for non-commutable operands (cmp-selects), the semantics of
9034 // the cmp-select are captured in the recurrence kind.
9035 unsigned VecOpId =
9036 CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
9037 ? IndexOfFirstOperand + 1
9038 : IndexOfFirstOperand;
9039 VecOp = CurrentLink->getOperand(N: VecOpId);
9040 assert(VecOp != PreviousLink &&
9041 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9042 (VecOpId - IndexOfFirstOperand)) ==
9043 PreviousLink &&
9044 "PreviousLink must be the operand other than VecOp");
9045 }
9046
9047 BasicBlock *BB = CurrentLinkI->getParent();
9048 VPValue *CondOp = nullptr;
9049 if (CM.blockNeedsPredicationForAnyReason(BB))
9050 CondOp = RecipeBuilder.getBlockInMask(BB);
9051
9052 VPReductionRecipe *RedRecipe =
9053 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
9054 CondOp, CM.useOrderedReductions(RdxDesc));
9055 // Append the recipe to the end of the VPBasicBlock because we need to
9056 // ensure that it comes after all of its inputs, including CondOp.
9057 // Note that this transformation may leave over dead recipes (including
9058 // CurrentLink), which will be cleaned by a later VPlan transform.
9059 LinkVPBB->appendRecipe(Recipe: RedRecipe);
9060 CurrentLink->replaceAllUsesWith(New: RedRecipe);
9061 PreviousLink = RedRecipe;
9062 }
9063 }
9064 Builder.setInsertPoint(&*LatchVPBB->begin());
9065 for (VPRecipeBase &R :
9066 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9067 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9068 if (!PhiR)
9069 continue;
9070
9071 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9072 // If tail is folded by masking, introduce selects between the phi
9073 // and the live-out instruction of each reduction, at the beginning of the
9074 // dedicated latch block.
9075 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9076 auto *NewExitingVPV = PhiR->getBackedgeValue();
9077 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
9078 VPValue *Cond = RecipeBuilder.getBlockInMask(BB: OrigLoop->getHeader());
9079 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9080 "reduction recipe must be defined before latch");
9081 Type *PhiTy = PhiR->getOperand(N: 0)->getLiveInIRValue()->getType();
9082 std::optional<FastMathFlags> FMFs =
9083 PhiTy->isFloatingPointTy()
9084 ? std::make_optional(t: RdxDesc.getFastMathFlags())
9085 : std::nullopt;
9086 NewExitingVPV =
9087 Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
9088 OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
9089 return isa<VPInstruction>(Val: &U) &&
9090 cast<VPInstruction>(Val: &U)->getOpcode() ==
9091 VPInstruction::ComputeReductionResult;
9092 });
9093 if (PreferPredicatedReductionSelect ||
9094 TTI.preferPredicatedReductionSelect(
9095 Opcode: PhiR->getRecurrenceDescriptor().getOpcode(), Ty: PhiTy,
9096 Flags: TargetTransformInfo::ReductionFlags()))
9097 PhiR->setOperand(I: 1, New: NewExitingVPV);
9098 }
9099
9100 // If the vector reduction can be performed in a smaller type, we truncate
9101 // then extend the loop exit value to enable InstCombine to evaluate the
9102 // entire expression in the smaller type.
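    // For instance, an i32 'add' reduction whose values are known to fit in i8
    // has its exiting value truncated to i8 and then sign- or zero-extended
    // (depending on RdxDesc.isSigned()) back to i32.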
9103 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9104 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
9105 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9106 Type *RdxTy = RdxDesc.getRecurrenceType();
9107 auto *Trunc =
9108 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9109 auto *Extnd =
9110 RdxDesc.isSigned()
9111 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9112 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9113
9114 Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe());
9115 Extnd->insertAfter(InsertPos: Trunc);
9116 if (PhiR->getOperand(N: 1) == NewExitingVPV)
9117 PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());
9118 NewExitingVPV = Extnd;
9119 }
9120
9121 // We want code in the middle block to appear to execute on the location of
9122 // the scalar loop's latch terminator because: (a) it is all compiler
9123 // generated, (b) these instructions are always executed after evaluating
9124 // the latch conditional branch, and (c) other passes may add new
9125 // predecessors which terminate on this line. This is the easiest way to
9126 // ensure we don't accidentally cause an extra step back into the loop while
9127 // debugging.
9128 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9129
9130 // TODO: At the moment ComputeReductionResult also drives creation of the
9131 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9132 // even for in-loop reductions, until the reduction resume value handling is
9133 // also modeled in VPlan.
9134 auto *FinalReductionResult = new VPInstruction(
9135 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9136 cast<VPBasicBlock>(Val: VectorLoopRegion->getSingleSuccessor())
9137 ->appendRecipe(Recipe: FinalReductionResult);
9138 OrigExitingVPV->replaceUsesWithIf(
9139 New: FinalReductionResult,
9140 ShouldReplace: [](VPUser &User, unsigned) { return isa<VPLiveOut>(Val: &User); });
9141 }
9142
9143 VPlanTransforms::clearReductionWrapFlags(Plan&: *Plan);
9144}
9145
9146#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9147void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9148 VPSlotTracker &SlotTracker) const {
9149 O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9150 IG->getInsertPos()->printAsOperand(O, PrintType: false);
9151 O << ", ";
9152 getAddr()->printAsOperand(OS&: O, Tracker&: SlotTracker);
9153 VPValue *Mask = getMask();
9154 if (Mask) {
9155 O << ", ";
9156 Mask->printAsOperand(OS&: O, Tracker&: SlotTracker);
9157 }
9158
9159 unsigned OpIdx = 0;
9160 for (unsigned i = 0; i < IG->getFactor(); ++i) {
9161 if (!IG->getMember(Index: i))
9162 continue;
9163 if (getNumStoreOperands() > 0) {
9164 O << "\n" << Indent << " store ";
9165 getOperand(N: 1 + OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker);
9166 O << " to index " << i;
9167 } else {
9168 O << "\n" << Indent << " ";
9169 getVPValue(I: OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker);
9170 O << " = load from index " << i;
9171 }
9172 ++OpIdx;
9173 }
9174}
9175#endif
9176
9177void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9178 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9179 "Not a pointer induction according to InductionDescriptor!");
9180 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9181 "Unexpected type.");
9182 assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9183 "Recipe should have been replaced");
9184
9185 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9186 PHINode *CanonicalIV = cast<PHINode>(Val: State.get(Def: IVR, Part: 0, /*IsScalar*/ true));
9187 Type *PhiType = IndDesc.getStep()->getType();
9188
9189 // Build a pointer phi
9190 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9191 Type *ScStValueType = ScalarStartValue->getType();
9192 PHINode *NewPointerPhi = PHINode::Create(Ty: ScStValueType, NumReservedValues: 2, NameStr: "pointer.phi",
9193 InsertBefore: CanonicalIV->getIterator());
9194
9195 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
9196 NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH);
9197
9198 // A pointer induction, performed by using a gep
9199 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9200
9201 Value *ScalarStepValue = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
9202 Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF);
9203 Value *NumUnrolledElems =
9204 State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: State.UF));
9205 Value *InductionGEP = GetElementPtrInst::Create(
9206 PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
9207 IdxList: State.Builder.CreateMul(LHS: ScalarStepValue, RHS: NumUnrolledElems), NameStr: "ptr.ind",
9208 InsertBefore: InductionLoc);
9209 // Add induction update using an incorrect block temporarily. The phi node
9210 // will be fixed after VPlan execution. Note that at this point the latch
9211 // block cannot be used, as it does not exist yet.
9212 // TODO: Model increment value in VPlan, by turning the recipe into a
9213 // multi-def and a subclass of VPHeaderPHIRecipe.
9214 NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH);
9215
9216 // Create UF many actual address geps that use the pointer
9217 // phi as base and a vectorized version of the step value
9218 // (<step*0, ..., step*N>) as offset.
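// For example (illustrative only, assuming VF = 4, an i64 step type, a scalar
// step %step, and part 1), the loop below emits roughly:
//   %off = mul <4 x i64> (splat(%runtime.vf * 1) + <0, 1, 2, 3>), splat(%step)
//   %vector.gep = getelementptr i8, ptr %pointer.phi, <4 x i64> %off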
9219 for (unsigned Part = 0; Part < State.UF; ++Part) {
9220 Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF);
9221 Value *StartOffsetScalar =
9222 State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: Part));
9223 Value *StartOffset =
9224 State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar);
9225 // Create a vector of consecutive numbers from 0 to VF-1.
9226 StartOffset = State.Builder.CreateAdd(
9227 LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType));
9228
9229 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9230 "scalar step must be the same across all parts");
9231 Value *GEP = State.Builder.CreateGEP(
9232 Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
9233 IdxList: State.Builder.CreateMul(
9234 LHS: StartOffset,
9235 RHS: State.Builder.CreateVectorSplat(EC: State.VF, V: ScalarStepValue),
9236 Name: "vector.gep"));
9237 State.set(Def: this, V: GEP, Part);
9238 }
9239}
9240
9241void VPDerivedIVRecipe::execute(VPTransformState &State) {
9242 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9243
9244 // Fast-math-flags propagate from the original induction instruction.
9245 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9246 if (FPBinOp)
9247 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9248
9249 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
9250 Value *CanonicalIV = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
9251 Value *DerivedIV = emitTransformedIndex(
9252 B&: State.Builder, Index: CanonicalIV, StartValue: getStartValue()->getLiveInIRValue(), Step,
9253 InductionKind: Kind, InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
9254 DerivedIV->setName("offset.idx");
9255 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9256
9257 State.set(Def: this, V: DerivedIV, Instance: VPIteration(0, 0));
9258}
9259
9260void VPInterleaveRecipe::execute(VPTransformState &State) {
9261 assert(!State.Instance && "Interleave group being replicated.");
9262 State.ILV->vectorizeInterleaveGroup(Group: IG, VPDefs: definedValues(), State, Addr: getAddr(),
9263 StoredValues: getStoredValues(), BlockInMask: getMask(),
9264 NeedsMaskForGaps);
9265}
9266
9267void VPReplicateRecipe::execute(VPTransformState &State) {
9268 Instruction *UI = getUnderlyingInstr();
9269 if (State.Instance) { // Generate a single instance.
9270 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9271 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: *State.Instance, State);
9272 // Insert scalar instance packing it into a vector.
9273 if (State.VF.isVector() && shouldPack()) {
9274 // If we're constructing lane 0, initialize to start from poison.
9275 if (State.Instance->Lane.isFirstLane()) {
9276 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9277 Value *Poison = PoisonValue::get(
9278 T: VectorType::get(ElementType: UI->getType(), EC: State.VF));
9279 State.set(Def: this, V: Poison, Part: State.Instance->Part);
9280 }
9281 State.packScalarIntoVectorValue(Def: this, Instance: *State.Instance);
9282 }
9283 return;
9284 }
9285
9286 if (IsUniform) {
9287 // If the recipe is uniform across all parts (instead of just per VF), only
9288 // generate a single instance.
9289 if ((isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI)) &&
9290 all_of(Range: operands(), P: [](VPValue *Op) {
9291 return Op->isDefinedOutsideVectorRegions();
9292 })) {
9293 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(0, 0), State);
9294 if (user_begin() != user_end()) {
9295 for (unsigned Part = 1; Part < State.UF; ++Part)
9296 State.set(Def: this, V: State.get(Def: this, Instance: VPIteration(0, 0)),
9297 Instance: VPIteration(Part, 0));
9298 }
9299 return;
9300 }
9301
9302 // Uniform within VL means we need to generate lane 0 only for each
9303 // unrolled copy.
9304 for (unsigned Part = 0; Part < State.UF; ++Part)
9305 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, 0), State);
9306 return;
9307 }
9308
9309 // A store of a loop varying value to a uniform address only needs the last
9310 // copy of the store.
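// E.g. (an illustrative source-level sketch): in "for (i) *q = a[i];" with q
// loop invariant, only the value stored by the final iteration is observable
// after the loop, so emitting the scalar store for the last unrolled part's
// last lane is sufficient.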
9311 if (isa<StoreInst>(Val: UI) &&
9312 vputils::isUniformAfterVectorization(VPV: getOperand(N: 1))) {
9313 auto Lane = VPLane::getLastLaneForVF(VF: State.VF);
9314 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(State.UF - 1, Lane),
9315 State);
9316 return;
9317 }
9318
9319 // Generate scalar instances for all VF lanes of all UF parts.
9320 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9321 const unsigned EndLane = State.VF.getKnownMinValue();
9322 for (unsigned Part = 0; Part < State.UF; ++Part)
9323 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9324 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, Lane), State);
9325}
9326
9327void VPWidenLoadRecipe::execute(VPTransformState &State) {
9328 auto *LI = cast<LoadInst>(Val: &Ingredient);
9329
9330 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
9331 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
9332 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9333 bool CreateGather = !isConsecutive();
9334
9335 auto &Builder = State.Builder;
9336 State.setDebugLocFrom(getDebugLoc());
9337 for (unsigned Part = 0; Part < State.UF; ++Part) {
9338 Value *NewLI;
9339 Value *Mask = nullptr;
9340 if (auto *VPMask = getMask()) {
9341 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9342 // of a null all-one mask is a null mask.
9343 Mask = State.get(Def: VPMask, Part);
9344 if (isReverse())
9345 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
9346 }
9347
9348 Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateGather);
9349 if (CreateGather) {
9350 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
9351 Name: "wide.masked.gather");
9352 } else if (Mask) {
9353 NewLI = Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
9354 PassThru: PoisonValue::get(T: DataTy),
9355 Name: "wide.masked.load");
9356 } else {
9357 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
9358 }
9359 // Add metadata to the load, but setVectorValue to the reverse shuffle.
9360 State.addMetadata(To: NewLI, From: LI);
9361 if (Reverse)
9362 NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse");
9363 State.set(Def: this, V: NewLI, Part);
9364 }
9365}
9366
9367void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9368 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9369 "explicit vector length.");
9370 // FIXME: Support reverse loading after vp_reverse is added.
9371 assert(!isReverse() && "Reverse loads are not implemented yet.");
9372
9373 auto *LI = cast<LoadInst>(Val: &Ingredient);
9374
9375 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
9376 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
9377 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9378 bool CreateGather = !isConsecutive();
9379
9380 auto &Builder = State.Builder;
9381 State.setDebugLocFrom(getDebugLoc());
9382 CallInst *NewLI;
9383 Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
9384 Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateGather);
9385 Value *Mask = getMask()
9386 ? State.get(Def: getMask(), Part: 0)
9387 : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
9388 if (CreateGather) {
9389 NewLI =
9390 Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
9391 nullptr, "wide.masked.gather");
9392 } else {
9393 VectorBuilder VBuilder(Builder);
9394 VBuilder.setEVL(EVL).setMask(Mask);
9395 NewLI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
9396 Opcode: Instruction::Load, ReturnTy: DataTy, VecOpArray: Addr, Name: "vp.op.load"));
9397 }
9398 NewLI->addParamAttr(
9399 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
9400 State.addMetadata(To: NewLI, From: LI);
9401 State.set(Def: this, V: NewLI, Part: 0);
9402}
9403
9404void VPWidenStoreRecipe::execute(VPTransformState &State) {
9405 auto *SI = cast<StoreInst>(Val: &Ingredient);
9406
9407 VPValue *StoredVPValue = getStoredValue();
9408 bool CreateScatter = !isConsecutive();
9409 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9410
9411 auto &Builder = State.Builder;
9412 State.setDebugLocFrom(getDebugLoc());
9413
9414 for (unsigned Part = 0; Part < State.UF; ++Part) {
9415 Instruction *NewSI = nullptr;
9416 Value *Mask = nullptr;
9417 if (auto *VPMask = getMask()) {
9418 // Mask reversal is only needed for non-all-one (null) masks, as reverse
9419 // of a null all-one mask is a null mask.
9420 Mask = State.get(Def: VPMask, Part);
9421 if (isReverse())
9422 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
9423 }
9424
9425 Value *StoredVal = State.get(Def: StoredVPValue, Part);
9426 if (isReverse()) {
9427 // If we store to reverse consecutive memory locations, then we need
9428 // to reverse the order of elements in the stored value.
9429 StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse");
9430 // We don't want to update the value in the map as it might be used in
9431 // another expression. So don't call resetVectorValue(StoredVal).
9432 }
9433 Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateScatter);
9434 if (CreateScatter)
9435 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
9436 else if (Mask)
9437 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
9438 else
9439 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
9440 State.addMetadata(To: NewSI, From: SI);
9441 }
9442}
9443
9444void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9445 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9446 "explicit vector length.");
9447 // FIXME: Support reverse storing after vp_reverse is added.
9448 assert(!isReverse() && "Reverse stores are not implemented yet.");
9449
9450 auto *SI = cast<StoreInst>(Val: &Ingredient);
9451
9452 VPValue *StoredValue = getStoredValue();
9453 bool CreateScatter = !isConsecutive();
9454 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9455
9456 auto &Builder = State.Builder;
9457 State.setDebugLocFrom(getDebugLoc());
9458
9459 CallInst *NewSI = nullptr;
9460 Value *StoredVal = State.get(Def: StoredValue, Part: 0);
9461 Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
9462 // FIXME: Support reverse store after vp_reverse is added.
9463 Value *Mask = getMask()
9464 ? State.get(Def: getMask(), Part: 0)
9465 : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
9466 Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateScatter);
9467 if (CreateScatter) {
9468 NewSI = Builder.CreateIntrinsic(Type::getVoidTy(C&: EVL->getContext()),
9469 Intrinsic::vp_scatter,
9470 {StoredVal, Addr, Mask, EVL});
9471 } else {
9472 VectorBuilder VBuilder(Builder);
9473 VBuilder.setEVL(EVL).setMask(Mask);
9474 NewSI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
9475 Opcode: Instruction::Store, ReturnTy: Type::getVoidTy(C&: EVL->getContext()),
9476 VecOpArray: {StoredVal, Addr}));
9477 }
9478 NewSI->addParamAttr(
9479 ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
9480 State.addMetadata(To: NewSI, From: SI);
9481}
9482
9483// Determine how to lower the scalar epilogue, which depends on 1) optimising
9484// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9485// predication, and 4) a TTI hook that analyses whether the loop is suitable
9486// for predication.
9487static ScalarEpilogueLowering getScalarEpilogueLowering(
9488 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9489 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9490 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9491 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9492 // don't look at hints or options, and don't request a scalar epilogue.
9493 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9494 // LoopAccessInfo (due to code dependency and not being able to reliably get
9495 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9496 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9497 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9498 // back to the old way and vectorize with versioning when forced. See D81345.)
9499 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
9500 QueryType: PGSOQueryType::IRPass) &&
9501 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9502 return CM_ScalarEpilogueNotAllowedOptSize;
9503
9504 // 2) If set, obey the directives
9505 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9506 switch (PreferPredicateOverEpilogue) {
9507 case PreferPredicateTy::ScalarEpilogue:
9508 return CM_ScalarEpilogueAllowed;
9509 case PreferPredicateTy::PredicateElseScalarEpilogue:
9510 return CM_ScalarEpilogueNotNeededUsePredicate;
9511 case PreferPredicateTy::PredicateOrDontVectorize:
9512 return CM_ScalarEpilogueNotAllowedUsePredicate;
9513 };
9514 }
9515
9516 // 3) If set, obey the hints
9517 switch (Hints.getPredicate()) {
9518 case LoopVectorizeHints::FK_Enabled:
9519 return CM_ScalarEpilogueNotNeededUsePredicate;
9520 case LoopVectorizeHints::FK_Disabled:
9521 return CM_ScalarEpilogueAllowed;
9522 };
9523
9524 // 4) if the TTI hook indicates this is profitable, request predication.
9525 TailFoldingInfo TFI(TLI, &LVL, IAI);
9526 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
9527 return CM_ScalarEpilogueNotNeededUsePredicate;
9528
9529 return CM_ScalarEpilogueAllowed;
9530}
9531
9532// Process the loop in the VPlan-native vectorization path. This path builds
9533 // VPlan upfront in the vectorization pipeline, which allows applying
9534// VPlan-to-VPlan transformations from the very beginning without modifying the
9535// input LLVM IR.
9536static bool processLoopInVPlanNativePath(
9537 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9538 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9539 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9540 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9541 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9542 LoopVectorizationRequirements &Requirements) {
9543
9544 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
9545 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9546 return false;
9547 }
9548 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9549 Function *F = L->getHeader()->getParent();
9550 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9551
9552 ScalarEpilogueLowering SEL =
9553 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);
9554
9555 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9556 &Hints, IAI);
9557 // Use the planner for outer loop vectorization.
9558 // TODO: CM is not used at this point inside the planner. Turn CM into an
9559 // optional argument if we don't need it in the future.
9560 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9561 ORE);
9562
9563 // Get user vectorization factor.
9564 ElementCount UserVF = Hints.getWidth();
9565
9566 CM.collectElementTypesForWidening();
9567
9568 // Plan how to best vectorize, return the best VF and its cost.
9569 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9570
9571 // If we are stress testing VPlan builds, do not attempt to generate vector
9572 // code. Masked vector code generation support will follow soon.
9573 // Also, do not attempt to vectorize if no vector code will be produced.
9574 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9575 return false;
9576
9577 VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);
9578
9579 {
9580 bool AddBranchWeights =
9581 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9582 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9583 F->getParent()->getDataLayout(), AddBranchWeights);
9584 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9585 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9586 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9587 << L->getHeader()->getParent()->getName() << "\"\n");
9588 LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
9589 }
9590
9591 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
9592
9593 // Mark the loop as already vectorized to avoid vectorizing again.
9594 Hints.setAlreadyVectorized();
9595 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9596 return true;
9597}
9598
9599// Emit a remark if there are stores to floats that required a floating point
9600 // extension. If the vectorized loop was generated with mixed floating point
9601 // precision, there will be a performance penalty from the conversion overhead
9602 // and the change in the vector width.
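// E.g. (an illustrative source-level sketch, not taken from this pass): in
// "a[i] = b[i] * 2.0;" over float arrays, the multiply is promoted to double,
// so an fpext of b[i] and an fptrunc before the store are generated; the
// traversal below reaches that fpext from the float store and emits the
// remark.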
9603static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9604 SmallVector<Instruction *, 4> Worklist;
9605 for (BasicBlock *BB : L->getBlocks()) {
9606 for (Instruction &Inst : *BB) {
9607 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
9608 if (S->getValueOperand()->getType()->isFloatTy())
9609 Worklist.push_back(Elt: S);
9610 }
9611 }
9612 }
9613
9614 // Traverse the floating point stores upwards, searching for floating point
9615 // conversions.
9616 SmallPtrSet<const Instruction *, 4> Visited;
9617 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9618 while (!Worklist.empty()) {
9619 auto *I = Worklist.pop_back_val();
9620 if (!L->contains(Inst: I))
9621 continue;
9622 if (!Visited.insert(Ptr: I).second)
9623 continue;
9624
9625 // Emit a remark if the floating point store required a floating
9626 // point conversion.
9627 // TODO: More work could be done to identify the root cause such as a
9628 // constant or a function return type and point the user to it.
9629 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
9630 ORE->emit(RemarkBuilder: [&]() {
9631 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9632 I->getDebugLoc(), L->getHeader())
9633 << "floating point conversion changes vector width. "
9634 << "Mixed floating point precision requires an up/down "
9635 << "cast that will negatively impact performance.";
9636 });
9637
9638 for (Use &Op : I->operands())
9639 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
9640 Worklist.push_back(Elt: OpI);
9641 }
9642}
9643
9644static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9645 VectorizationFactor &VF,
9646 std::optional<unsigned> VScale, Loop *L,
9647 ScalarEvolution &SE,
9648 ScalarEpilogueLowering SEL) {
9649 InstructionCost CheckCost = Checks.getCost();
9650 if (!CheckCost.isValid())
9651 return false;
9652
9653 // When only interleaving (VF is scalar), scalar and vector cost will be equal,
9654 // which in turn would lead to a divide by 0. Fall back to the hard threshold.
9655 if (VF.Width.isScalar()) {
9656 if (CheckCost > VectorizeMemoryCheckThreshold) {
9657 LLVM_DEBUG(
9658 dbgs()
9659 << "LV: Interleaving only is not profitable due to runtime checks\n");
9660 return false;
9661 }
9662 return true;
9663 }
9664
9665 // The scalar cost should only be 0 when vectorizing with a user-specified
// VF/IC. In those cases, runtime checks should always be generated.
9666 uint64_t ScalarC = *VF.ScalarCost.getValue();
9667 if (ScalarC == 0)
9668 return true;
9669
9670 // First, compute the minimum iteration count required so that the vector
9671 // loop outperforms the scalar loop.
9672 // The total cost of the scalar loop is
9673 // ScalarC * TC
9674 // where
9675 // * TC is the actual trip count of the loop.
9676 // * ScalarC is the cost of a single scalar iteration.
9677 //
9678 // The total cost of the vector loop is
9679 // RtC + VecC * (TC / VF) + EpiC
9680 // where
9681 // * RtC is the cost of the generated runtime checks
9682 // * VecC is the cost of a single vector iteration.
9683 // * TC is the actual trip count of the loop
9684 // * VF is the vectorization factor
9685 // * EpiC is the cost of the generated epilogue, including the cost
9686 // of the remaining scalar operations.
9687 //
9688 // Vectorization is profitable once the total vector cost is less than the
9689 // total scalar cost:
9690 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9691 //
9692 // Now we can compute the minimum required trip count TC as
9693 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9694 //
9695 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9696 // the computations are performed on unsigned integers and the division
9697 // is rounded up, hence we get an upper estimate of the TC.
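// As a hedged numeric illustration (made-up costs, not measured): with
// ScalarC = 4, VecC = 12, RtC = 24 and an effective VF of 4,
// Div = 4 * 4 - 12 = 4 and MinTC1 = ceil(24 * 4 / 4) = 24, i.e. at least 24
// iterations are needed before the runtime checks pay for themselves.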
9698 unsigned IntVF = VF.Width.getKnownMinValue();
9699 if (VF.Width.isScalable()) {
9700 unsigned AssumedMinimumVscale = 1;
9701 if (VScale)
9702 AssumedMinimumVscale = *VScale;
9703 IntVF *= AssumedMinimumVscale;
9704 }
9705 uint64_t RtC = *CheckCost.getValue();
9706 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9707 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);
9708
9709 // Second, compute a minimum iteration count so that the cost of the
9710 // runtime checks is only a fraction of the total scalar loop cost. This
9711 // adds a loop-dependent bound on the overhead incurred if the runtime
9712 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9713 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9714 // cost, compute
9715 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9716 uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);
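// Continuing the illustrative numbers above (with X = 10 as hard-coded here):
// MinTC2 = ceil(24 * 10 / 4) = 60, so the combined minimum below becomes
// max(24, 60) = 60, rounded up to a multiple of VF when a scalar epilogue is
// allowed.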
9717
9718 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9719 // epilogue is allowed, choose the next closest multiple of VF. This should
9720 // partly compensate for ignoring the epilogue cost.
9721 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9722 if (SEL == CM_ScalarEpilogueAllowed)
9723 MinTC = alignTo(Value: MinTC, Align: IntVF);
9724 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9725
9726 LLVM_DEBUG(
9727 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9728 << VF.MinProfitableTripCount << "\n");
9729
9730 // Skip vectorization if the expected trip count is less than the minimum
9731 // required trip count.
9732 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9733 if (ElementCount::isKnownLT(LHS: ElementCount::getFixed(MinVal: *ExpectedTC),
9734 RHS: VF.MinProfitableTripCount)) {
9735 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9736 "trip count < minimum profitable VF ("
9737 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9738 << ")\n");
9739
9740 return false;
9741 }
9742 }
9743 return true;
9744}
9745
9746LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9747 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9748 !EnableLoopInterleaving),
9749 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9750 !EnableLoopVectorization) {}
9751
9752bool LoopVectorizePass::processLoop(Loop *L) {
9753 assert((EnableVPlanNativePath || L->isInnermost()) &&
9754 "VPlan-native path is not enabled. Only process inner loops.");
9755
9756#ifndef NDEBUG
9757 const std::string DebugLocStr = getDebugLocString(L);
9758#endif /* NDEBUG */
9759
9760 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9761 << L->getHeader()->getParent()->getName() << "' from "
9762 << DebugLocStr << "\n");
9763
9764 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9765
9766 LLVM_DEBUG(
9767 dbgs() << "LV: Loop hints:"
9768 << " force="
9769 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9770 ? "disabled"
9771 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9772 ? "enabled"
9773 : "?"))
9774 << " width=" << Hints.getWidth()
9775 << " interleave=" << Hints.getInterleave() << "\n");
9776
9777 // Function containing loop
9778 Function *F = L->getHeader()->getParent();
9779
9780 // Looking at the diagnostic output is the only way to determine if a loop
9781 // was vectorized (other than looking at the IR or machine code), so it
9782 // is important to generate an optimization remark for each loop. Most of
9783 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9784 // generated as OptimizationRemark and OptimizationRemarkMissed are
9785 // less verbose reporting vectorized loops and unvectorized loops that may
9786 // benefit from vectorization, respectively.
9787
9788 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9789 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9790 return false;
9791 }
9792
9793 PredicatedScalarEvolution PSE(*SE, *L);
9794
9795 // Check if it is legal to vectorize the loop.
9796 LoopVectorizationRequirements Requirements;
9797 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9798 &Requirements, &Hints, DB, AC, BFI, PSI);
9799 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9800 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9801 Hints.emitRemarkWithHints();
9802 return false;
9803 }
9804
9805 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9806 // here. They may require CFG and instruction level transformations before
9807 // even evaluating whether vectorization is profitable. Since we cannot modify
9808 // the incoming IR, we need to build VPlan upfront in the vectorization
9809 // pipeline.
9810 if (!L->isInnermost())
9811 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9812 ORE, BFI, PSI, Hints, Requirements);
9813
9814 assert(L->isInnermost() && "Inner loop expected.");
9815
9816 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9817 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9818
9819 // If an override option has been passed in for interleaved accesses, use it.
9820 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9821 UseInterleaved = EnableInterleavedMemAccesses;
9822
9823 // Analyze interleaved memory accesses.
9824 if (UseInterleaved)
9825 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9826
9827 // Check the function attributes and profiles to find out if this function
9828 // should be optimized for size.
9829 ScalarEpilogueLowering SEL =
9830 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI);
9831
9832 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9833 // count by optimizing for size, to minimize overheads.
9834 auto ExpectedTC = getSmallBestKnownTC(SE&: *SE, L);
9835 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9836 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9837 << "This loop is worth vectorizing only if no scalar "
9838 << "iteration overheads are incurred.");
9839 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9840 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9841 else {
9842 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9843 LLVM_DEBUG(dbgs() << "\n");
9844 // Predicated tail-folded loops are efficient even when the loop
9845 // iteration count is low. However, setting the epilogue policy to
9846 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9847 // with runtime checks. It's more effective to let
9848 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9849 // for the loop.
9850 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9851 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9852 } else {
9853 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9854 "small to consider vectorizing.\n");
9855 reportVectorizationFailure(
9856 DebugMsg: "The trip count is below the minimal threshold value.",
9857 OREMsg: "loop trip count is too low, avoiding vectorization",
9858 ORETag: "LowTripCount", ORE, TheLoop: L);
9859 Hints.emitRemarkWithHints();
9860 return false;
9861 }
9862 }
9863 }
9864
9865 // Check the function attributes to see if implicit floats or vectors are
9866 // allowed.
9867 if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9868 reportVectorizationFailure(
9869 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9870 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9871 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9872 Hints.emitRemarkWithHints();
9873 return false;
9874 }
9875
9876 // Check if the target supports potentially unsafe FP vectorization.
9877 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9878 // for the target we're vectorizing for, to make sure none of the
9879 // additional fp-math flags can help.
9880 if (Hints.isPotentiallyUnsafe() &&
9881 TTI->isFPVectorizationPotentiallyUnsafe()) {
9882 reportVectorizationFailure(
9883 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9884 OREMsg: "loop not vectorized due to unsafe FP support.",
9885 ORETag: "UnsafeFP", ORE, TheLoop: L);
9886 Hints.emitRemarkWithHints();
9887 return false;
9888 }
9889
9890 bool AllowOrderedReductions;
9891 // If the flag is set, use that instead and override the TTI behaviour.
9892 if (ForceOrderedReductions.getNumOccurrences() > 0)
9893 AllowOrderedReductions = ForceOrderedReductions;
9894 else
9895 AllowOrderedReductions = TTI->enableOrderedReductions();
9896 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9897 ORE->emit(RemarkBuilder: [&]() {
9898 auto *ExactFPMathInst = Requirements.getExactFPInst();
9899 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9900 ExactFPMathInst->getDebugLoc(),
9901 ExactFPMathInst->getParent())
9902 << "loop not vectorized: cannot prove it is safe to reorder "
9903 "floating-point operations";
9904 });
9905 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9906 "reorder floating-point operations\n");
9907 Hints.emitRemarkWithHints();
9908 return false;
9909 }
9910
9911 // Use the cost model.
9912 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9913 F, &Hints, IAI);
9914 // Use the planner for vectorization.
9915 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9916 ORE);
9917
9918 // Get user vectorization factor and interleave count.
9919 ElementCount UserVF = Hints.getWidth();
9920 unsigned UserIC = Hints.getInterleave();
9921
9922 // Plan how to best vectorize, return the best VF and its cost.
9923 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9924
9925 VectorizationFactor VF = VectorizationFactor::Disabled();
9926 unsigned IC = 1;
9927
9928 bool AddBranchWeights =
9929 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9930 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9931 F->getParent()->getDataLayout(), AddBranchWeights);
9932 if (MaybeVF) {
9933 VF = *MaybeVF;
9934 // Select the interleave count.
9935 IC = CM.selectInterleaveCount(VF: VF.Width, LoopCost: VF.Cost);
9936
9937 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9938 // Optimistically generate runtime checks if they are needed. Drop them if
9939 // they turn out to not be profitable.
9940 if (VF.Width.isVector() || SelectedIC > 1)
9941 Checks.Create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC);
9942
9943 // Check if it is profitable to vectorize with runtime checks.
9944 bool ForceVectorization =
9945 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9946 if (!ForceVectorization &&
9947 !areRuntimeChecksProfitable(Checks, VF, VScale: getVScaleForTuning(L, TTI: *TTI), L,
9948 SE&: *PSE.getSE(), SEL)) {
9949 ORE->emit(RemarkBuilder: [&]() {
9950 return OptimizationRemarkAnalysisAliasing(
9951 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9952 L->getHeader())
9953 << "loop not vectorized: cannot prove it is safe to reorder "
9954 "memory operations";
9955 });
9956 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9957 Hints.emitRemarkWithHints();
9958 return false;
9959 }
9960 }
9961
9962 // Identify the diagnostic messages that should be produced.
9963 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9964 bool VectorizeLoop = true, InterleaveLoop = true;
9965 if (VF.Width.isScalar()) {
9966 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9967 VecDiagMsg = std::make_pair(
9968 x: "VectorizationNotBeneficial",
9969 y: "the cost-model indicates that vectorization is not beneficial");
9970 VectorizeLoop = false;
9971 }
9972
9973 if (!MaybeVF && UserIC > 1) {
9974 // Tell the user interleaving was avoided up-front, despite being explicitly
9975 // requested.
9976 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9977 "interleaving should be avoided up front\n");
9978 IntDiagMsg = std::make_pair(
9979 x: "InterleavingAvoided",
9980 y: "Ignoring UserIC, because interleaving was avoided up front");
9981 InterleaveLoop = false;
9982 } else if (IC == 1 && UserIC <= 1) {
9983 // Tell the user interleaving is not beneficial.
9984 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9985 IntDiagMsg = std::make_pair(
9986 x: "InterleavingNotBeneficial",
9987 y: "the cost-model indicates that interleaving is not beneficial");
9988 InterleaveLoop = false;
9989 if (UserIC == 1) {
9990 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9991 IntDiagMsg.second +=
9992 " and is explicitly disabled or interleave count is set to 1";
9993 }
9994 } else if (IC > 1 && UserIC == 1) {
9995 // Tell the user interleaving is beneficial, but it is explicitly disabled.
9996 LLVM_DEBUG(
9997 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9998 IntDiagMsg = std::make_pair(
9999 x: "InterleavingBeneficialButDisabled",
10000 y: "the cost-model indicates that interleaving is beneficial "
10001 "but is explicitly disabled or interleave count is set to 1");
10002 InterleaveLoop = false;
10003 }
10004
10005 // Override IC if user provided an interleave count.
10006 IC = UserIC > 0 ? UserIC : IC;
10007
10008 // Emit diagnostic messages, if any.
10009 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10010 if (!VectorizeLoop && !InterleaveLoop) {
10011 // Do not vectorize or interleave the loop.
10012 ORE->emit(RemarkBuilder: [&]() {
10013 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10014 L->getStartLoc(), L->getHeader())
10015 << VecDiagMsg.second;
10016 });
10017 ORE->emit(RemarkBuilder: [&]() {
10018 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10019 L->getStartLoc(), L->getHeader())
10020 << IntDiagMsg.second;
10021 });
10022 return false;
10023 } else if (!VectorizeLoop && InterleaveLoop) {
10024 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10025 ORE->emit(RemarkBuilder: [&]() {
10026 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10027 L->getStartLoc(), L->getHeader())
10028 << VecDiagMsg.second;
10029 });
10030 } else if (VectorizeLoop && !InterleaveLoop) {
10031 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10032 << ") in " << DebugLocStr << '\n');
10033 ORE->emit(RemarkBuilder: [&]() {
10034 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10035 L->getStartLoc(), L->getHeader())
10036 << IntDiagMsg.second;
10037 });
10038 } else if (VectorizeLoop && InterleaveLoop) {
10039 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10040 << ") in " << DebugLocStr << '\n');
10041 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10042 }
10043
10044 bool DisableRuntimeUnroll = false;
10045 MDNode *OrigLoopID = L->getLoopID();
10046 {
10047 using namespace ore;
10048 if (!VectorizeLoop) {
10049 assert(IC > 1 && "interleave count should not be 1 or 0");
10050 // If we decided that it is not profitable to vectorize the loop, then
10051 // interleave it.
10052 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10053 &CM, BFI, PSI, Checks);
10054
10055 VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);
10056 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, IsEpilogueVectorization: false);
10057
10058 ORE->emit(RemarkBuilder: [&]() {
10059 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10060 L->getHeader())
10061 << "interleaved loop (interleaved count: "
10062 << NV("InterleaveCount", IC) << ")";
10063 });
10064 } else {
10065 // If we decided that it is *profitable* to vectorize the loop, then do it.
10066
10067 // Consider vectorizing the epilogue too if it's profitable.
10068 VectorizationFactor EpilogueVF =
10069 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
10070 if (EpilogueVF.Width.isVector()) {
10071
10072 // The first pass vectorizes the main loop and creates a scalar epilogue
10073 // to be vectorized by executing the plan (potentially with a different
10074 // factor) again shortly afterwards.
10075 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10076 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10077 EPI, &LVL, &CM, BFI, PSI, Checks);
10078
10079 std::unique_ptr<VPlan> BestMainPlan(
10080 LVP.getBestPlanFor(VF: EPI.MainLoopVF).duplicate());
10081 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
10082 BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, IsEpilogueVectorization: true);
10083 ++LoopsVectorized;
10084
10085 // Second pass vectorizes the epilogue and adjusts the control flow
10086 // edges from the first pass.
10087 EPI.MainLoopVF = EPI.EpilogueVF;
10088 EPI.MainLoopUF = EPI.EpilogueUF;
10089 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10090 ORE, EPI, &LVL, &CM, BFI, PSI,
10091 Checks);
10092
10093 VPlan &BestEpiPlan = LVP.getBestPlanFor(VF: EPI.EpilogueVF);
10094 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10095 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10096 Header->setName("vec.epilog.vector.body");
10097
10098 // Re-use the trip count and steps expanded for the main loop, as
10099 // skeleton creation needs it as a value that dominates both the scalar
10100 // and vector epilogue loops
10101 // TODO: This is a workaround needed for epilogue vectorization and it
10102 // should be removed once induction resume value creation is done
10103 // directly in VPlan.
10104 EpilogILV.setTripCount(MainILV.getTripCount());
10105 for (auto &R : make_early_inc_range(Range&: *BestEpiPlan.getPreheader())) {
10106 auto *ExpandR = cast<VPExpandSCEVRecipe>(Val: &R);
10107 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
10108 V: ExpandedSCEVs.find(Val: ExpandR->getSCEV())->second);
10109 ExpandR->replaceAllUsesWith(New: ExpandedVal);
10110 if (BestEpiPlan.getTripCount() == ExpandR)
10111 BestEpiPlan.resetTripCount(NewTripCount: ExpandedVal);
10112 ExpandR->eraseFromParent();
10113 }
10114
10115 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10116 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10117 // before vectorizing the epilogue loop.
10118 for (VPRecipeBase &R : Header->phis()) {
10119 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
10120 continue;
10121
10122 Value *ResumeV = nullptr;
10123 // TODO: Move setting of resume values to prepareToExecute.
10124 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
10125 ResumeV = ReductionResumeValues
10126 .find(Val: &ReductionPhi->getRecurrenceDescriptor())
10127 ->second;
10128 } else {
10129 // Create induction resume values for both widened pointer and
10130 // integer/fp inductions and update the start value of the induction
10131 // recipes to use the resume value.
10132 PHINode *IndPhi = nullptr;
10133 const InductionDescriptor *ID;
10134 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
10135 IndPhi = cast<PHINode>(Val: Ind->getUnderlyingValue());
10136 ID = &Ind->getInductionDescriptor();
10137 } else {
10138 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(Val: &R);
10139 IndPhi = WidenInd->getPHINode();
10140 ID = &WidenInd->getInductionDescriptor();
10141 }
10142
10143 ResumeV = MainILV.createInductionResumeValue(
10144 OrigPhi: IndPhi, II: *ID, Step: getExpandedStep(ID: *ID, ExpandedSCEVs),
10145 BypassBlocks: {EPI.MainLoopIterationCountCheck});
10146 }
10147 assert(ResumeV && "Must have a resume value");
10148 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(V: ResumeV);
10149 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
10150 }
10151
10152 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV,
10153 DT, IsEpilogueVectorization: true, ExpandedSCEVs: &ExpandedSCEVs);
10154 ++LoopsEpilogueVectorized;
10155
10156 if (!MainILV.areSafetyChecksAdded())
10157 DisableRuntimeUnroll = true;
10158 } else {
10159 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10160 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10161 PSI, Checks);
10162
10163 VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);
10164 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
10165 ++LoopsVectorized;
10166
10167 // Add metadata to disable runtime unrolling a scalar loop when there
10168 // are no runtime checks about strides and memory. A scalar loop that is
10169 // rarely used is not worth unrolling.
10170 if (!LB.areSafetyChecksAdded())
10171 DisableRuntimeUnroll = true;
10172 }
10173 // Report the vectorization decision.
10174 reportVectorization(ORE, TheLoop: L, VF, IC);
10175 }
10176
10177 if (ORE->allowExtraAnalysis(LV_NAME))
10178 checkMixedPrecision(L, ORE);
10179 }
10180
10181 std::optional<MDNode *> RemainderLoopID =
10182 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
10183 LLVMLoopVectorizeFollowupEpilogue});
10184 if (RemainderLoopID) {
10185 L->setLoopID(*RemainderLoopID);
10186 } else {
10187 if (DisableRuntimeUnroll)
10188 AddRuntimeUnrollDisableMetaData(L);
10189
10190 // Mark the loop as already vectorized to avoid vectorizing again.
10191 Hints.setAlreadyVectorized();
10192 }
10193
10194 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10195 return true;
10196}
10197
10198LoopVectorizeResult LoopVectorizePass::runImpl(
10199 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10200 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10201 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10202 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10203 SE = &SE_;
10204 LI = &LI_;
10205 TTI = &TTI_;
10206 DT = &DT_;
10207 BFI = BFI_;
10208 TLI = TLI_;
10209 AC = &AC_;
10210 LAIs = &LAIs_;
10211 DB = &DB_;
10212 ORE = &ORE_;
10213 PSI = PSI_;
10214
10215 // Don't attempt if
10216 // 1. the target claims to have no vector registers, and
10217 // 2. interleaving won't help ILP.
10218 //
10219 // The second condition is necessary because, even if the target has no
10220 // vector registers, loop vectorization may still enable scalar
10221 // interleaving.
10222 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
10223 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
10224 return LoopVectorizeResult(false, false);
10225
10226 bool Changed = false, CFGChanged = false;
10227
10228 // The vectorizer requires loops to be in simplified form.
10229 // Since simplification may add new inner loops, it has to run before the
10230 // legality and profitability checks. This means running the loop vectorizer
10231 // will simplify all loops, regardless of whether anything ends up being
10232 // vectorized.
10233 for (const auto &L : *LI)
10234 Changed |= CFGChanged |=
10235 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
10236
10237 // Build up a worklist of inner-loops to vectorize. This is necessary as
10238 // the act of vectorizing or partially unrolling a loop creates new loops
10239 // and can invalidate iterators across the loops.
10240 SmallVector<Loop *, 8> Worklist;
10241
10242 for (Loop *L : *LI)
10243 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
10244
10245 LoopsAnalyzed += Worklist.size();
10246
10247 // Now walk the identified inner loops.
10248 while (!Worklist.empty()) {
10249 Loop *L = Worklist.pop_back_val();
10250
10251 // For the inner loops we actually process, form LCSSA to simplify the
10252 // transform.
10253 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
10254
10255 Changed |= CFGChanged |= processLoop(L);
10256
10257 if (Changed) {
10258 LAIs->clear();
10259
10260#ifndef NDEBUG
10261 if (VerifySCEV)
10262 SE->verify();
10263#endif
10264 }
10265 }
10266
10267 // Process each loop nest in the function.
10268 return LoopVectorizeResult(Changed, CFGChanged);
10269}
10270
10271PreservedAnalyses LoopVectorizePass::run(Function &F,
10272 FunctionAnalysisManager &AM) {
10273 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
10274 // There are no loops in the function. Return before computing other expensive
10275 // analyses.
10276 if (LI.empty())
10277 return PreservedAnalyses::all();
10278 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
10279 auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
10280 auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
10281 auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
10282 auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
10283 auto &DB = AM.getResult<DemandedBitsAnalysis>(IR&: F);
10284 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
10285
10286 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F);
10287 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
10288 ProfileSummaryInfo *PSI =
10289 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
10290 BlockFrequencyInfo *BFI = nullptr;
10291 if (PSI && PSI->hasProfileSummary())
10292 BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F);
10293 LoopVectorizeResult Result =
10294 runImpl(F, SE_&: SE, LI_&: LI, TTI_&: TTI, DT_&: DT, BFI_: BFI, TLI_: &TLI, DB_&: DB, AC_&: AC, LAIs_&: LAIs, ORE_&: ORE, PSI_: PSI);
10295 if (!Result.MadeAnyChange)
10296 return PreservedAnalyses::all();
10297 PreservedAnalyses PA;
10298
10299 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
10300 for (auto &BB : F)
10301 RemoveRedundantDbgInstrs(BB: &BB);
10302 }
10303
10304 // We currently do not preserve loopinfo/dominator analyses with outer loop
10305 // vectorization. Until this is addressed, mark these analyses as preserved
10306 // only for non-VPlan-native path.
10307 // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10308 if (!EnableVPlanNativePath) {
10309 PA.preserve<LoopAnalysis>();
10310 PA.preserve<DominatorTreeAnalysis>();
10311 PA.preserve<ScalarEvolutionAnalysis>();
10312 }
10313
10314 if (Result.MadeCFGChange) {
10315 // Making CFG changes likely means a loop got vectorized. Indicate that
10316 // extra simplification passes should be run.
10317 // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10318 // be run if runtime checks have been added.
10319 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10320 PA.preserve<ShouldRunExtraVectorPasses>();
10321 } else {
10322 PA.preserveSet<CFGAnalyses>();
10323 }
10324 return PA;
10325}
10326
10327void LoopVectorizePass::printPipeline(
10328 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10329 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10330 OS, MapClassName2PassName);
10331
10332 OS << '<';
10333 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10334 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10335 OS << '>';
10336}
10337
