1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | // and generates target-independent LLVM-IR. |
11 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | // of instructions in order to estimate the profitability of vectorization. |
13 | // |
14 | // The loop vectorizer combines consecutive loop iterations into a single |
15 | // 'wide' iteration. After this transformation the index is incremented |
16 | // by the SIMD vector width, and not by one. |
17 | // |
// This pass has four parts:
19 | // 1. The main loop pass that drives the different parts. |
20 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | // of the vectorization. |
22 | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | // widening of instructions. |
24 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | // of vectorization. It decides on the optimal vector width, which |
26 | // can be one, if vectorization is not profitable. |
27 | // |
28 | // There is a development effort going on to migrate loop vectorizer to the |
29 | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | // docs/VectorizationPlan.rst and |
31 | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | // alternative vectorization path that is natively implemented on top of the |
34 | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | // |
38 | // The reduction-variable vectorization is based on the paper: |
39 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | // |
41 | // Variable uniformity checks are inspired by: |
42 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | // |
44 | // The interleaved access vectorization is based on the paper: |
45 | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | // Data for SIMD |
47 | // |
48 | // Other ideas/concepts are from: |
49 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | // |
51 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | // Vectorizing Compilers. |
53 | // |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | #include "LoopVectorizationPlanner.h" |
58 | #include "VPRecipeBuilder.h" |
59 | #include "VPlan.h" |
60 | #include "VPlanAnalysis.h" |
61 | #include "VPlanHCFGBuilder.h" |
62 | #include "VPlanTransforms.h" |
63 | #include "VPlanVerifier.h" |
64 | #include "llvm/ADT/APInt.h" |
65 | #include "llvm/ADT/ArrayRef.h" |
66 | #include "llvm/ADT/DenseMap.h" |
67 | #include "llvm/ADT/DenseMapInfo.h" |
68 | #include "llvm/ADT/Hashing.h" |
69 | #include "llvm/ADT/MapVector.h" |
70 | #include "llvm/ADT/STLExtras.h" |
71 | #include "llvm/ADT/SmallPtrSet.h" |
72 | #include "llvm/ADT/SmallSet.h" |
73 | #include "llvm/ADT/SmallVector.h" |
74 | #include "llvm/ADT/Statistic.h" |
75 | #include "llvm/ADT/StringRef.h" |
76 | #include "llvm/ADT/Twine.h" |
77 | #include "llvm/ADT/iterator_range.h" |
78 | #include "llvm/Analysis/AssumptionCache.h" |
79 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
80 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
81 | #include "llvm/Analysis/CFG.h" |
82 | #include "llvm/Analysis/CodeMetrics.h" |
83 | #include "llvm/Analysis/DemandedBits.h" |
84 | #include "llvm/Analysis/GlobalsModRef.h" |
85 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
86 | #include "llvm/Analysis/LoopAnalysisManager.h" |
87 | #include "llvm/Analysis/LoopInfo.h" |
88 | #include "llvm/Analysis/LoopIterator.h" |
89 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
90 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
91 | #include "llvm/Analysis/ScalarEvolution.h" |
92 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
93 | #include "llvm/Analysis/TargetLibraryInfo.h" |
94 | #include "llvm/Analysis/TargetTransformInfo.h" |
95 | #include "llvm/Analysis/ValueTracking.h" |
96 | #include "llvm/Analysis/VectorUtils.h" |
97 | #include "llvm/IR/Attributes.h" |
98 | #include "llvm/IR/BasicBlock.h" |
99 | #include "llvm/IR/CFG.h" |
100 | #include "llvm/IR/Constant.h" |
101 | #include "llvm/IR/Constants.h" |
102 | #include "llvm/IR/DataLayout.h" |
103 | #include "llvm/IR/DebugInfo.h" |
104 | #include "llvm/IR/DebugInfoMetadata.h" |
105 | #include "llvm/IR/DebugLoc.h" |
106 | #include "llvm/IR/DerivedTypes.h" |
107 | #include "llvm/IR/DiagnosticInfo.h" |
108 | #include "llvm/IR/Dominators.h" |
109 | #include "llvm/IR/Function.h" |
110 | #include "llvm/IR/IRBuilder.h" |
111 | #include "llvm/IR/InstrTypes.h" |
112 | #include "llvm/IR/Instruction.h" |
113 | #include "llvm/IR/Instructions.h" |
114 | #include "llvm/IR/IntrinsicInst.h" |
115 | #include "llvm/IR/Intrinsics.h" |
116 | #include "llvm/IR/MDBuilder.h" |
117 | #include "llvm/IR/Metadata.h" |
118 | #include "llvm/IR/Module.h" |
119 | #include "llvm/IR/Operator.h" |
120 | #include "llvm/IR/PatternMatch.h" |
121 | #include "llvm/IR/ProfDataUtils.h" |
122 | #include "llvm/IR/Type.h" |
123 | #include "llvm/IR/Use.h" |
124 | #include "llvm/IR/User.h" |
125 | #include "llvm/IR/Value.h" |
126 | #include "llvm/IR/ValueHandle.h" |
127 | #include "llvm/IR/VectorBuilder.h" |
128 | #include "llvm/IR/Verifier.h" |
129 | #include "llvm/Support/Casting.h" |
130 | #include "llvm/Support/CommandLine.h" |
131 | #include "llvm/Support/Compiler.h" |
132 | #include "llvm/Support/Debug.h" |
133 | #include "llvm/Support/ErrorHandling.h" |
134 | #include "llvm/Support/InstructionCost.h" |
135 | #include "llvm/Support/MathExtras.h" |
136 | #include "llvm/Support/raw_ostream.h" |
137 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
138 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
139 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
140 | #include "llvm/Transforms/Utils/LoopUtils.h" |
141 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
142 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
143 | #include "llvm/Transforms/Utils/SizeOpts.h" |
144 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
145 | #include <algorithm> |
146 | #include <cassert> |
147 | #include <cmath> |
148 | #include <cstdint> |
149 | #include <functional> |
150 | #include <iterator> |
151 | #include <limits> |
152 | #include <map> |
153 | #include <memory> |
154 | #include <string> |
155 | #include <tuple> |
156 | #include <utility> |
157 | |
158 | using namespace llvm; |
159 | |
160 | #define LV_NAME "loop-vectorize" |
161 | #define DEBUG_TYPE LV_NAME |
162 | |
163 | #ifndef NDEBUG |
164 | const char VerboseDebug[] = DEBUG_TYPE "-verbose" ; |
165 | #endif |
166 | |
167 | /// @{ |
168 | /// Metadata attribute names |
169 | const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all" ; |
170 | const char LLVMLoopVectorizeFollowupVectorized[] = |
171 | "llvm.loop.vectorize.followup_vectorized" ; |
172 | const char LLVMLoopVectorizeFollowupEpilogue[] = |
173 | "llvm.loop.vectorize.followup_epilogue" ; |
174 | /// @} |
175 | |
176 | STATISTIC(LoopsVectorized, "Number of loops vectorized" ); |
177 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization" ); |
178 | STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized" ); |
179 | |
180 | static cl::opt<bool> EnableEpilogueVectorization( |
181 | "enable-epilogue-vectorization" , cl::init(Val: true), cl::Hidden, |
182 | cl::desc("Enable vectorization of epilogue loops." )); |
183 | |
184 | static cl::opt<unsigned> EpilogueVectorizationForceVF( |
185 | "epilogue-vectorization-force-VF" , cl::init(Val: 1), cl::Hidden, |
186 | cl::desc("When epilogue vectorization is enabled, and a value greater than " |
187 | "1 is specified, forces the given VF for all applicable epilogue " |
188 | "loops." )); |
189 | |
190 | static cl::opt<unsigned> EpilogueVectorizationMinVF( |
191 | "epilogue-vectorization-minimum-VF" , cl::init(Val: 16), cl::Hidden, |
192 | cl::desc("Only loops with vectorization factor equal to or larger than " |
193 | "the specified value are considered for epilogue vectorization." )); |
194 | |
195 | /// Loops with a known constant trip count below this number are vectorized only |
196 | /// if no scalar iteration overheads are incurred. |
197 | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
198 | "vectorizer-min-trip-count" , cl::init(Val: 16), cl::Hidden, |
199 | cl::desc("Loops with a constant trip count that is smaller than this " |
200 | "value are vectorized only if no scalar iteration overheads " |
201 | "are incurred." )); |
202 | |
203 | static cl::opt<unsigned> VectorizeMemoryCheckThreshold( |
204 | "vectorize-memory-check-threshold" , cl::init(Val: 128), cl::Hidden, |
205 | cl::desc("The maximum allowed number of runtime memory checks" )); |
206 | |
207 | // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, |
208 | // that predication is preferred, and this lists all options. I.e., the |
209 | // vectorizer will try to fold the tail-loop (epilogue) into the vector body |
210 | // and predicate the instructions accordingly. If tail-folding fails, there are |
211 | // different fallback strategies depending on these values: |
212 | namespace PreferPredicateTy { |
213 | enum Option { |
214 | ScalarEpilogue = 0, |
215 | PredicateElseScalarEpilogue, |
216 | PredicateOrDontVectorize |
217 | }; |
218 | } // namespace PreferPredicateTy |
219 | |
220 | static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
221 | "prefer-predicate-over-epilogue" , |
222 | cl::init(Val: PreferPredicateTy::ScalarEpilogue), |
223 | cl::Hidden, |
224 | cl::desc("Tail-folding and predication preferences over creating a scalar " |
225 | "epilogue loop." ), |
226 | cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
227 | "scalar-epilogue" , |
228 | "Don't tail-predicate loops, create scalar epilogue" ), |
229 | clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
230 | "predicate-else-scalar-epilogue" , |
231 | "prefer tail-folding, create scalar epilogue if tail " |
232 | "folding fails." ), |
233 | clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
234 | "predicate-dont-vectorize" , |
235 | "prefers tail-folding, don't attempt vectorization if " |
236 | "tail-folding fails." ))); |
237 | |
238 | static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( |
239 | "force-tail-folding-style" , cl::desc("Force the tail folding style" ), |
240 | cl::init(Val: TailFoldingStyle::None), |
241 | cl::values( |
242 | clEnumValN(TailFoldingStyle::None, "none" , "Disable tail folding" ), |
243 | clEnumValN( |
244 | TailFoldingStyle::Data, "data" , |
245 | "Create lane mask for data only, using active.lane.mask intrinsic" ), |
246 | clEnumValN(TailFoldingStyle::DataWithoutLaneMask, |
247 | "data-without-lane-mask" , |
248 | "Create lane mask with compare/stepvector" ), |
249 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control" , |
250 | "Create lane mask using active.lane.mask intrinsic, and use " |
251 | "it for both data and control flow" ), |
252 | clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
253 | "data-and-control-without-rt-check" , |
254 | "Similar to data-and-control, but remove the runtime check" ), |
255 | clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl" , |
256 | "Use predicated EVL instructions for tail folding. If EVL " |
257 | "is unsupported, fallback to data-without-lane-mask." ))); |
258 | |
259 | static cl::opt<bool> MaximizeBandwidth( |
260 | "vectorizer-maximize-bandwidth" , cl::init(Val: false), cl::Hidden, |
261 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
262 | "will be determined by the smallest type in loop." )); |
263 | |
264 | static cl::opt<bool> EnableInterleavedMemAccesses( |
265 | "enable-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
266 | cl::desc("Enable vectorization on interleaved memory accesses in a loop" )); |
267 | |
268 | /// An interleave-group may need masking if it resides in a block that needs |
269 | /// predication, or in order to mask away gaps. |
270 | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
271 | "enable-masked-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
272 | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop" )); |
273 | |
274 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
275 | "force-target-num-scalar-regs" , cl::init(Val: 0), cl::Hidden, |
276 | cl::desc("A flag that overrides the target's number of scalar registers." )); |
277 | |
278 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
279 | "force-target-num-vector-regs" , cl::init(Val: 0), cl::Hidden, |
280 | cl::desc("A flag that overrides the target's number of vector registers." )); |
281 | |
282 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
283 | "force-target-max-scalar-interleave" , cl::init(Val: 0), cl::Hidden, |
284 | cl::desc("A flag that overrides the target's max interleave factor for " |
285 | "scalar loops." )); |
286 | |
287 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
288 | "force-target-max-vector-interleave" , cl::init(Val: 0), cl::Hidden, |
289 | cl::desc("A flag that overrides the target's max interleave factor for " |
290 | "vectorized loops." )); |
291 | |
292 | static cl::opt<unsigned> ForceTargetInstructionCost( |
293 | "force-target-instruction-cost" , cl::init(Val: 0), cl::Hidden, |
294 | cl::desc("A flag that overrides the target's expected cost for " |
295 | "an instruction to a single constant value. Mostly " |
296 | "useful for getting consistent testing." )); |
297 | |
298 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
299 | "force-target-supports-scalable-vectors" , cl::init(Val: false), cl::Hidden, |
300 | cl::desc( |
301 | "Pretend that scalable vectors are supported, even if the target does " |
302 | "not support them. This flag should only be used for testing." )); |
303 | |
304 | static cl::opt<unsigned> SmallLoopCost( |
305 | "small-loop-cost" , cl::init(Val: 20), cl::Hidden, |
306 | cl::desc( |
307 | "The cost of a loop that is considered 'small' by the interleaver." )); |
308 | |
309 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
310 | "loop-vectorize-with-block-frequency" , cl::init(Val: true), cl::Hidden, |
311 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
312 | "heuristics minimizing code growth in cold regions and being more " |
313 | "aggressive in hot regions." )); |
314 | |
315 | // Runtime interleave loops for load/store throughput. |
316 | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
317 | "enable-loadstore-runtime-interleave" , cl::init(Val: true), cl::Hidden, |
318 | cl::desc( |
319 | "Enable runtime interleaving until load/store ports are saturated" )); |
320 | |
321 | /// The number of stores in a loop that are allowed to need predication. |
322 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
323 | "vectorize-num-stores-pred" , cl::init(Val: 1), cl::Hidden, |
324 | cl::desc("Max number of stores to be predicated behind an if." )); |
325 | |
326 | static cl::opt<bool> EnableIndVarRegisterHeur( |
327 | "enable-ind-var-reg-heur" , cl::init(Val: true), cl::Hidden, |
328 | cl::desc("Count the induction variable only once when interleaving" )); |
329 | |
330 | static cl::opt<bool> EnableCondStoresVectorization( |
331 | "enable-cond-stores-vec" , cl::init(Val: true), cl::Hidden, |
332 | cl::desc("Enable if predication of stores during vectorization." )); |
333 | |
334 | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
335 | "max-nested-scalar-reduction-interleave" , cl::init(Val: 2), cl::Hidden, |
336 | cl::desc("The maximum interleave count to use when interleaving a scalar " |
337 | "reduction in a nested loop." )); |
338 | |
339 | static cl::opt<bool> |
340 | PreferInLoopReductions("prefer-inloop-reductions" , cl::init(Val: false), |
341 | cl::Hidden, |
342 | cl::desc("Prefer in-loop vector reductions, " |
343 | "overriding the targets preference." )); |
344 | |
345 | static cl::opt<bool> ForceOrderedReductions( |
346 | "force-ordered-reductions" , cl::init(Val: false), cl::Hidden, |
347 | cl::desc("Enable the vectorisation of loops with in-order (strict) " |
348 | "FP reductions" )); |
349 | |
350 | static cl::opt<bool> PreferPredicatedReductionSelect( |
351 | "prefer-predicated-reduction-select" , cl::init(Val: false), cl::Hidden, |
352 | cl::desc( |
353 | "Prefer predicating a reduction operation over an after loop select." )); |
354 | |
355 | namespace llvm { |
356 | cl::opt<bool> EnableVPlanNativePath( |
357 | "enable-vplan-native-path" , cl::Hidden, |
358 | cl::desc("Enable VPlan-native vectorization path with " |
359 | "support for outer loop vectorization." )); |
360 | } |
361 | |
362 | // This flag enables the stress testing of the VPlan H-CFG construction in the |
363 | // VPlan-native vectorization path. It must be used in conjuction with |
364 | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
365 | // verification of the H-CFGs built. |
366 | static cl::opt<bool> VPlanBuildStressTest( |
367 | "vplan-build-stress-test" , cl::init(Val: false), cl::Hidden, |
368 | cl::desc( |
369 | "Build VPlan for every supported loop nest in the function and bail " |
370 | "out right after the build (stress test the VPlan H-CFG construction " |
371 | "in the VPlan-native vectorization path)." )); |
372 | |
373 | cl::opt<bool> llvm::EnableLoopInterleaving( |
374 | "interleave-loops" , cl::init(Val: true), cl::Hidden, |
375 | cl::desc("Enable loop interleaving in Loop vectorization passes" )); |
376 | cl::opt<bool> llvm::EnableLoopVectorization( |
377 | "vectorize-loops" , cl::init(Val: true), cl::Hidden, |
378 | cl::desc("Run the Loop vectorization passes" )); |
379 | |
380 | static cl::opt<bool> PrintVPlansInDotFormat( |
381 | "vplan-print-in-dot-format" , cl::Hidden, |
382 | cl::desc("Use dot format instead of plain text when dumping VPlans" )); |
383 | |
384 | static cl::opt<cl::boolOrDefault> ForceSafeDivisor( |
385 | "force-widen-divrem-via-safe-divisor" , cl::Hidden, |
386 | cl::desc( |
387 | "Override cost based safe divisor widening for div/rem instructions" )); |
388 | |
389 | static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
390 | "vectorizer-maximize-bandwidth-for-vector-calls" , cl::init(Val: true), |
391 | cl::Hidden, |
392 | cl::desc("Try wider VFs if they enable the use of vector variants" )); |
393 | |
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403 | |
404 | /// A helper function that returns true if the given type is irregular. The |
405 | /// type is irregular if its allocated size doesn't equal the store size of an |
406 | /// element of the corresponding vector type. |
407 | static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
408 | // Determine if an array of N elements of type Ty is "bitcast compatible" |
409 | // with a <N x Ty> vector. |
410 | // This is only true if there is no padding between the array elements. |
411 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
412 | } |
413 | |
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
421 | |
422 | /// Returns "best known" trip count for the specified loop \p L as defined by |
423 | /// the following procedure: |
424 | /// 1) Returns exact trip count if it is known. |
425 | /// 2) Returns expected trip count according to profile data if any. |
426 | /// 3) Returns upper bound estimate if it is known. |
427 | /// 4) Returns std::nullopt if all of the above failed. |
428 | static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, |
429 | Loop *L) { |
430 | // Check if exact trip count is known. |
431 | if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
432 | return ExpectedTC; |
433 | |
434 | // Check if there is an expected trip count available from profile data. |
435 | if (LoopVectorizeWithBlockFrequency) |
436 | if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
437 | return *EstimatedTC; |
438 | |
439 | // Check if upper bound estimate is known. |
440 | if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
441 | return ExpectedTC; |
442 | |
443 | return std::nullopt; |
444 | } |
445 | |
446 | /// Return a vector containing interleaved elements from multiple |
447 | /// smaller input vectors. |
448 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
449 | const Twine &Name) { |
450 | unsigned Factor = Vals.size(); |
451 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
452 | |
453 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
454 | #ifndef NDEBUG |
455 | for (Value *Val : Vals) |
456 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
457 | #endif |
458 | |
459 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
460 | // must use intrinsics to interleave. |
461 | if (VecTy->isScalableTy()) { |
462 | VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VTy: VecTy); |
463 | return Builder.CreateIntrinsic( |
464 | WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, |
465 | /*FMFSource=*/nullptr, Name); |
466 | } |
467 | |
468 | // Fixed length. Start by concatenating all vectors into a wide vector. |
469 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
470 | |
471 | // Interleave the elements into the wide vector. |
472 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
473 | return Builder.CreateShuffleVector( |
474 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
475 | } |
476 | |
477 | namespace { |
478 | // Forward declare GeneratedRTChecks. |
479 | class GeneratedRTChecks; |
480 | |
481 | using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; |
482 | } // namespace |
483 | |
484 | namespace llvm { |
485 | |
486 | AnalysisKey ShouldRunExtraVectorPasses::; |
487 | |
488 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
489 | /// block to a specified vectorization factor (VF). |
490 | /// This class performs the widening of scalars into vectors, or multiple |
491 | /// scalars. This class also implements the following features: |
492 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
493 | /// counts that are known to be a multiple of the vectorization factor. |
494 | /// * It handles the code generation for reduction variables. |
495 | /// * Scalarization (implementation using scalars) of un-vectorizable |
496 | /// instructions. |
497 | /// InnerLoopVectorizer does not perform any vectorization-legality |
498 | /// checks, and relies on the caller to check for the different legality |
499 | /// aspects. The InnerLoopVectorizer relies on the |
500 | /// LoopVectorizationLegality class to provide information about the induction |
501 | /// and reduction variables that were found to a given vectorization factor. |
502 | class InnerLoopVectorizer { |
503 | public: |
504 | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
505 | LoopInfo *LI, DominatorTree *DT, |
506 | const TargetLibraryInfo *TLI, |
507 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
508 | OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
509 | ElementCount MinProfitableTripCount, |
510 | unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
511 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
512 | ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
513 | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
514 | AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
515 | Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
516 | PSI(PSI), RTChecks(RTChecks) { |
517 | // Query this against the original loop and save it here because the profile |
518 | // of the original loop header may change as the transformation happens. |
519 | OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
520 | BB: OrigLoop->getHeader(), PSI, BFI, QueryType: PGSOQueryType::IRPass); |
521 | |
522 | if (MinProfitableTripCount.isZero()) |
523 | this->MinProfitableTripCount = VecWidth; |
524 | else |
525 | this->MinProfitableTripCount = MinProfitableTripCount; |
526 | } |
527 | |
528 | virtual ~InnerLoopVectorizer() = default; |
529 | |
530 | /// Create a new empty loop that will contain vectorized instructions later |
531 | /// on, while the old loop will be used as the scalar remainder. Control flow |
532 | /// is generated around the vectorized (and scalar epilogue) loops consisting |
533 | /// of various checks and bypasses. Return the pre-header block of the new |
534 | /// loop and the start value for the canonical induction, if it is != 0. The |
535 | /// latter is the case when vectorizing the epilogue loop. In the case of |
536 | /// epilogue vectorization, this function is overriden to handle the more |
537 | /// complex control flow around the loops. \p ExpandedSCEVs is used to |
538 | /// look up SCEV expansions for expressions needed during skeleton creation. |
539 | virtual std::pair<BasicBlock *, Value *> |
540 | createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); |
541 | |
542 | /// Fix the vectorized code, taking care of header phi's, live-outs, and more. |
543 | void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); |
544 | |
545 | // Return true if any runtime check is added. |
546 | bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
547 | |
548 | /// A helper function to scalarize a single Instruction in the innermost loop. |
549 | /// Generates a sequence of scalar instances for each lane between \p MinLane |
550 | /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
551 | /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p |
552 | /// Instr's operands. |
553 | void scalarizeInstruction(const Instruction *Instr, |
554 | VPReplicateRecipe *RepRecipe, |
555 | const VPIteration &Instance, |
556 | VPTransformState &State); |
557 | |
558 | /// Try to vectorize interleaved access group \p Group with the base address |
559 | /// given in \p Addr, optionally masking the vector operations if \p |
560 | /// BlockInMask is non-null. Use \p State to translate given VPValues to IR |
561 | /// values in the vectorized loop. |
562 | void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, |
563 | ArrayRef<VPValue *> VPDefs, |
564 | VPTransformState &State, VPValue *Addr, |
565 | ArrayRef<VPValue *> StoredValues, |
566 | VPValue *BlockInMask, bool NeedsMaskForGaps); |
567 | |
568 | /// Fix the non-induction PHIs in \p Plan. |
569 | void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); |
570 | |
571 | /// Create a new phi node for the induction variable \p OrigPhi to resume |
572 | /// iteration count in the scalar epilogue, from where the vectorized loop |
573 | /// left off. \p Step is the SCEV-expanded induction step to use. In cases |
574 | /// where the loop skeleton is more complicated (i.e., epilogue vectorization) |
575 | /// and the resume values can come from an additional bypass block, the \p |
576 | /// AdditionalBypass pair provides information about the bypass block and the |
577 | /// end value on the edge from bypass to this loop. |
578 | PHINode *createInductionResumeValue( |
579 | PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, |
580 | ArrayRef<BasicBlock *> BypassBlocks, |
581 | std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
582 | |
583 | /// Returns the original loop trip count. |
584 | Value *getTripCount() const { return TripCount; } |
585 | |
586 | /// Used to set the trip count after ILV's construction and after the |
587 | /// preheader block has been executed. Note that this always holds the trip |
588 | /// count of the original loop for both main loop and epilogue vectorization. |
589 | void setTripCount(Value *TC) { TripCount = TC; } |
590 | |
591 | protected: |
592 | friend class LoopVectorizationPlanner; |
593 | |
594 | /// A small list of PHINodes. |
595 | using PhiVector = SmallVector<PHINode *, 4>; |
596 | |
597 | /// A type for scalarized values in the new loop. Each value from the |
598 | /// original loop, when scalarized, is represented by UF x VF scalar values |
599 | /// in the new unrolled loop, where UF is the unroll factor and VF is the |
600 | /// vectorization factor. |
601 | using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
602 | |
603 | /// Set up the values of the IVs correctly when exiting the vector loop. |
604 | void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
605 | Value *VectorTripCount, Value *EndValue, |
606 | BasicBlock *MiddleBlock, BasicBlock *, |
607 | VPlan &Plan, VPTransformState &State); |
608 | |
  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested. The defaults are no-ops;
  /// the epilogue vectorizer subclasses override both.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};
663 | |
  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// Minimum trip count below which vectorization is presumably not
  /// profitable — TODO confirm against the code that consumes this field.
  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;
703 | // --- Vectorization state --- |
704 | |
705 | /// The vector-loop preheader. |
706 | BasicBlock *; |
707 | |
708 | /// The scalar-loop preheader. |
709 | BasicBlock *; |
710 | |
711 | /// Middle Block between the vector and the scalar. |
712 | BasicBlock *LoopMiddleBlock; |
713 | |
  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
762 | }; |
763 | |
764 | class InnerLoopUnroller : public InnerLoopVectorizer { |
765 | public: |
766 | InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
767 | LoopInfo *LI, DominatorTree *DT, |
768 | const TargetLibraryInfo *TLI, |
769 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
770 | OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
771 | LoopVectorizationLegality *LVL, |
772 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
773 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
774 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
775 | ElementCount::getFixed(MinVal: 1), |
776 | ElementCount::getFixed(MinVal: 1), UnrollFactor, LVL, CM, |
777 | BFI, PSI, Check) {} |
778 | }; |
779 | |
780 | /// Encapsulate information regarding vectorization of a loop and its epilogue. |
781 | /// This information is meant to be updated and used across two stages of |
782 | /// epilogue vectorization. |
783 | struct EpilogueLoopVectorizationInfo { |
784 | ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0); |
785 | unsigned MainLoopUF = 0; |
786 | ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0); |
787 | unsigned EpilogueUF = 0; |
788 | BasicBlock *MainLoopIterationCountCheck = nullptr; |
789 | BasicBlock *EpilogueIterationCountCheck = nullptr; |
790 | BasicBlock *SCEVSafetyCheck = nullptr; |
791 | BasicBlock *MemSafetyCheck = nullptr; |
792 | Value *TripCount = nullptr; |
793 | Value *VectorTripCount = nullptr; |
794 | |
795 | EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
796 | ElementCount EVF, unsigned EUF) |
797 | : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { |
798 | assert(EUF == 1 && |
799 | "A high UF for the epilogue loop is likely not beneficial." ); |
800 | } |
801 | }; |
802 | |
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      // Note: EPI.MainLoopVF is passed for both VF and MinProfitableTripCount.
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
846 | |
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
875 | |
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    // Reuse the trip count computed during the main-loop pass.
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
908 | } // end namespace llvm |
909 | |
910 | /// Look for a meaningful debug location on the instruction or it's |
911 | /// operands. |
912 | static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { |
913 | if (!I) |
914 | return DebugLoc(); |
915 | |
916 | DebugLoc Empty; |
917 | if (I->getDebugLoc() != Empty) |
918 | return I->getDebugLoc(); |
919 | |
920 | for (Use &Op : I->operands()) { |
921 | if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op)) |
922 | if (OpInst->getDebugLoc() != Empty) |
923 | return OpInst->getDebugLoc(); |
924 | } |
925 | |
926 | return I->getDebugLoc(); |
927 | } |
928 | |
929 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
930 | /// is passed, the message relates to that particular instruction. |
931 | #ifndef NDEBUG |
932 | static void debugVectorizationMessage(const StringRef Prefix, |
933 | const StringRef DebugMsg, |
934 | Instruction *I) { |
935 | dbgs() << "LV: " << Prefix << DebugMsg; |
936 | if (I != nullptr) |
937 | dbgs() << " " << *I; |
938 | else |
939 | dbgs() << '.'; |
940 | dbgs() << '\n'; |
941 | } |
942 | #endif |
943 | |
944 | /// Create an analysis remark that explains why vectorization failed |
945 | /// |
946 | /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
947 | /// RemarkName is the identifier for the remark. If \p I is passed it is an |
948 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
949 | /// the location of the remark. \return the remark object that can be |
950 | /// streamed to. |
951 | static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
952 | StringRef , Loop *TheLoop, Instruction *I) { |
953 | Value *CodeRegion = TheLoop->getHeader(); |
954 | DebugLoc DL = TheLoop->getStartLoc(); |
955 | |
956 | if (I) { |
957 | CodeRegion = I->getParent(); |
958 | // If there is no debug location attached to the instruction, revert back to |
959 | // using the loop's. |
960 | if (I->getDebugLoc()) |
961 | DL = I->getDebugLoc(); |
962 | } |
963 | |
964 | return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
965 | } |
966 | |
967 | namespace llvm { |
968 | |
969 | /// Return a value for Step multiplied by VF. |
970 | Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, |
971 | int64_t Step) { |
972 | assert(Ty->isIntegerTy() && "Expected an integer step" ); |
973 | return B.CreateElementCount(DstType: Ty, EC: VF.multiplyCoefficientBy(RHS: Step)); |
974 | } |
975 | |
976 | /// Return the runtime value for VF. |
977 | Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { |
978 | return B.CreateElementCount(DstType: Ty, EC: VF); |
979 | } |
980 | |
981 | const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, |
982 | Loop *OrigLoop) { |
983 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
984 | assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count" ); |
985 | |
986 | ScalarEvolution &SE = *PSE.getSE(); |
987 | return SE.getTripCountFromExitCount(ExitCount: BackedgeTakenCount, EvalTy: IdxTy, L: OrigLoop); |
988 | } |
989 | |
990 | void (const StringRef DebugMsg, |
991 | const StringRef OREMsg, const StringRef ORETag, |
992 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
993 | Instruction *I) { |
994 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: " , DebugMsg, I)); |
995 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
996 | ORE->emit( |
997 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
998 | << "loop not vectorized: " << OREMsg); |
999 | } |
1000 | |
1001 | void (const StringRef Msg, const StringRef ORETag, |
1002 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1003 | Instruction *I) { |
1004 | LLVM_DEBUG(debugVectorizationMessage("" , Msg, I)); |
1005 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
1006 | ORE->emit( |
1007 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
1008 | << Msg); |
1009 | } |
1010 | |
1011 | /// Report successful vectorization of the loop. In case an outer loop is |
1012 | /// vectorized, prepend "outer" to the vectorization remark. |
1013 | static void (OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1014 | VectorizationFactor VF, unsigned IC) { |
1015 | LLVM_DEBUG(debugVectorizationMessage( |
1016 | "Vectorizing: " , TheLoop->isInnermost() ? "innermost loop" : "outer loop" , |
1017 | nullptr)); |
1018 | StringRef LoopType = TheLoop->isInnermost() ? "" : "outer " ; |
1019 | ORE->emit(RemarkBuilder: [&]() { |
1020 | return OptimizationRemark(LV_NAME, "Vectorized" , TheLoop->getStartLoc(), |
1021 | TheLoop->getHeader()) |
1022 | << "vectorized " << LoopType << "loop (vectorization width: " |
1023 | << ore::NV("VectorizationFactor" , VF.Width) |
1024 | << ", interleaved count: " << ore::NV("InterleaveCount" , IC) << ")" ; |
1025 | }); |
1026 | } |
1027 | |
1028 | } // end namespace llvm |
1029 | |
1030 | #ifndef NDEBUG |
1031 | /// \return string containing a file name and a line # for the given loop. |
1032 | static std::string getDebugLocString(const Loop *L) { |
1033 | std::string Result; |
1034 | if (L) { |
1035 | raw_string_ostream OS(Result); |
1036 | if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
1037 | LoopDbgLoc.print(OS); |
1038 | else |
1039 | // Just print the module name. |
1040 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
1041 | OS.flush(); |
1042 | } |
1043 | return Result; |
1044 | } |
1045 | #endif |
1046 | |
1047 | namespace llvm { |
1048 | |
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1071 | |
/// An instruction paired with the vectorization factor it was considered at.
using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073 | |
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
1095 | |
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();
1104 | |
1105 | /// Setup cost-based decisions for user vectorization factor. |
1106 | /// \return true if the UserVF is a feasible VF to be chosen. |
1107 | bool selectUserVectorizationFactor(ElementCount UserVF) { |
1108 | collectUniformsAndScalars(VF: UserVF); |
1109 | collectInstsToScalarize(VF: UserVF); |
1110 | return expectedCost(VF: UserVF).first.isValid(); |
1111 | } |
1112 | |
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();
1165 | |
1166 | /// Returns true if we should use strict in-order reductions for the given |
1167 | /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
1168 | /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
1169 | /// of FP operations. |
1170 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { |
1171 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
1172 | } |
1173 | |
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1180 | |
1181 | /// \returns True if it is more profitable to scalarize instruction \p I for |
1182 | /// vectorization factor \p VF. |
1183 | bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
1184 | assert(VF.isVector() && |
1185 | "Profitable to scalarize relevant only for VF > 1." ); |
1186 | assert( |
1187 | TheLoop->isInnermost() && |
1188 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1189 | |
1190 | auto Scalars = InstsToScalarize.find(Val: VF); |
1191 | assert(Scalars != InstsToScalarize.end() && |
1192 | "VF not yet analyzed for scalarization profitability" ); |
1193 | return Scalars->second.contains(Val: I); |
1194 | } |
1195 | |
1196 | /// Returns true if \p I is known to be uniform after vectorization. |
1197 | bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
1198 | assert( |
1199 | TheLoop->isInnermost() && |
1200 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1201 | // Pseudo probe needs to be duplicated for each unrolled iteration and |
1202 | // vector lane so that profiled loop trip count can be accurately |
1203 | // accumulated instead of being under counted. |
1204 | if (isa<PseudoProbeInst>(Val: I)) |
1205 | return false; |
1206 | |
1207 | if (VF.isScalar()) |
1208 | return true; |
1209 | |
1210 | auto UniformsPerVF = Uniforms.find(Val: VF); |
1211 | assert(UniformsPerVF != Uniforms.end() && |
1212 | "VF not yet analyzed for uniformity" ); |
1213 | return UniformsPerVF->second.count(Ptr: I); |
1214 | } |
1215 | |
1216 | /// Returns true if \p I is known to be scalar after vectorization. |
1217 | bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
1218 | assert( |
1219 | TheLoop->isInnermost() && |
1220 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1221 | if (VF.isScalar()) |
1222 | return true; |
1223 | |
1224 | auto ScalarsPerVF = Scalars.find(Val: VF); |
1225 | assert(ScalarsPerVF != Scalars.end() && |
1226 | "Scalar values are not calculated for VF" ); |
1227 | return ScalarsPerVF->second.count(Ptr: I); |
1228 | } |
1229 | |
1230 | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
1231 | /// for vectorization factor \p VF. |
1232 | bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
1233 | return VF.isVector() && MinBWs.contains(Key: I) && |
1234 | !isProfitableToScalarize(I, VF) && |
1235 | !isScalarAfterVectorization(I, VF); |
1236 | } |
1237 | |
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Lower as part of an interleave group.
    CM_GatherScatter, // Lower as a gather/scatter.
    CM_Scalarize,     // Replicate as scalar operations.
    CM_VectorCall,    // Call a vector variant (calls only).
    CM_IntrinsicCall  // Call a vector intrinsic (calls only).
  };
1249 | |
1250 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1251 | /// instruction \p I and vector width \p VF. |
1252 | void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
1253 | InstructionCost Cost) { |
1254 | assert(VF.isVector() && "Expected VF >=2" ); |
1255 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1256 | } |
1257 | |
1258 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1259 | /// interleaving group \p Grp and vector width \p VF. |
1260 | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
1261 | ElementCount VF, InstWidening W, |
1262 | InstructionCost Cost) { |
1263 | assert(VF.isVector() && "Expected VF >=2" ); |
1264 | /// Broadcast this decicion to all instructions inside the group. |
1265 | /// But the cost will be assigned to one instruction only. |
1266 | for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
1267 | if (auto *I = Grp->getMember(Index: i)) { |
1268 | if (Grp->getInsertPos() == I) |
1269 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1270 | else |
1271 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y: 0); |
1272 | } |
1273 | } |
1274 | } |
1275 | |
1276 | /// Return the cost model decision for the given instruction \p I and vector |
1277 | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1278 | /// through the cost modeling. |
1279 | InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
1280 | assert(VF.isVector() && "Expected VF to be a vector VF" ); |
1281 | assert( |
1282 | TheLoop->isInnermost() && |
1283 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1284 | |
1285 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1286 | auto Itr = WideningDecisions.find(Val: InstOnVF); |
1287 | if (Itr == WideningDecisions.end()) |
1288 | return CM_Unknown; |
1289 | return Itr->second.first; |
1290 | } |
1291 | |
1292 | /// Return the vectorization cost for the given instruction \p I and vector |
1293 | /// width \p VF. |
1294 | InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
1295 | assert(VF.isVector() && "Expected VF >=2" ); |
1296 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1297 | assert(WideningDecisions.contains(InstOnVF) && |
1298 | "The cost is not calculated" ); |
1299 | return WideningDecisions[InstOnVF].second; |
1300 | } |
1301 | |
  /// Cost-model decision for a call instruction at a given VF; stored by
  /// setCallWideningDecision and queried by getCallWideningDecision.
  struct CallWideningDecision {
    // How the call is to be lowered (see InstWidening).
    InstWidening Kind;
    // Vector variant to call — presumably only meaningful for CM_VectorCall;
    // confirm against setVectorizedCallDecision.
    Function *Variant;
    // Vector intrinsic ID — presumably for CM_IntrinsicCall; confirm likewise.
    Intrinsic::ID IID;
    // Operand position of the mask, if the chosen lowering takes one.
    std::optional<unsigned> MaskPos;
    // Cost of the call under this decision.
    InstructionCost Cost;
  };
1309 | |
1310 | void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, |
1311 | Function *Variant, Intrinsic::ID IID, |
1312 | std::optional<unsigned> MaskPos, |
1313 | InstructionCost Cost) { |
1314 | assert(!VF.isScalar() && "Expected vector VF" ); |
1315 | CallWideningDecisions[std::make_pair(x&: CI, y&: VF)] = {.Kind: Kind, .Variant: Variant, .IID: IID, |
1316 | .MaskPos: MaskPos, .Cost: Cost}; |
1317 | } |
1318 | |
1319 | CallWideningDecision getCallWideningDecision(CallInst *CI, |
1320 | ElementCount VF) const { |
1321 | assert(!VF.isScalar() && "Expected vector VF" ); |
1322 | return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)); |
1323 | } |
1324 | |
1325 | /// Return True if instruction \p I is an optimizable truncate whose operand |
1326 | /// is an induction variable. Such a truncate will be removed by adding a new |
1327 | /// induction variable with the destination type. |
1328 | bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
1329 | // If the instruction is not a truncate, return false. |
1330 | auto *Trunc = dyn_cast<TruncInst>(Val: I); |
1331 | if (!Trunc) |
1332 | return false; |
1333 | |
1334 | // Get the source and destination types of the truncate. |
1335 | Type *SrcTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getSrcTy(), EC: VF); |
1336 | Type *DestTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getDestTy(), EC: VF); |
1337 | |
1338 | // If the truncate is free for the given types, return false. Replacing a |
1339 | // free truncate with an induction variable would add an induction variable |
1340 | // update instruction to each iteration of the loop. We exclude from this |
1341 | // check the primary induction variable since it will need an update |
1342 | // instruction regardless. |
1343 | Value *Op = Trunc->getOperand(i_nocapture: 0); |
1344 | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy)) |
1345 | return false; |
1346 | |
1347 | // If the truncated value is not an induction variable, return false. |
1348 | return Legal->isInductionPhi(V: Op); |
1349 | } |
1350 | |
1351 | /// Collects the instructions to scalarize for each predicated instruction in |
1352 | /// the loop. |
1353 | void collectInstsToScalarize(ElementCount VF); |
1354 | |
1355 | /// Collect Uniform and Scalar values for the given \p VF. |
1356 | /// The sets depend on CM decision for Load/Store instructions |
1357 | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1358 | /// Also make a decision on what to do about call instructions in the loop |
1359 | /// at that VF -- scalarize, call a known vector routine, or call a |
1360 | /// vector intrinsic. |
1361 | void collectUniformsAndScalars(ElementCount VF) { |
1362 | // Do the analysis once. |
1363 | if (VF.isScalar() || Uniforms.contains(Val: VF)) |
1364 | return; |
1365 | setCostBasedWideningDecision(VF); |
1366 | setVectorizedCallDecision(VF); |
1367 | collectLoopUniforms(VF); |
1368 | collectLoopScalars(VF); |
1369 | } |
1370 | |
1371 | /// Returns true if the target machine supports masked store operation |
1372 | /// for the given \p DataType and kind of access to \p Ptr. |
1373 | bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
1374 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1375 | TTI.isLegalMaskedStore(DataType, Alignment); |
1376 | } |
1377 | |
1378 | /// Returns true if the target machine supports masked load operation |
1379 | /// for the given \p DataType and kind of access to \p Ptr. |
1380 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
1381 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1382 | TTI.isLegalMaskedLoad(DataType, Alignment); |
1383 | } |
1384 | |
1385 | /// Returns true if the target machine can represent \p V as a masked gather |
1386 | /// or scatter operation. |
1387 | bool isLegalGatherOrScatter(Value *V, ElementCount VF) { |
1388 | bool LI = isa<LoadInst>(Val: V); |
1389 | bool SI = isa<StoreInst>(Val: V); |
1390 | if (!LI && !SI) |
1391 | return false; |
1392 | auto *Ty = getLoadStoreType(I: V); |
1393 | Align Align = getLoadStoreAlignment(I: V); |
1394 | if (VF.isVector()) |
1395 | Ty = VectorType::get(ElementType: Ty, EC: VF); |
1396 | return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) || |
1397 | (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align)); |
1398 | } |
1399 | |
1400 | /// Returns true if the target machine supports all of the reduction |
1401 | /// variables found for the given VF. |
1402 | bool canVectorizeReductions(ElementCount VF) const { |
1403 | return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
1404 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
1405 | return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
1406 | })); |
1407 | } |
1408 | |
1409 | /// Given costs for both strategies, return true if the scalar predication |
1410 | /// lowering should be used for div/rem. This incorporates an override |
1411 | /// option so it is not simply a cost comparison. |
1412 | bool isDivRemScalarWithPredication(InstructionCost ScalarCost, |
1413 | InstructionCost SafeDivisorCost) const { |
1414 | switch (ForceSafeDivisor) { |
1415 | case cl::BOU_UNSET: |
1416 | return ScalarCost < SafeDivisorCost; |
1417 | case cl::BOU_TRUE: |
1418 | return false; |
1419 | case cl::BOU_FALSE: |
1420 | return true; |
1421 | }; |
1422 | llvm_unreachable("impossible case value" ); |
1423 | } |
1424 | |
1425 | /// Returns true if \p I is an instruction which requires predication and |
1426 | /// for which our chosen predication strategy is scalarization (i.e. we |
1427 | /// don't have an alternate strategy such as masking available). |
1428 | /// \p VF is the vectorization factor that will be used to vectorize \p I. |
1429 | bool isScalarWithPredication(Instruction *I, ElementCount VF) const; |
1430 | |
1431 | /// Returns true if \p I is an instruction that needs to be predicated |
1432 | /// at runtime. The result is independent of the predication mechanism. |
1433 | /// Superset of instructions that return true for isScalarWithPredication. |
1434 | bool isPredicatedInst(Instruction *I) const; |
1435 | |
1436 | /// Return the costs for our two available strategies for lowering a |
1437 | /// div/rem operation which requires speculating at least one lane. |
1438 | /// First result is for scalarization (will be invalid for scalable |
1439 | /// vectors); second is for the safe-divisor strategy. |
1440 | std::pair<InstructionCost, InstructionCost> |
1441 | getDivRemSpeculationCost(Instruction *I, |
1442 | ElementCount VF) const; |
1443 | |
1444 | /// Returns true if \p I is a memory instruction with consecutive memory |
1445 | /// access that can be widened. |
1446 | bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); |
1447 | |
1448 | /// Returns true if \p I is a memory instruction in an interleaved-group |
1449 | /// of memory accesses that can be vectorized with wide vector loads/stores |
1450 | /// and shuffles. |
1451 | bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); |
1452 | |
/// Check if \p Instr belongs to any interleaved access group tracked by
/// InterleaveInfo, i.e. whether it would be emitted as part of a wide
/// load/store plus shuffles rather than as a standalone access.
bool isAccessInterleaved(Instruction *Instr) {
  return InterleaveInfo.isInterleaved(Instr);
}
1457 | |
/// Get the interleaved access group that \p Instr belongs to. Forwards
/// the result of InterleaveInfo's lookup unchanged (presumably null when
/// \p Instr is in no group — see InterleavedAccessInfo).
const InterleaveGroup<Instruction> *
getInterleavedAccessGroup(Instruction *Instr) {
  return InterleaveInfo.getInterleaveGroup(Instr);
}
1463 | |
1464 | /// Returns true if we're required to use a scalar epilogue for at least |
1465 | /// the final iteration of the original loop. |
1466 | bool requiresScalarEpilogue(bool IsVectorizing) const { |
1467 | if (!isScalarEpilogueAllowed()) |
1468 | return false; |
1469 | // If we might exit from anywhere but the latch, must run the exiting |
1470 | // iteration in scalar form. |
1471 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) |
1472 | return true; |
1473 | return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); |
1474 | } |
1475 | |
1476 | /// Returns true if we're required to use a scalar epilogue for at least |
1477 | /// the final iteration of the original loop for all VFs in \p Range. |
1478 | /// A scalar epilogue must either be required for all VFs in \p Range or for |
1479 | /// none. |
1480 | bool requiresScalarEpilogue(VFRange Range) const { |
1481 | auto RequiresScalarEpilogue = [this](ElementCount VF) { |
1482 | return requiresScalarEpilogue(IsVectorizing: VF.isVector()); |
1483 | }; |
1484 | bool IsRequired = all_of(Range, P: RequiresScalarEpilogue); |
1485 | assert( |
1486 | (IsRequired || none_of(Range, RequiresScalarEpilogue)) && |
1487 | "all VFs in range must agree on whether a scalar epilogue is required" ); |
1488 | return IsRequired; |
1489 | } |
1490 | |
/// Returns true if a scalar epilogue is allowed, i.e. it has not been
/// disallowed due to optsize or a loop hint annotation. (The previous
/// comment stated the inverse of what the code returns.)
bool isScalarEpilogueAllowed() const {
  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}
1496 | |
1497 | /// Returns the TailFoldingStyle that is best for the current loop. |
1498 | TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { |
1499 | if (!ChosenTailFoldingStyle) |
1500 | return TailFoldingStyle::None; |
1501 | return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first |
1502 | : ChosenTailFoldingStyle->second; |
1503 | } |
1504 | |
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
  assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet." );
  if (!Legal->prepareToFoldTailByMasking()) {
    // Tail folding is not legal for this loop: record None for both the
    // may-overflow and no-overflow cases.
    ChosenTailFoldingStyle =
        std::make_pair(x: TailFoldingStyle::None, y: TailFoldingStyle::None);
    return;
  }

  if (!ForceTailFoldingStyle.getNumOccurrences()) {
    // No style forced on the command line: take the target's preference for
    // each overflow scenario.
    ChosenTailFoldingStyle = std::make_pair(
        x: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        y: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
    return;
  }

  // Set styles when forced.
  ChosenTailFoldingStyle = std::make_pair(x&: ForceTailFoldingStyle.getValue(),
                                          y&: ForceTailFoldingStyle.getValue());
  if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
    return;
  // Override forced styles if needed.
  // FIXME: use actual opcode/data type for analysis here.
  // FIXME: Investigate opportunity for fixed vector factor.
  bool EVLIsLegal =
      IsScalableVF && UserIC <= 1 &&
      TTI.hasActiveVectorLength(Opcode: 0, DataType: nullptr, Alignment: Align()) &&
      !EnableVPlanNativePath &&
      // FIXME: implement support for max safe dependency distance.
      Legal->isSafeForAnyVectorWidth() &&
      // FIXME: remove this once reductions are supported.
      Legal->getReductionVars().empty();
  if (!EVLIsLegal) {
    // If for some reason EVL mode is unsupported, fallback to
    // DataWithoutLaneMask to try to vectorize the loop with folded tail
    // in a generic way.
    ChosenTailFoldingStyle =
        std::make_pair(x: TailFoldingStyle::DataWithoutLaneMask,
                       y: TailFoldingStyle::DataWithoutLaneMask);
    LLVM_DEBUG(
        dbgs()
        << "LV: Preference for VP intrinsics indicated. Will "
           "not try to generate VP Intrinsics "
        << (UserIC > 1
                ? "since interleave count specified is greater than 1.\n"
                : "due to non-interleaving reasons.\n" ));
  }
}
1556 | |
/// Returns true if all loop blocks should be masked to fold tail loop,
/// i.e. any tail-folding style other than None was chosen.
bool foldTailByMasking() const {
  // TODO: check if it is possible to check for None style independent of
  // IVUpdateMayOverflow flag in getTailFoldingStyle.
  return getTailFoldingStyle() != TailFoldingStyle::None;
}
1563 | |
1564 | /// Returns true if the instructions in this block requires predication |
1565 | /// for any reason, e.g. because tail folding now requires a predicate |
1566 | /// or because the block in the original loop was predicated. |
1567 | bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
1568 | return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
1569 | } |
1570 | |
1571 | /// Returns true if VP intrinsics with explicit vector length support should |
1572 | /// be generated in the tail folded loop. |
1573 | bool foldTailWithEVL() const { |
1574 | return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
1575 | // FIXME: remove this once vp_reverse is supported. |
1576 | none_of( |
1577 | Range: WideningDecisions, |
1578 | P: [](const std::pair<std::pair<Instruction *, ElementCount>, |
1579 | std::pair<InstWidening, InstructionCost>> |
1580 | &Data) { return Data.second.first == CM_Widen_Reverse; }); |
1581 | } |
1582 | |
/// Returns true if the Phi is part of an inloop reduction, i.e. it was
/// recorded in the InLoopReductions set.
bool isInLoopReduction(PHINode *Phi) const {
  return InLoopReductions.contains(Ptr: Phi);
}
1587 | |
1588 | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1589 | /// with factor VF. Return the cost of the instruction, including |
1590 | /// scalarization overhead if it's needed. |
1591 | InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
1592 | |
1593 | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1594 | /// VF. Return the cost of the instruction, including scalarization overhead |
1595 | /// if it's needed. |
1596 | InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; |
1597 | |
/// Invalidates decisions already taken by the cost model: clears the cached
/// widening and call-widening decisions and the per-VF uniform/scalar sets,
/// so they will be recomputed on the next query.
void invalidateCostModelingDecisions() {
  WideningDecisions.clear();
  CallWideningDecisions.clear();
  Uniforms.clear();
  Scalars.clear();
}
1605 | |
1606 | /// The vectorization cost is a combination of the cost itself and a boolean |
1607 | /// indicating whether any of the contributing operations will actually |
1608 | /// operate on vector values after type legalization in the backend. If this |
1609 | /// latter value is false, then all operations will be scalarized (i.e. no |
1610 | /// vectorization has actually taken place). |
1611 | using VectorizationCostTy = std::pair<InstructionCost, bool>; |
1612 | |
1613 | /// Returns the expected execution cost. The unit of the cost does |
1614 | /// not matter because we use the 'cost' units to compare different |
1615 | /// vector widths. The cost that is returned is *not* normalized by |
1616 | /// the factor width. If \p Invalid is not nullptr, this function |
1617 | /// will add a pair(Instruction*, ElementCount) to \p Invalid for |
1618 | /// each instruction that has an Invalid cost for the given VF. |
1619 | VectorizationCostTy |
1620 | expectedCost(ElementCount VF, |
1621 | SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); |
1622 | |
/// Returns true if at least one store was counted as predicated
/// (NumPredStores is maintained elsewhere in the cost model).
bool hasPredStores() const { return NumPredStores > 0; }
1624 | |
1625 | /// Returns true if epilogue vectorization is considered profitable, and |
1626 | /// false otherwise. |
1627 | /// \p VF is the vectorization factor chosen for the original loop. |
1628 | bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
1629 | |
1630 | private: |
1631 | unsigned NumPredStores = 0; |
1632 | |
1633 | /// \return An upper bound for the vectorization factors for both |
1634 | /// fixed and scalable vectorization, where the minimum-known number of |
1635 | /// elements is a power-of-2 larger than zero. If scalable vectorization is |
1636 | /// disabled or unsupported, then the scalable part will be equal to |
1637 | /// ElementCount::getScalable(0). |
1638 | FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, |
1639 | ElementCount UserVF, |
1640 | bool FoldTailByMasking); |
1641 | |
1642 | /// \return the maximized element count based on the targets vector |
1643 | /// registers and the loop trip-count, but limited to a maximum safe VF. |
1644 | /// This is a helper function of computeFeasibleMaxVF. |
1645 | ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, |
1646 | unsigned SmallestType, |
1647 | unsigned WidestType, |
1648 | ElementCount MaxSafeVF, |
1649 | bool FoldTailByMasking); |
1650 | |
1651 | /// \return the maximum legal scalable VF, based on the safe max number |
1652 | /// of elements. |
1653 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
1654 | |
1655 | /// Returns the execution time cost of an instruction for a given vector |
1656 | /// width. Vector width of one means scalar. |
1657 | VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
1658 | |
1659 | /// The cost-computation logic from getInstructionCost which provides |
1660 | /// the vector type as an output parameter. |
1661 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF, |
1662 | Type *&VectorTy); |
1663 | |
1664 | /// Return the cost of instructions in an inloop reduction pattern, if I is |
1665 | /// part of that pattern. |
1666 | std::optional<InstructionCost> |
1667 | getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, |
1668 | TTI::TargetCostKind CostKind) const; |
1669 | |
1670 | /// Calculate vectorization cost of memory instruction \p I. |
1671 | InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
1672 | |
1673 | /// The cost computation for scalarized memory instruction. |
1674 | InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
1675 | |
1676 | /// The cost computation for interleaving group of memory instructions. |
1677 | InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
1678 | |
1679 | /// The cost computation for Gather/Scatter instruction. |
1680 | InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
1681 | |
1682 | /// The cost computation for widening instruction \p I with consecutive |
1683 | /// memory access. |
1684 | InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
1685 | |
1686 | /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
1687 | /// Load: scalar load + broadcast. |
1688 | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1689 | /// element) |
1690 | InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
1691 | |
1692 | /// Estimate the overhead of scalarizing an instruction. This is a |
1693 | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1694 | InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, |
1695 | TTI::TargetCostKind CostKind) const; |
1696 | |
1697 | /// Returns true if an artificially high cost for emulated masked memrefs |
1698 | /// should be used. |
1699 | bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); |
1700 | |
1701 | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1702 | /// represented as. The vector equivalents of these values should be truncated |
1703 | /// to this type. |
1704 | MapVector<Instruction *, uint64_t> MinBWs; |
1705 | |
1706 | /// A type representing the costs for instructions if they were to be |
1707 | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1708 | /// pairs. |
1709 | using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
1710 | |
1711 | /// A set containing all BasicBlocks that are known to present after |
1712 | /// vectorization as a predicated block. |
1713 | DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> |
1714 | PredicatedBBsAfterVectorization; |
1715 | |
1716 | /// Records whether it is allowed to have the original scalar loop execute at |
1717 | /// least once. This may be needed as a fallback loop in case runtime |
1718 | /// aliasing/dependence checks fail, or to handle the tail/remainder |
1719 | /// iterations when the trip count is unknown or doesn't divide by the VF, |
1720 | /// or as a peel-loop to handle gaps in interleave-groups. |
1721 | /// Under optsize and when the trip count is very small we don't allow any |
1722 | /// iterations to execute in the scalar loop. |
1723 | ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
1724 | |
1725 | /// Control finally chosen tail folding style. The first element is used if |
1726 | /// the IV update may overflow, the second element - if it does not. |
1727 | std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> |
1728 | ChosenTailFoldingStyle; |
1729 | |
1730 | /// A map holding scalar costs for different vectorization factors. The |
1731 | /// presence of a cost for an instruction in the mapping indicates that the |
1732 | /// instruction will be scalarized when vectorizing with the associated |
1733 | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1734 | DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
1735 | |
1736 | /// Holds the instructions known to be uniform after vectorization. |
1737 | /// The data is collected per VF. |
1738 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
1739 | |
1740 | /// Holds the instructions known to be scalar after vectorization. |
1741 | /// The data is collected per VF. |
1742 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
1743 | |
1744 | /// Holds the instructions (address computations) that are forced to be |
1745 | /// scalarized. |
1746 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1747 | |
1748 | /// PHINodes of the reductions that should be expanded in-loop. |
1749 | SmallPtrSet<PHINode *, 4> InLoopReductions; |
1750 | |
1751 | /// A Map of inloop reduction operations and their immediate chain operand. |
1752 | /// FIXME: This can be removed once reductions can be costed correctly in |
1753 | /// VPlan. This was added to allow quick lookup of the inloop operations. |
1754 | DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
1755 | |
1756 | /// Returns the expected difference in cost from scalarizing the expression |
1757 | /// feeding a predicated instruction \p PredInst. The instructions to |
1758 | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1759 | /// non-negative return value implies the expression will be scalarized. |
1760 | /// Currently, only single-use chains are considered for scalarization. |
1761 | InstructionCost computePredInstDiscount(Instruction *PredInst, |
1762 | ScalarCostsTy &ScalarCosts, |
1763 | ElementCount VF); |
1764 | |
1765 | /// Collect the instructions that are uniform after vectorization. An |
1766 | /// instruction is uniform if we represent it with a single scalar value in |
1767 | /// the vectorized loop corresponding to each vector iteration. Examples of |
1768 | /// uniform instructions include pointer operands of consecutive or |
1769 | /// interleaved memory accesses. Note that although uniformity implies an |
1770 | /// instruction will be scalar, the reverse is not true. In general, a |
1771 | /// scalarized instruction will be represented by VF scalar values in the |
1772 | /// vectorized loop, each corresponding to an iteration of the original |
1773 | /// scalar loop. |
1774 | void collectLoopUniforms(ElementCount VF); |
1775 | |
1776 | /// Collect the instructions that are scalar after vectorization. An |
1777 | /// instruction is scalar if it is known to be uniform or will be scalarized |
1778 | /// during vectorization. collectLoopScalars should only add non-uniform nodes |
1779 | /// to the list if they are used by a load/store instruction that is marked as |
1780 | /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
1781 | /// VF values in the vectorized loop, each corresponding to an iteration of |
1782 | /// the original scalar loop. |
1783 | void collectLoopScalars(ElementCount VF); |
1784 | |
1785 | /// Keeps cost model vectorization decision and cost for instructions. |
1786 | /// Right now it is used for memory instructions only. |
1787 | using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
1788 | std::pair<InstWidening, InstructionCost>>; |
1789 | |
1790 | DecisionList WideningDecisions; |
1791 | |
1792 | using CallDecisionList = |
1793 | DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; |
1794 | |
1795 | CallDecisionList CallWideningDecisions; |
1796 | |
1797 | /// Returns true if \p V is expected to be vectorized and it needs to be |
1798 | /// extracted. |
1799 | bool (Value *V, ElementCount VF) const { |
1800 | Instruction *I = dyn_cast<Instruction>(Val: V); |
1801 | if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) || |
1802 | TheLoop->isLoopInvariant(V: I)) |
1803 | return false; |
1804 | |
1805 | // Assume we can vectorize V (and hence we need extraction) if the |
1806 | // scalars are not computed yet. This can happen, because it is called |
1807 | // via getScalarizationOverhead from setCostBasedWideningDecision, before |
1808 | // the scalars are collected. That should be a safe assumption in most |
1809 | // cases, because we check if the operands have vectorizable types |
1810 | // beforehand in LoopVectorizationLegality. |
1811 | return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF); |
1812 | }; |
1813 | |
1814 | /// Returns a range containing only operands needing to be extracted. |
1815 | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1816 | ElementCount VF) const { |
1817 | return SmallVector<Value *, 4>(make_filter_range( |
1818 | Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1819 | } |
1820 | |
1821 | public: |
1822 | /// The loop that we evaluate. |
1823 | Loop *TheLoop; |
1824 | |
1825 | /// Predicated scalar evolution analysis. |
1826 | PredicatedScalarEvolution &PSE; |
1827 | |
1828 | /// Loop Info analysis. |
1829 | LoopInfo *LI; |
1830 | |
1831 | /// Vectorization legality. |
1832 | LoopVectorizationLegality *Legal; |
1833 | |
1834 | /// Vector target information. |
1835 | const TargetTransformInfo &TTI; |
1836 | |
1837 | /// Target Library Info. |
1838 | const TargetLibraryInfo *TLI; |
1839 | |
1840 | /// Demanded bits analysis. |
1841 | DemandedBits *DB; |
1842 | |
1843 | /// Assumption cache. |
1844 | AssumptionCache *AC; |
1845 | |
1846 | /// Interface to emit optimization remarks. |
1847 | OptimizationRemarkEmitter *ORE; |
1848 | |
1849 | const Function *TheFunction; |
1850 | |
1851 | /// Loop Vectorize Hint. |
1852 | const LoopVectorizeHints *Hints; |
1853 | |
1854 | /// The interleave access information contains groups of interleaved accesses |
1855 | /// with the same stride and close to each other. |
1856 | InterleavedAccessInfo &InterleaveInfo; |
1857 | |
1858 | /// Values to ignore in the cost model. |
1859 | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1860 | |
1861 | /// Values to ignore in the cost model when VF > 1. |
1862 | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1863 | |
1864 | /// All element types found in the loop. |
1865 | SmallPtrSet<Type *, 16> ElementTypesInLoop; |
1866 | }; |
1867 | } // end namespace llvm |
1868 | |
1869 | namespace { |
1870 | /// Helper struct to manage generating runtime checks for vectorization. |
1871 | /// |
1872 | /// The runtime checks are created up-front in temporary blocks to allow better |
1873 | /// estimating the cost and un-linked from the existing IR. After deciding to |
1874 | /// vectorize, the checks are moved back. If deciding not to vectorize, the |
1875 | /// temporary blocks are completely removed. |
1876 | class GeneratedRTChecks { |
1877 | /// Basic block which contains the generated SCEV checks, if any. |
1878 | BasicBlock *SCEVCheckBlock = nullptr; |
1879 | |
1880 | /// The value representing the result of the generated SCEV checks. If it is |
1881 | /// nullptr, either no SCEV checks have been generated or they have been used. |
1882 | Value *SCEVCheckCond = nullptr; |
1883 | |
1884 | /// Basic block which contains the generated memory runtime checks, if any. |
1885 | BasicBlock *MemCheckBlock = nullptr; |
1886 | |
1887 | /// The value representing the result of the generated memory runtime checks. |
1888 | /// If it is nullptr, either no memory runtime checks have been generated or |
1889 | /// they have been used. |
1890 | Value *MemRuntimeCheckCond = nullptr; |
1891 | |
1892 | DominatorTree *DT; |
1893 | LoopInfo *LI; |
1894 | TargetTransformInfo *TTI; |
1895 | |
1896 | SCEVExpander SCEVExp; |
1897 | SCEVExpander MemCheckExp; |
1898 | |
1899 | bool CostTooHigh = false; |
1900 | const bool AddBranchWeights; |
1901 | |
1902 | Loop *OuterLoop = nullptr; |
1903 | |
1904 | public: |
/// Construct with the analyses needed to build and later cost the runtime
/// checks. Two separate SCEV expanders are kept: one for SCEV-predicate
/// checks and one for memory checks. NOTE(review): both use the
/// "scev.check" name prefix — presumably intentional; confirm.
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                  TargetTransformInfo *TTI, const DataLayout &DL,
                  bool AddBranchWeights)
    : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check" ),
      MemCheckExp(SE, DL, "scev.check" ), AddBranchWeights(AddBranchWeights) {}
1910 | |
1911 | /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
1912 | /// accurately estimate the cost of the runtime checks. The blocks are |
1913 | /// un-linked from the IR and is added back during vector code generation. If |
1914 | /// there is no vector code generation, the check blocks are removed |
1915 | /// completely. |
1916 | void Create(Loop *L, const LoopAccessInfo &LAI, |
1917 | const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { |
1918 | |
1919 | // Hard cutoff to limit compile-time increase in case a very large number of |
1920 | // runtime checks needs to be generated. |
1921 | // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to |
1922 | // profile info. |
1923 | CostTooHigh = |
1924 | LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; |
1925 | if (CostTooHigh) |
1926 | return; |
1927 | |
1928 | BasicBlock * = L->getHeader(); |
1929 | BasicBlock * = L->getLoopPreheader(); |
1930 | |
1931 | // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
1932 | // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
1933 | // may be used by SCEVExpander. The blocks will be un-linked from their |
1934 | // predecessors and removed from LI & DT at the end of the function. |
1935 | if (!UnionPred.isAlwaysTrue()) { |
1936 | SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI, |
1937 | MSSAU: nullptr, BBName: "vector.scevcheck" ); |
1938 | |
1939 | SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
1940 | Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator()); |
1941 | } |
1942 | |
1943 | const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
1944 | if (RtPtrChecking.Need) { |
1945 | auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
1946 | MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr, |
1947 | BBName: "vector.memcheck" ); |
1948 | |
1949 | auto DiffChecks = RtPtrChecking.getDiffChecks(); |
1950 | if (DiffChecks) { |
1951 | Value *RuntimeVF = nullptr; |
1952 | MemRuntimeCheckCond = addDiffRuntimeChecks( |
1953 | Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp, |
1954 | GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { |
1955 | if (!RuntimeVF) |
1956 | RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF); |
1957 | return RuntimeVF; |
1958 | }, |
1959 | IC); |
1960 | } else { |
1961 | MemRuntimeCheckCond = addRuntimeChecks( |
1962 | Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(), |
1963 | Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks); |
1964 | } |
1965 | assert(MemRuntimeCheckCond && |
1966 | "no RT checks generated although RtPtrChecking " |
1967 | "claimed checks are required" ); |
1968 | } |
1969 | |
1970 | if (!MemCheckBlock && !SCEVCheckBlock) |
1971 | return; |
1972 | |
1973 | // Unhook the temporary block with the checks, update various places |
1974 | // accordingly. |
1975 | if (SCEVCheckBlock) |
1976 | SCEVCheckBlock->replaceAllUsesWith(V: Preheader); |
1977 | if (MemCheckBlock) |
1978 | MemCheckBlock->replaceAllUsesWith(V: Preheader); |
1979 | |
1980 | if (SCEVCheckBlock) { |
1981 | SCEVCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1982 | new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
1983 | Preheader->getTerminator()->eraseFromParent(); |
1984 | } |
1985 | if (MemCheckBlock) { |
1986 | MemCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1987 | new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
1988 | Preheader->getTerminator()->eraseFromParent(); |
1989 | } |
1990 | |
1991 | DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader); |
1992 | if (MemCheckBlock) { |
1993 | DT->eraseNode(BB: MemCheckBlock); |
1994 | LI->removeBlock(BB: MemCheckBlock); |
1995 | } |
1996 | if (SCEVCheckBlock) { |
1997 | DT->eraseNode(BB: SCEVCheckBlock); |
1998 | LI->removeBlock(BB: SCEVCheckBlock); |
1999 | } |
2000 | |
2001 | // Outer loop is used as part of the later cost calculations. |
2002 | OuterLoop = L->getParentLoop(); |
2003 | } |
2004 | |
/// Compute the total cost of the generated runtime checks (SCEV predicate
/// checks plus memory overlap checks). Returns an invalid cost when the
/// number of checks exceeded the configured threshold, signalling that
/// vectorization with runtime checks should be abandoned.
InstructionCost getCost() {
  if (SCEVCheckBlock || MemCheckBlock)
    LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n" );

  // CostTooHigh was set at check-creation time; an invalid cost tells the
  // caller not to vectorize with these checks at all.
  if (CostTooHigh) {
    InstructionCost Cost;
    Cost.setInvalid();
    LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n" );
    return Cost;
  }

  InstructionCost RTCheckCost = 0;
  // Sum the cost of every instruction in the SCEV check block, except the
  // terminator (it is replaced when the block is wired into the CFG).
  if (SCEVCheckBlock)
    for (Instruction &I : *SCEVCheckBlock) {
      if (SCEVCheckBlock->getTerminator() == &I)
        continue;
      InstructionCost C =
          TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
      LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" );
      RTCheckCost += C;
    }
  if (MemCheckBlock) {
    // Accumulate memory-check cost separately so it can be scaled down below
    // when the checks are expected to be hoisted out of an outer loop.
    InstructionCost MemCheckCost = 0;
    for (Instruction &I : *MemCheckBlock) {
      if (MemCheckBlock->getTerminator() == &I)
        continue;
      InstructionCost C =
          TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
      LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" );
      MemCheckCost += C;
    }

    // If the runtime memory checks are being created inside an outer loop
    // we should find out if these checks are outer loop invariant. If so,
    // the checks will likely be hoisted out and so the effective cost will
    // reduce according to the outer loop trip count.
    if (OuterLoop) {
      ScalarEvolution *SE = MemCheckExp.getSE();
      // TODO: If profitable, we could refine this further by analysing every
      // individual memory check, since there could be a mixture of loop
      // variant and invariant checks that mean the final condition is
      // variant.
      const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
      if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
        // It seems reasonable to assume that we can reduce the effective
        // cost of the checks even when we know nothing about the trip
        // count. Assume that the outer loop executes at least twice.
        unsigned BestTripCount = 2;

        // If exact trip count is known use that.
        if (unsigned SmallTC = SE->getSmallConstantTripCount(L: OuterLoop))
          BestTripCount = SmallTC;
        else if (LoopVectorizeWithBlockFrequency) {
          // Else use profile data if available.
          if (auto EstimatedTC = getLoopEstimatedTripCount(L: OuterLoop))
            BestTripCount = *EstimatedTC;
        }

        // Guard against a zero estimated trip count from profile data.
        BestTripCount = std::max(a: BestTripCount, b: 1U);
        InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

        // Let's ensure the cost is always at least 1.
        NewMemCheckCost = std::max(a: *NewMemCheckCost.getValue(),
                                   b: (InstructionCost::CostType)1);

        if (BestTripCount > 1)
          LLVM_DEBUG(dbgs()
                     << "We expect runtime memory checks to be hoisted "
                     << "out of the outer loop. Cost reduced from "
                     << MemCheckCost << " to " << NewMemCheckCost << '\n');

        MemCheckCost = NewMemCheckCost;
      }
    }

    RTCheckCost += MemCheckCost;
  }

  if (SCEVCheckBlock || MemCheckBlock)
    LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                      << "\n" );

  return RTCheckCost;
}
2089 | |
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
  SCEVExpanderCleaner SCEVCleaner(SCEVExp);
  SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
  // The emit* members null out the condition once a check has been wired into
  // the CFG, so a null condition here means the expanded values are live and
  // must not be removed by the cleaners.
  if (!SCEVCheckCond)
    SCEVCleaner.markResultUsed();

  if (!MemRuntimeCheckCond)
    MemCheckCleaner.markResultUsed();

  if (MemRuntimeCheckCond) {
    auto &SE = *MemCheckExp.getSE();
    // Memory runtime check generation creates compares that use expanded
    // values. Remove them before running the SCEVExpanderCleaners.
    // Iterate in reverse so users are erased before their operands.
    for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
      if (MemCheckExp.isInsertedInstruction(I: &I))
        continue;
      SE.forgetValue(V: &I);
      I.eraseFromParent();
    }
  }
  MemCheckCleaner.cleanup();
  SCEVCleaner.cleanup();

  // A non-null condition means the corresponding check was never emitted;
  // its temporary block is unused and can be deleted outright.
  if (SCEVCheckCond)
    SCEVCheckBlock->eraseFromParent();
  if (MemRuntimeCheckCond)
    MemCheckBlock->eraseFromParent();
}
2120 | |
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition. Returns the inserted check block,
/// or nullptr if no check is needed.
BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                           BasicBlock *,
                           BasicBlock *LoopExitBlock) {
  if (!SCEVCheckCond)
    return nullptr;

  Value *Cond = SCEVCheckCond;
  // Mark the check as used, to prevent it from being removed during cleanup.
  SCEVCheckCond = nullptr;
  // A constant-false condition means the predicate checks trivially pass, so
  // no bypass branch (and no check block) is needed.
  if (auto *C = dyn_cast<ConstantInt>(Val: Cond))
    if (C->isZero())
      return nullptr;

  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

  BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertAtEnd: SCEVCheckBlock);
  // Create new preheader for vector loop.
  if (OuterLoop)
    OuterLoop->addBasicBlockToLoop(NewBB: SCEVCheckBlock, LI&: *LI);

  // Re-wire the CFG: Pred -> SCEVCheckBlock -> LoopVectorPreHeader.
  SCEVCheckBlock->getTerminator()->eraseFromParent();
  SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
  Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
                                              NewBB: SCEVCheckBlock);

  // Keep the dominator tree in sync with the new edge.
  DT->addNewBlock(BB: SCEVCheckBlock, DomBB: Pred);
  DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: SCEVCheckBlock);

  // Branch to the scalar loop (Bypass) when the checks fail.
  BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond);
  if (AddBranchWeights)
    setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights);
  ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI);
  return SCEVCheckBlock;
}
2158 | |
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
/// the branches to branch to the vector preheader or \p Bypass, depending on
/// the generated condition. Returns the inserted check block, or nullptr if
/// no memory checks are needed.
BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                 BasicBlock *) {
  // Check if we generated code that checks in runtime if arrays overlap.
  if (!MemRuntimeCheckCond)
    return nullptr;

  // Re-wire the CFG: Pred -> MemCheckBlock -> LoopVectorPreHeader.
  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
  Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
                                              NewBB: MemCheckBlock);

  // Keep the dominator tree in sync with the new edge.
  DT->addNewBlock(BB: MemCheckBlock, DomBB: Pred);
  DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: MemCheckBlock);
  MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);

  if (OuterLoop)
    OuterLoop->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI);

  // Branch to the scalar loop (Bypass) when the arrays may overlap.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond);
  if (AddBranchWeights) {
    setBranchWeights(I&: BI, Weights: MemCheckBypassWeights);
  }
  ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI);
  MemCheckBlock->getTerminator()->setDebugLoc(
      Pred->getTerminator()->getDebugLoc());

  // Mark the check as used, to prevent it from being removed during cleanup.
  MemRuntimeCheckCond = nullptr;
  return MemCheckBlock;
}
2192 | }; |
2193 | } // namespace |
2194 | |
2195 | static bool useActiveLaneMask(TailFoldingStyle Style) { |
2196 | return Style == TailFoldingStyle::Data || |
2197 | Style == TailFoldingStyle::DataAndControlFlow || |
2198 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2199 | } |
2200 | |
2201 | static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { |
2202 | return Style == TailFoldingStyle::DataAndControlFlow || |
2203 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2204 | } |
2205 | |
2206 | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
2207 | // vectorization. The loop needs to be annotated with #pragma omp simd |
2208 | // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the |
2209 | // vector length information is not provided, vectorization is not considered |
2210 | // explicit. Interleave hints are not allowed either. These limitations will be |
2211 | // relaxed in the future. |
2212 | // Please, note that we are currently forced to abuse the pragma 'clang |
2213 | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
2214 | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
2215 | // provides *explicit vectorization hints* (LV can bypass legal checks and |
2216 | // assume that vectorization is legal). However, both hints are implemented |
2217 | // using the same metadata (llvm.loop.vectorize, processed by |
2218 | // LoopVectorizeHints). This will be fixed in the future when the native IR |
2219 | // representation for pragma 'omp simd' is introduced. |
2220 | static bool (Loop *OuterLp, |
2221 | OptimizationRemarkEmitter *ORE) { |
2222 | assert(!OuterLp->isInnermost() && "This is not an outer loop" ); |
2223 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
2224 | |
2225 | // Only outer loops with an explicit vectorization hint are supported. |
2226 | // Unannotated outer loops are ignored. |
2227 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
2228 | return false; |
2229 | |
2230 | Function *Fn = OuterLp->getHeader()->getParent(); |
2231 | if (!Hints.allowVectorization(F: Fn, L: OuterLp, |
2232 | VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) { |
2233 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n" ); |
2234 | return false; |
2235 | } |
2236 | |
2237 | if (Hints.getInterleave() > 1) { |
2238 | // TODO: Interleave support is future work. |
2239 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
2240 | "outer loops.\n" ); |
2241 | Hints.emitRemarkWithHints(); |
2242 | return false; |
2243 | } |
2244 | |
2245 | return true; |
2246 | } |
2247 | |
2248 | static void (Loop &L, LoopInfo *LI, |
2249 | OptimizationRemarkEmitter *ORE, |
2250 | SmallVectorImpl<Loop *> &V) { |
2251 | // Collect inner loops and outer loops without irreducible control flow. For |
2252 | // now, only collect outer loops that have explicit vectorization hints. If we |
2253 | // are stress testing the VPlan H-CFG construction, we collect the outermost |
2254 | // loop of every loop nest. |
2255 | if (L.isInnermost() || VPlanBuildStressTest || |
2256 | (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) { |
2257 | LoopBlocksRPO RPOT(&L); |
2258 | RPOT.perform(LI); |
2259 | if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) { |
2260 | V.push_back(Elt: &L); |
2261 | // TODO: Collect inner loops inside marked outer loops in case |
2262 | // vectorization fails for the outer loop. Do not invoke |
2263 | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
2264 | // already known to be reducible. We can use an inherited attribute for |
2265 | // that. |
2266 | return; |
2267 | } |
2268 | } |
2269 | for (Loop *InnerL : L) |
2270 | collectSupportedLoops(L&: *InnerL, LI, ORE, V); |
2271 | } |
2272 | |
2273 | //===----------------------------------------------------------------------===// |
2274 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
2275 | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
2276 | //===----------------------------------------------------------------------===// |
2277 | |
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// \param B            builder positioned where the new IR should be emitted.
/// \param Index        the iteration index to transform.
/// \param StartValue   the induction's start value.
/// \param Step         the induction's step (integer, pointer offset, or FP).
/// \param InductionKind selects the int/pointer/FP lowering below.
/// \param InductionBinOp the original FAdd/FSub; required for FP inductions.
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  // Coerce Index to the step's type: sext/trunc for integers, int-to-FP for
  // floating-point inductions.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast" );
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Add helper that folds away additions of a zero constant.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!" );
    if (auto *CX = dyn_cast<ConstantInt>(Val: X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!" );
    if (auto *CX = dyn_cast<ConstantInt>(Val: X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet" );
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type" );
    // Special-case step == -1 as a plain subtraction.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet" );
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value" );
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction" );

    // Reuse the original FAdd/FSub opcode so FP semantics are preserved.
    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction" );
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum" );
}
2363 | |
2364 | std::optional<unsigned> getMaxVScale(const Function &F, |
2365 | const TargetTransformInfo &TTI) { |
2366 | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
2367 | return MaxVScale; |
2368 | |
2369 | if (F.hasFnAttribute(Attribute::VScaleRange)) |
2370 | return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); |
2371 | |
2372 | return std::nullopt; |
2373 | } |
2374 | |
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
/// \param Cost the cost model, providing TTI, Legal, PSE and the loop.
/// \param VF   the vectorization factor being considered.
/// \param UF   the unroll factor; if absent, the maximum interleave factor
///             for \p VF is assumed conservatively.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  // The overflow check is performed in the widest induction type.
  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(Val: IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC =
          Cost->PSE.getSE()->getSmallConstantMaxTripCount(L: Cost->TheLoop)) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable VFs, the worst case is the known-min VF scaled by the
      // maximum possible vscale; if that is unknown, stay conservative.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // Overflow cannot happen iff the headroom above the trip count exceeds
    // the largest possible per-iteration step (VF * UF).
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2407 | |
2408 | // Return whether we allow using masked interleave-groups (for dealing with |
2409 | // strided loads/stores that reside in predicated blocks, or for dealing |
2410 | // with gaps). |
2411 | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2412 | // If an override option has been passed in for interleaved accesses, use it. |
2413 | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2414 | return EnableMaskedInterleavedMemAccesses; |
2415 | |
2416 | return TTI.enableMaskedInterleavedAccessVectorization(); |
2417 | } |
2418 | |
2419 | // Try to vectorize the interleave group that \p Instr belongs to. |
2420 | // |
2421 | // E.g. Translate following interleaved load group (factor = 3): |
2422 | // for (i = 0; i < N; i+=3) { |
2423 | // R = Pic[i]; // Member of index 0 |
2424 | // G = Pic[i+1]; // Member of index 1 |
2425 | // B = Pic[i+2]; // Member of index 2 |
2426 | // ... // do something to R, G, B |
2427 | // } |
2428 | // To: |
2429 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
2430 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
2431 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
2432 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
2433 | // |
2434 | // Or translate following interleaved store group (factor = 3): |
2435 | // for (i = 0; i < N; i+=3) { |
2436 | // ... do something to R, G, B |
2437 | // Pic[i] = R; // Member of index 0 |
2438 | // Pic[i+1] = G; // Member of index 1 |
2439 | // Pic[i+2] = B; // Member of index 2 |
2440 | // } |
2441 | // To: |
2442 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
2443 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
2444 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
2445 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
2446 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask, bool NeedsMaskForGaps) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  // The wide vector covers all members of VF consecutive interleave groups.
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported." );

  Value *Idx;
  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse()) {
    Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF);
    Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: Builder.getInt32(C: 1));
    Idx = Builder.CreateMul(LHS: Idx, RHS: Builder.getInt32(C: Group->getFactor()));
    Idx = Builder.CreateAdd(LHS: Idx, RHS: Builder.getInt32(C: Index));
    Idx = Builder.CreateNeg(V: Idx);
  } else
    Idx = Builder.getInt32(C: -Index);

  // Compute one adjusted base address per unroll part.
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Def: Addr, Instance: VPIteration(Part, 0));
    if (auto *I = dyn_cast<Instruction>(Val: AddrPart))
      State.setDebugLocFrom(I->getDebugLoc());

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
    // b = A[i]; // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g. A[i+1] = a; // Member of index 1
    // A[i] = b; // Member of index 0
    // A[i+2] = c; // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Val: AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(Ty: ScalarTy, Ptr: AddrPart, IdxList: Idx, Name: "" , IsInBounds: InBounds);
    AddrParts.push_back(Elt: AddrPart);
  }

  State.setDebugLocFrom(Instr->getDebugLoc());
  Value *PoisonVec = PoisonValue::get(T: VecTy);

  // Build the lane mask for one unroll part, combining the block-in mask
  // (if any) with the mask covering gaps in the group (if needed).
  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
    if (VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported." );
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors" );
      auto *BlockInMaskPart = State.get(Def: BlockInMask, Part);
      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
      auto *MaskTy =
          VectorType::get(ElementType: Builder.getInt1Ty(), NumElements: VF.getKnownMinValue() * 2, Scalable: true);
      return Builder.CreateIntrinsic(
          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
          /*FMFSource=*/nullptr, "interleaved.mask" );
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *BlockInMaskPart = State.get(Def: BlockInMask, Part);
    Value *ShuffledMask = Builder.CreateShuffleVector(
        V: BlockInMaskPart,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: VF.getKnownMinValue()),
        Name: "interleaved.mask" );
    return MaskForGaps ? Builder.CreateBinOp(Opc: Instruction::And, LHS: ShuffledMask,
                                             RHS: MaskForGaps)
                       : ShuffledMask;
  };

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps =
          createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null" );
    }

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed." );
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
        NewLoad =
            Builder.CreateMaskedLoad(Ty: VecTy, Ptr: AddrParts[Part], Alignment: Group->getAlign(),
                                     Mask: GroupMask, PassThru: PoisonVec, Name: "wide.masked.vec" );
      }
      else
        NewLoad = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: AddrParts[Part],
                                            Align: Group->getAlign(), Name: "wide.vec" );
      Group->addMetadata(NewInst: NewLoad);
      NewLoads.push_back(Elt: NewLoad);
    }

    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors" );

      for (unsigned Part = 0; Part < UF; ++Part) {
        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
        // so must use intrinsics to deinterleave.
        Value *DI = Builder.CreateIntrinsic(
            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
            /*FMFSource=*/nullptr, "strided.vec" );
        unsigned J = 0;
        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          Instruction *Member = Group->getMember(Index: I);

          // Skip the gaps in the group.
          if (!Member)
            continue;

          Value *StridedVec = Builder.CreateExtractValue(Agg: DI, Idxs: I);
          // If this member has different type, cast the result type.
          if (Member->getType() != ScalarTy) {
            VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
            StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
          }

          if (Group->isReverse())
            StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" );

          State.set(Def: VPDefs[J], V: StridedVec, Part);
          ++J;
        }
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(Start: I, Stride: InterleaveFactor, VF: VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            V: NewLoads[Part], Mask: StrideMask, Name: "strided.vec" );

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable." );
          VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
          StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" );

        State.set(Def: VPDefs[J], V: StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed." );
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported." );
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group" );
      Instruction *Member = Group->getMember(Index: i);

      // Skip the gaps in the group; gap lanes are poison and masked off.
      if (!Member) {
        Value *Undef = PoisonValue::get(T: SubVT);
        StoredVecs.push_back(Elt: Undef);
        continue;
      }

      Value *StoredVec = State.get(Def: StoredValues[StoredIdx], Part);
      ++StoredIdx;

      if (Group->isReverse())
        StoredVec = Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" );

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(V: StoredVec, DstVTy: SubVT, DL);

      StoredVecs.push_back(Elt: StoredVec);
    }

    // Interleave all the smaller vectors into one wider vector.
    Value *IVec = interleaveVectors(Builder, Vals: StoredVecs, Name: "interleaved.vec" );
    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = Builder.CreateMaskedStore(Val: IVec, Ptr: AddrParts[Part],
                                                Alignment: Group->getAlign(), Mask: GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(Val: IVec, Ptr: AddrParts[Part], Align: Group->getAlign());

    Group->addMetadata(NewInst: NewStoreInstr);
  }
}
2688 | |
/// Emit one scalar copy of \p Instr for the lane/part identified by
/// \p Instance: clone the instruction, rewire its operands to their per-lane
/// scalar values from \p State, insert it at the current builder position,
/// and record it as the value of \p RepRecipe for that lane/part.
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors" );

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Val: Instr))
    if (!Instance.isFirstIteration())
      return;

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned" );
#if !defined(NDEBUG)
    // Verify that VPlan type inference results agree with the type of the
    // generated values.
    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
           "inferred type and type from generated instructions do not match" );
#endif
  }

  // Transfer the recipe's IR flags onto the clone.
  RepRecipe->setFlags(Cloned);

  // Keep the debug location of the original instruction, if it has one.
  if (auto DL = Instr->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputInstance = Instance;
    VPValue *Operand = I.value();
    // Uniform-after-vectorization operands only have a value for lane 0;
    // always read that lane regardless of the lane being scalarized.
    if (vputils::isUniformAfterVectorization(VPV: Operand))
      InputInstance.Lane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Instance: InputInstance));
  }
  State.addNewMetadata(To: Cloned, Orig: Instr);

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  State.set(Def: RepRecipe, V: Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    AC->registerAssumption(CI: II);

  // End if-block. Clones emitted inside a replicate region are predicated and
  // collected for later predication handling.
  bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Elt: Cloned);
}
2745 | |
/// Compute (and cache in VectorTripCount) the trip count of the vector loop:
/// the original trip count rounded to a multiple of VF * UF — rounded up when
/// the tail is folded by masking, down otherwise — and reduced by one full
/// step when a scalar epilogue must run even for an evenly-dividing count.
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  // Return the cached value on repeated queries.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking" );
    Value *NumLanes = getRuntimeVF(B&: Builder, Ty, VF: VF * UF);
    TC = Builder.CreateAdd(
        LHS: TC, RHS: Builder.CreateSub(LHS: NumLanes, RHS: ConstantInt::get(Ty, V: 1)), Name: "n.rnd.up" );
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf" );

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
    R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
  }

  VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec" );

  return VectorTripCount;
}
2796 | |
2797 | Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, |
2798 | const DataLayout &DL) { |
2799 | // Verify that V is a vector type with same number of elements as DstVTy. |
2800 | auto *DstFVTy = cast<VectorType>(Val: DstVTy); |
2801 | auto VF = DstFVTy->getElementCount(); |
2802 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
2803 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
2804 | Type *SrcElemTy = SrcVecTy->getElementType(); |
2805 | Type *DstElemTy = DstFVTy->getElementType(); |
2806 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
2807 | "Vector elements must have same size" ); |
2808 | |
2809 | // Do a direct cast if element types are castable. |
2810 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
2811 | return Builder.CreateBitOrPointerCast(V, DestTy: DstFVTy); |
2812 | } |
2813 | // V cannot be directly casted to desired vector type. |
2814 | // May happen when V is a floating point vector but DstVTy is a vector of |
2815 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
2816 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
2817 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
2818 | "Only one type should be a pointer type" ); |
2819 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
2820 | "Only one type should be a floating point type" ); |
2821 | Type *IntTy = |
2822 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
2823 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
2824 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
2825 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstFVTy); |
2826 | } |
2827 | |
/// Emit the minimum-iteration-count guard in front of the vector loop: branch
/// to \p Bypass (the scalar loop) when the trip count is too small for a
/// single vector iteration, or — for scalable VFs without a known-safe bound —
/// when the induction-variable update could overflow. Splits off a fresh
/// vector preheader and updates the dominator tree accordingly.
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the runtime step may exceed the static minimum; take
    // the larger of the two.
    return Builder.CreateBinaryIntrinsic(
        Intrinsic::ID: umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None)
    CheckMinIters =
        Builder.CreateICmp(P, LHS: Count, RHS: CreateStep(), Name: "min.iters.check" );
  else if (VF.isScalable() &&
           !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
           Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
    Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
  }

  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), DT, LI, MSSAU: nullptr,
                 BBName: "vector.ph" );

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass" );

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
  LoopBypassBlocks.push_back(Elt: TCCheckBlock);
}
2903 | |
/// Emit the runtime checks for the SCEV assumptions (e.g. stride/overflow
/// predicates) made during analysis, branching to \p Bypass when a check
/// fails. Returns the check block, or nullptr if no SCEV checks are needed.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  // Emitting runtime checks when optimizing for size is only acceptable if the
  // user explicitly forced vectorization.
  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size" );


  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(BB: Bypass, NewBB: SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(Elt: SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
2930 | |
/// Emit the runtime memory checks (array-overlap/aliasing) in their own
/// block, branching to \p Bypass when the checks fail. Returns the check
/// block, or nullptr when no memory checks are required (or on the
/// VPlan-native path, which does not analyze runtime checks).
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize." );
    // Warn the user that forced vectorization under -Os/-Oz grows code size.
    ORE->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize" ,
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict')." ;
    });
  }

  LoopBypassBlocks.push_back(Elt: MemCheckBlock);

  AddedSafetyChecks = true;

  return MemCheckBlock;
}
2966 | |
/// Split the original preheader to create the skeleton blocks around the
/// yet-to-be-created vector loop — the middle block and the scalar preheader,
/// named with \p Prefix — and set up the middle-block terminator.
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure" );
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
         "multiple exit loop without required epilogue?" );

  LoopMiddleBlock =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "middle.block" );
  LoopScalarPreHeader =
      SplitBlock(Old: LoopMiddleBlock, SplitPt: LoopMiddleBlock->getTerminator(), DT, LI,
                 MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph" );

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())
          ? BranchInst::Create(IfTrue: LoopScalarPreHeader)
          : BranchInst::Create(IfTrue: LoopExitBlock, IfFalse: LoopScalarPreHeader,
                               Cond: Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(From: LoopMiddleBlock->getTerminator(), To: BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: LoopMiddleBlock);
}
3009 | |
/// Create the "bc.resume.val" phi in the scalar preheader for induction
/// \p OrigPhi: it merges the induction's end value after the vector loop
/// (incoming from the middle block) with its original start value (incoming
/// from each bypass block), and — if \p AdditionalBypass is provided — the
/// end value computed for that extra bypass edge.
PHINode *InnerLoopVectorizer::createInductionResumeValue(
    PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
    ArrayRef<BasicBlock *> BypassBlocks,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments" );

  Instruction *OldInduction = Legal->getPrimaryInduction();
  Value *&EndValue = IVEndValues[OrigPhi];
  Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
  if (OrigPhi == OldInduction) {
    // We know what the end value is.
    EndValue = VectorTripCount;
  } else {
    // For secondary inductions, compute the end value by transforming the
    // vector trip count through the induction's start/step expression.
    IRBuilder<> B(LoopVectorPreHeader->getTerminator());

    // Fast-math-flags propagate from the original induction instruction.
    if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

    EndValue = emitTransformedIndex(B, Index: VectorTripCount, StartValue: II.getStartValue(),
                                    Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
    EndValue->setName("ind.end" );

    // Compute the end value for the additional bypass (if applicable).
    if (AdditionalBypass.first) {
      B.SetInsertPoint(TheBB: AdditionalBypass.first,
                       IP: AdditionalBypass.first->getFirstInsertionPt());
      EndValueFromAdditionalBypass =
          emitTransformedIndex(B, Index: AdditionalBypass.second, StartValue: II.getStartValue(),
                               Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
      EndValueFromAdditionalBypass->setName("ind.end" );
    }
  }

  // Create phi nodes to merge from the backedge-taken check block.
  PHINode *BCResumeVal =
      PHINode::Create(Ty: OrigPhi->getType(), NumReservedValues: 3, NameStr: "bc.resume.val" ,
                      InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator());
  // Copy original phi DL over to the new one.
  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());

  // The new PHI merges the original incoming value, in case of a bypass,
  // or the value at the end of the vectorized loop.
  BCResumeVal->addIncoming(V: EndValue, BB: LoopMiddleBlock);

  // Fix the scalar body counter (PHI node).
  // The old induction's phi node in the scalar body needs the truncated
  // value.
  for (BasicBlock *BB : BypassBlocks)
    BCResumeVal->addIncoming(V: II.getStartValue(), BB);

  if (AdditionalBypass.first)
    BCResumeVal->setIncomingValueForBlock(BB: AdditionalBypass.first,
                                          V: EndValueFromAdditionalBypass);
  return BCResumeVal;
}
3067 | |
3068 | /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV |
3069 | /// expansion results. |
3070 | static Value *getExpandedStep(const InductionDescriptor &ID, |
3071 | const SCEV2ValueTy &ExpandedSCEVs) { |
3072 | const SCEV *Step = ID.getStep(); |
3073 | if (auto *C = dyn_cast<SCEVConstant>(Val: Step)) |
3074 | return C->getValue(); |
3075 | if (auto *U = dyn_cast<SCEVUnknown>(Val: Step)) |
3076 | return U->getValue(); |
3077 | auto I = ExpandedSCEVs.find(Val: Step); |
3078 | assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point" ); |
3079 | return I->second; |
3080 | } |
3081 | |
/// Create resume phis in the scalar preheader for every induction variable,
/// so the scalar remainder loop starts from the counter value reached by the
/// vector loop (or from the original start value on bypass edges).
void InnerLoopVectorizer::createInductionResumeValues(
    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass." );
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (const auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    PHINode *BCResumeVal = createInductionResumeValue(
        OrigPhi, II, Step: getExpandedStep(ID: II, ExpandedSCEVs), BypassBlocks: LoopBypassBlocks,
        AdditionalBypass);
    OrigPhi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: BCResumeVal);
  }
}
3104 | |
/// Finish the loop skeleton: install the middle-block trip-count comparison
/// that decides whether the scalar remainder loop must execute, attach branch
/// weights when profile data is available, and return the vector preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
  // The trip counts should be cached by now.
  Value *Count = getTripCount();
  Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) &&
      !Cost->foldTailByMasking()) {
    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
    // operands. Perform simplification directly on VPlan once the branch is
    // modeled there.
    IRBuilder<> B(LoopMiddleBlock->getTerminator());
    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
    Value *CmpN = B.CreateICmpEQ(LHS: Count, RHS: VectorTripCount, Name: "cmp.n" );
    BranchInst &BI = *cast<BranchInst>(Val: LoopMiddleBlock->getTerminator());
    BI.setCondition(CmpN);
    if (hasBranchWeightMD(I: *ScalarLatchTerm)) {
      // Assume that `Count % VectorTripCount` is equally distributed.
      unsigned TripCount = UF * VF.getKnownMinValue();
      assert(TripCount > 0 && "trip count should not be zero" );
      const uint32_t Weights[] = {1, TripCount - 1};
      setBranchWeights(I&: BI, Weights);
    }
  }

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  return LoopVectorPreHeader;
}
3149 | |
/// Generate the skeleton around the future vector loop: runtime guards
/// (minimum-iteration, SCEV and memory checks), preheaders, the middle block
/// and the induction resume phis. The vector loop body itself is emitted
/// later, during VPlan execution.
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
  | ->[ ]     <--- new preheader.
  |    |
  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
  |   [ ] \
  |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
   \   |
    \  v
     >[ ]     <-- exit block(s).
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton(Prefix: "" );

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(Bypass: LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Bypass: LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(ExpandedSCEVs);

  return {completeLoopSkeleton(), nullptr};
}
3211 | |
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *, VPlan &Plan,
                                       VPTransformState &State) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block" );

  // Maps each out-of-loop LCSSA phi to the value it must receive when control
  // arrives from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(Val: U);
    if (!OrigLoop->contains(Inst: UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form" );
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(Val: U);
    if (!OrigLoop->contains(Inst: UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form" );
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          LHS: VectorTripCount, RHS: ConstantInt::get(Ty: VectorTripCount->getType(), V: 1));
      CountMinusOne->setName("cmo" );

      VPValue *StepVPV = Plan.getSCEVExpansion(S: II.getStep());
      assert(StepVPV && "step must have been expanded during VPlan execution" );
      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
                                        : State.get(Def: StepVPV, Instance: {0, 0});
      Value *Escape =
          emitTransformedIndex(B, Index: CountMinusOne, StartValue: II.getStartValue(), Step,
                               InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
      Escape->setName("ind.escape" );
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(Val: I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(BB: MiddleBlock) == -1) {
      PHI->addIncoming(V: I.second, BB: MiddleBlock);
      Plan.removeLiveOut(PN: PHI);
    }
  }
}
3284 | |
namespace {

/// DenseMapInfo implementation used to CSE the redundant vector-manipulation
/// instructions the vectorizer emits (insert/extract-element, shufflevector
/// and GEP): instructions are hashed by opcode plus operands and compared
/// structurally with isIdenticalTo.
struct CSEDenseMapInfo {
  // Only these instruction kinds participate in CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!" );
    // Hash the opcode together with all operand values.
    return hash_combine(args: I->getOpcode(), args: hash_combine_range(first: I->value_op_begin(),
                                                   last: I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must never be dereferenced; compare them by pointer only.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};

} // end anonymous namespace
3316 | |
/// Perform common subexpression elimination of induction variable
/// instructions within basic block \p BB, keeping the first occurrence of
/// each equivalent instruction and deleting later duplicates.
static void cse(BasicBlock *BB) {
  // Perform simple cse. Use an early-increment range so the current
  // instruction can be erased while iterating.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
    if (!CSEDenseMapInfo::canHandle(I: &In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions. If so, redirect all uses to the earlier copy and
    // delete the duplicate.
    if (Instruction *V = CSEMap.lookup(Val: &In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    // First occurrence: record it as the canonical copy.
    CSEMap[&In] = &In;
  }
}
3336 | |
/// Return the cost of the call \p CI at vectorization factor \p VF. For
/// vector VFs the pre-computed widening decision's cost is returned; for the
/// scalar VF the cost is computed here (scalar call vs. intrinsic, whichever
/// is cheaper).
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)).Cost;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *RetTy = CI->getType();
  // fmuladd may fold into a cheaper fused reduction pattern; prefer that
  // cost when available.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy, CostKind))
      return *RedCost;

  // Collect the (scalar) argument types for the call-cost query.
  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
3365 | |
3366 | static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { |
3367 | if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) |
3368 | return Elt; |
3369 | return VectorType::get(ElementType: Elt, EC: VF); |
3370 | } |
3371 | |
/// Return the cost of intrinsic call \p CI widened to vectorization factor
/// \p VF, widening both the return type and the parameter types.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!" );
  // Widen the return type to a vector of VF elements (no-op for scalar VF).
  Type *RetTy = MaybeVectorizeType(Elt: CI->getType(), VF);
  // Propagate the call's fast-math flags, if any, into the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  // Widen each parameter type the same way as the return type.
  SmallVector<Type *> ParamTys;
  std::transform(first: FTy->param_begin(), last: FTy->param_end(),
                 result: std::back_inserter(x&: ParamTys),
                 unary_op: [&](Type *Ty) { return MaybeVectorizeType(Elt: Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(Val: CI));
  return TTI.getIntrinsicInstrCost(ICA: CostAttrs,
                                   CostKind: TargetTransformInfo::TCK_RecipThroughput);
}
3394 | |
3395 | static Type *smallestIntegerVectorType(Type *T1, Type *T2) { |
3396 | auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType()); |
3397 | auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType()); |
3398 | return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; |
3399 | } |
3400 | |
3401 | static Type *largestIntegerVectorType(Type *T1, Type *T2) { |
3402 | auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType()); |
3403 | auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType()); |
3404 | return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; |
3405 | } |
3406 | |
3407 | void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, |
3408 | VPlan &Plan) { |
3409 | // Fix widened non-induction PHIs by setting up the PHI operands. |
3410 | if (EnableVPlanNativePath) |
3411 | fixNonInductionPHIs(Plan, State); |
3412 | |
3413 | // At this point every instruction in the original loop is widened to a |
3414 | // vector form. Now we need to fix the recurrences in the loop. These PHI |
3415 | // nodes are currently empty because we did not want to introduce cycles. |
3416 | // This is the second stage of vectorizing recurrences. Note that fixing |
3417 | // reduction phis are already modeled in VPlan. |
3418 | // TODO: Also model fixing fixed-order recurrence phis in VPlan. |
3419 | VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); |
3420 | VPBasicBlock * = VectorRegion->getEntryBasicBlock(); |
3421 | for (VPRecipeBase &R : HeaderVPBB->phis()) { |
3422 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R)) |
3423 | fixFixedOrderRecurrence(PhiR: FOR, State); |
3424 | } |
3425 | |
3426 | // Forget the original basic block. |
3427 | PSE.getSE()->forgetLoop(L: OrigLoop); |
3428 | PSE.getSE()->forgetBlockAndLoopDispositions(); |
3429 | |
3430 | // After vectorization, the exit blocks of the original loop will have |
3431 | // additional predecessors. Invalidate SCEVs for the exit phis in case SE |
3432 | // looked through single-entry phis. |
3433 | SmallVector<BasicBlock *> ExitBlocks; |
3434 | OrigLoop->getExitBlocks(ExitBlocks); |
3435 | for (BasicBlock *Exit : ExitBlocks) |
3436 | for (PHINode &PN : Exit->phis()) |
3437 | PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN); |
3438 | |
3439 | VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); |
3440 | Loop *VectorLoop = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[LatchVPBB]); |
3441 | if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) { |
3442 | // No edge from the middle block to the unique exit block has been inserted |
3443 | // and there is nothing to fix from vector loop; phis should have incoming |
3444 | // from scalar loop only. |
3445 | } else { |
3446 | // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking |
3447 | // the cost model. |
3448 | |
3449 | // If we inserted an edge from the middle block to the unique exit block, |
3450 | // update uses outside the loop (phis) to account for the newly inserted |
3451 | // edge. |
3452 | |
3453 | // Fix-up external users of the induction variables. |
3454 | for (const auto &Entry : Legal->getInductionVars()) |
3455 | fixupIVUsers(OrigPhi: Entry.first, II: Entry.second, |
3456 | VectorTripCount: getOrCreateVectorTripCount(InsertBlock: VectorLoop->getLoopPreheader()), |
3457 | EndValue: IVEndValues[Entry.first], MiddleBlock: LoopMiddleBlock, |
3458 | VectorHeader: VectorLoop->getHeader(), Plan, State); |
3459 | } |
3460 | |
3461 | // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated |
3462 | // in the exit block, so update the builder. |
3463 | State.Builder.SetInsertPoint(TheBB: State.CFG.ExitBB, |
3464 | IP: State.CFG.ExitBB->getFirstNonPHIIt()); |
3465 | for (const auto &KV : Plan.getLiveOuts()) |
3466 | KV.second->fixPhi(Plan, State); |
3467 | |
3468 | for (Instruction *PI : PredicatedInstructions) |
3469 | sinkScalarOperands(PredInst: &*PI); |
3470 | |
3471 | // Remove redundant induction instructions. |
3472 | cse(BB: VectorLoop->getHeader()); |
3473 | |
3474 | // Set/update profile weights for the vector and remainder loops as original |
3475 | // loop iterations are now distributed among them. Note that original loop |
3476 | // represented by LoopScalarBody becomes remainder loop after vectorization. |
3477 | // |
3478 | // For cases like foldTailByMasking() and requiresScalarEpiloque() we may |
3479 | // end up getting slightly roughened result but that should be OK since |
3480 | // profile is not inherently precise anyway. Note also possible bypass of |
3481 | // vector code caused by legality checks is ignored, assigning all the weight |
3482 | // to the vector loop, optimistically. |
3483 | // |
3484 | // For scalable vectorization we can't know at compile time how many iterations |
3485 | // of the loop are handled in one vector iteration, so instead assume a pessimistic |
3486 | // vscale of '1'. |
3487 | setProfileInfoAfterUnrolling(OrigLoop: LI->getLoopFor(BB: LoopScalarBody), UnrolledLoop: VectorLoop, |
3488 | RemainderLoop: LI->getLoopFor(BB: LoopScalarBody), |
3489 | UF: VF.getKnownMinValue() * UF); |
3490 | } |
3491 | |
3492 | void InnerLoopVectorizer::fixFixedOrderRecurrence( |
3493 | VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { |
3494 | // This is the second phase of vectorizing first-order recurrences. An |
3495 | // overview of the transformation is described below. Suppose we have the |
3496 | // following loop. |
3497 | // |
3498 | // for (int i = 0; i < n; ++i) |
3499 | // b[i] = a[i] - a[i - 1]; |
3500 | // |
3501 | // There is a first-order recurrence on "a". For this loop, the shorthand |
3502 | // scalar IR looks like: |
3503 | // |
3504 | // scalar.ph: |
3505 | // s_init = a[-1] |
3506 | // br scalar.body |
3507 | // |
3508 | // scalar.body: |
3509 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
3510 | // s1 = phi [s_init, scalar.ph], [s2, scalar.body] |
3511 | // s2 = a[i] |
3512 | // b[i] = s2 - s1 |
3513 | // br cond, scalar.body, ... |
3514 | // |
3515 | // In this example, s1 is a recurrence because it's value depends on the |
3516 | // previous iteration. In the first phase of vectorization, we created a |
3517 | // vector phi v1 for s1. We now complete the vectorization and produce the |
3518 | // shorthand vector IR shown below (for VF = 4, UF = 1). |
3519 | // |
3520 | // vector.ph: |
3521 | // v_init = vector(..., ..., ..., a[-1]) |
3522 | // br vector.body |
3523 | // |
3524 | // vector.body |
3525 | // i = phi [0, vector.ph], [i+4, vector.body] |
3526 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
3527 | // v2 = a[i, i+1, i+2, i+3]; |
3528 | // v3 = vector(v1(3), v2(0, 1, 2)) |
3529 | // b[i, i+1, i+2, i+3] = v2 - v3 |
3530 | // br cond, vector.body, middle.block |
3531 | // |
3532 | // middle.block: |
3533 | // x = v2(3) |
3534 | // br scalar.ph |
3535 | // |
3536 | // scalar.ph: |
3537 | // s_init = phi [x, middle.block], [a[-1], otherwise] |
3538 | // br scalar.body |
3539 | // |
3540 | // After execution completes the vector loop, we extract the next value of |
3541 | // the recurrence (x) to use as the initial value in the scalar loop. |
3542 | |
3543 | // Extract the last vector element in the middle block. This will be the |
3544 | // initial value for the recurrence when jumping to the scalar loop. |
3545 | VPValue *PreviousDef = PhiR->getBackedgeValue(); |
3546 | Value *Incoming = State.get(Def: PreviousDef, Part: UF - 1); |
3547 | auto * = Incoming; |
3548 | auto *IdxTy = Builder.getInt32Ty(); |
3549 | Value *RuntimeVF = nullptr; |
3550 | if (VF.isVector()) { |
3551 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
3552 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
3553 | RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF); |
3554 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
3555 | ExtractForScalar = |
3556 | Builder.CreateExtractElement(Vec: Incoming, Idx: LastIdx, Name: "vector.recur.extract" ); |
3557 | } |
3558 | |
3559 | auto RecurSplice = cast<VPInstruction>(Val: *PhiR->user_begin()); |
3560 | assert(PhiR->getNumUsers() == 1 && |
3561 | RecurSplice->getOpcode() == |
3562 | VPInstruction::FirstOrderRecurrenceSplice && |
3563 | "recurrence phi must have a single user: FirstOrderRecurrenceSplice" ); |
3564 | SmallVector<VPLiveOut *> LiveOuts; |
3565 | for (VPUser *U : RecurSplice->users()) |
3566 | if (auto *LiveOut = dyn_cast<VPLiveOut>(Val: U)) |
3567 | LiveOuts.push_back(Elt: LiveOut); |
3568 | |
3569 | if (!LiveOuts.empty()) { |
3570 | // Extract the second last element in the middle block if the |
3571 | // Phi is used outside the loop. We need to extract the phi itself |
3572 | // and not the last element (the phi update in the current iteration). This |
3573 | // will be the value when jumping to the exit block from the |
3574 | // LoopMiddleBlock, when the scalar loop is not run at all. |
3575 | Value * = nullptr; |
3576 | if (VF.isVector()) { |
3577 | auto *Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: 2)); |
3578 | ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( |
3579 | Vec: Incoming, Idx, Name: "vector.recur.extract.for.phi" ); |
3580 | } else { |
3581 | assert(UF > 1 && "VF and UF cannot both be 1" ); |
3582 | // When loop is unrolled without vectorizing, initialize |
3583 | // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled |
3584 | // value of `Incoming`. This is analogous to the vectorized case above: |
3585 | // extracting the second last element when VF > 1. |
3586 | ExtractForPhiUsedOutsideLoop = State.get(Def: PreviousDef, Part: UF - 2); |
3587 | } |
3588 | |
3589 | for (VPLiveOut *LiveOut : LiveOuts) { |
3590 | assert(!Cost->requiresScalarEpilogue(VF.isVector())); |
3591 | PHINode *LCSSAPhi = LiveOut->getPhi(); |
3592 | LCSSAPhi->addIncoming(V: ExtractForPhiUsedOutsideLoop, BB: LoopMiddleBlock); |
3593 | State.Plan->removeLiveOut(PN: LCSSAPhi); |
3594 | } |
3595 | } |
3596 | |
3597 | // Fix the initial value of the original recurrence in the scalar loop. |
3598 | Builder.SetInsertPoint(TheBB: LoopScalarPreHeader, IP: LoopScalarPreHeader->begin()); |
3599 | PHINode *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
3600 | auto *Start = Builder.CreatePHI(Ty: Phi->getType(), NumReservedValues: 2, Name: "scalar.recur.init" ); |
3601 | auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); |
3602 | for (auto *BB : predecessors(BB: LoopScalarPreHeader)) { |
3603 | auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; |
3604 | Start->addIncoming(V: Incoming, BB); |
3605 | } |
3606 | |
3607 | Phi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: Start); |
3608 | Phi->setName("scalar.recur" ); |
3609 | } |
3610 | |
/// Iteratively sink scalar operands of the predicated instruction \p PredInst
/// into its predicated block, so they only execute when the predicate holds.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(BB: PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(Val: U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(Val: I))
      BB = Phi->getIncomingBlock(
          i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, it's
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(Start: InstsToReanalyze.begin(), End: InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Val: Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
      if (!I || isa<PHINode>(Val: I) || !VectorLoop->contains(Inst: I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(Start: I->op_begin(), End: I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(Range: I->uses(), P: isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(Elt: I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // it's operands to the worklist.
      I->moveBefore(MovePos: &*PredBB->getFirstInsertionPt());
      Worklist.insert(Start: I->op_begin(), End: I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
3685 | |
/// Fix widened non-induction PHI nodes (VPlan-native path): after all recipes
/// have been executed, fill in the incoming values/blocks of each widened phi
/// from the corresponding VPlan incoming values and blocks.
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
                                              VPTransformState &State) {
  // Walk every VPBasicBlock reachable from the plan entry, including nested
  // regions.
  auto Iter = vp_depth_first_deep(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi, Part: 0));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
        VPValue *Inc = VPPhi->getIncomingValue(I: i);
        // NOTE(review): this inner VPBB shadows the loop variable above —
        // intentional here, but a rename would aid readability.
        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(I: i);
        NewPhi->addIncoming(V: State.get(Def: Inc, Part: 0), BB: State.CFG.VPBB2IRBB[VPBB]);
      }
    }
  }
}
3705 | |
/// Collect the set of instructions that will remain scalar (per-lane) after
/// vectorization with factor \p VF, populating Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF" );

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert(I: Uniforms[VF].begin(), E: Uniforms[VF].end());
    return;
  }

  // Worklist of instructions known (so far) to remain scalar.
  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment" );
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand" );
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(Val: V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(Val: V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(Range: I->users(), P: [&](User *U) {
          return isa<LoadInst>(Val: U) || isa<StoreInst>(Val: U);
        }))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use, and (3) instructions forced to be scalar
  // (ForcedScalars, inserted further below).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Start: Uniforms[VF].begin(), End: Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Only pointers used exclusively by memory accesses, and never in a
  // possibly-non-scalar way, are known to stay scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n" );
      Worklist.insert(X: I);
    }

  // (3) Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n" );
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src stays scalar only if every in-loop user is already known-scalar or
    // is a load/store that uses Src as a scalar pointer.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n" );
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
          auto *I = cast<Instruction>(Val: U);
          return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n" );
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n" );
  }

  Scalars[VF].insert(I: Worklist.begin(), E: Worklist.end());
}
3899 | |
/// Return true if \p I must be scalarized AND predicated at factor \p VF,
/// i.e. it requires predication and no non-scalar (masked/widened) lowering
/// is available for it.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  // Instructions that need no predication are never scalar-with-predication.
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    // Conservative default: no masked lowering known.
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    // Use the pre-computed call widening decision for this (CI, VF) pair.
    return CallWideningDecisions.at(Val: std::make_pair(x: cast<CallInst>(Val: I), y&: VF))
               .Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    // A memory op avoids scalarization if the target supports either a
    // masked contiguous access or a masked gather/scatter for it.
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment) ||
                             TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                        : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment) ||
                            TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
3940 | |
/// Return true if \p I requires some form of predication when vectorized,
/// i.e. it is in a block needing predication and cannot be proven safe to
/// execute unconditionally.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // If the whole block executes unconditionally, nothing in it needs a mask.
  if (!blockNeedsPredicationForAnyReason(BB: I->getParent()))
    return false;

  // Can we prove this instruction is safe to unconditionally execute?
  // If not, we must use some form of predication.
  switch(I->getOpcode()) {
  default:
    // Most instructions are speculatable and need no predication.
    return false;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // When we know the load's address is loop invariant and the instruction
    // in the original scalar loop was unconditionally executed then we
    // don't need to mark it as a predicated instruction. Tail folding may
    // introduce additional predication, but we're guaranteed to always have
    // at least one active lane. We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding. For stores, we need to prove
    // both speculation safety (which follows from the same argument as loads),
    // but also must prove the value being stored is correct. The easiest
    // form of the later is to require that all values stored are the same.
    if (Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
        (isa<LoadInst>(Val: I) ||
         (isa<StoreInst>(Val: I) &&
          TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(BB: I->getParent()))
      return false;
    return true;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // Division/remainder may trap (e.g. divide by zero), so it is predicated
    // unless speculation is provably safe.
    // TODO: We can use the loop-preheader as context point here and get
    // context sensitive reasoning
    return !isSafeToSpeculativelyExecute(I);
  case Instruction::Call:
    // Calls are predicated exactly when legality determined a mask is needed.
    return Legal->isMaskRequired(I);
  }
}
3982 | |
// Return the costs of the two possible strategies for widening a
// udiv/sdiv/urem/srem that is not safe to execute speculatively (the divisor
// may be poison/zero in a masked-off lane): (1) scalarize with per-lane
// predication, or (2) keep the operation vectorized and select a "safe"
// divisor into the inactive lanes. The caller compares the two costs.
// Returns {ScalarizationCost, SafeDivisorCost}; ScalarizationCost is
// invalid for scalable VFs, where per-lane scalarization is not possible.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = ToVectorTy(Scalar: I->getType(), EC: VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost += TTI.getCmpSelInstrCost(
      Opcode: Instruction::Select, ValTy: VecTy,
      CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
      VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  // A loop-invariant divisor acts like a uniform (splatted) operand even when
  // TTI cannot deduce that from the IR value alone, so upgrade the hint.
  Value *Op2 = I->getOperand(i: 1);
  auto Op2Info = TTI.getOperandInfo(V: Op2);
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(V: Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: Op2Info, Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
4048 | |
4049 | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( |
4050 | Instruction *I, ElementCount VF) { |
4051 | assert(isAccessInterleaved(I) && "Expecting interleaved access." ); |
4052 | assert(getWideningDecision(I, VF) == CM_Unknown && |
4053 | "Decision should not be set yet." ); |
4054 | auto *Group = getInterleavedAccessGroup(Instr: I); |
4055 | assert(Group && "Must have a group." ); |
4056 | |
4057 | // If the instruction's allocated size doesn't equal it's type size, it |
4058 | // requires padding and will be scalarized. |
4059 | auto &DL = I->getModule()->getDataLayout(); |
4060 | auto *ScalarTy = getLoadStoreType(I); |
4061 | if (hasIrregularType(Ty: ScalarTy, DL)) |
4062 | return false; |
4063 | |
4064 | // If the group involves a non-integral pointer, we may not be able to |
4065 | // losslessly cast all values to a common type. |
4066 | unsigned InterleaveFactor = Group->getFactor(); |
4067 | bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy); |
4068 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
4069 | Instruction *Member = Group->getMember(Index: i); |
4070 | if (!Member) |
4071 | continue; |
4072 | auto *MemberTy = getLoadStoreType(I: Member); |
4073 | bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy); |
4074 | // Don't coerce non-integral pointers to integers or vice versa. |
4075 | if (MemberNI != ScalarNI) { |
4076 | // TODO: Consider adding special nullptr value case here |
4077 | return false; |
4078 | } else if (MemberNI && ScalarNI && |
4079 | ScalarTy->getPointerAddressSpace() != |
4080 | MemberTy->getPointerAddressSpace()) { |
4081 | return false; |
4082 | } |
4083 | } |
4084 | |
4085 | // Check if masking is required. |
4086 | // A Group may need masking for one of two reasons: it resides in a block that |
4087 | // needs predication, or it was decided to use masking to deal with gaps |
4088 | // (either a gap at the end of a load-access that may result in a speculative |
4089 | // load, or any gaps in a store-access). |
4090 | bool PredicatedAccessRequiresMasking = |
4091 | blockNeedsPredicationForAnyReason(BB: I->getParent()) && |
4092 | Legal->isMaskRequired(I); |
4093 | bool LoadAccessWithGapsRequiresEpilogMasking = |
4094 | isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() && |
4095 | !isScalarEpilogueAllowed(); |
4096 | bool StoreAccessWithGapsRequiresMasking = |
4097 | isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()); |
4098 | if (!PredicatedAccessRequiresMasking && |
4099 | !LoadAccessWithGapsRequiresEpilogMasking && |
4100 | !StoreAccessWithGapsRequiresMasking) |
4101 | return true; |
4102 | |
4103 | // If masked interleaving is required, we expect that the user/target had |
4104 | // enabled it, because otherwise it either wouldn't have been created or |
4105 | // it should have been invalidated by the CostModel. |
4106 | assert(useMaskedInterleavedAccesses(TTI) && |
4107 | "Masked interleave-groups for predicated accesses are not enabled." ); |
4108 | |
4109 | if (Group->isReverse()) |
4110 | return false; |
4111 | |
4112 | auto *Ty = getLoadStoreType(I); |
4113 | const Align Alignment = getLoadStoreAlignment(I); |
4114 | return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment) |
4115 | : TTI.isLegalMaskedStore(DataType: Ty, Alignment); |
4116 | } |
4117 | |
4118 | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( |
4119 | Instruction *I, ElementCount VF) { |
4120 | // Get and ensure we have a valid memory instruction. |
4121 | assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction" ); |
4122 | |
4123 | auto *Ptr = getLoadStorePointerOperand(V: I); |
4124 | auto *ScalarTy = getLoadStoreType(I); |
4125 | |
4126 | // In order to be widened, the pointer should be consecutive, first of all. |
4127 | if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr)) |
4128 | return false; |
4129 | |
4130 | // If the instruction is a store located in a predicated block, it will be |
4131 | // scalarized. |
4132 | if (isScalarWithPredication(I, VF)) |
4133 | return false; |
4134 | |
4135 | // If the instruction's allocated size doesn't equal it's type size, it |
4136 | // requires padding and will be scalarized. |
4137 | auto &DL = I->getModule()->getDataLayout(); |
4138 | if (hasIrregularType(Ty: ScalarTy, DL)) |
4139 | return false; |
4140 | |
4141 | return true; |
4142 | } |
4143 | |
// Populate Uniforms[VF] with the instructions that only demand lane 0 after
// vectorization (a "uniform use" here does NOT imply all lanes compute the
// same value). The analysis is a backward fixed-point over a worklist.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Insert an (initially empty) entry so this VF is not analyzed again even
  // if no uniform value is found; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Val: Latch->getTerminator()->getOperand(i: 0));
  if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto isUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if I's widening decision keeps the pointer operand uniform
  // (wide consecutive/reverse/interleaved access, or a uniform mem op).
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (isUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (isUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform.  A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        // Side-effect-free markers with invariant operands need only a
        // single (lane-0) copy after vectorization.
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      if (isUniformMemOpUse(&I))
        addToWorklistIfAllowed(&I);

      if (isVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users.  Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    auto UsersAreMemAccesses =
      llvm::all_of(Range: I->users(), P: [&](User *U) -> bool {
        return isVectorizedMemAccessUse(cast<Instruction>(Val: U), V);
      });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
          auto *I = cast<Instruction>(Val: U);
          return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(I: Worklist.begin(), E: Worklist.end());
}
4370 | |
4371 | bool LoopVectorizationCostModel::runtimeChecksRequired() { |
4372 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n" ); |
4373 | |
4374 | if (Legal->getRuntimePointerChecking()->Need) { |
4375 | reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz" , |
4376 | OREMsg: "runtime pointer checks needed. Enable vectorization of this " |
4377 | "loop with '#pragma clang loop vectorize(enable)' when " |
4378 | "compiling with -Os/-Oz" , |
4379 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4380 | return true; |
4381 | } |
4382 | |
4383 | if (!PSE.getPredicate().isAlwaysTrue()) { |
4384 | reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz" , |
4385 | OREMsg: "runtime SCEV checks needed. Enable vectorization of this " |
4386 | "loop with '#pragma clang loop vectorize(enable)' when " |
4387 | "compiling with -Os/-Oz" , |
4388 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4389 | return true; |
4390 | } |
4391 | |
4392 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
4393 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
4394 | reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count" , |
4395 | OREMsg: "runtime stride == 1 checks needed. Enable vectorization of " |
4396 | "this loop without such check by compiling with -Os/-Oz" , |
4397 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4398 | return true; |
4399 | } |
4400 | |
4401 | return false; |
4402 | } |
4403 | |
4404 | ElementCount |
4405 | LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
4406 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) |
4407 | return ElementCount::getScalable(MinVal: 0); |
4408 | |
4409 | if (Hints->isScalableVectorizationDisabled()) { |
4410 | reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled" , |
4411 | ORETag: "ScalableVectorizationDisabled" , ORE, TheLoop); |
4412 | return ElementCount::getScalable(MinVal: 0); |
4413 | } |
4414 | |
4415 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n" ); |
4416 | |
4417 | auto MaxScalableVF = ElementCount::getScalable( |
4418 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
4419 | |
4420 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
4421 | // FIXME: While for scalable vectors this is currently sufficient, this should |
4422 | // be replaced by a more detailed mechanism that filters out specific VFs, |
4423 | // instead of invalidating vectorization for a whole set of VFs based on the |
4424 | // MaxVF. |
4425 | |
4426 | // Disable scalable vectorization if the loop contains unsupported reductions. |
4427 | if (!canVectorizeReductions(VF: MaxScalableVF)) { |
4428 | reportVectorizationInfo( |
4429 | Msg: "Scalable vectorization not supported for the reduction " |
4430 | "operations found in this loop." , |
4431 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4432 | return ElementCount::getScalable(MinVal: 0); |
4433 | } |
4434 | |
4435 | // Disable scalable vectorization if the loop contains any instructions |
4436 | // with element types not supported for scalable vectors. |
4437 | if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) { |
4438 | return !Ty->isVoidTy() && |
4439 | !this->TTI.isElementTypeLegalForScalableVector(Ty); |
4440 | })) { |
4441 | reportVectorizationInfo(Msg: "Scalable vectorization is not supported " |
4442 | "for all element types found in this loop." , |
4443 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4444 | return ElementCount::getScalable(MinVal: 0); |
4445 | } |
4446 | |
4447 | if (Legal->isSafeForAnyVectorWidth()) |
4448 | return MaxScalableVF; |
4449 | |
4450 | // Limit MaxScalableVF by the maximum safe dependence distance. |
4451 | if (std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI)) |
4452 | MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale); |
4453 | else |
4454 | MaxScalableVF = ElementCount::getScalable(MinVal: 0); |
4455 | |
4456 | if (!MaxScalableVF) |
4457 | reportVectorizationInfo( |
4458 | Msg: "Max legal vector width too small, scalable vectorization " |
4459 | "unfeasible." , |
4460 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4461 | |
4462 | return MaxScalableVF; |
4463 | } |
4464 | |
// Compute the maximum feasible fixed and scalable VFs, honoring the
// user-specified VF hint when it is safe and clamping/ignoring it (with a
// remark) when it is not, then bounding each kind by target limits.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable UserVF but the target cannot do scalable vectors at all:
    // report that the hint is dropped entirely.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      // Scalable UserVF exceeds the safe bound: ignore it (do not clamp).
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Default result: fixed VF of 1 (scalar), no scalable VF; each half is
  // upgraded below if the target can do better.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
4569 | |
4570 | FixedScalableVFPair |
4571 | LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { |
4572 | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { |
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
4575 | reportVectorizationFailure( |
4576 | DebugMsg: "Not inserting runtime ptr check for divergent target" , |
4577 | OREMsg: "runtime pointer checks needed. Not enabled for divergent target" , |
4578 | ORETag: "CantVersionLoopWithDivergentTarget" , ORE, TheLoop); |
4579 | return FixedScalableVFPair::getNone(); |
4580 | } |
4581 | |
4582 | unsigned TC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
4583 | unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(L: TheLoop); |
4584 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); |
4585 | if (TC == 1) { |
4586 | reportVectorizationFailure(DebugMsg: "Single iteration (non) loop" , |
4587 | OREMsg: "loop trip count is one, irrelevant for vectorization" , |
4588 | ORETag: "SingleIterationLoop" , ORE, TheLoop); |
4589 | return FixedScalableVFPair::getNone(); |
4590 | } |
4591 | |
4592 | switch (ScalarEpilogueStatus) { |
4593 | case CM_ScalarEpilogueAllowed: |
4594 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4595 | case CM_ScalarEpilogueNotAllowedUsePredicate: |
4596 | [[fallthrough]]; |
4597 | case CM_ScalarEpilogueNotNeededUsePredicate: |
4598 | LLVM_DEBUG( |
4599 | dbgs() << "LV: vector predicate hint/switch found.\n" |
4600 | << "LV: Not allowing scalar epilogue, creating predicated " |
4601 | << "vector loop.\n" ); |
4602 | break; |
4603 | case CM_ScalarEpilogueNotAllowedLowTripLoop: |
4604 | // fallthrough as a special case of OptForSize |
4605 | case CM_ScalarEpilogueNotAllowedOptSize: |
4606 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) |
4607 | LLVM_DEBUG( |
4608 | dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n" ); |
4609 | else |
4610 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " |
4611 | << "count.\n" ); |
4612 | |
4613 | // Bail if runtime checks are required, which are not good when optimising |
4614 | // for size. |
4615 | if (runtimeChecksRequired()) |
4616 | return FixedScalableVFPair::getNone(); |
4617 | |
4618 | break; |
4619 | } |
4620 | |
4621 | // The only loops we can vectorize without a scalar epilogue, are loops with |
4622 | // a bottom-test and a single exiting block. We'd have to handle the fact |
4623 | // that not every instruction executes on the last iteration. This will |
4624 | // require a lane mask which varies through the vector loop body. (TODO) |
4625 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
4626 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4627 | // masking, fallback to a vectorization with a scalar epilogue. |
4628 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4629 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4630 | "scalar epilogue instead.\n" ); |
4631 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4632 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4633 | } |
4634 | return FixedScalableVFPair::getNone(); |
4635 | } |
4636 | |
4637 | // Now try the tail folding |
4638 | |
4639 | // Invalidate interleave groups that require an epilogue if we can't mask |
4640 | // the interleave-group. |
4641 | if (!useMaskedInterleavedAccesses(TTI)) { |
4642 | assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && |
4643 | "No decisions should have been taken at this point" ); |
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
4646 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
4647 | } |
4648 | |
4649 | FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true); |
4650 | |
4651 | // Avoid tail folding if the trip count is known to be a multiple of any VF |
4652 | // we choose. |
4653 | std::optional<unsigned> MaxPowerOf2RuntimeVF = |
4654 | MaxFactors.FixedVF.getFixedValue(); |
4655 | if (MaxFactors.ScalableVF) { |
4656 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
4657 | if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { |
4658 | MaxPowerOf2RuntimeVF = std::max<unsigned>( |
4659 | a: *MaxPowerOf2RuntimeVF, |
4660 | b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); |
4661 | } else |
4662 | MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. |
4663 | } |
4664 | |
4665 | if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { |
4666 | assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && |
4667 | "MaxFixedVF must be a power of 2" ); |
4668 | unsigned MaxVFtimesIC = |
4669 | UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; |
4670 | ScalarEvolution *SE = PSE.getSE(); |
4671 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
4672 | const SCEV *ExitCount = SE->getAddExpr( |
4673 | LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType())); |
4674 | const SCEV *Rem = SE->getURemExpr( |
4675 | LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop), |
4676 | RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC)); |
4677 | if (Rem->isZero()) { |
4678 | // Accept MaxFixedVF if we do not have a tail. |
4679 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n" ); |
4680 | return MaxFactors; |
4681 | } |
4682 | } |
4683 | |
4684 | // If we don't know the precise trip count, or if the trip count that we |
4685 | // found modulo the vectorization factor is not zero, try to fold the tail |
4686 | // by masking. |
4687 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
4688 | setTailFoldingStyles(IsScalableVF: MaxFactors.ScalableVF.isScalable(), UserIC); |
4689 | if (foldTailByMasking()) { |
4690 | if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
4691 | LLVM_DEBUG( |
4692 | dbgs() |
4693 | << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " |
4694 | "try to generate VP Intrinsics with scalable vector " |
4695 | "factors only.\n" ); |
4696 | // Tail folded loop using VP intrinsics restricts the VF to be scalable |
4697 | // for now. |
4698 | // TODO: extend it for fixed vectors, if required. |
4699 | assert(MaxFactors.ScalableVF.isScalable() && |
4700 | "Expected scalable vector factor." ); |
4701 | |
4702 | MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1); |
4703 | } |
4704 | return MaxFactors; |
4705 | } |
4706 | |
4707 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4708 | // masking, fallback to a vectorization with a scalar epilogue. |
4709 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4710 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4711 | "scalar epilogue instead.\n" ); |
4712 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4713 | return MaxFactors; |
4714 | } |
4715 | |
4716 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { |
4717 | LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n" ); |
4718 | return FixedScalableVFPair::getNone(); |
4719 | } |
4720 | |
4721 | if (TC == 0) { |
4722 | reportVectorizationFailure( |
4723 | DebugMsg: "Unable to calculate the loop count due to complex control flow" , |
4724 | OREMsg: "unable to calculate the loop count due to complex control flow" , |
4725 | ORETag: "UnknownLoopCountComplexCFG" , ORE, TheLoop); |
4726 | return FixedScalableVFPair::getNone(); |
4727 | } |
4728 | |
4729 | reportVectorizationFailure( |
4730 | DebugMsg: "Cannot optimize for size and vectorize at the same time." , |
4731 | OREMsg: "cannot optimize for size and vectorize at the same time. " |
4732 | "Enable vectorization of this loop with '#pragma clang loop " |
4733 | "vectorize(enable)' when compiling with -Os/-Oz" , |
4734 | ORETag: "NoTailLoopWithOptForSize" , ORE, TheLoop); |
4735 | return FixedScalableVFPair::getNone(); |
4736 | } |
4737 | |
// Compute the maximum VF the target can handle: start from the
// dependence-safe bound MaxSafeVF, clamp to what fits in the widest vector
// register, clamp further for small known trip counts, and optionally widen
// when the target prefers maximizing vector bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, bool FoldTailByMasking) {
  // Whether we size scalable or fixed-width vectors is dictated by the kind
  // of the dependence-safe bound.
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                             : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
      Scalable: ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  // If not even one widest-type element fits in the widest register of the
  // requested kind, fall back to a scalar VF of 1.
  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(MinVal: 1);
  }

  // For scalable vectors, refine the known-minimum lane count using the
  // function's vscale_range attribute minimum, when present.
  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
  if (MaxVectorElementCount.isScalable() &&
      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    WidestRegisterMinEC *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
    MaxTripCount -= 1;

  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC (as done in the loop below). Select
    // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
    // scalable, we only fall back on a fixed VF when the TC is less than or
    // equal to the known number of lanes.
    auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedUpperTripCount << "\n");
    return ElementCount::get(
        MinVal: ClampedUpperTripCount,
        Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
  }

  TargetTransformInfo::RegisterKind RegKind =
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;
  ElementCount MaxVF = MaxVectorElementCount;
  // Consider VFs sized by the smallest element type (instead of the widest)
  // when the target wants maximum vector bandwidth, or when wider VFs may
  // enable vector call variants.
  if (MaximizeBandwidth ||
      (MaximizeBandwidth.getNumOccurrences() == 0 &&
       (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
        Scalable: ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(LHS: VS, RHS: MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(Elt: VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Never go below the target-mandated minimum VF for the smallest type.
    if (ElementCount MinVF =
            TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    // Invalidate any widening decisions we might have made, in case the loop
    // requires prediction (decided later), but we have already made some
    // load/store widening decisions.
    invalidateCostModelingDecisions();
  }
  return MaxVF;
}
4852 | |
4853 | /// Convenience function that returns the value of vscale_range iff |
4854 | /// vscale_range.min == vscale_range.max or otherwise returns the value |
4855 | /// returned by the corresponding TTI method. |
4856 | static std::optional<unsigned> |
4857 | getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { |
4858 | const Function *Fn = L->getHeader()->getParent(); |
4859 | if (Fn->hasFnAttribute(Attribute::VScaleRange)) { |
4860 | auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); |
4861 | auto Min = Attr.getVScaleRangeMin(); |
4862 | auto Max = Attr.getVScaleRangeMax(); |
4863 | if (Max && Min == Max) |
4864 | return Max; |
4865 | } |
4866 | |
4867 | return TTI.getVScaleForTuning(); |
4868 | } |
4869 | |
// Return true if vectorization factor A is more profitable than B. When the
// maximum trip count is a known constant and both factors are fixed-width,
// compare estimated total loop cost; otherwise compare per-lane cost using
// cross-multiplication to avoid FP division.
bool LoopVectorizationPlanner::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(L: OrigLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
                                             InstructionCost VectorCost,
                                             InstructionCost ScalarCost) {
      return CM.foldTailByMasking() ? VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF)
                                    : VectorCost * (MaxTripCount / VF) +
                                          ScalarCost * (MaxTripCount % VF);
    };
    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);

    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) {
    if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  // Note the deliberate "<=" (rather than "<"): it breaks cost ties in
  // favor of the scalable factor A.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
4920 | |
4921 | static void (SmallVector<InstructionVFPair> InvalidCosts, |
4922 | OptimizationRemarkEmitter *ORE, |
4923 | Loop *TheLoop) { |
4924 | if (InvalidCosts.empty()) |
4925 | return; |
4926 | |
4927 | // Emit a report of VFs with invalid costs in the loop. |
4928 | |
4929 | // Group the remarks per instruction, keeping the instruction order from |
4930 | // InvalidCosts. |
4931 | std::map<Instruction *, unsigned> Numbering; |
4932 | unsigned I = 0; |
4933 | for (auto &Pair : InvalidCosts) |
4934 | if (!Numbering.count(x: Pair.first)) |
4935 | Numbering[Pair.first] = I++; |
4936 | |
4937 | // Sort the list, first on instruction(number) then on VF. |
4938 | sort(C&: InvalidCosts, Comp: [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { |
4939 | if (Numbering[A.first] != Numbering[B.first]) |
4940 | return Numbering[A.first] < Numbering[B.first]; |
4941 | ElementCountComparator ECC; |
4942 | return ECC(A.second, B.second); |
4943 | }); |
4944 | |
4945 | // For a list of ordered instruction-vf pairs: |
4946 | // [(load, vf1), (load, vf2), (store, vf1)] |
4947 | // Group the instructions together to emit separate remarks for: |
4948 | // load (vf1, vf2) |
4949 | // store (vf1) |
4950 | auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); |
4951 | auto Subset = ArrayRef<InstructionVFPair>(); |
4952 | do { |
4953 | if (Subset.empty()) |
4954 | Subset = Tail.take_front(N: 1); |
4955 | |
4956 | Instruction *I = Subset.front().first; |
4957 | |
4958 | // If the next instruction is different, or if there are no other pairs, |
4959 | // emit a remark for the collated subset. e.g. |
4960 | // [(load, vf1), (load, vf2))] |
4961 | // to emit: |
4962 | // remark: invalid costs for 'load' at VF=(vf, vf2) |
4963 | if (Subset == Tail || Tail[Subset.size()].first != I) { |
4964 | std::string OutString; |
4965 | raw_string_ostream OS(OutString); |
4966 | assert(!Subset.empty() && "Unexpected empty range" ); |
4967 | OS << "Instruction with invalid costs prevented vectorization at VF=(" ; |
4968 | for (const auto &Pair : Subset) |
4969 | OS << (Pair.second == Subset.front().second ? "" : ", " ) << Pair.second; |
4970 | OS << "):" ; |
4971 | if (auto *CI = dyn_cast<CallInst>(Val: I)) |
4972 | OS << " call to " << CI->getCalledFunction()->getName(); |
4973 | else |
4974 | OS << " " << I->getOpcodeName(); |
4975 | OS.flush(); |
4976 | reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost" , ORE, TheLoop, I); |
4977 | Tail = Tail.drop_front(N: Subset.size()); |
4978 | Subset = {}; |
4979 | } else |
4980 | // Grow the subset by one element |
4981 | Subset = Tail.take_front(N: Subset.size() + 1); |
4982 | } while (!Tail.empty()); |
4983 | } |
4984 | |
// Choose the most profitable vectorization factor among VFCandidates by
// comparing each candidate's expected cost against the scalar loop's cost.
// Returns the scalar factor when no vector candidate is more profitable (or
// when conditional-store vectorization is disabled and the loop needs it).
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost =
      CM.expectedCost(VF: ElementCount::getFixed(MinVal: 1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(MinVal: 1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  // Collects (instruction, VF) pairs with invalid costs for a remark below.
  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    LoopVectorizationCostModel::VectorizationCostTy C =
        CM.expectedCost(VF: i, Invalid: &InvalidCosts);
    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale =
        getVScaleForTuning(L: OrigLoop, TTI).value_or(u: 1);
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    // C.second is false when the candidate would emit no vector instructions
    // at all; skip it unless the user forced vectorization.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(A: Candidate, B: ScalarCost))
      ProfitableVFs.push_back(Elt: Candidate);

    if (isMoreProfitable(A: Candidate, B: ChosenFactor))
      ChosenFactor = Candidate;
  }

  emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop: OrigLoop);

  // Conditional stores require predication support; without it, fall back to
  // the scalar loop and report why.
  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        DebugMsg: "There are conditional stores.",
        OREMsg: "store that is conditionally executed prevents vectorization",
        ORETag: "ConditionalStore", ORE, TheLoop: OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
5063 | |
// Return true if the loop has no feature (cross-iteration phis, induction
// uses outside the loop, non-latch exits) that epilogue vectorization cannot
// currently handle.
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(Range: OrigLoop->getHeader()->phis(),
             P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs auditing and
  // testing.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}
5095 | |
// Crude profitability heuristic for epilogue vectorization: require the
// target to opt in, require it to value interleaving, and require the
// (vscale-scaled) main-loop VF to reach a minimum threshold.
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF) <= 1)
    return false;

  // Scale a scalable VF by the tuning vscale (default 1) so the threshold
  // compares estimated runtime lanes, not the known-minimum lane count.
  unsigned Multiplier = 1;
  if (VF.isScalable())
    Multiplier = getVScaleForTuning(L: TheLoop, TTI).value_or(u: 1);
  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
    return true;
  return false;
}
5119 | |
// Pick a vectorization factor for the epilogue loop given the main loop's VF
// and interleave count. Returns VectorizationFactor::Disabled() when epilogue
// vectorization is off, unsupported, forced to an unviable factor, or not
// profitable; otherwise returns the best profitable candidate narrower than
// the main loop's (estimated runtime) VF.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  // A forced factor bypasses the cost model entirely, but must still have a
  // VPlan built for it.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};
    else {
      LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                           "viable.\n");
      return Result;
    }
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
      OrigLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = MainLoopVF;
  if (MainLoopVF.isScalable()) {
    EstimatedRuntimeVF = ElementCount::getFixed(MinVal: MainLoopVF.getKnownMinValue());
    if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI))
      EstimatedRuntimeVF *= *VScale;
  }

  ScalarEvolution &SE = *PSE.getSE();
  Type *TCType = Legal->getWidestInductionType();
  // Computed lazily below, and only for the fixed-VF case.
  const SCEV *RemainingIterations = nullptr;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the estimate runtime VF (scalable
    // vectors) or the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
        ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        const SCEV *TC = createTripCountSCEV(IdxTy: TCType, PSE, OrigLoop);
        RemainingIterations = SE.getURemExpr(
            LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getKnownMinValue() * IC));
      }
      if (SE.isKnownPredicate(
              Pred: CmpInst::ICMP_UGT,
              LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getKnownMinValue()),
              RHS: RemainingIterations))
        continue;
    }

    if (Result.Width.isScalar() || isMoreProfitable(A: NextVF, B: Result))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
5217 | |
// Return {smallest, widest} element-type width in bits among the types
// collected for widening; falls back to reduction recurrence types when no
// loads/stores contributed any element types.
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  // MinWidth starts at "infinity" so any real type narrows it; MaxWidth
  // starts at 8 (the smallest meaningful element width in bits).
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
    // Reset MaxWidth so that we can find the smallest type used by recurrences
    // in the loop.
    MaxWidth = -1U;
    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MaxWidth = std::min<unsigned>(
          a: MaxWidth, b: std::min<unsigned>(
                         a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
                         b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
    }
  } else {
    for (Type *T : ElementTypesInLoop) {
      MinWidth = std::min<unsigned>(
          a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
    }
  }
  return {MinWidth, MaxWidth};
}
5249 | |
// Populate ElementTypesInLoop with the scalar element types of all loads,
// stored values, and reduction phis in the loop; these drive the
// smallest/widest type computation used to bound the VF.
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(Ptr: &I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Key: PN)->second;
        // In-loop reductions keep their scalar recurrence type and do not
        // contribute to the widening element types.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(Opcode: RdxDesc.getOpcode(),
                                      Ty: RdxDesc.getRecurrenceType(),
                                      Flags: TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(Val: &I))
        T = ST->getValueOperand()->getType();

      assert(T->isSized() &&
             "Expected the load/store/recurrence type to be sized");

      ElementTypesInLoop.insert(Ptr: T);
    }
  }
}
5292 | |
5293 | unsigned |
5294 | LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, |
5295 | InstructionCost LoopCost) { |
5296 | // -- The interleave heuristics -- |
5297 | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
5298 | // There are many micro-architectural considerations that we can't predict |
5299 | // at this level. For example, frontend pressure (on decode or fetch) due to |
5300 | // code size, or the number and capabilities of the execution ports. |
5301 | // |
5302 | // We use the following heuristics to select the interleave count: |
5303 | // 1. If the code has reductions, then we interleave to break the cross |
5304 | // iteration dependency. |
5305 | // 2. If the loop is really small, then we interleave to reduce the loop |
5306 | // overhead. |
5307 | // 3. We don't interleave if we think that we will spill registers to memory |
5308 | // due to the increased register pressure. |
5309 | |
5310 | if (!isScalarEpilogueAllowed()) |
5311 | return 1; |
5312 | |
5313 | // Do not interleave if EVL is preferred and no User IC is specified. |
5314 | if (foldTailWithEVL()) { |
5315 | LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " |
5316 | "Unroll factor forced to be 1.\n" ); |
5317 | return 1; |
5318 | } |
5319 | |
5320 | // We used the distance for the interleave count. |
5321 | if (!Legal->isSafeForAnyVectorWidth()) |
5322 | return 1; |
5323 | |
5324 | auto BestKnownTC = getSmallBestKnownTC(SE&: *PSE.getSE(), L: TheLoop); |
5325 | const bool HasReductions = !Legal->getReductionVars().empty(); |
5326 | |
5327 | // If we did not calculate the cost for VF (because the user selected the VF) |
5328 | // then we calculate the cost of VF here. |
5329 | if (LoopCost == 0) { |
5330 | LoopCost = expectedCost(VF).first; |
5331 | assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost" ); |
5332 | |
5333 | // Loop body is free and there is no need for interleaving. |
5334 | if (LoopCost == 0) |
5335 | return 1; |
5336 | } |
5337 | |
5338 | RegisterUsage R = calculateRegisterUsage(VFs: {VF})[0]; |
5339 | // We divide by these constants so assume that we have at least one |
5340 | // instruction that uses at least one register. |
5341 | for (auto& pair : R.MaxLocalUsers) { |
5342 | pair.second = std::max(a: pair.second, b: 1U); |
5343 | } |
5344 | |
5345 | // We calculate the interleave count using the following formula. |
5346 | // Subtract the number of loop invariants from the number of available |
5347 | // registers. These registers are used by all of the interleaved instances. |
5348 | // Next, divide the remaining registers by the number of registers that is |
5349 | // required by the loop, in order to estimate how many parallel instances |
5350 | // fit without causing spills. All of this is rounded down if necessary to be |
5351 | // a power of two. We want power of two interleave count to simplify any |
5352 | // addressing operations or alignment considerations. |
5353 | // We also want power of two interleave counts to ensure that the induction |
5354 | // variable of the vector loop wraps to zero, when tail is folded by masking; |
5355 | // this currently happens when OptForSize, in which case IC is set to 1 above. |
5356 | unsigned IC = UINT_MAX; |
5357 | |
5358 | for (auto& pair : R.MaxLocalUsers) { |
5359 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first); |
5360 | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters |
5361 | << " registers of " |
5362 | << TTI.getRegisterClassName(pair.first) << " register class\n" ); |
5363 | if (VF.isScalar()) { |
5364 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
5365 | TargetNumRegisters = ForceTargetNumScalarRegs; |
5366 | } else { |
5367 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5368 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5369 | } |
5370 | unsigned MaxLocalUsers = pair.second; |
5371 | unsigned LoopInvariantRegs = 0; |
5372 | if (R.LoopInvariantRegs.find(Key: pair.first) != R.LoopInvariantRegs.end()) |
5373 | LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; |
5374 | |
5375 | unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) / |
5376 | MaxLocalUsers); |
5377 | // Don't count the induction variable as interleaved. |
5378 | if (EnableIndVarRegisterHeur) { |
5379 | TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) / |
5380 | std::max(a: 1U, b: (MaxLocalUsers - 1))); |
5381 | } |
5382 | |
5383 | IC = std::min(a: IC, b: TmpIC); |
5384 | } |
5385 | |
5386 | // Clamp the interleave ranges to reasonable counts. |
5387 | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); |
5388 | |
5389 | // Check if the user has overridden the max. |
5390 | if (VF.isScalar()) { |
5391 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5392 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
5393 | } else { |
5394 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5395 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
5396 | } |
5397 | |
5398 | unsigned EstimatedVF = VF.getKnownMinValue(); |
5399 | if (VF.isScalable()) { |
5400 | if (std::optional<unsigned> VScale = getVScaleForTuning(L: TheLoop, TTI)) |
5401 | EstimatedVF *= *VScale; |
5402 | } |
5403 | assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1" ); |
5404 | |
5405 | unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
5406 | if (KnownTC > 0) { |
5407 | // At least one iteration must be scalar when this constraint holds. So the |
5408 | // maximum available iterations for interleaving is one less. |
5409 | unsigned AvailableTC = |
5410 | requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? KnownTC - 1 : KnownTC; |
5411 | |
5412 | // If trip count is known we select between two prospective ICs, where |
5413 | // 1) the aggressive IC is capped by the trip count divided by VF |
5414 | // 2) the conservative IC is capped by the trip count divided by (VF * 2) |
5415 | // The final IC is selected in a way that the epilogue loop trip count is |
5416 | // minimized while maximizing the IC itself, so that we either run the |
5417 | // vector loop at least once if it generates a small epilogue loop, or else |
5418 | // we run the vector loop at least twice. |
5419 | |
5420 | unsigned InterleaveCountUB = bit_floor( |
5421 | Value: std::max(a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount))); |
5422 | unsigned InterleaveCountLB = bit_floor(Value: std::max( |
5423 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
5424 | MaxInterleaveCount = InterleaveCountLB; |
5425 | |
5426 | if (InterleaveCountUB != InterleaveCountLB) { |
5427 | unsigned TailTripCountUB = |
5428 | (AvailableTC % (EstimatedVF * InterleaveCountUB)); |
5429 | unsigned TailTripCountLB = |
5430 | (AvailableTC % (EstimatedVF * InterleaveCountLB)); |
5431 | // If both produce same scalar tail, maximize the IC to do the same work |
5432 | // in fewer vector loop iterations |
5433 | if (TailTripCountUB == TailTripCountLB) |
5434 | MaxInterleaveCount = InterleaveCountUB; |
5435 | } |
5436 | } else if (BestKnownTC && *BestKnownTC > 0) { |
5437 | // At least one iteration must be scalar when this constraint holds. So the |
5438 | // maximum available iterations for interleaving is one less. |
5439 | unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector()) |
5440 | ? (*BestKnownTC) - 1 |
5441 | : *BestKnownTC; |
5442 | |
5443 | // If trip count is an estimated compile time constant, limit the |
5444 | // IC to be capped by the trip count divided by VF * 2, such that the vector |
5445 | // loop runs at least twice to make interleaving seem profitable when there |
5446 | // is an epilogue loop present. Since exact Trip count is not known we |
5447 | // choose to be conservative in our IC estimate. |
5448 | MaxInterleaveCount = bit_floor(Value: std::max( |
5449 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
5450 | } |
5451 | |
5452 | assert(MaxInterleaveCount > 0 && |
5453 | "Maximum interleave count must be greater than 0" ); |
5454 | |
5455 | // Clamp the calculated IC to be between the 1 and the max interleave count |
5456 | // that the target and trip count allows. |
5457 | if (IC > MaxInterleaveCount) |
5458 | IC = MaxInterleaveCount; |
5459 | else |
5460 | // Make sure IC is greater than 0. |
5461 | IC = std::max(a: 1u, b: IC); |
5462 | |
5463 | assert(IC > 0 && "Interleave count must be greater than 0." ); |
5464 | |
5465 | // Interleave if we vectorized this loop and there is a reduction that could |
5466 | // benefit from interleaving. |
5467 | if (VF.isVector() && HasReductions) { |
5468 | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n" ); |
5469 | return IC; |
5470 | } |
5471 | |
5472 | // For any scalar loop that either requires runtime checks or predication we |
5473 | // are better off leaving this to the unroller. Note that if we've already |
5474 | // vectorized the loop we will have done the runtime check and so interleaving |
5475 | // won't require further checks. |
5476 | bool ScalarInterleavingRequiresPredication = |
5477 | (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) { |
5478 | return Legal->blockNeedsPredication(BB); |
5479 | })); |
5480 | bool ScalarInterleavingRequiresRuntimePointerCheck = |
5481 | (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); |
5482 | |
5483 | // We want to interleave small loops in order to reduce the loop overhead and |
5484 | // potentially expose ILP opportunities. |
5485 | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' |
5486 | << "LV: IC is " << IC << '\n' |
5487 | << "LV: VF is " << VF << '\n'); |
5488 | const bool AggressivelyInterleaveReductions = |
5489 | TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions); |
5490 | if (!ScalarInterleavingRequiresRuntimePointerCheck && |
5491 | !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { |
5492 | // We assume that the cost overhead is 1 and we use the cost model |
5493 | // to estimate the cost of the loop and interleave until the cost of the |
5494 | // loop overhead is about 5% of the cost of the loop. |
5495 | unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>( |
5496 | Value: SmallLoopCost / *LoopCost.getValue())); |
5497 | |
5498 | // Interleave until store/load ports (estimated by max interleave count) are |
5499 | // saturated. |
5500 | unsigned NumStores = Legal->getNumStores(); |
5501 | unsigned NumLoads = Legal->getNumLoads(); |
5502 | unsigned StoresIC = IC / (NumStores ? NumStores : 1); |
5503 | unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); |
5504 | |
5505 | // There is little point in interleaving for reductions containing selects |
5506 | // and compares when VF=1 since it may just create more overhead than it's |
5507 | // worth for loops with small trip counts. This is because we still have to |
5508 | // do the final reduction after the loop. |
5509 | bool HasSelectCmpReductions = |
5510 | HasReductions && |
5511 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5512 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5513 | return RecurrenceDescriptor::isAnyOfRecurrenceKind( |
5514 | Kind: RdxDesc.getRecurrenceKind()); |
5515 | }); |
5516 | if (HasSelectCmpReductions) { |
5517 | LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n" ); |
5518 | return 1; |
5519 | } |
5520 | |
5521 | // If we have a scalar reduction (vector reductions are already dealt with |
5522 | // by this point), we can increase the critical path length if the loop |
5523 | // we're interleaving is inside another loop. For tree-wise reductions |
5524 | // set the limit to 2, and for ordered reductions it's best to disable |
5525 | // interleaving entirely. |
5526 | if (HasReductions && TheLoop->getLoopDepth() > 1) { |
5527 | bool HasOrderedReductions = |
5528 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5529 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5530 | return RdxDesc.isOrdered(); |
5531 | }); |
5532 | if (HasOrderedReductions) { |
5533 | LLVM_DEBUG( |
5534 | dbgs() << "LV: Not interleaving scalar ordered reductions.\n" ); |
5535 | return 1; |
5536 | } |
5537 | |
5538 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); |
5539 | SmallIC = std::min(a: SmallIC, b: F); |
5540 | StoresIC = std::min(a: StoresIC, b: F); |
5541 | LoadsIC = std::min(a: LoadsIC, b: F); |
5542 | } |
5543 | |
5544 | if (EnableLoadStoreRuntimeInterleave && |
5545 | std::max(a: StoresIC, b: LoadsIC) > SmallIC) { |
5546 | LLVM_DEBUG( |
5547 | dbgs() << "LV: Interleaving to saturate store or load ports.\n" ); |
5548 | return std::max(a: StoresIC, b: LoadsIC); |
5549 | } |
5550 | |
5551 | // If there are scalar reductions and TTI has enabled aggressive |
5552 | // interleaving for reductions, we will interleave to expose ILP. |
5553 | if (VF.isScalar() && AggressivelyInterleaveReductions) { |
5554 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5555 | // Interleave no less than SmallIC but not as aggressive as the normal IC |
5556 | // to satisfy the rare situation when resources are too limited. |
5557 | return std::max(a: IC / 2, b: SmallIC); |
5558 | } else { |
5559 | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n" ); |
5560 | return SmallIC; |
5561 | } |
5562 | } |
5563 | |
5564 | // Interleave if this is a large loop (small loops are already dealt with by |
5565 | // this point) that could benefit from interleaving. |
5566 | if (AggressivelyInterleaveReductions) { |
5567 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5568 | return IC; |
5569 | } |
5570 | |
5571 | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n" ); |
5572 | return 1; |
5573 | } |
5574 | |
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  // Scratch result; refilled for each VF in the final loop below and copied
  // into the returned RUs vector.
  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).
  SmallSetVector<Instruction *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(Elt: &I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(Val: U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants).
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Inst: Instr)) {
          LoopInvariants.insert(X: Instr);
          continue;
        }

        // Overwrite previous end points.
        // IdxToInstr.size() is the index of the current user I (just pushed),
        // so the interval of Instr now extends at least up to I.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Ptr: Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Elt: Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n" );

  // Number of registers needed to hold one VF-wide vector of Ty; 0 for types
  // that cannot be vector elements (e.g. tokens).
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty))
      return 0;
    return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(Ptr: ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(Ptr: I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(Ptr: I))
      continue;

    // Refresh in-loop reduction info used by the per-VF scalar/uniform queries
    // below. NOTE(review): invoked once per live instruction; presumably
    // idempotent/cached — confirm.
    collectInLoopReductions();

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VF: VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Ptr: Inst))
            continue;
          if (isScalarAfterVectorization(I: Inst, VF: VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
            // FIXME: The target might use more than one register for the type
            // even in the scalar case.
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(Vector: true, Ty: Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      // Fold this location's per-class usage into the running per-VF maximum.
      for (auto& pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(a: Entry, b: pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(Ptr: I);
  }

  // Now account for loop-invariant values and assemble one RegisterUsage
  // result per requested VF.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      // An invariant only needs a scalar register if every user inside this
      // loop stays scalar after vectorization (users outside the loop don't
      // force a vector value).
      bool IsScalar = all_of(Range: Inst->users(), P: [&](User *U) {
        auto *I = cast<Instruction>(Val: U);
        return TheLoop != LI->getLoopFor(BB: I->getParent()) ||
               isScalarAfterVectorization(I, VF: VFs[i]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[i];
      unsigned ClassID =
          TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n" ;
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n" ;
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n" ;
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n" ;
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
5778 | |
5779 | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, |
5780 | ElementCount VF) { |
5781 | // TODO: Cost model for emulated masked load/store is completely |
5782 | // broken. This hack guides the cost model to use an artificially |
5783 | // high enough value to practically disable vectorization with such |
5784 | // operations, except where previously deployed legality hack allowed |
5785 | // using very low cost values. This is to avoid regressions coming simply |
5786 | // from moving "masked load/store" check from legality to cost model. |
5787 | // Masked Load/Gather emulation was previously never allowed. |
5788 | // Limited number of Masked Store/Scatter emulation was allowed. |
5789 | assert((isPredicatedInst(I)) && |
5790 | "Expecting a scalar emulated instruction" ); |
5791 | return isa<LoadInst>(Val: I) || |
5792 | (isa<StoreInst>(Val: I) && |
5793 | NumPredStores > NumberOfStoresToPredicate); |
5794 | } |
5795 | |
5796 | void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { |
5797 | // If we aren't vectorizing the loop, or if we've already collected the |
5798 | // instructions to scalarize, there's nothing to do. Collection may already |
5799 | // have occurred if we have a user-selected VF and are now computing the |
5800 | // expected cost for interleaving. |
5801 | if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(Val: VF)) |
5802 | return; |
5803 | |
5804 | // Initialize a mapping for VF in InstsToScalalarize. If we find that it's |
5805 | // not profitable to scalarize any instructions, the presence of VF in the |
5806 | // map will indicate that we've analyzed it already. |
5807 | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
5808 | |
5809 | PredicatedBBsAfterVectorization[VF].clear(); |
5810 | |
5811 | // Find all the instructions that are scalar with predication in the loop and |
5812 | // determine if it would be better to not if-convert the blocks they are in. |
5813 | // If so, we also record the instructions to scalarize. |
5814 | for (BasicBlock *BB : TheLoop->blocks()) { |
5815 | if (!blockNeedsPredicationForAnyReason(BB)) |
5816 | continue; |
5817 | for (Instruction &I : *BB) |
5818 | if (isScalarWithPredication(I: &I, VF)) { |
5819 | ScalarCostsTy ScalarCosts; |
5820 | // Do not apply discount if scalable, because that would lead to |
5821 | // invalid scalarization costs. |
5822 | // Do not apply discount logic if hacked cost is needed |
5823 | // for emulated masked memrefs. |
5824 | if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() && |
5825 | !useEmulatedMaskMemRefHack(I: &I, VF) && |
5826 | computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) |
5827 | ScalarCostsVF.insert(I: ScalarCosts.begin(), E: ScalarCosts.end()); |
5828 | // Remember that BB will remain after vectorization. |
5829 | PredicatedBBsAfterVectorization[VF].insert(Ptr: BB); |
5830 | } |
5831 | } |
5832 | } |
5833 | |
// Estimates the cost saved by scalarizing the single-use chain of
// instructions feeding PredInst (instead of vectorizing it), filling
// ScalarCosts with the scalarized cost of each chain member. A non-negative
// result means scalarization is at least as cheap as vectorization.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated" );

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Val: I))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    // (VF.getFixedValue() is safe here: scalable VFs never reach this
    // function — collectInstsToScalarize skips the discount for them.)
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      // The scalarized value must be re-packed into a vector (one insert per
      // lane) and merged across the predicated block boundary (one phi per
      // lane).
      ScalarCost += TTI.getScalarizationOverhead(
          Ty: cast<VectorType>(Val: ToVectorTy(Scalar: I->getType(), EC: VF)),
          DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ true,
          /*Extract*/ false, CostKind);
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type" );
        if (canBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              Ty: cast<VectorType>(Val: ToVectorTy(Scalar: J->getType(), EC: VF)),
              DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
              /*Extract*/ true, CostKind);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5947 | |
5948 | LoopVectorizationCostModel::VectorizationCostTy |
5949 | LoopVectorizationCostModel::expectedCost( |
5950 | ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { |
5951 | VectorizationCostTy Cost; |
5952 | |
5953 | // For each block. |
5954 | for (BasicBlock *BB : TheLoop->blocks()) { |
5955 | VectorizationCostTy BlockCost; |
5956 | |
5957 | // For each instruction in the old loop. |
5958 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5959 | // Skip ignored values. |
5960 | if (ValuesToIgnore.count(Ptr: &I) || |
5961 | (VF.isVector() && VecValuesToIgnore.count(Ptr: &I))) |
5962 | continue; |
5963 | |
5964 | VectorizationCostTy C = getInstructionCost(I: &I, VF); |
5965 | |
5966 | // Check if we should override the cost. |
5967 | if (C.first.isValid() && |
5968 | ForceTargetInstructionCost.getNumOccurrences() > 0) |
5969 | C.first = InstructionCost(ForceTargetInstructionCost); |
5970 | |
5971 | // Keep a list of instructions with invalid costs. |
5972 | if (Invalid && !C.first.isValid()) |
5973 | Invalid->emplace_back(Args: &I, Args&: VF); |
5974 | |
5975 | BlockCost.first += C.first; |
5976 | BlockCost.second |= C.second; |
5977 | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first |
5978 | << " for VF " << VF << " For instruction: " << I |
5979 | << '\n'); |
5980 | } |
5981 | |
5982 | // If we are vectorizing a predicated block, it will have been |
5983 | // if-converted. This means that the block's instructions (aside from |
5984 | // stores and instructions that may divide by zero) will now be |
5985 | // unconditionally executed. For the scalar case, we may not always execute |
5986 | // the predicated block, if it is an if-else block. Thus, scale the block's |
5987 | // cost by the probability of executing it. blockNeedsPredication from |
5988 | // Legal is used so as to not include all blocks in tail folded loops. |
5989 | if (VF.isScalar() && Legal->blockNeedsPredication(BB)) |
5990 | BlockCost.first /= getReciprocalPredBlockProb(); |
5991 | |
5992 | Cost.first += BlockCost.first; |
5993 | Cost.second |= BlockCost.second; |
5994 | } |
5995 | |
5996 | return Cost; |
5997 | } |
5998 | |
5999 | /// Gets Address Access SCEV after verifying that the access pattern |
6000 | /// is loop invariant except the induction variable dependence. |
6001 | /// |
6002 | /// This SCEV can be sent to the Target in order to estimate the address |
6003 | /// calculation cost. |
6004 | static const SCEV *getAddressAccessSCEV( |
6005 | Value *Ptr, |
6006 | LoopVectorizationLegality *Legal, |
6007 | PredicatedScalarEvolution &PSE, |
6008 | const Loop *TheLoop) { |
6009 | |
6010 | auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr); |
6011 | if (!Gep) |
6012 | return nullptr; |
6013 | |
6014 | // We are looking for a gep with all loop invariant indices except for one |
6015 | // which should be an induction variable. |
6016 | auto SE = PSE.getSE(); |
6017 | unsigned NumOperands = Gep->getNumOperands(); |
6018 | for (unsigned i = 1; i < NumOperands; ++i) { |
6019 | Value *Opd = Gep->getOperand(i_nocapture: i); |
6020 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) && |
6021 | !Legal->isInductionVariable(V: Opd)) |
6022 | return nullptr; |
6023 | } |
6024 | |
6025 | // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. |
6026 | return PSE.getSCEV(V: Ptr); |
6027 | } |
6028 | |
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  // Cost of executing the load/store as VF independent scalar accesses,
  // including the insert/extract traffic needed to move values between
  // the scalar copies and the surrounding vectorized code.
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization." );
  // There is no mechanism to emit a scalarization loop for scalable
  // vectors, so such a VF cannot be costed here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = ToVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation,
  // paid once per (known-minimum) lane.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
                                                      Src: ValTy->getScalarType(),
                                                      Alignment, AddressSpace: AS, CostKind);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF, CostKind);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
6088 | |
6089 | InstructionCost |
6090 | LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
6091 | ElementCount VF) { |
6092 | Type *ValTy = getLoadStoreType(I); |
6093 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6094 | Value *Ptr = getLoadStorePointerOperand(V: I); |
6095 | unsigned AS = getLoadStoreAddressSpace(I); |
6096 | int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr); |
6097 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6098 | |
6099 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
6100 | "Stride should be 1 or -1 for consecutive memory access" ); |
6101 | const Align Alignment = getLoadStoreAlignment(I); |
6102 | InstructionCost Cost = 0; |
6103 | if (Legal->isMaskRequired(I)) { |
6104 | Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
6105 | CostKind); |
6106 | } else { |
6107 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
6108 | Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
6109 | CostKind, OpdInfo: OpInfo, I); |
6110 | } |
6111 | |
6112 | bool Reverse = ConsecutiveStride < 0; |
6113 | if (Reverse) |
6114 | Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy, |
6115 | Mask: std::nullopt, CostKind, Index: 0); |
6116 | return Cost; |
6117 | } |
6118 | |
6119 | InstructionCost |
6120 | LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
6121 | ElementCount VF) { |
6122 | assert(Legal->isUniformMemOp(*I, VF)); |
6123 | |
6124 | Type *ValTy = getLoadStoreType(I); |
6125 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6126 | const Align Alignment = getLoadStoreAlignment(I); |
6127 | unsigned AS = getLoadStoreAddressSpace(I); |
6128 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6129 | if (isa<LoadInst>(Val: I)) { |
6130 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6131 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS, |
6132 | CostKind) + |
6133 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VectorTy); |
6134 | } |
6135 | StoreInst *SI = cast<StoreInst>(Val: I); |
6136 | |
6137 | bool isLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand()); |
6138 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6139 | TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, |
6140 | CostKind) + |
6141 | (isLoopInvariantStoreValue |
6142 | ? 0 |
6143 | : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy, |
6144 | CostKind, Index: VF.getKnownMinValue() - 1)); |
6145 | } |
6146 | |
6147 | InstructionCost |
6148 | LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
6149 | ElementCount VF) { |
6150 | Type *ValTy = getLoadStoreType(I); |
6151 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6152 | const Align Alignment = getLoadStoreAlignment(I); |
6153 | const Value *Ptr = getLoadStorePointerOperand(V: I); |
6154 | |
6155 | return TTI.getAddressComputationCost(Ty: VectorTy) + |
6156 | TTI.getGatherScatterOpCost( |
6157 | Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, VariableMask: Legal->isMaskRequired(I), Alignment, |
6158 | CostKind: TargetTransformInfo::TCK_RecipThroughput, I); |
6159 | } |
6160 | |
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // Cost of emitting I's entire interleave group as one wide access plus
  // the shuffles needed to (de)interleave the member values.
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Fail to get an interleaved access group." );

  unsigned InterleaveFactor = Group->getFactor();
  // The wide access covers all members, i.e. VF * Factor elements.
  auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(Index: IF))
      Indices.push_back(Elt: IF);

  // Calculate the cost of the whole interleaved group. Gaps need masking
  // when a required scalar epilogue is not allowed, or when storing a group
  // with missing members.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      Opcode: I->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, Alignment: Group->getAlign(),
      AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported." );
    // Each present member additionally needs its lanes reversed.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
                               Mask: std::nullopt, CostKind, Index: 0);
  }
  return Cost;
}
6199 | |
6200 | std::optional<InstructionCost> |
6201 | LoopVectorizationCostModel::getReductionPatternCost( |
6202 | Instruction *I, ElementCount VF, Type *Ty, |
6203 | TTI::TargetCostKind CostKind) const { |
6204 | using namespace llvm::PatternMatch; |
6205 | // Early exit for no inloop reductions |
6206 | if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty)) |
6207 | return std::nullopt; |
6208 | auto *VectorTy = cast<VectorType>(Val: Ty); |
6209 | |
6210 | // We are looking for a pattern of, and finding the minimal acceptable cost: |
6211 | // reduce(mul(ext(A), ext(B))) or |
6212 | // reduce(mul(A, B)) or |
6213 | // reduce(ext(A)) or |
6214 | // reduce(A). |
6215 | // The basic idea is that we walk down the tree to do that, finding the root |
6216 | // reduction instruction in InLoopReductionImmediateChains. From there we find |
6217 | // the pattern of mul/ext and test the cost of the entire pattern vs the cost |
6218 | // of the components. If the reduction cost is lower then we return it for the |
6219 | // reduction instruction and 0 for the other instructions in the pattern. If |
6220 | // it is not we return an invalid cost specifying the orignal cost method |
6221 | // should be used. |
6222 | Instruction *RetI = I; |
6223 | if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) { |
6224 | if (!RetI->hasOneUser()) |
6225 | return std::nullopt; |
6226 | RetI = RetI->user_back(); |
6227 | } |
6228 | |
6229 | if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) && |
6230 | RetI->user_back()->getOpcode() == Instruction::Add) { |
6231 | RetI = RetI->user_back(); |
6232 | } |
6233 | |
6234 | // Test if the found instruction is a reduction, and if not return an invalid |
6235 | // cost specifying the parent to use the original cost modelling. |
6236 | if (!InLoopReductionImmediateChains.count(Val: RetI)) |
6237 | return std::nullopt; |
6238 | |
6239 | // Find the reduction this chain is a part of and calculate the basic cost of |
6240 | // the reduction on its own. |
6241 | Instruction *LastChain = InLoopReductionImmediateChains.at(Val: RetI); |
6242 | Instruction *ReductionPhi = LastChain; |
6243 | while (!isa<PHINode>(Val: ReductionPhi)) |
6244 | ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi); |
6245 | |
6246 | const RecurrenceDescriptor &RdxDesc = |
6247 | Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second; |
6248 | |
6249 | InstructionCost BaseCost = TTI.getArithmeticReductionCost( |
6250 | Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind); |
6251 | |
6252 | // For a call to the llvm.fmuladd intrinsic we need to add the cost of a |
6253 | // normal fmul instruction to the cost of the fadd reduction. |
6254 | if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) |
6255 | BaseCost += |
6256 | TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind); |
6257 | |
6258 | // If we're using ordered reductions then we can just return the base cost |
6259 | // here, since getArithmeticReductionCost calculates the full ordered |
6260 | // reduction cost when FP reassociation is not allowed. |
6261 | if (useOrderedReductions(RdxDesc)) |
6262 | return BaseCost; |
6263 | |
6264 | // Get the operand that was not the reduction chain and match it to one of the |
6265 | // patterns, returning the better cost if it is found. |
6266 | Instruction *RedOp = RetI->getOperand(i: 1) == LastChain |
6267 | ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0)) |
6268 | : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1)); |
6269 | |
6270 | VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy); |
6271 | |
6272 | Instruction *Op0, *Op1; |
6273 | if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
6274 | match(V: RedOp, |
6275 | P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) && |
6276 | match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
6277 | Op0->getOpcode() == Op1->getOpcode() && |
6278 | Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() && |
6279 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) && |
6280 | (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { |
6281 | |
6282 | // Matched reduce.add(ext(mul(ext(A), ext(B))) |
6283 | // Note that the extend opcodes need to all match, or if A==B they will have |
6284 | // been converted to zext(mul(sext(A), sext(A))) as it is known positive, |
6285 | // which is equally fine. |
6286 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
6287 | auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy); |
6288 | auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy); |
6289 | |
6290 | InstructionCost ExtCost = |
6291 | TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType, |
6292 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
6293 | InstructionCost MulCost = |
6294 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind); |
6295 | InstructionCost Ext2Cost = |
6296 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType, |
6297 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
6298 | |
6299 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6300 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
6301 | |
6302 | if (RedCost.isValid() && |
6303 | RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) |
6304 | return I == RetI ? RedCost : 0; |
6305 | } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) && |
6306 | !TheLoop->isLoopInvariant(V: RedOp)) { |
6307 | // Matched reduce(ext(A)) |
6308 | bool IsUnsigned = isa<ZExtInst>(Val: RedOp); |
6309 | auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy); |
6310 | InstructionCost RedCost = TTI.getExtendedReductionCost( |
6311 | Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, |
6312 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
6313 | |
6314 | InstructionCost ExtCost = |
6315 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType, |
6316 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
6317 | if (RedCost.isValid() && RedCost < BaseCost + ExtCost) |
6318 | return I == RetI ? RedCost : 0; |
6319 | } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
6320 | match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) { |
6321 | if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
6322 | Op0->getOpcode() == Op1->getOpcode() && |
6323 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) { |
6324 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
6325 | Type *Op0Ty = Op0->getOperand(i: 0)->getType(); |
6326 | Type *Op1Ty = Op1->getOperand(i: 0)->getType(); |
6327 | Type *LargestOpTy = |
6328 | Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty |
6329 | : Op0Ty; |
6330 | auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy); |
6331 | |
6332 | // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of |
6333 | // different sizes. We take the largest type as the ext to reduce, and add |
6334 | // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). |
6335 | InstructionCost ExtCost0 = TTI.getCastInstrCost( |
6336 | Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy), |
6337 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
6338 | InstructionCost ExtCost1 = TTI.getCastInstrCost( |
6339 | Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy), |
6340 | CCH: TTI::CastContextHint::None, CostKind, I: Op1); |
6341 | InstructionCost MulCost = |
6342 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6343 | |
6344 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6345 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
6346 | InstructionCost = 0; |
6347 | if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { |
6348 | Instruction * = (Op0Ty != LargestOpTy) ? Op0 : Op1; |
6349 | ExtraExtCost = TTI.getCastInstrCost( |
6350 | Opcode: ExtraExtOp->getOpcode(), Dst: ExtType, |
6351 | Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy), |
6352 | CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp); |
6353 | } |
6354 | |
6355 | if (RedCost.isValid() && |
6356 | (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) |
6357 | return I == RetI ? RedCost : 0; |
6358 | } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) { |
6359 | // Matched reduce.add(mul()) |
6360 | InstructionCost MulCost = |
6361 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6362 | |
6363 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6364 | IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind); |
6365 | |
6366 | if (RedCost.isValid() && RedCost < MulCost + BaseCost) |
6367 | return I == RetI ? RedCost : 0; |
6368 | } |
6369 | } |
6370 | |
6371 | return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; |
6372 | } |
6373 | |
6374 | InstructionCost |
6375 | LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
6376 | ElementCount VF) { |
6377 | // Calculate scalar cost only. Vectorization cost should be ready at this |
6378 | // moment. |
6379 | if (VF.isScalar()) { |
6380 | Type *ValTy = getLoadStoreType(I); |
6381 | const Align Alignment = getLoadStoreAlignment(I); |
6382 | unsigned AS = getLoadStoreAddressSpace(I); |
6383 | |
6384 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
6385 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6386 | TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, |
6387 | CostKind: TTI::TCK_RecipThroughput, OpdInfo: OpInfo, I); |
6388 | } |
6389 | return getWideningCost(I, VF); |
6390 | } |
6391 | |
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // Returns I's cost at VF together with a flag indicating whether the
  // computed vector type avoids being legalized back into scalar parts.
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      // Cost is the scalar cost repeated once per (known-minimum) lane.
      return VectorizationCostTy(
          (getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    if (unsigned NumParts = TTI.getNumberOfParts(Tp: VectorTy)) {
      if (VF.isScalable())
        // <vscale x 1 x iN> is assumed to be profitable over iN because
        // scalable registers are a distinct register class from scalar ones.
        // If we ever find a target which wants to lower scalable vectors
        // back to scalars, we'll need to update this code to explicitly
        // ask TTI about the register class uses for each part.
        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
      else
        TypeNotScalarized = NumParts < VF.getKnownMinValue();
    } else
      // Zero parts means the target cannot legalize this type at all.
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}
6434 | |
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
  // Estimates the insert/extract traffic needed when I is executed as VF
  // scalar copies inside otherwise-vectorized code.

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(Scalar: I->getType(), EC: VF);
  // Cost of inserting the VF scalar results back into a vector, unless the
  // target can store/load individual vector elements efficiently for loads.
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        Ty: cast<VectorType>(Val: RetTy), DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(Val: I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(Elt: MaybeVectorizeType(Elt: V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
}
6475 | |
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decides, for every memory instruction in the loop, how it should be
  // handled at this VF (widen, widen-reversed, interleave, gather/scatter or
  // scalarize), based on the relative costs, and records the decision so the
  // main costing pass and the vectorizer agree.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
          getUniformMemOpCost(I: &I, VF) : InstructionCost::getInvalid();

        // Choose better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride." );
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        auto Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group." );

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(Instr: &I))
        setWideningDecision(Grp: Group, VF, W: Decision, Cost);
      else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
      else if (auto Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(Index: I))
            setWideningDecision(
                I: Member, VF, W: CM_Scalarize,
                Cost: (VF.getKnownMinValue() *
                      getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
  }
}
6661 | |
/// For every call in the loop, decide how it will be widened at \p VF:
/// scalarized (CM_Scalarize), replaced by a vector library variant
/// (CM_VectorCall), or lowered to a vector intrinsic (CM_IntrinsicCall).
/// The cheapest of the costed alternatives wins and is recorded via
/// setCallWideningDecision for later use when building recipes.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF" );

  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Invalid costs mean "alternative not available"; a decision is only
      // taken in favor of an option whose cost was actually computed below.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      bool MaskRequired = Legal->isMaskRequired(I: CI);
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = ToVectorTy(Scalar: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: ToVectorTy(Scalar: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy, CostKind)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost =
          getScalarizationOverhead(I: CI, VF, CostKind);

      // Note: for a scalable VF this multiplies by the known minimum element
      // count, so the scalarization estimate is a lower bound.
      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      bool UsesMask = false;
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            // A masked variant is still usable when no mask is required, but
            // one must then be synthesized (costed further below).
            UsesMask = true;
            break;
          default:
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      // Add in the cost of synthesizing a mask if one wasn't required.
      InstructionCost MaskCost = 0;
      if (VecFunc && UsesMask && !MaskRequired)
        MaskCost = TTI.getShuffleCost(
            Kind: TargetTransformInfo::SK_Broadcast,
            Tp: VectorType::get(ElementType: IntegerType::getInt1Ty(
                                  C&: VecFunc->getFunctionType()->getContext()),
                              EC: VF));

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost =
            TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind) + MaskCost;

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Ties are broken (<=) in favor of the later alternatives: a vector
      // call beats scalarization, and an intrinsic beats both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
6823 | |
/// Compute the expected cost of instruction \p I when vectorized with
/// factor \p VF. \p VectorTy is an out-parameter set to the type the
/// instruction was costed at: the (possibly bit-width-minimized) scalar type
/// when I stays scalar after vectorization, otherwise its widened vector
/// type. Returns an invalid cost for combinations that cannot be vectorized
/// (e.g. scalarized predicated blocks at scalable VFs).
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  // If the result is known to be shrinkable, cost the instruction at its
  // minimal integer bit width rather than the declared type.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Assertion helper: true if vectorization produces exactly one copy of I,
  // i.e. neither I nor any of its users was chosen for scalarization at VF.
  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(Val: VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability" );
    return !Scalarized->second.count(Val: I) &&
           llvm::all_of(Range: I->users(), P: [&](User *U) {
             auto *UI = cast<Instruction>(Val: U);
             return !Scalarized->second.count(Val: UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(Scalar: RetTy, EC: VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(Val: I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (
          TTI.getScalarizationOverhead(
              Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Costed as a splice shuffle rotating the last element of the previous
      // iteration's vector into lane 0.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                Tp: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: ToVectorTy(Scalar: Phi->getType(), EC: VF),
                 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // A div/rem that needs predication is either scalarized with predication
    // or rewritten to use a "safe" divisor; pick the cheaper strategy.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away.  We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        (PSE.getSCEV(V: I->getOperand(i: 0))->isOne() ||
         PSE.getSCEV(V: I->getOperand(i: 1))->isOne()))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        Legal->isInvariant(V: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    // Logical and/or selects lower to bitwise ops; cost those instead.
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
          CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();
    // Compares are costed at the (possibly shrunk) width of their operand,
    // not the i1 result.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: nullptr,
                                  VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // Use the widening decision taken earlier; a CM_Scalarize decision means
    // the memory op is costed at the scalar width.
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point" );
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = ToVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!" );

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?" );
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision" );
      }

      llvm_unreachable("Unhandled case!" );
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(T1: SrcVecTy, T2: MinVecTy);
        VectorTy =
            largestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        // Leave SrcVecTy unchanged - we only shrink the destination element
        // type.
        VectorTy =
            smallestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(U: I, CostKind: TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
7156 | |
7157 | void LoopVectorizationCostModel::collectValuesToIgnore() { |
7158 | // Ignore ephemeral values. |
7159 | CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore); |
7160 | |
7161 | // Find all stores to invariant variables. Since they are going to sink |
7162 | // outside the loop we do not need calculate cost for them. |
7163 | for (BasicBlock *BB : TheLoop->blocks()) |
7164 | for (Instruction &I : *BB) { |
7165 | StoreInst *SI; |
7166 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
7167 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
7168 | ValuesToIgnore.insert(Ptr: &I); |
7169 | } |
7170 | |
7171 | // Ignore type-promoting instructions we identified during reduction |
7172 | // detection. |
7173 | for (const auto &Reduction : Legal->getReductionVars()) { |
7174 | const RecurrenceDescriptor &RedDes = Reduction.second; |
7175 | const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
7176 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
7177 | } |
7178 | // Ignore type-casting instructions we identified during induction |
7179 | // detection. |
7180 | for (const auto &Induction : Legal->getInductionVars()) { |
7181 | const InductionDescriptor &IndDes = Induction.second; |
7182 | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
7183 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
7184 | } |
7185 | } |
7186 | |
7187 | void LoopVectorizationCostModel::collectInLoopReductions() { |
7188 | for (const auto &Reduction : Legal->getReductionVars()) { |
7189 | PHINode *Phi = Reduction.first; |
7190 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
7191 | |
7192 | // We don't collect reductions that are type promoted (yet). |
7193 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
7194 | continue; |
7195 | |
7196 | // If the target would prefer this reduction to happen "in-loop", then we |
7197 | // want to record it as such. |
7198 | unsigned Opcode = RdxDesc.getOpcode(); |
7199 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
7200 | !TTI.preferInLoopReduction(Opcode, Ty: Phi->getType(), |
7201 | Flags: TargetTransformInfo::ReductionFlags())) |
7202 | continue; |
7203 | |
7204 | // Check that we can correctly put the reductions into the loop, by |
7205 | // finding the chain of operations that leads from the phi to the loop |
7206 | // exit value. |
7207 | SmallVector<Instruction *, 4> ReductionOperations = |
7208 | RdxDesc.getReductionOpChain(Phi, L: TheLoop); |
7209 | bool InLoop = !ReductionOperations.empty(); |
7210 | |
7211 | if (InLoop) { |
7212 | InLoopReductions.insert(Ptr: Phi); |
7213 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
7214 | Instruction *LastChain = Phi; |
7215 | for (auto *I : ReductionOperations) { |
7216 | InLoopReductionImmediateChains[I] = LastChain; |
7217 | LastChain = I; |
7218 | } |
7219 | } |
7220 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop" ) |
7221 | << " reduction for phi: " << *Phi << "\n" ); |
7222 | } |
7223 | } |
7224 | |
7225 | VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, |
7226 | DebugLoc DL, const Twine &Name) { |
7227 | assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && |
7228 | Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate" ); |
7229 | return tryInsertInstruction( |
7230 | VPI: new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); |
7231 | } |
7232 | |
7233 | // This function will select a scalable VF if the target supports scalable |
7234 | // vectors and a fixed one otherwise. |
7235 | // TODO: we could return a pair of values that specify the max VF and |
7236 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
7237 | // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
7238 | // doesn't have a cost model that can choose which plan to execute if |
7239 | // more than one is generated. |
7240 | static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, |
7241 | LoopVectorizationCostModel &CM) { |
7242 | unsigned WidestType; |
7243 | std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes(); |
7244 | |
7245 | TargetTransformInfo::RegisterKind RegKind = |
7246 | TTI.enableScalableVectorization() |
7247 | ? TargetTransformInfo::RGK_ScalableVector |
7248 | : TargetTransformInfo::RGK_FixedWidthVector; |
7249 | |
7250 | TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind); |
7251 | unsigned N = RegSize.getKnownMinValue() / WidestType; |
7252 | return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable()); |
7253 | } |
7254 | |
/// Planning entry point for the VPlan-native (outer-loop) path: pick a VF
/// (user-provided or target-derived), build the corresponding VPlan, and
/// return the chosen factor. Returns Disabled() when vectorization on this
/// path is not possible or when only stress-testing VPlan construction.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n" );

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n" );
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honored; report and bail out rather
      // than silently falling back to a fixed width.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n" );
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target" ,
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors." ,
          ORETag: "ScalableVFUnfeasible" , ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled." );
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two" );
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "" )
                      << "VF " << VF << " to build VPlans.\n" );
    buildVPlans(MinVF: VF, MaxVF: VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    // Costs are not modelled on this path yet; report zero cost.
    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n" );
  return VectorizationFactor::Disabled();
}
7306 | |
/// Main planning entry point for inner loops: compute the maximum feasible
/// fixed and scalable VFs, build VPlans for all candidate factors (or just
/// the user-requested one), and return the most profitable factor.
/// Returns std::nullopt when no plan can be built and Disabled() when only
/// scalar execution is viable.
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected." );
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return std::nullopt;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n" );
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // If the user forced a VF and it is achievable, plan for just that factor.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two" );
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.collectInLoopReductions();
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n" );
      buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
      if (!hasPlanWithVF(VF: UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
                          << ".\n" );
        return std::nullopt;
      }

      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0, 0}};
    } else
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs." ,
                              ORETag: "InvalidCost" , ORE, TheLoop: OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  // Candidates are all powers of two up to the respective maxima.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(V: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(V: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero." );
  if (!hasPlanWithVF(VF: VF.Width)) {
    LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
                      << ".\n" );
    return std::nullopt;
  }
  return VF;
}
7393 | |
7394 | VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { |
7395 | assert(count_if(VPlans, |
7396 | [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == |
7397 | 1 && |
7398 | "Best VF has not a single VPlan." ); |
7399 | |
7400 | for (const VPlanPtr &Plan : VPlans) { |
7401 | if (Plan->hasVF(VF)) |
7402 | return *Plan.get(); |
7403 | } |
7404 | llvm_unreachable("No plan found!" ); |
7405 | } |
7406 | |
7407 | static void AddRuntimeUnrollDisableMetaData(Loop *L) { |
7408 | SmallVector<Metadata *, 4> MDs; |
7409 | // Reserve first location for self reference to the LoopID metadata node. |
7410 | MDs.push_back(Elt: nullptr); |
7411 | bool IsUnrollMetadata = false; |
7412 | MDNode *LoopID = L->getLoopID(); |
7413 | if (LoopID) { |
7414 | // First find existing loop unrolling disable metadata. |
7415 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
7416 | auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i)); |
7417 | if (MD) { |
7418 | const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0)); |
7419 | IsUnrollMetadata = |
7420 | S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable" ); |
7421 | } |
7422 | MDs.push_back(Elt: LoopID->getOperand(I: i)); |
7423 | } |
7424 | } |
7425 | |
7426 | if (!IsUnrollMetadata) { |
7427 | // Add runtime unroll disable metadata. |
7428 | LLVMContext &Context = L->getHeader()->getContext(); |
7429 | SmallVector<Metadata *, 1> DisableOperands; |
7430 | DisableOperands.push_back( |
7431 | Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable" )); |
7432 | MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands); |
7433 | MDs.push_back(Elt: DisableNode); |
7434 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
7435 | // Set operand 0 to refer to the loop id itself. |
7436 | NewLoopID->replaceOperandWith(I: 0, New: NewLoopID); |
7437 | L->setLoopID(NewLoopID); |
7438 | } |
7439 | } |
7440 | |
7441 | // Check if \p RedResult is a ComputeReductionResult instruction, and if it is |
7442 | // create a merge phi node for it and add it to \p ReductionResumeValues. |
7443 | static void createAndCollectMergePhiForReduction( |
7444 | VPInstruction *RedResult, |
7445 | DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, |
7446 | VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { |
7447 | if (!RedResult || |
7448 | RedResult->getOpcode() != VPInstruction::ComputeReductionResult) |
7449 | return; |
7450 | |
7451 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: RedResult->getOperand(N: 0)); |
7452 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
7453 | |
7454 | TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); |
7455 | Value *FinalValue = |
7456 | State.get(Def: RedResult, Instance: VPIteration(State.UF - 1, VPLane::getFirstLane())); |
7457 | auto *ResumePhi = |
7458 | dyn_cast<PHINode>(Val: PhiR->getStartValue()->getUnderlyingValue()); |
7459 | |
7460 | // TODO: bc.merge.rdx should not be created here, instead it should be |
7461 | // modeled in VPlan. |
7462 | BasicBlock * = OrigLoop->getLoopPreheader(); |
7463 | // Create a phi node that merges control-flow from the backedge-taken check |
7464 | // block and the middle block. |
7465 | auto *BCBlockPhi = |
7466 | PHINode::Create(Ty: FinalValue->getType(), NumReservedValues: 2, NameStr: "bc.merge.rdx" , |
7467 | InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator()); |
7468 | |
7469 | // If we are fixing reductions in the epilogue loop then we should already |
7470 | // have created a bc.merge.rdx Phi after the main vector body. Ensure that |
7471 | // we carry over the incoming values correctly. |
7472 | for (auto *Incoming : predecessors(BB: LoopScalarPreHeader)) { |
7473 | if (Incoming == LoopMiddleBlock) |
7474 | BCBlockPhi->addIncoming(V: FinalValue, BB: Incoming); |
7475 | else if (ResumePhi && is_contained(Range: ResumePhi->blocks(), Element: Incoming)) |
7476 | BCBlockPhi->addIncoming(V: ResumePhi->getIncomingValueForBlock(BB: Incoming), |
7477 | BB: Incoming); |
7478 | else |
7479 | BCBlockPhi->addIncoming(V: ReductionStartValue, BB: Incoming); |
7480 | } |
7481 | |
7482 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
7483 | // TODO: This fixup should instead be modeled in VPlan. |
7484 | // Fix the scalar loop reduction variable with the incoming reduction sum |
7485 | // from the vector body and from the backedge value. |
7486 | int IncomingEdgeBlockIdx = |
7487 | OrigPhi->getBasicBlockIndex(BB: OrigLoop->getLoopLatch()); |
7488 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index" ); |
7489 | // Pick the other block. |
7490 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
7491 | OrigPhi->setIncomingValue(i: SelfEdgeBlockIdx, V: BCBlockPhi); |
7492 | Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); |
7493 | OrigPhi->setIncomingValue(i: IncomingEdgeBlockIdx, V: LoopExitInst); |
7494 | |
7495 | ReductionResumeValues[&RdxDesc] = BCBlockPhi; |
7496 | } |
7497 | |
7498 | std::pair<DenseMap<const SCEV *, Value *>, |
7499 | DenseMap<const RecurrenceDescriptor *, Value *>> |
7500 | LoopVectorizationPlanner::executePlan( |
7501 | ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, |
7502 | InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, |
7503 | const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { |
7504 | assert(BestVPlan.hasVF(BestVF) && |
7505 | "Trying to execute plan with unsupported VF" ); |
7506 | assert(BestVPlan.hasUF(BestUF) && |
7507 | "Trying to execute plan with unsupported UF" ); |
7508 | assert( |
7509 | (IsEpilogueVectorization || !ExpandedSCEVs) && |
7510 | "expanded SCEVs to reuse can only be used during epilogue vectorization" ); |
7511 | |
7512 | if (!IsEpilogueVectorization) |
7513 | VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE); |
7514 | |
7515 | LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF |
7516 | << ", UF=" << BestUF << '\n'); |
7517 | BestVPlan.setName("Final VPlan" ); |
7518 | LLVM_DEBUG(BestVPlan.dump()); |
7519 | |
7520 | // Perform the actual loop transformation. |
7521 | VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, |
7522 | OrigLoop->getHeader()->getContext()); |
7523 | |
7524 | // 0. Generate SCEV-dependent code into the preheader, including TripCount, |
7525 | // before making any changes to the CFG. |
7526 | if (!BestVPlan.getPreheader()->empty()) { |
7527 | State.CFG.PrevBB = OrigLoop->getLoopPreheader(); |
7528 | State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); |
7529 | BestVPlan.getPreheader()->execute(State: &State); |
7530 | } |
7531 | if (!ILV.getTripCount()) |
7532 | ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Instance: {0, 0})); |
7533 | else |
7534 | assert(IsEpilogueVectorization && "should only re-use the existing trip " |
7535 | "count during epilogue vectorization" ); |
7536 | |
7537 | // 1. Set up the skeleton for vectorization, including vector pre-header and |
7538 | // middle block. The vector loop is created during VPlan execution. |
7539 | Value *CanonicalIVStartValue; |
7540 | std::tie(args&: State.CFG.PrevBB, args&: CanonicalIVStartValue) = |
7541 | ILV.createVectorizedLoopSkeleton(ExpandedSCEVs: ExpandedSCEVs ? *ExpandedSCEVs |
7542 | : State.ExpandedSCEVs); |
7543 | |
7544 | // Only use noalias metadata when using memory checks guaranteeing no overlap |
7545 | // across all iterations. |
7546 | const LoopAccessInfo *LAI = ILV.Legal->getLAI(); |
7547 | std::unique_ptr<LoopVersioning> LVer = nullptr; |
7548 | if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && |
7549 | !LAI->getRuntimePointerChecking()->getDiffChecks()) { |
7550 | |
7551 | // We currently don't use LoopVersioning for the actual loop cloning but we |
7552 | // still use it to add the noalias metadata. |
7553 | // TODO: Find a better way to re-use LoopVersioning functionality to add |
7554 | // metadata. |
7555 | LVer = std::make_unique<LoopVersioning>( |
7556 | args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop, args&: LI, args&: DT, |
7557 | args: PSE.getSE()); |
7558 | State.LVer = &*LVer; |
7559 | State.LVer->prepareNoAliasMetadata(); |
7560 | } |
7561 | |
7562 | ILV.printDebugTracesAtStart(); |
7563 | |
7564 | //===------------------------------------------------===// |
7565 | // |
7566 | // Notice: any optimization or new instruction that go |
7567 | // into the code below should also be implemented in |
7568 | // the cost-model. |
7569 | // |
7570 | //===------------------------------------------------===// |
7571 | |
7572 | // 2. Copy and widen instructions from the old loop into the new loop. |
7573 | BestVPlan.prepareToExecute(TripCount: ILV.getTripCount(), |
7574 | VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: nullptr), |
7575 | CanonicalIVStartValue, State); |
7576 | |
7577 | BestVPlan.execute(State: &State); |
7578 | |
7579 | // 2.5 Collect reduction resume values. |
7580 | DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; |
7581 | auto *ExitVPBB = |
7582 | cast<VPBasicBlock>(Val: BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); |
7583 | for (VPRecipeBase &R : *ExitVPBB) { |
7584 | createAndCollectMergePhiForReduction(RedResult: dyn_cast<VPInstruction>(Val: &R), |
7585 | ReductionResumeValues, State, OrigLoop, |
7586 | LoopMiddleBlock: State.CFG.VPBB2IRBB[ExitVPBB]); |
7587 | } |
7588 | |
7589 | // 2.6. Maintain Loop Hints |
7590 | // Keep all loop hints from the original loop on the vector loop (we'll |
7591 | // replace the vectorizer-specific hints below). |
7592 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
7593 | |
7594 | std::optional<MDNode *> VectorizedLoopID = |
7595 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
7596 | LLVMLoopVectorizeFollowupVectorized}); |
7597 | |
7598 | VPBasicBlock * = |
7599 | BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); |
7600 | Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]); |
7601 | if (VectorizedLoopID) |
7602 | L->setLoopID(*VectorizedLoopID); |
7603 | else { |
7604 | // Keep all loop hints from the original loop on the vector loop (we'll |
7605 | // replace the vectorizer-specific hints below). |
7606 | if (MDNode *LID = OrigLoop->getLoopID()) |
7607 | L->setLoopID(LID); |
7608 | |
7609 | LoopVectorizeHints Hints(L, true, *ORE); |
7610 | Hints.setAlreadyVectorized(); |
7611 | } |
7612 | TargetTransformInfo::UnrollingPreferences UP; |
7613 | TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); |
7614 | if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) |
7615 | AddRuntimeUnrollDisableMetaData(L); |
7616 | |
7617 | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
7618 | // predication, updating analyses. |
7619 | ILV.fixVectorizedLoop(State, Plan&: BestVPlan); |
7620 | |
7621 | ILV.printDebugTracesAtEnd(); |
7622 | |
7623 | return {State.ExpandedSCEVs, ReductionResumeValues}; |
7624 | } |
7625 | |
7626 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
7627 | void LoopVectorizationPlanner::printPlans(raw_ostream &O) { |
7628 | for (const auto &Plan : VPlans) |
7629 | if (PrintVPlansInDotFormat) |
7630 | Plan->printDOT(O); |
7631 | else |
7632 | Plan->print(O); |
7633 | } |
7634 | #endif |
7635 | |
7636 | //===--------------------------------------------------------------------===// |
7637 | // EpilogueVectorizerMainLoop |
7638 | //===--------------------------------------------------------------------===// |
7639 | |
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// First pass of epilogue vectorization: emits the guards for the main vector
/// loop and stashes the check blocks / vector trip count in EPI so the second
/// (epilogue) pass can rewire them. Returns the completed skeleton's preheader
/// and no resume value (inductions are handled in the second pass).
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  createVectorLoopSkeleton(Prefix: "" );

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check" );

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);

  // Generate the induction variable.
  EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  // Skip induction resume value creation here because they will be created in
  // the second pass for the scalar loop. The induction resume values for the
  // inductions in the epilogue loop are created before executing the plan for
  // the epilogue loop.

  return {completeLoopSkeleton(), nullptr};
}
7681 | |
7682 | void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { |
7683 | LLVM_DEBUG({ |
7684 | dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" |
7685 | << "Main Loop VF:" << EPI.MainLoopVF |
7686 | << ", Main Loop UF:" << EPI.MainLoopUF |
7687 | << ", Epilogue Loop VF:" << EPI.EpilogueVF |
7688 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7689 | }); |
7690 | } |
7691 | |
7692 | void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { |
7693 | DEBUG_WITH_TYPE(VerboseDebug, { |
7694 | dbgs() << "intermediate fn:\n" |
7695 | << *OrigLoop->getHeader()->getParent() << "\n" ; |
7696 | }); |
7697 | } |
7698 | |
/// Emit the minimum-iteration-count guard for either the main vector loop
/// (\p ForEpilogue == false) or the vector epilogue loop
/// (\p ForEpilogue == true), branching to \p Bypass when the trip count is
/// too small. Returns the block holding the check.
BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block." );
  // The step to compare against is VF * UF of whichever loop this guard
  // protects.
  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop.
  // ULE (rather than ULT) when a scalar epilogue is required: at least one
  // scalar iteration must remain after the vector loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: ForEpilogue ? EPI.EpilogueVF.isVector()
                                           : VF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, LHS: Count, RHS: createStepForVF(B&: Builder, Ty: Count->getType(), VF: VFactor, Step: UFactor),
      Name: "min.iters.check" );

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check" );

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                   DT, LI, MSSAU: nullptr, BBName: "vector.ph" );

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass" );

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);

    LoopBypassBlocks.push_back(Elt: TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the preheader's unconditional terminator with the guard branch.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  return TCCheckBlock;
}
7758 | |
7759 | //===--------------------------------------------------------------------===// |
7760 | // EpilogueVectorizerEpilogueLoop |
7761 | //===--------------------------------------------------------------------===// |
7762 | |
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass of epilogue vectorization: splices the epilogue vector loop in
/// behind the main vector loop created in the first pass, rewiring the check
/// blocks saved in EPI, the dominator tree, and the phis that merge the two
/// paths. Returns the completed skeleton's preheader and the induction resume
/// value for the epilogue loop.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  createVectorLoopSkeleton(Prefix: "vec.epilog." );

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check" );
  LoopVectorPreHeader =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: "vec.epilog.ph" );
  emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
                                          Insert: VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass." );
  // Main-loop guard failure now jumps straight to the epilogue preheader.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);

  DT->changeImmediateDominator(BB: LoopVectorPreHeader,
                               NewBB: EPI.MainLoopIterationCountCheck);

  // The remaining guards from the first pass bypass to the scalar loop.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  DT->changeImmediateDominator(
      BB: VecEpilogueIterationCountCheck,
      NewBB: VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(BB: LoopScalarPreHeader,
                               NewBB: EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock,
                                 NewBB: EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction and
  // reduction phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(Elt: EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(Elt: EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(Elt: EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the preheader.
  SmallVector<PHINode *, 4> PhisInBlock;
  // Snapshot the phis first: they are moved below, which would invalidate
  // iteration over the block's phi list.
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(Elt: &Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(MovePos: LoopVectorPreHeader->getFirstNonPHI());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(BB: EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(Ty: IdxTy, NumReservedValues: 2, NameStr: "vec.epilog.resume.val" );
  EPResumeVal->insertBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
  // Coming from the main vector loop: resume at its vector trip count;
  // coming from the skipped main loop: resume at zero.
  EPResumeVal->addIncoming(V: EPI.VectorTripCount, BB: VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(V: ConstantInt::get(Ty: IdxTy, V: 0),
                           BB: EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              AdditionalBypass: {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}
7871 | |
7872 | BasicBlock * |
7873 | EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( |
7874 | BasicBlock *Bypass, BasicBlock *Insert) { |
7875 | |
7876 | assert(EPI.TripCount && |
7877 | "Expected trip count to have been safed in the first pass." ); |
7878 | assert( |
7879 | (!isa<Instruction>(EPI.TripCount) || |
7880 | DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && |
7881 | "saved trip count does not dominate insertion point." ); |
7882 | Value *TC = EPI.TripCount; |
7883 | IRBuilder<> Builder(Insert->getTerminator()); |
7884 | Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining" ); |
7885 | |
7886 | // Generate code to check if the loop's trip count is less than VF * UF of the |
7887 | // vector epilogue loop. |
7888 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()) |
7889 | ? ICmpInst::ICMP_ULE |
7890 | : ICmpInst::ICMP_ULT; |
7891 | |
7892 | Value *CheckMinIters = |
7893 | Builder.CreateICmp(P, LHS: Count, |
7894 | RHS: createStepForVF(B&: Builder, Ty: Count->getType(), |
7895 | VF: EPI.EpilogueVF, Step: EPI.EpilogueUF), |
7896 | Name: "min.epilog.iters.check" ); |
7897 | |
7898 | BranchInst &BI = |
7899 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7900 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7901 | unsigned MainLoopStep = UF * VF.getKnownMinValue(); |
7902 | unsigned EpilogueLoopStep = |
7903 | EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); |
7904 | // We assume the remaining `Count` is equally distributed in |
7905 | // [0, MainLoopStep) |
7906 | // So the probability for `Count < EpilogueLoopStep` should be |
7907 | // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep |
7908 | unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep); |
7909 | const uint32_t Weights[] = {EstimatedSkipCount, |
7910 | MainLoopStep - EstimatedSkipCount}; |
7911 | setBranchWeights(I&: BI, Weights); |
7912 | } |
7913 | ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI); |
7914 | |
7915 | LoopBypassBlocks.push_back(Elt: Insert); |
7916 | return Insert; |
7917 | } |
7918 | |
7919 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { |
7920 | LLVM_DEBUG({ |
7921 | dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" |
7922 | << "Epilogue Loop VF:" << EPI.EpilogueVF |
7923 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7924 | }); |
7925 | } |
7926 | |
7927 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { |
7928 | DEBUG_WITH_TYPE(VerboseDebug, { |
7929 | dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n" ; |
7930 | }); |
7931 | } |
7932 | |
7933 | bool LoopVectorizationPlanner::getDecisionAndClampRange( |
7934 | const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { |
7935 | assert(!Range.isEmpty() && "Trying to test an empty VF range." ); |
7936 | bool PredicateAtRangeStart = Predicate(Range.Start); |
7937 | |
7938 | for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) |
7939 | if (Predicate(TmpVF) != PredicateAtRangeStart) { |
7940 | Range.End = TmpVF; |
7941 | break; |
7942 | } |
7943 | |
7944 | return PredicateAtRangeStart; |
7945 | } |
7946 | |
7947 | /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, |
7948 | /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range |
7949 | /// of VF's starting at a given VF and extending it as much as possible. Each |
7950 | /// vectorization decision can potentially shorten this sub-range during |
7951 | /// buildVPlan(). |
7952 | void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, |
7953 | ElementCount MaxVF) { |
7954 | auto MaxVFTimes2 = MaxVF * 2; |
7955 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) { |
7956 | VFRange SubRange = {VF, MaxVFTimes2}; |
7957 | VPlans.push_back(Elt: buildVPlan(Range&: SubRange)); |
7958 | VF = SubRange.End; |
7959 | } |
7960 | } |
7961 | |
7962 | iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> |
7963 | VPRecipeBuilder::mapToVPValues(User::op_range Operands) { |
7964 | std::function<VPValue *(Value *)> Fn = [this](Value *Op) { |
7965 | if (auto *I = dyn_cast<Instruction>(Val: Op)) { |
7966 | if (auto *R = Ingredient2Recipe.lookup(Val: I)) |
7967 | return R->getVPSingleValue(); |
7968 | } |
7969 | return Plan.getOrAddLiveIn(V: Op); |
7970 | }; |
7971 | return map_range(C&: Operands, F: Fn); |
7972 | } |
7973 | |
/// Compute (and cache) the mask for the CFG edge Src -> Dst: the source
/// block's mask AND'ed with the branch condition selecting Dst, or nullptr
/// when the edge is unconditionally taken (all-one mask).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge" );

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = getBlockInMask(BB: Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Val: Src->getTerminator());
  assert(BI && "Unexpected terminator found" );

  // An unconditional branch (or both successors equal) doesn't restrict the
  // mask beyond the source block's own mask.
  if (!BI->isConditional() || BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(BB: Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = getVPValueOrAddLiveIn(V: BI->getCondition(), Plan);
  assert(EdgeMask && "No Edge Mask found for condition" );

  // The false edge is guarded by the negated condition.
  if (BI->getSuccessor(i: 0) != Dst)
    EdgeMask = Builder.createNot(Operand: EdgeMask, DL: BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan.getOrAddLiveIn(
        V: ConstantInt::getFalse(Ty: BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(Cond: SrcMask, TrueVal: EdgeMask, FalseVal: False, DL: BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}
8017 | |
8018 | VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { |
8019 | assert(is_contained(predecessors(Dst), Src) && "Invalid edge" ); |
8020 | |
8021 | // Look for cached value. |
8022 | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
8023 | EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Val: Edge); |
8024 | assert(ECEntryIt != EdgeMaskCache.end() && |
8025 | "looking up mask for edge which has not been created" ); |
8026 | return ECEntryIt->second; |
8027 | } |
8028 | |
8029 | void VPRecipeBuilder::() { |
8030 | BasicBlock * = OrigLoop->getHeader(); |
8031 | |
8032 | // When not folding the tail, use nullptr to model all-true mask. |
8033 | if (!CM.foldTailByMasking()) { |
8034 | BlockMaskCache[Header] = nullptr; |
8035 | return; |
8036 | } |
8037 | |
8038 | // Introduce the early-exit compare IV <= BTC to form header block mask. |
8039 | // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by |
8040 | // constructing the desired canonical IV in the header block as its first |
8041 | // non-phi instructions. |
8042 | |
8043 | VPBasicBlock * = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
8044 | auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); |
8045 | auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); |
8046 | HeaderVPBB->insert(Recipe: IV, InsertPt: NewInsertionPoint); |
8047 | |
8048 | VPBuilder::InsertPointGuard Guard(Builder); |
8049 | Builder.setInsertPoint(TheBB: HeaderVPBB, IP: NewInsertionPoint); |
8050 | VPValue *BlockMask = nullptr; |
8051 | VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); |
8052 | BlockMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC); |
8053 | BlockMaskCache[Header] = BlockMask; |
8054 | } |
8055 | |
8056 | VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { |
8057 | // Return the cached value. |
8058 | BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(Val: BB); |
8059 | assert(BCEntryIt != BlockMaskCache.end() && |
8060 | "Trying to access mask for block without one." ); |
8061 | return BCEntryIt->second; |
8062 | } |
8063 | |
// Compute and cache the in-mask for non-header block \p BB as the OR of the
// masks of all incoming edges, creating edge masks on demand. Following the
// convention for masked load/store/gather/scatter, a nullptr mask models an
// all-one mask.
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop" );
  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed" );
  assert(OrigLoop->getHeader() != BB &&
         "Loop header must have cached block mask" );

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;
  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Src: Predecessor, Dst: BB);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      // Cache the nullptr (all-one) mask and stop early: OR with all-one is
      // all-one regardless of the remaining edges.
      BlockMaskCache[BB] = EdgeMask;
      return;
    }

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(LHS: BlockMask, RHS: EdgeMask, DL: {});
  }

  BlockMaskCache[BB] = BlockMask;
}
8091 | |
// Check if load/store \p I should be widened for the VFs in \p Range
// (clamping the range per the cost model's decisions) and, if so, build a
// widened load/store recipe, attaching a mask when required and a vector
// pointer recipe for (reverse-)consecutive accesses. Returns nullptr if the
// access will not be widened.
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store" );

  // Widen unless the cost model decided to scalarize the access for this VF;
  // interleaved accesses also count as widened here.
  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point." );
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: willWiden, Range))
    return nullptr;

  // Masked accesses use the in-mask of the block containing the access.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(BB: I->getParent());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // For a load the pointer is operand 0; for a store it is operand 1
  // (operand 0 is the stored value).
  VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    // Inbounds is only propagated when the underlying pointer is a GEP that
    // carries it.
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    auto *VectorPtr = new VPVectorPointerRecipe(
        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
        I->getDebugLoc());
    Builder.getInsertBlock()->appendRecipe(Recipe: VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(Val: I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, I->getDebugLoc());
}
8144 | |
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe. \p PhiOrTrunc
/// is either \p Phi itself or a truncate of it, and selects the recipe
/// variant built; \p Start is the VPValue for the induction's start value.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
                            VFRange &Range) {
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant" );

  // Materialize the induction's (loop-invariant) step SCEV as a VPValue.
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here" );
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
8165 | |
// Try to build an optimized recipe for header phi \p Phi: a widened int/fp
// induction recipe if Legal has an integer or fp induction descriptor for it,
// or a widened pointer induction recipe if it is a pointer induction.
// Returns nullptr if \p Phi is not a recognized induction.
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
                                       SE&: *PSE.getSE(), OrigLoop&: *OrigLoop, Range);

  // Check if this is pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
                                                           SE&: *PSE.getSE());
    // The last constructor argument records (and clamps Range to) whether the
    // phi is scalar after vectorization for the VFs in Range.
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            Predicate: [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(I: Phi, VF);
            },
            Range));
  }
  return nullptr;
}
8189 | |
// Try to fold truncate \p I of an induction phi into a widened induction
// recipe of the truncated type, for the VFs in \p Range (clamping the range
// per the cost model). Returns nullptr if the truncate is not optimizable.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: isOptimizableIVTruncate(I), Range)) {

    // Build the induction recipe from the truncate's phi operand, passing the
    // truncate as PhiOrTrunc so the recipe uses the narrower type.
    auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
    return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
                                       OrigLoop&: *OrigLoop, Range);
  }
  return nullptr;
}
8217 | |
// Build a VPBlendRecipe for non-header phi \p Phi: each incoming value is
// paired with the mask of its incoming edge, except the first one, whose mask
// is skipped (see TODO below).
VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) {
  unsigned NumIncoming = Phi->getNumIncomingValues();

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  // TODO: At the moment the first mask is always skipped, but it would be
  // better to skip the most expensive mask.
  SmallVector<VPValue *, 2> OperandsWithMask;

  for (unsigned In = 0; In < NumIncoming; In++) {
    OperandsWithMask.push_back(Elt: Operands[In]);
    VPValue *EdgeMask =
        getEdgeMask(Src: Phi->getIncomingBlock(i: In), Dst: Phi->getParent());
    if (!EdgeMask) {
      // A nullptr edge mask means the edge is taken unconditionally, which is
      // only consistent if it is the sole (first) incoming or all incoming
      // values are identical.
      assert(In == 0 && "Both null and non-null edge masks found" );
      assert(all_equal(Operands) &&
             "Distinct incoming values with one having a full mask" );
      break;
    }
    if (In == 0)
      continue;
    OperandsWithMask.push_back(Elt: EdgeMask);
  }
  return new VPBlendRecipe(Phi, OperandsWithMask);
}
8247 | |
// Try to widen call \p CI for the VFs in \p Range, either as a vector
// intrinsic or as a call to a vectorized library variant. Returns nullptr
// when the call must be handled elsewhere: it is scalar-with-predication, it
// is one of the assume-like intrinsics listed below, or no profitable vector
// form exists.
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are not widened; they are handled by other means (e.g.
  // dropped or replicated).
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Only the call's actual arguments become operands; trailing operands (such
  // as the called function) are dropped.
  SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()), ID,
                                 CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(I: CI))
        Mask = getBlockInMask(BB: CI->getParent());
      else
        Mask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(
            Ty: IntegerType::getInt1Ty(C&: Variant->getFunctionType()->getContext())));

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()),
                                 Intrinsic::not_intrinsic, CI->getDebugLoc(),
                                 Variant);
  }

  return nullptr;
}
8338 | |
8339 | bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { |
8340 | assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && |
8341 | !isa<StoreInst>(I) && "Instruction should have been handled earlier" ); |
8342 | // Instruction should be widened, unless it is scalar after vectorization, |
8343 | // scalarization is profitable or it is predicated. |
8344 | auto WillScalarize = [this, I](ElementCount VF) -> bool { |
8345 | return CM.isScalarAfterVectorization(I, VF) || |
8346 | CM.isProfitableToScalarize(I, VF) || |
8347 | CM.isScalarWithPredication(I, VF); |
8348 | }; |
8349 | return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize, |
8350 | Range); |
8351 | } |
8352 | |
// Build a VPWidenRecipe for \p I if its opcode is one of the widenable
// opcodes handled below; returns nullptr otherwise. Predicated div/rem get a
// safe divisor formed with a select under the block mask before widening.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands,
                                           VPBasicBlock *VPBB) {
  switch (I->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before widening the
    // div/rem operation itself. Otherwise fall through to general handling below.
    if (CM.isPredicatedInst(I)) {
      // Replace the divisor with 'select mask, divisor, 1' so masked-off
      // lanes divide by 1 instead of the original (possibly unsafe) value.
      SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
      VPValue *Mask = getBlockInMask(BB: I->getParent());
      VPValue *One =
          Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
      auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
      VPBB->appendRecipe(Recipe: SafeRHS);
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, make_range(x: Ops.begin(), y: Ops.end()));
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, make_range(x: Operands.begin(), y: Operands.end()));
  };
}
8401 | |
8402 | void VPRecipeBuilder::() { |
8403 | BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); |
8404 | for (VPHeaderPHIRecipe *R : PhisToFix) { |
8405 | auto *PN = cast<PHINode>(Val: R->getUnderlyingValue()); |
8406 | VPRecipeBase *IncR = |
8407 | getRecipe(I: cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: OrigLatch))); |
8408 | R->addOperand(Operand: IncR->getVPSingleValue()); |
8409 | } |
8410 | } |
8411 | |
// Build a VPReplicateRecipe scalarizing \p I. The recipe is marked uniform
// when the cost model considers \p I uniform-after-vectorization for the VFs
// in \p Range (clamping the range), and predicated instructions additionally
// receive their block's in-mask as an operand.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
                                                      VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still be better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n" );
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n" );
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(BB: I->getParent());
  }

  auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(Operands: I->operands()),
                                       IsUniform, BlockInMask);
  return Recipe;
}
8467 | |
// Dispatch \p Instr to the recipe-creation helpers: header phis become
// induction/reduction/recurrence phi recipes, other phis become blends, then
// induction truncates, calls, memory operations, GEPs, selects and casts are
// tried in turn, falling back to tryToWiden. Returns nullptr if no widen
// recipe applies (the caller then scalarizes via handleReplication).
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Val: Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands);

    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    // Non-induction header phis must be reductions or fixed-order
    // recurrences; both take the start value as their first operand.
    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here" );
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(PN: Phi)) {
      const RecurrenceDescriptor &RdxDesc =
          Legal->getReductionVars().find(Key: Phi)->second;
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                           CM.isInLoopReduction(Phi),
                                           CM.useOrderedReductions(RdxDesc));
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }

    // The backedge operand is added later by fixHeaderPhis, once all recipes
    // exist.
    PhisToFix.push_back(Elt: PhiRecipe);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                     I: cast<TruncInst>(Val: Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Val: Instr))
    return tryToWidenCall(CI, Operands, Range);

  if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
    return tryToWidenMemory(I: Instr, Operands, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
    return new VPWidenGEPRecipe(GEP,
                                make_range(x: Operands.begin(), y: Operands.end()));

  if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
    return new VPWidenSelectRecipe(
        *SI, make_range(x: Operands.begin(), y: Operands.end()));
  }

  if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  return tryToWiden(I: Instr, Operands, VPBB);
}
8541 | |
// Build VPlans covering all VFs in [MinVF, MaxVF]: repeatedly try to build a
// plan for the sub-range starting at VF (recipe construction may clamp the
// sub-range's end), optimize each resulting plan, and collect it in VPlans.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected." );

  // MaxVF * 2 serves as the exclusive upper bound of the VF iteration.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(Range&: SubRange)) {
      // Now optimize the initial VPlan.
      if (!Plan->hasVF(VF: ElementCount::getFixed(MinVal: 1)))
        VPlanTransforms::truncateToMinimalBitwidths(
            Plan&: *Plan, MinBWs: CM.getMinimalBitwidths(), Ctx&: PSE.getSE()->getContext());
      VPlanTransforms::optimize(Plan&: *Plan, SE&: *PSE.getSE());
      // TODO: try to put it close to addActiveLaneMask().
      if (CM.foldTailWithEVL())
        VPlanTransforms::addExplicitVectorLength(Plan&: *Plan);
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" );
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue from where the (possibly clamped) sub-range ended.
    VF = SubRange.End;
  }
}
8564 | |
8565 | // Add the necessary canonical IV and branch recipes required to control the |
8566 | // loop. |
8567 | static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, |
8568 | DebugLoc DL) { |
8569 | Value *StartIdx = ConstantInt::get(Ty: IdxTy, V: 0); |
8570 | auto *StartV = Plan.getOrAddLiveIn(V: StartIdx); |
8571 | |
8572 | // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. |
8573 | auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); |
8574 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
8575 | VPBasicBlock * = TopRegion->getEntryBasicBlock(); |
8576 | Header->insert(Recipe: CanonicalIVPHI, InsertPt: Header->begin()); |
8577 | |
8578 | VPBuilder Builder(TopRegion->getExitingBasicBlock()); |
8579 | // Add a VPInstruction to increment the scalar canonical IV by VF * UF. |
8580 | auto *CanonicalIVIncrement = Builder.createOverflowingOp( |
8581 | Opcode: Instruction::Add, Operands: {CanonicalIVPHI, &Plan.getVFxUF()}, WrapFlags: {HasNUW, false}, DL, |
8582 | Name: "index.next" ); |
8583 | CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement); |
8584 | |
8585 | // Add the BranchOnCount VPInstruction to the latch. |
8586 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount, |
8587 | Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); |
8588 | } |
8589 | |
8590 | // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the |
8591 | // original exit block. |
8592 | static void addUsersInExitBlock(VPBasicBlock *, Loop *OrigLoop, |
8593 | VPRecipeBuilder &Builder, VPlan &Plan) { |
8594 | BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); |
8595 | BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); |
8596 | // Only handle single-exit loops with unique exit blocks for now. |
8597 | if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) |
8598 | return; |
8599 | |
8600 | // Introduce VPUsers modeling the exit values. |
8601 | for (PHINode &ExitPhi : ExitBB->phis()) { |
8602 | Value *IncomingValue = |
8603 | ExitPhi.getIncomingValueForBlock(BB: ExitingBB); |
8604 | VPValue *V = Builder.getVPValueOrAddLiveIn(V: IncomingValue, Plan); |
8605 | Plan.addLiveOut(PN: &ExitPhi, V); |
8606 | } |
8607 | } |
8608 | |
8609 | VPlanPtr |
8610 | LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { |
8611 | |
8612 | SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; |
8613 | |
8614 | // --------------------------------------------------------------------------- |
8615 | // Build initial VPlan: Scan the body of the loop in a topological order to |
8616 | // visit each basic block after having visited its predecessor basic blocks. |
8617 | // --------------------------------------------------------------------------- |
8618 | |
8619 | // Create initial VPlan skeleton, having a basic block for the pre-header |
8620 | // which contains SCEV expansions that need to happen before the CFG is |
8621 | // modified; a basic block for the vector pre-header, followed by a region for |
8622 | // the vector loop, followed by the middle basic block. The skeleton vector |
8623 | // loop region contains a header and latch basic blocks. |
8624 | VPlanPtr Plan = VPlan::createInitialVPlan( |
8625 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8626 | PSE&: *PSE.getSE()); |
8627 | VPBasicBlock * = new VPBasicBlock("vector.body" ); |
8628 | VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch" ); |
8629 | VPBlockUtils::insertBlockAfter(NewBlock: LatchVPBB, BlockPtr: HeaderVPBB); |
8630 | Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); |
8631 | Plan->getVectorLoopRegion()->setExiting(LatchVPBB); |
8632 | |
8633 | // Don't use getDecisionAndClampRange here, because we don't know the UF |
8634 | // so this function is better to be conservative, rather than to split |
8635 | // it up into different VPlans. |
8636 | // TODO: Consider using getDecisionAndClampRange here to split up VPlans. |
8637 | bool IVUpdateMayOverflow = false; |
8638 | for (ElementCount VF : Range) |
8639 | IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF); |
8640 | |
8641 | DebugLoc DL = getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()); |
8642 | TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); |
8643 | // When not folding the tail, we know that the induction increment will not |
8644 | // overflow. |
8645 | bool HasNUW = Style == TailFoldingStyle::None; |
8646 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, DL); |
8647 | |
8648 | VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); |
8649 | |
8650 | // --------------------------------------------------------------------------- |
8651 | // Pre-construction: record ingredients whose recipes we'll need to further |
8652 | // process after constructing the initial VPlan. |
8653 | // --------------------------------------------------------------------------- |
8654 | |
8655 | // For each interleave group which is relevant for this (possibly trimmed) |
8656 | // Range, add it to the set of groups to be later applied to the VPlan and add |
8657 | // placeholders for its members' Recipes which we'll be replacing with a |
8658 | // single VPInterleaveRecipe. |
8659 | for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { |
8660 | auto applyIG = [IG, this](ElementCount VF) -> bool { |
8661 | bool Result = (VF.isVector() && // Query is illegal for VF == 1 |
8662 | CM.getWideningDecision(I: IG->getInsertPos(), VF) == |
8663 | LoopVectorizationCostModel::CM_Interleave); |
8664 | // For scalable vectors, the only interleave factor currently supported |
8665 | // is 2 since we require the (de)interleave2 intrinsics instead of |
8666 | // shufflevectors. |
8667 | assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && |
8668 | "Unsupported interleave factor for scalable vectors" ); |
8669 | return Result; |
8670 | }; |
8671 | if (!getDecisionAndClampRange(Predicate: applyIG, Range)) |
8672 | continue; |
8673 | InterleaveGroups.insert(Ptr: IG); |
8674 | }; |
8675 | |
8676 | // --------------------------------------------------------------------------- |
8677 | // Construct recipes for the instructions in the loop |
8678 | // --------------------------------------------------------------------------- |
8679 | |
8680 | // Scan the body of the loop in a topological order to visit each basic block |
8681 | // after having visited its predecessor basic blocks. |
8682 | LoopBlocksDFS DFS(OrigLoop); |
8683 | DFS.perform(LI); |
8684 | |
8685 | VPBasicBlock *VPBB = HeaderVPBB; |
8686 | BasicBlock * = OrigLoop->getHeader(); |
8687 | bool NeedsMasks = |
8688 | CM.foldTailByMasking() || |
8689 | any_of(Range: OrigLoop->blocks(), P: [this, HeaderBB](BasicBlock *BB) { |
8690 | bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); |
8691 | return Legal->blockNeedsPredication(BB) || NeedsBlends; |
8692 | }); |
8693 | for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) { |
8694 | // Relevant instructions from basic block BB will be grouped into VPRecipe |
8695 | // ingredients and fill a new VPBasicBlock. |
8696 | if (VPBB != HeaderVPBB) |
8697 | VPBB->setName(BB->getName()); |
8698 | Builder.setInsertPoint(VPBB); |
8699 | |
8700 | if (VPBB == HeaderVPBB) |
8701 | RecipeBuilder.createHeaderMask(); |
8702 | else if (NeedsMasks) |
8703 | RecipeBuilder.createBlockInMask(BB); |
8704 | |
8705 | // Introduce each ingredient into VPlan. |
8706 | // TODO: Model and preserve debug intrinsics in VPlan. |
8707 | for (Instruction &I : drop_end(RangeOrContainer: BB->instructionsWithoutDebug(SkipPseudoOp: false))) { |
8708 | Instruction *Instr = &I; |
8709 | SmallVector<VPValue *, 4> Operands; |
8710 | auto *Phi = dyn_cast<PHINode>(Val: Instr); |
8711 | if (Phi && Phi->getParent() == HeaderBB) { |
8712 | Operands.push_back(Elt: Plan->getOrAddLiveIn( |
8713 | V: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopPreheader()))); |
8714 | } else { |
8715 | auto OpRange = RecipeBuilder.mapToVPValues(Operands: Instr->operands()); |
8716 | Operands = {OpRange.begin(), OpRange.end()}; |
8717 | } |
8718 | |
8719 | // Invariant stores inside loop will be deleted and a single store |
8720 | // with the final reduction value will be added to the exit block |
8721 | StoreInst *SI; |
8722 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
8723 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
8724 | continue; |
8725 | |
8726 | VPRecipeBase *Recipe = |
8727 | RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); |
8728 | if (!Recipe) |
8729 | Recipe = RecipeBuilder.handleReplication(I: Instr, Range); |
8730 | |
8731 | RecipeBuilder.setRecipe(I: Instr, R: Recipe); |
8732 | if (isa<VPHeaderPHIRecipe>(Val: Recipe)) { |
8733 | // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In |
8734 | // the following cases, VPHeaderPHIRecipes may be created after non-phi |
8735 | // recipes and need to be moved to the phi section of HeaderVPBB: |
8736 | // * tail-folding (non-phi recipes computing the header mask are |
8737 | // introduced earlier than regular header phi recipes, and should appear |
8738 | // after them) |
8739 | // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. |
8740 | |
8741 | assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || |
8742 | CM.foldTailByMasking() || isa<TruncInst>(Instr)) && |
8743 | "unexpected recipe needs moving" ); |
8744 | Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi()); |
8745 | } else |
8746 | VPBB->appendRecipe(Recipe); |
8747 | } |
8748 | |
8749 | VPBlockUtils::insertBlockAfter(NewBlock: new VPBasicBlock(), BlockPtr: VPBB); |
8750 | VPBB = cast<VPBasicBlock>(Val: VPBB->getSingleSuccessor()); |
8751 | } |
8752 | |
8753 | // After here, VPBB should not be used. |
8754 | VPBB = nullptr; |
8755 | |
8756 | if (CM.requiresScalarEpilogue(Range)) { |
8757 | // No edge from the middle block to the unique exit block has been inserted |
8758 | // and there is nothing to fix from vector loop; phis should have incoming |
8759 | // from scalar loop only. |
8760 | } else |
8761 | addUsersInExitBlock(HeaderVPBB, OrigLoop, Builder&: RecipeBuilder, Plan&: *Plan); |
8762 | |
8763 | assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && |
8764 | !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && |
8765 | "entry block must be set to a VPRegionBlock having a non-empty entry " |
8766 | "VPBasicBlock" ); |
8767 | RecipeBuilder.fixHeaderPhis(); |
8768 | |
8769 | // --------------------------------------------------------------------------- |
8770 | // Transform initial VPlan: Apply previously taken decisions, in order, to |
8771 | // bring the VPlan to its final state. |
8772 | // --------------------------------------------------------------------------- |
8773 | |
8774 | // Adjust the recipes for any inloop reductions. |
8775 | adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, MinVF: Range.Start); |
8776 | |
8777 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
8778 | // for this VPlan, replace the Recipes widening its memory instructions with a |
8779 | // single VPInterleaveRecipe at its insertion point. |
8780 | for (const auto *IG : InterleaveGroups) { |
8781 | auto *Recipe = |
8782 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getInsertPos())); |
8783 | SmallVector<VPValue *, 4> StoredValues; |
8784 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8785 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) { |
8786 | auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI)); |
8787 | StoredValues.push_back(Elt: StoreR->getStoredValue()); |
8788 | } |
8789 | |
8790 | bool NeedsMaskForGaps = |
8791 | IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); |
8792 | auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, |
8793 | Recipe->getMask(), NeedsMaskForGaps); |
8794 | VPIG->insertBefore(InsertPos: Recipe); |
8795 | unsigned J = 0; |
8796 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8797 | if (Instruction *Member = IG->getMember(Index: i)) { |
8798 | VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member); |
8799 | if (!Member->getType()->isVoidTy()) { |
8800 | VPValue *OriginalV = MemberR->getVPSingleValue(); |
8801 | OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J)); |
8802 | J++; |
8803 | } |
8804 | MemberR->eraseFromParent(); |
8805 | } |
8806 | } |
8807 | |
8808 | for (ElementCount VF : Range) |
8809 | Plan->addVF(VF); |
8810 | Plan->setName("Initial VPlan" ); |
8811 | |
8812 | // Replace VPValues for known constant strides guaranteed by predicate scalar |
8813 | // evolution. |
8814 | for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { |
8815 | auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue(); |
8816 | auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV)); |
8817 | // Only handle constant strides for now. |
8818 | if (!ScevStride) |
8819 | continue; |
8820 | Constant *CI = ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()); |
8821 | |
8822 | auto *ConstVPV = Plan->getOrAddLiveIn(V: CI); |
8823 | // The versioned value may not be used in the loop directly, so just add a |
8824 | // new live-in in those cases. |
8825 | Plan->getOrAddLiveIn(V: StrideV)->replaceAllUsesWith(New: ConstVPV); |
8826 | } |
8827 | |
8828 | VPlanTransforms::dropPoisonGeneratingRecipes(Plan&: *Plan, BlockNeedsPredication: [this](BasicBlock *BB) { |
8829 | return Legal->blockNeedsPredication(BB); |
8830 | }); |
8831 | |
8832 | // Sink users of fixed-order recurrence past the recipe defining the previous |
8833 | // value and introduce FirstOrderRecurrenceSplice VPInstructions. |
8834 | if (!VPlanTransforms::adjustFixedOrderRecurrences(Plan&: *Plan, Builder)) |
8835 | return nullptr; |
8836 | |
8837 | if (useActiveLaneMask(Style)) { |
8838 | // TODO: Move checks to VPlanTransforms::addActiveLaneMask once |
8839 | // TailFoldingStyle is visible there. |
8840 | bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); |
8841 | bool WithoutRuntimeCheck = |
8842 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
8843 | VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow, |
8844 | DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck); |
8845 | } |
8846 | return Plan; |
8847 | } |
8848 | |
8849 | VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { |
8850 | // Outer loop handling: They may require CFG and instruction level |
8851 | // transformations before even evaluating whether vectorization is profitable. |
8852 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
8853 | // the vectorization pipeline. |
8854 | assert(!OrigLoop->isInnermost()); |
8855 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
8856 | |
8857 | // Create new empty VPlan |
8858 | auto Plan = VPlan::createInitialVPlan( |
8859 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8860 | PSE&: *PSE.getSE()); |
8861 | |
8862 | // Build hierarchical CFG |
8863 | VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); |
8864 | HCFGBuilder.buildHierarchicalCFG(); |
8865 | |
8866 | for (ElementCount VF : Range) |
8867 | Plan->addVF(VF); |
8868 | |
8869 | VPlanTransforms::VPInstructionsToVPRecipes( |
8870 | Plan, |
8871 | GetIntOrFpInductionDescriptor: [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(Phi: P); }, |
8872 | SE&: *PSE.getSE(), TLI: *TLI); |
8873 | |
8874 | // Remove the existing terminator of the exiting block of the top-most region. |
8875 | // A BranchOnCount will be added instead when adding the canonical IV recipes. |
8876 | auto *Term = |
8877 | Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); |
8878 | Term->eraseFromParent(); |
8879 | |
8880 | // Tail folding is not supported for outer loops, so the induction increment |
8881 | // is guaranteed to not wrap. |
8882 | bool HasNUW = true; |
8883 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, |
8884 | DL: DebugLoc()); |
8885 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8886 | return Plan; |
8887 | } |
8888 | |
8889 | // Adjust the recipes for reductions. For in-loop reductions the chain of |
8890 | // instructions leading from the loop exit instr to the phi need to be converted |
8891 | // to reductions, with one operand being vector and the other being the scalar |
8892 | // reduction chain. For other reductions, a select is introduced between the phi |
8893 | // and live-out recipes when folding the tail. |
8894 | // |
8895 | // A ComputeReductionResult recipe is added to the middle block, also for |
8896 | // in-loop reductions which compute their result in-loop, because generating |
8897 | // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. |
8898 | void LoopVectorizationPlanner::adjustRecipesForReductions( |
8899 | VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, |
8900 | ElementCount MinVF) { |
8901 | VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); |
8902 | VPBasicBlock * = VectorLoopRegion->getEntryBasicBlock(); |
8903 | // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores |
8904 | // sank outside of the loop would keep the same order as they had in the |
8905 | // original loop. |
8906 | SmallVector<VPReductionPHIRecipe *> ReductionPHIList; |
8907 | for (VPRecipeBase &R : Header->phis()) { |
8908 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) |
8909 | ReductionPHIList.emplace_back(Args&: ReductionPhi); |
8910 | } |
8911 | bool HasIntermediateStore = false; |
8912 | stable_sort(Range&: ReductionPHIList, |
8913 | C: [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, |
8914 | const VPReductionPHIRecipe *R2) { |
8915 | auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; |
8916 | auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; |
8917 | HasIntermediateStore |= IS1 || IS2; |
8918 | |
8919 | // If neither of the recipes has an intermediate store, keep the |
8920 | // order the same. |
8921 | if (!IS1 && !IS2) |
8922 | return false; |
8923 | |
8924 | // If only one of the recipes has an intermediate store, then |
8925 | // move it towards the beginning of the list. |
8926 | if (IS1 && !IS2) |
8927 | return true; |
8928 | |
8929 | if (!IS1 && IS2) |
8930 | return false; |
8931 | |
8932 | // If both recipes have an intermediate store, then the recipe |
8933 | // with the later store should be processed earlier. So it |
8934 | // should go to the beginning of the list. |
8935 | return DT->dominates(Def: IS2, User: IS1); |
8936 | }); |
8937 | |
8938 | if (HasIntermediateStore && ReductionPHIList.size() > 1) |
8939 | for (VPRecipeBase *R : ReductionPHIList) |
8940 | R->moveBefore(BB&: *Header, I: Header->getFirstNonPhi()); |
8941 | |
8942 | for (VPRecipeBase &R : Header->phis()) { |
8943 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
8944 | if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) |
8945 | continue; |
8946 | |
8947 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
8948 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
8949 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
8950 | "AnyOf reductions are not allowed for in-loop reductions" ); |
8951 | |
8952 | // Collect the chain of "link" recipes for the reduction starting at PhiR. |
8953 | SetVector<VPSingleDefRecipe *> Worklist; |
8954 | Worklist.insert(X: PhiR); |
8955 | for (unsigned I = 0; I != Worklist.size(); ++I) { |
8956 | VPSingleDefRecipe *Cur = Worklist[I]; |
8957 | for (VPUser *U : Cur->users()) { |
8958 | auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(Val: U); |
8959 | if (!UserRecipe) { |
8960 | assert(isa<VPLiveOut>(U) && |
8961 | "U must either be a VPSingleDef or VPLiveOut" ); |
8962 | continue; |
8963 | } |
8964 | Worklist.insert(X: UserRecipe); |
8965 | } |
8966 | } |
8967 | |
8968 | // Visit operation "Links" along the reduction chain top-down starting from |
8969 | // the phi until LoopExitValue. We keep track of the previous item |
8970 | // (PreviousLink) to tell which of the two operands of a Link will remain |
8971 | // scalar and which will be reduced. For minmax by select(cmp), Link will be |
8972 | // the select instructions. Blend recipes of in-loop reduction phi's will |
8973 | // get folded to their non-phi operand, as the reduction recipe handles the |
8974 | // condition directly. |
8975 | VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. |
8976 | for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { |
8977 | Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); |
8978 | |
8979 | // Index of the first operand which holds a non-mask vector operand. |
8980 | unsigned IndexOfFirstOperand; |
8981 | // Recognize a call to the llvm.fmuladd intrinsic. |
8982 | bool IsFMulAdd = (Kind == RecurKind::FMulAdd); |
8983 | VPValue *VecOp; |
8984 | VPBasicBlock *LinkVPBB = CurrentLink->getParent(); |
8985 | if (IsFMulAdd) { |
8986 | assert( |
8987 | RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && |
8988 | "Expected instruction to be a call to the llvm.fmuladd intrinsic" ); |
8989 | assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || |
8990 | isa<VPWidenCallRecipe>(CurrentLink)) && |
8991 | CurrentLink->getOperand(2) == PreviousLink && |
8992 | "expected a call where the previous link is the added operand" ); |
8993 | |
8994 | // If the instruction is a call to the llvm.fmuladd intrinsic then we |
8995 | // need to create an fmul recipe (multiplying the first two operands of |
8996 | // the fmuladd together) to use as the vector operand for the fadd |
8997 | // reduction. |
8998 | VPInstruction *FMulRecipe = new VPInstruction( |
8999 | Instruction::FMul, |
9000 | {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)}, |
9001 | CurrentLinkI->getFastMathFlags()); |
9002 | LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator()); |
9003 | VecOp = FMulRecipe; |
9004 | } else { |
9005 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink); |
9006 | if (PhiR->isInLoop() && Blend) { |
9007 | assert(Blend->getNumIncomingValues() == 2 && |
9008 | "Blend must have 2 incoming values" ); |
9009 | if (Blend->getIncomingValue(Idx: 0) == PhiR) |
9010 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1)); |
9011 | else { |
9012 | assert(Blend->getIncomingValue(1) == PhiR && |
9013 | "PhiR must be an operand of the blend" ); |
9014 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0)); |
9015 | } |
9016 | continue; |
9017 | } |
9018 | |
9019 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9020 | if (isa<VPWidenRecipe>(Val: CurrentLink)) { |
9021 | assert(isa<CmpInst>(CurrentLinkI) && |
9022 | "need to have the compare of the select" ); |
9023 | continue; |
9024 | } |
9025 | assert(isa<VPWidenSelectRecipe>(CurrentLink) && |
9026 | "must be a select recipe" ); |
9027 | IndexOfFirstOperand = 1; |
9028 | } else { |
9029 | assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && |
9030 | "Expected to replace a VPWidenSC" ); |
9031 | IndexOfFirstOperand = 0; |
9032 | } |
9033 | // Note that for non-commutable operands (cmp-selects), the semantics of |
9034 | // the cmp-select are captured in the recurrence kind. |
9035 | unsigned VecOpId = |
9036 | CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink |
9037 | ? IndexOfFirstOperand + 1 |
9038 | : IndexOfFirstOperand; |
9039 | VecOp = CurrentLink->getOperand(N: VecOpId); |
9040 | assert(VecOp != PreviousLink && |
9041 | CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - |
9042 | (VecOpId - IndexOfFirstOperand)) == |
9043 | PreviousLink && |
9044 | "PreviousLink must be the operand other than VecOp" ); |
9045 | } |
9046 | |
9047 | BasicBlock *BB = CurrentLinkI->getParent(); |
9048 | VPValue *CondOp = nullptr; |
9049 | if (CM.blockNeedsPredicationForAnyReason(BB)) |
9050 | CondOp = RecipeBuilder.getBlockInMask(BB); |
9051 | |
9052 | VPReductionRecipe *RedRecipe = |
9053 | new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, |
9054 | CondOp, CM.useOrderedReductions(RdxDesc)); |
9055 | // Append the recipe to the end of the VPBasicBlock because we need to |
9056 | // ensure that it comes after all of it's inputs, including CondOp. |
9057 | // Note that this transformation may leave over dead recipes (including |
9058 | // CurrentLink), which will be cleaned by a later VPlan transform. |
9059 | LinkVPBB->appendRecipe(Recipe: RedRecipe); |
9060 | CurrentLink->replaceAllUsesWith(New: RedRecipe); |
9061 | PreviousLink = RedRecipe; |
9062 | } |
9063 | } |
9064 | Builder.setInsertPoint(&*LatchVPBB->begin()); |
9065 | for (VPRecipeBase &R : |
9066 | Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
9067 | VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
9068 | if (!PhiR) |
9069 | continue; |
9070 | |
9071 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
9072 | // If tail is folded by masking, introduce selects between the phi |
9073 | // and the live-out instruction of each reduction, at the beginning of the |
9074 | // dedicated latch block. |
9075 | auto *OrigExitingVPV = PhiR->getBackedgeValue(); |
9076 | auto *NewExitingVPV = PhiR->getBackedgeValue(); |
9077 | if (!PhiR->isInLoop() && CM.foldTailByMasking()) { |
9078 | VPValue *Cond = RecipeBuilder.getBlockInMask(BB: OrigLoop->getHeader()); |
9079 | assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && |
9080 | "reduction recipe must be defined before latch" ); |
9081 | Type *PhiTy = PhiR->getOperand(N: 0)->getLiveInIRValue()->getType(); |
9082 | std::optional<FastMathFlags> FMFs = |
9083 | PhiTy->isFloatingPointTy() |
9084 | ? std::make_optional(t: RdxDesc.getFastMathFlags()) |
9085 | : std::nullopt; |
9086 | NewExitingVPV = |
9087 | Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "" , FMFs); |
9088 | OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) { |
9089 | return isa<VPInstruction>(Val: &U) && |
9090 | cast<VPInstruction>(Val: &U)->getOpcode() == |
9091 | VPInstruction::ComputeReductionResult; |
9092 | }); |
9093 | if (PreferPredicatedReductionSelect || |
9094 | TTI.preferPredicatedReductionSelect( |
9095 | Opcode: PhiR->getRecurrenceDescriptor().getOpcode(), Ty: PhiTy, |
9096 | Flags: TargetTransformInfo::ReductionFlags())) |
9097 | PhiR->setOperand(I: 1, New: NewExitingVPV); |
9098 | } |
9099 | |
9100 | // If the vector reduction can be performed in a smaller type, we truncate |
9101 | // then extend the loop exit value to enable InstCombine to evaluate the |
9102 | // entire expression in the smaller type. |
9103 | Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); |
9104 | if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { |
9105 | assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!" ); |
9106 | Type *RdxTy = RdxDesc.getRecurrenceType(); |
9107 | auto *Trunc = |
9108 | new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); |
9109 | auto *Extnd = |
9110 | RdxDesc.isSigned() |
9111 | ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) |
9112 | : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); |
9113 | |
9114 | Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe()); |
9115 | Extnd->insertAfter(InsertPos: Trunc); |
9116 | if (PhiR->getOperand(N: 1) == NewExitingVPV) |
9117 | PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue()); |
9118 | NewExitingVPV = Extnd; |
9119 | } |
9120 | |
9121 | // We want code in the middle block to appear to execute on the location of |
9122 | // the scalar loop's latch terminator because: (a) it is all compiler |
9123 | // generated, (b) these instructions are always executed after evaluating |
9124 | // the latch conditional branch, and (c) other passes may add new |
9125 | // predecessors which terminate on this line. This is the easiest way to |
9126 | // ensure we don't accidentally cause an extra step back into the loop while |
9127 | // debugging. |
9128 | DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); |
9129 | |
9130 | // TODO: At the moment ComputeReductionResult also drives creation of the |
9131 | // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here |
9132 | // even for in-loop reductions, until the reduction resume value handling is |
9133 | // also modeled in VPlan. |
9134 | auto *FinalReductionResult = new VPInstruction( |
9135 | VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); |
9136 | cast<VPBasicBlock>(Val: VectorLoopRegion->getSingleSuccessor()) |
9137 | ->appendRecipe(Recipe: FinalReductionResult); |
9138 | OrigExitingVPV->replaceUsesWithIf( |
9139 | New: FinalReductionResult, |
9140 | ShouldReplace: [](VPUser &User, unsigned) { return isa<VPLiveOut>(Val: &User); }); |
9141 | } |
9142 | |
9143 | VPlanTransforms::clearReductionWrapFlags(Plan&: *Plan); |
9144 | } |
9145 | |
9146 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
9147 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
9148 | VPSlotTracker &SlotTracker) const { |
9149 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
9150 | IG->getInsertPos()->printAsOperand(O, PrintType: false); |
9151 | O << ", " ; |
9152 | getAddr()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9153 | VPValue *Mask = getMask(); |
9154 | if (Mask) { |
9155 | O << ", " ; |
9156 | Mask->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9157 | } |
9158 | |
9159 | unsigned OpIdx = 0; |
9160 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
9161 | if (!IG->getMember(Index: i)) |
9162 | continue; |
9163 | if (getNumStoreOperands() > 0) { |
9164 | O << "\n" << Indent << " store " ; |
9165 | getOperand(N: 1 + OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9166 | O << " to index " << i; |
9167 | } else { |
9168 | O << "\n" << Indent << " " ; |
9169 | getVPValue(I: OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9170 | O << " = load from index " << i; |
9171 | } |
9172 | ++OpIdx; |
9173 | } |
9174 | } |
9175 | #endif |
9176 | |
/// Generate a widened pointer induction: a scalar pointer phi in the vector
/// loop header that advances by Step * VF * UF per vector iteration, plus one
/// vector GEP per unroll part containing the per-lane addresses.
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!" );
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type." );
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced" );

  // Anchor the new phi next to the canonical IV phi in the header block.
  auto *IVR = getParent()->getPlan()->getCanonicalIV();
  PHINode *CanonicalIV = cast<PHINode>(Val: State.get(Def: IVR, Part: 0, /*IsScalar*/ true));
  // Offsets are computed in the step's type (an integer type).
  Type *PhiType = IndDesc.getStep()->getType();

  // Build a pointer phi
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();
  PHINode *NewPointerPhi = PHINode::Create(Ty: ScStValueType, NumReservedValues: 2, NameStr: "pointer.phi" ,
                                           InsertBefore: CanonicalIV->getIterator());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
  NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH);

  // A pointer induction, performed by using a gep
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();

  // Scalar step (operand 1), uniform across lanes (asserted per part below).
  Value *ScalarStepValue = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
  Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF);
  // Total per-vector-iteration advance: Step * (VF * UF), as an i8 GEP offset.
  Value *NumUnrolledElems =
      State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: State.UF));
  Value *InductionGEP = GetElementPtrInst::Create(
      PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
      IdxList: State.Builder.CreateMul(LHS: ScalarStepValue, RHS: NumUnrolledElems), NameStr: "ptr.ind" ,
      InsertBefore: InductionLoc);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH);

  // Create UF many actual address geps that use the pointer
  // phi as base and a vectorized version of the step value
  // (<step*0, ..., step*N>) as offset.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF);
    // First lane of this part starts Part * VF elements past the phi value.
    Value *StartOffsetScalar =
        State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: Part));
    Value *StartOffset =
        State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar);
    // Create a vector of consecutive numbers from zero to VF.
    StartOffset = State.Builder.CreateAdd(
        LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType));

    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
           "scalar step must be the same across all parts" );
    // Per-lane address: phi + (StartOffset + stepvector) * Step bytes.
    Value *GEP = State.Builder.CreateGEP(
        Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
        IdxList: State.Builder.CreateMul(
            LHS: StartOffset,
            RHS: State.Builder.CreateVectorSplat(EC: State.VF, V: ScalarStepValue),
            Name: "vector.gep" ));
    State.set(Def: this, V: GEP, Part);
  }
}
9240 | |
9241 | void VPDerivedIVRecipe::execute(VPTransformState &State) { |
9242 | assert(!State.Instance && "VPDerivedIVRecipe being replicated." ); |
9243 | |
9244 | // Fast-math-flags propagate from the original induction instruction. |
9245 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
9246 | if (FPBinOp) |
9247 | State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); |
9248 | |
9249 | Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0)); |
9250 | Value *CanonicalIV = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0)); |
9251 | Value *DerivedIV = emitTransformedIndex( |
9252 | B&: State.Builder, Index: CanonicalIV, StartValue: getStartValue()->getLiveInIRValue(), Step, |
9253 | InductionKind: Kind, InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp)); |
9254 | DerivedIV->setName("offset.idx" ); |
9255 | assert(DerivedIV != CanonicalIV && "IV didn't need transforming?" ); |
9256 | |
9257 | State.set(Def: this, V: DerivedIV, Instance: VPIteration(0, 0)); |
9258 | } |
9259 | |
/// Emit the wide loads/stores for the whole interleave group by delegating to
/// the inner-loop vectorizer; the group is always generated as a unit, never
/// replicated per lane.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated." );
  State.ILV->vectorizeInterleaveGroup(Group: IG, VPDefs: definedValues(), State, Addr: getAddr(),
                                      StoredValues: getStoredValues(), BlockInMask: getMask(),
                                      NeedsMaskForGaps);
}
9266 | |
/// Generate scalar instances of the wrapped instruction. Four cases, in
/// order: a single requested instance (State.Instance set), a recipe uniform
/// across parts, a store to a uniform address (last copy only), and full
/// per-part/per-lane scalarization.
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" );
    State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: *State.Instance, State);
    // Insert scalar instance packing it into a vector.
    if (State.VF.isVector() && shouldPack()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable." );
        Value *Poison = PoisonValue::get(
            T: VectorType::get(ElementType: UI->getType(), EC: State.VF));
        State.set(Def: this, V: Poison, Part: State.Instance->Part);
      }
      State.packScalarIntoVectorValue(Def: this, Instance: *State.Instance);
    }
    return;
  }

  if (IsUniform) {
    // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
    if ((isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI)) &&
        all_of(Range: operands(), P: [](VPValue *Op) {
          return Op->isDefinedOutsideVectorRegions();
        })) {
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(0, 0), State);
      // Propagate the single generated value to all parts, but only when the
      // recipe has users that will look it up.
      if (user_begin() != user_end()) {
        for (unsigned Part = 1; Part < State.UF; ++Part)
          State.set(Def: this, V: State.get(Def: this, Instance: VPIteration(0, 0)),
                    Instance: VPIteration(Part, 0));
      }
      return;
    }

    // Uniform within VL means we need to generate lane 0 only for each
    // unrolled copy.
    for (unsigned Part = 0; Part < State.UF; ++Part)
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, 0), State);
    return;
  }

  // A store of a loop varying value to a uniform address only needs the last
  // copy of the store.
  if (isa<StoreInst>(Val: UI) &&
      vputils::isUniformAfterVectorization(VPV: getOperand(N: 1))) {
    auto Lane = VPLane::getLastLaneForVF(VF: State.VF);
    State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(State.UF - 1, Lane),
                                    State);
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts.
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" );
  const unsigned EndLane = State.VF.getKnownMinValue();
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, Lane), State);
}
9326 | |
/// Generate a wide load per unroll part: a masked gather for non-consecutive
/// accesses, a masked load when a block-in mask applies, or a plain aligned
/// wide load otherwise. Reversed accesses reverse both the mask and the
/// loaded value.
void VPWidenLoadRecipe::execute(VPTransformState &State) {
  auto *LI = cast<LoadInst>(Val: &Ingredient);

  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
  // Non-consecutive accesses are emitted as gathers over per-lane pointers.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    Value *Mask = nullptr;
    if (auto *VPMask = getMask()) {
      // Mask reversal is only needed for non-all-one (null) masks, as reverse
      // of a null all-one mask is a null mask.
      Mask = State.get(Def: VPMask, Part);
      if (isReverse())
        Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" );
    }

    // Gathers take a vector of pointers; consecutive loads take the scalar
    // pointer to the first element.
    Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateGather);
    if (CreateGather) {
      NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
                                         Name: "wide.masked.gather" );
    } else if (Mask) {
      NewLI = Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
                                       PassThru: PoisonValue::get(T: DataTy),
                                       Name: "wide.masked.load" );
    } else {
      NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" );
    }
    // Add metadata to the load, but setVectorValue to the reverse shuffle.
    State.addMetadata(To: NewLI, From: LI);
    if (Reverse)
      NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" );
    State.set(Def: this, V: NewLI, Part);
  }
}
9366 | |
// Generate a single vector-predicated (VP) load for the vector loop body.
// Only UF == 1 is supported because the explicit vector length (EVL) operand
// already covers the remaining trip count.
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length." );
  // FIXME: Support reverse loading after vp_reverse is added.
  assert(!isReverse() && "Reverse loads are not implemented yet." );

  auto *LI = cast<LoadInst>(Val: &Ingredient);

  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
  // Non-consecutive accesses are widened into a vp.gather instead of a
  // contiguous VP load.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  CallInst *NewLI;
  // The EVL operand is uniform; read the scalar value of lane 0 of part 0.
  Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
  Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateGather);
  // VP intrinsics require an explicit mask; synthesize an all-true splat when
  // the recipe carries none.
  Value *Mask = getMask()
                    ? State.get(Def: getMask(), Part: 0)
                    : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                nullptr, "wide.masked.gather" );
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewLI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
        Opcode: Instruction::Load, ReturnTy: DataTy, VecOpArray: Addr, Name: "vp.op.load" ));
  }
  // Attach the alignment to the pointer argument (operand 0) of the VP
  // load/gather intrinsic.
  NewLI->addParamAttr(
      ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
  State.addMetadata(To: NewLI, From: LI);
  State.set(Def: this, V: NewLI, Part: 0);
}
9403 | |
// Widen a store into either a wide (possibly masked) store for consecutive
// accesses, or a masked scatter for non-consecutive ones, emitting one wide
// instruction per unroll part.
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  auto *SI = cast<StoreInst>(Val: &Ingredient);

  VPValue *StoredVPValue = getStoredValue();
  // Non-consecutive accesses are widened into a scatter of per-lane pointers.
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Instruction *NewSI = nullptr;
    Value *Mask = nullptr;
    if (auto *VPMask = getMask()) {
      // Mask reversal is only needed for non-all-one (null) masks, as reverse
      // of a null all-one mask is a null mask.
      Mask = State.get(Def: VPMask, Part);
      if (isReverse())
        Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" );
    }

    Value *StoredVal = State.get(Def: StoredVPValue, Part);
    if (isReverse()) {
      // If we store to reverse consecutive memory locations, then we need
      // to reverse the order of elements in the stored value.
      StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" );
      // We don't want to update the value in the map as it might be used in
      // another expression. So don't call resetVectorValue(StoredVal).
    }
    // For a scatter the address is a vector of pointers; otherwise it is the
    // scalar pointer to the first element of the part.
    Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateScatter);
    if (CreateScatter)
      NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
    else if (Mask)
      NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
    else
      NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
    // Carry the original store's metadata (e.g. aliasing info) over to the
    // widened instruction.
    State.addMetadata(To: NewSI, From: SI);
  }
}
9443 | |
// Generate a single vector-predicated (VP) store for the vector loop body.
// Only UF == 1 is supported because the explicit vector length (EVL) operand
// already covers the remaining trip count.
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length." );
  // FIXME: Support reverse stores after vp_reverse is added.
  assert(!isReverse() && "Reverse store are not implemented yet." );

  auto *SI = cast<StoreInst>(Val: &Ingredient);

  VPValue *StoredValue = getStoredValue();
  // Non-consecutive accesses are widened into a vp.scatter instead of a
  // contiguous VP store.
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue, Part: 0);
  // The EVL operand is uniform; read the scalar value of lane 0 of part 0.
  Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
  // FIXME: Support reverse store after vp_reverse is added.
  // VP intrinsics require an explicit mask; synthesize an all-true splat when
  // the recipe carries none.
  Value *Mask = getMask()
                    ? State.get(Def: getMask(), Part: 0)
                    : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(C&: EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewSI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
        Opcode: Instruction::Store, ReturnTy: Type::getVoidTy(C&: EVL->getContext()),
        VecOpArray: {StoredVal, Addr}));
  }
  // Attach the alignment to the pointer argument (operand 1) of the VP
  // store/scatter intrinsic.
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  State.addMetadata(To: NewSI, From: SI);
}
9482 | |
9483 | // Determine how to lower the scalar epilogue, which depends on 1) optimising |
9484 | // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing |
9485 | // predication, and 4) a TTI hook that analyses whether the loop is suitable |
9486 | // for predication. |
9487 | static ScalarEpilogueLowering getScalarEpilogueLowering( |
9488 | Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, |
9489 | BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
9490 | LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { |
9491 | // 1) OptSize takes precedence over all other options, i.e. if this is set, |
9492 | // don't look at hints or options, and don't request a scalar epilogue. |
9493 | // (For PGSO, as shouldOptimizeForSize isn't currently accessible from |
9494 | // LoopAccessInfo (due to code dependency and not being able to reliably get |
9495 | // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection |
9496 | // of strides in LoopAccessInfo::analyzeLoop() and vectorize without |
9497 | // versioning when the vectorization is forced, unlike hasOptSize. So revert |
9498 | // back to the old way and vectorize with versioning when forced. See D81345.) |
9499 | if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI, |
9500 | QueryType: PGSOQueryType::IRPass) && |
9501 | Hints.getForce() != LoopVectorizeHints::FK_Enabled)) |
9502 | return CM_ScalarEpilogueNotAllowedOptSize; |
9503 | |
9504 | // 2) If set, obey the directives |
9505 | if (PreferPredicateOverEpilogue.getNumOccurrences()) { |
9506 | switch (PreferPredicateOverEpilogue) { |
9507 | case PreferPredicateTy::ScalarEpilogue: |
9508 | return CM_ScalarEpilogueAllowed; |
9509 | case PreferPredicateTy::PredicateElseScalarEpilogue: |
9510 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9511 | case PreferPredicateTy::PredicateOrDontVectorize: |
9512 | return CM_ScalarEpilogueNotAllowedUsePredicate; |
9513 | }; |
9514 | } |
9515 | |
9516 | // 3) If set, obey the hints |
9517 | switch (Hints.getPredicate()) { |
9518 | case LoopVectorizeHints::FK_Enabled: |
9519 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9520 | case LoopVectorizeHints::FK_Disabled: |
9521 | return CM_ScalarEpilogueAllowed; |
9522 | }; |
9523 | |
9524 | // 4) if the TTI hook indicates this is profitable, request predication. |
9525 | TailFoldingInfo TFI(TLI, &LVL, IAI); |
9526 | if (TTI->preferPredicateOverEpilogue(TFI: &TFI)) |
9527 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9528 | |
9529 | return CM_ScalarEpilogueAllowed; |
9530 | } |
9531 | |
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
// Returns true if the (outer) loop was vectorized, false otherwise.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // A computable trip count is required for outer-loop vectorization.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n" );
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled." );
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);

  // Scope for the vectorizer and its runtime checks; the actual code
  // generation for the chosen plan happens here.
  {
    bool AddBranchWeights =
        hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout(), AddBranchWeights);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n" );
    LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
  }

  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
9598 | |
9599 | // Emit a remark if there are stores to floats that required a floating point |
9600 | // extension. If the vectorized loop was generated with floating point there |
9601 | // will be a performance penalty from the conversion overhead and the change in |
9602 | // the vector width. |
9603 | static void (Loop *L, OptimizationRemarkEmitter *ORE) { |
9604 | SmallVector<Instruction *, 4> Worklist; |
9605 | for (BasicBlock *BB : L->getBlocks()) { |
9606 | for (Instruction &Inst : *BB) { |
9607 | if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) { |
9608 | if (S->getValueOperand()->getType()->isFloatTy()) |
9609 | Worklist.push_back(Elt: S); |
9610 | } |
9611 | } |
9612 | } |
9613 | |
9614 | // Traverse the floating point stores upwards searching, for floating point |
9615 | // conversions. |
9616 | SmallPtrSet<const Instruction *, 4> Visited; |
9617 | SmallPtrSet<const Instruction *, 4> ; |
9618 | while (!Worklist.empty()) { |
9619 | auto *I = Worklist.pop_back_val(); |
9620 | if (!L->contains(Inst: I)) |
9621 | continue; |
9622 | if (!Visited.insert(Ptr: I).second) |
9623 | continue; |
9624 | |
9625 | // Emit a remark if the floating point store required a floating |
9626 | // point conversion. |
9627 | // TODO: More work could be done to identify the root cause such as a |
9628 | // constant or a function return type and point the user to it. |
9629 | if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second) |
9630 | ORE->emit(RemarkBuilder: [&]() { |
9631 | return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision" , |
9632 | I->getDebugLoc(), L->getHeader()) |
9633 | << "floating point conversion changes vector width. " |
9634 | << "Mixed floating point precision requires an up/down " |
9635 | << "cast that will negatively impact performance." ; |
9636 | }); |
9637 | |
9638 | for (Use &Op : I->operands()) |
9639 | if (auto *OpI = dyn_cast<Instruction>(Val&: Op)) |
9640 | Worklist.push_back(Elt: OpI); |
9641 | } |
9642 | } |
9643 | |
// Decide whether vectorizing with the generated runtime checks (Checks) is
// still profitable for the chosen factor VF, and set
// VF.MinProfitableTripCount to the minimum trip count required for the vector
// loop (including check overhead) to beat the scalar loop.
// Returns false if the runtime checks make vectorization unprofitable.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n" );
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  // The total cost of the scalar loop is
  //   ScalarC * TC
  // where
  // * TC is the actual trip count of the loop.
  // * ScalarC is the cost of a single scalar iteration.
  //
  // The total cost of the vector loop is
  //   RtC + VecC * (TC / VF) + EpiC
  // where
  // * RtC is the cost of the generated runtime checks
  // * VecC is the cost of a single vector iteration.
  // * TC is the actual trip count of the loop
  // * VF is the vectorization factor
  // * EpiCost is the cost of the generated epilogue, including the cost
  //   of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    // For scalable VFs, scale the known-minimum element count by the assumed
    // vscale (1 if the target provides no tuning value).
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  uint64_t RtC = *CheckCost.getValue();
  uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n" );

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(LHS: ElementCount::getFixed(MinVal: *ExpectedTC),
                                RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n" );

      return false;
    }
  }
  return true;
}
9745 | |
// Initialize the pass from its options: only interleave/vectorize when
// explicitly forced if the option requests it, or if the corresponding
// global enable flag is off.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
9751 | |
9752 | bool LoopVectorizePass::processLoop(Loop *L) { |
9753 | assert((EnableVPlanNativePath || L->isInnermost()) && |
9754 | "VPlan-native path is not enabled. Only process inner loops." ); |
9755 | |
9756 | #ifndef NDEBUG |
9757 | const std::string DebugLocStr = getDebugLocString(L); |
9758 | #endif /* NDEBUG */ |
9759 | |
9760 | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" |
9761 | << L->getHeader()->getParent()->getName() << "' from " |
9762 | << DebugLocStr << "\n" ); |
9763 | |
9764 | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); |
9765 | |
9766 | LLVM_DEBUG( |
9767 | dbgs() << "LV: Loop hints:" |
9768 | << " force=" |
9769 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled |
9770 | ? "disabled" |
9771 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled |
9772 | ? "enabled" |
9773 | : "?" )) |
9774 | << " width=" << Hints.getWidth() |
9775 | << " interleave=" << Hints.getInterleave() << "\n" ); |
9776 | |
9777 | // Function containing loop |
9778 | Function *F = L->getHeader()->getParent(); |
9779 | |
9780 | // Looking at the diagnostic output is the only way to determine if a loop |
9781 | // was vectorized (other than looking at the IR or machine code), so it |
9782 | // is important to generate an optimization remark for each loop. Most of |
9783 | // these messages are generated as OptimizationRemarkAnalysis. Remarks |
9784 | // generated as OptimizationRemark and OptimizationRemarkMissed are |
9785 | // less verbose reporting vectorized loops and unvectorized loops that may |
9786 | // benefit from vectorization, respectively. |
9787 | |
9788 | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
9789 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n" ); |
9790 | return false; |
9791 | } |
9792 | |
9793 | PredicatedScalarEvolution PSE(*SE, *L); |
9794 | |
9795 | // Check if it is legal to vectorize the loop. |
9796 | LoopVectorizationRequirements Requirements; |
9797 | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, |
9798 | &Requirements, &Hints, DB, AC, BFI, PSI); |
9799 | if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) { |
9800 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n" ); |
9801 | Hints.emitRemarkWithHints(); |
9802 | return false; |
9803 | } |
9804 | |
9805 | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
9806 | // here. They may require CFG and instruction level transformations before |
9807 | // even evaluating whether vectorization is profitable. Since we cannot modify |
9808 | // the incoming IR, we need to build VPlan upfront in the vectorization |
9809 | // pipeline. |
9810 | if (!L->isInnermost()) |
9811 | return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC, |
9812 | ORE, BFI, PSI, Hints, Requirements); |
9813 | |
9814 | assert(L->isInnermost() && "Inner loop expected." ); |
9815 | |
9816 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
9817 | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
9818 | |
9819 | // If an override option has been passed in for interleaved accesses, use it. |
9820 | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
9821 | UseInterleaved = EnableInterleavedMemAccesses; |
9822 | |
9823 | // Analyze interleaved memory accesses. |
9824 | if (UseInterleaved) |
9825 | IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI)); |
9826 | |
9827 | // Check the function attributes and profiles to find out if this function |
9828 | // should be optimized for size. |
9829 | ScalarEpilogueLowering SEL = |
9830 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI); |
9831 | |
9832 | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
9833 | // count by optimizing for size, to minimize overheads. |
9834 | auto ExpectedTC = getSmallBestKnownTC(SE&: *SE, L); |
9835 | if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { |
9836 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " |
9837 | << "This loop is worth vectorizing only if no scalar " |
9838 | << "iteration overheads are incurred." ); |
9839 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
9840 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n" ); |
9841 | else { |
9842 | if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { |
9843 | LLVM_DEBUG(dbgs() << "\n" ); |
9844 | // Predicate tail-folded loops are efficient even when the loop |
9845 | // iteration count is low. However, setting the epilogue policy to |
9846 | // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops |
9847 | // with runtime checks. It's more effective to let |
9848 | // `areRuntimeChecksProfitable` determine if vectorization is beneficial |
9849 | // for the loop. |
9850 | if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) |
9851 | SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; |
9852 | } else { |
9853 | LLVM_DEBUG(dbgs() << " But the target considers the trip count too " |
9854 | "small to consider vectorizing.\n" ); |
9855 | reportVectorizationFailure( |
9856 | DebugMsg: "The trip count is below the minial threshold value." , |
9857 | OREMsg: "loop trip count is too low, avoiding vectorization" , |
9858 | ORETag: "LowTripCount" , ORE, TheLoop: L); |
9859 | Hints.emitRemarkWithHints(); |
9860 | return false; |
9861 | } |
9862 | } |
9863 | } |
9864 | |
9865 | // Check the function attributes to see if implicit floats or vectors are |
9866 | // allowed. |
9867 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
9868 | reportVectorizationFailure( |
9869 | DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used" , |
9870 | OREMsg: "loop not vectorized due to NoImplicitFloat attribute" , |
9871 | ORETag: "NoImplicitFloat" , ORE, TheLoop: L); |
9872 | Hints.emitRemarkWithHints(); |
9873 | return false; |
9874 | } |
9875 | |
9876 | // Check if the target supports potentially unsafe FP vectorization. |
9877 | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
9878 | // for the target we're vectorizing for, to make sure none of the |
9879 | // additional fp-math flags can help. |
9880 | if (Hints.isPotentiallyUnsafe() && |
9881 | TTI->isFPVectorizationPotentiallyUnsafe()) { |
9882 | reportVectorizationFailure( |
9883 | DebugMsg: "Potentially unsafe FP op prevents vectorization" , |
9884 | OREMsg: "loop not vectorized due to unsafe FP support." , |
9885 | ORETag: "UnsafeFP" , ORE, TheLoop: L); |
9886 | Hints.emitRemarkWithHints(); |
9887 | return false; |
9888 | } |
9889 | |
9890 | bool AllowOrderedReductions; |
9891 | // If the flag is set, use that instead and override the TTI behaviour. |
9892 | if (ForceOrderedReductions.getNumOccurrences() > 0) |
9893 | AllowOrderedReductions = ForceOrderedReductions; |
9894 | else |
9895 | AllowOrderedReductions = TTI->enableOrderedReductions(); |
9896 | if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) { |
9897 | ORE->emit(RemarkBuilder: [&]() { |
9898 | auto *ExactFPMathInst = Requirements.getExactFPInst(); |
9899 | return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps" , |
9900 | ExactFPMathInst->getDebugLoc(), |
9901 | ExactFPMathInst->getParent()) |
9902 | << "loop not vectorized: cannot prove it is safe to reorder " |
9903 | "floating-point operations" ; |
9904 | }); |
9905 | LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " |
9906 | "reorder floating-point operations\n" ); |
9907 | Hints.emitRemarkWithHints(); |
9908 | return false; |
9909 | } |
9910 | |
9911 | // Use the cost model. |
9912 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, |
9913 | F, &Hints, IAI); |
9914 | // Use the planner for vectorization. |
9915 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, |
9916 | ORE); |
9917 | |
9918 | // Get user vectorization factor and interleave count. |
9919 | ElementCount UserVF = Hints.getWidth(); |
9920 | unsigned UserIC = Hints.getInterleave(); |
9921 | |
9922 | // Plan how to best vectorize, return the best VF and its cost. |
9923 | std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); |
9924 | |
9925 | VectorizationFactor VF = VectorizationFactor::Disabled(); |
9926 | unsigned IC = 1; |
9927 | |
9928 | bool AddBranchWeights = |
9929 | hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator()); |
9930 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, |
9931 | F->getParent()->getDataLayout(), AddBranchWeights); |
9932 | if (MaybeVF) { |
9933 | VF = *MaybeVF; |
9934 | // Select the interleave count. |
9935 | IC = CM.selectInterleaveCount(VF: VF.Width, LoopCost: VF.Cost); |
9936 | |
9937 | unsigned SelectedIC = std::max(a: IC, b: UserIC); |
9938 | // Optimistically generate runtime checks if they are needed. Drop them if |
9939 | // they turn out to not be profitable. |
9940 | if (VF.Width.isVector() || SelectedIC > 1) |
9941 | Checks.Create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC); |
9942 | |
9943 | // Check if it is profitable to vectorize with runtime checks. |
9944 | bool ForceVectorization = |
9945 | Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
9946 | if (!ForceVectorization && |
9947 | !areRuntimeChecksProfitable(Checks, VF, VScale: getVScaleForTuning(L, TTI: *TTI), L, |
9948 | SE&: *PSE.getSE(), SEL)) { |
9949 | ORE->emit(RemarkBuilder: [&]() { |
9950 | return OptimizationRemarkAnalysisAliasing( |
9951 | DEBUG_TYPE, "CantReorderMemOps" , L->getStartLoc(), |
9952 | L->getHeader()) |
9953 | << "loop not vectorized: cannot prove it is safe to reorder " |
9954 | "memory operations" ; |
9955 | }); |
9956 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n" ); |
9957 | Hints.emitRemarkWithHints(); |
9958 | return false; |
9959 | } |
9960 | } |
9961 | |
9962 | // Identify the diagnostic messages that should be produced. |
9963 | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
9964 | bool VectorizeLoop = true, InterleaveLoop = true; |
9965 | if (VF.Width.isScalar()) { |
9966 | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n" ); |
9967 | VecDiagMsg = std::make_pair( |
9968 | x: "VectorizationNotBeneficial" , |
9969 | y: "the cost-model indicates that vectorization is not beneficial" ); |
9970 | VectorizeLoop = false; |
9971 | } |
9972 | |
9973 | if (!MaybeVF && UserIC > 1) { |
9974 | // Tell the user interleaving was avoided up-front, despite being explicitly |
9975 | // requested. |
9976 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " |
9977 | "interleaving should be avoided up front\n" ); |
9978 | IntDiagMsg = std::make_pair( |
9979 | x: "InterleavingAvoided" , |
9980 | y: "Ignoring UserIC, because interleaving was avoided up front" ); |
9981 | InterleaveLoop = false; |
9982 | } else if (IC == 1 && UserIC <= 1) { |
9983 | // Tell the user interleaving is not beneficial. |
9984 | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n" ); |
9985 | IntDiagMsg = std::make_pair( |
9986 | x: "InterleavingNotBeneficial" , |
9987 | y: "the cost-model indicates that interleaving is not beneficial" ); |
9988 | InterleaveLoop = false; |
9989 | if (UserIC == 1) { |
9990 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled" ; |
9991 | IntDiagMsg.second += |
9992 | " and is explicitly disabled or interleave count is set to 1" ; |
9993 | } |
9994 | } else if (IC > 1 && UserIC == 1) { |
9995 | // Tell the user interleaving is beneficial, but it explicitly disabled. |
9996 | LLVM_DEBUG( |
9997 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled." ); |
9998 | IntDiagMsg = std::make_pair( |
9999 | x: "InterleavingBeneficialButDisabled" , |
10000 | y: "the cost-model indicates that interleaving is beneficial " |
10001 | "but is explicitly disabled or interleave count is set to 1" ); |
10002 | InterleaveLoop = false; |
10003 | } |
10004 | |
10005 | // Override IC if user provided an interleave count. |
10006 | IC = UserIC > 0 ? UserIC : IC; |
10007 | |
10008 | // Emit diagnostic messages, if any. |
10009 | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
10010 | if (!VectorizeLoop && !InterleaveLoop) { |
10011 | // Do not vectorize or interleaving the loop. |
10012 | ORE->emit(RemarkBuilder: [&]() { |
10013 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
10014 | L->getStartLoc(), L->getHeader()) |
10015 | << VecDiagMsg.second; |
10016 | }); |
10017 | ORE->emit(RemarkBuilder: [&]() { |
10018 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, |
10019 | L->getStartLoc(), L->getHeader()) |
10020 | << IntDiagMsg.second; |
10021 | }); |
10022 | return false; |
10023 | } else if (!VectorizeLoop && InterleaveLoop) { |
10024 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10025 | ORE->emit(RemarkBuilder: [&]() { |
10026 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
10027 | L->getStartLoc(), L->getHeader()) |
10028 | << VecDiagMsg.second; |
10029 | }); |
10030 | } else if (VectorizeLoop && !InterleaveLoop) { |
10031 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10032 | << ") in " << DebugLocStr << '\n'); |
10033 | ORE->emit(RemarkBuilder: [&]() { |
10034 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, |
10035 | L->getStartLoc(), L->getHeader()) |
10036 | << IntDiagMsg.second; |
10037 | }); |
10038 | } else if (VectorizeLoop && InterleaveLoop) { |
10039 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10040 | << ") in " << DebugLocStr << '\n'); |
10041 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10042 | } |
10043 | |
10044 | bool DisableRuntimeUnroll = false; |
10045 | MDNode *OrigLoopID = L->getLoopID(); |
10046 | { |
10047 | using namespace ore; |
10048 | if (!VectorizeLoop) { |
10049 | assert(IC > 1 && "interleave count should not be 1 or 0" ); |
10050 | // If we decided that it is not legal to vectorize the loop, then |
10051 | // interleave it. |
10052 | InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, |
10053 | &CM, BFI, PSI, Checks); |
10054 | |
10055 | VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width); |
10056 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, IsEpilogueVectorization: false); |
10057 | |
10058 | ORE->emit(RemarkBuilder: [&]() { |
10059 | return OptimizationRemark(LV_NAME, "Interleaved" , L->getStartLoc(), |
10060 | L->getHeader()) |
10061 | << "interleaved loop (interleaved count: " |
10062 | << NV("InterleaveCount" , IC) << ")" ; |
10063 | }); |
10064 | } else { |
10065 | // If we decided that it is *legal* to vectorize the loop, then do it. |
10066 | |
10067 | // Consider vectorizing the epilogue too if it's profitable. |
10068 | VectorizationFactor EpilogueVF = |
10069 | LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC); |
10070 | if (EpilogueVF.Width.isVector()) { |
10071 | |
10072 | // The first pass vectorizes the main loop and creates a scalar epilogue |
10073 | // to be vectorized by executing the plan (potentially with a different |
10074 | // factor) again shortly afterwards. |
10075 | EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); |
10076 | EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, |
10077 | EPI, &LVL, &CM, BFI, PSI, Checks); |
10078 | |
10079 | std::unique_ptr<VPlan> BestMainPlan( |
10080 | LVP.getBestPlanFor(VF: EPI.MainLoopVF).duplicate()); |
10081 | const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( |
10082 | BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, IsEpilogueVectorization: true); |
10083 | ++LoopsVectorized; |
10084 | |
10085 | // Second pass vectorizes the epilogue and adjusts the control flow |
10086 | // edges from the first pass. |
10087 | EPI.MainLoopVF = EPI.EpilogueVF; |
10088 | EPI.MainLoopUF = EPI.EpilogueUF; |
10089 | EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, |
10090 | ORE, EPI, &LVL, &CM, BFI, PSI, |
10091 | Checks); |
10092 | |
10093 | VPlan &BestEpiPlan = LVP.getBestPlanFor(VF: EPI.EpilogueVF); |
10094 | VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); |
10095 | VPBasicBlock * = VectorLoop->getEntryBasicBlock(); |
10096 | Header->setName("vec.epilog.vector.body" ); |
10097 | |
10098 | // Re-use the trip count and steps expanded for the main loop, as |
10099 | // skeleton creation needs it as a value that dominates both the scalar |
10100 | // and vector epilogue loops |
10101 | // TODO: This is a workaround needed for epilogue vectorization and it |
10102 | // should be removed once induction resume value creation is done |
10103 | // directly in VPlan. |
10104 | EpilogILV.setTripCount(MainILV.getTripCount()); |
10105 | for (auto &R : make_early_inc_range(Range&: *BestEpiPlan.getPreheader())) { |
10106 | auto *ExpandR = cast<VPExpandSCEVRecipe>(Val: &R); |
10107 | auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( |
10108 | V: ExpandedSCEVs.find(Val: ExpandR->getSCEV())->second); |
10109 | ExpandR->replaceAllUsesWith(New: ExpandedVal); |
10110 | if (BestEpiPlan.getTripCount() == ExpandR) |
10111 | BestEpiPlan.resetTripCount(NewTripCount: ExpandedVal); |
10112 | ExpandR->eraseFromParent(); |
10113 | } |
10114 | |
10115 | // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, |
10116 | // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated |
10117 | // before vectorizing the epilogue loop. |
10118 | for (VPRecipeBase &R : Header->phis()) { |
10119 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R)) |
10120 | continue; |
10121 | |
10122 | Value *ResumeV = nullptr; |
10123 | // TODO: Move setting of resume values to prepareToExecute. |
10124 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) { |
10125 | ResumeV = ReductionResumeValues |
10126 | .find(Val: &ReductionPhi->getRecurrenceDescriptor()) |
10127 | ->second; |
10128 | } else { |
10129 | // Create induction resume values for both widened pointer and |
10130 | // integer/fp inductions and update the start value of the induction |
10131 | // recipes to use the resume value. |
10132 | PHINode *IndPhi = nullptr; |
10133 | const InductionDescriptor *ID; |
10134 | if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) { |
10135 | IndPhi = cast<PHINode>(Val: Ind->getUnderlyingValue()); |
10136 | ID = &Ind->getInductionDescriptor(); |
10137 | } else { |
10138 | auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(Val: &R); |
10139 | IndPhi = WidenInd->getPHINode(); |
10140 | ID = &WidenInd->getInductionDescriptor(); |
10141 | } |
10142 | |
10143 | ResumeV = MainILV.createInductionResumeValue( |
10144 | OrigPhi: IndPhi, II: *ID, Step: getExpandedStep(ID: *ID, ExpandedSCEVs), |
10145 | BypassBlocks: {EPI.MainLoopIterationCountCheck}); |
10146 | } |
10147 | assert(ResumeV && "Must have a resume value" ); |
10148 | VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(V: ResumeV); |
10149 | cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal); |
10150 | } |
10151 | |
10152 | LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, |
10153 | DT, IsEpilogueVectorization: true, ExpandedSCEVs: &ExpandedSCEVs); |
10154 | ++LoopsEpilogueVectorized; |
10155 | |
10156 | if (!MainILV.areSafetyChecksAdded()) |
10157 | DisableRuntimeUnroll = true; |
10158 | } else { |
10159 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, |
10160 | VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, |
10161 | PSI, Checks); |
10162 | |
10163 | VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width); |
10164 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false); |
10165 | ++LoopsVectorized; |
10166 | |
10167 | // Add metadata to disable runtime unrolling a scalar loop when there |
10168 | // are no runtime checks about strides and memory. A scalar loop that is |
10169 | // rarely used is not worth unrolling. |
10170 | if (!LB.areSafetyChecksAdded()) |
10171 | DisableRuntimeUnroll = true; |
10172 | } |
10173 | // Report the vectorization decision. |
10174 | reportVectorization(ORE, TheLoop: L, VF, IC); |
10175 | } |
10176 | |
10177 | if (ORE->allowExtraAnalysis(LV_NAME)) |
10178 | checkMixedPrecision(L, ORE); |
10179 | } |
10180 | |
10181 | std::optional<MDNode *> RemainderLoopID = |
10182 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
10183 | LLVMLoopVectorizeFollowupEpilogue}); |
10184 | if (RemainderLoopID) { |
10185 | L->setLoopID(*RemainderLoopID); |
10186 | } else { |
10187 | if (DisableRuntimeUnroll) |
10188 | AddRuntimeUnrollDisableMetaData(L); |
10189 | |
10190 | // Mark the loop as already vectorized to avoid vectorizing again. |
10191 | Hints.setAlreadyVectorized(); |
10192 | } |
10193 | |
10194 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
10195 | return true; |
10196 | } |
10197 | |
10198 | LoopVectorizeResult LoopVectorizePass::runImpl( |
10199 | Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, |
10200 | DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, |
10201 | DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, |
10202 | OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { |
10203 | SE = &SE_; |
10204 | LI = &LI_; |
10205 | TTI = &TTI_; |
10206 | DT = &DT_; |
10207 | BFI = BFI_; |
10208 | TLI = TLI_; |
10209 | AC = &AC_; |
10210 | LAIs = &LAIs_; |
10211 | DB = &DB_; |
10212 | ORE = &ORE_; |
10213 | PSI = PSI_; |
10214 | |
10215 | // Don't attempt if |
10216 | // 1. the target claims to have no vector registers, and |
10217 | // 2. interleaving won't help ILP. |
10218 | // |
10219 | // The second condition is necessary because, even if the target has no |
10220 | // vector registers, loop vectorization may still enable scalar |
10221 | // interleaving. |
10222 | if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) && |
10223 | TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2) |
10224 | return LoopVectorizeResult(false, false); |
10225 | |
10226 | bool Changed = false, CFGChanged = false; |
10227 | |
10228 | // The vectorizer requires loops to be in simplified form. |
10229 | // Since simplification may add new inner loops, it has to run before the |
10230 | // legality and profitability checks. This means running the loop vectorizer |
10231 | // will simplify all loops, regardless of whether anything end up being |
10232 | // vectorized. |
10233 | for (const auto &L : *LI) |
10234 | Changed |= CFGChanged |= |
10235 | simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
10236 | |
10237 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
10238 | // the act of vectorizing or partially unrolling a loop creates new loops |
10239 | // and can invalidate iterators across the loops. |
10240 | SmallVector<Loop *, 8> Worklist; |
10241 | |
10242 | for (Loop *L : *LI) |
10243 | collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist); |
10244 | |
10245 | LoopsAnalyzed += Worklist.size(); |
10246 | |
10247 | // Now walk the identified inner loops. |
10248 | while (!Worklist.empty()) { |
10249 | Loop *L = Worklist.pop_back_val(); |
10250 | |
10251 | // For the inner loops we actually process, form LCSSA to simplify the |
10252 | // transform. |
10253 | Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE); |
10254 | |
10255 | Changed |= CFGChanged |= processLoop(L); |
10256 | |
10257 | if (Changed) { |
10258 | LAIs->clear(); |
10259 | |
10260 | #ifndef NDEBUG |
10261 | if (VerifySCEV) |
10262 | SE->verify(); |
10263 | #endif |
10264 | } |
10265 | } |
10266 | |
10267 | // Process each loop nest in the function. |
10268 | return LoopVectorizeResult(Changed, CFGChanged); |
10269 | } |
10270 | |
10271 | PreservedAnalyses LoopVectorizePass::run(Function &F, |
10272 | FunctionAnalysisManager &AM) { |
10273 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
10274 | // There are no loops in the function. Return before computing other expensive |
10275 | // analyses. |
10276 | if (LI.empty()) |
10277 | return PreservedAnalyses::all(); |
10278 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
10279 | auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
10280 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
10281 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F); |
10282 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
10283 | auto &DB = AM.getResult<DemandedBitsAnalysis>(IR&: F); |
10284 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
10285 | |
10286 | LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F); |
10287 | auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
10288 | ProfileSummaryInfo *PSI = |
10289 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
10290 | BlockFrequencyInfo *BFI = nullptr; |
10291 | if (PSI && PSI->hasProfileSummary()) |
10292 | BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F); |
10293 | LoopVectorizeResult Result = |
10294 | runImpl(F, SE_&: SE, LI_&: LI, TTI_&: TTI, DT_&: DT, BFI_: BFI, TLI_: &TLI, DB_&: DB, AC_&: AC, LAIs_&: LAIs, ORE_&: ORE, PSI_: PSI); |
10295 | if (!Result.MadeAnyChange) |
10296 | return PreservedAnalyses::all(); |
10297 | PreservedAnalyses PA; |
10298 | |
10299 | if (isAssignmentTrackingEnabled(M: *F.getParent())) { |
10300 | for (auto &BB : F) |
10301 | RemoveRedundantDbgInstrs(BB: &BB); |
10302 | } |
10303 | |
10304 | // We currently do not preserve loopinfo/dominator analyses with outer loop |
10305 | // vectorization. Until this is addressed, mark these analyses as preserved |
10306 | // only for non-VPlan-native path. |
10307 | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
10308 | if (!EnableVPlanNativePath) { |
10309 | PA.preserve<LoopAnalysis>(); |
10310 | PA.preserve<DominatorTreeAnalysis>(); |
10311 | PA.preserve<ScalarEvolutionAnalysis>(); |
10312 | } |
10313 | |
10314 | if (Result.MadeCFGChange) { |
10315 | // Making CFG changes likely means a loop got vectorized. Indicate that |
10316 | // extra simplification passes should be run. |
10317 | // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only |
10318 | // be run if runtime checks have been added. |
10319 | AM.getResult<ShouldRunExtraVectorPasses>(IR&: F); |
10320 | PA.preserve<ShouldRunExtraVectorPasses>(); |
10321 | } else { |
10322 | PA.preserveSet<CFGAnalyses>(); |
10323 | } |
10324 | return PA; |
10325 | } |
10326 | |
10327 | void LoopVectorizePass::printPipeline( |
10328 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
10329 | static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( |
10330 | OS, MapClassName2PassName); |
10331 | |
10332 | OS << '<'; |
10333 | OS << (InterleaveOnlyWhenForced ? "" : "no-" ) << "interleave-forced-only;" ; |
10334 | OS << (VectorizeOnlyWhenForced ? "" : "no-" ) << "vectorize-forced-only;" ; |
10335 | OS << '>'; |
10336 | } |
10337 | |