1 | //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops |
10 | // and generates target-independent LLVM-IR. |
11 | // The vectorizer uses the TargetTransformInfo analysis to estimate the costs |
12 | // of instructions in order to estimate the profitability of vectorization. |
13 | // |
14 | // The loop vectorizer combines consecutive loop iterations into a single |
15 | // 'wide' iteration. After this transformation the index is incremented |
16 | // by the SIMD vector width, and not by one. |
17 | // |
// This pass has four parts:
19 | // 1. The main loop pass that drives the different parts. |
20 | // 2. LoopVectorizationLegality - A unit that checks for the legality |
21 | // of the vectorization. |
22 | // 3. InnerLoopVectorizer - A unit that performs the actual |
23 | // widening of instructions. |
24 | // 4. LoopVectorizationCostModel - A unit that checks for the profitability |
25 | // of vectorization. It decides on the optimal vector width, which |
26 | // can be one, if vectorization is not profitable. |
27 | // |
28 | // There is a development effort going on to migrate loop vectorizer to the |
29 | // VPlan infrastructure and to introduce outer loop vectorization support (see |
30 | // docs/VectorizationPlan.rst and |
31 | // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this |
32 | // purpose, we temporarily introduced the VPlan-native vectorization path: an |
33 | // alternative vectorization path that is natively implemented on top of the |
34 | // VPlan infrastructure. See EnableVPlanNativePath for enabling. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | // |
38 | // The reduction-variable vectorization is based on the paper: |
39 | // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. |
40 | // |
41 | // Variable uniformity checks are inspired by: |
42 | // Karrenberg, R. and Hack, S. Whole Function Vectorization. |
43 | // |
44 | // The interleaved access vectorization is based on the paper: |
45 | // Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved |
46 | // Data for SIMD |
47 | // |
48 | // Other ideas/concepts are from: |
49 | // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. |
50 | // |
51 | // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of |
52 | // Vectorizing Compilers. |
53 | // |
54 | //===----------------------------------------------------------------------===// |
55 | |
56 | #include "llvm/Transforms/Vectorize/LoopVectorize.h" |
57 | #include "LoopVectorizationPlanner.h" |
58 | #include "VPRecipeBuilder.h" |
59 | #include "VPlan.h" |
60 | #include "VPlanAnalysis.h" |
61 | #include "VPlanHCFGBuilder.h" |
62 | #include "VPlanTransforms.h" |
63 | #include "VPlanVerifier.h" |
64 | #include "llvm/ADT/APInt.h" |
65 | #include "llvm/ADT/ArrayRef.h" |
66 | #include "llvm/ADT/DenseMap.h" |
67 | #include "llvm/ADT/DenseMapInfo.h" |
68 | #include "llvm/ADT/Hashing.h" |
69 | #include "llvm/ADT/MapVector.h" |
70 | #include "llvm/ADT/STLExtras.h" |
71 | #include "llvm/ADT/SmallPtrSet.h" |
72 | #include "llvm/ADT/SmallSet.h" |
73 | #include "llvm/ADT/SmallVector.h" |
74 | #include "llvm/ADT/Statistic.h" |
75 | #include "llvm/ADT/StringRef.h" |
76 | #include "llvm/ADT/Twine.h" |
77 | #include "llvm/ADT/iterator_range.h" |
78 | #include "llvm/Analysis/AssumptionCache.h" |
79 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
80 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
81 | #include "llvm/Analysis/CFG.h" |
82 | #include "llvm/Analysis/CodeMetrics.h" |
83 | #include "llvm/Analysis/DemandedBits.h" |
84 | #include "llvm/Analysis/GlobalsModRef.h" |
85 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
86 | #include "llvm/Analysis/LoopAnalysisManager.h" |
87 | #include "llvm/Analysis/LoopInfo.h" |
88 | #include "llvm/Analysis/LoopIterator.h" |
89 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
90 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
91 | #include "llvm/Analysis/ScalarEvolution.h" |
92 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
93 | #include "llvm/Analysis/TargetLibraryInfo.h" |
94 | #include "llvm/Analysis/TargetTransformInfo.h" |
95 | #include "llvm/Analysis/ValueTracking.h" |
96 | #include "llvm/Analysis/VectorUtils.h" |
97 | #include "llvm/IR/Attributes.h" |
98 | #include "llvm/IR/BasicBlock.h" |
99 | #include "llvm/IR/CFG.h" |
100 | #include "llvm/IR/Constant.h" |
101 | #include "llvm/IR/Constants.h" |
102 | #include "llvm/IR/DataLayout.h" |
103 | #include "llvm/IR/DebugInfo.h" |
104 | #include "llvm/IR/DebugInfoMetadata.h" |
105 | #include "llvm/IR/DebugLoc.h" |
106 | #include "llvm/IR/DerivedTypes.h" |
107 | #include "llvm/IR/DiagnosticInfo.h" |
108 | #include "llvm/IR/Dominators.h" |
109 | #include "llvm/IR/Function.h" |
110 | #include "llvm/IR/IRBuilder.h" |
111 | #include "llvm/IR/InstrTypes.h" |
112 | #include "llvm/IR/Instruction.h" |
113 | #include "llvm/IR/Instructions.h" |
114 | #include "llvm/IR/IntrinsicInst.h" |
115 | #include "llvm/IR/Intrinsics.h" |
116 | #include "llvm/IR/MDBuilder.h" |
117 | #include "llvm/IR/Metadata.h" |
118 | #include "llvm/IR/Module.h" |
119 | #include "llvm/IR/Operator.h" |
120 | #include "llvm/IR/PatternMatch.h" |
121 | #include "llvm/IR/ProfDataUtils.h" |
122 | #include "llvm/IR/Type.h" |
123 | #include "llvm/IR/Use.h" |
124 | #include "llvm/IR/User.h" |
125 | #include "llvm/IR/Value.h" |
126 | #include "llvm/IR/ValueHandle.h" |
127 | #include "llvm/IR/VectorBuilder.h" |
128 | #include "llvm/IR/Verifier.h" |
129 | #include "llvm/Support/Casting.h" |
130 | #include "llvm/Support/CommandLine.h" |
131 | #include "llvm/Support/Compiler.h" |
132 | #include "llvm/Support/Debug.h" |
133 | #include "llvm/Support/ErrorHandling.h" |
134 | #include "llvm/Support/InstructionCost.h" |
135 | #include "llvm/Support/MathExtras.h" |
136 | #include "llvm/Support/raw_ostream.h" |
137 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
138 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
139 | #include "llvm/Transforms/Utils/LoopSimplify.h" |
140 | #include "llvm/Transforms/Utils/LoopUtils.h" |
141 | #include "llvm/Transforms/Utils/LoopVersioning.h" |
142 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
143 | #include "llvm/Transforms/Utils/SizeOpts.h" |
144 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
145 | #include <algorithm> |
146 | #include <cassert> |
147 | #include <cmath> |
148 | #include <cstdint> |
149 | #include <functional> |
150 | #include <iterator> |
151 | #include <limits> |
152 | #include <map> |
153 | #include <memory> |
154 | #include <string> |
155 | #include <tuple> |
156 | #include <utility> |
157 | |
158 | using namespace llvm; |
159 | |
160 | #define LV_NAME "loop-vectorize" |
161 | #define DEBUG_TYPE LV_NAME |
162 | |
163 | #ifndef NDEBUG |
164 | const char VerboseDebug[] = DEBUG_TYPE "-verbose" ; |
165 | #endif |
166 | |
167 | /// @{ |
168 | /// Metadata attribute names |
169 | const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all" ; |
170 | const char LLVMLoopVectorizeFollowupVectorized[] = |
171 | "llvm.loop.vectorize.followup_vectorized" ; |
172 | const char LLVMLoopVectorizeFollowupEpilogue[] = |
173 | "llvm.loop.vectorize.followup_epilogue" ; |
174 | /// @} |
175 | |
176 | STATISTIC(LoopsVectorized, "Number of loops vectorized" ); |
177 | STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization" ); |
178 | STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized" ); |
179 | |
180 | static cl::opt<bool> EnableEpilogueVectorization( |
181 | "enable-epilogue-vectorization" , cl::init(Val: true), cl::Hidden, |
182 | cl::desc("Enable vectorization of epilogue loops." )); |
183 | |
184 | static cl::opt<unsigned> EpilogueVectorizationForceVF( |
185 | "epilogue-vectorization-force-VF" , cl::init(Val: 1), cl::Hidden, |
186 | cl::desc("When epilogue vectorization is enabled, and a value greater than " |
187 | "1 is specified, forces the given VF for all applicable epilogue " |
188 | "loops." )); |
189 | |
190 | static cl::opt<unsigned> EpilogueVectorizationMinVF( |
191 | "epilogue-vectorization-minimum-VF" , cl::init(Val: 16), cl::Hidden, |
192 | cl::desc("Only loops with vectorization factor equal to or larger than " |
193 | "the specified value are considered for epilogue vectorization." )); |
194 | |
195 | /// Loops with a known constant trip count below this number are vectorized only |
196 | /// if no scalar iteration overheads are incurred. |
197 | static cl::opt<unsigned> TinyTripCountVectorThreshold( |
198 | "vectorizer-min-trip-count" , cl::init(Val: 16), cl::Hidden, |
199 | cl::desc("Loops with a constant trip count that is smaller than this " |
200 | "value are vectorized only if no scalar iteration overheads " |
201 | "are incurred." )); |
202 | |
203 | static cl::opt<unsigned> VectorizeMemoryCheckThreshold( |
204 | "vectorize-memory-check-threshold" , cl::init(Val: 128), cl::Hidden, |
205 | cl::desc("The maximum allowed number of runtime memory checks" )); |
206 | |
207 | // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, |
208 | // that predication is preferred, and this lists all options. I.e., the |
209 | // vectorizer will try to fold the tail-loop (epilogue) into the vector body |
210 | // and predicate the instructions accordingly. If tail-folding fails, there are |
211 | // different fallback strategies depending on these values: |
212 | namespace PreferPredicateTy { |
213 | enum Option { |
214 | ScalarEpilogue = 0, |
215 | PredicateElseScalarEpilogue, |
216 | PredicateOrDontVectorize |
217 | }; |
218 | } // namespace PreferPredicateTy |
219 | |
220 | static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue( |
221 | "prefer-predicate-over-epilogue" , |
222 | cl::init(Val: PreferPredicateTy::ScalarEpilogue), |
223 | cl::Hidden, |
224 | cl::desc("Tail-folding and predication preferences over creating a scalar " |
225 | "epilogue loop." ), |
226 | cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, |
227 | "scalar-epilogue" , |
228 | "Don't tail-predicate loops, create scalar epilogue" ), |
229 | clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, |
230 | "predicate-else-scalar-epilogue" , |
231 | "prefer tail-folding, create scalar epilogue if tail " |
232 | "folding fails." ), |
233 | clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, |
234 | "predicate-dont-vectorize" , |
235 | "prefers tail-folding, don't attempt vectorization if " |
236 | "tail-folding fails." ))); |
237 | |
238 | static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( |
239 | "force-tail-folding-style" , cl::desc("Force the tail folding style" ), |
240 | cl::init(Val: TailFoldingStyle::None), |
241 | cl::values( |
242 | clEnumValN(TailFoldingStyle::None, "none" , "Disable tail folding" ), |
243 | clEnumValN( |
244 | TailFoldingStyle::Data, "data" , |
245 | "Create lane mask for data only, using active.lane.mask intrinsic" ), |
246 | clEnumValN(TailFoldingStyle::DataWithoutLaneMask, |
247 | "data-without-lane-mask" , |
248 | "Create lane mask with compare/stepvector" ), |
249 | clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control" , |
250 | "Create lane mask using active.lane.mask intrinsic, and use " |
251 | "it for both data and control flow" ), |
252 | clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, |
253 | "data-and-control-without-rt-check" , |
254 | "Similar to data-and-control, but remove the runtime check" ), |
255 | clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl" , |
256 | "Use predicated EVL instructions for tail folding. If EVL " |
257 | "is unsupported, fallback to data-without-lane-mask." ))); |
258 | |
259 | static cl::opt<bool> MaximizeBandwidth( |
260 | "vectorizer-maximize-bandwidth" , cl::init(Val: false), cl::Hidden, |
261 | cl::desc("Maximize bandwidth when selecting vectorization factor which " |
262 | "will be determined by the smallest type in loop." )); |
263 | |
264 | static cl::opt<bool> EnableInterleavedMemAccesses( |
265 | "enable-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
266 | cl::desc("Enable vectorization on interleaved memory accesses in a loop" )); |
267 | |
268 | /// An interleave-group may need masking if it resides in a block that needs |
269 | /// predication, or in order to mask away gaps. |
270 | static cl::opt<bool> EnableMaskedInterleavedMemAccesses( |
271 | "enable-masked-interleaved-mem-accesses" , cl::init(Val: false), cl::Hidden, |
272 | cl::desc("Enable vectorization on masked interleaved memory accesses in a loop" )); |
273 | |
274 | static cl::opt<unsigned> ForceTargetNumScalarRegs( |
275 | "force-target-num-scalar-regs" , cl::init(Val: 0), cl::Hidden, |
276 | cl::desc("A flag that overrides the target's number of scalar registers." )); |
277 | |
278 | static cl::opt<unsigned> ForceTargetNumVectorRegs( |
279 | "force-target-num-vector-regs" , cl::init(Val: 0), cl::Hidden, |
280 | cl::desc("A flag that overrides the target's number of vector registers." )); |
281 | |
282 | static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor( |
283 | "force-target-max-scalar-interleave" , cl::init(Val: 0), cl::Hidden, |
284 | cl::desc("A flag that overrides the target's max interleave factor for " |
285 | "scalar loops." )); |
286 | |
287 | static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor( |
288 | "force-target-max-vector-interleave" , cl::init(Val: 0), cl::Hidden, |
289 | cl::desc("A flag that overrides the target's max interleave factor for " |
290 | "vectorized loops." )); |
291 | |
292 | static cl::opt<unsigned> ForceTargetInstructionCost( |
293 | "force-target-instruction-cost" , cl::init(Val: 0), cl::Hidden, |
294 | cl::desc("A flag that overrides the target's expected cost for " |
295 | "an instruction to a single constant value. Mostly " |
296 | "useful for getting consistent testing." )); |
297 | |
298 | static cl::opt<bool> ForceTargetSupportsScalableVectors( |
299 | "force-target-supports-scalable-vectors" , cl::init(Val: false), cl::Hidden, |
300 | cl::desc( |
301 | "Pretend that scalable vectors are supported, even if the target does " |
302 | "not support them. This flag should only be used for testing." )); |
303 | |
304 | static cl::opt<unsigned> SmallLoopCost( |
305 | "small-loop-cost" , cl::init(Val: 20), cl::Hidden, |
306 | cl::desc( |
307 | "The cost of a loop that is considered 'small' by the interleaver." )); |
308 | |
309 | static cl::opt<bool> LoopVectorizeWithBlockFrequency( |
310 | "loop-vectorize-with-block-frequency" , cl::init(Val: true), cl::Hidden, |
311 | cl::desc("Enable the use of the block frequency analysis to access PGO " |
312 | "heuristics minimizing code growth in cold regions and being more " |
313 | "aggressive in hot regions." )); |
314 | |
315 | // Runtime interleave loops for load/store throughput. |
316 | static cl::opt<bool> EnableLoadStoreRuntimeInterleave( |
317 | "enable-loadstore-runtime-interleave" , cl::init(Val: true), cl::Hidden, |
318 | cl::desc( |
319 | "Enable runtime interleaving until load/store ports are saturated" )); |
320 | |
321 | /// The number of stores in a loop that are allowed to need predication. |
322 | static cl::opt<unsigned> NumberOfStoresToPredicate( |
323 | "vectorize-num-stores-pred" , cl::init(Val: 1), cl::Hidden, |
324 | cl::desc("Max number of stores to be predicated behind an if." )); |
325 | |
326 | static cl::opt<bool> EnableIndVarRegisterHeur( |
327 | "enable-ind-var-reg-heur" , cl::init(Val: true), cl::Hidden, |
328 | cl::desc("Count the induction variable only once when interleaving" )); |
329 | |
330 | static cl::opt<bool> EnableCondStoresVectorization( |
331 | "enable-cond-stores-vec" , cl::init(Val: true), cl::Hidden, |
332 | cl::desc("Enable if predication of stores during vectorization." )); |
333 | |
334 | static cl::opt<unsigned> MaxNestedScalarReductionIC( |
335 | "max-nested-scalar-reduction-interleave" , cl::init(Val: 2), cl::Hidden, |
336 | cl::desc("The maximum interleave count to use when interleaving a scalar " |
337 | "reduction in a nested loop." )); |
338 | |
339 | static cl::opt<bool> |
340 | PreferInLoopReductions("prefer-inloop-reductions" , cl::init(Val: false), |
341 | cl::Hidden, |
342 | cl::desc("Prefer in-loop vector reductions, " |
343 | "overriding the targets preference." )); |
344 | |
345 | static cl::opt<bool> ForceOrderedReductions( |
346 | "force-ordered-reductions" , cl::init(Val: false), cl::Hidden, |
347 | cl::desc("Enable the vectorisation of loops with in-order (strict) " |
348 | "FP reductions" )); |
349 | |
350 | static cl::opt<bool> PreferPredicatedReductionSelect( |
351 | "prefer-predicated-reduction-select" , cl::init(Val: false), cl::Hidden, |
352 | cl::desc( |
353 | "Prefer predicating a reduction operation over an after loop select." )); |
354 | |
355 | namespace llvm { |
356 | cl::opt<bool> EnableVPlanNativePath( |
357 | "enable-vplan-native-path" , cl::Hidden, |
358 | cl::desc("Enable VPlan-native vectorization path with " |
359 | "support for outer loop vectorization." )); |
360 | } |
361 | |
362 | // This flag enables the stress testing of the VPlan H-CFG construction in the |
363 | // VPlan-native vectorization path. It must be used in conjuction with |
364 | // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the |
365 | // verification of the H-CFGs built. |
366 | static cl::opt<bool> VPlanBuildStressTest( |
367 | "vplan-build-stress-test" , cl::init(Val: false), cl::Hidden, |
368 | cl::desc( |
369 | "Build VPlan for every supported loop nest in the function and bail " |
370 | "out right after the build (stress test the VPlan H-CFG construction " |
371 | "in the VPlan-native vectorization path)." )); |
372 | |
373 | cl::opt<bool> llvm::EnableLoopInterleaving( |
374 | "interleave-loops" , cl::init(Val: true), cl::Hidden, |
375 | cl::desc("Enable loop interleaving in Loop vectorization passes" )); |
376 | cl::opt<bool> llvm::EnableLoopVectorization( |
377 | "vectorize-loops" , cl::init(Val: true), cl::Hidden, |
378 | cl::desc("Run the Loop vectorization passes" )); |
379 | |
380 | static cl::opt<bool> PrintVPlansInDotFormat( |
381 | "vplan-print-in-dot-format" , cl::Hidden, |
382 | cl::desc("Use dot format instead of plain text when dumping VPlans" )); |
383 | |
384 | static cl::opt<cl::boolOrDefault> ForceSafeDivisor( |
385 | "force-widen-divrem-via-safe-divisor" , cl::Hidden, |
386 | cl::desc( |
387 | "Override cost based safe divisor widening for div/rem instructions" )); |
388 | |
389 | static cl::opt<bool> UseWiderVFIfCallVariantsPresent( |
390 | "vectorizer-maximize-bandwidth-for-vector-calls" , cl::init(Val: true), |
391 | cl::Hidden, |
392 | cl::desc("Try wider VFs if they enable the use of vector variants" )); |
393 | |
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
403 | |
404 | /// A helper function that returns true if the given type is irregular. The |
405 | /// type is irregular if its allocated size doesn't equal the store size of an |
406 | /// element of the corresponding vector type. |
407 | static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
408 | // Determine if an array of N elements of type Ty is "bitcast compatible" |
409 | // with a <N x Ty> vector. |
410 | // This is only true if there is no padding between the array elements. |
411 | return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty); |
412 | } |
413 | |
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
421 | |
422 | /// Returns "best known" trip count for the specified loop \p L as defined by |
423 | /// the following procedure: |
424 | /// 1) Returns exact trip count if it is known. |
425 | /// 2) Returns expected trip count according to profile data if any. |
426 | /// 3) Returns upper bound estimate if it is known. |
427 | /// 4) Returns std::nullopt if all of the above failed. |
428 | static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, |
429 | Loop *L) { |
430 | // Check if exact trip count is known. |
431 | if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) |
432 | return ExpectedTC; |
433 | |
434 | // Check if there is an expected trip count available from profile data. |
435 | if (LoopVectorizeWithBlockFrequency) |
436 | if (auto EstimatedTC = getLoopEstimatedTripCount(L)) |
437 | return *EstimatedTC; |
438 | |
439 | // Check if upper bound estimate is known. |
440 | if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) |
441 | return ExpectedTC; |
442 | |
443 | return std::nullopt; |
444 | } |
445 | |
446 | /// Return a vector containing interleaved elements from multiple |
447 | /// smaller input vectors. |
448 | static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, |
449 | const Twine &Name) { |
450 | unsigned Factor = Vals.size(); |
451 | assert(Factor > 1 && "Tried to interleave invalid number of vectors" ); |
452 | |
453 | VectorType *VecTy = cast<VectorType>(Val: Vals[0]->getType()); |
454 | #ifndef NDEBUG |
455 | for (Value *Val : Vals) |
456 | assert(Val->getType() == VecTy && "Tried to interleave mismatched types" ); |
457 | #endif |
458 | |
459 | // Scalable vectors cannot use arbitrary shufflevectors (only splats), so |
460 | // must use intrinsics to interleave. |
461 | if (VecTy->isScalableTy()) { |
462 | VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VTy: VecTy); |
463 | return Builder.CreateIntrinsic( |
464 | WideVecTy, Intrinsic::experimental_vector_interleave2, Vals, |
465 | /*FMFSource=*/nullptr, Name); |
466 | } |
467 | |
468 | // Fixed length. Start by concatenating all vectors into a wide vector. |
469 | Value *WideVec = concatenateVectors(Builder, Vecs: Vals); |
470 | |
471 | // Interleave the elements into the wide vector. |
472 | const unsigned NumElts = VecTy->getElementCount().getFixedValue(); |
473 | return Builder.CreateShuffleVector( |
474 | V: WideVec, Mask: createInterleaveMask(VF: NumElts, NumVecs: Factor), Name); |
475 | } |
476 | |
477 | namespace { |
478 | // Forward declare GeneratedRTChecks. |
479 | class GeneratedRTChecks; |
480 | |
481 | using SCEV2ValueTy = DenseMap<const SCEV *, Value *>; |
482 | } // namespace |
483 | |
484 | namespace llvm { |
485 | |
486 | AnalysisKey ShouldRunExtraVectorPasses::; |
487 | |
488 | /// InnerLoopVectorizer vectorizes loops which contain only one basic |
489 | /// block to a specified vectorization factor (VF). |
490 | /// This class performs the widening of scalars into vectors, or multiple |
491 | /// scalars. This class also implements the following features: |
492 | /// * It inserts an epilogue loop for handling loops that don't have iteration |
493 | /// counts that are known to be a multiple of the vectorization factor. |
494 | /// * It handles the code generation for reduction variables. |
495 | /// * Scalarization (implementation using scalars) of un-vectorizable |
496 | /// instructions. |
497 | /// InnerLoopVectorizer does not perform any vectorization-legality |
498 | /// checks, and relies on the caller to check for the different legality |
499 | /// aspects. The InnerLoopVectorizer relies on the |
500 | /// LoopVectorizationLegality class to provide information about the induction |
501 | /// and reduction variables that were found to a given vectorization factor. |
502 | class InnerLoopVectorizer { |
503 | public: |
504 | InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
505 | LoopInfo *LI, DominatorTree *DT, |
506 | const TargetLibraryInfo *TLI, |
507 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
508 | OptimizationRemarkEmitter *ORE, ElementCount VecWidth, |
509 | ElementCount MinProfitableTripCount, |
510 | unsigned UnrollFactor, LoopVectorizationLegality *LVL, |
511 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
512 | ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) |
513 | : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), |
514 | AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), |
515 | Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), |
516 | PSI(PSI), RTChecks(RTChecks) { |
517 | // Query this against the original loop and save it here because the profile |
518 | // of the original loop header may change as the transformation happens. |
519 | OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( |
520 | BB: OrigLoop->getHeader(), PSI, BFI, QueryType: PGSOQueryType::IRPass); |
521 | |
522 | if (MinProfitableTripCount.isZero()) |
523 | this->MinProfitableTripCount = VecWidth; |
524 | else |
525 | this->MinProfitableTripCount = MinProfitableTripCount; |
526 | } |
527 | |
528 | virtual ~InnerLoopVectorizer() = default; |
529 | |
530 | /// Create a new empty loop that will contain vectorized instructions later |
531 | /// on, while the old loop will be used as the scalar remainder. Control flow |
532 | /// is generated around the vectorized (and scalar epilogue) loops consisting |
533 | /// of various checks and bypasses. Return the pre-header block of the new |
534 | /// loop and the start value for the canonical induction, if it is != 0. The |
535 | /// latter is the case when vectorizing the epilogue loop. In the case of |
536 | /// epilogue vectorization, this function is overriden to handle the more |
537 | /// complex control flow around the loops. \p ExpandedSCEVs is used to |
538 | /// look up SCEV expansions for expressions needed during skeleton creation. |
539 | virtual std::pair<BasicBlock *, Value *> |
540 | createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs); |
541 | |
542 | /// Fix the vectorized code, taking care of header phi's, live-outs, and more. |
543 | void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); |
544 | |
545 | // Return true if any runtime check is added. |
546 | bool areSafetyChecksAdded() { return AddedSafetyChecks; } |
547 | |
548 | /// A helper function to scalarize a single Instruction in the innermost loop. |
549 | /// Generates a sequence of scalar instances for each lane between \p MinLane |
550 | /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, |
551 | /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p |
552 | /// Instr's operands. |
553 | void scalarizeInstruction(const Instruction *Instr, |
554 | VPReplicateRecipe *RepRecipe, |
555 | const VPIteration &Instance, |
556 | VPTransformState &State); |
557 | |
558 | /// Try to vectorize interleaved access group \p Group with the base address |
559 | /// given in \p Addr, optionally masking the vector operations if \p |
560 | /// BlockInMask is non-null. Use \p State to translate given VPValues to IR |
561 | /// values in the vectorized loop. |
562 | void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group, |
563 | ArrayRef<VPValue *> VPDefs, |
564 | VPTransformState &State, VPValue *Addr, |
565 | ArrayRef<VPValue *> StoredValues, |
566 | VPValue *BlockInMask, bool NeedsMaskForGaps); |
567 | |
568 | /// Fix the non-induction PHIs in \p Plan. |
569 | void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State); |
570 | |
571 | /// Create a new phi node for the induction variable \p OrigPhi to resume |
572 | /// iteration count in the scalar epilogue, from where the vectorized loop |
573 | /// left off. \p Step is the SCEV-expanded induction step to use. In cases |
574 | /// where the loop skeleton is more complicated (i.e., epilogue vectorization) |
575 | /// and the resume values can come from an additional bypass block, the \p |
576 | /// AdditionalBypass pair provides information about the bypass block and the |
577 | /// end value on the edge from bypass to this loop. |
578 | PHINode *createInductionResumeValue( |
579 | PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step, |
580 | ArrayRef<BasicBlock *> BypassBlocks, |
581 | std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr}); |
582 | |
583 | /// Returns the original loop trip count. |
584 | Value *getTripCount() const { return TripCount; } |
585 | |
586 | /// Used to set the trip count after ILV's construction and after the |
587 | /// preheader block has been executed. Note that this always holds the trip |
588 | /// count of the original loop for both main loop and epilogue vectorization. |
589 | void setTripCount(Value *TC) { TripCount = TC; } |
590 | |
591 | protected: |
592 | friend class LoopVectorizationPlanner; |
593 | |
594 | /// A small list of PHINodes. |
595 | using PhiVector = SmallVector<PHINode *, 4>; |
596 | |
597 | /// A type for scalarized values in the new loop. Each value from the |
598 | /// original loop, when scalarized, is represented by UF x VF scalar values |
599 | /// in the new unrolled loop, where UF is the unroll factor and VF is the |
600 | /// vectorization factor. |
601 | using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; |
602 | |
603 | /// Set up the values of the IVs correctly when exiting the vector loop. |
604 | void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, |
605 | Value *VectorTripCount, Value *EndValue, |
606 | BasicBlock *MiddleBlock, BasicBlock *, |
607 | VPlan &Plan, VPTransformState &State); |
608 | |
  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      const SCEV2ValueTy &ExpandedSCEVs,
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton();

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested. The defaults are no-ops;
  /// the epilogue vectorizer subclasses override both.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};
663 | |
  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// Minimum trip count below which vectorization is presumably not
  /// profitable — TODO confirm against the code that consumes this field.
  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;
703 | // --- Vectorization state --- |
704 | |
705 | /// The vector-loop preheader. |
706 | BasicBlock *; |
707 | |
708 | /// The scalar-loop preheader. |
709 | BasicBlock *; |
710 | |
711 | /// Middle Block between the vector and the scalar. |
712 | BasicBlock *LoopMiddleBlock; |
713 | |
  /// The unique ExitBlock of the scalar loop if one exists. Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
762 | }; |
763 | |
764 | class InnerLoopUnroller : public InnerLoopVectorizer { |
765 | public: |
766 | InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE, |
767 | LoopInfo *LI, DominatorTree *DT, |
768 | const TargetLibraryInfo *TLI, |
769 | const TargetTransformInfo *TTI, AssumptionCache *AC, |
770 | OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, |
771 | LoopVectorizationLegality *LVL, |
772 | LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, |
773 | ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) |
774 | : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, |
775 | ElementCount::getFixed(MinVal: 1), |
776 | ElementCount::getFixed(MinVal: 1), UnrollFactor, LVL, CM, |
777 | BFI, PSI, Check) {} |
778 | }; |
779 | |
780 | /// Encapsulate information regarding vectorization of a loop and its epilogue. |
781 | /// This information is meant to be updated and used across two stages of |
782 | /// epilogue vectorization. |
783 | struct EpilogueLoopVectorizationInfo { |
784 | ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0); |
785 | unsigned MainLoopUF = 0; |
786 | ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0); |
787 | unsigned EpilogueUF = 0; |
788 | BasicBlock *MainLoopIterationCountCheck = nullptr; |
789 | BasicBlock *EpilogueIterationCountCheck = nullptr; |
790 | BasicBlock *SCEVSafetyCheck = nullptr; |
791 | BasicBlock *MemSafetyCheck = nullptr; |
792 | Value *TripCount = nullptr; |
793 | Value *VectorTripCount = nullptr; |
794 | |
795 | EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, |
796 | ElementCount EVF, unsigned EUF) |
797 | : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { |
798 | assert(EUF == 1 && |
799 | "A high UF for the epilogue loop is likely not beneficial." ); |
800 | } |
801 | }; |
802 | |
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      // Note: EPI.MainLoopVF is passed for both VF and MinProfitableTripCount.
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
      const SCEV2ValueTy &ExpandedSCEVs) final {
    return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};
846 | |
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (ie the first pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
875 | |
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    // Reuse the trip count computed during the main-loop pass.
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
908 | } // end namespace llvm |
909 | |
910 | /// Look for a meaningful debug location on the instruction or it's |
911 | /// operands. |
912 | static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { |
913 | if (!I) |
914 | return DebugLoc(); |
915 | |
916 | DebugLoc Empty; |
917 | if (I->getDebugLoc() != Empty) |
918 | return I->getDebugLoc(); |
919 | |
920 | for (Use &Op : I->operands()) { |
921 | if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op)) |
922 | if (OpInst->getDebugLoc() != Empty) |
923 | return OpInst->getDebugLoc(); |
924 | } |
925 | |
926 | return I->getDebugLoc(); |
927 | } |
928 | |
929 | /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I |
930 | /// is passed, the message relates to that particular instruction. |
931 | #ifndef NDEBUG |
932 | static void debugVectorizationMessage(const StringRef Prefix, |
933 | const StringRef DebugMsg, |
934 | Instruction *I) { |
935 | dbgs() << "LV: " << Prefix << DebugMsg; |
936 | if (I != nullptr) |
937 | dbgs() << " " << *I; |
938 | else |
939 | dbgs() << '.'; |
940 | dbgs() << '\n'; |
941 | } |
942 | #endif |
943 | |
944 | /// Create an analysis remark that explains why vectorization failed |
945 | /// |
946 | /// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p |
947 | /// RemarkName is the identifier for the remark. If \p I is passed it is an |
948 | /// instruction that prevents vectorization. Otherwise \p TheLoop is used for |
949 | /// the location of the remark. \return the remark object that can be |
950 | /// streamed to. |
951 | static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, |
952 | StringRef , Loop *TheLoop, Instruction *I) { |
953 | Value *CodeRegion = TheLoop->getHeader(); |
954 | DebugLoc DL = TheLoop->getStartLoc(); |
955 | |
956 | if (I) { |
957 | CodeRegion = I->getParent(); |
958 | // If there is no debug location attached to the instruction, revert back to |
959 | // using the loop's. |
960 | if (I->getDebugLoc()) |
961 | DL = I->getDebugLoc(); |
962 | } |
963 | |
964 | return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion); |
965 | } |
966 | |
967 | namespace llvm { |
968 | |
969 | /// Return a value for Step multiplied by VF. |
970 | Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, |
971 | int64_t Step) { |
972 | assert(Ty->isIntegerTy() && "Expected an integer step" ); |
973 | return B.CreateElementCount(DstType: Ty, EC: VF.multiplyCoefficientBy(RHS: Step)); |
974 | } |
975 | |
976 | /// Return the runtime value for VF. |
977 | Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) { |
978 | return B.CreateElementCount(DstType: Ty, EC: VF); |
979 | } |
980 | |
981 | const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, |
982 | Loop *OrigLoop) { |
983 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
984 | assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count" ); |
985 | |
986 | ScalarEvolution &SE = *PSE.getSE(); |
987 | return SE.getTripCountFromExitCount(ExitCount: BackedgeTakenCount, EvalTy: IdxTy, L: OrigLoop); |
988 | } |
989 | |
990 | void (const StringRef DebugMsg, |
991 | const StringRef OREMsg, const StringRef ORETag, |
992 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
993 | Instruction *I) { |
994 | LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: " , DebugMsg, I)); |
995 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
996 | ORE->emit( |
997 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
998 | << "loop not vectorized: " << OREMsg); |
999 | } |
1000 | |
1001 | void (const StringRef Msg, const StringRef ORETag, |
1002 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1003 | Instruction *I) { |
1004 | LLVM_DEBUG(debugVectorizationMessage("" , Msg, I)); |
1005 | LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); |
1006 | ORE->emit( |
1007 | OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I) |
1008 | << Msg); |
1009 | } |
1010 | |
1011 | /// Report successful vectorization of the loop. In case an outer loop is |
1012 | /// vectorized, prepend "outer" to the vectorization remark. |
1013 | static void (OptimizationRemarkEmitter *ORE, Loop *TheLoop, |
1014 | VectorizationFactor VF, unsigned IC) { |
1015 | LLVM_DEBUG(debugVectorizationMessage( |
1016 | "Vectorizing: " , TheLoop->isInnermost() ? "innermost loop" : "outer loop" , |
1017 | nullptr)); |
1018 | StringRef LoopType = TheLoop->isInnermost() ? "" : "outer " ; |
1019 | ORE->emit(RemarkBuilder: [&]() { |
1020 | return OptimizationRemark(LV_NAME, "Vectorized" , TheLoop->getStartLoc(), |
1021 | TheLoop->getHeader()) |
1022 | << "vectorized " << LoopType << "loop (vectorization width: " |
1023 | << ore::NV("VectorizationFactor" , VF.Width) |
1024 | << ", interleaved count: " << ore::NV("InterleaveCount" , IC) << ")" ; |
1025 | }); |
1026 | } |
1027 | |
1028 | } // end namespace llvm |
1029 | |
1030 | #ifndef NDEBUG |
1031 | /// \return string containing a file name and a line # for the given loop. |
1032 | static std::string getDebugLocString(const Loop *L) { |
1033 | std::string Result; |
1034 | if (L) { |
1035 | raw_string_ostream OS(Result); |
1036 | if (const DebugLoc LoopDbgLoc = L->getStartLoc()) |
1037 | LoopDbgLoc.print(OS); |
1038 | else |
1039 | // Just print the module name. |
1040 | OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); |
1041 | OS.flush(); |
1042 | } |
1043 | return Result; |
1044 | } |
1045 | #endif |
1046 | |
1047 | namespace llvm { |
1048 | |
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1071 | |
/// An instruction paired with the vectorization factor it was considered at.
using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1073 | |
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}
1095 | |
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();
1104 | |
1105 | /// Setup cost-based decisions for user vectorization factor. |
1106 | /// \return true if the UserVF is a feasible VF to be chosen. |
1107 | bool selectUserVectorizationFactor(ElementCount UserVF) { |
1108 | collectUniformsAndScalars(VF: UserVF); |
1109 | collectInstsToScalarize(VF: UserVF); |
1110 | return expectedCost(VF: UserVF).first.isValid(); |
1111 | } |
1112 | |
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();
1165 | |
1166 | /// Returns true if we should use strict in-order reductions for the given |
1167 | /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, |
1168 | /// the IsOrdered flag of RdxDesc is set and we do not allow reordering |
1169 | /// of FP operations. |
1170 | bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const { |
1171 | return !Hints->allowReordering() && RdxDesc.isOrdered(); |
1172 | } |
1173 | |
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1180 | |
1181 | /// \returns True if it is more profitable to scalarize instruction \p I for |
1182 | /// vectorization factor \p VF. |
1183 | bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { |
1184 | assert(VF.isVector() && |
1185 | "Profitable to scalarize relevant only for VF > 1." ); |
1186 | assert( |
1187 | TheLoop->isInnermost() && |
1188 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1189 | |
1190 | auto Scalars = InstsToScalarize.find(Val: VF); |
1191 | assert(Scalars != InstsToScalarize.end() && |
1192 | "VF not yet analyzed for scalarization profitability" ); |
1193 | return Scalars->second.contains(Val: I); |
1194 | } |
1195 | |
1196 | /// Returns true if \p I is known to be uniform after vectorization. |
1197 | bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { |
1198 | assert( |
1199 | TheLoop->isInnermost() && |
1200 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1201 | // Pseudo probe needs to be duplicated for each unrolled iteration and |
1202 | // vector lane so that profiled loop trip count can be accurately |
1203 | // accumulated instead of being under counted. |
1204 | if (isa<PseudoProbeInst>(Val: I)) |
1205 | return false; |
1206 | |
1207 | if (VF.isScalar()) |
1208 | return true; |
1209 | |
1210 | auto UniformsPerVF = Uniforms.find(Val: VF); |
1211 | assert(UniformsPerVF != Uniforms.end() && |
1212 | "VF not yet analyzed for uniformity" ); |
1213 | return UniformsPerVF->second.count(Ptr: I); |
1214 | } |
1215 | |
1216 | /// Returns true if \p I is known to be scalar after vectorization. |
1217 | bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { |
1218 | assert( |
1219 | TheLoop->isInnermost() && |
1220 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1221 | if (VF.isScalar()) |
1222 | return true; |
1223 | |
1224 | auto ScalarsPerVF = Scalars.find(Val: VF); |
1225 | assert(ScalarsPerVF != Scalars.end() && |
1226 | "Scalar values are not calculated for VF" ); |
1227 | return ScalarsPerVF->second.count(Ptr: I); |
1228 | } |
1229 | |
1230 | /// \returns True if instruction \p I can be truncated to a smaller bitwidth |
1231 | /// for vectorization factor \p VF. |
1232 | bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { |
1233 | return VF.isVector() && MinBWs.contains(Key: I) && |
1234 | !isProfitableToScalarize(I, VF) && |
1235 | !isScalarAfterVectorization(I, VF); |
1236 | } |
1237 | |
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Lower as part of an interleave group.
    CM_GatherScatter, // Lower as a gather/scatter.
    CM_Scalarize,     // Replicate as scalar operations.
    CM_VectorCall,    // Call a vector variant (calls only).
    CM_IntrinsicCall  // Call a vector intrinsic (calls only).
  };
1249 | |
1250 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1251 | /// instruction \p I and vector width \p VF. |
1252 | void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, |
1253 | InstructionCost Cost) { |
1254 | assert(VF.isVector() && "Expected VF >=2" ); |
1255 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1256 | } |
1257 | |
1258 | /// Save vectorization decision \p W and \p Cost taken by the cost model for |
1259 | /// interleaving group \p Grp and vector width \p VF. |
1260 | void setWideningDecision(const InterleaveGroup<Instruction> *Grp, |
1261 | ElementCount VF, InstWidening W, |
1262 | InstructionCost Cost) { |
1263 | assert(VF.isVector() && "Expected VF >=2" ); |
1264 | /// Broadcast this decicion to all instructions inside the group. |
1265 | /// But the cost will be assigned to one instruction only. |
1266 | for (unsigned i = 0; i < Grp->getFactor(); ++i) { |
1267 | if (auto *I = Grp->getMember(Index: i)) { |
1268 | if (Grp->getInsertPos() == I) |
1269 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost); |
1270 | else |
1271 | WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y: 0); |
1272 | } |
1273 | } |
1274 | } |
1275 | |
1276 | /// Return the cost model decision for the given instruction \p I and vector |
1277 | /// width \p VF. Return CM_Unknown if this instruction did not pass |
1278 | /// through the cost modeling. |
1279 | InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { |
1280 | assert(VF.isVector() && "Expected VF to be a vector VF" ); |
1281 | assert( |
1282 | TheLoop->isInnermost() && |
1283 | "cost-model should not be used for outer loops (in VPlan-native path)" ); |
1284 | |
1285 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1286 | auto Itr = WideningDecisions.find(Val: InstOnVF); |
1287 | if (Itr == WideningDecisions.end()) |
1288 | return CM_Unknown; |
1289 | return Itr->second.first; |
1290 | } |
1291 | |
1292 | /// Return the vectorization cost for the given instruction \p I and vector |
1293 | /// width \p VF. |
1294 | InstructionCost getWideningCost(Instruction *I, ElementCount VF) { |
1295 | assert(VF.isVector() && "Expected VF >=2" ); |
1296 | std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF); |
1297 | assert(WideningDecisions.contains(InstOnVF) && |
1298 | "The cost is not calculated" ); |
1299 | return WideningDecisions[InstOnVF].second; |
1300 | } |
1301 | |
  /// Cost-model decision for a call instruction at a given VF; stored by
  /// setCallWideningDecision and queried by getCallWideningDecision.
  struct CallWideningDecision {
    // How the call is to be lowered (see InstWidening).
    InstWidening Kind;
    // Vector variant to call — presumably only meaningful for CM_VectorCall;
    // confirm against setVectorizedCallDecision.
    Function *Variant;
    // Vector intrinsic ID — presumably for CM_IntrinsicCall; confirm likewise.
    Intrinsic::ID IID;
    // Operand position of the mask, if the chosen lowering takes one.
    std::optional<unsigned> MaskPos;
    // Cost of the call under this decision.
    InstructionCost Cost;
  };
1309 | |
1310 | void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, |
1311 | Function *Variant, Intrinsic::ID IID, |
1312 | std::optional<unsigned> MaskPos, |
1313 | InstructionCost Cost) { |
1314 | assert(!VF.isScalar() && "Expected vector VF" ); |
1315 | CallWideningDecisions[std::make_pair(x&: CI, y&: VF)] = {.Kind: Kind, .Variant: Variant, .IID: IID, |
1316 | .MaskPos: MaskPos, .Cost: Cost}; |
1317 | } |
1318 | |
1319 | CallWideningDecision getCallWideningDecision(CallInst *CI, |
1320 | ElementCount VF) const { |
1321 | assert(!VF.isScalar() && "Expected vector VF" ); |
1322 | return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)); |
1323 | } |
1324 | |
1325 | /// Return True if instruction \p I is an optimizable truncate whose operand |
1326 | /// is an induction variable. Such a truncate will be removed by adding a new |
1327 | /// induction variable with the destination type. |
1328 | bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) { |
1329 | // If the instruction is not a truncate, return false. |
1330 | auto *Trunc = dyn_cast<TruncInst>(Val: I); |
1331 | if (!Trunc) |
1332 | return false; |
1333 | |
1334 | // Get the source and destination types of the truncate. |
1335 | Type *SrcTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getSrcTy(), EC: VF); |
1336 | Type *DestTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getDestTy(), EC: VF); |
1337 | |
1338 | // If the truncate is free for the given types, return false. Replacing a |
1339 | // free truncate with an induction variable would add an induction variable |
1340 | // update instruction to each iteration of the loop. We exclude from this |
1341 | // check the primary induction variable since it will need an update |
1342 | // instruction regardless. |
1343 | Value *Op = Trunc->getOperand(i_nocapture: 0); |
1344 | if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy)) |
1345 | return false; |
1346 | |
1347 | // If the truncated value is not an induction variable, return false. |
1348 | return Legal->isInductionPhi(V: Op); |
1349 | } |
1350 | |
1351 | /// Collects the instructions to scalarize for each predicated instruction in |
1352 | /// the loop. |
1353 | void collectInstsToScalarize(ElementCount VF); |
1354 | |
1355 | /// Collect Uniform and Scalar values for the given \p VF. |
1356 | /// The sets depend on CM decision for Load/Store instructions |
1357 | /// that may be vectorized as interleave, gather-scatter or scalarized. |
1358 | /// Also make a decision on what to do about call instructions in the loop |
1359 | /// at that VF -- scalarize, call a known vector routine, or call a |
1360 | /// vector intrinsic. |
1361 | void collectUniformsAndScalars(ElementCount VF) { |
1362 | // Do the analysis once. |
1363 | if (VF.isScalar() || Uniforms.contains(Val: VF)) |
1364 | return; |
1365 | setCostBasedWideningDecision(VF); |
1366 | setVectorizedCallDecision(VF); |
1367 | collectLoopUniforms(VF); |
1368 | collectLoopScalars(VF); |
1369 | } |
1370 | |
1371 | /// Returns true if the target machine supports masked store operation |
1372 | /// for the given \p DataType and kind of access to \p Ptr. |
1373 | bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { |
1374 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1375 | TTI.isLegalMaskedStore(DataType, Alignment); |
1376 | } |
1377 | |
1378 | /// Returns true if the target machine supports masked load operation |
1379 | /// for the given \p DataType and kind of access to \p Ptr. |
1380 | bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { |
1381 | return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) && |
1382 | TTI.isLegalMaskedLoad(DataType, Alignment); |
1383 | } |
1384 | |
1385 | /// Returns true if the target machine can represent \p V as a masked gather |
1386 | /// or scatter operation. |
1387 | bool isLegalGatherOrScatter(Value *V, ElementCount VF) { |
1388 | bool LI = isa<LoadInst>(Val: V); |
1389 | bool SI = isa<StoreInst>(Val: V); |
1390 | if (!LI && !SI) |
1391 | return false; |
1392 | auto *Ty = getLoadStoreType(I: V); |
1393 | Align Align = getLoadStoreAlignment(I: V); |
1394 | if (VF.isVector()) |
1395 | Ty = VectorType::get(ElementType: Ty, EC: VF); |
1396 | return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) || |
1397 | (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align)); |
1398 | } |
1399 | |
1400 | /// Returns true if the target machine supports all of the reduction |
1401 | /// variables found for the given VF. |
1402 | bool canVectorizeReductions(ElementCount VF) const { |
1403 | return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
1404 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
1405 | return TTI.isLegalToVectorizeReduction(RdxDesc, VF); |
1406 | })); |
1407 | } |
1408 | |
1409 | /// Given costs for both strategies, return true if the scalar predication |
1410 | /// lowering should be used for div/rem. This incorporates an override |
1411 | /// option so it is not simply a cost comparison. |
1412 | bool isDivRemScalarWithPredication(InstructionCost ScalarCost, |
1413 | InstructionCost SafeDivisorCost) const { |
1414 | switch (ForceSafeDivisor) { |
1415 | case cl::BOU_UNSET: |
1416 | return ScalarCost < SafeDivisorCost; |
1417 | case cl::BOU_TRUE: |
1418 | return false; |
1419 | case cl::BOU_FALSE: |
1420 | return true; |
1421 | }; |
1422 | llvm_unreachable("impossible case value" ); |
1423 | } |
1424 | |
1425 | /// Returns true if \p I is an instruction which requires predication and |
1426 | /// for which our chosen predication strategy is scalarization (i.e. we |
1427 | /// don't have an alternate strategy such as masking available). |
1428 | /// \p VF is the vectorization factor that will be used to vectorize \p I. |
1429 | bool isScalarWithPredication(Instruction *I, ElementCount VF) const; |
1430 | |
1431 | /// Returns true if \p I is an instruction that needs to be predicated |
1432 | /// at runtime. The result is independent of the predication mechanism. |
1433 | /// Superset of instructions that return true for isScalarWithPredication. |
1434 | bool isPredicatedInst(Instruction *I) const; |
1435 | |
1436 | /// Return the costs for our two available strategies for lowering a |
1437 | /// div/rem operation which requires speculating at least one lane. |
1438 | /// First result is for scalarization (will be invalid for scalable |
1439 | /// vectors); second is for the safe-divisor strategy. |
1440 | std::pair<InstructionCost, InstructionCost> |
1441 | getDivRemSpeculationCost(Instruction *I, |
1442 | ElementCount VF) const; |
1443 | |
1444 | /// Returns true if \p I is a memory instruction with consecutive memory |
1445 | /// access that can be widened. |
1446 | bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); |
1447 | |
1448 | /// Returns true if \p I is a memory instruction in an interleaved-group |
1449 | /// of memory accesses that can be vectorized with wide vector loads/stores |
1450 | /// and shuffles. |
1451 | bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); |
1452 | |
/// Check if \p Instr belongs to any interleaved access group tracked by
/// InterleaveInfo, i.e. whether it would be emitted as part of a wide
/// load/store plus shuffles rather than as a standalone access.
bool isAccessInterleaved(Instruction *Instr) {
  return InterleaveInfo.isInterleaved(Instr);
}
1457 | |
/// Get the interleaved access group that \p Instr belongs to. Forwards
/// the result of InterleaveInfo's lookup unchanged (presumably null when
/// \p Instr is in no group — see InterleavedAccessInfo).
const InterleaveGroup<Instruction> *
getInterleavedAccessGroup(Instruction *Instr) {
  return InterleaveInfo.getInterleaveGroup(Instr);
}
1463 | |
1464 | /// Returns true if we're required to use a scalar epilogue for at least |
1465 | /// the final iteration of the original loop. |
1466 | bool requiresScalarEpilogue(bool IsVectorizing) const { |
1467 | if (!isScalarEpilogueAllowed()) |
1468 | return false; |
1469 | // If we might exit from anywhere but the latch, must run the exiting |
1470 | // iteration in scalar form. |
1471 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) |
1472 | return true; |
1473 | return IsVectorizing && InterleaveInfo.requiresScalarEpilogue(); |
1474 | } |
1475 | |
1476 | /// Returns true if we're required to use a scalar epilogue for at least |
1477 | /// the final iteration of the original loop for all VFs in \p Range. |
1478 | /// A scalar epilogue must either be required for all VFs in \p Range or for |
1479 | /// none. |
1480 | bool requiresScalarEpilogue(VFRange Range) const { |
1481 | auto RequiresScalarEpilogue = [this](ElementCount VF) { |
1482 | return requiresScalarEpilogue(IsVectorizing: VF.isVector()); |
1483 | }; |
1484 | bool IsRequired = all_of(Range, P: RequiresScalarEpilogue); |
1485 | assert( |
1486 | (IsRequired || none_of(Range, RequiresScalarEpilogue)) && |
1487 | "all VFs in range must agree on whether a scalar epilogue is required" ); |
1488 | return IsRequired; |
1489 | } |
1490 | |
/// Returns true if a scalar epilogue is allowed, i.e. it has not been
/// disallowed due to optsize or a loop hint annotation. (The previous
/// comment stated the inverse of what the code returns.)
bool isScalarEpilogueAllowed() const {
  return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
}
1496 | |
1497 | /// Returns the TailFoldingStyle that is best for the current loop. |
1498 | TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { |
1499 | if (!ChosenTailFoldingStyle) |
1500 | return TailFoldingStyle::None; |
1501 | return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first |
1502 | : ChosenTailFoldingStyle->second; |
1503 | } |
1504 | |
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
  assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet." );
  if (!Legal->prepareToFoldTailByMasking()) {
    // Tail folding is not legal for this loop: record None for both the
    // may-overflow and no-overflow cases.
    ChosenTailFoldingStyle =
        std::make_pair(x: TailFoldingStyle::None, y: TailFoldingStyle::None);
    return;
  }

  if (!ForceTailFoldingStyle.getNumOccurrences()) {
    // No style forced on the command line: take the target's preference for
    // each overflow scenario.
    ChosenTailFoldingStyle = std::make_pair(
        x: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        y: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
    return;
  }

  // Set styles when forced.
  ChosenTailFoldingStyle = std::make_pair(x&: ForceTailFoldingStyle.getValue(),
                                          y&: ForceTailFoldingStyle.getValue());
  if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
    return;
  // Override forced styles if needed.
  // FIXME: use actual opcode/data type for analysis here.
  // FIXME: Investigate opportunity for fixed vector factor.
  bool EVLIsLegal =
      IsScalableVF && UserIC <= 1 &&
      TTI.hasActiveVectorLength(Opcode: 0, DataType: nullptr, Alignment: Align()) &&
      !EnableVPlanNativePath &&
      // FIXME: implement support for max safe dependency distance.
      Legal->isSafeForAnyVectorWidth() &&
      // FIXME: remove this once reductions are supported.
      Legal->getReductionVars().empty();
  if (!EVLIsLegal) {
    // If for some reason EVL mode is unsupported, fallback to
    // DataWithoutLaneMask to try to vectorize the loop with folded tail
    // in a generic way.
    ChosenTailFoldingStyle =
        std::make_pair(x: TailFoldingStyle::DataWithoutLaneMask,
                       y: TailFoldingStyle::DataWithoutLaneMask);
    LLVM_DEBUG(
        dbgs()
        << "LV: Preference for VP intrinsics indicated. Will "
           "not try to generate VP Intrinsics "
        << (UserIC > 1
                ? "since interleave count specified is greater than 1.\n"
                : "due to non-interleaving reasons.\n" ));
  }
}
1556 | |
/// Returns true if all loop blocks should be masked to fold tail loop,
/// i.e. any tail-folding style other than None was chosen.
bool foldTailByMasking() const {
  // TODO: check if it is possible to check for None style independent of
  // IVUpdateMayOverflow flag in getTailFoldingStyle.
  return getTailFoldingStyle() != TailFoldingStyle::None;
}
1563 | |
1564 | /// Returns true if the instructions in this block requires predication |
1565 | /// for any reason, e.g. because tail folding now requires a predicate |
1566 | /// or because the block in the original loop was predicated. |
1567 | bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { |
1568 | return foldTailByMasking() || Legal->blockNeedsPredication(BB); |
1569 | } |
1570 | |
1571 | /// Returns true if VP intrinsics with explicit vector length support should |
1572 | /// be generated in the tail folded loop. |
1573 | bool foldTailWithEVL() const { |
1574 | return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && |
1575 | // FIXME: remove this once vp_reverse is supported. |
1576 | none_of( |
1577 | Range: WideningDecisions, |
1578 | P: [](const std::pair<std::pair<Instruction *, ElementCount>, |
1579 | std::pair<InstWidening, InstructionCost>> |
1580 | &Data) { return Data.second.first == CM_Widen_Reverse; }); |
1581 | } |
1582 | |
/// Returns true if the Phi is part of an inloop reduction, i.e. it was
/// recorded in the InLoopReductions set.
bool isInLoopReduction(PHINode *Phi) const {
  return InLoopReductions.contains(Ptr: Phi);
}
1587 | |
1588 | /// Estimate cost of an intrinsic call instruction CI if it were vectorized |
1589 | /// with factor VF. Return the cost of the instruction, including |
1590 | /// scalarization overhead if it's needed. |
1591 | InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const; |
1592 | |
1593 | /// Estimate cost of a call instruction CI if it were vectorized with factor |
1594 | /// VF. Return the cost of the instruction, including scalarization overhead |
1595 | /// if it's needed. |
1596 | InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const; |
1597 | |
/// Invalidates decisions already taken by the cost model: clears the cached
/// widening and call-widening decisions and the per-VF uniform/scalar sets,
/// so they will be recomputed on the next query.
void invalidateCostModelingDecisions() {
  WideningDecisions.clear();
  CallWideningDecisions.clear();
  Uniforms.clear();
  Scalars.clear();
}
1605 | |
1606 | /// The vectorization cost is a combination of the cost itself and a boolean |
1607 | /// indicating whether any of the contributing operations will actually |
1608 | /// operate on vector values after type legalization in the backend. If this |
1609 | /// latter value is false, then all operations will be scalarized (i.e. no |
1610 | /// vectorization has actually taken place). |
1611 | using VectorizationCostTy = std::pair<InstructionCost, bool>; |
1612 | |
1613 | /// Returns the expected execution cost. The unit of the cost does |
1614 | /// not matter because we use the 'cost' units to compare different |
1615 | /// vector widths. The cost that is returned is *not* normalized by |
1616 | /// the factor width. If \p Invalid is not nullptr, this function |
1617 | /// will add a pair(Instruction*, ElementCount) to \p Invalid for |
1618 | /// each instruction that has an Invalid cost for the given VF. |
1619 | VectorizationCostTy |
1620 | expectedCost(ElementCount VF, |
1621 | SmallVectorImpl<InstructionVFPair> *Invalid = nullptr); |
1622 | |
/// Returns true if at least one store was counted as predicated
/// (NumPredStores is maintained elsewhere in the cost model).
bool hasPredStores() const { return NumPredStores > 0; }
1624 | |
1625 | /// Returns true if epilogue vectorization is considered profitable, and |
1626 | /// false otherwise. |
1627 | /// \p VF is the vectorization factor chosen for the original loop. |
1628 | bool isEpilogueVectorizationProfitable(const ElementCount VF) const; |
1629 | |
1630 | private: |
1631 | unsigned NumPredStores = 0; |
1632 | |
1633 | /// \return An upper bound for the vectorization factors for both |
1634 | /// fixed and scalable vectorization, where the minimum-known number of |
1635 | /// elements is a power-of-2 larger than zero. If scalable vectorization is |
1636 | /// disabled or unsupported, then the scalable part will be equal to |
1637 | /// ElementCount::getScalable(0). |
1638 | FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount, |
1639 | ElementCount UserVF, |
1640 | bool FoldTailByMasking); |
1641 | |
1642 | /// \return the maximized element count based on the targets vector |
1643 | /// registers and the loop trip-count, but limited to a maximum safe VF. |
1644 | /// This is a helper function of computeFeasibleMaxVF. |
1645 | ElementCount getMaximizedVFForTarget(unsigned MaxTripCount, |
1646 | unsigned SmallestType, |
1647 | unsigned WidestType, |
1648 | ElementCount MaxSafeVF, |
1649 | bool FoldTailByMasking); |
1650 | |
1651 | /// \return the maximum legal scalable VF, based on the safe max number |
1652 | /// of elements. |
1653 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); |
1654 | |
1655 | /// Returns the execution time cost of an instruction for a given vector |
1656 | /// width. Vector width of one means scalar. |
1657 | VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
1658 | |
1659 | /// The cost-computation logic from getInstructionCost which provides |
1660 | /// the vector type as an output parameter. |
1661 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF, |
1662 | Type *&VectorTy); |
1663 | |
1664 | /// Return the cost of instructions in an inloop reduction pattern, if I is |
1665 | /// part of that pattern. |
1666 | std::optional<InstructionCost> |
1667 | getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, |
1668 | TTI::TargetCostKind CostKind) const; |
1669 | |
1670 | /// Calculate vectorization cost of memory instruction \p I. |
1671 | InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); |
1672 | |
1673 | /// The cost computation for scalarized memory instruction. |
1674 | InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); |
1675 | |
1676 | /// The cost computation for interleaving group of memory instructions. |
1677 | InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); |
1678 | |
1679 | /// The cost computation for Gather/Scatter instruction. |
1680 | InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); |
1681 | |
1682 | /// The cost computation for widening instruction \p I with consecutive |
1683 | /// memory access. |
1684 | InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); |
1685 | |
1686 | /// The cost calculation for Load/Store instruction \p I with uniform pointer - |
1687 | /// Load: scalar load + broadcast. |
1688 | /// Store: scalar store + (loop invariant value stored? 0 : extract of last |
1689 | /// element) |
1690 | InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF); |
1691 | |
1692 | /// Estimate the overhead of scalarizing an instruction. This is a |
1693 | /// convenience wrapper for the type-based getScalarizationOverhead API. |
1694 | InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, |
1695 | TTI::TargetCostKind CostKind) const; |
1696 | |
1697 | /// Returns true if an artificially high cost for emulated masked memrefs |
1698 | /// should be used. |
1699 | bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); |
1700 | |
1701 | /// Map of scalar integer values to the smallest bitwidth they can be legally |
1702 | /// represented as. The vector equivalents of these values should be truncated |
1703 | /// to this type. |
1704 | MapVector<Instruction *, uint64_t> MinBWs; |
1705 | |
1706 | /// A type representing the costs for instructions if they were to be |
1707 | /// scalarized rather than vectorized. The entries are Instruction-Cost |
1708 | /// pairs. |
1709 | using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>; |
1710 | |
1711 | /// A set containing all BasicBlocks that are known to present after |
1712 | /// vectorization as a predicated block. |
1713 | DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>> |
1714 | PredicatedBBsAfterVectorization; |
1715 | |
1716 | /// Records whether it is allowed to have the original scalar loop execute at |
1717 | /// least once. This may be needed as a fallback loop in case runtime |
1718 | /// aliasing/dependence checks fail, or to handle the tail/remainder |
1719 | /// iterations when the trip count is unknown or doesn't divide by the VF, |
1720 | /// or as a peel-loop to handle gaps in interleave-groups. |
1721 | /// Under optsize and when the trip count is very small we don't allow any |
1722 | /// iterations to execute in the scalar loop. |
1723 | ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
1724 | |
1725 | /// Control finally chosen tail folding style. The first element is used if |
1726 | /// the IV update may overflow, the second element - if it does not. |
1727 | std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> |
1728 | ChosenTailFoldingStyle; |
1729 | |
1730 | /// A map holding scalar costs for different vectorization factors. The |
1731 | /// presence of a cost for an instruction in the mapping indicates that the |
1732 | /// instruction will be scalarized when vectorizing with the associated |
1733 | /// vectorization factor. The entries are VF-ScalarCostTy pairs. |
1734 | DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize; |
1735 | |
1736 | /// Holds the instructions known to be uniform after vectorization. |
1737 | /// The data is collected per VF. |
1738 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms; |
1739 | |
1740 | /// Holds the instructions known to be scalar after vectorization. |
1741 | /// The data is collected per VF. |
1742 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars; |
1743 | |
1744 | /// Holds the instructions (address computations) that are forced to be |
1745 | /// scalarized. |
1746 | DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars; |
1747 | |
1748 | /// PHINodes of the reductions that should be expanded in-loop. |
1749 | SmallPtrSet<PHINode *, 4> InLoopReductions; |
1750 | |
1751 | /// A Map of inloop reduction operations and their immediate chain operand. |
1752 | /// FIXME: This can be removed once reductions can be costed correctly in |
1753 | /// VPlan. This was added to allow quick lookup of the inloop operations. |
1754 | DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; |
1755 | |
1756 | /// Returns the expected difference in cost from scalarizing the expression |
1757 | /// feeding a predicated instruction \p PredInst. The instructions to |
1758 | /// scalarize and their scalar costs are collected in \p ScalarCosts. A |
1759 | /// non-negative return value implies the expression will be scalarized. |
1760 | /// Currently, only single-use chains are considered for scalarization. |
1761 | InstructionCost computePredInstDiscount(Instruction *PredInst, |
1762 | ScalarCostsTy &ScalarCosts, |
1763 | ElementCount VF); |
1764 | |
1765 | /// Collect the instructions that are uniform after vectorization. An |
1766 | /// instruction is uniform if we represent it with a single scalar value in |
1767 | /// the vectorized loop corresponding to each vector iteration. Examples of |
1768 | /// uniform instructions include pointer operands of consecutive or |
1769 | /// interleaved memory accesses. Note that although uniformity implies an |
1770 | /// instruction will be scalar, the reverse is not true. In general, a |
1771 | /// scalarized instruction will be represented by VF scalar values in the |
1772 | /// vectorized loop, each corresponding to an iteration of the original |
1773 | /// scalar loop. |
1774 | void collectLoopUniforms(ElementCount VF); |
1775 | |
1776 | /// Collect the instructions that are scalar after vectorization. An |
1777 | /// instruction is scalar if it is known to be uniform or will be scalarized |
1778 | /// during vectorization. collectLoopScalars should only add non-uniform nodes |
1779 | /// to the list if they are used by a load/store instruction that is marked as |
1780 | /// CM_Scalarize. Non-uniform scalarized instructions will be represented by |
1781 | /// VF values in the vectorized loop, each corresponding to an iteration of |
1782 | /// the original scalar loop. |
1783 | void collectLoopScalars(ElementCount VF); |
1784 | |
1785 | /// Keeps cost model vectorization decision and cost for instructions. |
1786 | /// Right now it is used for memory instructions only. |
1787 | using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>, |
1788 | std::pair<InstWidening, InstructionCost>>; |
1789 | |
1790 | DecisionList WideningDecisions; |
1791 | |
1792 | using CallDecisionList = |
1793 | DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>; |
1794 | |
1795 | CallDecisionList CallWideningDecisions; |
1796 | |
1797 | /// Returns true if \p V is expected to be vectorized and it needs to be |
1798 | /// extracted. |
1799 | bool (Value *V, ElementCount VF) const { |
1800 | Instruction *I = dyn_cast<Instruction>(Val: V); |
1801 | if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) || |
1802 | TheLoop->isLoopInvariant(V: I)) |
1803 | return false; |
1804 | |
1805 | // Assume we can vectorize V (and hence we need extraction) if the |
1806 | // scalars are not computed yet. This can happen, because it is called |
1807 | // via getScalarizationOverhead from setCostBasedWideningDecision, before |
1808 | // the scalars are collected. That should be a safe assumption in most |
1809 | // cases, because we check if the operands have vectorizable types |
1810 | // beforehand in LoopVectorizationLegality. |
1811 | return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF); |
1812 | }; |
1813 | |
1814 | /// Returns a range containing only operands needing to be extracted. |
1815 | SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops, |
1816 | ElementCount VF) const { |
1817 | return SmallVector<Value *, 4>(make_filter_range( |
1818 | Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); })); |
1819 | } |
1820 | |
1821 | public: |
1822 | /// The loop that we evaluate. |
1823 | Loop *TheLoop; |
1824 | |
1825 | /// Predicated scalar evolution analysis. |
1826 | PredicatedScalarEvolution &PSE; |
1827 | |
1828 | /// Loop Info analysis. |
1829 | LoopInfo *LI; |
1830 | |
1831 | /// Vectorization legality. |
1832 | LoopVectorizationLegality *Legal; |
1833 | |
1834 | /// Vector target information. |
1835 | const TargetTransformInfo &TTI; |
1836 | |
1837 | /// Target Library Info. |
1838 | const TargetLibraryInfo *TLI; |
1839 | |
1840 | /// Demanded bits analysis. |
1841 | DemandedBits *DB; |
1842 | |
1843 | /// Assumption cache. |
1844 | AssumptionCache *AC; |
1845 | |
1846 | /// Interface to emit optimization remarks. |
1847 | OptimizationRemarkEmitter *ORE; |
1848 | |
1849 | const Function *TheFunction; |
1850 | |
1851 | /// Loop Vectorize Hint. |
1852 | const LoopVectorizeHints *Hints; |
1853 | |
1854 | /// The interleave access information contains groups of interleaved accesses |
1855 | /// with the same stride and close to each other. |
1856 | InterleavedAccessInfo &InterleaveInfo; |
1857 | |
1858 | /// Values to ignore in the cost model. |
1859 | SmallPtrSet<const Value *, 16> ValuesToIgnore; |
1860 | |
1861 | /// Values to ignore in the cost model when VF > 1. |
1862 | SmallPtrSet<const Value *, 16> VecValuesToIgnore; |
1863 | |
1864 | /// All element types found in the loop. |
1865 | SmallPtrSet<Type *, 16> ElementTypesInLoop; |
1866 | }; |
1867 | } // end namespace llvm |
1868 | |
1869 | namespace { |
1870 | /// Helper struct to manage generating runtime checks for vectorization. |
1871 | /// |
1872 | /// The runtime checks are created up-front in temporary blocks to allow better |
1873 | /// estimating the cost and un-linked from the existing IR. After deciding to |
1874 | /// vectorize, the checks are moved back. If deciding not to vectorize, the |
1875 | /// temporary blocks are completely removed. |
1876 | class GeneratedRTChecks { |
1877 | /// Basic block which contains the generated SCEV checks, if any. |
1878 | BasicBlock *SCEVCheckBlock = nullptr; |
1879 | |
1880 | /// The value representing the result of the generated SCEV checks. If it is |
1881 | /// nullptr, either no SCEV checks have been generated or they have been used. |
1882 | Value *SCEVCheckCond = nullptr; |
1883 | |
1884 | /// Basic block which contains the generated memory runtime checks, if any. |
1885 | BasicBlock *MemCheckBlock = nullptr; |
1886 | |
1887 | /// The value representing the result of the generated memory runtime checks. |
1888 | /// If it is nullptr, either no memory runtime checks have been generated or |
1889 | /// they have been used. |
1890 | Value *MemRuntimeCheckCond = nullptr; |
1891 | |
1892 | DominatorTree *DT; |
1893 | LoopInfo *LI; |
1894 | TargetTransformInfo *TTI; |
1895 | |
1896 | SCEVExpander SCEVExp; |
1897 | SCEVExpander MemCheckExp; |
1898 | |
1899 | bool CostTooHigh = false; |
1900 | const bool AddBranchWeights; |
1901 | |
1902 | Loop *OuterLoop = nullptr; |
1903 | |
1904 | public: |
/// Construct with the analyses needed to build and later cost the runtime
/// checks. Two separate SCEV expanders are kept: one for SCEV-predicate
/// checks and one for memory checks. NOTE(review): both use the
/// "scev.check" name prefix — presumably intentional; confirm.
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                  TargetTransformInfo *TTI, const DataLayout &DL,
                  bool AddBranchWeights)
    : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check" ),
      MemCheckExp(SE, DL, "scev.check" ), AddBranchWeights(AddBranchWeights) {}
1910 | |
1911 | /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can |
1912 | /// accurately estimate the cost of the runtime checks. The blocks are |
1913 | /// un-linked from the IR and is added back during vector code generation. If |
1914 | /// there is no vector code generation, the check blocks are removed |
1915 | /// completely. |
1916 | void Create(Loop *L, const LoopAccessInfo &LAI, |
1917 | const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { |
1918 | |
1919 | // Hard cutoff to limit compile-time increase in case a very large number of |
1920 | // runtime checks needs to be generated. |
1921 | // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to |
1922 | // profile info. |
1923 | CostTooHigh = |
1924 | LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; |
1925 | if (CostTooHigh) |
1926 | return; |
1927 | |
1928 | BasicBlock * = L->getHeader(); |
1929 | BasicBlock * = L->getLoopPreheader(); |
1930 | |
1931 | // Use SplitBlock to create blocks for SCEV & memory runtime checks to |
1932 | // ensure the blocks are properly added to LoopInfo & DominatorTree. Those |
1933 | // may be used by SCEVExpander. The blocks will be un-linked from their |
1934 | // predecessors and removed from LI & DT at the end of the function. |
1935 | if (!UnionPred.isAlwaysTrue()) { |
1936 | SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI, |
1937 | MSSAU: nullptr, BBName: "vector.scevcheck" ); |
1938 | |
1939 | SCEVCheckCond = SCEVExp.expandCodeForPredicate( |
1940 | Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator()); |
1941 | } |
1942 | |
1943 | const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); |
1944 | if (RtPtrChecking.Need) { |
1945 | auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; |
1946 | MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr, |
1947 | BBName: "vector.memcheck" ); |
1948 | |
1949 | auto DiffChecks = RtPtrChecking.getDiffChecks(); |
1950 | if (DiffChecks) { |
1951 | Value *RuntimeVF = nullptr; |
1952 | MemRuntimeCheckCond = addDiffRuntimeChecks( |
1953 | Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp, |
1954 | GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) { |
1955 | if (!RuntimeVF) |
1956 | RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF); |
1957 | return RuntimeVF; |
1958 | }, |
1959 | IC); |
1960 | } else { |
1961 | MemRuntimeCheckCond = addRuntimeChecks( |
1962 | Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(), |
1963 | Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks); |
1964 | } |
1965 | assert(MemRuntimeCheckCond && |
1966 | "no RT checks generated although RtPtrChecking " |
1967 | "claimed checks are required" ); |
1968 | } |
1969 | |
1970 | if (!MemCheckBlock && !SCEVCheckBlock) |
1971 | return; |
1972 | |
1973 | // Unhook the temporary block with the checks, update various places |
1974 | // accordingly. |
1975 | if (SCEVCheckBlock) |
1976 | SCEVCheckBlock->replaceAllUsesWith(V: Preheader); |
1977 | if (MemCheckBlock) |
1978 | MemCheckBlock->replaceAllUsesWith(V: Preheader); |
1979 | |
1980 | if (SCEVCheckBlock) { |
1981 | SCEVCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1982 | new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); |
1983 | Preheader->getTerminator()->eraseFromParent(); |
1984 | } |
1985 | if (MemCheckBlock) { |
1986 | MemCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator()); |
1987 | new UnreachableInst(Preheader->getContext(), MemCheckBlock); |
1988 | Preheader->getTerminator()->eraseFromParent(); |
1989 | } |
1990 | |
1991 | DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader); |
1992 | if (MemCheckBlock) { |
1993 | DT->eraseNode(BB: MemCheckBlock); |
1994 | LI->removeBlock(BB: MemCheckBlock); |
1995 | } |
1996 | if (SCEVCheckBlock) { |
1997 | DT->eraseNode(BB: SCEVCheckBlock); |
1998 | LI->removeBlock(BB: SCEVCheckBlock); |
1999 | } |
2000 | |
2001 | // Outer loop is used as part of the later cost calculations. |
2002 | OuterLoop = L->getParentLoop(); |
2003 | } |
2004 | |
/// Compute the total cost of the generated runtime checks (SCEV predicate
/// checks plus memory overlap checks). Returns an invalid cost when the
/// number of checks exceeded the configured threshold, signalling that
/// vectorization with runtime checks should be abandoned.
InstructionCost getCost() {
  if (SCEVCheckBlock || MemCheckBlock)
    LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n" );

  // CostTooHigh was set at check-creation time; an invalid cost tells the
  // caller not to vectorize with these checks at all.
  if (CostTooHigh) {
    InstructionCost Cost;
    Cost.setInvalid();
    LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n" );
    return Cost;
  }

  InstructionCost RTCheckCost = 0;
  // Sum the cost of every instruction in the SCEV check block, except the
  // terminator (it is replaced when the block is wired into the CFG).
  if (SCEVCheckBlock)
    for (Instruction &I : *SCEVCheckBlock) {
      if (SCEVCheckBlock->getTerminator() == &I)
        continue;
      InstructionCost C =
          TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
      LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" );
      RTCheckCost += C;
    }
  if (MemCheckBlock) {
    // Accumulate memory-check cost separately so it can be scaled down below
    // when the checks are expected to be hoisted out of an outer loop.
    InstructionCost MemCheckCost = 0;
    for (Instruction &I : *MemCheckBlock) {
      if (MemCheckBlock->getTerminator() == &I)
        continue;
      InstructionCost C =
          TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
      LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n" );
      MemCheckCost += C;
    }

    // If the runtime memory checks are being created inside an outer loop
    // we should find out if these checks are outer loop invariant. If so,
    // the checks will likely be hoisted out and so the effective cost will
    // reduce according to the outer loop trip count.
    if (OuterLoop) {
      ScalarEvolution *SE = MemCheckExp.getSE();
      // TODO: If profitable, we could refine this further by analysing every
      // individual memory check, since there could be a mixture of loop
      // variant and invariant checks that mean the final condition is
      // variant.
      const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
      if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
        // It seems reasonable to assume that we can reduce the effective
        // cost of the checks even when we know nothing about the trip
        // count. Assume that the outer loop executes at least twice.
        unsigned BestTripCount = 2;

        // If exact trip count is known use that.
        if (unsigned SmallTC = SE->getSmallConstantTripCount(L: OuterLoop))
          BestTripCount = SmallTC;
        else if (LoopVectorizeWithBlockFrequency) {
          // Else use profile data if available.
          if (auto EstimatedTC = getLoopEstimatedTripCount(L: OuterLoop))
            BestTripCount = *EstimatedTC;
        }

        // Guard against a zero estimated trip count from profile data.
        BestTripCount = std::max(a: BestTripCount, b: 1U);
        InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

        // Let's ensure the cost is always at least 1.
        NewMemCheckCost = std::max(a: *NewMemCheckCost.getValue(),
                                   b: (InstructionCost::CostType)1);

        if (BestTripCount > 1)
          LLVM_DEBUG(dbgs()
                     << "We expect runtime memory checks to be hoisted "
                     << "out of the outer loop. Cost reduced from "
                     << MemCheckCost << " to " << NewMemCheckCost << '\n');

        MemCheckCost = NewMemCheckCost;
      }
    }

    RTCheckCost += MemCheckCost;
  }

  if (SCEVCheckBlock || MemCheckBlock)
    LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                      << "\n" );

  return RTCheckCost;
}
2089 | |
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
  SCEVExpanderCleaner SCEVCleaner(SCEVExp);
  SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
  // The emit* members null out the condition once a check has been wired into
  // the CFG, so a null condition here means the expanded values are live and
  // must not be removed by the cleaners.
  if (!SCEVCheckCond)
    SCEVCleaner.markResultUsed();

  if (!MemRuntimeCheckCond)
    MemCheckCleaner.markResultUsed();

  if (MemRuntimeCheckCond) {
    auto &SE = *MemCheckExp.getSE();
    // Memory runtime check generation creates compares that use expanded
    // values. Remove them before running the SCEVExpanderCleaners.
    // Iterate in reverse so users are erased before their operands.
    for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
      if (MemCheckExp.isInsertedInstruction(I: &I))
        continue;
      SE.forgetValue(V: &I);
      I.eraseFromParent();
    }
  }
  MemCheckCleaner.cleanup();
  SCEVCleaner.cleanup();

  // A non-null condition means the corresponding check was never emitted;
  // its temporary block is unused and can be deleted outright.
  if (SCEVCheckCond)
    SCEVCheckBlock->eraseFromParent();
  if (MemRuntimeCheckCond)
    MemCheckBlock->eraseFromParent();
}
2120 | |
/// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
/// adjusts the branches to branch to the vector preheader or \p Bypass,
/// depending on the generated condition. Returns the inserted check block,
/// or nullptr if no check is needed.
BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                           BasicBlock *,
                           BasicBlock *LoopExitBlock) {
  if (!SCEVCheckCond)
    return nullptr;

  Value *Cond = SCEVCheckCond;
  // Mark the check as used, to prevent it from being removed during cleanup.
  SCEVCheckCond = nullptr;
  // A constant-false condition means the predicate checks trivially pass, so
  // no bypass branch (and no check block) is needed.
  if (auto *C = dyn_cast<ConstantInt>(Val: Cond))
    if (C->isZero())
      return nullptr;

  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

  BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertAtEnd: SCEVCheckBlock);
  // Create new preheader for vector loop.
  if (OuterLoop)
    OuterLoop->addBasicBlockToLoop(NewBB: SCEVCheckBlock, LI&: *LI);

  // Re-wire the CFG: Pred -> SCEVCheckBlock -> LoopVectorPreHeader.
  SCEVCheckBlock->getTerminator()->eraseFromParent();
  SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
  Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
                                              NewBB: SCEVCheckBlock);

  // Keep the dominator tree in sync with the new edge.
  DT->addNewBlock(BB: SCEVCheckBlock, DomBB: Pred);
  DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: SCEVCheckBlock);

  // Branch to the scalar loop (Bypass) when the checks fail.
  BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond);
  if (AddBranchWeights)
    setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights);
  ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI);
  return SCEVCheckBlock;
}
2158 | |
/// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
/// the branches to branch to the vector preheader or \p Bypass, depending on
/// the generated condition. Returns the inserted check block, or nullptr if
/// no memory checks are needed.
BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                 BasicBlock *) {
  // Check if we generated code that checks in runtime if arrays overlap.
  if (!MemRuntimeCheckCond)
    return nullptr;

  // Re-wire the CFG: Pred -> MemCheckBlock -> LoopVectorPreHeader.
  auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
  Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
                                              NewBB: MemCheckBlock);

  // Keep the dominator tree in sync with the new edge.
  DT->addNewBlock(BB: MemCheckBlock, DomBB: Pred);
  DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: MemCheckBlock);
  MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);

  if (OuterLoop)
    OuterLoop->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI);

  // Branch to the scalar loop (Bypass) when the arrays may overlap.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond);
  if (AddBranchWeights) {
    setBranchWeights(I&: BI, Weights: MemCheckBypassWeights);
  }
  ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI);
  MemCheckBlock->getTerminator()->setDebugLoc(
      Pred->getTerminator()->getDebugLoc());

  // Mark the check as used, to prevent it from being removed during cleanup.
  MemRuntimeCheckCond = nullptr;
  return MemCheckBlock;
}
2192 | }; |
2193 | } // namespace |
2194 | |
2195 | static bool useActiveLaneMask(TailFoldingStyle Style) { |
2196 | return Style == TailFoldingStyle::Data || |
2197 | Style == TailFoldingStyle::DataAndControlFlow || |
2198 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2199 | } |
2200 | |
2201 | static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) { |
2202 | return Style == TailFoldingStyle::DataAndControlFlow || |
2203 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
2204 | } |
2205 | |
2206 | // Return true if \p OuterLp is an outer loop annotated with hints for explicit |
2207 | // vectorization. The loop needs to be annotated with #pragma omp simd |
2208 | // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the |
2209 | // vector length information is not provided, vectorization is not considered |
2210 | // explicit. Interleave hints are not allowed either. These limitations will be |
2211 | // relaxed in the future. |
2212 | // Please, note that we are currently forced to abuse the pragma 'clang |
2213 | // vectorize' semantics. This pragma provides *auto-vectorization hints* |
2214 | // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' |
2215 | // provides *explicit vectorization hints* (LV can bypass legal checks and |
2216 | // assume that vectorization is legal). However, both hints are implemented |
2217 | // using the same metadata (llvm.loop.vectorize, processed by |
2218 | // LoopVectorizeHints). This will be fixed in the future when the native IR |
2219 | // representation for pragma 'omp simd' is introduced. |
2220 | static bool (Loop *OuterLp, |
2221 | OptimizationRemarkEmitter *ORE) { |
2222 | assert(!OuterLp->isInnermost() && "This is not an outer loop" ); |
2223 | LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE); |
2224 | |
2225 | // Only outer loops with an explicit vectorization hint are supported. |
2226 | // Unannotated outer loops are ignored. |
2227 | if (Hints.getForce() == LoopVectorizeHints::FK_Undefined) |
2228 | return false; |
2229 | |
2230 | Function *Fn = OuterLp->getHeader()->getParent(); |
2231 | if (!Hints.allowVectorization(F: Fn, L: OuterLp, |
2232 | VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) { |
2233 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n" ); |
2234 | return false; |
2235 | } |
2236 | |
2237 | if (Hints.getInterleave() > 1) { |
2238 | // TODO: Interleave support is future work. |
2239 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for " |
2240 | "outer loops.\n" ); |
2241 | Hints.emitRemarkWithHints(); |
2242 | return false; |
2243 | } |
2244 | |
2245 | return true; |
2246 | } |
2247 | |
2248 | static void (Loop &L, LoopInfo *LI, |
2249 | OptimizationRemarkEmitter *ORE, |
2250 | SmallVectorImpl<Loop *> &V) { |
2251 | // Collect inner loops and outer loops without irreducible control flow. For |
2252 | // now, only collect outer loops that have explicit vectorization hints. If we |
2253 | // are stress testing the VPlan H-CFG construction, we collect the outermost |
2254 | // loop of every loop nest. |
2255 | if (L.isInnermost() || VPlanBuildStressTest || |
2256 | (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) { |
2257 | LoopBlocksRPO RPOT(&L); |
2258 | RPOT.perform(LI); |
2259 | if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) { |
2260 | V.push_back(Elt: &L); |
2261 | // TODO: Collect inner loops inside marked outer loops in case |
2262 | // vectorization fails for the outer loop. Do not invoke |
2263 | // 'containsIrreducibleCFG' again for inner loops when the outer loop is |
2264 | // already known to be reducible. We can use an inherited attribute for |
2265 | // that. |
2266 | return; |
2267 | } |
2268 | } |
2269 | for (Loop *InnerL : L) |
2270 | collectSupportedLoops(L&: *InnerL, LI, ORE, V); |
2271 | } |
2272 | |
2273 | //===----------------------------------------------------------------------===// |
2274 | // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and |
2275 | // LoopVectorizationCostModel and LoopVectorizationPlanner. |
2276 | //===----------------------------------------------------------------------===// |
2277 | |
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// \param B            builder positioned where the new IR should be emitted.
/// \param Index        the iteration index to transform.
/// \param StartValue   the induction's start value.
/// \param Step         the induction's step (integer, pointer offset, or FP).
/// \param InductionKind selects the int/pointer/FP lowering below.
/// \param InductionBinOp the original FAdd/FSub; required for FP inductions.
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  // Coerce Index to the step's type: sext/trunc for integers, int-to-FP for
  // floating-point inductions.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast" );
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Add helper that folds away additions of a zero constant.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!" );
    if (auto *CX = dyn_cast<ConstantInt>(Val: X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!" );
    if (auto *CX = dyn_cast<ConstantInt>(Val: X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet" );
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type" );
    // Special-case step == -1 as a plain subtraction.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet" );
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value" );
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction" );

    // Reuse the original FAdd/FSub opcode so FP semantics are preserved.
    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction" );
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum" );
}
2363 | |
2364 | std::optional<unsigned> getMaxVScale(const Function &F, |
2365 | const TargetTransformInfo &TTI) { |
2366 | if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale()) |
2367 | return MaxVScale; |
2368 | |
2369 | if (F.hasFnAttribute(Attribute::VScaleRange)) |
2370 | return F.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); |
2371 | |
2372 | return std::nullopt; |
2373 | } |
2374 | |
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
/// \param Cost the cost model, providing TTI, Legal, PSE and the loop.
/// \param VF   the vectorization factor being considered.
/// \param UF   the unroll factor; if absent, the maximum interleave factor
///             for \p VF is assumed conservatively.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  // The overflow check is performed in the widest induction type.
  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(Val: IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC =
          Cost->PSE.getSE()->getSmallConstantMaxTripCount(L: Cost->TheLoop)) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable VFs, the worst case is the known-min VF scaled by the
      // maximum possible vscale; if that is unknown, stay conservative.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // Overflow cannot happen iff the headroom above the trip count exceeds
    // the largest possible per-iteration step (VF * UF).
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2407 | |
2408 | // Return whether we allow using masked interleave-groups (for dealing with |
2409 | // strided loads/stores that reside in predicated blocks, or for dealing |
2410 | // with gaps). |
2411 | static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { |
2412 | // If an override option has been passed in for interleaved accesses, use it. |
2413 | if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0) |
2414 | return EnableMaskedInterleavedMemAccesses; |
2415 | |
2416 | return TTI.enableMaskedInterleavedAccessVectorization(); |
2417 | } |
2418 | |
2419 | // Try to vectorize the interleave group that \p Instr belongs to. |
2420 | // |
2421 | // E.g. Translate following interleaved load group (factor = 3): |
2422 | // for (i = 0; i < N; i+=3) { |
2423 | // R = Pic[i]; // Member of index 0 |
2424 | // G = Pic[i+1]; // Member of index 1 |
2425 | // B = Pic[i+2]; // Member of index 2 |
2426 | // ... // do something to R, G, B |
2427 | // } |
2428 | // To: |
2429 | // %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B |
2430 | // %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements |
2431 | // %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements |
2432 | // %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements |
2433 | // |
2434 | // Or translate following interleaved store group (factor = 3): |
2435 | // for (i = 0; i < N; i+=3) { |
2436 | // ... do something to R, G, B |
2437 | // Pic[i] = R; // Member of index 0 |
2438 | // Pic[i+1] = G; // Member of index 1 |
2439 | // Pic[i+2] = B; // Member of index 2 |
2440 | // } |
2441 | // To: |
2442 | // %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> |
2443 | // %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u> |
2444 | // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, |
2445 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements |
2446 | // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B |
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask, bool NeedsMaskForGaps) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(I: Instr);
  unsigned InterleaveFactor = Group->getFactor();
  // The wide vector covers all members of VF consecutive interleave groups.
  auto *VecTy = VectorType::get(ElementType: ScalarTy, EC: VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported." );

  Value *Idx;
  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse()) {
    Value *RuntimeVF = getRuntimeVF(B&: Builder, Ty: Builder.getInt32Ty(), VF);
    Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: Builder.getInt32(C: 1));
    Idx = Builder.CreateMul(LHS: Idx, RHS: Builder.getInt32(C: Group->getFactor()));
    Idx = Builder.CreateAdd(LHS: Idx, RHS: Builder.getInt32(C: Index));
    Idx = Builder.CreateNeg(V: Idx);
  } else
    Idx = Builder.getInt32(C: -Index);

  // Compute one adjusted base address per unroll part.
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Def: Addr, Instance: VPIteration(Part, 0));
    if (auto *I = dyn_cast<Instruction>(Val: AddrPart))
      State.setDebugLocFrom(I->getDebugLoc());

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
    // b = A[i]; // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g. A[i+1] = a; // Member of index 1
    // A[i] = b; // Member of index 0
    // A[i+2] = c; // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Val: AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(Ty: ScalarTy, Ptr: AddrPart, IdxList: Idx, Name: "" , IsInBounds: InBounds);
    AddrParts.push_back(Elt: AddrPart);
  }

  State.setDebugLocFrom(Instr->getDebugLoc());
  Value *PoisonVec = PoisonValue::get(T: VecTy);

  // Build the lane mask for one unroll part, combining the block-in mask
  // (if any) with the mask covering gaps in the group (if needed).
  auto CreateGroupMask = [this, &BlockInMask, &State, &InterleaveFactor](
                             unsigned Part, Value *MaskForGaps) -> Value * {
    if (VF.isScalable()) {
      assert(!MaskForGaps && "Interleaved groups with gaps are not supported." );
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors" );
      auto *BlockInMaskPart = State.get(Def: BlockInMask, Part);
      SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
      auto *MaskTy =
          VectorType::get(ElementType: Builder.getInt1Ty(), NumElements: VF.getKnownMinValue() * 2, Scalable: true);
      return Builder.CreateIntrinsic(
          MaskTy, Intrinsic::experimental_vector_interleave2, Ops,
          /*FMFSource=*/nullptr, "interleaved.mask" );
    }

    if (!BlockInMask)
      return MaskForGaps;

    Value *BlockInMaskPart = State.get(Def: BlockInMask, Part);
    Value *ShuffledMask = Builder.CreateShuffleVector(
        V: BlockInMaskPart,
        Mask: createReplicatedMask(ReplicationFactor: InterleaveFactor, VF: VF.getKnownMinValue()),
        Name: "interleaved.mask" );
    return MaskForGaps ? Builder.CreateBinOp(Opc: Instruction::And, LHS: ShuffledMask,
                                             RHS: MaskForGaps)
                       : ShuffledMask;
  };

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Val: Instr)) {
    Value *MaskForGaps = nullptr;
    if (NeedsMaskForGaps) {
      MaskForGaps =
          createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
      assert(MaskForGaps && "Mask for Gaps is required but it is null" );
    }

    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed." );
        Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
        NewLoad =
            Builder.CreateMaskedLoad(Ty: VecTy, Ptr: AddrParts[Part], Alignment: Group->getAlign(),
                                     Mask: GroupMask, PassThru: PoisonVec, Name: "wide.masked.vec" );
      }
      else
        NewLoad = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: AddrParts[Part],
                                            Align: Group->getAlign(), Name: "wide.vec" );
      Group->addMetadata(NewInst: NewLoad);
      NewLoads.push_back(Elt: NewLoad);
    }

    if (VecTy->isScalableTy()) {
      assert(InterleaveFactor == 2 &&
             "Unsupported deinterleave factor for scalable vectors" );

      for (unsigned Part = 0; Part < UF; ++Part) {
        // Scalable vectors cannot use arbitrary shufflevectors (only splats),
        // so must use intrinsics to deinterleave.
        Value *DI = Builder.CreateIntrinsic(
            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
            /*FMFSource=*/nullptr, "strided.vec" );
        unsigned J = 0;
        for (unsigned I = 0; I < InterleaveFactor; ++I) {
          Instruction *Member = Group->getMember(Index: I);

          // Skip the gaps in the group.
          if (!Member)
            continue;

          Value *StridedVec = Builder.CreateExtractValue(Agg: DI, Idxs: I);
          // If this member has different type, cast the result type.
          if (Member->getType() != ScalarTy) {
            VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
            StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
          }

          if (Group->isReverse())
            StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" );

          State.set(Def: VPDefs[J], V: StridedVec, Part);
          ++J;
        }
      }

      return;
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(Index: I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(Start: I, Stride: InterleaveFactor, VF: VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            V: NewLoads[Part], Mask: StrideMask, Name: "strided.vec" );

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable." );
          VectorType *OtherVTy = VectorType::get(ElementType: Member->getType(), EC: VF);
          StridedVec = createBitOrPointerCast(V: StridedVec, DstVTy: OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = Builder.CreateVectorReverse(V: StridedVec, Name: "reverse" );

        State.set(Def: VPDefs[J], V: StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ElementType: ScalarTy, EC: VF);

  // Vectorize the interleaved store group.
  Value *MaskForGaps =
      createBitMaskForGaps(Builder, VF: VF.getKnownMinValue(), Group: *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed." );
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported." );
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    unsigned StoredIdx = 0;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group" );
      Instruction *Member = Group->getMember(Index: i);

      // Skip the gaps in the group; gap lanes are poison and masked off.
      if (!Member) {
        Value *Undef = PoisonValue::get(T: SubVT);
        StoredVecs.push_back(Elt: Undef);
        continue;
      }

      Value *StoredVec = State.get(Def: StoredValues[StoredIdx], Part);
      ++StoredIdx;

      if (Group->isReverse())
        StoredVec = Builder.CreateVectorReverse(V: StoredVec, Name: "reverse" );

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(V: StoredVec, DstVTy: SubVT, DL);

      StoredVecs.push_back(Elt: StoredVec);
    }

    // Interleave all the smaller vectors into one wider vector.
    Value *IVec = interleaveVectors(Builder, Vals: StoredVecs, Name: "interleaved.vec" );
    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      Value *GroupMask = CreateGroupMask(Part, MaskForGaps);
      NewStoreInstr = Builder.CreateMaskedStore(Val: IVec, Ptr: AddrParts[Part],
                                                Alignment: Group->getAlign(), Mask: GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(Val: IVec, Ptr: AddrParts[Part], Align: Group->getAlign());

    Group->addMetadata(NewInst: NewStoreInstr);
  }
}
2688 | |
/// Emit one scalar copy of \p Instr for the lane/part identified by
/// \p Instance: clone the instruction, rewire its operands to their per-lane
/// scalar values from \p State, insert it at the current builder position,
/// and record it as the value of \p RepRecipe for that lane/part.
void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors" );

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Val: Instr))
    if (!Instance.isFirstIteration())
      return;

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy) {
    Cloned->setName(Instr->getName() + ".cloned" );
#if !defined(NDEBUG)
    // Verify that VPlan type inference results agree with the type of the
    // generated values.
    assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
           "inferred type and type from generated instructions do not match" );
#endif
  }

  // Transfer the recipe's IR flags onto the clone.
  RepRecipe->setFlags(Cloned);

  // Keep the debug location of the original instruction, if it has one.
  if (auto DL = Instr->getDebugLoc())
    State.setDebugLocFrom(DL);

  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (const auto &I : enumerate(First: RepRecipe->operands())) {
    auto InputInstance = Instance;
    VPValue *Operand = I.value();
    // Uniform-after-vectorization operands only have a value for lane 0;
    // always read that lane regardless of the lane being scalarized.
    if (vputils::isUniformAfterVectorization(VPV: Operand))
      InputInstance.Lane = VPLane::getFirstLane();
    Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Instance: InputInstance));
  }
  State.addNewMetadata(To: Cloned, Orig: Instr);

  // Place the cloned scalar in the new loop.
  State.Builder.Insert(I: Cloned);

  State.set(Def: RepRecipe, V: Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
    AC->registerAssumption(CI: II);

  // End if-block. Clones emitted inside a replicate region are predicated and
  // collected for later predication handling.
  bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Elt: Cloned);
}
2745 | |
/// Compute (and cache in VectorTripCount) the trip count of the vector loop:
/// the original trip count rounded to a multiple of VF * UF — rounded up when
/// the tail is folded by masking, down otherwise — and reduced by one full
/// step when a scalar epilogue must run even for an evenly-dividing count.
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  // Return the cached value on repeated queries.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking" );
    Value *NumLanes = getRuntimeVF(B&: Builder, Ty, VF: VF * UF);
    TC = Builder.CreateAdd(
        LHS: TC, RHS: Builder.CreateSub(LHS: NumLanes, RHS: ConstantInt::get(Ty, V: 1)), Name: "n.rnd.up" );
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf" );

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
    R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
  }

  VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec" );

  return VectorTripCount;
}
2796 | |
2797 | Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, |
2798 | const DataLayout &DL) { |
2799 | // Verify that V is a vector type with same number of elements as DstVTy. |
2800 | auto *DstFVTy = cast<VectorType>(Val: DstVTy); |
2801 | auto VF = DstFVTy->getElementCount(); |
2802 | auto *SrcVecTy = cast<VectorType>(Val: V->getType()); |
2803 | assert(VF == SrcVecTy->getElementCount() && "Vector dimensions do not match" ); |
2804 | Type *SrcElemTy = SrcVecTy->getElementType(); |
2805 | Type *DstElemTy = DstFVTy->getElementType(); |
2806 | assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) && |
2807 | "Vector elements must have same size" ); |
2808 | |
2809 | // Do a direct cast if element types are castable. |
2810 | if (CastInst::isBitOrNoopPointerCastable(SrcTy: SrcElemTy, DestTy: DstElemTy, DL)) { |
2811 | return Builder.CreateBitOrPointerCast(V, DestTy: DstFVTy); |
2812 | } |
2813 | // V cannot be directly casted to desired vector type. |
2814 | // May happen when V is a floating point vector but DstVTy is a vector of |
2815 | // pointers or vice-versa. Handle this using a two-step bitcast using an |
2816 | // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float. |
2817 | assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) && |
2818 | "Only one type should be a pointer type" ); |
2819 | assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) && |
2820 | "Only one type should be a floating point type" ); |
2821 | Type *IntTy = |
2822 | IntegerType::getIntNTy(C&: V->getContext(), N: DL.getTypeSizeInBits(Ty: SrcElemTy)); |
2823 | auto *VecIntTy = VectorType::get(ElementType: IntTy, EC: VF); |
2824 | Value *CastVal = Builder.CreateBitOrPointerCast(V, DestTy: VecIntTy); |
2825 | return Builder.CreateBitOrPointerCast(V: CastVal, DestTy: DstFVTy); |
2826 | } |
2827 | |
/// Emit the minimum-iteration-count guard in front of the vector loop: branch
/// to \p Bypass (the scalar loop) when the trip count is too small for a
/// single vector iteration, or — for scalable VFs without a known-safe bound —
/// when the induction-variable update could overflow. Splits off a fresh
/// vector preheader and updates the dominator tree accordingly.
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the runtime step may exceed the static minimum; take
    // the larger of the two.
    return Builder.CreateBinaryIntrinsic(
        Intrinsic::ID: umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None)
    CheckMinIters =
        Builder.CreateICmp(P, LHS: Count, RHS: CreateStep(), Name: "min.iters.check" );
  else if (VF.isScalable() &&
           !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
           Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
    Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
  }

  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), DT, LI, MSSAU: nullptr,
                 BBName: "vector.ph" );

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass" );

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
  LoopBypassBlocks.push_back(Elt: TCCheckBlock);
}
2903 | |
/// Emit the runtime checks for the SCEV assumptions (e.g. stride/overflow
/// predicates) made during analysis, branching to \p Bypass when a check
/// fails. Returns the check block, or nullptr if no SCEV checks are needed.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  // Emitting runtime checks when optimizing for size is only acceptable if the
  // user explicitly forced vectorization.
  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size" );


  // Update dominator only if this is first RT check.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(BB: Bypass, NewBB: SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(Elt: SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
2930 | |
/// Emit the runtime memory checks (array-overlap/aliasing) in their own
/// block, branching to \p Bypass when the checks fail. Returns the check
/// block, or nullptr when no memory checks are required (or on the
/// VPlan-native path, which does not analyze runtime checks).
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize." );
    // Warn the user that forced vectorization under -Os/-Oz grows code size.
    ORE->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize" ,
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict')." ;
    });
  }

  LoopBypassBlocks.push_back(Elt: MemCheckBlock);

  AddedSafetyChecks = true;

  return MemCheckBlock;
}
2966 | |
/// Split the original preheader to create the skeleton blocks around the
/// yet-to-be-created vector loop — the middle block and the scalar preheader,
/// named with \p Prefix — and set up the middle-block terminator.
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure" );
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
         "multiple exit loop without required epilogue?" );

  LoopMiddleBlock =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "middle.block" );
  LoopScalarPreHeader =
      SplitBlock(Old: LoopMiddleBlock, SplitPt: LoopMiddleBlock->getTerminator(), DT, LI,
                 MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph" );

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator. Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case). In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block. completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst =
      Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())
          ? BranchInst::Create(IfTrue: LoopScalarPreHeader)
          : BranchInst::Create(IfTrue: LoopExitBlock, IfFalse: LoopScalarPreHeader,
                               Cond: Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(From: LoopMiddleBlock->getTerminator(), To: BrInst);

  // Update dominator for loop exit. During skeleton creation, only the vector
  // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: LoopMiddleBlock);
}
3009 | |
/// Create the "bc.resume.val" phi in the scalar preheader for induction
/// \p OrigPhi: it merges the induction's end value after the vector loop
/// (incoming from the middle block) with its original start value (incoming
/// from each bypass block), and — if \p AdditionalBypass is provided — the
/// end value computed for that extra bypass edge.
PHINode *InnerLoopVectorizer::createInductionResumeValue(
    PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
    ArrayRef<BasicBlock *> BypassBlocks,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments" );

  Instruction *OldInduction = Legal->getPrimaryInduction();
  Value *&EndValue = IVEndValues[OrigPhi];
  Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
  if (OrigPhi == OldInduction) {
    // We know what the end value is.
    EndValue = VectorTripCount;
  } else {
    // For secondary inductions, compute the end value by transforming the
    // vector trip count through the induction's start/step expression.
    IRBuilder<> B(LoopVectorPreHeader->getTerminator());

    // Fast-math-flags propagate from the original induction instruction.
    if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

    EndValue = emitTransformedIndex(B, Index: VectorTripCount, StartValue: II.getStartValue(),
                                    Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
    EndValue->setName("ind.end" );

    // Compute the end value for the additional bypass (if applicable).
    if (AdditionalBypass.first) {
      B.SetInsertPoint(TheBB: AdditionalBypass.first,
                       IP: AdditionalBypass.first->getFirstInsertionPt());
      EndValueFromAdditionalBypass =
          emitTransformedIndex(B, Index: AdditionalBypass.second, StartValue: II.getStartValue(),
                               Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
      EndValueFromAdditionalBypass->setName("ind.end" );
    }
  }

  // Create phi nodes to merge from the backedge-taken check block.
  PHINode *BCResumeVal =
      PHINode::Create(Ty: OrigPhi->getType(), NumReservedValues: 3, NameStr: "bc.resume.val" ,
                      InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator());
  // Copy original phi DL over to the new one.
  BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());

  // The new PHI merges the original incoming value, in case of a bypass,
  // or the value at the end of the vectorized loop.
  BCResumeVal->addIncoming(V: EndValue, BB: LoopMiddleBlock);

  // Fix the scalar body counter (PHI node).
  // The old induction's phi node in the scalar body needs the truncated
  // value.
  for (BasicBlock *BB : BypassBlocks)
    BCResumeVal->addIncoming(V: II.getStartValue(), BB);

  if (AdditionalBypass.first)
    BCResumeVal->setIncomingValueForBlock(BB: AdditionalBypass.first,
                                          V: EndValueFromAdditionalBypass);
  return BCResumeVal;
}
3067 | |
3068 | /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV |
3069 | /// expansion results. |
3070 | static Value *getExpandedStep(const InductionDescriptor &ID, |
3071 | const SCEV2ValueTy &ExpandedSCEVs) { |
3072 | const SCEV *Step = ID.getStep(); |
3073 | if (auto *C = dyn_cast<SCEVConstant>(Val: Step)) |
3074 | return C->getValue(); |
3075 | if (auto *U = dyn_cast<SCEVUnknown>(Val: Step)) |
3076 | return U->getValue(); |
3077 | auto I = ExpandedSCEVs.find(Val: Step); |
3078 | assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point" ); |
3079 | return I->second; |
3080 | } |
3081 | |
/// Create resume phis in the scalar preheader for every induction variable,
/// so the scalar remainder loop starts from the counter value reached by the
/// vector loop (or from the original start value on bypass edges).
void InnerLoopVectorizer::createInductionResumeValues(
    const SCEV2ValueTy &ExpandedSCEVs,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass." );
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (const auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    PHINode *BCResumeVal = createInductionResumeValue(
        OrigPhi, II, Step: getExpandedStep(ID: II, ExpandedSCEVs), BypassBlocks: LoopBypassBlocks,
        AdditionalBypass);
    OrigPhi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: BCResumeVal);
  }
}
3104 | |
/// Finish the loop skeleton: install the middle-block trip-count comparison
/// that decides whether the scalar remainder loop must execute, attach branch
/// weights when profile data is available, and return the vector preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
  // The trip counts should be cached by now.
  Value *Count = getTripCount();
  Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop. Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) &&
      !Cost->foldTailByMasking()) {
    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
    // operands. Perform simplification directly on VPlan once the branch is
    // modeled there.
    IRBuilder<> B(LoopMiddleBlock->getTerminator());
    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
    Value *CmpN = B.CreateICmpEQ(LHS: Count, RHS: VectorTripCount, Name: "cmp.n" );
    BranchInst &BI = *cast<BranchInst>(Val: LoopMiddleBlock->getTerminator());
    BI.setCondition(CmpN);
    if (hasBranchWeightMD(I: *ScalarLatchTerm)) {
      // Assume that `Count % VectorTripCount` is equally distributed.
      unsigned TripCount = UF * VF.getKnownMinValue();
      assert(TripCount > 0 && "trip count should not be zero" );
      const uint32_t Weights[] = {1, TripCount - 1};
      setBranchWeights(I&: BI, Weights);
    }
  }

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  return LoopVectorPreHeader;
}
3149 | |
/// Generate the skeleton around the future vector loop: runtime guards
/// (minimum-iteration, SCEV and memory checks), preheaders, the middle block
/// and the induction resume phis. The vector loop body itself is emitted
/// later, during VPlan execution.
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
  | ->[ ]     <--- new preheader.
  |    |
  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
  |   [ ] \
  |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
   \   |
    \  v
     >[ ]     <-- exit block(s).
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton(Prefix: "" );

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(Bypass: LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Bypass: LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(ExpandedSCEVs);

  return {completeLoopSkeleton(), nullptr};
}
3211 | |
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *, VPlan &Plan,
                                       VPTransformState &State) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block" );

  // Maps each out-of-loop LCSSA phi to the value it must receive when control
  // arrives from the middle block.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(Val: U);
    if (!OrigLoop->contains(Inst: UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form" );
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(Val: U);
    if (!OrigLoop->contains(Inst: UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form" );
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Value *CountMinusOne = B.CreateSub(
          LHS: VectorTripCount, RHS: ConstantInt::get(Ty: VectorTripCount->getType(), V: 1));
      CountMinusOne->setName("cmo" );

      VPValue *StepVPV = Plan.getSCEVExpansion(S: II.getStep());
      assert(StepVPV && "step must have been expanded during VPlan execution" );
      Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
                                        : State.get(Def: StepVPV, Instance: {0, 0});
      Value *Escape =
          emitTransformedIndex(B, Index: CountMinusOne, StartValue: II.getStartValue(), Step,
                               InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
      Escape->setName("ind.escape" );
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(Val: I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(BB: MiddleBlock) == -1) {
      PHI->addIncoming(V: I.second, BB: MiddleBlock);
      Plan.removeLiveOut(PN: PHI);
    }
  }
}
3284 | |
namespace {

/// DenseMapInfo implementation used to CSE the redundant vector-manipulation
/// instructions the vectorizer emits (insert/extract-element, shufflevector
/// and GEP): instructions are hashed by opcode plus operands and compared
/// structurally with isIdenticalTo.
struct CSEDenseMapInfo {
  // Only these instruction kinds participate in CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!" );
    // Hash the opcode together with all operand values.
    return hash_combine(args: I->getOpcode(), args: hash_combine_range(first: I->value_op_begin(),
                                                   last: I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must never be dereferenced; compare them by pointer only.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};

} // end anonymous namespace
3316 | |
/// Perform common subexpression elimination of induction variable
/// instructions within basic block \p BB, keeping the first occurrence of
/// each equivalent instruction and deleting later duplicates.
static void cse(BasicBlock *BB) {
  // Perform simple cse. Use an early-increment range so the current
  // instruction can be erased while iterating.
  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
  for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
    if (!CSEDenseMapInfo::canHandle(I: &In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions. If so, redirect all uses to the earlier copy and
    // delete the duplicate.
    if (Instruction *V = CSEMap.lookup(Val: &In)) {
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
      continue;
    }

    // First occurrence: record it as the canonical copy.
    CSEMap[&In] = &In;
  }
}
3336 | |
/// Return the cost of the call \p CI at vectorization factor \p VF. For
/// vector VFs the pre-computed widening decision's cost is returned; for the
/// scalar VF the cost is computed here (scalar call vs. intrinsic, whichever
/// is cheaper).
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)).Cost;

  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  Type *RetTy = CI->getType();
  // fmuladd may fold into a cheaper fused reduction pattern; prefer that
  // cost when available.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy, CostKind))
      return *RedCost;

  // Collect the (scalar) argument types for the call-cost query.
  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
3365 | |
3366 | static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { |
3367 | if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) |
3368 | return Elt; |
3369 | return VectorType::get(ElementType: Elt, EC: VF); |
3370 | } |
3371 | |
/// Return the cost of intrinsic call \p CI widened to vectorization factor
/// \p VF, widening both the return type and the parameter types.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!" );
  // Widen the return type to a vector of VF elements (no-op for scalar VF).
  Type *RetTy = MaybeVectorizeType(Elt: CI->getType(), VF);
  // Propagate the call's fast-math flags, if any, into the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  // Widen each parameter type the same way as the return type.
  SmallVector<Type *> ParamTys;
  std::transform(first: FTy->param_begin(), last: FTy->param_end(),
                 result: std::back_inserter(x&: ParamTys),
                 unary_op: [&](Type *Ty) { return MaybeVectorizeType(Elt: Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(Val: CI));
  return TTI.getIntrinsicInstrCost(ICA: CostAttrs,
                                   CostKind: TargetTransformInfo::TCK_RecipThroughput);
}
3394 | |
3395 | static Type *smallestIntegerVectorType(Type *T1, Type *T2) { |
3396 | auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType()); |
3397 | auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType()); |
3398 | return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2; |
3399 | } |
3400 | |
3401 | static Type *largestIntegerVectorType(Type *T1, Type *T2) { |
3402 | auto *I1 = cast<IntegerType>(Val: cast<VectorType>(Val: T1)->getElementType()); |
3403 | auto *I2 = cast<IntegerType>(Val: cast<VectorType>(Val: T2)->getElementType()); |
3404 | return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2; |
3405 | } |
3406 | |
3407 | void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State, |
3408 | VPlan &Plan) { |
3409 | // Fix widened non-induction PHIs by setting up the PHI operands. |
3410 | if (EnableVPlanNativePath) |
3411 | fixNonInductionPHIs(Plan, State); |
3412 | |
3413 | // At this point every instruction in the original loop is widened to a |
3414 | // vector form. Now we need to fix the recurrences in the loop. These PHI |
3415 | // nodes are currently empty because we did not want to introduce cycles. |
3416 | // This is the second stage of vectorizing recurrences. Note that fixing |
3417 | // reduction phis are already modeled in VPlan. |
3418 | // TODO: Also model fixing fixed-order recurrence phis in VPlan. |
3419 | VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion(); |
3420 | VPBasicBlock * = VectorRegion->getEntryBasicBlock(); |
3421 | for (VPRecipeBase &R : HeaderVPBB->phis()) { |
3422 | if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &R)) |
3423 | fixFixedOrderRecurrence(PhiR: FOR, State); |
3424 | } |
3425 | |
3426 | // Forget the original basic block. |
3427 | PSE.getSE()->forgetLoop(L: OrigLoop); |
3428 | PSE.getSE()->forgetBlockAndLoopDispositions(); |
3429 | |
3430 | // After vectorization, the exit blocks of the original loop will have |
3431 | // additional predecessors. Invalidate SCEVs for the exit phis in case SE |
3432 | // looked through single-entry phis. |
3433 | SmallVector<BasicBlock *> ExitBlocks; |
3434 | OrigLoop->getExitBlocks(ExitBlocks); |
3435 | for (BasicBlock *Exit : ExitBlocks) |
3436 | for (PHINode &PN : Exit->phis()) |
3437 | PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN); |
3438 | |
3439 | VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock(); |
3440 | Loop *VectorLoop = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[LatchVPBB]); |
3441 | if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) { |
3442 | // No edge from the middle block to the unique exit block has been inserted |
3443 | // and there is nothing to fix from vector loop; phis should have incoming |
3444 | // from scalar loop only. |
3445 | } else { |
3446 | // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking |
3447 | // the cost model. |
3448 | |
3449 | // If we inserted an edge from the middle block to the unique exit block, |
3450 | // update uses outside the loop (phis) to account for the newly inserted |
3451 | // edge. |
3452 | |
3453 | // Fix-up external users of the induction variables. |
3454 | for (const auto &Entry : Legal->getInductionVars()) |
3455 | fixupIVUsers(OrigPhi: Entry.first, II: Entry.second, |
3456 | VectorTripCount: getOrCreateVectorTripCount(InsertBlock: VectorLoop->getLoopPreheader()), |
3457 | EndValue: IVEndValues[Entry.first], MiddleBlock: LoopMiddleBlock, |
3458 | VectorHeader: VectorLoop->getHeader(), Plan, State); |
3459 | } |
3460 | |
3461 | // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated |
3462 | // in the exit block, so update the builder. |
3463 | State.Builder.SetInsertPoint(TheBB: State.CFG.ExitBB, |
3464 | IP: State.CFG.ExitBB->getFirstNonPHIIt()); |
3465 | for (const auto &KV : Plan.getLiveOuts()) |
3466 | KV.second->fixPhi(Plan, State); |
3467 | |
3468 | for (Instruction *PI : PredicatedInstructions) |
3469 | sinkScalarOperands(PredInst: &*PI); |
3470 | |
3471 | // Remove redundant induction instructions. |
3472 | cse(BB: VectorLoop->getHeader()); |
3473 | |
3474 | // Set/update profile weights for the vector and remainder loops as original |
3475 | // loop iterations are now distributed among them. Note that original loop |
3476 | // represented by LoopScalarBody becomes remainder loop after vectorization. |
3477 | // |
3478 | // For cases like foldTailByMasking() and requiresScalarEpiloque() we may |
3479 | // end up getting slightly roughened result but that should be OK since |
3480 | // profile is not inherently precise anyway. Note also possible bypass of |
3481 | // vector code caused by legality checks is ignored, assigning all the weight |
3482 | // to the vector loop, optimistically. |
3483 | // |
3484 | // For scalable vectorization we can't know at compile time how many iterations |
3485 | // of the loop are handled in one vector iteration, so instead assume a pessimistic |
3486 | // vscale of '1'. |
3487 | setProfileInfoAfterUnrolling(OrigLoop: LI->getLoopFor(BB: LoopScalarBody), UnrolledLoop: VectorLoop, |
3488 | RemainderLoop: LI->getLoopFor(BB: LoopScalarBody), |
3489 | UF: VF.getKnownMinValue() * UF); |
3490 | } |
3491 | |
3492 | void InnerLoopVectorizer::fixFixedOrderRecurrence( |
3493 | VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) { |
3494 | // This is the second phase of vectorizing first-order recurrences. An |
3495 | // overview of the transformation is described below. Suppose we have the |
3496 | // following loop. |
3497 | // |
3498 | // for (int i = 0; i < n; ++i) |
3499 | // b[i] = a[i] - a[i - 1]; |
3500 | // |
3501 | // There is a first-order recurrence on "a". For this loop, the shorthand |
3502 | // scalar IR looks like: |
3503 | // |
3504 | // scalar.ph: |
3505 | // s_init = a[-1] |
3506 | // br scalar.body |
3507 | // |
3508 | // scalar.body: |
3509 | // i = phi [0, scalar.ph], [i+1, scalar.body] |
3510 | // s1 = phi [s_init, scalar.ph], [s2, scalar.body] |
3511 | // s2 = a[i] |
3512 | // b[i] = s2 - s1 |
3513 | // br cond, scalar.body, ... |
3514 | // |
3515 | // In this example, s1 is a recurrence because it's value depends on the |
3516 | // previous iteration. In the first phase of vectorization, we created a |
3517 | // vector phi v1 for s1. We now complete the vectorization and produce the |
3518 | // shorthand vector IR shown below (for VF = 4, UF = 1). |
3519 | // |
3520 | // vector.ph: |
3521 | // v_init = vector(..., ..., ..., a[-1]) |
3522 | // br vector.body |
3523 | // |
3524 | // vector.body |
3525 | // i = phi [0, vector.ph], [i+4, vector.body] |
3526 | // v1 = phi [v_init, vector.ph], [v2, vector.body] |
3527 | // v2 = a[i, i+1, i+2, i+3]; |
3528 | // v3 = vector(v1(3), v2(0, 1, 2)) |
3529 | // b[i, i+1, i+2, i+3] = v2 - v3 |
3530 | // br cond, vector.body, middle.block |
3531 | // |
3532 | // middle.block: |
3533 | // x = v2(3) |
3534 | // br scalar.ph |
3535 | // |
3536 | // scalar.ph: |
3537 | // s_init = phi [x, middle.block], [a[-1], otherwise] |
3538 | // br scalar.body |
3539 | // |
3540 | // After execution completes the vector loop, we extract the next value of |
3541 | // the recurrence (x) to use as the initial value in the scalar loop. |
3542 | |
3543 | // Extract the last vector element in the middle block. This will be the |
3544 | // initial value for the recurrence when jumping to the scalar loop. |
3545 | VPValue *PreviousDef = PhiR->getBackedgeValue(); |
3546 | Value *Incoming = State.get(Def: PreviousDef, Part: UF - 1); |
3547 | auto * = Incoming; |
3548 | auto *IdxTy = Builder.getInt32Ty(); |
3549 | Value *RuntimeVF = nullptr; |
3550 | if (VF.isVector()) { |
3551 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
3552 | Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); |
3553 | RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF); |
3554 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
3555 | ExtractForScalar = |
3556 | Builder.CreateExtractElement(Vec: Incoming, Idx: LastIdx, Name: "vector.recur.extract" ); |
3557 | } |
3558 | |
3559 | auto RecurSplice = cast<VPInstruction>(Val: *PhiR->user_begin()); |
3560 | assert(PhiR->getNumUsers() == 1 && |
3561 | RecurSplice->getOpcode() == |
3562 | VPInstruction::FirstOrderRecurrenceSplice && |
3563 | "recurrence phi must have a single user: FirstOrderRecurrenceSplice" ); |
3564 | SmallVector<VPLiveOut *> LiveOuts; |
3565 | for (VPUser *U : RecurSplice->users()) |
3566 | if (auto *LiveOut = dyn_cast<VPLiveOut>(Val: U)) |
3567 | LiveOuts.push_back(Elt: LiveOut); |
3568 | |
3569 | if (!LiveOuts.empty()) { |
3570 | // Extract the second last element in the middle block if the |
3571 | // Phi is used outside the loop. We need to extract the phi itself |
3572 | // and not the last element (the phi update in the current iteration). This |
3573 | // will be the value when jumping to the exit block from the |
3574 | // LoopMiddleBlock, when the scalar loop is not run at all. |
3575 | Value * = nullptr; |
3576 | if (VF.isVector()) { |
3577 | auto *Idx = Builder.CreateSub(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: IdxTy, V: 2)); |
3578 | ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement( |
3579 | Vec: Incoming, Idx, Name: "vector.recur.extract.for.phi" ); |
3580 | } else { |
3581 | assert(UF > 1 && "VF and UF cannot both be 1" ); |
3582 | // When loop is unrolled without vectorizing, initialize |
3583 | // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled |
3584 | // value of `Incoming`. This is analogous to the vectorized case above: |
3585 | // extracting the second last element when VF > 1. |
3586 | ExtractForPhiUsedOutsideLoop = State.get(Def: PreviousDef, Part: UF - 2); |
3587 | } |
3588 | |
3589 | for (VPLiveOut *LiveOut : LiveOuts) { |
3590 | assert(!Cost->requiresScalarEpilogue(VF.isVector())); |
3591 | PHINode *LCSSAPhi = LiveOut->getPhi(); |
3592 | LCSSAPhi->addIncoming(V: ExtractForPhiUsedOutsideLoop, BB: LoopMiddleBlock); |
3593 | State.Plan->removeLiveOut(PN: LCSSAPhi); |
3594 | } |
3595 | } |
3596 | |
3597 | // Fix the initial value of the original recurrence in the scalar loop. |
3598 | Builder.SetInsertPoint(TheBB: LoopScalarPreHeader, IP: LoopScalarPreHeader->begin()); |
3599 | PHINode *Phi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
3600 | auto *Start = Builder.CreatePHI(Ty: Phi->getType(), NumReservedValues: 2, Name: "scalar.recur.init" ); |
3601 | auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue(); |
3602 | for (auto *BB : predecessors(BB: LoopScalarPreHeader)) { |
3603 | auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit; |
3604 | Start->addIncoming(V: Incoming, BB); |
3605 | } |
3606 | |
3607 | Phi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: Start); |
3608 | Phi->setName("scalar.recur" ); |
3609 | } |
3610 | |
/// Iteratively sink scalar operands of the predicated instruction \p PredInst
/// into its predicated block, so they only execute when the predicate holds.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(BB: PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(Val: U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(Val: I))
      BB = Phi->getIncomingBlock(
          i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, it's
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(Start: InstsToReanalyze.begin(), End: InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Val: Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking a load past
      // non-store instructions.
      if (!I || isa<PHINode>(Val: I) || !VectorLoop->contains(Inst: I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(Start: I->op_begin(), End: I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(Range: I->uses(), P: isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(Elt: I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // it's operands to the worklist.
      I->moveBefore(MovePos: &*PredBB->getFirstInsertionPt());
      Worklist.insert(Start: I->op_begin(), End: I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
3685 | |
/// Fix widened non-induction PHI nodes (VPlan-native path): after all recipes
/// have been executed, fill in the incoming values/blocks of each widened phi
/// from the corresponding VPlan incoming values and blocks.
void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
                                              VPTransformState &State) {
  // Walk every VPBasicBlock reachable from the plan entry, including nested
  // regions.
  auto Iter = vp_depth_first_deep(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi, Part: 0));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
        VPValue *Inc = VPPhi->getIncomingValue(I: i);
        // NOTE(review): this inner VPBB shadows the loop variable above —
        // intentional here, but a rename would aid readability.
        VPBasicBlock *VPBB = VPPhi->getIncomingBlock(I: i);
        NewPhi->addIncoming(V: State.get(Def: Inc, Part: 0), BB: State.CFG.VPBB2IRBB[VPBB]);
      }
    }
  }
}
3705 | |
/// Collect the set of instructions that will remain scalar (per-lane) after
/// vectorization with factor \p VF, populating Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF" );

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert(I: Uniforms[VF].begin(), E: Uniforms[VF].end());
    return;
  }

  // Worklist of instructions known (so far) to remain scalar.
  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment" );
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand" );
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(Val: V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(Val: V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(Range: I->users(), P: [&](User *U) {
          return isa<LoadInst>(Val: U) || isa<StoreInst>(Val: U);
        }))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use, and (3) instructions forced to be scalar
  // (ForcedScalars, inserted further below).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Start: Uniforms[VF].begin(), End: Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory accesses is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Only pointers used exclusively by memory accesses, and never in a
  // possibly-non-scalar way, are known to stay scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n" );
      Worklist.insert(X: I);
    }

  // (3) Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n" );
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src stays scalar only if every in-loop user is already known-scalar or
    // is a load/store that uses Src as a scalar pointer.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n" );
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
          auto *I = cast<Instruction>(Val: U);
          return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n" );
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n" );
  }

  Scalars[VF].insert(I: Worklist.begin(), E: Worklist.end());
}
3899 | |
/// Return true if \p I must be scalarized AND predicated at factor \p VF,
/// i.e. it requires predication and no non-scalar (masked/widened) lowering
/// is available for it.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  // Instructions that need no predication are never scalar-with-predication.
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    // Conservative default: no masked lowering known.
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    // Use the pre-computed call widening decision for this (CI, VF) pair.
    return CallWideningDecisions.at(Val: std::make_pair(x: cast<CallInst>(Val: I), y&: VF))
               .Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    // A memory op avoids scalarization if the target supports either a
    // masked contiguous access or a masked gather/scatter for it.
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment) ||
                             TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                        : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment) ||
                            TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
3940 | |
/// Return true if \p I requires some form of predication when vectorized,
/// i.e. it is in a block needing predication and cannot be proven safe to
/// execute unconditionally.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // If the whole block executes unconditionally, nothing in it needs a mask.
  if (!blockNeedsPredicationForAnyReason(BB: I->getParent()))
    return false;

  // Can we prove this instruction is safe to unconditionally execute?
  // If not, we must use some form of predication.
  switch(I->getOpcode()) {
  default:
    // Most instructions are speculatable and need no predication.
    return false;
  case Instruction::Load:
  case Instruction::Store: {
    if (!Legal->isMaskRequired(I))
      return false;
    // When we know the load's address is loop invariant and the instruction
    // in the original scalar loop was unconditionally executed then we
    // don't need to mark it as a predicated instruction. Tail folding may
    // introduce additional predication, but we're guaranteed to always have
    // at least one active lane. We call Legal->blockNeedsPredication here
    // because it doesn't query tail-folding. For stores, we need to prove
    // both speculation safety (which follows from the same argument as loads),
    // but also must prove the value being stored is correct. The easiest
    // form of the later is to require that all values stored are the same.
    if (Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
        (isa<LoadInst>(Val: I) ||
         (isa<StoreInst>(Val: I) &&
          TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()))) &&
        !Legal->blockNeedsPredication(BB: I->getParent()))
      return false;
    return true;
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // Division/remainder may trap (e.g. divide by zero), so it is predicated
    // unless speculation is provably safe.
    // TODO: We can use the loop-preheader as context point here and get
    // context sensitive reasoning
    return !isSafeToSpeculativelyExecute(I);
  case Instruction::Call:
    // Calls are predicated exactly when legality determined a mask is needed.
    return Legal->isMaskRequired(I);
  }
}
3982 | |
// Return the costs of the two possible strategies for widening a
// udiv/sdiv/urem/srem that is not safe to execute speculatively (the divisor
// may be poison/zero in a masked-off lane): (1) scalarize with per-lane
// predication, or (2) keep the operation vectorized and select a "safe"
// divisor into the inactive lanes. The caller compares the two costs.
// Returns {ScalarizationCost, SafeDivisorCost}; ScalarizationCost is
// invalid for scalable VFs, where per-lane scalarization is not possible.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost += VF.getKnownMinValue() *
      TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = ToVectorTy(Scalar: I->getType(), EC: VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost += TTI.getCmpSelInstrCost(
      Opcode: Instruction::Select, ValTy: VecTy,
      CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
      VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  // A loop-invariant divisor acts like a uniform (splatted) operand even when
  // TTI cannot deduce that from the IR value alone, so upgrade the hint.
  Value *Op2 = I->getOperand(i: 1);
  auto Op2Info = TTI.getOperandInfo(V: Op2);
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(V: Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: Op2Info, Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
4048 | |
4049 | bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( |
4050 | Instruction *I, ElementCount VF) { |
4051 | assert(isAccessInterleaved(I) && "Expecting interleaved access." ); |
4052 | assert(getWideningDecision(I, VF) == CM_Unknown && |
4053 | "Decision should not be set yet." ); |
4054 | auto *Group = getInterleavedAccessGroup(Instr: I); |
4055 | assert(Group && "Must have a group." ); |
4056 | |
4057 | // If the instruction's allocated size doesn't equal it's type size, it |
4058 | // requires padding and will be scalarized. |
4059 | auto &DL = I->getModule()->getDataLayout(); |
4060 | auto *ScalarTy = getLoadStoreType(I); |
4061 | if (hasIrregularType(Ty: ScalarTy, DL)) |
4062 | return false; |
4063 | |
4064 | // If the group involves a non-integral pointer, we may not be able to |
4065 | // losslessly cast all values to a common type. |
4066 | unsigned InterleaveFactor = Group->getFactor(); |
4067 | bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy); |
4068 | for (unsigned i = 0; i < InterleaveFactor; i++) { |
4069 | Instruction *Member = Group->getMember(Index: i); |
4070 | if (!Member) |
4071 | continue; |
4072 | auto *MemberTy = getLoadStoreType(I: Member); |
4073 | bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy); |
4074 | // Don't coerce non-integral pointers to integers or vice versa. |
4075 | if (MemberNI != ScalarNI) { |
4076 | // TODO: Consider adding special nullptr value case here |
4077 | return false; |
4078 | } else if (MemberNI && ScalarNI && |
4079 | ScalarTy->getPointerAddressSpace() != |
4080 | MemberTy->getPointerAddressSpace()) { |
4081 | return false; |
4082 | } |
4083 | } |
4084 | |
4085 | // Check if masking is required. |
4086 | // A Group may need masking for one of two reasons: it resides in a block that |
4087 | // needs predication, or it was decided to use masking to deal with gaps |
4088 | // (either a gap at the end of a load-access that may result in a speculative |
4089 | // load, or any gaps in a store-access). |
4090 | bool PredicatedAccessRequiresMasking = |
4091 | blockNeedsPredicationForAnyReason(BB: I->getParent()) && |
4092 | Legal->isMaskRequired(I); |
4093 | bool LoadAccessWithGapsRequiresEpilogMasking = |
4094 | isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() && |
4095 | !isScalarEpilogueAllowed(); |
4096 | bool StoreAccessWithGapsRequiresMasking = |
4097 | isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()); |
4098 | if (!PredicatedAccessRequiresMasking && |
4099 | !LoadAccessWithGapsRequiresEpilogMasking && |
4100 | !StoreAccessWithGapsRequiresMasking) |
4101 | return true; |
4102 | |
4103 | // If masked interleaving is required, we expect that the user/target had |
4104 | // enabled it, because otherwise it either wouldn't have been created or |
4105 | // it should have been invalidated by the CostModel. |
4106 | assert(useMaskedInterleavedAccesses(TTI) && |
4107 | "Masked interleave-groups for predicated accesses are not enabled." ); |
4108 | |
4109 | if (Group->isReverse()) |
4110 | return false; |
4111 | |
4112 | auto *Ty = getLoadStoreType(I); |
4113 | const Align Alignment = getLoadStoreAlignment(I); |
4114 | return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment) |
4115 | : TTI.isLegalMaskedStore(DataType: Ty, Alignment); |
4116 | } |
4117 | |
4118 | bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( |
4119 | Instruction *I, ElementCount VF) { |
4120 | // Get and ensure we have a valid memory instruction. |
4121 | assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction" ); |
4122 | |
4123 | auto *Ptr = getLoadStorePointerOperand(V: I); |
4124 | auto *ScalarTy = getLoadStoreType(I); |
4125 | |
4126 | // In order to be widened, the pointer should be consecutive, first of all. |
4127 | if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr)) |
4128 | return false; |
4129 | |
4130 | // If the instruction is a store located in a predicated block, it will be |
4131 | // scalarized. |
4132 | if (isScalarWithPredication(I, VF)) |
4133 | return false; |
4134 | |
4135 | // If the instruction's allocated size doesn't equal it's type size, it |
4136 | // requires padding and will be scalarized. |
4137 | auto &DL = I->getModule()->getDataLayout(); |
4138 | if (hasIrregularType(Ty: ScalarTy, DL)) |
4139 | return false; |
4140 | |
4141 | return true; |
4142 | } |
4143 | |
// Populate Uniforms[VF] with the instructions that only demand lane 0 after
// vectorization (a "uniform use" here does NOT imply all lanes compute the
// same value). The analysis is a backward fixed-point over a worklist.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Insert an (initially empty) entry so this VF is not analyzed again even
  // if no uniform value is found; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I, VF)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Val: Latch->getTerminator()->getOperand(i: 0));
  if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto isUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if I's widening decision keeps the pointer operand uniform
  // (wide consecutive/reverse/interleaved access, or a uniform mem op).
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (isUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (isUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform.  A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        // Side-effect-free markers with invariant operands need only a
        // single (lane-0) copy after vectorization.
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      if (isUniformMemOpUse(&I))
        addToWorklistIfAllowed(&I);

      if (isVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users.  Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    auto UsersAreMemAccesses =
      llvm::all_of(Range: I->users(), P: [&](User *U) -> bool {
        return isVectorizedMemAccessUse(cast<Instruction>(Val: U), V);
      });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
          auto *I = cast<Instruction>(Val: U);
          return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert(I: Worklist.begin(), E: Worklist.end());
}
4370 | |
4371 | bool LoopVectorizationCostModel::runtimeChecksRequired() { |
4372 | LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n" ); |
4373 | |
4374 | if (Legal->getRuntimePointerChecking()->Need) { |
4375 | reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz" , |
4376 | OREMsg: "runtime pointer checks needed. Enable vectorization of this " |
4377 | "loop with '#pragma clang loop vectorize(enable)' when " |
4378 | "compiling with -Os/-Oz" , |
4379 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4380 | return true; |
4381 | } |
4382 | |
4383 | if (!PSE.getPredicate().isAlwaysTrue()) { |
4384 | reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz" , |
4385 | OREMsg: "runtime SCEV checks needed. Enable vectorization of this " |
4386 | "loop with '#pragma clang loop vectorize(enable)' when " |
4387 | "compiling with -Os/-Oz" , |
4388 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4389 | return true; |
4390 | } |
4391 | |
4392 | // FIXME: Avoid specializing for stride==1 instead of bailing out. |
4393 | if (!Legal->getLAI()->getSymbolicStrides().empty()) { |
4394 | reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count" , |
4395 | OREMsg: "runtime stride == 1 checks needed. Enable vectorization of " |
4396 | "this loop without such check by compiling with -Os/-Oz" , |
4397 | ORETag: "CantVersionLoopWithOptForSize" , ORE, TheLoop); |
4398 | return true; |
4399 | } |
4400 | |
4401 | return false; |
4402 | } |
4403 | |
4404 | ElementCount |
4405 | LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { |
4406 | if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) |
4407 | return ElementCount::getScalable(MinVal: 0); |
4408 | |
4409 | if (Hints->isScalableVectorizationDisabled()) { |
4410 | reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled" , |
4411 | ORETag: "ScalableVectorizationDisabled" , ORE, TheLoop); |
4412 | return ElementCount::getScalable(MinVal: 0); |
4413 | } |
4414 | |
4415 | LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n" ); |
4416 | |
4417 | auto MaxScalableVF = ElementCount::getScalable( |
4418 | MinVal: std::numeric_limits<ElementCount::ScalarTy>::max()); |
4419 | |
4420 | // Test that the loop-vectorizer can legalize all operations for this MaxVF. |
4421 | // FIXME: While for scalable vectors this is currently sufficient, this should |
4422 | // be replaced by a more detailed mechanism that filters out specific VFs, |
4423 | // instead of invalidating vectorization for a whole set of VFs based on the |
4424 | // MaxVF. |
4425 | |
4426 | // Disable scalable vectorization if the loop contains unsupported reductions. |
4427 | if (!canVectorizeReductions(VF: MaxScalableVF)) { |
4428 | reportVectorizationInfo( |
4429 | Msg: "Scalable vectorization not supported for the reduction " |
4430 | "operations found in this loop." , |
4431 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4432 | return ElementCount::getScalable(MinVal: 0); |
4433 | } |
4434 | |
4435 | // Disable scalable vectorization if the loop contains any instructions |
4436 | // with element types not supported for scalable vectors. |
4437 | if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) { |
4438 | return !Ty->isVoidTy() && |
4439 | !this->TTI.isElementTypeLegalForScalableVector(Ty); |
4440 | })) { |
4441 | reportVectorizationInfo(Msg: "Scalable vectorization is not supported " |
4442 | "for all element types found in this loop." , |
4443 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4444 | return ElementCount::getScalable(MinVal: 0); |
4445 | } |
4446 | |
4447 | if (Legal->isSafeForAnyVectorWidth()) |
4448 | return MaxScalableVF; |
4449 | |
4450 | // Limit MaxScalableVF by the maximum safe dependence distance. |
4451 | if (std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI)) |
4452 | MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale); |
4453 | else |
4454 | MaxScalableVF = ElementCount::getScalable(MinVal: 0); |
4455 | |
4456 | if (!MaxScalableVF) |
4457 | reportVectorizationInfo( |
4458 | Msg: "Max legal vector width too small, scalable vectorization " |
4459 | "unfeasible." , |
4460 | ORETag: "ScalableVFUnfeasible" , ORE, TheLoop); |
4461 | |
4462 | return MaxScalableVF; |
4463 | } |
4464 | |
// Compute the maximum feasible fixed and scalable VFs, honoring the
// user-specified VF hint when it is safe and clamping/ignoring it (with a
// remark) when it is not, then bounding each kind by target limits.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable UserVF but the target cannot do scalable vectors at all:
    // report that the hint is dropped entirely.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      // Scalable UserVF exceeds the safe bound: ignore it (do not clamp).
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Default result: fixed VF of 1 (scalar), no scalable VF; each half is
  // upgraded below if the target can do better.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
4569 | |
4570 | FixedScalableVFPair |
4571 | LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { |
4572 | if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { |
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
4575 | reportVectorizationFailure( |
4576 | DebugMsg: "Not inserting runtime ptr check for divergent target" , |
4577 | OREMsg: "runtime pointer checks needed. Not enabled for divergent target" , |
4578 | ORETag: "CantVersionLoopWithDivergentTarget" , ORE, TheLoop); |
4579 | return FixedScalableVFPair::getNone(); |
4580 | } |
4581 | |
4582 | unsigned TC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
4583 | unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(L: TheLoop); |
4584 | LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); |
4585 | if (TC == 1) { |
4586 | reportVectorizationFailure(DebugMsg: "Single iteration (non) loop" , |
4587 | OREMsg: "loop trip count is one, irrelevant for vectorization" , |
4588 | ORETag: "SingleIterationLoop" , ORE, TheLoop); |
4589 | return FixedScalableVFPair::getNone(); |
4590 | } |
4591 | |
4592 | switch (ScalarEpilogueStatus) { |
4593 | case CM_ScalarEpilogueAllowed: |
4594 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4595 | case CM_ScalarEpilogueNotAllowedUsePredicate: |
4596 | [[fallthrough]]; |
4597 | case CM_ScalarEpilogueNotNeededUsePredicate: |
4598 | LLVM_DEBUG( |
4599 | dbgs() << "LV: vector predicate hint/switch found.\n" |
4600 | << "LV: Not allowing scalar epilogue, creating predicated " |
4601 | << "vector loop.\n" ); |
4602 | break; |
4603 | case CM_ScalarEpilogueNotAllowedLowTripLoop: |
4604 | // fallthrough as a special case of OptForSize |
4605 | case CM_ScalarEpilogueNotAllowedOptSize: |
4606 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) |
4607 | LLVM_DEBUG( |
4608 | dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n" ); |
4609 | else |
4610 | LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " |
4611 | << "count.\n" ); |
4612 | |
4613 | // Bail if runtime checks are required, which are not good when optimising |
4614 | // for size. |
4615 | if (runtimeChecksRequired()) |
4616 | return FixedScalableVFPair::getNone(); |
4617 | |
4618 | break; |
4619 | } |
4620 | |
4621 | // The only loops we can vectorize without a scalar epilogue, are loops with |
4622 | // a bottom-test and a single exiting block. We'd have to handle the fact |
4623 | // that not every instruction executes on the last iteration. This will |
4624 | // require a lane mask which varies through the vector loop body. (TODO) |
4625 | if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { |
4626 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4627 | // masking, fallback to a vectorization with a scalar epilogue. |
4628 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4629 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4630 | "scalar epilogue instead.\n" ); |
4631 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4632 | return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false); |
4633 | } |
4634 | return FixedScalableVFPair::getNone(); |
4635 | } |
4636 | |
4637 | // Now try the tail folding |
4638 | |
4639 | // Invalidate interleave groups that require an epilogue if we can't mask |
4640 | // the interleave-group. |
4641 | if (!useMaskedInterleavedAccesses(TTI)) { |
4642 | assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && |
4643 | "No decisions should have been taken at this point" ); |
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
4646 | InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); |
4647 | } |
4648 | |
4649 | FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true); |
4650 | |
4651 | // Avoid tail folding if the trip count is known to be a multiple of any VF |
4652 | // we choose. |
4653 | std::optional<unsigned> MaxPowerOf2RuntimeVF = |
4654 | MaxFactors.FixedVF.getFixedValue(); |
4655 | if (MaxFactors.ScalableVF) { |
4656 | std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI); |
4657 | if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) { |
4658 | MaxPowerOf2RuntimeVF = std::max<unsigned>( |
4659 | a: *MaxPowerOf2RuntimeVF, |
4660 | b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue()); |
4661 | } else |
4662 | MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now. |
4663 | } |
4664 | |
4665 | if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) { |
4666 | assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) && |
4667 | "MaxFixedVF must be a power of 2" ); |
4668 | unsigned MaxVFtimesIC = |
4669 | UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF; |
4670 | ScalarEvolution *SE = PSE.getSE(); |
4671 | const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount(); |
4672 | const SCEV *ExitCount = SE->getAddExpr( |
4673 | LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType())); |
4674 | const SCEV *Rem = SE->getURemExpr( |
4675 | LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop), |
4676 | RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC)); |
4677 | if (Rem->isZero()) { |
4678 | // Accept MaxFixedVF if we do not have a tail. |
4679 | LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n" ); |
4680 | return MaxFactors; |
4681 | } |
4682 | } |
4683 | |
4684 | // If we don't know the precise trip count, or if the trip count that we |
4685 | // found modulo the vectorization factor is not zero, try to fold the tail |
4686 | // by masking. |
4687 | // FIXME: look for a smaller MaxVF that does divide TC rather than masking. |
4688 | setTailFoldingStyles(IsScalableVF: MaxFactors.ScalableVF.isScalable(), UserIC); |
4689 | if (foldTailByMasking()) { |
4690 | if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { |
4691 | LLVM_DEBUG( |
4692 | dbgs() |
4693 | << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " |
4694 | "try to generate VP Intrinsics with scalable vector " |
4695 | "factors only.\n" ); |
4696 | // Tail folded loop using VP intrinsics restricts the VF to be scalable |
4697 | // for now. |
4698 | // TODO: extend it for fixed vectors, if required. |
4699 | assert(MaxFactors.ScalableVF.isScalable() && |
4700 | "Expected scalable vector factor." ); |
4701 | |
4702 | MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1); |
4703 | } |
4704 | return MaxFactors; |
4705 | } |
4706 | |
4707 | // If there was a tail-folding hint/switch, but we can't fold the tail by |
4708 | // masking, fallback to a vectorization with a scalar epilogue. |
4709 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { |
4710 | LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " |
4711 | "scalar epilogue instead.\n" ); |
4712 | ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; |
4713 | return MaxFactors; |
4714 | } |
4715 | |
4716 | if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) { |
4717 | LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n" ); |
4718 | return FixedScalableVFPair::getNone(); |
4719 | } |
4720 | |
4721 | if (TC == 0) { |
4722 | reportVectorizationFailure( |
4723 | DebugMsg: "Unable to calculate the loop count due to complex control flow" , |
4724 | OREMsg: "unable to calculate the loop count due to complex control flow" , |
4725 | ORETag: "UnknownLoopCountComplexCFG" , ORE, TheLoop); |
4726 | return FixedScalableVFPair::getNone(); |
4727 | } |
4728 | |
4729 | reportVectorizationFailure( |
4730 | DebugMsg: "Cannot optimize for size and vectorize at the same time." , |
4731 | OREMsg: "cannot optimize for size and vectorize at the same time. " |
4732 | "Enable vectorization of this loop with '#pragma clang loop " |
4733 | "vectorize(enable)' when compiling with -Os/-Oz" , |
4734 | ORETag: "NoTailLoopWithOptForSize" , ORE, TheLoop); |
4735 | return FixedScalableVFPair::getNone(); |
4736 | } |
4737 | |
// Compute the maximum VF the target can handle: start from the
// dependence-safe bound MaxSafeVF, clamp to what fits in the widest vector
// register, clamp further for small known trip counts, and optionally widen
// when the target prefers maximizing vector bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, bool FoldTailByMasking) {
  // Whether we size scalable or fixed-width vectors is dictated by the kind
  // of the dependence-safe bound.
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                             : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
      Scalable: ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  // If not even one widest-type element fits in the widest register of the
  // requested kind, fall back to a scalar VF of 1.
  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(MinVal: 1);
  }

  // For scalable vectors, refine the known-minimum lane count using the
  // function's vscale_range attribute minimum, when present.
  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
  if (MaxVectorElementCount.isScalable() &&
      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    WidestRegisterMinEC *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
    MaxTripCount -= 1;

  if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
      (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC (as done in the loop below). Select
    // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
    // scalable, we only fall back on a fixed VF when the TC is less than or
    // equal to the known number of lanes.
    auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedUpperTripCount << "\n");
    return ElementCount::get(
        MinVal: ClampedUpperTripCount,
        Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
  }

  TargetTransformInfo::RegisterKind RegKind =
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;
  ElementCount MaxVF = MaxVectorElementCount;
  // Consider VFs sized by the smallest element type (instead of the widest)
  // when the target wants maximum vector bandwidth, or when wider VFs may
  // enable vector call variants.
  if (MaximizeBandwidth ||
      (MaximizeBandwidth.getNumOccurrences() == 0 &&
       (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
        Scalable: ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(LHS: VS, RHS: MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(Elt: VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Never go below the target-mandated minimum VF for the smallest type.
    if (ElementCount MinVF =
            TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    // Invalidate any widening decisions we might have made, in case the loop
    // requires prediction (decided later), but we have already made some
    // load/store widening decisions.
    invalidateCostModelingDecisions();
  }
  return MaxVF;
}
4852 | |
4853 | /// Convenience function that returns the value of vscale_range iff |
4854 | /// vscale_range.min == vscale_range.max or otherwise returns the value |
4855 | /// returned by the corresponding TTI method. |
4856 | static std::optional<unsigned> |
4857 | getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { |
4858 | const Function *Fn = L->getHeader()->getParent(); |
4859 | if (Fn->hasFnAttribute(Attribute::VScaleRange)) { |
4860 | auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); |
4861 | auto Min = Attr.getVScaleRangeMin(); |
4862 | auto Max = Attr.getVScaleRangeMax(); |
4863 | if (Max && Min == Max) |
4864 | return Max; |
4865 | } |
4866 | |
4867 | return TTI.getVScaleForTuning(); |
4868 | } |
4869 | |
// Return true if vectorization factor A is more profitable than B. When the
// maximum trip count is a known constant and both factors are fixed-width,
// compare estimated total loop cost; otherwise compare per-lane cost using
// cross-multiplication to avoid FP division.
bool LoopVectorizationPlanner::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(L: OrigLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    auto GetCostForTC = [MaxTripCount, this](unsigned VF,
                                             InstructionCost VectorCost,
                                             InstructionCost ScalarCost) {
      return CM.foldTailByMasking() ? VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF)
                                    : VectorCost * (MaxTripCount / VF) +
                                          ScalarCost * (MaxTripCount % VF);
    };
    auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
    auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);

    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) {
    if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  // Note the deliberate "<=" (rather than "<"): it breaks cost ties in
  // favor of the scalable factor A.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
4920 | |
4921 | static void (SmallVector<InstructionVFPair> InvalidCosts, |
4922 | OptimizationRemarkEmitter *ORE, |
4923 | Loop *TheLoop) { |
4924 | if (InvalidCosts.empty()) |
4925 | return; |
4926 | |
4927 | // Emit a report of VFs with invalid costs in the loop. |
4928 | |
4929 | // Group the remarks per instruction, keeping the instruction order from |
4930 | // InvalidCosts. |
4931 | std::map<Instruction *, unsigned> Numbering; |
4932 | unsigned I = 0; |
4933 | for (auto &Pair : InvalidCosts) |
4934 | if (!Numbering.count(x: Pair.first)) |
4935 | Numbering[Pair.first] = I++; |
4936 | |
4937 | // Sort the list, first on instruction(number) then on VF. |
4938 | sort(C&: InvalidCosts, Comp: [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { |
4939 | if (Numbering[A.first] != Numbering[B.first]) |
4940 | return Numbering[A.first] < Numbering[B.first]; |
4941 | ElementCountComparator ECC; |
4942 | return ECC(A.second, B.second); |
4943 | }); |
4944 | |
4945 | // For a list of ordered instruction-vf pairs: |
4946 | // [(load, vf1), (load, vf2), (store, vf1)] |
4947 | // Group the instructions together to emit separate remarks for: |
4948 | // load (vf1, vf2) |
4949 | // store (vf1) |
4950 | auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts); |
4951 | auto Subset = ArrayRef<InstructionVFPair>(); |
4952 | do { |
4953 | if (Subset.empty()) |
4954 | Subset = Tail.take_front(N: 1); |
4955 | |
4956 | Instruction *I = Subset.front().first; |
4957 | |
4958 | // If the next instruction is different, or if there are no other pairs, |
4959 | // emit a remark for the collated subset. e.g. |
4960 | // [(load, vf1), (load, vf2))] |
4961 | // to emit: |
4962 | // remark: invalid costs for 'load' at VF=(vf, vf2) |
4963 | if (Subset == Tail || Tail[Subset.size()].first != I) { |
4964 | std::string OutString; |
4965 | raw_string_ostream OS(OutString); |
4966 | assert(!Subset.empty() && "Unexpected empty range" ); |
4967 | OS << "Instruction with invalid costs prevented vectorization at VF=(" ; |
4968 | for (const auto &Pair : Subset) |
4969 | OS << (Pair.second == Subset.front().second ? "" : ", " ) << Pair.second; |
4970 | OS << "):" ; |
4971 | if (auto *CI = dyn_cast<CallInst>(Val: I)) |
4972 | OS << " call to " << CI->getCalledFunction()->getName(); |
4973 | else |
4974 | OS << " " << I->getOpcodeName(); |
4975 | OS.flush(); |
4976 | reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost" , ORE, TheLoop, I); |
4977 | Tail = Tail.drop_front(N: Subset.size()); |
4978 | Subset = {}; |
4979 | } else |
4980 | // Grow the subset by one element |
4981 | Subset = Tail.take_front(N: Subset.size() + 1); |
4982 | } while (!Tail.empty()); |
4983 | } |
4984 | |
// Choose the most profitable vectorization factor among VFCandidates by
// comparing each candidate's expected cost against the scalar loop's cost.
// Returns the scalar factor when no vector candidate is more profitable (or
// when conditional-store vectorization is disabled and the loop needs it).
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  InstructionCost ExpectedCost =
      CM.expectedCost(VF: ElementCount::getFixed(MinVal: 1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(MinVal: 1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  // Collects (instruction, VF) pairs with invalid costs for a remark below.
  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    LoopVectorizationCostModel::VectorizationCostTy C =
        CM.expectedCost(VF: i, Invalid: &InvalidCosts);
    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);

#ifndef NDEBUG
    unsigned AssumedMinimumVscale =
        getVScaleForTuning(L: OrigLoop, TTI).value_or(u: 1);
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    // C.second is false when the candidate would emit no vector instructions
    // at all; skip it unless the user forced vectorization.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(A: Candidate, B: ScalarCost))
      ProfitableVFs.push_back(Elt: Candidate);

    if (isMoreProfitable(A: Candidate, B: ChosenFactor))
      ChosenFactor = Candidate;
  }

  emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop: OrigLoop);

  // Conditional stores require predication support; without it, fall back to
  // the scalar loop and report why.
  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        DebugMsg: "There are conditional stores.",
        OREMsg: "store that is conditionally executed prevents vectorization",
        ORETag: "ConditionalStore", ORE, TheLoop: OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
5063 | |
// Return true if the loop has no feature (cross-iteration phis, induction
// uses outside the loop, non-latch exits) that epilogue vectorization cannot
// currently handle.
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
  if (any_of(Range: OrigLoop->getHeader()->phis(),
             P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); }))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs auditing and
  // testing.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}
5095 | |
// Crude profitability heuristic for epilogue vectorization: require the
// target to opt in, require it to value interleaving, and require the
// (vscale-scaled) main-loop VF to reach a minimum threshold.
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
    const ElementCount VF) const {
  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.
  if (!TTI.preferEpilogueVectorization())
    return false;

  // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (eg. MVE).
  if (TTI.getMaxInterleaveFactor(VF) <= 1)
    return false;

  // Scale a scalable VF by the tuning vscale (default 1) so the threshold
  // compares estimated runtime lanes, not the known-minimum lane count.
  unsigned Multiplier = 1;
  if (VF.isScalable())
    Multiplier = getVScaleForTuning(L: TheLoop, TTI).value_or(u: 1);
  if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
    return true;
  return false;
}
5119 | |
// Pick a vectorization factor for the epilogue loop given the main loop's VF
// and interleave count. Returns VectorizationFactor::Disabled() when epilogue
// vectorization is off, unsupported, forced to an unviable factor, or not
// profitable; otherwise returns the best profitable candidate narrower than
// the main loop's (estimated runtime) VF.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  // A forced factor bypasses the cost model entirely, but must still have a
  // VPlan built for it.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};
    else {
      LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                           "viable.\n");
      return Result;
    }
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
      OrigLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = MainLoopVF;
  if (MainLoopVF.isScalable()) {
    EstimatedRuntimeVF = ElementCount::getFixed(MinVal: MainLoopVF.getKnownMinValue());
    if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI))
      EstimatedRuntimeVF *= *VScale;
  }

  ScalarEvolution &SE = *PSE.getSE();
  Type *TCType = Legal->getWidestInductionType();
  // Computed lazily below, and only for the fixed-VF case.
  const SCEV *RemainingIterations = nullptr;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the estimate runtime VF (scalable
    // vectors) or the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
        ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        const SCEV *TC = createTripCountSCEV(IdxTy: TCType, PSE, OrigLoop);
        RemainingIterations = SE.getURemExpr(
            LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getKnownMinValue() * IC));
      }
      if (SE.isKnownPredicate(
              Pred: CmpInst::ICMP_UGT,
              LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getKnownMinValue()),
              RHS: RemainingIterations))
        continue;
    }

    if (Result.Width.isScalar() || isMoreProfitable(A: NextVF, B: Result))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
5217 | |
// Return {smallest, widest} element-type width in bits among the types
// collected for widening; falls back to reduction recurrence types when no
// loads/stores contributed any element types.
std::pair<unsigned, unsigned>
LoopVectorizationCostModel::getSmallestAndWidestTypes() {
  // MinWidth starts at "infinity" so any real type narrows it; MaxWidth
  // starts at 8 (the smallest meaningful element width in bits).
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;
  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
    // Reset MaxWidth so that we can find the smallest type used by recurrences
    // in the loop.
    MaxWidth = -1U;
    for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MaxWidth = std::min<unsigned>(
          a: MaxWidth, b: std::min<unsigned>(
                         a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
                         b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
    }
  } else {
    for (Type *T : ElementTypesInLoop) {
      MinWidth = std::min<unsigned>(
          a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
    }
  }
  return {MinWidth, MaxWidth};
}
5249 | |
// Populate ElementTypesInLoop with the scalar element types of all loads,
// stored values, and reduction phis in the loop; these drive the
// smallest/widest type computation used to bound the VF.
void LoopVectorizationCostModel::collectElementTypesForWidening() {
  ElementTypesInLoop.clear();
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Type *T = I.getType();

      // Skip ignored values.
      if (ValuesToIgnore.count(Ptr: &I))
        continue;

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
        continue;

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
        if (!Legal->isReductionVariable(PN))
          continue;
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Key: PN)->second;
        // In-loop reductions keep their scalar recurrence type and do not
        // contribute to the widening element types.
        if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
            TTI.preferInLoopReduction(Opcode: RdxDesc.getOpcode(),
                                      Ty: RdxDesc.getRecurrenceType(),
                                      Flags: TargetTransformInfo::ReductionFlags()))
          continue;
        T = RdxDesc.getRecurrenceType();
      }

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(Val: &I))
        T = ST->getValueOperand()->getType();

      assert(T->isSized() &&
             "Expected the load/store/recurrence type to be sized");

      ElementTypesInLoop.insert(Ptr: T);
    }
  }
}
5292 | |
5293 | unsigned |
5294 | LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, |
5295 | InstructionCost LoopCost) { |
5296 | // -- The interleave heuristics -- |
5297 | // We interleave the loop in order to expose ILP and reduce the loop overhead. |
5298 | // There are many micro-architectural considerations that we can't predict |
5299 | // at this level. For example, frontend pressure (on decode or fetch) due to |
5300 | // code size, or the number and capabilities of the execution ports. |
5301 | // |
5302 | // We use the following heuristics to select the interleave count: |
5303 | // 1. If the code has reductions, then we interleave to break the cross |
5304 | // iteration dependency. |
5305 | // 2. If the loop is really small, then we interleave to reduce the loop |
5306 | // overhead. |
5307 | // 3. We don't interleave if we think that we will spill registers to memory |
5308 | // due to the increased register pressure. |
5309 | |
5310 | if (!isScalarEpilogueAllowed()) |
5311 | return 1; |
5312 | |
5313 | // Do not interleave if EVL is preferred and no User IC is specified. |
5314 | if (foldTailWithEVL()) { |
5315 | LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " |
5316 | "Unroll factor forced to be 1.\n" ); |
5317 | return 1; |
5318 | } |
5319 | |
5320 | // We used the distance for the interleave count. |
5321 | if (!Legal->isSafeForAnyVectorWidth()) |
5322 | return 1; |
5323 | |
5324 | auto BestKnownTC = getSmallBestKnownTC(SE&: *PSE.getSE(), L: TheLoop); |
5325 | const bool HasReductions = !Legal->getReductionVars().empty(); |
5326 | |
5327 | // If we did not calculate the cost for VF (because the user selected the VF) |
5328 | // then we calculate the cost of VF here. |
5329 | if (LoopCost == 0) { |
5330 | LoopCost = expectedCost(VF).first; |
5331 | assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost" ); |
5332 | |
5333 | // Loop body is free and there is no need for interleaving. |
5334 | if (LoopCost == 0) |
5335 | return 1; |
5336 | } |
5337 | |
5338 | RegisterUsage R = calculateRegisterUsage(VFs: {VF})[0]; |
5339 | // We divide by these constants so assume that we have at least one |
5340 | // instruction that uses at least one register. |
5341 | for (auto& pair : R.MaxLocalUsers) { |
5342 | pair.second = std::max(a: pair.second, b: 1U); |
5343 | } |
5344 | |
5345 | // We calculate the interleave count using the following formula. |
5346 | // Subtract the number of loop invariants from the number of available |
5347 | // registers. These registers are used by all of the interleaved instances. |
5348 | // Next, divide the remaining registers by the number of registers that is |
5349 | // required by the loop, in order to estimate how many parallel instances |
5350 | // fit without causing spills. All of this is rounded down if necessary to be |
5351 | // a power of two. We want power of two interleave count to simplify any |
5352 | // addressing operations or alignment considerations. |
5353 | // We also want power of two interleave counts to ensure that the induction |
5354 | // variable of the vector loop wraps to zero, when tail is folded by masking; |
5355 | // this currently happens when OptForSize, in which case IC is set to 1 above. |
5356 | unsigned IC = UINT_MAX; |
5357 | |
5358 | for (auto& pair : R.MaxLocalUsers) { |
5359 | unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first); |
5360 | LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters |
5361 | << " registers of " |
5362 | << TTI.getRegisterClassName(pair.first) << " register class\n" ); |
5363 | if (VF.isScalar()) { |
5364 | if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) |
5365 | TargetNumRegisters = ForceTargetNumScalarRegs; |
5366 | } else { |
5367 | if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) |
5368 | TargetNumRegisters = ForceTargetNumVectorRegs; |
5369 | } |
5370 | unsigned MaxLocalUsers = pair.second; |
5371 | unsigned LoopInvariantRegs = 0; |
5372 | if (R.LoopInvariantRegs.find(Key: pair.first) != R.LoopInvariantRegs.end()) |
5373 | LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; |
5374 | |
5375 | unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) / |
5376 | MaxLocalUsers); |
5377 | // Don't count the induction variable as interleaved. |
5378 | if (EnableIndVarRegisterHeur) { |
5379 | TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) / |
5380 | std::max(a: 1U, b: (MaxLocalUsers - 1))); |
5381 | } |
5382 | |
5383 | IC = std::min(a: IC, b: TmpIC); |
5384 | } |
5385 | |
5386 | // Clamp the interleave ranges to reasonable counts. |
5387 | unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); |
5388 | |
5389 | // Check if the user has overridden the max. |
5390 | if (VF.isScalar()) { |
5391 | if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) |
5392 | MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; |
5393 | } else { |
5394 | if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0) |
5395 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
5396 | } |
5397 | |
5398 | unsigned EstimatedVF = VF.getKnownMinValue(); |
5399 | if (VF.isScalable()) { |
5400 | if (std::optional<unsigned> VScale = getVScaleForTuning(L: TheLoop, TTI)) |
5401 | EstimatedVF *= *VScale; |
5402 | } |
5403 | assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1" ); |
5404 | |
5405 | unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop); |
5406 | if (KnownTC > 0) { |
5407 | // At least one iteration must be scalar when this constraint holds. So the |
5408 | // maximum available iterations for interleaving is one less. |
5409 | unsigned AvailableTC = |
5410 | requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? KnownTC - 1 : KnownTC; |
5411 | |
5412 | // If trip count is known we select between two prospective ICs, where |
5413 | // 1) the aggressive IC is capped by the trip count divided by VF |
5414 | // 2) the conservative IC is capped by the trip count divided by (VF * 2) |
5415 | // The final IC is selected in a way that the epilogue loop trip count is |
5416 | // minimized while maximizing the IC itself, so that we either run the |
5417 | // vector loop at least once if it generates a small epilogue loop, or else |
5418 | // we run the vector loop at least twice. |
5419 | |
5420 | unsigned InterleaveCountUB = bit_floor( |
5421 | Value: std::max(a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount))); |
5422 | unsigned InterleaveCountLB = bit_floor(Value: std::max( |
5423 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
5424 | MaxInterleaveCount = InterleaveCountLB; |
5425 | |
5426 | if (InterleaveCountUB != InterleaveCountLB) { |
5427 | unsigned TailTripCountUB = |
5428 | (AvailableTC % (EstimatedVF * InterleaveCountUB)); |
5429 | unsigned TailTripCountLB = |
5430 | (AvailableTC % (EstimatedVF * InterleaveCountLB)); |
5431 | // If both produce same scalar tail, maximize the IC to do the same work |
5432 | // in fewer vector loop iterations |
5433 | if (TailTripCountUB == TailTripCountLB) |
5434 | MaxInterleaveCount = InterleaveCountUB; |
5435 | } |
5436 | } else if (BestKnownTC && *BestKnownTC > 0) { |
5437 | // At least one iteration must be scalar when this constraint holds. So the |
5438 | // maximum available iterations for interleaving is one less. |
5439 | unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector()) |
5440 | ? (*BestKnownTC) - 1 |
5441 | : *BestKnownTC; |
5442 | |
5443 | // If trip count is an estimated compile time constant, limit the |
5444 | // IC to be capped by the trip count divided by VF * 2, such that the vector |
5445 | // loop runs at least twice to make interleaving seem profitable when there |
5446 | // is an epilogue loop present. Since exact Trip count is not known we |
5447 | // choose to be conservative in our IC estimate. |
5448 | MaxInterleaveCount = bit_floor(Value: std::max( |
5449 | a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount))); |
5450 | } |
5451 | |
5452 | assert(MaxInterleaveCount > 0 && |
5453 | "Maximum interleave count must be greater than 0" ); |
5454 | |
5455 | // Clamp the calculated IC to be between the 1 and the max interleave count |
5456 | // that the target and trip count allows. |
5457 | if (IC > MaxInterleaveCount) |
5458 | IC = MaxInterleaveCount; |
5459 | else |
5460 | // Make sure IC is greater than 0. |
5461 | IC = std::max(a: 1u, b: IC); |
5462 | |
5463 | assert(IC > 0 && "Interleave count must be greater than 0." ); |
5464 | |
5465 | // Interleave if we vectorized this loop and there is a reduction that could |
5466 | // benefit from interleaving. |
5467 | if (VF.isVector() && HasReductions) { |
5468 | LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n" ); |
5469 | return IC; |
5470 | } |
5471 | |
5472 | // For any scalar loop that either requires runtime checks or predication we |
5473 | // are better off leaving this to the unroller. Note that if we've already |
5474 | // vectorized the loop we will have done the runtime check and so interleaving |
5475 | // won't require further checks. |
5476 | bool ScalarInterleavingRequiresPredication = |
5477 | (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) { |
5478 | return Legal->blockNeedsPredication(BB); |
5479 | })); |
5480 | bool ScalarInterleavingRequiresRuntimePointerCheck = |
5481 | (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); |
5482 | |
5483 | // We want to interleave small loops in order to reduce the loop overhead and |
5484 | // potentially expose ILP opportunities. |
5485 | LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' |
5486 | << "LV: IC is " << IC << '\n' |
5487 | << "LV: VF is " << VF << '\n'); |
5488 | const bool AggressivelyInterleaveReductions = |
5489 | TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions); |
5490 | if (!ScalarInterleavingRequiresRuntimePointerCheck && |
5491 | !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { |
5492 | // We assume that the cost overhead is 1 and we use the cost model |
5493 | // to estimate the cost of the loop and interleave until the cost of the |
5494 | // loop overhead is about 5% of the cost of the loop. |
5495 | unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>( |
5496 | Value: SmallLoopCost / *LoopCost.getValue())); |
5497 | |
5498 | // Interleave until store/load ports (estimated by max interleave count) are |
5499 | // saturated. |
5500 | unsigned NumStores = Legal->getNumStores(); |
5501 | unsigned NumLoads = Legal->getNumLoads(); |
5502 | unsigned StoresIC = IC / (NumStores ? NumStores : 1); |
5503 | unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1); |
5504 | |
5505 | // There is little point in interleaving for reductions containing selects |
5506 | // and compares when VF=1 since it may just create more overhead than it's |
5507 | // worth for loops with small trip counts. This is because we still have to |
5508 | // do the final reduction after the loop. |
5509 | bool HasSelectCmpReductions = |
5510 | HasReductions && |
5511 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5512 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5513 | return RecurrenceDescriptor::isAnyOfRecurrenceKind( |
5514 | Kind: RdxDesc.getRecurrenceKind()); |
5515 | }); |
5516 | if (HasSelectCmpReductions) { |
5517 | LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n" ); |
5518 | return 1; |
5519 | } |
5520 | |
5521 | // If we have a scalar reduction (vector reductions are already dealt with |
5522 | // by this point), we can increase the critical path length if the loop |
5523 | // we're interleaving is inside another loop. For tree-wise reductions |
5524 | // set the limit to 2, and for ordered reductions it's best to disable |
5525 | // interleaving entirely. |
5526 | if (HasReductions && TheLoop->getLoopDepth() > 1) { |
5527 | bool HasOrderedReductions = |
5528 | any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool { |
5529 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
5530 | return RdxDesc.isOrdered(); |
5531 | }); |
5532 | if (HasOrderedReductions) { |
5533 | LLVM_DEBUG( |
5534 | dbgs() << "LV: Not interleaving scalar ordered reductions.\n" ); |
5535 | return 1; |
5536 | } |
5537 | |
5538 | unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC); |
5539 | SmallIC = std::min(a: SmallIC, b: F); |
5540 | StoresIC = std::min(a: StoresIC, b: F); |
5541 | LoadsIC = std::min(a: LoadsIC, b: F); |
5542 | } |
5543 | |
5544 | if (EnableLoadStoreRuntimeInterleave && |
5545 | std::max(a: StoresIC, b: LoadsIC) > SmallIC) { |
5546 | LLVM_DEBUG( |
5547 | dbgs() << "LV: Interleaving to saturate store or load ports.\n" ); |
5548 | return std::max(a: StoresIC, b: LoadsIC); |
5549 | } |
5550 | |
5551 | // If there are scalar reductions and TTI has enabled aggressive |
5552 | // interleaving for reductions, we will interleave to expose ILP. |
5553 | if (VF.isScalar() && AggressivelyInterleaveReductions) { |
5554 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5555 | // Interleave no less than SmallIC but not as aggressive as the normal IC |
5556 | // to satisfy the rare situation when resources are too limited. |
5557 | return std::max(a: IC / 2, b: SmallIC); |
5558 | } else { |
5559 | LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n" ); |
5560 | return SmallIC; |
5561 | } |
5562 | } |
5563 | |
5564 | // Interleave if this is a large loop (small loops are already dealt with by |
5565 | // this point) that could benefit from interleaving. |
5566 | if (AggressivelyInterleaveReductions) { |
5567 | LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n" ); |
5568 | return IC; |
5569 | } |
5570 | |
5571 | LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n" ); |
5572 | return 1; |
5573 | } |
5574 | |
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi map that
  // holds the list of intervals that *end* at a specific location. This multi
  // map allows us to perform a linear search. We scan the instructions linearly
  // and record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more register.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  // Scratch result; refilled for each VF in the final loop below and copied
  // into the returned RUs vector.
  RegisterUsage RU;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  using IntervalMap = DenseMap<Instruction *, unsigned>;

  // Maps instruction to its index.
  SmallVector<Instruction *, 64> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallPtrSet<Instruction *, 8> Ends;
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).
  SmallSetVector<Instruction *, 8> LoopInvariants;

  for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      IdxToInstr.push_back(Elt: &I);

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(Val: U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants).
        if (!Instr)
          continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Inst: Instr)) {
          LoopInvariants.insert(X: Instr);
          continue;
        }

        // Overwrite previous end points.
        // IdxToInstr.size() is the index of the current user I (just pushed),
        // so the interval of Instr now extends at least up to I.
        EndPoint[Instr] = IdxToInstr.size();
        Ends.insert(Ptr: Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  using InstrList = SmallVector<Instruction *, 2>;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (auto &Interval : EndPoint)
    TransposeEnds[Interval.second].push_back(Elt: Interval.first);

  SmallPtrSet<Instruction *, 8> OpenIntervals;
  SmallVector<RegisterUsage, 8> RUs(VFs.size());
  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n" );

  // Number of registers needed to hold one VF-wide vector of Ty; 0 for types
  // that cannot be vector elements (e.g. tokens).
  const auto &TTICapture = TTI;
  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
    if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty))
      return 0;
    return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF));
  };

  for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
    Instruction *I = IdxToInstr[i];

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (Instruction *ToRemove : List)
      OpenIntervals.erase(Ptr: ToRemove);

    // Ignore instructions that are never used within the loop.
    if (!Ends.count(Ptr: I))
      continue;

    // Skip ignored values.
    if (ValuesToIgnore.count(Ptr: I))
      continue;

    // Refresh in-loop reduction info used by the per-VF scalar/uniform queries
    // below. NOTE(review): invoked once per live instruction; presumably
    // idempotent/cached — confirm.
    collectInLoopReductions();

    // For each VF find the maximum usage of registers.
    for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      SmallMapVector<unsigned, unsigned, 4> RegUsage;

      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VF: VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Ptr: Inst))
            continue;
          if (isScalarAfterVectorization(I: Inst, VF: VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
            // FIXME: The target might use more than one register for the type
            // even in the scalar case.
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(Vector: true, Ty: Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }

      // Fold this location's per-class usage into the running per-VF maximum.
      for (auto& pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(a: Entry, b: pair.second);
      }
    }

    LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(Ptr: I);
  }

  // Now account for loop-invariant values and assemble one RegisterUsage
  // result per requested VF.
  for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
    // Note that elements in this SmallMapVector will be default constructed
    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
    // there is no previous entry for ClassID.
    SmallMapVector<unsigned, unsigned, 4> Invariant;

    for (auto *Inst : LoopInvariants) {
      // FIXME: The target might use more than one register for the type
      // even in the scalar case.
      // An invariant only needs a scalar register if every user inside this
      // loop stays scalar after vectorization (users outside the loop don't
      // force a vector value).
      bool IsScalar = all_of(Range: Inst->users(), P: [&](User *U) {
        auto *I = cast<Instruction>(Val: U);
        return TheLoop != LI->getLoopFor(BB: I->getParent()) ||
               isScalarAfterVectorization(I, VF: VFs[i]);
      });

      ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[i];
      unsigned ClassID =
          TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: Inst->getType());
      Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
    }

    LLVM_DEBUG({
      dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
      dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
             << " item\n" ;
      for (const auto &pair : MaxUsages[i]) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n" ;
      }
      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
             << " item\n" ;
      for (const auto &pair : Invariant) {
        dbgs() << "LV(REG): RegisterClass: "
               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
               << " registers\n" ;
      }
    });

    RU.LoopInvariantRegs = Invariant;
    RU.MaxLocalUsers = MaxUsages[i];
    RUs[i] = RU;
  }

  return RUs;
}
5778 | |
5779 | bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, |
5780 | ElementCount VF) { |
5781 | // TODO: Cost model for emulated masked load/store is completely |
5782 | // broken. This hack guides the cost model to use an artificially |
5783 | // high enough value to practically disable vectorization with such |
5784 | // operations, except where previously deployed legality hack allowed |
5785 | // using very low cost values. This is to avoid regressions coming simply |
5786 | // from moving "masked load/store" check from legality to cost model. |
5787 | // Masked Load/Gather emulation was previously never allowed. |
5788 | // Limited number of Masked Store/Scatter emulation was allowed. |
5789 | assert((isPredicatedInst(I)) && |
5790 | "Expecting a scalar emulated instruction" ); |
5791 | return isa<LoadInst>(Val: I) || |
5792 | (isa<StoreInst>(Val: I) && |
5793 | NumPredStores > NumberOfStoresToPredicate); |
5794 | } |
5795 | |
5796 | void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { |
5797 | // If we aren't vectorizing the loop, or if we've already collected the |
5798 | // instructions to scalarize, there's nothing to do. Collection may already |
5799 | // have occurred if we have a user-selected VF and are now computing the |
5800 | // expected cost for interleaving. |
5801 | if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(Val: VF)) |
5802 | return; |
5803 | |
5804 | // Initialize a mapping for VF in InstsToScalalarize. If we find that it's |
5805 | // not profitable to scalarize any instructions, the presence of VF in the |
5806 | // map will indicate that we've analyzed it already. |
5807 | ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; |
5808 | |
5809 | PredicatedBBsAfterVectorization[VF].clear(); |
5810 | |
5811 | // Find all the instructions that are scalar with predication in the loop and |
5812 | // determine if it would be better to not if-convert the blocks they are in. |
5813 | // If so, we also record the instructions to scalarize. |
5814 | for (BasicBlock *BB : TheLoop->blocks()) { |
5815 | if (!blockNeedsPredicationForAnyReason(BB)) |
5816 | continue; |
5817 | for (Instruction &I : *BB) |
5818 | if (isScalarWithPredication(I: &I, VF)) { |
5819 | ScalarCostsTy ScalarCosts; |
5820 | // Do not apply discount if scalable, because that would lead to |
5821 | // invalid scalarization costs. |
5822 | // Do not apply discount logic if hacked cost is needed |
5823 | // for emulated masked memrefs. |
5824 | if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() && |
5825 | !useEmulatedMaskMemRefHack(I: &I, VF) && |
5826 | computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) |
5827 | ScalarCostsVF.insert(I: ScalarCosts.begin(), E: ScalarCosts.end()); |
5828 | // Remember that BB will remain after vectorization. |
5829 | PredicatedBBsAfterVectorization[VF].insert(Ptr: BB); |
5830 | } |
5831 | } |
5832 | } |
5833 | |
// Estimates the cost saved by scalarizing the single-use chain of
// instructions feeding PredInst (instead of vectorizing it), filling
// ScalarCosts with the scalarized cost of each chain member. A non-negative
// result means scalarization is at least as cheap as vectorization.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated" );

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Val: I))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    // (VF.getFixedValue() is safe here: scalable VFs never reach this
    // function — collectInstsToScalarize skips the discount for them.)
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      // The scalarized value must be re-packed into a vector (one insert per
      // lane) and merged across the predicated block boundary (one phi per
      // lane).
      ScalarCost += TTI.getScalarizationOverhead(
          Ty: cast<VectorType>(Val: ToVectorTy(Scalar: I->getType(), EC: VF)),
          DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ true,
          /*Extract*/ false, CostKind);
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type" );
        if (canBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          ScalarCost += TTI.getScalarizationOverhead(
              Ty: cast<VectorType>(Val: ToVectorTy(Scalar: J->getType(), EC: VF)),
              DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
              /*Extract*/ true, CostKind);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5947 | |
5948 | LoopVectorizationCostModel::VectorizationCostTy |
5949 | LoopVectorizationCostModel::expectedCost( |
5950 | ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) { |
5951 | VectorizationCostTy Cost; |
5952 | |
5953 | // For each block. |
5954 | for (BasicBlock *BB : TheLoop->blocks()) { |
5955 | VectorizationCostTy BlockCost; |
5956 | |
5957 | // For each instruction in the old loop. |
5958 | for (Instruction &I : BB->instructionsWithoutDebug()) { |
5959 | // Skip ignored values. |
5960 | if (ValuesToIgnore.count(Ptr: &I) || |
5961 | (VF.isVector() && VecValuesToIgnore.count(Ptr: &I))) |
5962 | continue; |
5963 | |
5964 | VectorizationCostTy C = getInstructionCost(I: &I, VF); |
5965 | |
5966 | // Check if we should override the cost. |
5967 | if (C.first.isValid() && |
5968 | ForceTargetInstructionCost.getNumOccurrences() > 0) |
5969 | C.first = InstructionCost(ForceTargetInstructionCost); |
5970 | |
5971 | // Keep a list of instructions with invalid costs. |
5972 | if (Invalid && !C.first.isValid()) |
5973 | Invalid->emplace_back(Args: &I, Args&: VF); |
5974 | |
5975 | BlockCost.first += C.first; |
5976 | BlockCost.second |= C.second; |
5977 | LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first |
5978 | << " for VF " << VF << " For instruction: " << I |
5979 | << '\n'); |
5980 | } |
5981 | |
5982 | // If we are vectorizing a predicated block, it will have been |
5983 | // if-converted. This means that the block's instructions (aside from |
5984 | // stores and instructions that may divide by zero) will now be |
5985 | // unconditionally executed. For the scalar case, we may not always execute |
5986 | // the predicated block, if it is an if-else block. Thus, scale the block's |
5987 | // cost by the probability of executing it. blockNeedsPredication from |
5988 | // Legal is used so as to not include all blocks in tail folded loops. |
5989 | if (VF.isScalar() && Legal->blockNeedsPredication(BB)) |
5990 | BlockCost.first /= getReciprocalPredBlockProb(); |
5991 | |
5992 | Cost.first += BlockCost.first; |
5993 | Cost.second |= BlockCost.second; |
5994 | } |
5995 | |
5996 | return Cost; |
5997 | } |
5998 | |
5999 | /// Gets Address Access SCEV after verifying that the access pattern |
6000 | /// is loop invariant except the induction variable dependence. |
6001 | /// |
6002 | /// This SCEV can be sent to the Target in order to estimate the address |
6003 | /// calculation cost. |
6004 | static const SCEV *getAddressAccessSCEV( |
6005 | Value *Ptr, |
6006 | LoopVectorizationLegality *Legal, |
6007 | PredicatedScalarEvolution &PSE, |
6008 | const Loop *TheLoop) { |
6009 | |
6010 | auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr); |
6011 | if (!Gep) |
6012 | return nullptr; |
6013 | |
6014 | // We are looking for a gep with all loop invariant indices except for one |
6015 | // which should be an induction variable. |
6016 | auto SE = PSE.getSE(); |
6017 | unsigned NumOperands = Gep->getNumOperands(); |
6018 | for (unsigned i = 1; i < NumOperands; ++i) { |
6019 | Value *Opd = Gep->getOperand(i_nocapture: i); |
6020 | if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) && |
6021 | !Legal->isInductionVariable(V: Opd)) |
6022 | return nullptr; |
6023 | } |
6024 | |
6025 | // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. |
6026 | return PSE.getSCEV(V: Ptr); |
6027 | } |
6028 | |
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  // Cost of executing the load/store as VF independent scalar accesses,
  // including the insert/extract traffic needed to move values between
  // the scalar copies and the surrounding vectorized code.
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization." );
  // There is no mechanism to emit a scalarization loop for scalable
  // vectors, so such a VF cannot be costed here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = ToVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation,
  // paid once per (known-minimum) lane.
  InstructionCost Cost =
      VF.getKnownMinValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  const Align Alignment = getLoadStoreAlignment(I);
  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
                                                      Src: ValTy->getScalarType(),
                                                      Alignment, AddressSpace: AS, CostKind);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF, CostKind);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getReciprocalPredBlockProb();

    // Add the cost of an i1 extract and a branch
    auto *Vec_i1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
6088 | |
6089 | InstructionCost |
6090 | LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, |
6091 | ElementCount VF) { |
6092 | Type *ValTy = getLoadStoreType(I); |
6093 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6094 | Value *Ptr = getLoadStorePointerOperand(V: I); |
6095 | unsigned AS = getLoadStoreAddressSpace(I); |
6096 | int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr); |
6097 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6098 | |
6099 | assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && |
6100 | "Stride should be 1 or -1 for consecutive memory access" ); |
6101 | const Align Alignment = getLoadStoreAlignment(I); |
6102 | InstructionCost Cost = 0; |
6103 | if (Legal->isMaskRequired(I)) { |
6104 | Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
6105 | CostKind); |
6106 | } else { |
6107 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
6108 | Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS, |
6109 | CostKind, OpdInfo: OpInfo, I); |
6110 | } |
6111 | |
6112 | bool Reverse = ConsecutiveStride < 0; |
6113 | if (Reverse) |
6114 | Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy, |
6115 | Mask: std::nullopt, CostKind, Index: 0); |
6116 | return Cost; |
6117 | } |
6118 | |
6119 | InstructionCost |
6120 | LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, |
6121 | ElementCount VF) { |
6122 | assert(Legal->isUniformMemOp(*I, VF)); |
6123 | |
6124 | Type *ValTy = getLoadStoreType(I); |
6125 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6126 | const Align Alignment = getLoadStoreAlignment(I); |
6127 | unsigned AS = getLoadStoreAddressSpace(I); |
6128 | enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
6129 | if (isa<LoadInst>(Val: I)) { |
6130 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6131 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS, |
6132 | CostKind) + |
6133 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VectorTy); |
6134 | } |
6135 | StoreInst *SI = cast<StoreInst>(Val: I); |
6136 | |
6137 | bool isLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand()); |
6138 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6139 | TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, |
6140 | CostKind) + |
6141 | (isLoopInvariantStoreValue |
6142 | ? 0 |
6143 | : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy, |
6144 | CostKind, Index: VF.getKnownMinValue() - 1)); |
6145 | } |
6146 | |
6147 | InstructionCost |
6148 | LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, |
6149 | ElementCount VF) { |
6150 | Type *ValTy = getLoadStoreType(I); |
6151 | auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF)); |
6152 | const Align Alignment = getLoadStoreAlignment(I); |
6153 | const Value *Ptr = getLoadStorePointerOperand(V: I); |
6154 | |
6155 | return TTI.getAddressComputationCost(Ty: VectorTy) + |
6156 | TTI.getGatherScatterOpCost( |
6157 | Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, VariableMask: Legal->isMaskRequired(I), Alignment, |
6158 | CostKind: TargetTransformInfo::TCK_RecipThroughput, I); |
6159 | } |
6160 | |
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // Cost of emitting I's entire interleave group as one wide access plus
  // the shuffles needed to (de)interleave the member values.
  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
  unsigned AS = getLoadStoreAddressSpace(I);
  enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  auto Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Fail to get an interleaved access group." );

  unsigned InterleaveFactor = Group->getFactor();
  // The wide access covers all members, i.e. VF * Factor elements.
  auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(Index: IF))
      Indices.push_back(Elt: IF);

  // Calculate the cost of the whole interleaved group. Gaps need masking
  // when a required scalar epilogue is not allowed, or when storing a group
  // with missing members.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      Opcode: I->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, Alignment: Group->getAlign(),
      AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported." );
    // Each present member additionally needs its lanes reversed.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
                               Mask: std::nullopt, CostKind, Index: 0);
  }
  return Cost;
}
6199 | |
6200 | std::optional<InstructionCost> |
6201 | LoopVectorizationCostModel::getReductionPatternCost( |
6202 | Instruction *I, ElementCount VF, Type *Ty, |
6203 | TTI::TargetCostKind CostKind) const { |
6204 | using namespace llvm::PatternMatch; |
6205 | // Early exit for no inloop reductions |
6206 | if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty)) |
6207 | return std::nullopt; |
6208 | auto *VectorTy = cast<VectorType>(Val: Ty); |
6209 | |
6210 | // We are looking for a pattern of, and finding the minimal acceptable cost: |
6211 | // reduce(mul(ext(A), ext(B))) or |
6212 | // reduce(mul(A, B)) or |
6213 | // reduce(ext(A)) or |
6214 | // reduce(A). |
6215 | // The basic idea is that we walk down the tree to do that, finding the root |
6216 | // reduction instruction in InLoopReductionImmediateChains. From there we find |
6217 | // the pattern of mul/ext and test the cost of the entire pattern vs the cost |
6218 | // of the components. If the reduction cost is lower then we return it for the |
6219 | // reduction instruction and 0 for the other instructions in the pattern. If |
6220 | // it is not we return an invalid cost specifying the orignal cost method |
6221 | // should be used. |
6222 | Instruction *RetI = I; |
6223 | if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) { |
6224 | if (!RetI->hasOneUser()) |
6225 | return std::nullopt; |
6226 | RetI = RetI->user_back(); |
6227 | } |
6228 | |
6229 | if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) && |
6230 | RetI->user_back()->getOpcode() == Instruction::Add) { |
6231 | RetI = RetI->user_back(); |
6232 | } |
6233 | |
6234 | // Test if the found instruction is a reduction, and if not return an invalid |
6235 | // cost specifying the parent to use the original cost modelling. |
6236 | if (!InLoopReductionImmediateChains.count(Val: RetI)) |
6237 | return std::nullopt; |
6238 | |
6239 | // Find the reduction this chain is a part of and calculate the basic cost of |
6240 | // the reduction on its own. |
6241 | Instruction *LastChain = InLoopReductionImmediateChains.at(Val: RetI); |
6242 | Instruction *ReductionPhi = LastChain; |
6243 | while (!isa<PHINode>(Val: ReductionPhi)) |
6244 | ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi); |
6245 | |
6246 | const RecurrenceDescriptor &RdxDesc = |
6247 | Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second; |
6248 | |
6249 | InstructionCost BaseCost = TTI.getArithmeticReductionCost( |
6250 | Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind); |
6251 | |
6252 | // For a call to the llvm.fmuladd intrinsic we need to add the cost of a |
6253 | // normal fmul instruction to the cost of the fadd reduction. |
6254 | if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) |
6255 | BaseCost += |
6256 | TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind); |
6257 | |
6258 | // If we're using ordered reductions then we can just return the base cost |
6259 | // here, since getArithmeticReductionCost calculates the full ordered |
6260 | // reduction cost when FP reassociation is not allowed. |
6261 | if (useOrderedReductions(RdxDesc)) |
6262 | return BaseCost; |
6263 | |
6264 | // Get the operand that was not the reduction chain and match it to one of the |
6265 | // patterns, returning the better cost if it is found. |
6266 | Instruction *RedOp = RetI->getOperand(i: 1) == LastChain |
6267 | ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0)) |
6268 | : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1)); |
6269 | |
6270 | VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy); |
6271 | |
6272 | Instruction *Op0, *Op1; |
6273 | if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
6274 | match(V: RedOp, |
6275 | P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) && |
6276 | match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
6277 | Op0->getOpcode() == Op1->getOpcode() && |
6278 | Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() && |
6279 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) && |
6280 | (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) { |
6281 | |
6282 | // Matched reduce.add(ext(mul(ext(A), ext(B))) |
6283 | // Note that the extend opcodes need to all match, or if A==B they will have |
6284 | // been converted to zext(mul(sext(A), sext(A))) as it is known positive, |
6285 | // which is equally fine. |
6286 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
6287 | auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy); |
6288 | auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy); |
6289 | |
6290 | InstructionCost ExtCost = |
6291 | TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType, |
6292 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
6293 | InstructionCost MulCost = |
6294 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind); |
6295 | InstructionCost Ext2Cost = |
6296 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType, |
6297 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
6298 | |
6299 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6300 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
6301 | |
6302 | if (RedCost.isValid() && |
6303 | RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost) |
6304 | return I == RetI ? RedCost : 0; |
6305 | } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) && |
6306 | !TheLoop->isLoopInvariant(V: RedOp)) { |
6307 | // Matched reduce(ext(A)) |
6308 | bool IsUnsigned = isa<ZExtInst>(Val: RedOp); |
6309 | auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy); |
6310 | InstructionCost RedCost = TTI.getExtendedReductionCost( |
6311 | Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, |
6312 | FMF: RdxDesc.getFastMathFlags(), CostKind); |
6313 | |
6314 | InstructionCost ExtCost = |
6315 | TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType, |
6316 | CCH: TTI::CastContextHint::None, CostKind, I: RedOp); |
6317 | if (RedCost.isValid() && RedCost < BaseCost + ExtCost) |
6318 | return I == RetI ? RedCost : 0; |
6319 | } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add && |
6320 | match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) { |
6321 | if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) && |
6322 | Op0->getOpcode() == Op1->getOpcode() && |
6323 | !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) { |
6324 | bool IsUnsigned = isa<ZExtInst>(Val: Op0); |
6325 | Type *Op0Ty = Op0->getOperand(i: 0)->getType(); |
6326 | Type *Op1Ty = Op1->getOperand(i: 0)->getType(); |
6327 | Type *LargestOpTy = |
6328 | Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty |
6329 | : Op0Ty; |
6330 | auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy); |
6331 | |
6332 | // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of |
6333 | // different sizes. We take the largest type as the ext to reduce, and add |
6334 | // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). |
6335 | InstructionCost ExtCost0 = TTI.getCastInstrCost( |
6336 | Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy), |
6337 | CCH: TTI::CastContextHint::None, CostKind, I: Op0); |
6338 | InstructionCost ExtCost1 = TTI.getCastInstrCost( |
6339 | Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy), |
6340 | CCH: TTI::CastContextHint::None, CostKind, I: Op1); |
6341 | InstructionCost MulCost = |
6342 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6343 | |
6344 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6345 | IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind); |
6346 | InstructionCost = 0; |
6347 | if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { |
6348 | Instruction * = (Op0Ty != LargestOpTy) ? Op0 : Op1; |
6349 | ExtraExtCost = TTI.getCastInstrCost( |
6350 | Opcode: ExtraExtOp->getOpcode(), Dst: ExtType, |
6351 | Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy), |
6352 | CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp); |
6353 | } |
6354 | |
6355 | if (RedCost.isValid() && |
6356 | (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) |
6357 | return I == RetI ? RedCost : 0; |
6358 | } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) { |
6359 | // Matched reduce.add(mul()) |
6360 | InstructionCost MulCost = |
6361 | TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind); |
6362 | |
6363 | InstructionCost RedCost = TTI.getMulAccReductionCost( |
6364 | IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind); |
6365 | |
6366 | if (RedCost.isValid() && RedCost < MulCost + BaseCost) |
6367 | return I == RetI ? RedCost : 0; |
6368 | } |
6369 | } |
6370 | |
6371 | return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt; |
6372 | } |
6373 | |
6374 | InstructionCost |
6375 | LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, |
6376 | ElementCount VF) { |
6377 | // Calculate scalar cost only. Vectorization cost should be ready at this |
6378 | // moment. |
6379 | if (VF.isScalar()) { |
6380 | Type *ValTy = getLoadStoreType(I); |
6381 | const Align Alignment = getLoadStoreAlignment(I); |
6382 | unsigned AS = getLoadStoreAddressSpace(I); |
6383 | |
6384 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0)); |
6385 | return TTI.getAddressComputationCost(Ty: ValTy) + |
6386 | TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, |
6387 | CostKind: TTI::TCK_RecipThroughput, OpdInfo: OpInfo, I); |
6388 | } |
6389 | return getWideningCost(I, VF); |
6390 | } |
6391 | |
LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // Returns I's cost at VF together with a flag indicating whether the
  // computed vector type avoids being legalized back into scalar parts.
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return VectorizationCostTy(InstsToScalarize[VF][I], false);

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      // Cost is the scalar cost repeated once per (known-minimum) lane.
      return VectorizationCostTy(
          (getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)).first *
           VF.getKnownMinValue()),
          false);
  }

  Type *VectorTy;
  InstructionCost C = getInstructionCost(I, VF, VectorTy);

  bool TypeNotScalarized = false;
  if (VF.isVector() && VectorTy->isVectorTy()) {
    if (unsigned NumParts = TTI.getNumberOfParts(Tp: VectorTy)) {
      if (VF.isScalable())
        // <vscale x 1 x iN> is assumed to be profitable over iN because
        // scalable registers are a distinct register class from scalar ones.
        // If we ever find a target which wants to lower scalable vectors
        // back to scalars, we'll need to update this code to explicitly
        // ask TTI about the register class uses for each part.
        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
      else
        TypeNotScalarized = NumParts < VF.getKnownMinValue();
    } else
      // Zero parts means the target cannot legalize this type at all.
      C = InstructionCost::getInvalid();
  }
  return VectorizationCostTy(C, TypeNotScalarized);
}
6434 | |
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
  // Estimates the insert/extract traffic needed when I is executed as VF
  // scalar copies inside otherwise-vectorized code.

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = ToVectorTy(Scalar: I->getType(), EC: VF);
  // Cost of inserting the VF scalar results back into a vector, unless the
  // target can store/load individual vector elements efficiently for loads.
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore()))
    Cost += TTI.getScalarizationOverhead(
        Ty: cast<VectorType>(Val: RetTy), DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
        /*Insert*/ true,
        /*Extract*/ false, CostKind);

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(Val: I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(Elt: MaybeVectorizeType(Elt: V->getType(), VF));
  return Cost + TTI.getOperandsScalarizationOverhead(
                    Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
}
6475 | |
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decides, for every memory instruction in the loop, how it should be
  // handled at this VF (widen, widen-reversed, interleave, gather/scatter or
  // scalarize), based on the relative costs, and records the decision so the
  // main costing pass and the vectorizer agree.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost = isLegalToScalarize() ?
          getUniformMemOpCost(I: &I, VF) : InstructionCost::getInvalid();

        // Choose better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride." );
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        auto Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group." );

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(Instr: &I))
        setWideningDecision(Grp: Group, VF, W: Decision, Cost);
      else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
      else if (auto Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(Index: I))
            setWideningDecision(
                I: Member, VF, W: CM_Scalarize,
                Cost: (VF.getKnownMinValue() *
                      getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
  }
}
6661 | |
/// For every call in the loop, decide how it will be widened at \p VF:
/// scalarized (CM_Scalarize), replaced by a vector library variant
/// (CM_VectorCall), or lowered to a vector intrinsic (CM_IntrinsicCall).
/// The cheapest of the costed alternatives wins and is recorded via
/// setCallWideningDecision for later use when building recipes.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF" );

  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Invalid costs mean "alternative not available"; a decision is only
      // taken in favor of an option whose cost was actually computed below.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      bool MaskRequired = Legal->isMaskRequired(I: CI);
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = ToVectorTy(Scalar: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: ToVectorTy(Scalar: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy, CostKind)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost =
          getScalarizationOverhead(I: CI, VF, CostKind);

      // Note: for a scalable VF this multiplies by the known minimum element
      // count, so the scalarization estimate is a lower bound.
      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      bool UsesMask = false;
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            // A masked variant is still usable when no mask is required, but
            // one must then be synthesized (costed further below).
            UsesMask = true;
            break;
          default:
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      // Add in the cost of synthesizing a mask if one wasn't required.
      InstructionCost MaskCost = 0;
      if (VecFunc && UsesMask && !MaskRequired)
        MaskCost = TTI.getShuffleCost(
            Kind: TargetTransformInfo::SK_Broadcast,
            Tp: VectorType::get(ElementType: IntegerType::getInt1Ty(
                                  C&: VecFunc->getFunctionType()->getContext()),
                              EC: VF));

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost =
            TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind) + MaskCost;

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Ties are broken (<=) in favor of the later alternatives: a vector
      // call beats scalarization, and an intrinsic beats both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
6823 | |
/// Compute the expected cost of instruction \p I when vectorized with
/// factor \p VF. \p VectorTy is an out-parameter set to the type the
/// instruction was costed at: the (possibly bit-width-minimized) scalar type
/// when I stays scalar after vectorization, otherwise its widened vector
/// type. Returns an invalid cost for combinations that cannot be vectorized
/// (e.g. scalarized predicated blocks at scalable VFs).
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  // If the result is known to be shrinkable, cost the instruction at its
  // minimal integer bit width rather than the declared type.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Assertion helper: true if vectorization produces exactly one copy of I,
  // i.e. neither I nor any of its users was chosen for scalarization at VF.
  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(Val: VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability" );
    return !Scalarized->second.count(Val: I) &&
           llvm::all_of(Range: I->users(), P: [&](User *U) {
             auto *UI = cast<Instruction>(Val: U);
             return !Scalarized->second.count(Val: UI);
           });
  };
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(Scalar: RetTy, EC: VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(Val: I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (
          TTI.getScalarizationOverhead(
              Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Costed as a splice shuffle rotating the last element of the previous
      // iteration's vector into lane 0.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                Tp: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: ToVectorTy(Scalar: Phi->getType(), EC: VF),
                 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // A div/rem that needs predication is either scalarized with predication
    // or rewritten to use a "safe" divisor; pick the cheaper strategy.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away.  We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        (PSE.getSCEV(V: I->getOperand(i: 0))->isOne() ||
         PSE.getSCEV(V: I->getOperand(i: 1))->isOne()))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        Legal->isInvariant(V: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    // Logical and/or selects lower to bitwise ops; cost those instead.
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
          CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();
    // Compares are costed at the (possibly shrunk) width of their operand,
    // not the i1 result.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: nullptr,
                                  VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // Use the widening decision taken earlier; a CM_Scalarize decision means
    // the memory op is costed at the scalar width.
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point" );
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = ToVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!" );

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?" );
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision" );
      }

      llvm_unreachable("Unhandled case!" );
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(T1: SrcVecTy, T2: MinVecTy);
        VectorTy =
            largestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        // Leave SrcVecTy unchanged - we only shrink the destination element
        // type.
        VectorTy =
            smallestIntegerVectorType(T1: ToVectorTy(Scalar: I->getType(), EC: VF), T2: MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(U: I, CostKind: TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
7156 | |
7157 | void LoopVectorizationCostModel::collectValuesToIgnore() { |
7158 | // Ignore ephemeral values. |
7159 | CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore); |
7160 | |
7161 | // Find all stores to invariant variables. Since they are going to sink |
7162 | // outside the loop we do not need calculate cost for them. |
7163 | for (BasicBlock *BB : TheLoop->blocks()) |
7164 | for (Instruction &I : *BB) { |
7165 | StoreInst *SI; |
7166 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
7167 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
7168 | ValuesToIgnore.insert(Ptr: &I); |
7169 | } |
7170 | |
7171 | // Ignore type-promoting instructions we identified during reduction |
7172 | // detection. |
7173 | for (const auto &Reduction : Legal->getReductionVars()) { |
7174 | const RecurrenceDescriptor &RedDes = Reduction.second; |
7175 | const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); |
7176 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
7177 | } |
7178 | // Ignore type-casting instructions we identified during induction |
7179 | // detection. |
7180 | for (const auto &Induction : Legal->getInductionVars()) { |
7181 | const InductionDescriptor &IndDes = Induction.second; |
7182 | const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); |
7183 | VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end()); |
7184 | } |
7185 | } |
7186 | |
7187 | void LoopVectorizationCostModel::collectInLoopReductions() { |
7188 | for (const auto &Reduction : Legal->getReductionVars()) { |
7189 | PHINode *Phi = Reduction.first; |
7190 | const RecurrenceDescriptor &RdxDesc = Reduction.second; |
7191 | |
7192 | // We don't collect reductions that are type promoted (yet). |
7193 | if (RdxDesc.getRecurrenceType() != Phi->getType()) |
7194 | continue; |
7195 | |
7196 | // If the target would prefer this reduction to happen "in-loop", then we |
7197 | // want to record it as such. |
7198 | unsigned Opcode = RdxDesc.getOpcode(); |
7199 | if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && |
7200 | !TTI.preferInLoopReduction(Opcode, Ty: Phi->getType(), |
7201 | Flags: TargetTransformInfo::ReductionFlags())) |
7202 | continue; |
7203 | |
7204 | // Check that we can correctly put the reductions into the loop, by |
7205 | // finding the chain of operations that leads from the phi to the loop |
7206 | // exit value. |
7207 | SmallVector<Instruction *, 4> ReductionOperations = |
7208 | RdxDesc.getReductionOpChain(Phi, L: TheLoop); |
7209 | bool InLoop = !ReductionOperations.empty(); |
7210 | |
7211 | if (InLoop) { |
7212 | InLoopReductions.insert(Ptr: Phi); |
7213 | // Add the elements to InLoopReductionImmediateChains for cost modelling. |
7214 | Instruction *LastChain = Phi; |
7215 | for (auto *I : ReductionOperations) { |
7216 | InLoopReductionImmediateChains[I] = LastChain; |
7217 | LastChain = I; |
7218 | } |
7219 | } |
7220 | LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop" ) |
7221 | << " reduction for phi: " << *Phi << "\n" ); |
7222 | } |
7223 | } |
7224 | |
7225 | VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, |
7226 | DebugLoc DL, const Twine &Name) { |
7227 | assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && |
7228 | Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate" ); |
7229 | return tryInsertInstruction( |
7230 | VPI: new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name)); |
7231 | } |
7232 | |
7233 | // This function will select a scalable VF if the target supports scalable |
7234 | // vectors and a fixed one otherwise. |
7235 | // TODO: we could return a pair of values that specify the max VF and |
7236 | // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of |
7237 | // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment |
7238 | // doesn't have a cost model that can choose which plan to execute if |
7239 | // more than one is generated. |
7240 | static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, |
7241 | LoopVectorizationCostModel &CM) { |
7242 | unsigned WidestType; |
7243 | std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes(); |
7244 | |
7245 | TargetTransformInfo::RegisterKind RegKind = |
7246 | TTI.enableScalableVectorization() |
7247 | ? TargetTransformInfo::RGK_ScalableVector |
7248 | : TargetTransformInfo::RGK_FixedWidthVector; |
7249 | |
7250 | TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind); |
7251 | unsigned N = RegSize.getKnownMinValue() / WidestType; |
7252 | return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable()); |
7253 | } |
7254 | |
/// Planning entry point for the VPlan-native (outer-loop) path: pick a VF
/// (user-provided or target-derived), build the corresponding VPlan, and
/// return the chosen factor. Returns Disabled() when vectorization on this
/// path is not possible or when only stress-testing VPlan construction.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n" );

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n" );
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honored; report and bail out rather
      // than silently falling back to a fixed width.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n" );
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target" ,
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors." ,
          ORETag: "ScalableVFUnfeasible" , ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled." );
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two" );
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "" )
                      << "VF " << VF << " to build VPlans.\n" );
    buildVPlans(MinVF: VF, MaxVF: VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    // Costs are not modelled on this path yet; report zero cost.
    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n" );
  return VectorizationFactor::Disabled();
}
7306 | |
/// Main planning entry point for inner loops: compute the maximum feasible
/// fixed and scalable VFs, build VPlans for all candidate factors (or just
/// the user-requested one), and return the most profitable factor.
/// Returns std::nullopt when no plan can be built and Disabled() when only
/// scalar execution is viable.
std::optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected." );
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return std::nullopt;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n" );
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // If the user forced a VF and it is achievable, plan for just that factor.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two" );
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    CM.collectInLoopReductions();
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n" );
      buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
      if (!hasPlanWithVF(VF: UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
                          << ".\n" );
        return std::nullopt;
      }

      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0, 0}};
    } else
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs." ,
                              ORETag: "InvalidCost" , ORE, TheLoop: OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates.
  // Candidates are all powers of two up to the respective maxima.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(V: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(V: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero." );
  if (!hasPlanWithVF(VF: VF.Width)) {
    LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
                      << ".\n" );
    return std::nullopt;
  }
  return VF;
}
7393 | |
7394 | VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { |
7395 | assert(count_if(VPlans, |
7396 | [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == |
7397 | 1 && |
7398 | "Best VF has not a single VPlan." ); |
7399 | |
7400 | for (const VPlanPtr &Plan : VPlans) { |
7401 | if (Plan->hasVF(VF)) |
7402 | return *Plan.get(); |
7403 | } |
7404 | llvm_unreachable("No plan found!" ); |
7405 | } |
7406 | |
7407 | static void AddRuntimeUnrollDisableMetaData(Loop *L) { |
7408 | SmallVector<Metadata *, 4> MDs; |
7409 | // Reserve first location for self reference to the LoopID metadata node. |
7410 | MDs.push_back(Elt: nullptr); |
7411 | bool IsUnrollMetadata = false; |
7412 | MDNode *LoopID = L->getLoopID(); |
7413 | if (LoopID) { |
7414 | // First find existing loop unrolling disable metadata. |
7415 | for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { |
7416 | auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i)); |
7417 | if (MD) { |
7418 | const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0)); |
7419 | IsUnrollMetadata = |
7420 | S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable" ); |
7421 | } |
7422 | MDs.push_back(Elt: LoopID->getOperand(I: i)); |
7423 | } |
7424 | } |
7425 | |
7426 | if (!IsUnrollMetadata) { |
7427 | // Add runtime unroll disable metadata. |
7428 | LLVMContext &Context = L->getHeader()->getContext(); |
7429 | SmallVector<Metadata *, 1> DisableOperands; |
7430 | DisableOperands.push_back( |
7431 | Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable" )); |
7432 | MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands); |
7433 | MDs.push_back(Elt: DisableNode); |
7434 | MDNode *NewLoopID = MDNode::get(Context, MDs); |
7435 | // Set operand 0 to refer to the loop id itself. |
7436 | NewLoopID->replaceOperandWith(I: 0, New: NewLoopID); |
7437 | L->setLoopID(NewLoopID); |
7438 | } |
7439 | } |
7440 | |
7441 | // Check if \p RedResult is a ComputeReductionResult instruction, and if it is |
7442 | // create a merge phi node for it and add it to \p ReductionResumeValues. |
7443 | static void createAndCollectMergePhiForReduction( |
7444 | VPInstruction *RedResult, |
7445 | DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues, |
7446 | VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock) { |
7447 | if (!RedResult || |
7448 | RedResult->getOpcode() != VPInstruction::ComputeReductionResult) |
7449 | return; |
7450 | |
7451 | auto *PhiR = cast<VPReductionPHIRecipe>(Val: RedResult->getOperand(N: 0)); |
7452 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
7453 | |
7454 | TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue(); |
7455 | Value *FinalValue = |
7456 | State.get(Def: RedResult, Instance: VPIteration(State.UF - 1, VPLane::getFirstLane())); |
7457 | auto *ResumePhi = |
7458 | dyn_cast<PHINode>(Val: PhiR->getStartValue()->getUnderlyingValue()); |
7459 | |
7460 | // TODO: bc.merge.rdx should not be created here, instead it should be |
7461 | // modeled in VPlan. |
7462 | BasicBlock * = OrigLoop->getLoopPreheader(); |
7463 | // Create a phi node that merges control-flow from the backedge-taken check |
7464 | // block and the middle block. |
7465 | auto *BCBlockPhi = |
7466 | PHINode::Create(Ty: FinalValue->getType(), NumReservedValues: 2, NameStr: "bc.merge.rdx" , |
7467 | InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator()); |
7468 | |
7469 | // If we are fixing reductions in the epilogue loop then we should already |
7470 | // have created a bc.merge.rdx Phi after the main vector body. Ensure that |
7471 | // we carry over the incoming values correctly. |
7472 | for (auto *Incoming : predecessors(BB: LoopScalarPreHeader)) { |
7473 | if (Incoming == LoopMiddleBlock) |
7474 | BCBlockPhi->addIncoming(V: FinalValue, BB: Incoming); |
7475 | else if (ResumePhi && is_contained(Range: ResumePhi->blocks(), Element: Incoming)) |
7476 | BCBlockPhi->addIncoming(V: ResumePhi->getIncomingValueForBlock(BB: Incoming), |
7477 | BB: Incoming); |
7478 | else |
7479 | BCBlockPhi->addIncoming(V: ReductionStartValue, BB: Incoming); |
7480 | } |
7481 | |
7482 | auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue()); |
7483 | // TODO: This fixup should instead be modeled in VPlan. |
7484 | // Fix the scalar loop reduction variable with the incoming reduction sum |
7485 | // from the vector body and from the backedge value. |
7486 | int IncomingEdgeBlockIdx = |
7487 | OrigPhi->getBasicBlockIndex(BB: OrigLoop->getLoopLatch()); |
7488 | assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index" ); |
7489 | // Pick the other block. |
7490 | int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); |
7491 | OrigPhi->setIncomingValue(i: SelfEdgeBlockIdx, V: BCBlockPhi); |
7492 | Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); |
7493 | OrigPhi->setIncomingValue(i: IncomingEdgeBlockIdx, V: LoopExitInst); |
7494 | |
7495 | ReductionResumeValues[&RdxDesc] = BCBlockPhi; |
7496 | } |
7497 | |
7498 | std::pair<DenseMap<const SCEV *, Value *>, |
7499 | DenseMap<const RecurrenceDescriptor *, Value *>> |
7500 | LoopVectorizationPlanner::executePlan( |
7501 | ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, |
7502 | InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization, |
7503 | const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) { |
7504 | assert(BestVPlan.hasVF(BestVF) && |
7505 | "Trying to execute plan with unsupported VF" ); |
7506 | assert(BestVPlan.hasUF(BestUF) && |
7507 | "Trying to execute plan with unsupported UF" ); |
7508 | assert( |
7509 | (IsEpilogueVectorization || !ExpandedSCEVs) && |
7510 | "expanded SCEVs to reuse can only be used during epilogue vectorization" ); |
7511 | |
7512 | if (!IsEpilogueVectorization) |
7513 | VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE); |
7514 | |
7515 | LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF |
7516 | << ", UF=" << BestUF << '\n'); |
7517 | BestVPlan.setName("Final VPlan" ); |
7518 | LLVM_DEBUG(BestVPlan.dump()); |
7519 | |
7520 | // Perform the actual loop transformation. |
7521 | VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, |
7522 | OrigLoop->getHeader()->getContext()); |
7523 | |
7524 | // 0. Generate SCEV-dependent code into the preheader, including TripCount, |
7525 | // before making any changes to the CFG. |
7526 | if (!BestVPlan.getPreheader()->empty()) { |
7527 | State.CFG.PrevBB = OrigLoop->getLoopPreheader(); |
7528 | State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator()); |
7529 | BestVPlan.getPreheader()->execute(State: &State); |
7530 | } |
7531 | if (!ILV.getTripCount()) |
7532 | ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Instance: {0, 0})); |
7533 | else |
7534 | assert(IsEpilogueVectorization && "should only re-use the existing trip " |
7535 | "count during epilogue vectorization" ); |
7536 | |
7537 | // 1. Set up the skeleton for vectorization, including vector pre-header and |
7538 | // middle block. The vector loop is created during VPlan execution. |
7539 | Value *CanonicalIVStartValue; |
7540 | std::tie(args&: State.CFG.PrevBB, args&: CanonicalIVStartValue) = |
7541 | ILV.createVectorizedLoopSkeleton(ExpandedSCEVs: ExpandedSCEVs ? *ExpandedSCEVs |
7542 | : State.ExpandedSCEVs); |
7543 | |
7544 | // Only use noalias metadata when using memory checks guaranteeing no overlap |
7545 | // across all iterations. |
7546 | const LoopAccessInfo *LAI = ILV.Legal->getLAI(); |
7547 | std::unique_ptr<LoopVersioning> LVer = nullptr; |
7548 | if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() && |
7549 | !LAI->getRuntimePointerChecking()->getDiffChecks()) { |
7550 | |
7551 | // We currently don't use LoopVersioning for the actual loop cloning but we |
7552 | // still use it to add the noalias metadata. |
7553 | // TODO: Find a better way to re-use LoopVersioning functionality to add |
7554 | // metadata. |
7555 | LVer = std::make_unique<LoopVersioning>( |
7556 | args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop, args&: LI, args&: DT, |
7557 | args: PSE.getSE()); |
7558 | State.LVer = &*LVer; |
7559 | State.LVer->prepareNoAliasMetadata(); |
7560 | } |
7561 | |
7562 | ILV.printDebugTracesAtStart(); |
7563 | |
7564 | //===------------------------------------------------===// |
7565 | // |
7566 | // Notice: any optimization or new instruction that go |
7567 | // into the code below should also be implemented in |
7568 | // the cost-model. |
7569 | // |
7570 | //===------------------------------------------------===// |
7571 | |
7572 | // 2. Copy and widen instructions from the old loop into the new loop. |
7573 | BestVPlan.prepareToExecute(TripCount: ILV.getTripCount(), |
7574 | VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: nullptr), |
7575 | CanonicalIVStartValue, State); |
7576 | |
7577 | BestVPlan.execute(State: &State); |
7578 | |
7579 | // 2.5 Collect reduction resume values. |
7580 | DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues; |
7581 | auto *ExitVPBB = |
7582 | cast<VPBasicBlock>(Val: BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); |
7583 | for (VPRecipeBase &R : *ExitVPBB) { |
7584 | createAndCollectMergePhiForReduction(RedResult: dyn_cast<VPInstruction>(Val: &R), |
7585 | ReductionResumeValues, State, OrigLoop, |
7586 | LoopMiddleBlock: State.CFG.VPBB2IRBB[ExitVPBB]); |
7587 | } |
7588 | |
7589 | // 2.6. Maintain Loop Hints |
7590 | // Keep all loop hints from the original loop on the vector loop (we'll |
7591 | // replace the vectorizer-specific hints below). |
7592 | MDNode *OrigLoopID = OrigLoop->getLoopID(); |
7593 | |
7594 | std::optional<MDNode *> VectorizedLoopID = |
7595 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
7596 | LLVMLoopVectorizeFollowupVectorized}); |
7597 | |
7598 | VPBasicBlock * = |
7599 | BestVPlan.getVectorLoopRegion()->getEntryBasicBlock(); |
7600 | Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]); |
7601 | if (VectorizedLoopID) |
7602 | L->setLoopID(*VectorizedLoopID); |
7603 | else { |
7604 | // Keep all loop hints from the original loop on the vector loop (we'll |
7605 | // replace the vectorizer-specific hints below). |
7606 | if (MDNode *LID = OrigLoop->getLoopID()) |
7607 | L->setLoopID(LID); |
7608 | |
7609 | LoopVectorizeHints Hints(L, true, *ORE); |
7610 | Hints.setAlreadyVectorized(); |
7611 | } |
7612 | TargetTransformInfo::UnrollingPreferences UP; |
7613 | TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE); |
7614 | if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue) |
7615 | AddRuntimeUnrollDisableMetaData(L); |
7616 | |
7617 | // 3. Fix the vectorized code: take care of header phi's, live-outs, |
7618 | // predication, updating analyses. |
7619 | ILV.fixVectorizedLoop(State, Plan&: BestVPlan); |
7620 | |
7621 | ILV.printDebugTracesAtEnd(); |
7622 | |
7623 | return {State.ExpandedSCEVs, ReductionResumeValues}; |
7624 | } |
7625 | |
7626 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
7627 | void LoopVectorizationPlanner::printPlans(raw_ostream &O) { |
7628 | for (const auto &Plan : VPlans) |
7629 | if (PrintVPlansInDotFormat) |
7630 | Plan->printDOT(O); |
7631 | else |
7632 | Plan->print(O); |
7633 | } |
7634 | #endif |
7635 | |
7636 | //===--------------------------------------------------------------------===// |
7637 | // EpilogueVectorizerMainLoop |
7638 | //===--------------------------------------------------------------------===// |
7639 | |
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// First pass of epilogue vectorization: emits the guards for the main vector
/// loop and stashes the check blocks / vector trip count in EPI so the second
/// (epilogue) pass can rewire them. Returns the completed skeleton's preheader
/// and no resume value (inductions are handled in the second pass).
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  createVectorLoopSkeleton(Prefix: "" );

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check" );

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);

  // Generate the induction variable.
  EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  // Skip induction resume value creation here because they will be created in
  // the second pass for the scalar loop. The induction resume values for the
  // inductions in the epilogue loop are created before executing the plan for
  // the epilogue loop.

  return {completeLoopSkeleton(), nullptr};
}
7681 | |
7682 | void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { |
7683 | LLVM_DEBUG({ |
7684 | dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" |
7685 | << "Main Loop VF:" << EPI.MainLoopVF |
7686 | << ", Main Loop UF:" << EPI.MainLoopUF |
7687 | << ", Epilogue Loop VF:" << EPI.EpilogueVF |
7688 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7689 | }); |
7690 | } |
7691 | |
7692 | void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() { |
7693 | DEBUG_WITH_TYPE(VerboseDebug, { |
7694 | dbgs() << "intermediate fn:\n" |
7695 | << *OrigLoop->getHeader()->getParent() << "\n" ; |
7696 | }); |
7697 | } |
7698 | |
/// Emit the minimum-iteration-count guard for either the main vector loop
/// (\p ForEpilogue == false) or the vector epilogue loop
/// (\p ForEpilogue == true), branching to \p Bypass when the trip count is
/// too small. Returns the block holding the check.
BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block." );
  // The step to compare against is VF * UF of whichever loop this guard
  // protects.
  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getTripCount();
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop.
  // ULE (rather than ULT) when a scalar epilogue is required: at least one
  // scalar iteration must remain after the vector loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: ForEpilogue ? EPI.EpilogueVF.isVector()
                                           : VF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, LHS: Count, RHS: createStepForVF(B&: Builder, Ty: Count->getType(), VF: VFactor, Step: UFactor),
      Name: "min.iters.check" );

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check" );

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                   DT, LI, MSSAU: nullptr, BBName: "vector.ph" );

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass" );

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: TCCheckBlock);

    LoopBypassBlocks.push_back(Elt: TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the preheader's unconditional terminator with the guard branch.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  return TCCheckBlock;
}
7758 | |
7759 | //===--------------------------------------------------------------------===// |
7760 | // EpilogueVectorizerEpilogueLoop |
7761 | //===--------------------------------------------------------------------===// |
7762 | |
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass of epilogue vectorization: splices the epilogue vector loop in
/// behind the main vector loop created in the first pass, rewiring the check
/// blocks saved in EPI, the dominator tree, and the phis that merge the two
/// paths. Returns the completed skeleton's preheader and the induction resume
/// value for the epilogue loop.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
    const SCEV2ValueTy &ExpandedSCEVs) {
  createVectorLoopSkeleton(Prefix: "vec.epilog." );

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check" );
  LoopVectorPreHeader =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: "vec.epilog.ph" );
  emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
                                          Insert: VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass." );
  // Main-loop guard failure now jumps straight to the epilogue preheader.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);

  DT->changeImmediateDominator(BB: LoopVectorPreHeader,
                               NewBB: EPI.MainLoopIterationCountCheck);

  // The remaining guards from the first pass bypass to the scalar loop.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  DT->changeImmediateDominator(
      BB: VecEpilogueIterationCountCheck,
      NewBB: VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(BB: LoopScalarPreHeader,
                               NewBB: EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(BB: LoopExitBlock,
                                 NewBB: EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction and
  // reduction phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(Elt: EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(Elt: EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(Elt: EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the preheader.
  SmallVector<PHINode *, 4> PhisInBlock;
  // Snapshot the phis first: they are moved below, which would invalidate
  // iteration over the block's phi list.
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(Elt: &Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(MovePos: LoopVectorPreHeader->getFirstNonPHI());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(BB: EPI.MemSafetyCheck);
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(Ty: IdxTy, NumReservedValues: 2, NameStr: "vec.epilog.resume.val" );
  EPResumeVal->insertBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
  // Coming from the main vector loop: resume at its vector trip count;
  // coming from the skipped main loop: resume at zero.
  EPResumeVal->addIncoming(V: EPI.VectorTripCount, BB: VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(V: ConstantInt::get(Ty: IdxTy, V: 0),
                           BB: EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(ExpandedSCEVs,
                              AdditionalBypass: {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(), EPResumeVal};
}
7871 | |
7872 | BasicBlock * |
7873 | EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( |
7874 | BasicBlock *Bypass, BasicBlock *Insert) { |
7875 | |
7876 | assert(EPI.TripCount && |
7877 | "Expected trip count to have been safed in the first pass." ); |
7878 | assert( |
7879 | (!isa<Instruction>(EPI.TripCount) || |
7880 | DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) && |
7881 | "saved trip count does not dominate insertion point." ); |
7882 | Value *TC = EPI.TripCount; |
7883 | IRBuilder<> Builder(Insert->getTerminator()); |
7884 | Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining" ); |
7885 | |
7886 | // Generate code to check if the loop's trip count is less than VF * UF of the |
7887 | // vector epilogue loop. |
7888 | auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()) |
7889 | ? ICmpInst::ICMP_ULE |
7890 | : ICmpInst::ICMP_ULT; |
7891 | |
7892 | Value *CheckMinIters = |
7893 | Builder.CreateICmp(P, LHS: Count, |
7894 | RHS: createStepForVF(B&: Builder, Ty: Count->getType(), |
7895 | VF: EPI.EpilogueVF, Step: EPI.EpilogueUF), |
7896 | Name: "min.epilog.iters.check" ); |
7897 | |
7898 | BranchInst &BI = |
7899 | *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters); |
7900 | if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) { |
7901 | unsigned MainLoopStep = UF * VF.getKnownMinValue(); |
7902 | unsigned EpilogueLoopStep = |
7903 | EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue(); |
7904 | // We assume the remaining `Count` is equally distributed in |
7905 | // [0, MainLoopStep) |
7906 | // So the probability for `Count < EpilogueLoopStep` should be |
7907 | // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep |
7908 | unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep); |
7909 | const uint32_t Weights[] = {EstimatedSkipCount, |
7910 | MainLoopStep - EstimatedSkipCount}; |
7911 | setBranchWeights(I&: BI, Weights); |
7912 | } |
7913 | ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI); |
7914 | |
7915 | LoopBypassBlocks.push_back(Elt: Insert); |
7916 | return Insert; |
7917 | } |
7918 | |
7919 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { |
7920 | LLVM_DEBUG({ |
7921 | dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" |
7922 | << "Epilogue Loop VF:" << EPI.EpilogueVF |
7923 | << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n" ; |
7924 | }); |
7925 | } |
7926 | |
7927 | void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() { |
7928 | DEBUG_WITH_TYPE(VerboseDebug, { |
7929 | dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n" ; |
7930 | }); |
7931 | } |
7932 | |
7933 | bool LoopVectorizationPlanner::getDecisionAndClampRange( |
7934 | const std::function<bool(ElementCount)> &Predicate, VFRange &Range) { |
7935 | assert(!Range.isEmpty() && "Trying to test an empty VF range." ); |
7936 | bool PredicateAtRangeStart = Predicate(Range.Start); |
7937 | |
7938 | for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End)) |
7939 | if (Predicate(TmpVF) != PredicateAtRangeStart) { |
7940 | Range.End = TmpVF; |
7941 | break; |
7942 | } |
7943 | |
7944 | return PredicateAtRangeStart; |
7945 | } |
7946 | |
7947 | /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF, |
7948 | /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range |
7949 | /// of VF's starting at a given VF and extending it as much as possible. Each |
7950 | /// vectorization decision can potentially shorten this sub-range during |
7951 | /// buildVPlan(). |
7952 | void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, |
7953 | ElementCount MaxVF) { |
7954 | auto MaxVFTimes2 = MaxVF * 2; |
7955 | for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) { |
7956 | VFRange SubRange = {VF, MaxVFTimes2}; |
7957 | VPlans.push_back(Elt: buildVPlan(Range&: SubRange)); |
7958 | VF = SubRange.End; |
7959 | } |
7960 | } |
7961 | |
7962 | iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>> |
7963 | VPRecipeBuilder::mapToVPValues(User::op_range Operands) { |
7964 | std::function<VPValue *(Value *)> Fn = [this](Value *Op) { |
7965 | if (auto *I = dyn_cast<Instruction>(Val: Op)) { |
7966 | if (auto *R = Ingredient2Recipe.lookup(Val: I)) |
7967 | return R->getVPSingleValue(); |
7968 | } |
7969 | return Plan.getOrAddLiveIn(V: Op); |
7970 | }; |
7971 | return map_range(C&: Operands, F: Fn); |
7972 | } |
7973 | |
/// Compute (and cache) the mask for the CFG edge Src -> Dst: the source
/// block's mask AND'ed with the branch condition selecting Dst, or nullptr
/// when the edge is unconditionally taken (all-one mask).
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge" );

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = getBlockInMask(BB: Src);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Val: Src->getTerminator());
  assert(BI && "Unexpected terminator found" );

  // An unconditional branch (or both successors equal) doesn't restrict the
  // mask beyond the source block's own mask.
  if (!BI->isConditional() || BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(BB: Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = getVPValueOrAddLiveIn(V: BI->getCondition(), Plan);
  assert(EdgeMask && "No Edge Mask found for condition" );

  // The false edge is guarded by the negated condition.
  if (BI->getSuccessor(i: 0) != Dst)
    EdgeMask = Builder.createNot(Operand: EdgeMask, DL: BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan.getOrAddLiveIn(
        V: ConstantInt::getFalse(Ty: BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(Cond: SrcMask, TrueVal: EdgeMask, FalseVal: False, DL: BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}
8017 | |
8018 | VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const { |
8019 | assert(is_contained(predecessors(Dst), Src) && "Invalid edge" ); |
8020 | |
8021 | // Look for cached value. |
8022 | std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst); |
8023 | EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Val: Edge); |
8024 | assert(ECEntryIt != EdgeMaskCache.end() && |
8025 | "looking up mask for edge which has not been created" ); |
8026 | return ECEntryIt->second; |
8027 | } |
8028 | |
8029 | void VPRecipeBuilder::() { |
8030 | BasicBlock * = OrigLoop->getHeader(); |
8031 | |
8032 | // When not folding the tail, use nullptr to model all-true mask. |
8033 | if (!CM.foldTailByMasking()) { |
8034 | BlockMaskCache[Header] = nullptr; |
8035 | return; |
8036 | } |
8037 | |
8038 | // Introduce the early-exit compare IV <= BTC to form header block mask. |
8039 | // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by |
8040 | // constructing the desired canonical IV in the header block as its first |
8041 | // non-phi instructions. |
8042 | |
8043 | VPBasicBlock * = Plan.getVectorLoopRegion()->getEntryBasicBlock(); |
8044 | auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); |
8045 | auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); |
8046 | HeaderVPBB->insert(Recipe: IV, InsertPt: NewInsertionPoint); |
8047 | |
8048 | VPBuilder::InsertPointGuard Guard(Builder); |
8049 | Builder.setInsertPoint(TheBB: HeaderVPBB, IP: NewInsertionPoint); |
8050 | VPValue *BlockMask = nullptr; |
8051 | VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); |
8052 | BlockMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC); |
8053 | BlockMaskCache[Header] = BlockMask; |
8054 | } |
8055 | |
8056 | VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const { |
8057 | // Return the cached value. |
8058 | BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(Val: BB); |
8059 | assert(BCEntryIt != BlockMaskCache.end() && |
8060 | "Trying to access mask for block without one." ); |
8061 | return BCEntryIt->second; |
8062 | } |
8063 | |
// Compute and cache the in-mask for non-header block \p BB as the OR of the
// masks of all incoming edges, creating edge masks on demand. Following the
// convention for masked load/store/gather/scatter, a nullptr mask models an
// all-one mask.
void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop" );
  assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed" );
  assert(OrigLoop->getHeader() != BB &&
         "Loop header must have cached block mask" );

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;
  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Src: Predecessor, Dst: BB);
    if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
      // Cache the nullptr (all-one) mask and stop early: OR with all-one is
      // all-one regardless of the remaining edges.
      BlockMaskCache[BB] = EdgeMask;
      return;
    }

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(LHS: BlockMask, RHS: EdgeMask, DL: {});
  }

  BlockMaskCache[BB] = BlockMask;
}
8091 | |
// Check if load/store \p I should be widened for the VFs in \p Range
// (clamping the range per the cost model's decisions) and, if so, build a
// widened load/store recipe, attaching a mask when required and a vector
// pointer recipe for (reverse-)consecutive accesses. Returns nullptr if the
// access will not be widened.
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store" );

  // Widen unless the cost model decided to scalarize the access for this VF;
  // interleaved accesses also count as widened here.
  auto willWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point." );
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: willWiden, Range))
    return nullptr;

  // Masked accesses use the in-mask of the block containing the access.
  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(BB: I->getParent());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // For a load the pointer is operand 0; for a store it is operand 1
  // (operand 0 is the stored value).
  VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    // Inbounds is only propagated when the underlying pointer is a GEP that
    // carries it.
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    auto *VectorPtr = new VPVectorPointerRecipe(
        Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
        I->getDebugLoc());
    Builder.getInsertBlock()->appendRecipe(Recipe: VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(Val: I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, I->getDebugLoc());
}
8144 | |
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe. \p PhiOrTrunc
/// is either \p Phi itself or a truncate of it, and selects the recipe
/// variant built; \p Start is the VPValue for the induction's start value.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
                            VFRange &Range) {
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant" );

  // Materialize the induction's (loop-invariant) step SCEV as a VPValue.
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here" );
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
}
8165 | |
// Try to build an optimized recipe for header phi \p Phi: a widened int/fp
// induction recipe if Legal has an integer or fp induction descriptor for it,
// or a widened pointer induction recipe if it is a pointer induction.
// Returns nullptr if \p Phi is not a recognized induction.
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
                                       SE&: *PSE.getSE(), OrigLoop&: *OrigLoop, Range);

  // Check if this is pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
                                                           SE&: *PSE.getSE());
    // The last constructor argument records (and clamps Range to) whether the
    // phi is scalar after vectorization for the VFs in Range.
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            Predicate: [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(I: Phi, VF);
            },
            Range));
  }
  return nullptr;
}
8189 | |
// Try to fold truncate \p I of an induction phi into a widened induction
// recipe of the truncated type, for the VFs in \p Range (clamping the range
// per the cost model). Returns nullptr if the truncate is not optimizable.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto isOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: isOptimizableIVTruncate(I), Range)) {

    // Build the induction recipe from the truncate's phi operand, passing the
    // truncate as PhiOrTrunc so the recipe uses the narrower type.
    auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
    return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
                                       OrigLoop&: *OrigLoop, Range);
  }
  return nullptr;
}
8217 | |
// Build a VPBlendRecipe for non-header phi \p Phi: each incoming value is
// paired with the mask of its incoming edge, except the first one, whose mask
// is skipped (see TODO below).
VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
                                           ArrayRef<VPValue *> Operands) {
  unsigned NumIncoming = Phi->getNumIncomingValues();

  // We know that all PHIs in non-header blocks are converted into selects, so
  // we don't have to worry about the insertion order and we can just use the
  // builder. At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.
  // TODO: At the moment the first mask is always skipped, but it would be
  // better to skip the most expensive mask.
  SmallVector<VPValue *, 2> OperandsWithMask;

  for (unsigned In = 0; In < NumIncoming; In++) {
    OperandsWithMask.push_back(Elt: Operands[In]);
    VPValue *EdgeMask =
        getEdgeMask(Src: Phi->getIncomingBlock(i: In), Dst: Phi->getParent());
    if (!EdgeMask) {
      // A nullptr edge mask means the edge is taken unconditionally, which is
      // only consistent if it is the sole (first) incoming or all incoming
      // values are identical.
      assert(In == 0 && "Both null and non-null edge masks found" );
      assert(all_equal(Operands) &&
             "Distinct incoming values with one having a full mask" );
      break;
    }
    if (In == 0)
      continue;
    OperandsWithMask.push_back(Elt: EdgeMask);
  }
  return new VPBlendRecipe(Phi, OperandsWithMask);
}
8247 | |
// Try to widen call \p CI for the VFs in \p Range, either as a vector
// intrinsic or as a call to a vectorized library variant. Returns nullptr
// when the call must be handled elsewhere: it is scalar-with-predication, it
// is one of the assume-like intrinsics listed below, or no profitable vector
// form exists.
VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are not widened; they are handled by other means (e.g.
  // dropped or replicated).
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Only the call's actual arguments become operands; trailing operands (such
  // as the called function) are dropped.
  SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()), ID,
                                 CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(I: CI))
        Mask = getBlockInMask(BB: CI->getParent());
      else
        Mask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(
            Ty: IntegerType::getInt1Ty(C&: Variant->getFunctionType()->getContext())));

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    return new VPWidenCallRecipe(*CI, make_range(x: Ops.begin(), y: Ops.end()),
                                 Intrinsic::not_intrinsic, CI->getDebugLoc(),
                                 Variant);
  }

  return nullptr;
}
8338 | |
8339 | bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { |
8340 | assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) && |
8341 | !isa<StoreInst>(I) && "Instruction should have been handled earlier" ); |
8342 | // Instruction should be widened, unless it is scalar after vectorization, |
8343 | // scalarization is profitable or it is predicated. |
8344 | auto WillScalarize = [this, I](ElementCount VF) -> bool { |
8345 | return CM.isScalarAfterVectorization(I, VF) || |
8346 | CM.isProfitableToScalarize(I, VF) || |
8347 | CM.isScalarWithPredication(I, VF); |
8348 | }; |
8349 | return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize, |
8350 | Range); |
8351 | } |
8352 | |
// Build a VPWidenRecipe for \p I if its opcode is one of the widenable
// opcodes handled below; returns nullptr otherwise. Predicated div/rem get a
// safe divisor formed with a select under the block mask before widening.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
                                           ArrayRef<VPValue *> Operands,
                                           VPBasicBlock *VPBB) {
  switch (I->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before widening the
    // div/rem operation itself. Otherwise fall through to general handling below.
    if (CM.isPredicatedInst(I)) {
      // Replace the divisor with 'select mask, divisor, 1' so masked-off
      // lanes divide by 1 instead of the original (possibly unsafe) value.
      SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
      VPValue *Mask = getBlockInMask(BB: I->getParent());
      VPValue *One =
          Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
      auto *SafeRHS =
          new VPInstruction(Instruction::Select, {Mask, Ops[1], One},
                            I->getDebugLoc());
      VPBB->appendRecipe(Recipe: SafeRHS);
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, make_range(x: Ops.begin(), y: Ops.end()));
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, make_range(x: Operands.begin(), y: Operands.end()));
  };
}
8401 | |
8402 | void VPRecipeBuilder::() { |
8403 | BasicBlock *OrigLatch = OrigLoop->getLoopLatch(); |
8404 | for (VPHeaderPHIRecipe *R : PhisToFix) { |
8405 | auto *PN = cast<PHINode>(Val: R->getUnderlyingValue()); |
8406 | VPRecipeBase *IncR = |
8407 | getRecipe(I: cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: OrigLatch))); |
8408 | R->addOperand(Operand: IncR->getVPSingleValue()); |
8409 | } |
8410 | } |
8411 | |
// Build a VPReplicateRecipe scalarizing \p I. The recipe is marked uniform
// when the cost model considers \p I uniform-after-vectorization for the VFs
// in \p Range (clamping the range), and predicated instructions additionally
// receive their block's in-mask as an operand.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
                                                      VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still be better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n" );
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n" );
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(BB: I->getParent());
  }

  auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(Operands: I->operands()),
                                       IsUniform, BlockInMask);
  return Recipe;
}
8467 | |
// Dispatch \p Instr to the recipe-creation helpers: header phis become
// induction/reduction/recurrence phi recipes, other phis become blends, then
// induction truncates, calls, memory operations, GEPs, selects and casts are
// tried in turn, falling back to tryToWiden. Returns nullptr if no widen
// recipe applies (the caller then scalarizes via handleReplication).
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPBasicBlock *VPBB) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Val: Instr)) {
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands);

    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    // Non-induction header phis must be reductions or fixed-order
    // recurrences; both take the start value as their first operand.
    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here" );
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(PN: Phi)) {
      const RecurrenceDescriptor &RdxDesc =
          Legal->getReductionVars().find(Key: Phi)->second;
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                           CM.isInLoopReduction(Phi),
                                           CM.useOrderedReductions(RdxDesc));
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }

    // The backedge operand is added later by fixHeaderPhis, once all recipes
    // exist.
    PhisToFix.push_back(Elt: PhiRecipe);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                     I: cast<TruncInst>(Val: Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Val: Instr))
    return tryToWidenCall(CI, Operands, Range);

  if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
    return tryToWidenMemory(I: Instr, Operands, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
    return new VPWidenGEPRecipe(GEP,
                                make_range(x: Operands.begin(), y: Operands.end()));

  if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
    return new VPWidenSelectRecipe(
        *SI, make_range(x: Operands.begin(), y: Operands.end()));
  }

  if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  return tryToWiden(I: Instr, Operands, VPBB);
}
8541 | |
// Build VPlans covering all VFs in [MinVF, MaxVF]: repeatedly try to build a
// plan for the sub-range starting at VF (recipe construction may clamp the
// sub-range's end), optimize each resulting plan, and collect it in VPlans.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected." );

  // MaxVF * 2 serves as the exclusive upper bound of the VF iteration.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(Range&: SubRange)) {
      // Now optimize the initial VPlan.
      if (!Plan->hasVF(VF: ElementCount::getFixed(MinVal: 1)))
        VPlanTransforms::truncateToMinimalBitwidths(
            Plan&: *Plan, MinBWs: CM.getMinimalBitwidths(), Ctx&: PSE.getSE()->getContext());
      VPlanTransforms::optimize(Plan&: *Plan, SE&: *PSE.getSE());
      // TODO: try to put it close to addActiveLaneMask().
      if (CM.foldTailWithEVL())
        VPlanTransforms::addExplicitVectorLength(Plan&: *Plan);
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" );
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue from where the (possibly clamped) sub-range ended.
    VF = SubRange.End;
  }
}
8564 | |
8565 | // Add the necessary canonical IV and branch recipes required to control the |
8566 | // loop. |
8567 | static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, |
8568 | DebugLoc DL) { |
8569 | Value *StartIdx = ConstantInt::get(Ty: IdxTy, V: 0); |
8570 | auto *StartV = Plan.getOrAddLiveIn(V: StartIdx); |
8571 | |
8572 | // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. |
8573 | auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); |
8574 | VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); |
8575 | VPBasicBlock * = TopRegion->getEntryBasicBlock(); |
8576 | Header->insert(Recipe: CanonicalIVPHI, InsertPt: Header->begin()); |
8577 | |
8578 | VPBuilder Builder(TopRegion->getExitingBasicBlock()); |
8579 | // Add a VPInstruction to increment the scalar canonical IV by VF * UF. |
8580 | auto *CanonicalIVIncrement = Builder.createOverflowingOp( |
8581 | Opcode: Instruction::Add, Operands: {CanonicalIVPHI, &Plan.getVFxUF()}, WrapFlags: {HasNUW, false}, DL, |
8582 | Name: "index.next" ); |
8583 | CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement); |
8584 | |
8585 | // Add the BranchOnCount VPInstruction to the latch. |
8586 | Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount, |
8587 | Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); |
8588 | } |
8589 | |
8590 | // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the |
8591 | // original exit block. |
8592 | static void addUsersInExitBlock(VPBasicBlock *, Loop *OrigLoop, |
8593 | VPRecipeBuilder &Builder, VPlan &Plan) { |
8594 | BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock(); |
8595 | BasicBlock *ExitingBB = OrigLoop->getExitingBlock(); |
8596 | // Only handle single-exit loops with unique exit blocks for now. |
8597 | if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB) |
8598 | return; |
8599 | |
8600 | // Introduce VPUsers modeling the exit values. |
8601 | for (PHINode &ExitPhi : ExitBB->phis()) { |
8602 | Value *IncomingValue = |
8603 | ExitPhi.getIncomingValueForBlock(BB: ExitingBB); |
8604 | VPValue *V = Builder.getVPValueOrAddLiveIn(V: IncomingValue, Plan); |
8605 | Plan.addLiveOut(PN: &ExitPhi, V); |
8606 | } |
8607 | } |
8608 | |
8609 | VPlanPtr |
8610 | LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { |
8611 | |
8612 | SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; |
8613 | |
8614 | // --------------------------------------------------------------------------- |
8615 | // Build initial VPlan: Scan the body of the loop in a topological order to |
8616 | // visit each basic block after having visited its predecessor basic blocks. |
8617 | // --------------------------------------------------------------------------- |
8618 | |
8619 | // Create initial VPlan skeleton, having a basic block for the pre-header |
8620 | // which contains SCEV expansions that need to happen before the CFG is |
8621 | // modified; a basic block for the vector pre-header, followed by a region for |
8622 | // the vector loop, followed by the middle basic block. The skeleton vector |
8623 | // loop region contains a header and latch basic blocks. |
8624 | VPlanPtr Plan = VPlan::createInitialVPlan( |
8625 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8626 | PSE&: *PSE.getSE()); |
8627 | VPBasicBlock * = new VPBasicBlock("vector.body" ); |
8628 | VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch" ); |
8629 | VPBlockUtils::insertBlockAfter(NewBlock: LatchVPBB, BlockPtr: HeaderVPBB); |
8630 | Plan->getVectorLoopRegion()->setEntry(HeaderVPBB); |
8631 | Plan->getVectorLoopRegion()->setExiting(LatchVPBB); |
8632 | |
8633 | // Don't use getDecisionAndClampRange here, because we don't know the UF |
8634 | // so this function is better to be conservative, rather than to split |
8635 | // it up into different VPlans. |
8636 | // TODO: Consider using getDecisionAndClampRange here to split up VPlans. |
8637 | bool IVUpdateMayOverflow = false; |
8638 | for (ElementCount VF : Range) |
8639 | IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF); |
8640 | |
8641 | DebugLoc DL = getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()); |
8642 | TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); |
8643 | // When not folding the tail, we know that the induction increment will not |
8644 | // overflow. |
8645 | bool HasNUW = Style == TailFoldingStyle::None; |
8646 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, DL); |
8647 | |
8648 | VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder); |
8649 | |
8650 | // --------------------------------------------------------------------------- |
8651 | // Pre-construction: record ingredients whose recipes we'll need to further |
8652 | // process after constructing the initial VPlan. |
8653 | // --------------------------------------------------------------------------- |
8654 | |
8655 | // For each interleave group which is relevant for this (possibly trimmed) |
8656 | // Range, add it to the set of groups to be later applied to the VPlan and add |
8657 | // placeholders for its members' Recipes which we'll be replacing with a |
8658 | // single VPInterleaveRecipe. |
8659 | for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { |
8660 | auto applyIG = [IG, this](ElementCount VF) -> bool { |
8661 | bool Result = (VF.isVector() && // Query is illegal for VF == 1 |
8662 | CM.getWideningDecision(I: IG->getInsertPos(), VF) == |
8663 | LoopVectorizationCostModel::CM_Interleave); |
8664 | // For scalable vectors, the only interleave factor currently supported |
8665 | // is 2 since we require the (de)interleave2 intrinsics instead of |
8666 | // shufflevectors. |
8667 | assert((!Result || !VF.isScalable() || IG->getFactor() == 2) && |
8668 | "Unsupported interleave factor for scalable vectors" ); |
8669 | return Result; |
8670 | }; |
8671 | if (!getDecisionAndClampRange(Predicate: applyIG, Range)) |
8672 | continue; |
8673 | InterleaveGroups.insert(Ptr: IG); |
8674 | }; |
8675 | |
8676 | // --------------------------------------------------------------------------- |
8677 | // Construct recipes for the instructions in the loop |
8678 | // --------------------------------------------------------------------------- |
8679 | |
8680 | // Scan the body of the loop in a topological order to visit each basic block |
8681 | // after having visited its predecessor basic blocks. |
8682 | LoopBlocksDFS DFS(OrigLoop); |
8683 | DFS.perform(LI); |
8684 | |
8685 | VPBasicBlock *VPBB = HeaderVPBB; |
8686 | BasicBlock * = OrigLoop->getHeader(); |
8687 | bool NeedsMasks = |
8688 | CM.foldTailByMasking() || |
8689 | any_of(Range: OrigLoop->blocks(), P: [this, HeaderBB](BasicBlock *BB) { |
8690 | bool NeedsBlends = BB != HeaderBB && !BB->phis().empty(); |
8691 | return Legal->blockNeedsPredication(BB) || NeedsBlends; |
8692 | }); |
8693 | for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) { |
8694 | // Relevant instructions from basic block BB will be grouped into VPRecipe |
8695 | // ingredients and fill a new VPBasicBlock. |
8696 | if (VPBB != HeaderVPBB) |
8697 | VPBB->setName(BB->getName()); |
8698 | Builder.setInsertPoint(VPBB); |
8699 | |
8700 | if (VPBB == HeaderVPBB) |
8701 | RecipeBuilder.createHeaderMask(); |
8702 | else if (NeedsMasks) |
8703 | RecipeBuilder.createBlockInMask(BB); |
8704 | |
8705 | // Introduce each ingredient into VPlan. |
8706 | // TODO: Model and preserve debug intrinsics in VPlan. |
8707 | for (Instruction &I : drop_end(RangeOrContainer: BB->instructionsWithoutDebug(SkipPseudoOp: false))) { |
8708 | Instruction *Instr = &I; |
8709 | SmallVector<VPValue *, 4> Operands; |
8710 | auto *Phi = dyn_cast<PHINode>(Val: Instr); |
8711 | if (Phi && Phi->getParent() == HeaderBB) { |
8712 | Operands.push_back(Elt: Plan->getOrAddLiveIn( |
8713 | V: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopPreheader()))); |
8714 | } else { |
8715 | auto OpRange = RecipeBuilder.mapToVPValues(Operands: Instr->operands()); |
8716 | Operands = {OpRange.begin(), OpRange.end()}; |
8717 | } |
8718 | |
8719 | // Invariant stores inside loop will be deleted and a single store |
8720 | // with the final reduction value will be added to the exit block |
8721 | StoreInst *SI; |
8722 | if ((SI = dyn_cast<StoreInst>(Val: &I)) && |
8723 | Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) |
8724 | continue; |
8725 | |
8726 | VPRecipeBase *Recipe = |
8727 | RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB); |
8728 | if (!Recipe) |
8729 | Recipe = RecipeBuilder.handleReplication(I: Instr, Range); |
8730 | |
8731 | RecipeBuilder.setRecipe(I: Instr, R: Recipe); |
8732 | if (isa<VPHeaderPHIRecipe>(Val: Recipe)) { |
8733 | // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In |
8734 | // the following cases, VPHeaderPHIRecipes may be created after non-phi |
8735 | // recipes and need to be moved to the phi section of HeaderVPBB: |
8736 | // * tail-folding (non-phi recipes computing the header mask are |
8737 | // introduced earlier than regular header phi recipes, and should appear |
8738 | // after them) |
8739 | // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. |
8740 | |
8741 | assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || |
8742 | CM.foldTailByMasking() || isa<TruncInst>(Instr)) && |
8743 | "unexpected recipe needs moving" ); |
8744 | Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi()); |
8745 | } else |
8746 | VPBB->appendRecipe(Recipe); |
8747 | } |
8748 | |
8749 | VPBlockUtils::insertBlockAfter(NewBlock: new VPBasicBlock(), BlockPtr: VPBB); |
8750 | VPBB = cast<VPBasicBlock>(Val: VPBB->getSingleSuccessor()); |
8751 | } |
8752 | |
8753 | // After here, VPBB should not be used. |
8754 | VPBB = nullptr; |
8755 | |
8756 | if (CM.requiresScalarEpilogue(Range)) { |
8757 | // No edge from the middle block to the unique exit block has been inserted |
8758 | // and there is nothing to fix from vector loop; phis should have incoming |
8759 | // from scalar loop only. |
8760 | } else |
8761 | addUsersInExitBlock(HeaderVPBB, OrigLoop, Builder&: RecipeBuilder, Plan&: *Plan); |
8762 | |
8763 | assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) && |
8764 | !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && |
8765 | "entry block must be set to a VPRegionBlock having a non-empty entry " |
8766 | "VPBasicBlock" ); |
8767 | RecipeBuilder.fixHeaderPhis(); |
8768 | |
8769 | // --------------------------------------------------------------------------- |
8770 | // Transform initial VPlan: Apply previously taken decisions, in order, to |
8771 | // bring the VPlan to its final state. |
8772 | // --------------------------------------------------------------------------- |
8773 | |
8774 | // Adjust the recipes for any inloop reductions. |
8775 | adjustRecipesForReductions(LatchVPBB, Plan, RecipeBuilder, MinVF: Range.Start); |
8776 | |
8777 | // Interleave memory: for each Interleave Group we marked earlier as relevant |
8778 | // for this VPlan, replace the Recipes widening its memory instructions with a |
8779 | // single VPInterleaveRecipe at its insertion point. |
8780 | for (const auto *IG : InterleaveGroups) { |
8781 | auto *Recipe = |
8782 | cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getInsertPos())); |
8783 | SmallVector<VPValue *, 4> StoredValues; |
8784 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8785 | if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) { |
8786 | auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI)); |
8787 | StoredValues.push_back(Elt: StoreR->getStoredValue()); |
8788 | } |
8789 | |
8790 | bool NeedsMaskForGaps = |
8791 | IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed(); |
8792 | auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues, |
8793 | Recipe->getMask(), NeedsMaskForGaps); |
8794 | VPIG->insertBefore(InsertPos: Recipe); |
8795 | unsigned J = 0; |
8796 | for (unsigned i = 0; i < IG->getFactor(); ++i) |
8797 | if (Instruction *Member = IG->getMember(Index: i)) { |
8798 | VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member); |
8799 | if (!Member->getType()->isVoidTy()) { |
8800 | VPValue *OriginalV = MemberR->getVPSingleValue(); |
8801 | OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J)); |
8802 | J++; |
8803 | } |
8804 | MemberR->eraseFromParent(); |
8805 | } |
8806 | } |
8807 | |
8808 | for (ElementCount VF : Range) |
8809 | Plan->addVF(VF); |
8810 | Plan->setName("Initial VPlan" ); |
8811 | |
8812 | // Replace VPValues for known constant strides guaranteed by predicate scalar |
8813 | // evolution. |
8814 | for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) { |
8815 | auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue(); |
8816 | auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV)); |
8817 | // Only handle constant strides for now. |
8818 | if (!ScevStride) |
8819 | continue; |
8820 | Constant *CI = ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()); |
8821 | |
8822 | auto *ConstVPV = Plan->getOrAddLiveIn(V: CI); |
8823 | // The versioned value may not be used in the loop directly, so just add a |
8824 | // new live-in in those cases. |
8825 | Plan->getOrAddLiveIn(V: StrideV)->replaceAllUsesWith(New: ConstVPV); |
8826 | } |
8827 | |
8828 | VPlanTransforms::dropPoisonGeneratingRecipes(Plan&: *Plan, BlockNeedsPredication: [this](BasicBlock *BB) { |
8829 | return Legal->blockNeedsPredication(BB); |
8830 | }); |
8831 | |
8832 | // Sink users of fixed-order recurrence past the recipe defining the previous |
8833 | // value and introduce FirstOrderRecurrenceSplice VPInstructions. |
8834 | if (!VPlanTransforms::adjustFixedOrderRecurrences(Plan&: *Plan, Builder)) |
8835 | return nullptr; |
8836 | |
8837 | if (useActiveLaneMask(Style)) { |
8838 | // TODO: Move checks to VPlanTransforms::addActiveLaneMask once |
8839 | // TailFoldingStyle is visible there. |
8840 | bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); |
8841 | bool WithoutRuntimeCheck = |
8842 | Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; |
8843 | VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow, |
8844 | DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck); |
8845 | } |
8846 | return Plan; |
8847 | } |
8848 | |
8849 | VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { |
8850 | // Outer loop handling: They may require CFG and instruction level |
8851 | // transformations before even evaluating whether vectorization is profitable. |
8852 | // Since we cannot modify the incoming IR, we need to build VPlan upfront in |
8853 | // the vectorization pipeline. |
8854 | assert(!OrigLoop->isInnermost()); |
8855 | assert(EnableVPlanNativePath && "VPlan-native path is not enabled." ); |
8856 | |
8857 | // Create new empty VPlan |
8858 | auto Plan = VPlan::createInitialVPlan( |
8859 | TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop), |
8860 | PSE&: *PSE.getSE()); |
8861 | |
8862 | // Build hierarchical CFG |
8863 | VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); |
8864 | HCFGBuilder.buildHierarchicalCFG(); |
8865 | |
8866 | for (ElementCount VF : Range) |
8867 | Plan->addVF(VF); |
8868 | |
8869 | VPlanTransforms::VPInstructionsToVPRecipes( |
8870 | Plan, |
8871 | GetIntOrFpInductionDescriptor: [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(Phi: P); }, |
8872 | SE&: *PSE.getSE(), TLI: *TLI); |
8873 | |
8874 | // Remove the existing terminator of the exiting block of the top-most region. |
8875 | // A BranchOnCount will be added instead when adding the canonical IV recipes. |
8876 | auto *Term = |
8877 | Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); |
8878 | Term->eraseFromParent(); |
8879 | |
8880 | // Tail folding is not supported for outer loops, so the induction increment |
8881 | // is guaranteed to not wrap. |
8882 | bool HasNUW = true; |
8883 | addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, |
8884 | DL: DebugLoc()); |
8885 | assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid" ); |
8886 | return Plan; |
8887 | } |
8888 | |
8889 | // Adjust the recipes for reductions. For in-loop reductions the chain of |
8890 | // instructions leading from the loop exit instr to the phi need to be converted |
8891 | // to reductions, with one operand being vector and the other being the scalar |
8892 | // reduction chain. For other reductions, a select is introduced between the phi |
8893 | // and live-out recipes when folding the tail. |
8894 | // |
8895 | // A ComputeReductionResult recipe is added to the middle block, also for |
8896 | // in-loop reductions which compute their result in-loop, because generating |
8897 | // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. |
8898 | void LoopVectorizationPlanner::adjustRecipesForReductions( |
8899 | VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, |
8900 | ElementCount MinVF) { |
8901 | VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion(); |
8902 | VPBasicBlock * = VectorLoopRegion->getEntryBasicBlock(); |
8903 | // Gather all VPReductionPHIRecipe and sort them so that Intermediate stores |
8904 | // sank outside of the loop would keep the same order as they had in the |
8905 | // original loop. |
8906 | SmallVector<VPReductionPHIRecipe *> ReductionPHIList; |
8907 | for (VPRecipeBase &R : Header->phis()) { |
8908 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) |
8909 | ReductionPHIList.emplace_back(Args&: ReductionPhi); |
8910 | } |
8911 | bool HasIntermediateStore = false; |
8912 | stable_sort(Range&: ReductionPHIList, |
8913 | C: [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1, |
8914 | const VPReductionPHIRecipe *R2) { |
8915 | auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore; |
8916 | auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore; |
8917 | HasIntermediateStore |= IS1 || IS2; |
8918 | |
8919 | // If neither of the recipes has an intermediate store, keep the |
8920 | // order the same. |
8921 | if (!IS1 && !IS2) |
8922 | return false; |
8923 | |
8924 | // If only one of the recipes has an intermediate store, then |
8925 | // move it towards the beginning of the list. |
8926 | if (IS1 && !IS2) |
8927 | return true; |
8928 | |
8929 | if (!IS1 && IS2) |
8930 | return false; |
8931 | |
8932 | // If both recipes have an intermediate store, then the recipe |
8933 | // with the later store should be processed earlier. So it |
8934 | // should go to the beginning of the list. |
8935 | return DT->dominates(Def: IS2, User: IS1); |
8936 | }); |
8937 | |
8938 | if (HasIntermediateStore && ReductionPHIList.size() > 1) |
8939 | for (VPRecipeBase *R : ReductionPHIList) |
8940 | R->moveBefore(BB&: *Header, I: Header->getFirstNonPhi()); |
8941 | |
8942 | for (VPRecipeBase &R : Header->phis()) { |
8943 | auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
8944 | if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered())) |
8945 | continue; |
8946 | |
8947 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
8948 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
8949 | assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) && |
8950 | "AnyOf reductions are not allowed for in-loop reductions" ); |
8951 | |
8952 | // Collect the chain of "link" recipes for the reduction starting at PhiR. |
8953 | SetVector<VPSingleDefRecipe *> Worklist; |
8954 | Worklist.insert(X: PhiR); |
8955 | for (unsigned I = 0; I != Worklist.size(); ++I) { |
8956 | VPSingleDefRecipe *Cur = Worklist[I]; |
8957 | for (VPUser *U : Cur->users()) { |
8958 | auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(Val: U); |
8959 | if (!UserRecipe) { |
8960 | assert(isa<VPLiveOut>(U) && |
8961 | "U must either be a VPSingleDef or VPLiveOut" ); |
8962 | continue; |
8963 | } |
8964 | Worklist.insert(X: UserRecipe); |
8965 | } |
8966 | } |
8967 | |
8968 | // Visit operation "Links" along the reduction chain top-down starting from |
8969 | // the phi until LoopExitValue. We keep track of the previous item |
8970 | // (PreviousLink) to tell which of the two operands of a Link will remain |
8971 | // scalar and which will be reduced. For minmax by select(cmp), Link will be |
8972 | // the select instructions. Blend recipes of in-loop reduction phi's will |
8973 | // get folded to their non-phi operand, as the reduction recipe handles the |
8974 | // condition directly. |
8975 | VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. |
8976 | for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { |
8977 | Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); |
8978 | |
8979 | // Index of the first operand which holds a non-mask vector operand. |
8980 | unsigned IndexOfFirstOperand; |
8981 | // Recognize a call to the llvm.fmuladd intrinsic. |
8982 | bool IsFMulAdd = (Kind == RecurKind::FMulAdd); |
8983 | VPValue *VecOp; |
8984 | VPBasicBlock *LinkVPBB = CurrentLink->getParent(); |
8985 | if (IsFMulAdd) { |
8986 | assert( |
8987 | RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) && |
8988 | "Expected instruction to be a call to the llvm.fmuladd intrinsic" ); |
8989 | assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) || |
8990 | isa<VPWidenCallRecipe>(CurrentLink)) && |
8991 | CurrentLink->getOperand(2) == PreviousLink && |
8992 | "expected a call where the previous link is the added operand" ); |
8993 | |
8994 | // If the instruction is a call to the llvm.fmuladd intrinsic then we |
8995 | // need to create an fmul recipe (multiplying the first two operands of |
8996 | // the fmuladd together) to use as the vector operand for the fadd |
8997 | // reduction. |
8998 | VPInstruction *FMulRecipe = new VPInstruction( |
8999 | Instruction::FMul, |
9000 | {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)}, |
9001 | CurrentLinkI->getFastMathFlags()); |
9002 | LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator()); |
9003 | VecOp = FMulRecipe; |
9004 | } else { |
9005 | auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink); |
9006 | if (PhiR->isInLoop() && Blend) { |
9007 | assert(Blend->getNumIncomingValues() == 2 && |
9008 | "Blend must have 2 incoming values" ); |
9009 | if (Blend->getIncomingValue(Idx: 0) == PhiR) |
9010 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1)); |
9011 | else { |
9012 | assert(Blend->getIncomingValue(1) == PhiR && |
9013 | "PhiR must be an operand of the blend" ); |
9014 | Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0)); |
9015 | } |
9016 | continue; |
9017 | } |
9018 | |
9019 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
9020 | if (isa<VPWidenRecipe>(Val: CurrentLink)) { |
9021 | assert(isa<CmpInst>(CurrentLinkI) && |
9022 | "need to have the compare of the select" ); |
9023 | continue; |
9024 | } |
9025 | assert(isa<VPWidenSelectRecipe>(CurrentLink) && |
9026 | "must be a select recipe" ); |
9027 | IndexOfFirstOperand = 1; |
9028 | } else { |
9029 | assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) && |
9030 | "Expected to replace a VPWidenSC" ); |
9031 | IndexOfFirstOperand = 0; |
9032 | } |
9033 | // Note that for non-commutable operands (cmp-selects), the semantics of |
9034 | // the cmp-select are captured in the recurrence kind. |
9035 | unsigned VecOpId = |
9036 | CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink |
9037 | ? IndexOfFirstOperand + 1 |
9038 | : IndexOfFirstOperand; |
9039 | VecOp = CurrentLink->getOperand(N: VecOpId); |
9040 | assert(VecOp != PreviousLink && |
9041 | CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - |
9042 | (VecOpId - IndexOfFirstOperand)) == |
9043 | PreviousLink && |
9044 | "PreviousLink must be the operand other than VecOp" ); |
9045 | } |
9046 | |
9047 | BasicBlock *BB = CurrentLinkI->getParent(); |
9048 | VPValue *CondOp = nullptr; |
9049 | if (CM.blockNeedsPredicationForAnyReason(BB)) |
9050 | CondOp = RecipeBuilder.getBlockInMask(BB); |
9051 | |
9052 | VPReductionRecipe *RedRecipe = |
9053 | new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp, |
9054 | CondOp, CM.useOrderedReductions(RdxDesc)); |
9055 | // Append the recipe to the end of the VPBasicBlock because we need to |
9056 | // ensure that it comes after all of it's inputs, including CondOp. |
9057 | // Note that this transformation may leave over dead recipes (including |
9058 | // CurrentLink), which will be cleaned by a later VPlan transform. |
9059 | LinkVPBB->appendRecipe(Recipe: RedRecipe); |
9060 | CurrentLink->replaceAllUsesWith(New: RedRecipe); |
9061 | PreviousLink = RedRecipe; |
9062 | } |
9063 | } |
9064 | Builder.setInsertPoint(&*LatchVPBB->begin()); |
9065 | for (VPRecipeBase &R : |
9066 | Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
9067 | VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R); |
9068 | if (!PhiR) |
9069 | continue; |
9070 | |
9071 | const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); |
9072 | // If tail is folded by masking, introduce selects between the phi |
9073 | // and the live-out instruction of each reduction, at the beginning of the |
9074 | // dedicated latch block. |
9075 | auto *OrigExitingVPV = PhiR->getBackedgeValue(); |
9076 | auto *NewExitingVPV = PhiR->getBackedgeValue(); |
9077 | if (!PhiR->isInLoop() && CM.foldTailByMasking()) { |
9078 | VPValue *Cond = RecipeBuilder.getBlockInMask(BB: OrigLoop->getHeader()); |
9079 | assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB && |
9080 | "reduction recipe must be defined before latch" ); |
9081 | Type *PhiTy = PhiR->getOperand(N: 0)->getLiveInIRValue()->getType(); |
9082 | std::optional<FastMathFlags> FMFs = |
9083 | PhiTy->isFloatingPointTy() |
9084 | ? std::make_optional(t: RdxDesc.getFastMathFlags()) |
9085 | : std::nullopt; |
9086 | NewExitingVPV = |
9087 | Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "" , FMFs); |
9088 | OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) { |
9089 | return isa<VPInstruction>(Val: &U) && |
9090 | cast<VPInstruction>(Val: &U)->getOpcode() == |
9091 | VPInstruction::ComputeReductionResult; |
9092 | }); |
9093 | if (PreferPredicatedReductionSelect || |
9094 | TTI.preferPredicatedReductionSelect( |
9095 | Opcode: PhiR->getRecurrenceDescriptor().getOpcode(), Ty: PhiTy, |
9096 | Flags: TargetTransformInfo::ReductionFlags())) |
9097 | PhiR->setOperand(I: 1, New: NewExitingVPV); |
9098 | } |
9099 | |
9100 | // If the vector reduction can be performed in a smaller type, we truncate |
9101 | // then extend the loop exit value to enable InstCombine to evaluate the |
9102 | // entire expression in the smaller type. |
9103 | Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); |
9104 | if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { |
9105 | assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!" ); |
9106 | Type *RdxTy = RdxDesc.getRecurrenceType(); |
9107 | auto *Trunc = |
9108 | new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy); |
9109 | auto *Extnd = |
9110 | RdxDesc.isSigned() |
9111 | ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy) |
9112 | : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy); |
9113 | |
9114 | Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe()); |
9115 | Extnd->insertAfter(InsertPos: Trunc); |
9116 | if (PhiR->getOperand(N: 1) == NewExitingVPV) |
9117 | PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue()); |
9118 | NewExitingVPV = Extnd; |
9119 | } |
9120 | |
9121 | // We want code in the middle block to appear to execute on the location of |
9122 | // the scalar loop's latch terminator because: (a) it is all compiler |
9123 | // generated, (b) these instructions are always executed after evaluating |
9124 | // the latch conditional branch, and (c) other passes may add new |
9125 | // predecessors which terminate on this line. This is the easiest way to |
9126 | // ensure we don't accidentally cause an extra step back into the loop while |
9127 | // debugging. |
9128 | DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc(); |
9129 | |
9130 | // TODO: At the moment ComputeReductionResult also drives creation of the |
9131 | // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here |
9132 | // even for in-loop reductions, until the reduction resume value handling is |
9133 | // also modeled in VPlan. |
9134 | auto *FinalReductionResult = new VPInstruction( |
9135 | VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL); |
9136 | cast<VPBasicBlock>(Val: VectorLoopRegion->getSingleSuccessor()) |
9137 | ->appendRecipe(Recipe: FinalReductionResult); |
9138 | OrigExitingVPV->replaceUsesWithIf( |
9139 | New: FinalReductionResult, |
9140 | ShouldReplace: [](VPUser &User, unsigned) { return isa<VPLiveOut>(Val: &User); }); |
9141 | } |
9142 | |
9143 | VPlanTransforms::clearReductionWrapFlags(Plan&: *Plan); |
9144 | } |
9145 | |
9146 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
9147 | void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, |
9148 | VPSlotTracker &SlotTracker) const { |
9149 | O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at " ; |
9150 | IG->getInsertPos()->printAsOperand(O, PrintType: false); |
9151 | O << ", " ; |
9152 | getAddr()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9153 | VPValue *Mask = getMask(); |
9154 | if (Mask) { |
9155 | O << ", " ; |
9156 | Mask->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9157 | } |
9158 | |
9159 | unsigned OpIdx = 0; |
9160 | for (unsigned i = 0; i < IG->getFactor(); ++i) { |
9161 | if (!IG->getMember(Index: i)) |
9162 | continue; |
9163 | if (getNumStoreOperands() > 0) { |
9164 | O << "\n" << Indent << " store " ; |
9165 | getOperand(N: 1 + OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9166 | O << " to index " << i; |
9167 | } else { |
9168 | O << "\n" << Indent << " " ; |
9169 | getVPValue(I: OpIdx)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
9170 | O << " = load from index " << i; |
9171 | } |
9172 | ++OpIdx; |
9173 | } |
9174 | } |
9175 | #endif |
9176 | |
/// Generate a widened pointer induction: a scalar pointer phi in the vector
/// loop header that advances by Step * VF * UF per vector iteration, plus one
/// vector GEP per unroll part containing the per-lane addresses.
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!" );
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type." );
  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
         "Recipe should have been replaced" );

  // Anchor the new phi next to the canonical IV phi in the header block.
  auto *IVR = getParent()->getPlan()->getCanonicalIV();
  PHINode *CanonicalIV = cast<PHINode>(Val: State.get(Def: IVR, Part: 0, /*IsScalar*/ true));
  // Offsets are computed in the step's type (an integer type).
  Type *PhiType = IndDesc.getStep()->getType();

  // Build a pointer phi
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();
  PHINode *NewPointerPhi = PHINode::Create(Ty: ScStValueType, NumReservedValues: 2, NameStr: "pointer.phi" ,
                                           InsertBefore: CanonicalIV->getIterator());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
  NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH);

  // A pointer induction, performed by using a gep
  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();

  // Scalar step (operand 1), uniform across lanes (asserted per part below).
  Value *ScalarStepValue = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
  Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF);
  // Total per-vector-iteration advance: Step * (VF * UF), as an i8 GEP offset.
  Value *NumUnrolledElems =
      State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: State.UF));
  Value *InductionGEP = GetElementPtrInst::Create(
      PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
      IdxList: State.Builder.CreateMul(LHS: ScalarStepValue, RHS: NumUnrolledElems), NameStr: "ptr.ind" ,
      InsertBefore: InductionLoc);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH);

  // Create UF many actual address geps that use the pointer
  // phi as base and a vectorized version of the step value
  // (<step*0, ..., step*N>) as offset.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF);
    // First lane of this part starts Part * VF elements past the phi value.
    Value *StartOffsetScalar =
        State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: Part));
    Value *StartOffset =
        State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar);
    // Create a vector of consecutive numbers from zero to VF.
    StartOffset = State.Builder.CreateAdd(
        LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType));

    assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
           "scalar step must be the same across all parts" );
    // Per-lane address: phi + (StartOffset + stepvector) * Step bytes.
    Value *GEP = State.Builder.CreateGEP(
        Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
        IdxList: State.Builder.CreateMul(
            LHS: StartOffset,
            RHS: State.Builder.CreateVectorSplat(EC: State.VF, V: ScalarStepValue),
            Name: "vector.gep" ));
    State.set(Def: this, V: GEP, Part);
  }
}
9240 | |
9241 | void VPDerivedIVRecipe::execute(VPTransformState &State) { |
9242 | assert(!State.Instance && "VPDerivedIVRecipe being replicated." ); |
9243 | |
9244 | // Fast-math-flags propagate from the original induction instruction. |
9245 | IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); |
9246 | if (FPBinOp) |
9247 | State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); |
9248 | |
9249 | Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0)); |
9250 | Value *CanonicalIV = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0)); |
9251 | Value *DerivedIV = emitTransformedIndex( |
9252 | B&: State.Builder, Index: CanonicalIV, StartValue: getStartValue()->getLiveInIRValue(), Step, |
9253 | InductionKind: Kind, InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp)); |
9254 | DerivedIV->setName("offset.idx" ); |
9255 | assert(DerivedIV != CanonicalIV && "IV didn't need transforming?" ); |
9256 | |
9257 | State.set(Def: this, V: DerivedIV, Instance: VPIteration(0, 0)); |
9258 | } |
9259 | |
/// Emit the wide loads/stores for the whole interleave group by delegating to
/// the inner-loop vectorizer; the group is always generated as a unit, never
/// replicated per lane.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated." );
  State.ILV->vectorizeInterleaveGroup(Group: IG, VPDefs: definedValues(), State, Addr: getAddr(),
                                      StoredValues: getStoredValues(), BlockInMask: getMask(),
                                      NeedsMaskForGaps);
}
9266 | |
/// Generate scalar instances of the wrapped instruction. Four cases, in
/// order: a single requested instance (State.Instance set), a recipe uniform
/// across parts, a store to a uniform address (last copy only), and full
/// per-part/per-lane scalarization.
void VPReplicateRecipe::execute(VPTransformState &State) {
  Instruction *UI = getUnderlyingInstr();
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" );
    State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: *State.Instance, State);
    // Insert scalar instance packing it into a vector.
    if (State.VF.isVector() && shouldPack()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable." );
        Value *Poison = PoisonValue::get(
            T: VectorType::get(ElementType: UI->getType(), EC: State.VF));
        State.set(Def: this, V: Poison, Part: State.Instance->Part);
      }
      State.packScalarIntoVectorValue(Def: this, Instance: *State.Instance);
    }
    return;
  }

  if (IsUniform) {
    // If the recipe is uniform across all parts (instead of just per VF), only
    // generate a single instance.
    if ((isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI)) &&
        all_of(Range: operands(), P: [](VPValue *Op) {
          return Op->isDefinedOutsideVectorRegions();
        })) {
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(0, 0), State);
      // Propagate the single generated value to all parts, but only when the
      // recipe has users that will look it up.
      if (user_begin() != user_end()) {
        for (unsigned Part = 1; Part < State.UF; ++Part)
          State.set(Def: this, V: State.get(Def: this, Instance: VPIteration(0, 0)),
                    Instance: VPIteration(Part, 0));
      }
      return;
    }

    // Uniform within VL means we need to generate lane 0 only for each
    // unrolled copy.
    for (unsigned Part = 0; Part < State.UF; ++Part)
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, 0), State);
    return;
  }

  // A store of a loop varying value to a uniform address only needs the last
  // copy of the store.
  if (isa<StoreInst>(Val: UI) &&
      vputils::isUniformAfterVectorization(VPV: getOperand(N: 1))) {
    auto Lane = VPLane::getLastLaneForVF(VF: State.VF);
    State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(State.UF - 1, Lane),
                                    State);
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts.
  assert(!State.VF.isScalable() && "Can't scalarize a scalable vector" );
  const unsigned EndLane = State.VF.getKnownMinValue();
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, Lane), State);
}
9326 | |
/// Generate a wide load per unroll part: a masked gather for non-consecutive
/// accesses, a masked load when a block-in mask applies, or a plain aligned
/// wide load otherwise. Reversed accesses reverse both the mask and the
/// loaded value.
void VPWidenLoadRecipe::execute(VPTransformState &State) {
  auto *LI = cast<LoadInst>(Val: &Ingredient);

  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
  // Non-consecutive accesses are emitted as gathers over per-lane pointers.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    Value *Mask = nullptr;
    if (auto *VPMask = getMask()) {
      // Mask reversal is only needed for non-all-one (null) masks, as reverse
      // of a null all-one mask is a null mask.
      Mask = State.get(Def: VPMask, Part);
      if (isReverse())
        Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" );
    }

    // Gathers take a vector of pointers; consecutive loads take the scalar
    // pointer to the first element.
    Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateGather);
    if (CreateGather) {
      NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
                                         Name: "wide.masked.gather" );
    } else if (Mask) {
      NewLI = Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
                                       PassThru: PoisonValue::get(T: DataTy),
                                       Name: "wide.masked.load" );
    } else {
      NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load" );
    }
    // Add metadata to the load, but setVectorValue to the reverse shuffle.
    State.addMetadata(To: NewLI, From: LI);
    if (Reverse)
      NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse" );
    State.set(Def: this, V: NewLI, Part);
  }
}
9366 | |
// Generate a single vector-predicated (VP) load for the vector loop body.
// Only UF == 1 is supported because the explicit vector length (EVL) operand
// already covers the remaining trip count.
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length." );
  // FIXME: Support reverse loading after vp_reverse is added.
  assert(!isReverse() && "Reverse loads are not implemented yet." );

  auto *LI = cast<LoadInst>(Val: &Ingredient);

  Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
  auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
  // Non-consecutive accesses are widened into a vp.gather instead of a
  // contiguous VP load.
  bool CreateGather = !isConsecutive();

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());
  CallInst *NewLI;
  // The EVL operand is uniform; read the scalar value of lane 0 of part 0.
  Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
  Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateGather);
  // VP intrinsics require an explicit mask; synthesize an all-true splat when
  // the recipe carries none.
  Value *Mask = getMask()
                    ? State.get(Def: getMask(), Part: 0)
                    : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                nullptr, "wide.masked.gather" );
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewLI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
        Opcode: Instruction::Load, ReturnTy: DataTy, VecOpArray: Addr, Name: "vp.op.load" ));
  }
  // Attach the alignment to the pointer argument (operand 0) of the VP
  // load/gather intrinsic.
  NewLI->addParamAttr(
      ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
  State.addMetadata(To: NewLI, From: LI);
  State.set(Def: this, V: NewLI, Part: 0);
}
9403 | |
// Widen a store into either a wide (possibly masked) store for consecutive
// accesses, or a masked scatter for non-consecutive ones, emitting one wide
// instruction per unroll part.
void VPWidenStoreRecipe::execute(VPTransformState &State) {
  auto *SI = cast<StoreInst>(Val: &Ingredient);

  VPValue *StoredVPValue = getStoredValue();
  // Non-consecutive accesses are widened into a scatter of per-lane pointers.
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Instruction *NewSI = nullptr;
    Value *Mask = nullptr;
    if (auto *VPMask = getMask()) {
      // Mask reversal is only needed for non-all-one (null) masks, as reverse
      // of a null all-one mask is a null mask.
      Mask = State.get(Def: VPMask, Part);
      if (isReverse())
        Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse" );
    }

    Value *StoredVal = State.get(Def: StoredVPValue, Part);
    if (isReverse()) {
      // If we store to reverse consecutive memory locations, then we need
      // to reverse the order of elements in the stored value.
      StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse" );
      // We don't want to update the value in the map as it might be used in
      // another expression. So don't call resetVectorValue(StoredVal).
    }
    // For a scatter the address is a vector of pointers; otherwise it is the
    // scalar pointer to the first element of the part.
    Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateScatter);
    if (CreateScatter)
      NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
    else if (Mask)
      NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
    else
      NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
    // Carry the original store's metadata (e.g. aliasing info) over to the
    // widened instruction.
    State.addMetadata(To: NewSI, From: SI);
  }
}
9443 | |
// Generate a single vector-predicated (VP) store for the vector loop body.
// Only UF == 1 is supported because the explicit vector length (EVL) operand
// already covers the remaining trip count.
void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
  assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
                          "explicit vector length." );
  // FIXME: Support reverse stores after vp_reverse is added.
  assert(!isReverse() && "Reverse store are not implemented yet." );

  auto *SI = cast<StoreInst>(Val: &Ingredient);

  VPValue *StoredValue = getStoredValue();
  // Non-consecutive accesses are widened into a vp.scatter instead of a
  // contiguous VP store.
  bool CreateScatter = !isConsecutive();
  const Align Alignment = getLoadStoreAlignment(I: &Ingredient);

  auto &Builder = State.Builder;
  State.setDebugLocFrom(getDebugLoc());

  CallInst *NewSI = nullptr;
  Value *StoredVal = State.get(Def: StoredValue, Part: 0);
  // The EVL operand is uniform; read the scalar value of lane 0 of part 0.
  Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
  // FIXME: Support reverse store after vp_reverse is added.
  // VP intrinsics require an explicit mask; synthesize an all-true splat when
  // the recipe carries none.
  Value *Mask = getMask()
                    ? State.get(Def: getMask(), Part: 0)
                    : Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
  Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateScatter);
  if (CreateScatter) {
    NewSI = Builder.CreateIntrinsic(Type::getVoidTy(C&: EVL->getContext()),
                                    Intrinsic::vp_scatter,
                                    {StoredVal, Addr, Mask, EVL});
  } else {
    VectorBuilder VBuilder(Builder);
    VBuilder.setEVL(EVL).setMask(Mask);
    NewSI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
        Opcode: Instruction::Store, ReturnTy: Type::getVoidTy(C&: EVL->getContext()),
        VecOpArray: {StoredVal, Addr}));
  }
  // Attach the alignment to the pointer argument (operand 1) of the VP
  // store/scatter intrinsic.
  NewSI->addParamAttr(
      ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
  State.addMetadata(To: NewSI, From: SI);
}
9482 | |
9483 | // Determine how to lower the scalar epilogue, which depends on 1) optimising |
9484 | // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing |
9485 | // predication, and 4) a TTI hook that analyses whether the loop is suitable |
9486 | // for predication. |
9487 | static ScalarEpilogueLowering getScalarEpilogueLowering( |
9488 | Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, |
9489 | BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
9490 | LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) { |
9491 | // 1) OptSize takes precedence over all other options, i.e. if this is set, |
9492 | // don't look at hints or options, and don't request a scalar epilogue. |
9493 | // (For PGSO, as shouldOptimizeForSize isn't currently accessible from |
9494 | // LoopAccessInfo (due to code dependency and not being able to reliably get |
9495 | // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection |
9496 | // of strides in LoopAccessInfo::analyzeLoop() and vectorize without |
9497 | // versioning when the vectorization is forced, unlike hasOptSize. So revert |
9498 | // back to the old way and vectorize with versioning when forced. See D81345.) |
9499 | if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI, |
9500 | QueryType: PGSOQueryType::IRPass) && |
9501 | Hints.getForce() != LoopVectorizeHints::FK_Enabled)) |
9502 | return CM_ScalarEpilogueNotAllowedOptSize; |
9503 | |
9504 | // 2) If set, obey the directives |
9505 | if (PreferPredicateOverEpilogue.getNumOccurrences()) { |
9506 | switch (PreferPredicateOverEpilogue) { |
9507 | case PreferPredicateTy::ScalarEpilogue: |
9508 | return CM_ScalarEpilogueAllowed; |
9509 | case PreferPredicateTy::PredicateElseScalarEpilogue: |
9510 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9511 | case PreferPredicateTy::PredicateOrDontVectorize: |
9512 | return CM_ScalarEpilogueNotAllowedUsePredicate; |
9513 | }; |
9514 | } |
9515 | |
9516 | // 3) If set, obey the hints |
9517 | switch (Hints.getPredicate()) { |
9518 | case LoopVectorizeHints::FK_Enabled: |
9519 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9520 | case LoopVectorizeHints::FK_Disabled: |
9521 | return CM_ScalarEpilogueAllowed; |
9522 | }; |
9523 | |
9524 | // 4) if the TTI hook indicates this is profitable, request predication. |
9525 | TailFoldingInfo TFI(TLI, &LVL, IAI); |
9526 | if (TTI->preferPredicateOverEpilogue(TFI: &TFI)) |
9527 | return CM_ScalarEpilogueNotNeededUsePredicate; |
9528 | |
9529 | return CM_ScalarEpilogueAllowed; |
9530 | } |
9531 | |
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
// Returns true if the (outer) loop was vectorized, false otherwise.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // A computable trip count is required for outer-loop vectorization.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n" );
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled." );
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);

  // Scope for the vectorizer and its runtime checks; the actual code
  // generation for the chosen plan happens here.
  {
    bool AddBranchWeights =
        hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
                             F->getParent()->getDataLayout(), AddBranchWeights);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                           VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n" );
    LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
  }

  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
9598 | |
9599 | // Emit a remark if there are stores to floats that required a floating point |
9600 | // extension. If the vectorized loop was generated with floating point there |
9601 | // will be a performance penalty from the conversion overhead and the change in |
9602 | // the vector width. |
9603 | static void (Loop *L, OptimizationRemarkEmitter *ORE) { |
9604 | SmallVector<Instruction *, 4> Worklist; |
9605 | for (BasicBlock *BB : L->getBlocks()) { |
9606 | for (Instruction &Inst : *BB) { |
9607 | if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) { |
9608 | if (S->getValueOperand()->getType()->isFloatTy()) |
9609 | Worklist.push_back(Elt: S); |
9610 | } |
9611 | } |
9612 | } |
9613 | |
9614 | // Traverse the floating point stores upwards searching, for floating point |
9615 | // conversions. |
9616 | SmallPtrSet<const Instruction *, 4> Visited; |
9617 | SmallPtrSet<const Instruction *, 4> ; |
9618 | while (!Worklist.empty()) { |
9619 | auto *I = Worklist.pop_back_val(); |
9620 | if (!L->contains(Inst: I)) |
9621 | continue; |
9622 | if (!Visited.insert(Ptr: I).second) |
9623 | continue; |
9624 | |
9625 | // Emit a remark if the floating point store required a floating |
9626 | // point conversion. |
9627 | // TODO: More work could be done to identify the root cause such as a |
9628 | // constant or a function return type and point the user to it. |
9629 | if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second) |
9630 | ORE->emit(RemarkBuilder: [&]() { |
9631 | return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision" , |
9632 | I->getDebugLoc(), L->getHeader()) |
9633 | << "floating point conversion changes vector width. " |
9634 | << "Mixed floating point precision requires an up/down " |
9635 | << "cast that will negatively impact performance." ; |
9636 | }); |
9637 | |
9638 | for (Use &Op : I->operands()) |
9639 | if (auto *OpI = dyn_cast<Instruction>(Val&: Op)) |
9640 | Worklist.push_back(Elt: OpI); |
9641 | } |
9642 | } |
9643 | |
// Decide whether vectorizing with the generated runtime checks (Checks) is
// still profitable for the chosen factor VF, and set
// VF.MinProfitableTripCount to the minimum trip count required for the vector
// loop (including check overhead) to beat the scalar loop.
// Returns false if the runtime checks make vectorization unprofitable.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
                                       VectorizationFactor &VF,
                                       std::optional<unsigned> VScale, Loop *L,
                                       ScalarEvolution &SE,
                                       ScalarEpilogueLowering SEL) {
  InstructionCost CheckCost = Checks.getCost();
  if (!CheckCost.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    if (CheckCost > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n" );
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = *VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  // The total cost of the scalar loop is
  //   ScalarC * TC
  // where
  // * TC is the actual trip count of the loop.
  // * ScalarC is the cost of a single scalar iteration.
  //
  // The total cost of the vector loop is
  //   RtC + VecC * (TC / VF) + EpiC
  // where
  // * RtC is the cost of the generated runtime checks
  // * VecC is the cost of a single vector iteration.
  // * TC is the actual trip count of the loop
  // * VF is the vectorization factor
  // * EpiCost is the cost of the generated epilogue, including the cost
  //   of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = VF.Width.getKnownMinValue();
  if (VF.Width.isScalable()) {
    // For scalable VFs, scale the known-minimum element count by the assumed
    // vscale (1 if the target provides no tuning value).
    unsigned AssumedMinimumVscale = 1;
    if (VScale)
      AssumedMinimumVscale = *VScale;
    IntVF *= AssumedMinimumVscale;
  }
  uint64_t RtC = *CheckCost.getValue();
  uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
  uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n" );

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
    if (ElementCount::isKnownLT(LHS: ElementCount::getFixed(MinVal: *ExpectedTC),
                                RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n" );

      return false;
    }
  }
  return true;
}
9745 | |
// Initialize the pass from its options: only interleave/vectorize when
// explicitly forced if the option requests it, or if the corresponding
// global enable flag is off.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
9751 | |
9752 | bool LoopVectorizePass::processLoop(Loop *L) { |
9753 | assert((EnableVPlanNativePath || L->isInnermost()) && |
9754 | "VPlan-native path is not enabled. Only process inner loops." ); |
9755 | |
9756 | #ifndef NDEBUG |
9757 | const std::string DebugLocStr = getDebugLocString(L); |
9758 | #endif /* NDEBUG */ |
9759 | |
9760 | LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '" |
9761 | << L->getHeader()->getParent()->getName() << "' from " |
9762 | << DebugLocStr << "\n" ); |
9763 | |
9764 | LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); |
9765 | |
9766 | LLVM_DEBUG( |
9767 | dbgs() << "LV: Loop hints:" |
9768 | << " force=" |
9769 | << (Hints.getForce() == LoopVectorizeHints::FK_Disabled |
9770 | ? "disabled" |
9771 | : (Hints.getForce() == LoopVectorizeHints::FK_Enabled |
9772 | ? "enabled" |
9773 | : "?" )) |
9774 | << " width=" << Hints.getWidth() |
9775 | << " interleave=" << Hints.getInterleave() << "\n" ); |
9776 | |
9777 | // Function containing loop |
9778 | Function *F = L->getHeader()->getParent(); |
9779 | |
9780 | // Looking at the diagnostic output is the only way to determine if a loop |
9781 | // was vectorized (other than looking at the IR or machine code), so it |
9782 | // is important to generate an optimization remark for each loop. Most of |
9783 | // these messages are generated as OptimizationRemarkAnalysis. Remarks |
9784 | // generated as OptimizationRemark and OptimizationRemarkMissed are |
9785 | // less verbose reporting vectorized loops and unvectorized loops that may |
9786 | // benefit from vectorization, respectively. |
9787 | |
9788 | if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { |
9789 | LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n" ); |
9790 | return false; |
9791 | } |
9792 | |
9793 | PredicatedScalarEvolution PSE(*SE, *L); |
9794 | |
9795 | // Check if it is legal to vectorize the loop. |
9796 | LoopVectorizationRequirements Requirements; |
9797 | LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, |
9798 | &Requirements, &Hints, DB, AC, BFI, PSI); |
9799 | if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) { |
9800 | LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n" ); |
9801 | Hints.emitRemarkWithHints(); |
9802 | return false; |
9803 | } |
9804 | |
9805 | // Entrance to the VPlan-native vectorization path. Outer loops are processed |
9806 | // here. They may require CFG and instruction level transformations before |
9807 | // even evaluating whether vectorization is profitable. Since we cannot modify |
9808 | // the incoming IR, we need to build VPlan upfront in the vectorization |
9809 | // pipeline. |
9810 | if (!L->isInnermost()) |
9811 | return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC, |
9812 | ORE, BFI, PSI, Hints, Requirements); |
9813 | |
9814 | assert(L->isInnermost() && "Inner loop expected." ); |
9815 | |
9816 | InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); |
9817 | bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); |
9818 | |
9819 | // If an override option has been passed in for interleaved accesses, use it. |
9820 | if (EnableInterleavedMemAccesses.getNumOccurrences() > 0) |
9821 | UseInterleaved = EnableInterleavedMemAccesses; |
9822 | |
9823 | // Analyze interleaved memory accesses. |
9824 | if (UseInterleaved) |
9825 | IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI)); |
9826 | |
9827 | // Check the function attributes and profiles to find out if this function |
9828 | // should be optimized for size. |
9829 | ScalarEpilogueLowering SEL = |
9830 | getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI); |
9831 | |
9832 | // Check the loop for a trip count threshold: vectorize loops with a tiny trip |
9833 | // count by optimizing for size, to minimize overheads. |
9834 | auto ExpectedTC = getSmallBestKnownTC(SE&: *SE, L); |
9835 | if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { |
9836 | LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " |
9837 | << "This loop is worth vectorizing only if no scalar " |
9838 | << "iteration overheads are incurred." ); |
9839 | if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) |
9840 | LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n" ); |
9841 | else { |
9842 | if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) { |
9843 | LLVM_DEBUG(dbgs() << "\n" ); |
9844 | // Predicate tail-folded loops are efficient even when the loop |
9845 | // iteration count is low. However, setting the epilogue policy to |
9846 | // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops |
9847 | // with runtime checks. It's more effective to let |
9848 | // `areRuntimeChecksProfitable` determine if vectorization is beneficial |
9849 | // for the loop. |
9850 | if (SEL != CM_ScalarEpilogueNotNeededUsePredicate) |
9851 | SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; |
9852 | } else { |
9853 | LLVM_DEBUG(dbgs() << " But the target considers the trip count too " |
9854 | "small to consider vectorizing.\n" ); |
9855 | reportVectorizationFailure( |
9856 | DebugMsg: "The trip count is below the minial threshold value." , |
9857 | OREMsg: "loop trip count is too low, avoiding vectorization" , |
9858 | ORETag: "LowTripCount" , ORE, TheLoop: L); |
9859 | Hints.emitRemarkWithHints(); |
9860 | return false; |
9861 | } |
9862 | } |
9863 | } |
9864 | |
9865 | // Check the function attributes to see if implicit floats or vectors are |
9866 | // allowed. |
9867 | if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { |
9868 | reportVectorizationFailure( |
9869 | DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used" , |
9870 | OREMsg: "loop not vectorized due to NoImplicitFloat attribute" , |
9871 | ORETag: "NoImplicitFloat" , ORE, TheLoop: L); |
9872 | Hints.emitRemarkWithHints(); |
9873 | return false; |
9874 | } |
9875 | |
9876 | // Check if the target supports potentially unsafe FP vectorization. |
9877 | // FIXME: Add a check for the type of safety issue (denormal, signaling) |
9878 | // for the target we're vectorizing for, to make sure none of the |
9879 | // additional fp-math flags can help. |
9880 | if (Hints.isPotentiallyUnsafe() && |
9881 | TTI->isFPVectorizationPotentiallyUnsafe()) { |
9882 | reportVectorizationFailure( |
9883 | DebugMsg: "Potentially unsafe FP op prevents vectorization" , |
9884 | OREMsg: "loop not vectorized due to unsafe FP support." , |
9885 | ORETag: "UnsafeFP" , ORE, TheLoop: L); |
9886 | Hints.emitRemarkWithHints(); |
9887 | return false; |
9888 | } |
9889 | |
9890 | bool AllowOrderedReductions; |
9891 | // If the flag is set, use that instead and override the TTI behaviour. |
9892 | if (ForceOrderedReductions.getNumOccurrences() > 0) |
9893 | AllowOrderedReductions = ForceOrderedReductions; |
9894 | else |
9895 | AllowOrderedReductions = TTI->enableOrderedReductions(); |
9896 | if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) { |
9897 | ORE->emit(RemarkBuilder: [&]() { |
9898 | auto *ExactFPMathInst = Requirements.getExactFPInst(); |
9899 | return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps" , |
9900 | ExactFPMathInst->getDebugLoc(), |
9901 | ExactFPMathInst->getParent()) |
9902 | << "loop not vectorized: cannot prove it is safe to reorder " |
9903 | "floating-point operations" ; |
9904 | }); |
9905 | LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " |
9906 | "reorder floating-point operations\n" ); |
9907 | Hints.emitRemarkWithHints(); |
9908 | return false; |
9909 | } |
9910 | |
9911 | // Use the cost model. |
9912 | LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, |
9913 | F, &Hints, IAI); |
9914 | // Use the planner for vectorization. |
9915 | LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, |
9916 | ORE); |
9917 | |
9918 | // Get user vectorization factor and interleave count. |
9919 | ElementCount UserVF = Hints.getWidth(); |
9920 | unsigned UserIC = Hints.getInterleave(); |
9921 | |
9922 | // Plan how to best vectorize, return the best VF and its cost. |
9923 | std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC); |
9924 | |
9925 | VectorizationFactor VF = VectorizationFactor::Disabled(); |
9926 | unsigned IC = 1; |
9927 | |
9928 | bool AddBranchWeights = |
9929 | hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator()); |
9930 | GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, |
9931 | F->getParent()->getDataLayout(), AddBranchWeights); |
9932 | if (MaybeVF) { |
9933 | VF = *MaybeVF; |
9934 | // Select the interleave count. |
9935 | IC = CM.selectInterleaveCount(VF: VF.Width, LoopCost: VF.Cost); |
9936 | |
9937 | unsigned SelectedIC = std::max(a: IC, b: UserIC); |
9938 | // Optimistically generate runtime checks if they are needed. Drop them if |
9939 | // they turn out to not be profitable. |
9940 | if (VF.Width.isVector() || SelectedIC > 1) |
9941 | Checks.Create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC); |
9942 | |
9943 | // Check if it is profitable to vectorize with runtime checks. |
9944 | bool ForceVectorization = |
9945 | Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
9946 | if (!ForceVectorization && |
9947 | !areRuntimeChecksProfitable(Checks, VF, VScale: getVScaleForTuning(L, TTI: *TTI), L, |
9948 | SE&: *PSE.getSE(), SEL)) { |
9949 | ORE->emit(RemarkBuilder: [&]() { |
9950 | return OptimizationRemarkAnalysisAliasing( |
9951 | DEBUG_TYPE, "CantReorderMemOps" , L->getStartLoc(), |
9952 | L->getHeader()) |
9953 | << "loop not vectorized: cannot prove it is safe to reorder " |
9954 | "memory operations" ; |
9955 | }); |
9956 | LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n" ); |
9957 | Hints.emitRemarkWithHints(); |
9958 | return false; |
9959 | } |
9960 | } |
9961 | |
9962 | // Identify the diagnostic messages that should be produced. |
9963 | std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; |
9964 | bool VectorizeLoop = true, InterleaveLoop = true; |
9965 | if (VF.Width.isScalar()) { |
9966 | LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n" ); |
9967 | VecDiagMsg = std::make_pair( |
9968 | x: "VectorizationNotBeneficial" , |
9969 | y: "the cost-model indicates that vectorization is not beneficial" ); |
9970 | VectorizeLoop = false; |
9971 | } |
9972 | |
9973 | if (!MaybeVF && UserIC > 1) { |
9974 | // Tell the user interleaving was avoided up-front, despite being explicitly |
9975 | // requested. |
9976 | LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and " |
9977 | "interleaving should be avoided up front\n" ); |
9978 | IntDiagMsg = std::make_pair( |
9979 | x: "InterleavingAvoided" , |
9980 | y: "Ignoring UserIC, because interleaving was avoided up front" ); |
9981 | InterleaveLoop = false; |
9982 | } else if (IC == 1 && UserIC <= 1) { |
9983 | // Tell the user interleaving is not beneficial. |
9984 | LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n" ); |
9985 | IntDiagMsg = std::make_pair( |
9986 | x: "InterleavingNotBeneficial" , |
9987 | y: "the cost-model indicates that interleaving is not beneficial" ); |
9988 | InterleaveLoop = false; |
9989 | if (UserIC == 1) { |
9990 | IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled" ; |
9991 | IntDiagMsg.second += |
9992 | " and is explicitly disabled or interleave count is set to 1" ; |
9993 | } |
9994 | } else if (IC > 1 && UserIC == 1) { |
9995 | // Tell the user interleaving is beneficial, but it explicitly disabled. |
9996 | LLVM_DEBUG( |
9997 | dbgs() << "LV: Interleaving is beneficial but is explicitly disabled." ); |
9998 | IntDiagMsg = std::make_pair( |
9999 | x: "InterleavingBeneficialButDisabled" , |
10000 | y: "the cost-model indicates that interleaving is beneficial " |
10001 | "but is explicitly disabled or interleave count is set to 1" ); |
10002 | InterleaveLoop = false; |
10003 | } |
10004 | |
10005 | // Override IC if user provided an interleave count. |
10006 | IC = UserIC > 0 ? UserIC : IC; |
10007 | |
10008 | // Emit diagnostic messages, if any. |
10009 | const char *VAPassName = Hints.vectorizeAnalysisPassName(); |
10010 | if (!VectorizeLoop && !InterleaveLoop) { |
10011 | // Do not vectorize or interleaving the loop. |
10012 | ORE->emit(RemarkBuilder: [&]() { |
10013 | return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first, |
10014 | L->getStartLoc(), L->getHeader()) |
10015 | << VecDiagMsg.second; |
10016 | }); |
10017 | ORE->emit(RemarkBuilder: [&]() { |
10018 | return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first, |
10019 | L->getStartLoc(), L->getHeader()) |
10020 | << IntDiagMsg.second; |
10021 | }); |
10022 | return false; |
10023 | } else if (!VectorizeLoop && InterleaveLoop) { |
10024 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10025 | ORE->emit(RemarkBuilder: [&]() { |
10026 | return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first, |
10027 | L->getStartLoc(), L->getHeader()) |
10028 | << VecDiagMsg.second; |
10029 | }); |
10030 | } else if (VectorizeLoop && !InterleaveLoop) { |
10031 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10032 | << ") in " << DebugLocStr << '\n'); |
10033 | ORE->emit(RemarkBuilder: [&]() { |
10034 | return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, |
10035 | L->getStartLoc(), L->getHeader()) |
10036 | << IntDiagMsg.second; |
10037 | }); |
10038 | } else if (VectorizeLoop && InterleaveLoop) { |
10039 | LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width |
10040 | << ") in " << DebugLocStr << '\n'); |
10041 | LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); |
10042 | } |
10043 | |
10044 | bool DisableRuntimeUnroll = false; |
10045 | MDNode *OrigLoopID = L->getLoopID(); |
10046 | { |
10047 | using namespace ore; |
10048 | if (!VectorizeLoop) { |
10049 | assert(IC > 1 && "interleave count should not be 1 or 0" ); |
10050 | // If we decided that it is not legal to vectorize the loop, then |
10051 | // interleave it. |
10052 | InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, |
10053 | &CM, BFI, PSI, Checks); |
10054 | |
10055 | VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width); |
10056 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, IsEpilogueVectorization: false); |
10057 | |
10058 | ORE->emit(RemarkBuilder: [&]() { |
10059 | return OptimizationRemark(LV_NAME, "Interleaved" , L->getStartLoc(), |
10060 | L->getHeader()) |
10061 | << "interleaved loop (interleaved count: " |
10062 | << NV("InterleaveCount" , IC) << ")" ; |
10063 | }); |
10064 | } else { |
10065 | // If we decided that it is *legal* to vectorize the loop, then do it. |
10066 | |
10067 | // Consider vectorizing the epilogue too if it's profitable. |
10068 | VectorizationFactor EpilogueVF = |
10069 | LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC); |
10070 | if (EpilogueVF.Width.isVector()) { |
10071 | |
10072 | // The first pass vectorizes the main loop and creates a scalar epilogue |
10073 | // to be vectorized by executing the plan (potentially with a different |
10074 | // factor) again shortly afterwards. |
10075 | EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); |
10076 | EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, |
10077 | EPI, &LVL, &CM, BFI, PSI, Checks); |
10078 | |
10079 | std::unique_ptr<VPlan> BestMainPlan( |
10080 | LVP.getBestPlanFor(VF: EPI.MainLoopVF).duplicate()); |
10081 | const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan( |
10082 | BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, IsEpilogueVectorization: true); |
10083 | ++LoopsVectorized; |
10084 | |
10085 | // Second pass vectorizes the epilogue and adjusts the control flow |
10086 | // edges from the first pass. |
10087 | EPI.MainLoopVF = EPI.EpilogueVF; |
10088 | EPI.MainLoopUF = EPI.EpilogueUF; |
10089 | EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, |
10090 | ORE, EPI, &LVL, &CM, BFI, PSI, |
10091 | Checks); |
10092 | |
10093 | VPlan &BestEpiPlan = LVP.getBestPlanFor(VF: EPI.EpilogueVF); |
10094 | VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); |
10095 | VPBasicBlock * = VectorLoop->getEntryBasicBlock(); |
10096 | Header->setName("vec.epilog.vector.body" ); |
10097 | |
10098 | // Re-use the trip count and steps expanded for the main loop, as |
10099 | // skeleton creation needs it as a value that dominates both the scalar |
10100 | // and vector epilogue loops |
10101 | // TODO: This is a workaround needed for epilogue vectorization and it |
10102 | // should be removed once induction resume value creation is done |
10103 | // directly in VPlan. |
10104 | EpilogILV.setTripCount(MainILV.getTripCount()); |
10105 | for (auto &R : make_early_inc_range(Range&: *BestEpiPlan.getPreheader())) { |
10106 | auto *ExpandR = cast<VPExpandSCEVRecipe>(Val: &R); |
10107 | auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn( |
10108 | V: ExpandedSCEVs.find(Val: ExpandR->getSCEV())->second); |
10109 | ExpandR->replaceAllUsesWith(New: ExpandedVal); |
10110 | if (BestEpiPlan.getTripCount() == ExpandR) |
10111 | BestEpiPlan.resetTripCount(NewTripCount: ExpandedVal); |
10112 | ExpandR->eraseFromParent(); |
10113 | } |
10114 | |
10115 | // Ensure that the start values for any VPWidenIntOrFpInductionRecipe, |
10116 | // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated |
10117 | // before vectorizing the epilogue loop. |
10118 | for (VPRecipeBase &R : Header->phis()) { |
10119 | if (isa<VPCanonicalIVPHIRecipe>(Val: &R)) |
10120 | continue; |
10121 | |
10122 | Value *ResumeV = nullptr; |
10123 | // TODO: Move setting of resume values to prepareToExecute. |
10124 | if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) { |
10125 | ResumeV = ReductionResumeValues |
10126 | .find(Val: &ReductionPhi->getRecurrenceDescriptor()) |
10127 | ->second; |
10128 | } else { |
10129 | // Create induction resume values for both widened pointer and |
10130 | // integer/fp inductions and update the start value of the induction |
10131 | // recipes to use the resume value. |
10132 | PHINode *IndPhi = nullptr; |
10133 | const InductionDescriptor *ID; |
10134 | if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) { |
10135 | IndPhi = cast<PHINode>(Val: Ind->getUnderlyingValue()); |
10136 | ID = &Ind->getInductionDescriptor(); |
10137 | } else { |
10138 | auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(Val: &R); |
10139 | IndPhi = WidenInd->getPHINode(); |
10140 | ID = &WidenInd->getInductionDescriptor(); |
10141 | } |
10142 | |
10143 | ResumeV = MainILV.createInductionResumeValue( |
10144 | OrigPhi: IndPhi, II: *ID, Step: getExpandedStep(ID: *ID, ExpandedSCEVs), |
10145 | BypassBlocks: {EPI.MainLoopIterationCountCheck}); |
10146 | } |
10147 | assert(ResumeV && "Must have a resume value" ); |
10148 | VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(V: ResumeV); |
10149 | cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal); |
10150 | } |
10151 | |
10152 | LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, |
10153 | DT, IsEpilogueVectorization: true, ExpandedSCEVs: &ExpandedSCEVs); |
10154 | ++LoopsEpilogueVectorized; |
10155 | |
10156 | if (!MainILV.areSafetyChecksAdded()) |
10157 | DisableRuntimeUnroll = true; |
10158 | } else { |
10159 | InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, |
10160 | VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, |
10161 | PSI, Checks); |
10162 | |
10163 | VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width); |
10164 | LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false); |
10165 | ++LoopsVectorized; |
10166 | |
10167 | // Add metadata to disable runtime unrolling a scalar loop when there |
10168 | // are no runtime checks about strides and memory. A scalar loop that is |
10169 | // rarely used is not worth unrolling. |
10170 | if (!LB.areSafetyChecksAdded()) |
10171 | DisableRuntimeUnroll = true; |
10172 | } |
10173 | // Report the vectorization decision. |
10174 | reportVectorization(ORE, TheLoop: L, VF, IC); |
10175 | } |
10176 | |
10177 | if (ORE->allowExtraAnalysis(LV_NAME)) |
10178 | checkMixedPrecision(L, ORE); |
10179 | } |
10180 | |
10181 | std::optional<MDNode *> RemainderLoopID = |
10182 | makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll, |
10183 | LLVMLoopVectorizeFollowupEpilogue}); |
10184 | if (RemainderLoopID) { |
10185 | L->setLoopID(*RemainderLoopID); |
10186 | } else { |
10187 | if (DisableRuntimeUnroll) |
10188 | AddRuntimeUnrollDisableMetaData(L); |
10189 | |
10190 | // Mark the loop as already vectorized to avoid vectorizing again. |
10191 | Hints.setAlreadyVectorized(); |
10192 | } |
10193 | |
10194 | assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); |
10195 | return true; |
10196 | } |
10197 | |
10198 | LoopVectorizeResult LoopVectorizePass::runImpl( |
10199 | Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, |
10200 | DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, |
10201 | DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, |
10202 | OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { |
10203 | SE = &SE_; |
10204 | LI = &LI_; |
10205 | TTI = &TTI_; |
10206 | DT = &DT_; |
10207 | BFI = BFI_; |
10208 | TLI = TLI_; |
10209 | AC = &AC_; |
10210 | LAIs = &LAIs_; |
10211 | DB = &DB_; |
10212 | ORE = &ORE_; |
10213 | PSI = PSI_; |
10214 | |
10215 | // Don't attempt if |
10216 | // 1. the target claims to have no vector registers, and |
10217 | // 2. interleaving won't help ILP. |
10218 | // |
10219 | // The second condition is necessary because, even if the target has no |
10220 | // vector registers, loop vectorization may still enable scalar |
10221 | // interleaving. |
10222 | if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) && |
10223 | TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2) |
10224 | return LoopVectorizeResult(false, false); |
10225 | |
10226 | bool Changed = false, CFGChanged = false; |
10227 | |
10228 | // The vectorizer requires loops to be in simplified form. |
10229 | // Since simplification may add new inner loops, it has to run before the |
10230 | // legality and profitability checks. This means running the loop vectorizer |
10231 | // will simplify all loops, regardless of whether anything end up being |
10232 | // vectorized. |
10233 | for (const auto &L : *LI) |
10234 | Changed |= CFGChanged |= |
10235 | simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */); |
10236 | |
10237 | // Build up a worklist of inner-loops to vectorize. This is necessary as |
10238 | // the act of vectorizing or partially unrolling a loop creates new loops |
10239 | // and can invalidate iterators across the loops. |
10240 | SmallVector<Loop *, 8> Worklist; |
10241 | |
10242 | for (Loop *L : *LI) |
10243 | collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist); |
10244 | |
10245 | LoopsAnalyzed += Worklist.size(); |
10246 | |
10247 | // Now walk the identified inner loops. |
10248 | while (!Worklist.empty()) { |
10249 | Loop *L = Worklist.pop_back_val(); |
10250 | |
10251 | // For the inner loops we actually process, form LCSSA to simplify the |
10252 | // transform. |
10253 | Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE); |
10254 | |
10255 | Changed |= CFGChanged |= processLoop(L); |
10256 | |
10257 | if (Changed) { |
10258 | LAIs->clear(); |
10259 | |
10260 | #ifndef NDEBUG |
10261 | if (VerifySCEV) |
10262 | SE->verify(); |
10263 | #endif |
10264 | } |
10265 | } |
10266 | |
10267 | // Process each loop nest in the function. |
10268 | return LoopVectorizeResult(Changed, CFGChanged); |
10269 | } |
10270 | |
10271 | PreservedAnalyses LoopVectorizePass::run(Function &F, |
10272 | FunctionAnalysisManager &AM) { |
10273 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
10274 | // There are no loops in the function. Return before computing other expensive |
10275 | // analyses. |
10276 | if (LI.empty()) |
10277 | return PreservedAnalyses::all(); |
10278 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
10279 | auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
10280 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
10281 | auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F); |
10282 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
10283 | auto &DB = AM.getResult<DemandedBitsAnalysis>(IR&: F); |
10284 | auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
10285 | |
10286 | LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F); |
10287 | auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F); |
10288 | ProfileSummaryInfo *PSI = |
10289 | MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
10290 | BlockFrequencyInfo *BFI = nullptr; |
10291 | if (PSI && PSI->hasProfileSummary()) |
10292 | BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F); |
10293 | LoopVectorizeResult Result = |
10294 | runImpl(F, SE_&: SE, LI_&: LI, TTI_&: TTI, DT_&: DT, BFI_: BFI, TLI_: &TLI, DB_&: DB, AC_&: AC, LAIs_&: LAIs, ORE_&: ORE, PSI_: PSI); |
10295 | if (!Result.MadeAnyChange) |
10296 | return PreservedAnalyses::all(); |
10297 | PreservedAnalyses PA; |
10298 | |
10299 | if (isAssignmentTrackingEnabled(M: *F.getParent())) { |
10300 | for (auto &BB : F) |
10301 | RemoveRedundantDbgInstrs(BB: &BB); |
10302 | } |
10303 | |
10304 | // We currently do not preserve loopinfo/dominator analyses with outer loop |
10305 | // vectorization. Until this is addressed, mark these analyses as preserved |
10306 | // only for non-VPlan-native path. |
10307 | // TODO: Preserve Loop and Dominator analyses for VPlan-native path. |
10308 | if (!EnableVPlanNativePath) { |
10309 | PA.preserve<LoopAnalysis>(); |
10310 | PA.preserve<DominatorTreeAnalysis>(); |
10311 | PA.preserve<ScalarEvolutionAnalysis>(); |
10312 | } |
10313 | |
10314 | if (Result.MadeCFGChange) { |
10315 | // Making CFG changes likely means a loop got vectorized. Indicate that |
10316 | // extra simplification passes should be run. |
10317 | // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only |
10318 | // be run if runtime checks have been added. |
10319 | AM.getResult<ShouldRunExtraVectorPasses>(IR&: F); |
10320 | PA.preserve<ShouldRunExtraVectorPasses>(); |
10321 | } else { |
10322 | PA.preserveSet<CFGAnalyses>(); |
10323 | } |
10324 | return PA; |
10325 | } |
10326 | |
10327 | void LoopVectorizePass::printPipeline( |
10328 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
10329 | static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline( |
10330 | OS, MapClassName2PassName); |
10331 | |
10332 | OS << '<'; |
10333 | OS << (InterleaveOnlyWhenForced ? "" : "no-" ) << "interleave-forced-only;" ; |
10334 | OS << (VectorizeOnlyWhenForced ? "" : "no-" ) << "vectorize-forced-only;" ; |
10335 | OS << '>'; |
10336 | } |
10337 | |