//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
15 | #include "llvm/ADT/APInt.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/EquivalenceClasses.h" |
18 | #include "llvm/ADT/PointerIntPair.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SetVector.h" |
21 | #include "llvm/ADT/SmallPtrSet.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/AliasAnalysis.h" |
25 | #include "llvm/Analysis/AliasSetTracker.h" |
26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
27 | #include "llvm/Analysis/LoopInfo.h" |
28 | #include "llvm/Analysis/LoopIterator.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
33 | #include "llvm/Analysis/TargetLibraryInfo.h" |
34 | #include "llvm/Analysis/ValueTracking.h" |
35 | #include "llvm/Analysis/VectorUtils.h" |
36 | #include "llvm/IR/BasicBlock.h" |
37 | #include "llvm/IR/Constants.h" |
38 | #include "llvm/IR/DataLayout.h" |
39 | #include "llvm/IR/DebugLoc.h" |
40 | #include "llvm/IR/DerivedTypes.h" |
41 | #include "llvm/IR/DiagnosticInfo.h" |
42 | #include "llvm/IR/Dominators.h" |
43 | #include "llvm/IR/Function.h" |
44 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
45 | #include "llvm/IR/InstrTypes.h" |
46 | #include "llvm/IR/Instruction.h" |
47 | #include "llvm/IR/Instructions.h" |
48 | #include "llvm/IR/Operator.h" |
49 | #include "llvm/IR/PassManager.h" |
50 | #include "llvm/IR/PatternMatch.h" |
51 | #include "llvm/IR/Type.h" |
52 | #include "llvm/IR/Value.h" |
53 | #include "llvm/IR/ValueHandle.h" |
54 | #include "llvm/Support/Casting.h" |
55 | #include "llvm/Support/CommandLine.h" |
56 | #include "llvm/Support/Debug.h" |
57 | #include "llvm/Support/ErrorHandling.h" |
58 | #include "llvm/Support/raw_ostream.h" |
59 | #include <algorithm> |
60 | #include <cassert> |
61 | #include <cstdint> |
62 | #include <iterator> |
63 | #include <utility> |
64 | #include <variant> |
65 | #include <vector> |
66 | |
67 | using namespace llvm; |
68 | using namespace llvm::PatternMatch; |
69 | |
70 | #define DEBUG_TYPE "loop-accesses" |
71 | |
static cl::opt<unsigned, true>
    VectorizationFactor("force-vector-width", cl::Hidden,
                        cl::desc("Sets the SIMD width. Zero is autoselect."),
                        cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
    VectorizationInterleave("force-vector-interleave", cl::Hidden,
                            cl::desc("Sets the vectorization interleave count. "
                                     "Zero is autoselect."),
                            cl::location(
                                VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum number of iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///   if (Stride1 == 1 && Stride2 == 1) {
///     for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///   } else
///     ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(true));

static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(5));

static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(true));

static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
bool VectorizerParams::HoistRuntimeChecks;

bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}

const SCEV *llvm::replaceSymbolicStrideSCEV(
    PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &PtrToStride, Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
  if (SI == PtrToStride.end())
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  const SCEV *StrideSCEV = SI->second;
  // Note: This assert is both overly strong and overly weak. The actual
  // invariant here is that StrideSCEV should be loop invariant. The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

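  // Rewrite Ptr's SCEV under the assumption that the stride is one: add the
  // predicate Stride == 1 to PSE so that subsequent queries through PSE see
  // the simplified expression.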
  ScalarEvolution *SE = PSE.getSE();
  const auto *CT = SE->getOne(StrideSCEV->getType());
  PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
  auto *Expr = PSE.getSCEV(Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}

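/// Seed a new checking group with the bounds, address space and freeze
/// requirement of the pointer at \p Index.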
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Index);
}

/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on the N-th
/// loop iteration. Then B is calculated as:
///   B = A + Step*N .
/// Step value may be positive or negative.
/// N is a calculated back-edge taken count:
///   N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
/// Start and End points are calculated in the following way:
///   Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
/// where SizeOfElt is the size of a single memory access in bytes.
///
/// There is no conflict when the intervals are disjoint:
///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
  ScalarEvolution *SE = PSE.getSE();

  const SCEV *ScStart;
  const SCEV *ScEnd;

  if (SE->isLoopInvariant(PtrExpr, Lp)) {
    ScStart = ScEnd = PtrExpr;
  } else {
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
    assert(AR && "Invalid addrec expression");
    const SCEV *Ex = PSE.getBackedgeTakenCount();

    ScStart = AR->getStart();
    ScEnd = AR->evaluateAtIteration(Ex, *SE);
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(ScStart, ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(ScStart, ScEnd);
      ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
    }
  }
  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
  auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  Type *IdxTy = DL.getIndexType(Ptr->getType());
  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

  Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
                        NeedsFreeze);
}

void RuntimePointerChecking::tryToCreateDiffCheck(
    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  if (!CanUseDiffCheck)
    return;

  // If either group contains multiple different pointers, bail out.
  // TODO: Support multiple pointers by using the minimum or maximum pointer,
  // depending on src & sink.
  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
    CanUseDiffCheck = false;
    return;
  }

  PointerInfo *Src = &Pointers[CGI.Members[0]];
  PointerInfo *Sink = &Pointers[CGJ.Members[0]];

  // If either pointer is read and written, multiple checks may be needed. Bail
  // out.
  if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
      !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
    CanUseDiffCheck = false;
    return;
  }

  ArrayRef<unsigned> AccSrc =
      DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
  ArrayRef<unsigned> AccSink =
      DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
  // If either pointer is accessed multiple times, there may not be a clear
  // src/sink relation. Bail out for now.
  if (AccSrc.size() != 1 || AccSink.size() != 1) {
    CanUseDiffCheck = false;
    return;
  }
  // If the sink is accessed before src, swap src/sink.
  if (AccSink[0] < AccSrc[0])
    std::swap(Src, Sink);

  auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
  auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
  if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
      SinkAR->getLoop() != DC.getInnermostLoop()) {
    CanUseDiffCheck = false;
    return;
  }

  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
  Type *SrcTy = getLoadStoreType(SrcInsts[0]);
  Type *DstTy = getLoadStoreType(SinkInsts[0]);
  if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
    CanUseDiffCheck = false;
    return;
  }
  const DataLayout &DL =
      SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
  unsigned AllocSize =
      std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));

  // Only constant steps matching the AllocSize are supported at the moment.
  // This simplifies the difference computation. Can be extended in the
  // future.
  auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
  if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
      Step->getAPInt().abs() != AllocSize) {
    CanUseDiffCheck = false;
    return;
  }

  IntegerType *IntTy =
      IntegerType::get(Src->PointerValue->getContext(),
                       DL.getPointerSizeInBits(CGI.AddressSpace));

  // When counting down, the dependence distance needs to be swapped.
  if (Step->getValue()->isNegative())
    std::swap(SinkAR, SrcAR);

  const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
  const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
  if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
      isa<SCEVCouldNotCompute>(SrcStartInt)) {
    CanUseDiffCheck = false;
    return;
  }

  const Loop *InnerLoop = SrcAR->getLoop();
  // If the start values for both Src and Sink also vary according to an outer
  // loop, then it's probably better to avoid creating diff checks because
  // they may not be hoisted. We should instead let llvm::addRuntimeChecks
  // do the expanded full range overlap checks, which can be hoisted.
  if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
      isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
    auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
    auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
    const Loop *StartARLoop = SrcStartAR->getLoop();
    if (StartARLoop == SinkStartAR->getLoop() &&
        StartARLoop == InnerLoop->getParentLoop() &&
        // If the diff check would already be loop invariant (due to the
        // recurrences being the same), then we prefer to keep the diff checks
        // because they are cheaper.
        SrcStartAR->getStepRecurrence(*SE) !=
            SinkStartAR->getStepRecurrence(*SE)) {
      LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                           "cannot be hoisted out of the outer loop\n");
      CanUseDiffCheck = false;
      return;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                    << "SrcStart: " << *SrcStartInt << '\n'
                    << "SinkStartInt: " << *SinkStartInt << '\n');
  DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
                          Src->NeedsFreeze || Sink->NeedsFreeze);
}

SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  SmallVector<RuntimePointerCheck, 4> Checks;

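  // Pairwise-compare all checking groups; emit a check (and, if possible, a
  // cheaper pointer-difference check) for every pair that may conflict.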
  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ)) {
        tryToCreateDiffCheck(CGI, CGJ);
        Checks.push_back(std::make_pair(&CGI, &CGJ));
      }
    }
  }
  return Checks;
}

void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands, UseDependencies);
  Checks = generateChecks();
}

bool RuntimePointerChecking::needsChecking(
    const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
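  // Two groups conflict if any pair of their members does.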
  for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
    for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
      if (needsChecking(M.Members[I], N.Members[J]))
        return true;
  return false;
}

/// Compare \p I and \p J and return the minimum.
/// Return nullptr in case we couldn't find an answer.
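/// For example, comparing {0,+,4} and {16,+,4} yields the constant difference
/// 16, which is non-negative, so the first expression is the minimum.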
static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
                                   ScalarEvolution *SE) {
  const SCEV *Diff = SE->getMinusSCEV(J, I);
  const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);

  if (!C)
    return nullptr;
  if (C->getValue()->isNegative())
    return J;
  return I;
}

bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
                                         RuntimePointerChecking &RtCheck) {
  return addPointer(
      Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
      RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
      RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
}

bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
                                         const SCEV *End, unsigned AS,
                                         bool NeedsFreeze,
                                         ScalarEvolution &SE) {
  assert(AddressSpace == AS &&
         "all pointers in a checking group must be in the same address space");

  // Compare the starts and ends with the known minimum and maximum
  // of this set. We need to know how we compare against the min/max
  // of the set in order to be able to emit memchecks.
  const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
  if (!Min0)
    return false;

  const SCEV *Min1 = getMinFromExprs(End, High, &SE);
  if (!Min1)
    return false;

  // Update the low bound expression if we've found a new min value.
  if (Min0 == Start)
    Low = Start;

  // Update the high bound expression if we've found a new max value.
  if (Min1 != End)
    High = End;

  Members.push_back(Index);
  this->NeedsFreeze |= NeedsFreeze;
  return true;
}

void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  // We build the groups from dependency candidates equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // The above case requires that we have an UnknownDependence between
  // accesses to the same underlying object. This cannot happen unless
  // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  // is also false. In this case we will use the fallback path and create
  // separate checking groups for all pointers.

  // If we don't have the dependency partitions, construct a new
  // checking pointer group for each pointer. This is also required
  // for correctness, because in this case we can have checking between
  // pointers to the same underlying object.
  if (!UseDependencies) {
    for (unsigned I = 0; I < Pointers.size(); ++I)
      CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
    return;
  }

  unsigned TotalComparisons = 0;

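  // Map each pointer value to the indices of all RT-check entries that use
  // it; the same value can appear more than once, e.g. once as a read and
  // once as a write.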
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
    auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
    Iter.first->second.push_back(Index);
  }

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.count(I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
    auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
         MI != ME; ++MI) {
      auto PointerI = PositionMap.find(MI->getPointer());
      assert(PointerI != PositionMap.end() &&
             "pointer in equivalence class not found in PositionMap");
      for (unsigned Pointer : PointerI->second) {
        bool Merged = false;
        // Mark this pointer as seen.
        Seen.insert(Pointer);

        // Go through all the existing sets and see if we can find one
        // which can include this pointer.
        for (RuntimeCheckingPtrGroup &Group : Groups) {
          // Don't perform more than a certain amount of comparisons.
          // This should limit the cost of grouping the pointers to something
          // reasonable. If we do end up hitting this threshold, the algorithm
          // will create separate groups for all remaining pointers.
          if (TotalComparisons > MemoryCheckMergeThreshold)
            break;

          TotalComparisons++;

          if (Group.addPointer(Pointer, *this)) {
            Merged = true;
            break;
          }
        }

        if (!Merged)
          // We couldn't add this pointer to any existing set or the threshold
          // for the number of comparisons has been reached. Create a new group
          // to hold the current pointer.
          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
      }
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::copy(Groups, std::back_inserter(CheckingGroups));
  }
}

bool RuntimePointerChecking::arePointersInSamePartition(
    const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
    unsigned PtrIdx2) {
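  // A partition index of -1 marks a pointer that is used in multiple
  // partitions; such a pointer never shares a partition with anything.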
  return (PtrToPartition[PtrIdx1] != -1 &&
          PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
}

bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  const PointerInfo &PointerI = Pointers[I];
  const PointerInfo &PointerJ = Pointers[J];

  // No need to check if two readonly pointers intersect.
  if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
    return false;

  // Only need to check pointers between two different dependency sets.
  if (PointerI.DependencySetId == PointerJ.DependencySetId)
    return false;

  // Only need to check pointers in the same alias set.
  if (PointerI.AliasSetId != PointerJ.AliasSetId)
    return false;

  return true;
}

void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
    unsigned Depth) const {
  unsigned N = 0;
  for (const auto &Check : Checks) {
    const auto &First = Check.first->Members, &Second = Check.second->Members;

    OS.indent(Depth) << "Check " << N++ << ":\n";

    OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
    for (unsigned K = 0; K < First.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";

    OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
    for (unsigned K = 0; K < Second.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  }
}

void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  OS.indent(Depth) << "Run-time memory checks:\n";
  printChecks(OS, Checks, Depth);

  OS.indent(Depth) << "Grouped accesses:\n";
  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    const auto &CG = CheckingGroups[I];

    OS.indent(Depth + 2) << "Group " << &CG << ":\n";
    OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
                         << ")\n";
    for (unsigned J = 0; J < CG.Members.size(); ++J) {
      OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
                           << "\n";
    }
  }
}

namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
                 MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
        LoopAliasScopes(LoopAliasScopes) {
    // We're analyzing dependences across loop iterations.
    BAA.enableCrossIterationMode();
  }

  /// Register a load and whether it is only read from.
  void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(MemoryLocation &Loc, Type *AccessTy) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool ShouldCheckStride, bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds).
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
                       Loop *TheLoop,
                       const DenseMap<Value *, const SCEV *> &Strides,
                       Value *&UncomputablePtr, bool ShouldCheckWrap = false);

  /// Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used. Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }

  const DenseMap<Value *, SmallVector<const Value *, 16>> &
  getUnderlyingObjects() {
    return UnderlyingObjects;
  }

private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

  /// Adjust the MemoryLocation so that it represents accesses to this
  /// location across all iterations, rather than a single one.
  MemoryLocation adjustLoc(MemoryLocation Loc) const {
    // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
    Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
    return Loc;
  }

  /// Drop alias scopes that are only valid within a single loop iteration.
  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
    if (!ScopeList)
      return nullptr;

    // For the sake of simplicity, drop the whole scope list if any scope is
    // iteration-local.
    if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
          return LoopAliasScopes.contains(cast<MDNode>(Scope));
        }))
      return nullptr;

    return ScopeList;
  }

  /// Go over all memory accesses and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Map of all accesses. Values are the types used to access memory pointed
  /// to by the pointer.
  PtrAccessMap Accesses;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value *, 16> ReadOnlyPtr;

  /// Batched alias analysis results.
  BatchAAResults BAA;

  /// An alias set tracker to partition the access set by underlying object
  /// and intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  LoopInfo *LI;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks. Perform the analysis to determine the necessary checks.
  ///
  /// Note that this is different from isDependencyCheckNeeded. When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded = false;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

  /// Alias scopes that are declared inside the loop, and as such not valid
  /// across iterations.
  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
};

} // end anonymous namespace


/// Check whether a pointer can participate in a runtime bounds check.
/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
                                const SCEV *PtrScev, Loop *L, bool Assume) {
  // The bounds for a loop-invariant pointer are trivial.
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);

  if (!AR && Assume)
    AR = PSE.getAsAddRec(Ptr);

  if (!AR)
    return false;

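  // Only affine add-recs (a start value plus a loop-invariant step) describe
  // intervals whose start and end bounds we can compute.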
  return AR->isAffine();
}

/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
                     const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr,
                     Type *AccessTy, Loop *L) {
  const SCEV *PtrScev = PSE.getSCEV(Ptr);
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

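  // Treat unit-stride pointers, and pointers with a proven non-wrapping
  // (NUSW) increment, as not wrapping across the loop.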
  int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
  if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;

  return false;
}

static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
                          function_ref<void(Value *)> AddPointer) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *> WorkList;
  WorkList.push_back(StartPtr);

  while (!WorkList.empty()) {
    Value *Ptr = WorkList.pop_back_val();
    if (!Visited.insert(Ptr).second)
      continue;
    auto *PN = dyn_cast<PHINode>(Ptr);
    // SCEV does not look through non-header PHIs inside the loop. Such phis
    // can be analyzed by adding separate accesses for each incoming pointer
    // value.
    if (PN && InnermostLoop.contains(PN->getParent()) &&
        PN->getParent() != InnermostLoop.getHeader()) {
      for (const Use &Inc : PN->incoming_values())
        WorkList.push_back(Inc);
    } else
      AddPointer(Ptr);
  }
}

// Walk back through the IR for a pointer, looking for a select like the
// following:
//
//  %offset = select i1 %cmp, i64 %a, i64 %b
//  %addr = getelementptr double, double* %base, i64 %offset
//  %ld = load double, double* %addr, align 8
//
// We won't be able to form a single SCEVAddRecExpr from this since the
// address for each loop iteration depends on %cmp. We could potentially
// produce multiple valid SCEVAddRecExprs, though, and check all of them for
// memory safety/aliasing if needed.
//
// If we encounter some IR we don't yet handle, or something obviously fine
// like a constant, then we just add the SCEV for that term to the list passed
// in by the caller. If we have a node that may potentially yield a valid
// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
// ourselves before adding to the list.
static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  // we've exceeded our limit on recursion, just return whatever we have
  // regardless of whether it can be used for a forked pointer or not, along
  // with an indication of whether it might be a poison or undef value.
  const SCEV *Scev = SE->getSCEV(Ptr);
  if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
      !isa<Instruction>(Ptr) || Depth == 0) {
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    return;
  }

  Depth--;

  auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
    return get<1>(S);
  };

  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
    switch (Opcode) {
    case Instruction::Add:
      return SE->getAddExpr(L, R);
    case Instruction::Sub:
      return SE->getMinusSCEV(L, R);
    default:
      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
    }
  };

  Instruction *I = cast<Instruction>(Ptr);
  unsigned Opcode = I->getOpcode();
  switch (Opcode) {
  case Instruction::GetElementPtr: {
    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
    Type *SourceTy = GEP->getSourceElementType();
    // We only handle base + single offset GEPs here for now.
    // Not dealing with preexisting gathers yet, so no vectors.
    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
      break;
    }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
                       any_of(OffsetScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the base or the offset.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the GEP.
    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
      BaseScevs.push_back(BaseScevs[0]);
    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
      OffsetScevs.push_back(OffsetScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    // Find the pointer type we need to extend to.
    Type *IntPtrTy = SE->getEffectiveSCEVType(
        SE->getSCEV(GEP->getPointerOperand())->getType());

    // Find the size of the type being pointed to. We only have a single
    // index term (guarded above) so we don't need to index into arrays or
    // structures, just get the size of the scalar value.
    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);

    // Scale up the offsets by the size of the type, then add to the bases.
    const SCEV *Scaled1 = SE->getMulExpr(
        Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
    const SCEV *Scaled2 = SE->getMulExpr(
        Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
    ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
                          NeedsFreeze);
    ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
                          NeedsFreeze);
    break;
  }
  case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
    // support a single select per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
    if (ChildScevs.size() == 2) {
      ScevList.push_back(ChildScevs[0]);
      ScevList.push_back(ChildScevs[1]);
    } else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
    // support a single phi per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    if (I->getNumOperands() == 2) {
      findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
      findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    }
    if (ChildScevs.size() == 2) {
      ScevList.push_back(ChildScevs[0]);
      ScevList.push_back(ChildScevs[1]);
    } else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::Add:
  case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze =
        any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the left or right side.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the BinOp.
    if (LScevs.size() == 2 && RScevs.size() == 1)
      RScevs.push_back(RScevs[0]);
    else if (RScevs.size() == 2 && LScevs.size() == 1)
      LScevs.push_back(LScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    ScevList.emplace_back(
        GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
        NeedsFreeze);
    ScevList.emplace_back(
        GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
        NeedsFreeze);
    break;
  }
  default:
    // Just return the current SCEV if we haven't handled the instruction yet.
    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
}

static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
findForkedPointer(PredicatedScalarEvolution &PSE,
                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
                  const Loop *L) {
  ScalarEvolution *SE = PSE.getSE();
  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);

  // For now, we will only accept a forked pointer with two possible SCEVs
  // that are either SCEVAddRecExprs or loop invariant.
  if (Scevs.size() == 2 &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
       SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
       SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
    LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
    LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
    return Scevs;
  }

  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
}

bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                          MemAccessInfo Access, Type *AccessTy,
                                          const DenseMap<Value *, const SCEV *> &StridesMap,
                                          DenseMap<Value *, unsigned> &DepSetId,
                                          Loop *TheLoop, unsigned &RunningDepId,
                                          unsigned ASId, bool ShouldCheckWrap,
                                          bool Assume) {
  Value *Ptr = Access.getPointer();

  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);

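  // First pass: make sure every candidate SCEV has computable bounds and, if
  // required, does not wrap.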
  for (auto &P : TranslatedPtrs) {
    const SCEV *PtrExpr = get<0>(P);
    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
      return false;

    // When we run after a failing dependency check we have to make sure
    // we don't have wrapping pointers.
    if (ShouldCheckWrap) {
      // Skip wrap checking when translating pointers.
      if (TranslatedPtrs.size() > 1)
        return false;

      if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
        auto *Expr = PSE.getSCEV(Ptr);
        if (!Assume || !isa<SCEVAddRecExpr>(Expr))
          return false;
        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
      }
    }
    // If there's only one option for Ptr, look it up after bounds and wrap
    // checking, because assumptions might have been added to PSE.
    if (TranslatedPtrs.size() == 1)
      TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
                           false};
  }

  for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
    // The id of the dependence set.
    unsigned DepId;

    if (isDependencyCheckNeeded()) {
      Value *Leader = DepCands.getLeaderValue(Access).getPointer();
      unsigned &LeaderId = DepSetId[Leader];
      if (!LeaderId)
        LeaderId = RunningDepId++;
      DepId = LeaderId;
    } else
      // Each access has its own dependence set.
      DepId = RunningDepId++;

    bool IsWrite = Access.getInt();
    RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
                   NeedsFreeze);
    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  }

  return true;
}

bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                     ScalarEvolution *SE, Loop *TheLoop,
                                     const DenseMap<Value *, const SCEV *> &StridesMap,
                                     Value *&UncomputablePtr, bool ShouldCheckWrap) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool MayNeedRTCheck = false;
  if (!IsRTCheckAnalysisNeeded) return true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 0;
  for (auto &AS : AST) {
    int NumReadPtrChecks = 0;
    int NumWritePtrChecks = 0;
    bool CanDoAliasSetRT = true;
    ++ASId;
    auto ASPointers = AS.getPointers();

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;

    // First, count how many write and read accesses are in the alias set. Also
    // collect MemAccessInfos for later.
    SmallVector<MemAccessInfo, 4> AccessInfos;
    for (const Value *Ptr_ : ASPointers) {
      Value *Ptr = const_cast<Value *>(Ptr_);
      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;
      AccessInfos.emplace_back(Ptr, IsWrite);
    }

    // We do not need runtime checks for this alias set, if there are no writes
    // or a single write and no reads.
    if (NumWritePtrChecks == 0 ||
        (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
      assert((ASPointers.size() <= 1 ||
              all_of(ASPointers,
                     [this](const Value *Ptr) {
                       MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
                                                 true);
                       return DepCands.findValue(AccessWrite) ==
                              DepCands.end();
                     })) &&
             "Can only skip updating CanDoRT below, if all entries in AS "
             "are reads or there is at most 1 entry");
      continue;
    }

    for (auto &Access : AccessInfos) {
      for (const auto &AccessTy : Accesses[Access]) {
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  ShouldCheckWrap, /*Assume=*/false)) {
          LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                            << *Access.getPointer() << '\n');
          Retries.push_back({Access, AccessTy});
          CanDoAliasSetRT = false;
        }
      }
    }

    // Note that this function computes CanDoRT and MayNeedRTCheck
    // independently. For example CanDoRT=false, MayNeedRTCheck=false means
    // that we have a pointer for which we couldn't find the bounds but we
    // don't actually need to emit any checks so it does not matter.
    //
    // We need runtime checks for this alias set, if there are at least 2
    // dependence sets (in which case RunningDepId > 2) or if we need to re-try
    // any bound checks (because in that case the number of dependence sets is
    // incomplete).
    bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();

    // We need to perform run-time alias checks, but some pointers had bounds
    // that couldn't be checked.
    if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
      // Reset the CanDoAliasSetRT flag and retry all accesses that have
      // failed. We know that we need these checks, so we can now be more
      // aggressive and add further checks if required (overflow checks).
      CanDoAliasSetRT = true;
      for (auto Retry : Retries) {
        MemAccessInfo Access = Retry.first;
        Type *AccessTy = Retry.second;
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  ShouldCheckWrap, /*Assume=*/true)) {
          CanDoAliasSetRT = false;
          UncomputablePtr = Access.getPointer();
          break;
        }
      }
    }

    CanDoRT &= CanDoAliasSetRT;
    MayNeedRTCheck |= NeedsAliasSetRTCheck;
    ++ASId;
  }

1236 | |
1237 | // If the pointers that we would use for the bounds comparison have different |
1238 | // address spaces, assume the values aren't directly comparable, so we can't |
1239 | // use them for the runtime check. We also have to assume they could |
1240 | // overlap. In the future there should be metadata for whether address spaces |
1241 | // are disjoint. |
1242 | unsigned NumPointers = RtCheck.Pointers.size(); |
1243 | for (unsigned i = 0; i < NumPointers; ++i) { |
1244 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
1245 | // Only need to check pointers between two different dependency sets. |
1246 | if (RtCheck.Pointers[i].DependencySetId == |
1247 | RtCheck.Pointers[j].DependencySetId) |
1248 | continue; |
1249 | // Only need to check pointers in the same alias set. |
1250 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
1251 | continue; |
1252 | |
1253 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
1254 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
1255 | |
1256 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
1257 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
1258 | if (ASi != ASj) { |
1259 | LLVM_DEBUG( |
1260 | dbgs() << "LAA: Runtime check would require comparison between" |
1261 | " different address spaces\n" ); |
1262 | return false; |
1263 | } |
1264 | } |
1265 | } |
1266 | |
1267 | if (MayNeedRTCheck && CanDoRT) |
1268 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
1269 | |
1270 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
1271 | << " pointer comparisons.\n" ); |
1272 | |
1273 | // If we can do run-time checks, but there are no checks, no runtime checks |
1274 | // are needed. This can happen when all pointers point to the same underlying |
1275 | // object for example. |
1276 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
1277 | |
1278 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
1279 | if (!CanDoRTIfNeeded) |
1280 | RtCheck.reset(); |
1281 | return CanDoRTIfNeeded; |
1282 | } |
1283 | |
1284 | void AccessAnalysis::processMemAccesses() { |
1285 | // We process the set twice: first we process read-write pointers, last we |
1286 | // process read-only pointers. This allows us to skip dependence tests for |
1287 | // read-only pointers. |
1288 | |
1289 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
1290 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
1291 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
1292 | LLVM_DEBUG({ |
1293 | for (auto A : Accesses) |
1294 | dbgs() << "\t" << *A.first.getPointer() << " (" |
1295 | << (A.first.getInt() |
1296 | ? "write" |
1297 | : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only" |
1298 | : "read" )) |
1299 | << ")\n" ; |
1300 | }); |
1301 | |
1302 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
1303 | // compatibility and potential for underlying-object overlap. As a result, we |
1304 | // only need to check for potential pointer dependencies within each alias |
1305 | // set. |
1306 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally, so the iteration order here is
// deterministic.
1310 | auto ASPointers = AS.getPointers(); |
1311 | |
1312 | bool SetHasWrite = false; |
1313 | |
// Map of underlying objects to the last access encountered.
1315 | typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
1316 | UnderlyingObjToAccessMap ObjToLastAccess; |
1317 | |
// Set of accesses to check after all writes have been processed.
1319 | PtrAccessMap DeferredAccesses; |
1320 | |
1321 | // Iterate over each alias set twice, once to process read/write pointers, |
1322 | // and then to process read-only pointers. |
1323 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
1324 | bool UseDeferred = SetIteration > 0; |
1325 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
1326 | |
1327 | for (const Value *Ptr_ : ASPointers) { |
1328 | Value *Ptr = const_cast<Value *>(Ptr_); |
1329 | |
1330 | // For a single memory access in AliasSetTracker, Accesses may contain |
1331 | // both read and write, and they both need to be handled for CheckDeps. |
1332 | for (const auto &AC : S) { |
1333 | if (AC.first.getPointer() != Ptr) |
1334 | continue; |
1335 | |
1336 | bool IsWrite = AC.first.getInt(); |
1337 | |
1338 | // If we're using the deferred access set, then it contains only |
1339 | // reads. |
1340 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
1341 | if (UseDeferred && !IsReadOnlyPtr) |
1342 | continue; |
1343 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
1344 | // read or a write. |
1345 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
1346 | S.count(MemAccessInfo(Ptr, false))) && |
1347 | "Alias-set pointer not in the access set?" ); |
1348 | |
1349 | MemAccessInfo Access(Ptr, IsWrite); |
1350 | DepCands.insert(Data: Access); |
1351 | |
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 | if (!UseDeferred && IsReadOnlyPtr) { |
// We only use the pointer keys; the types in the vector values don't
// matter.
1360 | DeferredAccesses.insert(KV: {Access, {}}); |
1361 | continue; |
1362 | } |
1363 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr; this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
1368 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
1369 | CheckDeps.push_back(Elt: Access); |
1370 | IsRTCheckAnalysisNeeded = true; |
1371 | } |
1372 | |
1373 | if (IsWrite) |
1374 | SetHasWrite = true; |
1375 | |
1376 | // Create sets of pointers connected by a shared alias set and |
1377 | // underlying object. |
1378 | typedef SmallVector<const Value *, 16> ValueVector; |
1379 | ValueVector TempObjects; |
1380 | |
1381 | UnderlyingObjects[Ptr] = {}; |
1382 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
1383 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
1384 | LLVM_DEBUG(dbgs() |
1385 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
1386 | for (const Value *UnderlyingObj : UOs) { |
// nullptr never aliases anything; don't join sets for pointers that
// have "null" in their UnderlyingObjects list.
1389 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
1390 | !NullPointerIsDefined( |
1391 | F: TheLoop->getHeader()->getParent(), |
1392 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
1393 | continue; |
1394 | |
1395 | UnderlyingObjToAccessMap::iterator Prev = |
1396 | ObjToLastAccess.find(Val: UnderlyingObj); |
1397 | if (Prev != ObjToLastAccess.end()) |
1398 | DepCands.unionSets(V1: Access, V2: Prev->second); |
1399 | |
1400 | ObjToLastAccess[UnderlyingObj] = Access; |
1401 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
1402 | } |
1403 | } |
1404 | } |
1405 | } |
1406 | } |
1407 | } |
1408 | |
1409 | /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping, |
1410 | /// i.e. monotonically increasing/decreasing. |
1411 | static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, |
1412 | PredicatedScalarEvolution &PSE, const Loop *L) { |
1413 | |
1414 | // FIXME: This should probably only return true for NUW. |
1415 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
1416 | return true; |
1417 | |
1418 | if (PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
1419 | return true; |
1420 | |
1421 | // Scalar evolution does not propagate the non-wrapping flags to values that |
1422 | // are derived from a non-wrapping induction variable because non-wrapping |
1423 | // could be flow-sensitive. |
1424 | // |
1425 | // Look through the potentially overflowing instruction to try to prove |
1426 | // non-wrapping for the *specific* value of Ptr. |
1427 | |
1428 | // The arithmetic implied by an inbounds GEP can't overflow. |
1429 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1430 | if (!GEP || !GEP->isInBounds()) |
1431 | return false; |
1432 | |
1433 | // Make sure there is only one non-const index and analyze that. |
1434 | Value *NonConstIndex = nullptr; |
1435 | for (Value *Index : GEP->indices()) |
1436 | if (!isa<ConstantInt>(Val: Index)) { |
1437 | if (NonConstIndex) |
1438 | return false; |
1439 | NonConstIndex = Index; |
1440 | } |
1441 | if (!NonConstIndex) |
1442 | // The recurrence is on the pointer, ignore for now. |
1443 | return false; |
1444 | |
1445 | // The index in GEP is signed. It is non-wrapping if it's derived from a NSW |
1446 | // AddRec using a NSW operation. |
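// For illustration, a hypothetical IR pattern this matches (names are
// invented for the example, not taken from the original comments):
//   %iv  = phi i64 ...                                  ; {0,+,1}<nsw>
//   %idx = add nsw i64 %iv, 7       ; OBO with a constant second operand
//   %gep = getelementptr inbounds i32, ptr %p, i64 %idx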
1447 | if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Val: NonConstIndex)) |
1448 | if (OBO->hasNoSignedWrap() && |
// Assume the other operand is constant so that the AddRec can be
// easily found.
1451 | isa<ConstantInt>(Val: OBO->getOperand(i_nocapture: 1))) { |
1452 | auto *OpScev = PSE.getSCEV(V: OBO->getOperand(i_nocapture: 0)); |
1453 | |
1454 | if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(Val: OpScev)) |
1455 | return OpAR->getLoop() == L && OpAR->getNoWrapFlags(Mask: SCEV::FlagNSW); |
1456 | } |
1457 | |
1458 | return false; |
1459 | } |
1460 | |
1461 | /// Check whether the access through \p Ptr has a constant stride. |
1462 | std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, |
1463 | Type *AccessTy, Value *Ptr, |
1464 | const Loop *Lp, |
1465 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1466 | bool Assume, bool ShouldCheckWrap) { |
1467 | Type *Ty = Ptr->getType(); |
1468 | assert(Ty->isPointerTy() && "Unexpected non-ptr" ); |
1469 | |
1470 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
1471 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
1472 | << "\n" ); |
1473 | return std::nullopt; |
1474 | } |
1475 | |
1476 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
1477 | |
1478 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
1479 | if (Assume && !AR) |
1480 | AR = PSE.getAsAddRec(V: Ptr); |
1481 | |
1482 | if (!AR) { |
1483 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
1484 | << " SCEV: " << *PtrScev << "\n" ); |
1485 | return std::nullopt; |
1486 | } |
1487 | |
1488 | // The access function must stride over the innermost loop. |
1489 | if (Lp != AR->getLoop()) { |
1490 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " |
1491 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1492 | return std::nullopt; |
1493 | } |
1494 | |
1495 | // Check the step is constant. |
1496 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
1497 | |
1498 | // Calculate the pointer stride and check if it is constant. |
1499 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Step); |
1500 | if (!C) { |
1501 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr |
1502 | << " SCEV: " << *AR << "\n" ); |
1503 | return std::nullopt; |
1504 | } |
1505 | |
1506 | auto &DL = Lp->getHeader()->getModule()->getDataLayout(); |
1507 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
1508 | int64_t Size = AllocSize.getFixedValue(); |
1509 | const APInt &APStepVal = C->getAPInt(); |
1510 | |
1511 | // Huge step value - give up. |
1512 | if (APStepVal.getBitWidth() > 64) |
1513 | return std::nullopt; |
1514 | |
1515 | int64_t StepVal = APStepVal.getSExtValue(); |
1516 | |
1517 | // Strided access. |
1518 | int64_t Stride = StepVal / Size; |
1519 | int64_t Rem = StepVal % Size; |
1520 | if (Rem) |
1521 | return std::nullopt; |
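// Worked example (illustrative): with i32 elements (AllocSize = 4), a step
// of 8 bytes yields Stride = 2, while a step of 6 bytes leaves Rem = 2 and
// is rejected above.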
1522 | |
1523 | if (!ShouldCheckWrap) |
1524 | return Stride; |
1525 | |
1526 | // The address calculation must not wrap. Otherwise, a dependence could be |
1527 | // inverted. |
1528 | if (isNoWrapAddRec(Ptr, AR, PSE, L: Lp)) |
1529 | return Stride; |
1530 | |
// An inbounds getelementptr that is an AddRec with a unit stride
// cannot wrap by definition. If it did, the result would be poison
// and any memory access dependent on it would be immediate UB
// when executed.
1535 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1536 | GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1)) |
1537 | return Stride; |
1538 | |
// If the null pointer is undefined, then an access sequence which would
// otherwise access it can be assumed not to wrap in the unsigned sense.
// Note that this assumes the object in memory is aligned to the natural
// alignment.
1542 | unsigned AddrSpace = Ty->getPointerAddressSpace(); |
1543 | if (!NullPointerIsDefined(F: Lp->getHeader()->getParent(), AS: AddrSpace) && |
1544 | (Stride == 1 || Stride == -1)) |
1545 | return Stride; |
1546 | |
1547 | if (Assume) { |
1548 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
1549 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
1550 | << "LAA: Pointer: " << *Ptr << "\n" |
1551 | << "LAA: SCEV: " << *AR << "\n" |
1552 | << "LAA: Added an overflow assumption\n" ); |
1553 | return Stride; |
1554 | } |
1555 | LLVM_DEBUG( |
1556 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
1557 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1558 | return std::nullopt; |
1559 | } |
1560 | |
1561 | std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
1562 | Type *ElemTyB, Value *PtrB, |
1563 | const DataLayout &DL, |
1564 | ScalarEvolution &SE, bool StrictCheck, |
1565 | bool CheckType) { |
1566 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
1567 | |
1568 | // Make sure that A and B are different pointers. |
1569 | if (PtrA == PtrB) |
1570 | return 0; |
1571 | |
1572 | // Make sure that the element types are the same if required. |
1573 | if (CheckType && ElemTyA != ElemTyB) |
1574 | return std::nullopt; |
1575 | |
1576 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
1577 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
1578 | |
1579 | // Check that the address spaces match. |
1580 | if (ASA != ASB) |
1581 | return std::nullopt; |
1582 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1583 | |
1584 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
1585 | Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA); |
1586 | Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetB); |
1587 | |
1588 | int Val; |
1589 | if (PtrA1 == PtrB1) { |
1590 | // Retrieve the address space again as pointer stripping now tracks through |
1591 | // `addrspacecast`. |
1592 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
1593 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
1594 | // Check that the address spaces match and that the pointers are valid. |
1595 | if (ASA != ASB) |
1596 | return std::nullopt; |
1597 | |
1598 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1599 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
1600 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
1601 | |
1602 | OffsetB -= OffsetA; |
1603 | Val = OffsetB.getSExtValue(); |
1604 | } else { |
1605 | // Otherwise compute the distance with SCEV between the base pointers. |
1606 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
1607 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
1608 | const auto *Diff = |
1609 | dyn_cast<SCEVConstant>(Val: SE.getMinusSCEV(LHS: PtrSCEVB, RHS: PtrSCEVA)); |
1610 | if (!Diff) |
1611 | return std::nullopt; |
1612 | Val = Diff->getAPInt().getSExtValue(); |
1613 | } |
1614 | int Size = DL.getTypeStoreSize(Ty: ElemTyA); |
1615 | int Dist = Val / Size; |
1616 | |
// Ensure that the calculated distance matches the type-based one after all
// bitcasts have been stripped from the provided pointers.
1619 | if (!StrictCheck || Dist * Size == Val) |
1620 | return Dist; |
1621 | return std::nullopt; |
1622 | } |
1623 | |
1624 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
1625 | const DataLayout &DL, ScalarEvolution &SE, |
1626 | SmallVectorImpl<unsigned> &SortedIndices) { |
1627 | assert(llvm::all_of( |
1628 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
1629 | "Expected list of pointer operands." ); |
1630 | // Walk over the pointers, and map each of them to an offset relative to |
1631 | // first pointer in the array. |
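// Illustrative sketch (hypothetical input): for i32 pointers at element
// offsets {0, 2, 1} relative to VL[0], the sorted order is VL[0], VL[2],
// VL[1], so SortedIndices becomes {0, 2, 1}.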
1632 | Value *Ptr0 = VL[0]; |
1633 | |
1634 | using DistOrdPair = std::pair<int64_t, int>; |
1635 | auto Compare = llvm::less_first(); |
1636 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
1637 | Offsets.emplace(args: 0, args: 0); |
1638 | int Cnt = 1; |
1639 | bool IsConsecutive = true; |
1640 | for (auto *Ptr : VL.drop_front()) { |
1641 | std::optional<int> Diff = getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
1642 | /*StrictCheck=*/true); |
1643 | if (!Diff) |
1644 | return false; |
1645 | |
// Check whether a pointer with the same offset was already seen.
1647 | int64_t Offset = *Diff; |
1648 | auto Res = Offsets.emplace(args&: Offset, args&: Cnt); |
1649 | if (!Res.second) |
1650 | return false; |
1651 | // Consecutive order if the inserted element is the last one. |
1652 | IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end(); |
1653 | ++Cnt; |
1654 | } |
1655 | SortedIndices.clear(); |
1656 | if (!IsConsecutive) { |
1657 | // Fill SortedIndices array only if it is non-consecutive. |
1658 | SortedIndices.resize(N: VL.size()); |
1659 | Cnt = 0; |
1660 | for (const std::pair<int64_t, int> &Pair : Offsets) { |
1661 | SortedIndices[Cnt] = Pair.second; |
1662 | ++Cnt; |
1663 | } |
1664 | } |
1665 | return true; |
1666 | } |
1667 | |
1668 | /// Returns true if the memory operations \p A and \p B are consecutive. |
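/// For illustration (hypothetical accesses, not from the original comment):
/// two i32 loads of a[i] and a[i+1] have a pointer distance of exactly one
/// element and are therefore consecutive.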
1669 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
1670 | ScalarEvolution &SE, bool CheckType) { |
1671 | Value *PtrA = getLoadStorePointerOperand(V: A); |
1672 | Value *PtrB = getLoadStorePointerOperand(V: B); |
1673 | if (!PtrA || !PtrB) |
1674 | return false; |
1675 | Type *ElemTyA = getLoadStoreType(I: A); |
1676 | Type *ElemTyB = getLoadStoreType(I: B); |
1677 | std::optional<int> Diff = |
1678 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
1679 | /*StrictCheck=*/true, CheckType); |
1680 | return Diff && *Diff == 1; |
1681 | } |
1682 | |
1683 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
1684 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1685 | AddPointer: [this, SI](Value *Ptr) { |
1686 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
1687 | InstMap.push_back(Elt: SI); |
1688 | ++AccessIdx; |
1689 | }); |
1690 | } |
1691 | |
1692 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
1693 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1694 | AddPointer: [this, LI](Value *Ptr) { |
1695 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
1696 | InstMap.push_back(Elt: LI); |
1697 | ++AccessIdx; |
1698 | }); |
1699 | } |
1700 | |
1701 | MemoryDepChecker::VectorizationSafetyStatus |
1702 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
1703 | switch (Type) { |
1704 | case NoDep: |
1705 | case Forward: |
1706 | case BackwardVectorizable: |
1707 | return VectorizationSafetyStatus::Safe; |
1708 | |
1709 | case Unknown: |
1710 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
1711 | case ForwardButPreventsForwarding: |
1712 | case Backward: |
1713 | case BackwardVectorizableButPreventsForwarding: |
1714 | case IndirectUnsafe: |
1715 | return VectorizationSafetyStatus::Unsafe; |
1716 | } |
1717 | llvm_unreachable("unexpected DepType!" ); |
1718 | } |
1719 | |
1720 | bool MemoryDepChecker::Dependence::isBackward() const { |
1721 | switch (Type) { |
1722 | case NoDep: |
1723 | case Forward: |
1724 | case ForwardButPreventsForwarding: |
1725 | case Unknown: |
1726 | case IndirectUnsafe: |
1727 | return false; |
1728 | |
1729 | case BackwardVectorizable: |
1730 | case Backward: |
1731 | case BackwardVectorizableButPreventsForwarding: |
1732 | return true; |
1733 | } |
1734 | llvm_unreachable("unexpected DepType!" ); |
1735 | } |
1736 | |
1737 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
1738 | return isBackward() || Type == Unknown; |
1739 | } |
1740 | |
1741 | bool MemoryDepChecker::Dependence::isForward() const { |
1742 | switch (Type) { |
1743 | case Forward: |
1744 | case ForwardButPreventsForwarding: |
1745 | return true; |
1746 | |
1747 | case NoDep: |
1748 | case Unknown: |
1749 | case BackwardVectorizable: |
1750 | case Backward: |
1751 | case BackwardVectorizableButPreventsForwarding: |
1752 | case IndirectUnsafe: |
1753 | return false; |
1754 | } |
1755 | llvm_unreachable("unexpected DepType!" ); |
1756 | } |
1757 | |
1758 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
1759 | uint64_t TypeByteSize) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
1764 | // a[i] = a[i-3] ^ a[i-8]; |
1765 | // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and |
1766 | // hence on your typical architecture store-load forwarding does not take |
1767 | // place. Vectorizing in such cases does not make sense. |
1768 | // Store-load forwarding distance. |
1769 | |
1770 | // After this many iterations store-to-load forwarding conflicts should not |
1771 | // cause any slowdowns. |
1772 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
1773 | // Maximum vector factor. |
1774 | uint64_t MaxVFWithoutSLForwardIssues = std::min( |
1775 | a: VectorizerParams::MaxVectorWidth * TypeByteSize, b: MinDepDistBytes); |
1776 | |
1777 | // Compute the smallest VF at which the store and load would be misaligned. |
1778 | for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues; |
1779 | VF *= 2) { |
// If the number of vector iterations between the store and the load is
// small, we could incur conflicts.
1782 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
1783 | MaxVFWithoutSLForwardIssues = (VF >> 1); |
1784 | break; |
1785 | } |
1786 | } |
1787 | |
1788 | if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) { |
1789 | LLVM_DEBUG( |
1790 | dbgs() << "LAA: Distance " << Distance |
1791 | << " that could cause a store-load forwarding conflict\n" ); |
1792 | return true; |
1793 | } |
1794 | |
1795 | if (MaxVFWithoutSLForwardIssues < MinDepDistBytes && |
1796 | MaxVFWithoutSLForwardIssues != |
1797 | VectorizerParams::MaxVectorWidth * TypeByteSize) |
1798 | MinDepDistBytes = MaxVFWithoutSLForwardIssues; |
1799 | return false; |
1800 | } |
1801 | |
1802 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
1803 | if (Status < S) |
1804 | Status = S; |
1805 | } |
1806 | |
/// Given a dependence-distance \p Dist between two
/// memory accesses that have the same stride, whose absolute value is given
/// in \p Stride, and that have the same type size \p TypeByteSize,
/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811 | /// possible to prove statically that the dependence distance is larger |
1812 | /// than the range that the accesses will travel through the execution of |
1813 | /// the loop. If so, return true; false otherwise. This is useful for |
1814 | /// example in loops such as the following (PR31098): |
1815 | /// for (i = 0; i < D; ++i) { |
1816 | /// = out[i]; |
1817 | /// out[i+D] = |
1818 | /// } |
1819 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
1820 | const SCEV &BackedgeTakenCount, |
1821 | const SCEV &Dist, uint64_t Stride, |
1822 | uint64_t TypeByteSize) { |
1823 | |
1824 | // If we can prove that |
1825 | // (**) |Dist| > BackedgeTakenCount * Step |
1826 | // where Step is the absolute stride of the memory accesses in bytes, |
1827 | // then there is no dependence. |
1828 | // |
1829 | // Rationale: |
1830 | // We basically want to check if the absolute distance (|Dist/Step|) |
1831 | // is >= the loop iteration count (or > BackedgeTakenCount). |
// This is equivalent to the Strong SIV Test (Practical Dependence Testing,
// Section 4.2.1). Note that for vectorization it is sufficient to prove
// that the dependence distance is >= VF; this is checked elsewhere.
1835 | // But in some cases we can prune dependence distances early, and |
1836 | // even before selecting the VF, and without a runtime test, by comparing |
1837 | // the distance against the loop iteration count. Since the vectorized code |
1838 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
1839 | // also guarantees that distance >= VF. |
1840 | // |
1841 | const uint64_t ByteStride = Stride * TypeByteSize; |
1842 | const SCEV *Step = SE.getConstant(Ty: BackedgeTakenCount.getType(), V: ByteStride); |
1843 | const SCEV *Product = SE.getMulExpr(LHS: &BackedgeTakenCount, RHS: Step); |
1844 | |
1845 | const SCEV *CastedDist = &Dist; |
1846 | const SCEV *CastedProduct = Product; |
1847 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
1848 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
1849 | |
1850 | // The dependence distance can be positive/negative, so we sign extend Dist; |
1851 | // The multiplication of the absolute stride in bytes and the |
1852 | // backedgeTakenCount is non-negative, so we zero extend Product. |
1853 | if (DistTypeSizeBits > ProductTypeSizeBits) |
1854 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
1855 | else |
1856 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
1857 | |
1858 | // Is Dist - (BackedgeTakenCount * Step) > 0 ? |
1859 | // (If so, then we have proven (**) because |Dist| >= Dist) |
1860 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
1861 | if (SE.isKnownPositive(S: Minus)) |
1862 | return true; |
1863 | |
1864 | // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ? |
1865 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
1866 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
1867 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
1868 | if (SE.isKnownPositive(S: Minus)) |
1869 | return true; |
1870 | |
1871 | return false; |
1872 | } |
1873 | |
/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is the type size
/// in bytes.
1877 | /// |
1878 | /// \returns true if they are independent. |
1879 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
1880 | uint64_t TypeByteSize) { |
1881 | assert(Stride > 1 && "The stride must be greater than 1" ); |
1882 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
1883 | assert(Distance > 0 && "The distance must be non-zero" ); |
1884 | |
// Skip if the distance is not a multiple of the type byte size.
1886 | if (Distance % TypeByteSize) |
1887 | return false; |
1888 | |
1889 | uint64_t ScaledDist = Distance / TypeByteSize; |
1890 | |
// No dependence if the scaled distance is not a multiple of the stride.
1892 | // E.g. |
1893 | // for (i = 0; i < 1024 ; i += 4) |
1894 | // A[i+2] = A[i] + 1; |
1895 | // |
1896 | // Two accesses in memory (scaled distance is 2, stride is 4): |
1897 | // | A[0] | | | | A[4] | | | | |
1898 | // | | | A[2] | | | | A[6] | | |
1899 | // |
1900 | // E.g. |
1901 | // for (i = 0; i < 1024 ; i += 3) |
1902 | // A[i+4] = A[i] + 1; |
1903 | // |
1904 | // Two accesses in memory (scaled distance is 4, stride is 3): |
1905 | // | A[0] | | | A[3] | | | A[6] | | | |
1906 | // | | | | | A[4] | | | A[7] | | |
1907 | return ScaledDist % Stride; |
1908 | } |
1909 | |
1910 | /// Returns true if any of the underlying objects has a loop varying address, |
1911 | /// i.e. may change in \p L. |
1912 | static bool |
1913 | isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects, |
1914 | ScalarEvolution &SE, const Loop *L) { |
1915 | return any_of(Range&: UnderlyingObjects, P: [&SE, L](const Value *UO) { |
1916 | return !SE.isLoopInvariant(S: SE.getSCEV(V: const_cast<Value *>(UO)), L); |
1917 | }); |
1918 | } |
1919 | |
// Get the dependence distance, stride, type size and whether the accesses are
// writes for the dependence between A and B. Outlined to a helper to limit
// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and
// others. Returns either the dependence result, if it could already be
// determined, or a tuple with (Distance, Stride, TypeSize, AIsWrite,
// BIsWrite).
1926 | static std::variant<MemoryDepChecker::Dependence::DepType, |
1927 | std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>> |
1928 | getDependenceDistanceStrideAndSize( |
1929 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
1930 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst, |
1931 | const DenseMap<Value *, const SCEV *> &Strides, |
1932 | const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects, |
1933 | PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) { |
1934 | auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); |
1935 | auto &SE = *PSE.getSE(); |
1936 | auto [APtr, AIsWrite] = A; |
1937 | auto [BPtr, BIsWrite] = B; |
1938 | |
1939 | // Two reads are independent. |
1940 | if (!AIsWrite && !BIsWrite) |
1941 | return MemoryDepChecker::Dependence::NoDep; |
1942 | |
1943 | Type *ATy = getLoadStoreType(I: AInst); |
1944 | Type *BTy = getLoadStoreType(I: BInst); |
1945 | |
1946 | // We cannot check pointers in different address spaces. |
1947 | if (APtr->getType()->getPointerAddressSpace() != |
1948 | BPtr->getType()->getPointerAddressSpace()) |
1949 | return MemoryDepChecker::Dependence::Unknown; |
1950 | |
1951 | int64_t StrideAPtr = |
1952 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: Strides, Assume: true).value_or(u: 0); |
1953 | int64_t StrideBPtr = |
1954 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: Strides, Assume: true).value_or(u: 0); |
1955 | |
1956 | const SCEV *Src = PSE.getSCEV(V: APtr); |
1957 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
1958 | |
1959 | // If the induction step is negative we have to invert source and sink of the |
1960 | // dependence when measuring the distance between them. We should not swap |
1961 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
1962 | if (StrideAPtr < 0) { |
1963 | std::swap(a&: Src, b&: Sink); |
1964 | std::swap(a&: AInst, b&: BInst); |
1965 | } |
1966 | |
1967 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
1968 | |
1969 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
1970 | << "(Induction step: " << StrideAPtr << ")\n" ); |
1971 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
1972 | << ": " << *Dist << "\n" ); |
1973 | |
// We need accesses where the addresses of the accessed underlying objects do
// not change within the loop.
1976 | if (isLoopVariantIndirectAddress(UnderlyingObjects: UnderlyingObjects.find(Val: APtr)->second, SE, |
1977 | L: InnermostLoop) || |
1978 | isLoopVariantIndirectAddress(UnderlyingObjects: UnderlyingObjects.find(Val: BPtr)->second, SE, |
1979 | L: InnermostLoop)) |
1980 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
1981 | |
1982 | // Need accesses with constant stride. We don't want to vectorize |
1983 | // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap |
1984 | // in the address space. |
1985 | if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) { |
1986 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
1987 | return MemoryDepChecker::Dependence::Unknown; |
1988 | } |
1989 | |
1990 | uint64_t TypeByteSize = DL.getTypeAllocSize(Ty: ATy); |
1991 | bool HasSameSize = |
1992 | DL.getTypeStoreSizeInBits(Ty: ATy) == DL.getTypeStoreSizeInBits(Ty: BTy); |
1993 | if (!HasSameSize) |
1994 | TypeByteSize = 0; |
1995 | uint64_t Stride = std::abs(i: StrideAPtr); |
1996 | return std::make_tuple(args&: Dist, args&: Stride, args&: TypeByteSize, args&: AIsWrite, args&: BIsWrite); |
1997 | } |
1998 | |
1999 | MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( |
2000 | const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, |
2001 | unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides, |
2002 | const DenseMap<Value *, SmallVector<const Value *, 16>> |
2003 | &UnderlyingObjects) { |
2004 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
2005 | |
// Get the dependence distance, stride, type size and which accesses are
// writes for the dependence between A and B.
2008 | auto Res = getDependenceDistanceStrideAndSize( |
2009 | A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx], Strides, UnderlyingObjects, PSE, |
2010 | InnermostLoop); |
2011 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
2012 | return std::get<Dependence::DepType>(v&: Res); |
2013 | |
2014 | const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] = |
2015 | std::get<std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>>(v&: Res); |
2016 | bool HasSameSize = TypeByteSize > 0; |
2017 | |
2018 | ScalarEvolution &SE = *PSE.getSE(); |
2019 | auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); |
2020 | if (!isa<SCEVCouldNotCompute>(Val: Dist) && HasSameSize && |
2021 | isSafeDependenceDistance(DL, SE, BackedgeTakenCount: *(PSE.getBackedgeTakenCount()), Dist: *Dist, |
2022 | Stride, TypeByteSize)) |
2023 | return Dependence::NoDep; |
2024 | |
2025 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Dist); |
2026 | if (!C) { |
2027 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n" ); |
2028 | FoundNonConstantDistanceDependence = true; |
2029 | return Dependence::Unknown; |
2030 | } |
2031 | |
2032 | const APInt &Val = C->getAPInt(); |
2033 | int64_t Distance = Val.getSExtValue(); |
2034 | |
2035 | // Attempt to prove strided accesses independent. |
2036 | if (std::abs(i: Distance) > 0 && Stride > 1 && HasSameSize && |
2037 | areStridedAccessesIndependent(Distance: std::abs(i: Distance), Stride, TypeByteSize)) { |
2038 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
2039 | return Dependence::NoDep; |
2040 | } |
2041 | |
2042 | // Negative distances are not plausible dependencies. |
2043 | if (Val.isNegative()) { |
2044 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
2045 | // There is no need to update MaxSafeVectorWidthInBits after call to |
2046 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, |
2047 | // since a forward dependency will allow vectorization using any width. |
2048 | if (IsTrueDataDependence && EnableForwardingConflictDetection && |
2049 | (!HasSameSize || couldPreventStoreLoadForward(Distance: Val.abs().getZExtValue(), |
2050 | TypeByteSize))) { |
2051 | LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
2052 | return Dependence::ForwardButPreventsForwarding; |
2053 | } |
2054 | |
2055 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
2056 | return Dependence::Forward; |
2057 | } |
2058 | |
2059 | // Write to the same location with the same size. |
2060 | if (Val == 0) { |
2061 | if (HasSameSize) |
2062 | return Dependence::Forward; |
2063 | LLVM_DEBUG( |
2064 | dbgs() << "LAA: Zero dependence difference but different type sizes\n" ); |
2065 | return Dependence::Unknown; |
2066 | } |
2067 | |
2068 | assert(Val.isStrictlyPositive() && "Expect a positive value" ); |
2069 | |
2070 | if (!HasSameSize) { |
2071 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
2072 | "different type sizes\n" ); |
2073 | return Dependence::Unknown; |
2074 | } |
2075 | |
2076 | // Bail out early if passed-in parameters make vectorization not feasible. |
2077 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
2078 | VectorizerParams::VectorizationFactor : 1); |
2079 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
2080 | VectorizerParams::VectorizationInterleave : 1); |
2081 | // The minimum number of iterations for a vectorized/unrolled version. |
2082 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
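// (Illustrative: a forced vectorization factor of 4 with a forced
// interleave count of 2 gives MinNumIter = 8.)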
2083 | |
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
// TypeByteSize (no need to add the last gap distance).
2088 | // |
2089 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2090 | // foo(int *A) { |
2091 | // int *B = (int *)((char *)A + 14); |
2092 | // for (i = 0 ; i < 1024 ; i += 2) |
2093 | // B[i] = A[i] + 1; |
2094 | // } |
2095 | // |
2096 | // Two accesses in memory (stride is 2): |
2097 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
2098 | // | B[0] | | B[2] | | B[4] | |
2099 | // |
2100 | // Distance needs for vectorizing iterations except the last iteration: |
2101 | // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4. |
2102 | // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4. |
2103 | // |
2104 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
2105 | // 12, which is less than distance. |
2106 | // |
2107 | // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), |
2108 | // the minimum distance needed is 28, which is greater than distance. It is |
2109 | // not safe to do vectorization. |
2110 | uint64_t MinDistanceNeeded = |
2111 | TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize; |
2112 | if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) { |
2113 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance " |
2114 | << Distance << '\n'); |
2115 | return Dependence::Backward; |
2116 | } |
2117 | |
// Unsafe if the minimum distance needed is greater than the smallest
// recorded dependence distance.
2120 | if (MinDistanceNeeded > MinDepDistBytes) { |
2121 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
2122 | << MinDistanceNeeded << " size in bytes\n" ); |
2123 | return Dependence::Backward; |
2124 | } |
2125 | |
2126 | // Positive distance bigger than max vectorization factor. |
2127 | // FIXME: Should use max factor instead of max distance in bytes, which could |
2128 | // not handle different types. |
2129 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2130 | // void foo (int *A, char *B) { |
2131 | // for (unsigned i = 0; i < 1024; i++) { |
2132 | // A[i+2] = A[i] + 1; |
2133 | // B[i+2] = B[i] + 1; |
2134 | // } |
2135 | // } |
2136 | // |
// This case is currently unsafe according to the max safe distance. If we
// analyze the two accesses on array B, the max safe dependence distance
// is 2. Then we analyze the accesses on array A, where the minimum distance
// needed is 8. Since 8 is greater than 2, vectorization is forbidden, even
// though both A and B could actually be vectorized with a factor of 2.
2142 | MinDepDistBytes = |
2143 | std::min(a: static_cast<uint64_t>(Distance), b: MinDepDistBytes); |
2144 | |
2145 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
2146 | uint64_t MinDepDistBytesOld = MinDepDistBytes; |
2147 | if (IsTrueDataDependence && EnableForwardingConflictDetection && |
2148 | couldPreventStoreLoadForward(Distance, TypeByteSize)) { |
2149 | // Sanity check that we didn't update MinDepDistBytes when calling |
2150 | // couldPreventStoreLoadForward |
2151 | assert(MinDepDistBytes == MinDepDistBytesOld && |
2152 | "An update to MinDepDistBytes requires an update to " |
2153 | "MaxSafeVectorWidthInBits" ); |
2154 | (void)MinDepDistBytesOld; |
2155 | return Dependence::BackwardVectorizableButPreventsForwarding; |
2156 | } |
2157 | |
2158 | // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits |
2159 | // since there is a backwards dependency. |
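// Worked example (illustrative): MinDepDistBytes = 16 with TypeByteSize = 4
// and Stride = 2 gives MaxVF = 2, i.e. MaxSafeVectorWidthInBits <= 64.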
2160 | uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride); |
2161 | LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() |
2162 | << " with max VF = " << MaxVF << '\n'); |
2163 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
2164 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
2165 | return Dependence::BackwardVectorizable; |
2166 | } |
2167 | |
2168 | bool MemoryDepChecker::areDepsSafe( |
2169 | DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, |
2170 | const DenseMap<Value *, const SCEV *> &Strides, |
2171 | const DenseMap<Value *, SmallVector<const Value *, 16>> |
2172 | &UnderlyingObjects) { |
2173 | |
2174 | MinDepDistBytes = -1; |
2175 | SmallPtrSet<MemAccessInfo, 8> Visited; |
2176 | for (MemAccessInfo CurAccess : CheckDeps) { |
2177 | if (Visited.count(Ptr: CurAccess)) |
2178 | continue; |
2179 | |
2180 | // Get the relevant memory access set. |
2181 | EquivalenceClasses<MemAccessInfo>::iterator I = |
2182 | AccessSets.findValue(V: AccessSets.getLeaderValue(V: CurAccess)); |
2183 | |
2184 | // Check accesses within this set. |
2185 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
2186 | AccessSets.member_begin(I); |
2187 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
2188 | AccessSets.member_end(); |
2189 | |
2190 | // Check every access pair. |
2191 | while (AI != AE) { |
2192 | Visited.insert(Ptr: *AI); |
2193 | bool AIIsWrite = AI->getInt(); |
// Check loads only against the next members of the equivalence class,
// but check stores also against other stores to the same address in
// the same equivalence class.
2196 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
2197 | (AIIsWrite ? AI : std::next(x: AI)); |
2198 | while (OI != AE) { |
2199 | // Check every accessing instruction pair in program order. |
2200 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
2201 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
// Scan all accesses of the other member, but when a member is checked
// against itself, scan only the accesses that follow I1.
2204 | for (std::vector<unsigned>::iterator |
2205 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
2206 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
2207 | I2 != I2E; ++I2) { |
2208 | auto A = std::make_pair(x: &*AI, y&: *I1); |
2209 | auto B = std::make_pair(x: &*OI, y&: *I2); |
2210 | |
2211 | assert(*I1 != *I2); |
2212 | if (*I1 > *I2) |
2213 | std::swap(x&: A, y&: B); |
2214 | |
2215 | Dependence::DepType Type = |
2216 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second, Strides, |
2217 | UnderlyingObjects); |
2218 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
2219 | |
2220 | // Gather dependences unless we accumulated MaxDependences |
2221 | // dependences. In that case return as soon as we find the first |
2222 | // unsafe dependence. This puts a limit on this quadratic |
2223 | // algorithm. |
2224 | if (RecordDependences) { |
2225 | if (Type != Dependence::NoDep) |
2226 | Dependences.push_back(Elt: Dependence(A.second, B.second, Type)); |
2227 | |
2228 | if (Dependences.size() >= MaxDependences) { |
2229 | RecordDependences = false; |
2230 | Dependences.clear(); |
2231 | LLVM_DEBUG(dbgs() |
2232 | << "Too many dependences, stopped recording\n" ); |
2233 | } |
2234 | } |
2235 | if (!RecordDependences && !isSafeForVectorization()) |
2236 | return false; |
2237 | } |
2238 | ++OI; |
2239 | } |
2240 | AI++; |
2241 | } |
2242 | } |
2243 | |
2244 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
2245 | return isSafeForVectorization(); |
2246 | } |
2247 | |
2248 | SmallVector<Instruction *, 4> |
2249 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const { |
2250 | MemAccessInfo Access(Ptr, isWrite); |
2251 | auto &IndexVector = Accesses.find(Val: Access)->second; |
2252 | |
2253 | SmallVector<Instruction *, 4> Insts; |
2254 | transform(Range: IndexVector, |
2255 | d_first: std::back_inserter(x&: Insts), |
2256 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
2257 | return Insts; |
2258 | } |
2259 | |
2260 | const char *MemoryDepChecker::Dependence::DepName[] = { |
2261 | "NoDep" , |
2262 | "Unknown" , |
2263 | "IndidrectUnsafe" , |
2264 | "Forward" , |
2265 | "ForwardButPreventsForwarding" , |
2266 | "Backward" , |
2267 | "BackwardVectorizable" , |
2268 | "BackwardVectorizableButPreventsForwarding" }; |
2269 | |
2270 | void MemoryDepChecker::Dependence::print( |
2271 | raw_ostream &OS, unsigned Depth, |
2272 | const SmallVectorImpl<Instruction *> &Instrs) const { |
2273 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
2274 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
2275 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
2276 | } |
2277 | |
2278 | bool LoopAccessInfo::canAnalyzeLoop() { |
2279 | // We need to have a loop header. |
2280 | LLVM_DEBUG(dbgs() << "LAA: Found a loop in " |
2281 | << TheLoop->getHeader()->getParent()->getName() << ": " |
2282 | << TheLoop->getHeader()->getName() << '\n'); |
2283 | |
2284 | // We can only analyze innermost loops. |
2285 | if (!TheLoop->isInnermost()) { |
2286 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
2287 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
2288 | return false; |
2289 | } |
2290 | |
2291 | // We must have a single backedge. |
2292 | if (TheLoop->getNumBackEdges() != 1) { |
2293 | LLVM_DEBUG( |
2294 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
2295 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
2296 | << "loop control flow is not understood by analyzer" ; |
2297 | return false; |
2298 | } |
2299 | |
2300 | // ScalarEvolution needs to be able to find the exit count. |
2301 | const SCEV *ExitCount = PSE->getBackedgeTakenCount(); |
2302 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
2303 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
2304 | << "could not determine number of loop iterations" ; |
2305 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
2306 | return false; |
2307 | } |
2308 | |
2309 | return true; |
2310 | } |
2311 | |
2312 | void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, |
2313 | const TargetLibraryInfo *TLI, |
2314 | DominatorTree *DT) { |
2315 | // Holds the Load and Store instructions. |
2316 | SmallVector<LoadInst *, 16> Loads; |
2317 | SmallVector<StoreInst *, 16> Stores; |
2318 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
2319 | |
2320 | // Holds all the different accesses in the loop. |
2321 | unsigned NumReads = 0; |
2322 | unsigned NumReadWrites = 0; |
2323 | |
2324 | bool HasComplexMemInst = false; |
2325 | |
2326 | // A runtime check is only legal to insert if there are no convergent calls. |
2327 | HasConvergentOp = false; |
2328 | |
2329 | PtrRtChecking->Pointers.clear(); |
2330 | PtrRtChecking->Need = false; |
2331 | |
2332 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
2333 | |
2334 | const bool EnableMemAccessVersioningOfLoop = |
2335 | EnableMemAccessVersioning && |
2336 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
2337 | |
2338 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
2339 | // loop info, as it may be arbitrary. |
2340 | LoopBlocksRPO RPOT(TheLoop); |
2341 | RPOT.perform(LI); |
2342 | for (BasicBlock *BB : RPOT) { |
2343 | // Scan the BB and collect legal loads and stores. Also detect any |
2344 | // convergent instructions. |
2345 | for (Instruction &I : *BB) { |
2346 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
2347 | if (Call->isConvergent()) |
2348 | HasConvergentOp = true; |
2349 | } |
2350 | |
// If both a non-vectorizable memory instruction and a convergent
// operation have been found in this loop, there is no reason to
// continue the search.
2353 | if (HasComplexMemInst && HasConvergentOp) { |
2354 | CanVecMem = false; |
2355 | return; |
2356 | } |
2357 | |
2358 | // Avoid hitting recordAnalysis multiple times. |
2359 | if (HasComplexMemInst) |
2360 | continue; |
2361 | |
2362 | // Record alias scopes defined inside the loop. |
2363 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
2364 | for (Metadata *Op : Decl->getScopeList()->operands()) |
2365 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
2366 | |
2367 | // Many math library functions read the rounding mode. We will only |
2368 | // vectorize a loop if it contains known function calls that don't set |
2369 | // the flag. Therefore, it is safe to ignore this read from memory. |
2370 | auto *Call = dyn_cast<CallInst>(Val: &I); |
2371 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
2372 | continue; |
2373 | |
2374 | // If this is a load, save it. If this instruction can read from memory |
2375 | // but is not a load, then we quit. Notice that we don't handle function |
2376 | // calls that read or write. |
2377 | if (I.mayReadFromMemory()) { |
2378 | // If the function has an explicit vectorized counterpart, we can safely |
2379 | // assume that it can be vectorized. |
2380 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
2381 | !VFDatabase::getMappings(CI: *Call).empty()) |
2382 | continue; |
2383 | |
2384 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
2385 | if (!Ld) { |
2386 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
2387 | << "instruction cannot be vectorized" ; |
2388 | HasComplexMemInst = true; |
2389 | continue; |
2390 | } |
2391 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
2392 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
2393 | << "read with atomic ordering or volatile read" ; |
2394 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
2395 | HasComplexMemInst = true; |
2396 | continue; |
2397 | } |
2398 | NumLoads++; |
2399 | Loads.push_back(Elt: Ld); |
2400 | DepChecker->addAccess(LI: Ld); |
2401 | if (EnableMemAccessVersioningOfLoop) |
2402 | collectStridedAccess(LoadOrStoreInst: Ld); |
2403 | continue; |
2404 | } |
2405 | |
2406 | // Save 'store' instructions. Abort if other instructions write to memory. |
2407 | if (I.mayWriteToMemory()) { |
2408 | auto *St = dyn_cast<StoreInst>(Val: &I); |
2409 | if (!St) { |
2410 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
2411 | << "instruction cannot be vectorized" ; |
2412 | HasComplexMemInst = true; |
2413 | continue; |
2414 | } |
2415 | if (!St->isSimple() && !IsAnnotatedParallel) { |
2416 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
2417 | << "write with atomic ordering or volatile write" ; |
2418 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
2419 | HasComplexMemInst = true; |
2420 | continue; |
2421 | } |
2422 | NumStores++; |
2423 | Stores.push_back(Elt: St); |
2424 | DepChecker->addAccess(SI: St); |
2425 | if (EnableMemAccessVersioningOfLoop) |
2426 | collectStridedAccess(LoadOrStoreInst: St); |
2427 | } |
2428 | } // Next instr. |
2429 | } // Next block. |
2430 | |
2431 | if (HasComplexMemInst) { |
2432 | CanVecMem = false; |
2433 | return; |
2434 | } |
2435 | |
2436 | // Now we have two lists that hold the loads and the stores. |
2437 | // Next, we find the pointers that they use. |
2438 | |
2439 | // Check if we see any stores. If there are no stores, then we don't |
2440 | // care if the pointers are *restrict*. |
2441 | if (!Stores.size()) { |
2442 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
2443 | CanVecMem = true; |
2444 | return; |
2445 | } |
2446 | |
2447 | MemoryDepChecker::DepCandidates DependentAccesses; |
2448 | AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE, |
2449 | LoopAliasScopes); |
2450 | |
2451 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
2452 | // multiple times on the same object. If the ptr is accessed twice, once |
2453 | // for read and once for write, it will only appear once (on the write |
2454 | // list). This is okay, since we are going to check for conflicts between |
2455 | // writes and between reads and writes, but not between reads and reads. |
2456 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
2457 | |
2458 | // Record uniform store addresses to identify if we have multiple stores |
2459 | // to the same address. |
2460 | SmallPtrSet<Value *, 16> UniformStores; |
2461 | |
2462 | for (StoreInst *ST : Stores) { |
2463 | Value *Ptr = ST->getPointerOperand(); |
2464 | |
2465 | if (isInvariant(V: Ptr)) { |
2466 | // Record store instructions to loop invariant addresses |
2467 | StoresToInvariantAddresses.push_back(Elt: ST); |
2468 | HasDependenceInvolvingLoopInvariantAddress |= |
2469 | !UniformStores.insert(Ptr).second; |
2470 | } |
2471 | |
2472 | // If we did *not* see this pointer before, insert it to the read-write |
2473 | // list. At this phase it is only a 'write' list. |
2474 | Type *AccessTy = getLoadStoreType(I: ST); |
2475 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
2476 | ++NumReadWrites; |
2477 | |
2478 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
2479 | // The TBAA metadata could have a control dependency on the predication |
2480 | // condition, so we cannot rely on it when determining whether or not we |
2481 | // need runtime pointer checks. |
2482 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
2483 | Loc.AATags.TBAA = nullptr; |
2484 | |
2485 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2486 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
2487 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2488 | Accesses.addStore(Loc&: NewLoc, AccessTy); |
2489 | }); |
2490 | } |
2491 | } |
2492 | |
2493 | if (IsAnnotatedParallel) { |
2494 | LLVM_DEBUG( |
2495 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
2496 | << "checks.\n" ); |
2497 | CanVecMem = true; |
2498 | return; |
2499 | } |
2500 | |
2501 | for (LoadInst *LD : Loads) { |
2502 | Value *Ptr = LD->getPointerOperand(); |
2503 | // If we did *not* see this pointer before, insert it to the |
2504 | // read list. If we *did* see it before, then it is already in |
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
2508 | // If the address of i is unknown (for example A[B[i]]) then we may |
2509 | // read a few words, modify, and write a few words, and some of the |
2510 | // words may be written to the same address. |
2511 | bool IsReadOnlyPtr = false; |
2512 | Type *AccessTy = getLoadStoreType(I: LD); |
2513 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
2514 | !getPtrStride(PSE&: *PSE, AccessTy: LD->getType(), Ptr, Lp: TheLoop, StridesMap: SymbolicStrides).value_or(u: 0)) { |
2515 | ++NumReads; |
2516 | IsReadOnlyPtr = true; |
2517 | } |
2518 | |
// See if there is an unsafe dependency between a load from a uniform address
// and a store to the same uniform address.
2521 | if (UniformStores.count(Ptr)) { |
2522 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
2523 | "load and uniform store to the same address!\n" ); |
2524 | HasDependenceInvolvingLoopInvariantAddress = true; |
2525 | } |
2526 | |
2527 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
2528 | // The TBAA metadata could have a control dependency on the predication |
2529 | // condition, so we cannot rely on it when determining whether or not we |
2530 | // need runtime pointer checks. |
2531 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
2532 | Loc.AATags.TBAA = nullptr; |
2533 | |
2534 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2535 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
2536 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2537 | Accesses.addLoad(Loc&: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
2538 | }); |
2539 | } |
2540 | |
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
2543 | if (NumReadWrites == 1 && NumReads == 0) { |
2544 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
2545 | CanVecMem = true; |
2546 | return; |
2547 | } |

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  Value *UncomputablePtr = nullptr;
  bool CanDoRTIfNeeded =
      Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
                               SymbolicStrides, UncomputablePtr, false);
  if (!CanDoRTIfNeeded) {
    auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
    recordAnalysis("CantIdentifyArrayBounds", I)
        << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    CanVecMem = false;
    return;
  }

  LLVM_DEBUG(
      dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

  CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    CanVecMem = DepChecker->areDepsSafe(
        DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
        Accesses.getUnderlyingObjects());

    if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks(*DepChecker);

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      auto *SE = PSE->getSE();
      UncomputablePtr = nullptr;
      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
          *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);

      // Check that we found the bounds for the pointer.
      if (!CanDoRTIfNeeded) {
        auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
        recordAnalysis("CantCheckMemDepsAtRunTime", I)
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        CanVecMem = false;
        return;
      }

      CanVecMem = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    CanVecMem = false;
    return;
  }

  if (CanVecMem)
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
  else
    emitUnsafeDependenceRemark();
}

void LoopAccessInfo::emitUnsafeDependenceRemark() {
  auto Deps = getDepChecker().getDependences();
  if (!Deps)
    return;
  auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
    return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
           MemoryDepChecker::VectorizationSafetyStatus::Safe;
  });
  if (Found == Deps->end())
    return;
  MemoryDepChecker::Dependence Dep = *Found;

  LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");

  // Emit remark for first unsafe dependence.
  bool HasForcedDistribution = false;
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
  if (Value) {
    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  const std::string Info =
      HasForcedDistribution
          ? "unsafe dependent memory operations in loop."
          : "unsafe dependent memory operations in loop. Use "
            "#pragma clang loop distribute(enable) to allow loop distribution "
            "to attempt to isolate the offending operations into a separate "
            "loop";
  OptimizationRemarkAnalysis &R =
      recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;

  switch (Dep.Type) {
  case MemoryDepChecker::Dependence::NoDep:
  case MemoryDepChecker::Dependence::Forward:
  case MemoryDepChecker::Dependence::BackwardVectorizable:
    llvm_unreachable("Unexpected dependence");
  case MemoryDepChecker::Dependence::Backward:
    R << "\nBackward loop carried data dependence.";
    break;
  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
    R << "\nForward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
    R << "\nBackward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::IndirectUnsafe:
    R << "\nUnsafe indirect dependence.";
    break;
  case MemoryDepChecker::Dependence::Unknown:
    R << "\nUnknown data dependence.";
    break;
  }

  if (Instruction *I = Dep.getSource(*this)) {
    DebugLoc SourceLoc = I->getDebugLoc();
    if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
      SourceLoc = DD->getDebugLoc();
    if (SourceLoc)
      R << " Memory location is the same as accessed at "
        << ore::NV("Location", SourceLoc);
  }
}

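// A block inside the loop that does not dominate the latch executes only
// conditionally. For example (illustrative):
//   for (i = 0; i < n; ++i)
//     if (c[i])
//       a[i] = 0; // This store's block needs predication.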
bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                           DominatorTree *DT) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
                                                           Instruction *I) {
  assert(!Report && "Multiple reports generated");

  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName,
                                                        DL, CodeRegion);
  return *Report;
}

bool LoopAccessInfo::isInvariant(Value *V) const {
  auto *SE = PSE->getSE();
  // TODO: Is this really what we want? Even without FP SCEV, we may want some
  // trivially loop-invariant FP values to be considered invariant.
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);
  return SE->isLoopInvariant(S, TheLoop);
}

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
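/// For example (illustrative), with %S = type { i32 } and a GEP
///   getelementptr %S, ptr %p, i64 %i, i32 0
/// the trailing zero index can be peeled off because %S and the GEP's result
/// element type have the same allocation size, so operand 1 (%i) is the
/// induction operand to check.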
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
  const DataLayout &DL = Gep->getModule()->getDataLayout();
  unsigned LastOperand = Gep->getNumOperands() - 1;
  TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());

  // Walk backwards and try to peel off zeros.
  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
    // Find the type we're currently indexing into.
    gep_type_iterator GEPTI = gep_type_begin(Gep);
    std::advance(GEPTI, LastOperand - 2);

    // If it's a type with the same allocation size as the result of the GEP we
    // can peel off the zero index.
    TypeSize ElemSize = GEPTI.isStruct()
                            ? DL.getTypeAllocSize(GEPTI.getIndexedType())
                            : GEPTI.getSequentialElementStride(DL);
    if (ElemSize != GEPAllocSize)
      break;
    --LastOperand;
  }

  return LastOperand;
}

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
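/// For example (illustrative), for "getelementptr i32, ptr %a, i64 %i" with a
/// loop-invariant base %a and loop-variant index %i, this returns %i; if any
/// index other than the induction operand were loop-variant, the original
/// pointer would be returned instead.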
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return Ptr;

  unsigned InductionOperand = getGEPInductionOperand(GEP);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
      return Ptr;
  return GEP->getOperand(InductionOperand);
}

/// If a value has only one user that is a CastInst, return it.
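/// For example (illustrative), if "%e = sext i32 %stride to i64" is the only
/// cast of %stride to the requested type, %e is returned; if there were two
/// such casts, nullptr would be returned.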
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
  Value *UniqueCast = nullptr;
  for (User *U : Ptr->users()) {
    CastInst *CI = dyn_cast<CastInst>(U);
    if (CI && CI->getType() == Ty) {
      if (!UniqueCast)
        UniqueCast = CI;
      else
        return nullptr;
    }
  }
  return UniqueCast;
}

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
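/// For example (illustrative), in
///   for (i = 0; i < n; ++i)
///     a[i * stride] = x;
/// the access to a[] has the loop-invariant symbolic stride "stride", which a
/// caller may then speculate to be one under a runtime check.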
static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
                                        Loop *Lp) {
  auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy || PtrTy->isAggregateType())
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually the index
  // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
  // analyzing the pointer, otherwise we are analyzing the index.
  Value *OrigPtr = Ptr;

  // The size of the pointer access.
  int64_t PtrAccessSize = 1;

  Ptr = stripGetElementPtr(Ptr, SE, Lp);
  const SCEV *V = SE->getSCEV(Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
      V = C->getOperand();

  const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
  if (!S)
    return nullptr;

  // If the pointer is invariant then there is no stride and it makes no
  // sense to add it here.
  if (Lp != S->getLoop())
    return nullptr;

  V = S->getStepRecurrence(*SE);
  if (!V)
    return nullptr;

  // Strip off the size of access multiplication if we are still analyzing the
  // pointer.
  if (OrigPtr == Ptr) {
    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
      if (M->getOperand(0)->getSCEVType() != scConstant)
        return nullptr;

      const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();

      // Huge step value - give up.
      if (APStepVal.getBitWidth() > 64)
        return nullptr;

      int64_t StepVal = APStepVal.getSExtValue();
      if (PtrAccessSize != StepVal)
        return nullptr;
      V = M->getOperand(1);
    }
  }

  // Note that the restrictions after this loop-invariant check are only
  // profitability restrictions.
  if (!SE->isLoopInvariant(V, Lp))
    return nullptr;

  // Look for the loop invariant symbolic value.
  const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
  if (!U) {
    const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
    if (!C)
      return nullptr;
    U = dyn_cast<SCEVUnknown>(C->getOperand());
    if (!U)
      return nullptr;

    // Match legacy behavior - this is not needed for correctness.
    if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
      return nullptr;
  }

  return V;
}

void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(MemAccess);
  if (!Ptr)
    return;

  // Note: getStrideFromPointer is a *profitability* heuristic. We
  // could broaden the scope of values returned here - to anything
  // which happens to be loop invariant and contributes to the
  // computation of an interesting IV - but we chose not to as we
  // don't have a cost model here, and broadening the scope exposes
  // far too many unprofitable cases.
  const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  if (!StrideExpr)
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");

  if (!SpeculateUnitStride) {
    LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
    return;
  }

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense for the
  // users of this analysis (such as the vectorizer) to trigger it, based on
  // their specific cost considerations; for example, in cases where stride
  // versioning does not help resolve memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

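  // For example (illustrative): with a backedge-taken count of 3 (trip count
  // 4) and a stride known to be at least 4, the Stride == 1 assumption would
  // leave a loop that executes at most once, so versioning is not worthwhile.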
  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();

  // Match the types so we can compare the stride and the BETakenCount.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = BETakenCount;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
  // Since TripCount == BackEdgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - BETakenCount > 0
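  // E.g. (illustrative): Stride == 5 and BETakenCount == 3 give TripCount == 4
  // and StrideMinusBETaken == 2 > 0, i.e. Stride >= TripCount.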
  if (SE->isKnownPositive(StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  // Strip back off the integer cast, and check that our result is a
  // SCEVUnknown as we expect.
  const SCEV *StrideBase = StrideExpr;
  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
    StrideBase = C->getOperand();
  SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
}

LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI)
    : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
      PtrRtChecking(nullptr),
      DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
  if (canAnalyzeLoop()) {
    analyzeLoop(AA, LI, TLI, DT);
  }
}

void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    OS.indent(Depth) << "Memory dependences are safe";
    const MemoryDepChecker &DC = getDepChecker();
    if (!DC.isSafeForAnyVectorWidth())
      OS << " with a maximum safe vector width of "
         << DC.getMaxSafeVectorWidthInBits() << " bits";
    if (PtrRtChecking->Need)
      OS << " with run-time checks";
    OS << "\n";
  }

  if (HasConvergentOp)
    OS.indent(Depth) << "Has convergent operation in loop\n";

  if (Report)
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

  if (auto *Dependences = DepChecker->getDependences()) {
    OS.indent(Depth) << "Dependences:\n";
    for (const auto &Dep : *Dependences) {
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
      OS << "\n";
    }
  } else
    OS.indent(Depth) << "Too many dependences, not recorded\n";

  // List the pairs of accesses that need run-time checks to prove
  // independence.
  PtrRtChecking->print(OS, Depth);
  OS << "\n";

  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
                   << "found in loop.\n";

  OS.indent(Depth) << "SCEV assumptions:\n";
  PSE->getPredicate().print(OS, Depth);

  OS << "\n";

  OS.indent(Depth) << "Expressions re-written:\n";
  PSE->print(OS, Depth);
}

const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
  auto I = LoopAccessInfoMap.insert({&L, nullptr});

  if (I.second)
    I.first->second =
        std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);

  return *I.first->second;
}

bool LoopAccessInfoManager::invalidate(
    Function &F, const PreservedAnalyses &PA,
    FunctionAnalysisManager::Invalidator &Inv) {
  // Check whether our analysis is preserved.
  auto PAC = PA.getChecker<LoopAccessAnalysis>();
  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
    // If not, give up now.
    return true;

  // Check whether the analyses we depend on became invalid for any reason.
  // Skip checking TargetLibraryAnalysis as it is immutable and can't become
  // invalid.
  return Inv.invalidate<AAManager>(F, PA) ||
         Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
         Inv.invalidate<LoopAnalysis>(F, PA) ||
         Inv.invalidate<DominatorTreeAnalysis>(F, PA);
}

LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
                                              FunctionAnalysisManager &FAM) {
  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
  auto &AA = FAM.getResult<AAManager>(F);
  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  auto &LI = FAM.getResult<LoopAnalysis>(F);
  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
}

AnalysisKey LoopAccessAnalysis::Key;