//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation of the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

14 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
15 | #include "llvm/ADT/APInt.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/EquivalenceClasses.h" |
18 | #include "llvm/ADT/PointerIntPair.h" |
19 | #include "llvm/ADT/STLExtras.h" |
20 | #include "llvm/ADT/SetVector.h" |
21 | #include "llvm/ADT/SmallPtrSet.h" |
22 | #include "llvm/ADT/SmallSet.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/Analysis/AliasAnalysis.h" |
25 | #include "llvm/Analysis/AliasSetTracker.h" |
26 | #include "llvm/Analysis/LoopAnalysisManager.h" |
27 | #include "llvm/Analysis/LoopInfo.h" |
28 | #include "llvm/Analysis/LoopIterator.h" |
29 | #include "llvm/Analysis/MemoryLocation.h" |
30 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
33 | #include "llvm/Analysis/TargetLibraryInfo.h" |
34 | #include "llvm/Analysis/ValueTracking.h" |
35 | #include "llvm/Analysis/VectorUtils.h" |
36 | #include "llvm/IR/BasicBlock.h" |
37 | #include "llvm/IR/Constants.h" |
38 | #include "llvm/IR/DataLayout.h" |
39 | #include "llvm/IR/DebugLoc.h" |
40 | #include "llvm/IR/DerivedTypes.h" |
41 | #include "llvm/IR/DiagnosticInfo.h" |
42 | #include "llvm/IR/Dominators.h" |
43 | #include "llvm/IR/Function.h" |
44 | #include "llvm/IR/GetElementPtrTypeIterator.h" |
45 | #include "llvm/IR/InstrTypes.h" |
46 | #include "llvm/IR/Instruction.h" |
47 | #include "llvm/IR/Instructions.h" |
48 | #include "llvm/IR/Operator.h" |
49 | #include "llvm/IR/PassManager.h" |
50 | #include "llvm/IR/PatternMatch.h" |
51 | #include "llvm/IR/Type.h" |
52 | #include "llvm/IR/Value.h" |
53 | #include "llvm/IR/ValueHandle.h" |
54 | #include "llvm/Support/Casting.h" |
55 | #include "llvm/Support/CommandLine.h" |
56 | #include "llvm/Support/Debug.h" |
57 | #include "llvm/Support/ErrorHandling.h" |
58 | #include "llvm/Support/raw_ostream.h" |
59 | #include <algorithm> |
60 | #include <cassert> |
61 | #include <cstdint> |
62 | #include <iterator> |
63 | #include <utility> |
64 | #include <variant> |
65 | #include <vector> |
66 | |
67 | using namespace llvm; |
68 | using namespace llvm::PatternMatch; |
69 | |
70 | #define DEBUG_TYPE "loop-accesses" |
71 | |
static cl::opt<unsigned, true>
    VectorizationFactor("force-vector-width", cl::Hidden,
                        cl::desc("Sets the SIMD width. Zero is autoselect."),
                        cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
    VectorizationInterleave("force-vector-interleave", cl::Hidden,
                            cl::desc("Sets the vectorization interleave count. "
                                     "Zero is autoselect."),
                            cl::location(
                                VectorizerParams::VectorizationInterleave));
unsigned VectorizerParams::VectorizationInterleave;

static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
    "runtime-memory-check-threshold", cl::Hidden,
    cl::desc("When performing memory disambiguation checks at runtime do not "
             "generate more than this number of comparisons (default = 8)."),
    cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
unsigned VectorizerParams::RuntimeMemoryCheckThreshold;

/// The maximum number of iterations used to merge memory checks.
static cl::opt<unsigned> MemoryCheckMergeThreshold(
    "memory-check-merge-threshold", cl::Hidden,
    cl::desc("Maximum number of comparisons done when trying to merge "
             "runtime memory checks. (default = 100)"),
    cl::init(100));

/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;

/// We collect dependences up to this threshold.
static cl::opt<unsigned>
    MaxDependences("max-dependences", cl::Hidden,
                   cl::desc("Maximum number of dependences collected by "
                            "loop-access analysis (default = 100)"),
                   cl::init(100));

/// This enables versioning on the strides of symbolically striding memory
/// accesses in code like the following.
///   for (i = 0; i < N; ++i)
///     A[i * Stride1] += B[i * Stride2] ...
///
/// Will be roughly translated to
///   if (Stride1 == 1 && Stride2 == 1) {
///     for (i = 0; i < N; i+=4)
///       A[i:i+3] += ...
///   } else
///     ...
static cl::opt<bool> EnableMemAccessVersioning(
    "enable-mem-access-versioning", cl::init(true), cl::Hidden,
    cl::desc("Enable symbolic stride memory access versioning"));

/// Enable store-to-load forwarding conflict detection. This option can
/// be disabled for correctness testing.
static cl::opt<bool> EnableForwardingConflictDetection(
    "store-to-load-forwarding-conflict-detection", cl::Hidden,
    cl::desc("Enable conflict detection in loop-access analysis"),
    cl::init(true));

static cl::opt<unsigned> MaxForkedSCEVDepth(
    "max-forked-scev-depth", cl::Hidden,
    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
    cl::init(5));

static cl::opt<bool> SpeculateUnitStride(
    "laa-speculate-unit-stride", cl::Hidden,
    cl::desc("Speculate that non-constant strides are unit in LAA"),
    cl::init(true));

static cl::opt<bool, true> HoistRuntimeChecks(
    "hoist-runtime-checks", cl::Hidden,
    cl::desc(
        "Hoist inner loop runtime memory checks to outer loop if possible"),
    cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true));
bool VectorizerParams::HoistRuntimeChecks;

bool VectorizerParams::isInterleaveForced() {
  return ::VectorizationInterleave.getNumOccurrences() > 0;
}

const SCEV *llvm::replaceSymbolicStrideSCEV(
    PredicatedScalarEvolution &PSE,
    const DenseMap<Value *, const SCEV *> &PtrToStride, Value *Ptr) {
  const SCEV *OrigSCEV = PSE.getSCEV(Ptr);

  // If there is an entry in the map return the SCEV of the pointer with the
  // symbolic stride replaced by one.
  DenseMap<Value *, const SCEV *>::const_iterator SI = PtrToStride.find(Ptr);
  if (SI == PtrToStride.end())
    // For a non-symbolic stride, just return the original expression.
    return OrigSCEV;

  const SCEV *StrideSCEV = SI->second;
  // Note: This assert is both overly strong and overly weak. The actual
  // invariant here is that StrideSCEV should be loop invariant. The only
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

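  // Rewrite Ptr's SCEV under the assumption that the stride is one: add the
  // predicate Stride == 1 to PSE so that subsequent queries through PSE see
  // the simplified expression.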
  ScalarEvolution *SE = PSE.getSE();
  const auto *CT = SE->getOne(StrideSCEV->getType());
  PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
  auto *Expr = PSE.getSCEV(Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}

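/// Seed a new checking group with the bounds, address space and freeze
/// requirement of the pointer at \p Index.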
RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Index);
}

/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on the N-th
/// loop iteration. Then B is calculated as:
///   B = A + Step*N .
/// Step value may be positive or negative.
/// N is a calculated back-edge taken count:
///   N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
/// Start and End points are calculated in the following way:
///   Start = UMIN(A, B); End = UMAX(A, B) + SizeOfElt,
/// where SizeOfElt is the size of a single memory access in bytes.
///
/// There is no conflict when the intervals are disjoint:
///   NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
  ScalarEvolution *SE = PSE.getSE();

  const SCEV *ScStart;
  const SCEV *ScEnd;

  if (SE->isLoopInvariant(PtrExpr, Lp)) {
    ScStart = ScEnd = PtrExpr;
  } else {
    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
    assert(AR && "Invalid addrec expression");
    const SCEV *Ex = PSE.getBackedgeTakenCount();

    ScStart = AR->getStart();
    ScEnd = AR->evaluateAtIteration(Ex, *SE);
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(ScStart, ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(ScStart, ScEnd);
      ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
    }
  }
  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
  auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  Type *IdxTy = DL.getIndexType(Ptr->getType());
  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

  Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
                        NeedsFreeze);
}

void RuntimePointerChecking::tryToCreateDiffCheck(
    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  if (!CanUseDiffCheck)
    return;

  // If either group contains multiple different pointers, bail out.
  // TODO: Support multiple pointers by using the minimum or maximum pointer,
  // depending on src & sink.
  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1) {
    CanUseDiffCheck = false;
    return;
  }

  PointerInfo *Src = &Pointers[CGI.Members[0]];
  PointerInfo *Sink = &Pointers[CGJ.Members[0]];

  // If either pointer is read and written, multiple checks may be needed. Bail
  // out.
  if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
      !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty()) {
    CanUseDiffCheck = false;
    return;
  }

  ArrayRef<unsigned> AccSrc =
      DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
  ArrayRef<unsigned> AccSink =
      DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
  // If either pointer is accessed multiple times, there may not be a clear
  // src/sink relation. Bail out for now.
  if (AccSrc.size() != 1 || AccSink.size() != 1) {
    CanUseDiffCheck = false;
    return;
  }
  // If the sink is accessed before src, swap src/sink.
  if (AccSink[0] < AccSrc[0])
    std::swap(Src, Sink);

  auto *SrcAR = dyn_cast<SCEVAddRecExpr>(Src->Expr);
  auto *SinkAR = dyn_cast<SCEVAddRecExpr>(Sink->Expr);
  if (!SrcAR || !SinkAR || SrcAR->getLoop() != DC.getInnermostLoop() ||
      SinkAR->getLoop() != DC.getInnermostLoop()) {
    CanUseDiffCheck = false;
    return;
  }

  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
  Type *SrcTy = getLoadStoreType(SrcInsts[0]);
  Type *DstTy = getLoadStoreType(SinkInsts[0]);
  if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
    CanUseDiffCheck = false;
    return;
  }
  const DataLayout &DL =
      SinkAR->getLoop()->getHeader()->getModule()->getDataLayout();
  unsigned AllocSize =
      std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));

  // Only constant steps matching the AllocSize are supported at the moment.
  // This simplifies the difference computation. Can be extended in the
  // future.
  auto *Step = dyn_cast<SCEVConstant>(SinkAR->getStepRecurrence(*SE));
  if (!Step || Step != SrcAR->getStepRecurrence(*SE) ||
      Step->getAPInt().abs() != AllocSize) {
    CanUseDiffCheck = false;
    return;
  }

  IntegerType *IntTy =
      IntegerType::get(Src->PointerValue->getContext(),
                       DL.getPointerSizeInBits(CGI.AddressSpace));

  // When counting down, the dependence distance needs to be swapped.
  if (Step->getValue()->isNegative())
    std::swap(SinkAR, SrcAR);

  const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkAR->getStart(), IntTy);
  const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcAR->getStart(), IntTy);
  if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
      isa<SCEVCouldNotCompute>(SrcStartInt)) {
    CanUseDiffCheck = false;
    return;
  }

  const Loop *InnerLoop = SrcAR->getLoop();
  // If the start values for both Src and Sink also vary according to an outer
  // loop, then it's probably better to avoid creating diff checks because
  // they may not be hoisted. We should instead let llvm::addRuntimeChecks
  // do the expanded full range overlap checks, which can be hoisted.
  if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
      isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
    auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
    auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
    const Loop *StartARLoop = SrcStartAR->getLoop();
    if (StartARLoop == SinkStartAR->getLoop() &&
        StartARLoop == InnerLoop->getParentLoop() &&
        // If the diff check would already be loop invariant (due to the
        // recurrences being the same), then we prefer to keep the diff checks
        // because they are cheaper.
        SrcStartAR->getStepRecurrence(*SE) !=
            SinkStartAR->getStepRecurrence(*SE)) {
      LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                           "cannot be hoisted out of the outer loop\n");
      CanUseDiffCheck = false;
      return;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                    << "SrcStart: " << *SrcStartInt << '\n'
                    << "SinkStartInt: " << *SinkStartInt << '\n');
  DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
                          Src->NeedsFreeze || Sink->NeedsFreeze);
}

SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  SmallVector<RuntimePointerCheck, 4> Checks;

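  // Pairwise-compare all checking groups; emit a check (and, if possible, a
  // cheaper pointer-difference check) for every pair that may conflict.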
  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ)) {
        tryToCreateDiffCheck(CGI, CGJ);
        Checks.push_back(std::make_pair(&CGI, &CGJ));
      }
    }
  }
  return Checks;
}

void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands, UseDependencies);
  Checks = generateChecks();
}

bool RuntimePointerChecking::needsChecking(
    const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
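  // Two groups conflict if any pair of their members does.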
  for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
    for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
      if (needsChecking(M.Members[I], N.Members[J]))
        return true;
  return false;
}

/// Compare \p I and \p J and return the minimum.
/// Return nullptr in case we couldn't find an answer.
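/// For example, comparing {0,+,4} and {16,+,4} yields the constant difference
/// 16, which is non-negative, so the first expression is the minimum.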
static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
                                   ScalarEvolution *SE) {
  const SCEV *Diff = SE->getMinusSCEV(J, I);
  const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);

  if (!C)
    return nullptr;
  if (C->getValue()->isNegative())
    return J;
  return I;
}

bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
                                         RuntimePointerChecking &RtCheck) {
  return addPointer(
      Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
      RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
      RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
}

bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
                                         const SCEV *End, unsigned AS,
                                         bool NeedsFreeze,
                                         ScalarEvolution &SE) {
  assert(AddressSpace == AS &&
         "all pointers in a checking group must be in the same address space");

  // Compare the starts and ends with the known minimum and maximum
  // of this set. We need to know how we compare against the min/max
  // of the set in order to be able to emit memchecks.
  const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
  if (!Min0)
    return false;

  const SCEV *Min1 = getMinFromExprs(End, High, &SE);
  if (!Min1)
    return false;

  // Update the low bound expression if we've found a new min value.
  if (Min0 == Start)
    Low = Start;

  // Update the high bound expression if we've found a new max value.
  if (Min1 != End)
    High = End;

  Members.push_back(Index);
  this->NeedsFreeze |= NeedsFreeze;
  return true;
}

void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  // We build the groups from dependency candidates equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // The above case requires that we have an UnknownDependence between
  // accesses to the same underlying object. This cannot happen unless
  // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  // is also false. In this case we will use the fallback path and create
  // separate checking groups for all pointers.

  // If we don't have the dependency partitions, construct a new
  // checking pointer group for each pointer. This is also required
  // for correctness, because in this case we can have checking between
  // pointers to the same underlying object.
  if (!UseDependencies) {
    for (unsigned I = 0; I < Pointers.size(); ++I)
      CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
    return;
  }

  unsigned TotalComparisons = 0;

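  // Map each pointer value to the indices of all RT-check entries that use
  // it; the same value can appear more than once, e.g. once as a read and
  // once as a write.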
  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
    auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
    Iter.first->second.push_back(Index);
  }

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.count(I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
    auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
         MI != ME; ++MI) {
      auto PointerI = PositionMap.find(MI->getPointer());
      assert(PointerI != PositionMap.end() &&
             "pointer in equivalence class not found in PositionMap");
      for (unsigned Pointer : PointerI->second) {
        bool Merged = false;
        // Mark this pointer as seen.
        Seen.insert(Pointer);

        // Go through all the existing sets and see if we can find one
        // which can include this pointer.
        for (RuntimeCheckingPtrGroup &Group : Groups) {
          // Don't perform more than a certain amount of comparisons.
          // This should limit the cost of grouping the pointers to something
          // reasonable. If we do end up hitting this threshold, the algorithm
          // will create separate groups for all remaining pointers.
          if (TotalComparisons > MemoryCheckMergeThreshold)
            break;

          TotalComparisons++;

          if (Group.addPointer(Pointer, *this)) {
            Merged = true;
            break;
          }
        }

        if (!Merged)
          // We couldn't add this pointer to any existing set or the threshold
          // for the number of comparisons has been reached. Create a new group
          // to hold the current pointer.
          Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
      }
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::copy(Groups, std::back_inserter(CheckingGroups));
  }
}

bool RuntimePointerChecking::arePointersInSamePartition(
    const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
    unsigned PtrIdx2) {
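  // A partition index of -1 marks a pointer that is used in multiple
  // partitions; such a pointer never shares a partition with anything.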
  return (PtrToPartition[PtrIdx1] != -1 &&
          PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
}

bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  const PointerInfo &PointerI = Pointers[I];
  const PointerInfo &PointerJ = Pointers[J];

  // No need to check if two readonly pointers intersect.
  if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
    return false;

  // Only need to check pointers between two different dependency sets.
  if (PointerI.DependencySetId == PointerJ.DependencySetId)
    return false;

  // Only need to check pointers in the same alias set.
  if (PointerI.AliasSetId != PointerJ.AliasSetId)
    return false;

  return true;
}

void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
    unsigned Depth) const {
  unsigned N = 0;
  for (const auto &Check : Checks) {
    const auto &First = Check.first->Members, &Second = Check.second->Members;

    OS.indent(Depth) << "Check " << N++ << ":\n";

    OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
    for (unsigned K = 0; K < First.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";

    OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
    for (unsigned K = 0; K < Second.size(); ++K)
      OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  }
}

void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  OS.indent(Depth) << "Run-time memory checks:\n";
  printChecks(OS, Checks, Depth);

  OS.indent(Depth) << "Grouped accesses:\n";
  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    const auto &CG = CheckingGroups[I];

    OS.indent(Depth + 2) << "Group " << &CG << ":\n";
    OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
                         << ")\n";
    for (unsigned J = 0; J < CG.Members.size(); ++J) {
      OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
                           << "\n";
    }
  }
}

namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
                 MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
        LoopAliasScopes(LoopAliasScopes) {
    // We're analyzing dependences across loop iterations.
    BAA.enableCrossIterationMode();
  }

  /// Register a load and whether it is only read from.
  void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(MemoryLocation &Loc, Type *AccessTy) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool ShouldCheckStride, bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds).
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
                       Loop *TheLoop,
                       const DenseMap<Value *, const SCEV *> &Strides,
                       Value *&UncomputablePtr, bool ShouldCheckWrap = false);

  /// Goes over all memory accesses, checks whether a RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() {
    processMemAccesses();
  }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used. Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }

  const DenseMap<Value *, SmallVector<const Value *, 16>> &
  getUnderlyingObjects() {
    return UnderlyingObjects;
  }

private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

  /// Adjust the MemoryLocation so that it represents accesses to this
  /// location across all iterations, rather than a single one.
  MemoryLocation adjustLoc(MemoryLocation Loc) const {
    // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
    Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
    return Loc;
  }

  /// Drop alias scopes that are only valid within a single loop iteration.
  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
    if (!ScopeList)
      return nullptr;

    // For the sake of simplicity, drop the whole scope list if any scope is
    // iteration-local.
    if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
          return LoopAliasScopes.contains(cast<MDNode>(Scope));
        }))
      return nullptr;

    return ScopeList;
  }

  /// Go over all memory accesses and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Map of all accesses. Values are the types used to access memory pointed
  /// to by the pointer.
  PtrAccessMap Accesses;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value *, 16> ReadOnlyPtr;

  /// Batched alias analysis results.
  BatchAAResults BAA;

  /// An alias set tracker to partition the access set by underlying object
  /// and intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  LoopInfo *LI;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks. Perform the analysis to determine the necessary checks.
  ///
  /// Note that this is different from isDependencyCheckNeeded. When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded = false;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

  /// Alias scopes that are declared inside the loop, and as such not valid
  /// across iterations.
  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
};

} // end anonymous namespace


/// Check whether a pointer can participate in a runtime bounds check.
/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
                                const SCEV *PtrScev, Loop *L, bool Assume) {
  // The bounds for a loop-invariant pointer are trivial.
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);

  if (!AR && Assume)
    AR = PSE.getAsAddRec(Ptr);

  if (!AR)
    return false;

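  // Only affine add-recs (a start value plus a loop-invariant step) describe
  // intervals whose start and end bounds we can compute.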
  return AR->isAffine();
}

/// Check whether a pointer address cannot wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE,
                     const DenseMap<Value *, const SCEV *> &Strides, Value *Ptr,
                     Type *AccessTy, Loop *L) {
  const SCEV *PtrScev = PSE.getSCEV(Ptr);
  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
    return true;

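  // Treat unit-stride pointers, and pointers with a proven non-wrapping
  // (NUSW) increment, as not wrapping across the loop.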
  int64_t Stride = getPtrStride(PSE, AccessTy, Ptr, L, Strides).value_or(0);
  if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;

  return false;
}

static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
                          function_ref<void(Value *)> AddPointer) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *> WorkList;
  WorkList.push_back(StartPtr);

  while (!WorkList.empty()) {
    Value *Ptr = WorkList.pop_back_val();
    if (!Visited.insert(Ptr).second)
      continue;
    auto *PN = dyn_cast<PHINode>(Ptr);
    // SCEV does not look through non-header PHIs inside the loop. Such phis
    // can be analyzed by adding separate accesses for each incoming pointer
    // value.
    if (PN && InnermostLoop.contains(PN->getParent()) &&
        PN->getParent() != InnermostLoop.getHeader()) {
      for (const Use &Inc : PN->incoming_values())
        WorkList.push_back(Inc);
    } else
      AddPointer(Ptr);
  }
}

// Walk back through the IR for a pointer, looking for a select like the
// following:
//
//  %offset = select i1 %cmp, i64 %a, i64 %b
//  %addr = getelementptr double, double* %base, i64 %offset
//  %ld = load double, double* %addr, align 8
//
// We won't be able to form a single SCEVAddRecExpr from this since the
// address for each loop iteration depends on %cmp. We could potentially
// produce multiple valid SCEVAddRecExprs, though, and check all of them for
// memory safety/aliasing if needed.
//
// If we encounter some IR we don't yet handle, or something obviously fine
// like a constant, then we just add the SCEV for that term to the list passed
// in by the caller. If we have a node that may potentially yield a valid
// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
// ourselves before adding to the list.
static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  // we've exceeded our limit on recursion, just return whatever we have
  // regardless of whether it can be used for a forked pointer or not, along
  // with an indication of whether it might be a poison or undef value.
  const SCEV *Scev = SE->getSCEV(Ptr);
  if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
      !isa<Instruction>(Ptr) || Depth == 0) {
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    return;
  }

  Depth--;

  auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
    return get<1>(S);
  };

  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
    switch (Opcode) {
    case Instruction::Add:
      return SE->getAddExpr(L, R);
    case Instruction::Sub:
      return SE->getMinusSCEV(L, R);
    default:
      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
    }
  };

  Instruction *I = cast<Instruction>(Ptr);
  unsigned Opcode = I->getOpcode();
  switch (Opcode) {
  case Instruction::GetElementPtr: {
    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
    Type *SourceTy = GEP->getSourceElementType();
    // We only handle base + single offset GEPs here for now.
    // Not dealing with preexisting gathers yet, so no vectors.
    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
      break;
    }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
                       any_of(OffsetScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the base or the offset.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the GEP.
    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
      BaseScevs.push_back(BaseScevs[0]);
    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
      OffsetScevs.push_back(OffsetScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    // Find the pointer type we need to extend to.
    Type *IntPtrTy = SE->getEffectiveSCEVType(
        SE->getSCEV(GEP->getPointerOperand())->getType());

    // Find the size of the type being pointed to. We only have a single
    // index term (guarded above) so we don't need to index into arrays or
    // structures, just get the size of the scalar value.
    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);

    // Scale up the offsets by the size of the type, then add to the bases.
    const SCEV *Scaled1 = SE->getMulExpr(
        Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[0]), IntPtrTy));
    const SCEV *Scaled2 = SE->getMulExpr(
        Size, SE->getTruncateOrSignExtend(get<0>(OffsetScevs[1]), IntPtrTy));
    ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[0]), Scaled1),
                          NeedsFreeze);
    ScevList.emplace_back(SE->getAddExpr(get<0>(BaseScevs[1]), Scaled2),
                          NeedsFreeze);
    break;
  }
  case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
    // support a single select per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
    if (ChildScevs.size() == 2) {
      ScevList.push_back(ChildScevs[0]);
      ScevList.push_back(ChildScevs[1]);
    } else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
    // support a single phi per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    if (I->getNumOperands() == 2) {
      findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
      findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    }
    if (ChildScevs.size() == 2) {
      ScevList.push_back(ChildScevs[0]);
      ScevList.push_back(ChildScevs[1]);
    } else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::Add:
  case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze =
        any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the left or right side.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the BinOp.
    if (LScevs.size() == 2 && RScevs.size() == 1)
      RScevs.push_back(RScevs[0]);
    else if (RScevs.size() == 2 && LScevs.size() == 1)
      LScevs.push_back(LScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    ScevList.emplace_back(
        GetBinOpExpr(Opcode, get<0>(LScevs[0]), get<0>(RScevs[0])),
        NeedsFreeze);
    ScevList.emplace_back(
        GetBinOpExpr(Opcode, get<0>(LScevs[1]), get<0>(RScevs[1])),
        NeedsFreeze);
    break;
  }
  default:
    // Just return the current SCEV if we haven't handled the instruction yet.
    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
}

static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
findForkedPointer(PredicatedScalarEvolution &PSE,
                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
                  const Loop *L) {
  ScalarEvolution *SE = PSE.getSE();
  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);

  // For now, we will only accept a forked pointer with two possible SCEVs
  // that are either SCEVAddRecExprs or loop invariant.
  if (Scevs.size() == 2 &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
       SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
       SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
    LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
    LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
    return Scevs;
  }

  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
}

bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                          MemAccessInfo Access, Type *AccessTy,
                                          const DenseMap<Value *, const SCEV *> &StridesMap,
                                          DenseMap<Value *, unsigned> &DepSetId,
                                          Loop *TheLoop, unsigned &RunningDepId,
                                          unsigned ASId, bool ShouldCheckWrap,
                                          bool Assume) {
  Value *Ptr = Access.getPointer();

  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);

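  // First pass: make sure every candidate SCEV has computable bounds and, if
  // required, does not wrap.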
  for (auto &P : TranslatedPtrs) {
    const SCEV *PtrExpr = get<0>(P);
    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
      return false;

    // When we run after a failing dependency check we have to make sure
    // we don't have wrapping pointers.
    if (ShouldCheckWrap) {
      // Skip wrap checking when translating pointers.
      if (TranslatedPtrs.size() > 1)
        return false;

      if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
        auto *Expr = PSE.getSCEV(Ptr);
        if (!Assume || !isa<SCEVAddRecExpr>(Expr))
          return false;
        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
      }
    }
    // If there's only one option for Ptr, look it up after bounds and wrap
    // checking, because assumptions might have been added to PSE.
    if (TranslatedPtrs.size() == 1)
      TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
                           false};
  }

  for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
    // The id of the dependence set.
    unsigned DepId;

    if (isDependencyCheckNeeded()) {
      Value *Leader = DepCands.getLeaderValue(Access).getPointer();
      unsigned &LeaderId = DepSetId[Leader];
      if (!LeaderId)
        LeaderId = RunningDepId++;
      DepId = LeaderId;
    } else
      // Each access has its own dependence set.
      DepId = RunningDepId++;

    bool IsWrite = Access.getInt();
    RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
                   NeedsFreeze);
    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  }

  return true;
}

bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
                                     ScalarEvolution *SE, Loop *TheLoop,
                                     const DenseMap<Value *, const SCEV *> &StridesMap,
                                     Value *&UncomputablePtr, bool ShouldCheckWrap) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool MayNeedRTCheck = false;
  if (!IsRTCheckAnalysisNeeded) return true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 0;
  for (auto &AS : AST) {
    int NumReadPtrChecks = 0;
    int NumWritePtrChecks = 0;
    bool CanDoAliasSetRT = true;
    ++ASId;
    auto ASPointers = AS.getPointers();

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;

    // First, count how many write and read accesses are in the alias set. Also
    // collect MemAccessInfos for later.
    SmallVector<MemAccessInfo, 4> AccessInfos;
    for (const Value *Ptr_ : ASPointers) {
      Value *Ptr = const_cast<Value *>(Ptr_);
      bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;
      AccessInfos.emplace_back(Ptr, IsWrite);
    }

    // We do not need runtime checks for this alias set, if there are no writes
    // or a single write and no reads.
    if (NumWritePtrChecks == 0 ||
        (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
      assert((ASPointers.size() <= 1 ||
              all_of(ASPointers,
                     [this](const Value *Ptr) {
                       MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
                                                 true);
                       return DepCands.findValue(AccessWrite) ==
                              DepCands.end();
                     })) &&
             "Can only skip updating CanDoRT below, if all entries in AS "
             "are reads or there is at most 1 entry");
      continue;
    }

    for (auto &Access : AccessInfos) {
      for (const auto &AccessTy : Accesses[Access]) {
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  ShouldCheckWrap, /*Assume=*/false)) {
          LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                            << *Access.getPointer() << '\n');
          Retries.push_back({Access, AccessTy});
          CanDoAliasSetRT = false;
        }
      }
    }

    // Note that this function computes CanDoRT and MayNeedRTCheck
    // independently. For example CanDoRT=false, MayNeedRTCheck=false means
    // that we have a pointer for which we couldn't find the bounds but we
    // don't actually need to emit any checks so it does not matter.
    //
    // We need runtime checks for this alias set, if there are at least 2
    // dependence sets (in which case RunningDepId > 2) or if we need to re-try
    // any bound checks (because in that case the number of dependence sets is
    // incomplete).
    bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();

    // We need to perform run-time alias checks, but some pointers had bounds
    // that couldn't be checked.
    if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
      // Reset the CanDoAliasSetRT flag and retry all accesses that have
      // failed. We know that we need these checks, so we can now be more
      // aggressive and add further checks if required (overflow checks).
      CanDoAliasSetRT = true;
      for (auto Retry : Retries) {
        MemAccessInfo Access = Retry.first;
        Type *AccessTy = Retry.second;
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  ShouldCheckWrap, /*Assume=*/true)) {
          CanDoAliasSetRT = false;
          UncomputablePtr = Access.getPointer();
          break;
        }
      }
    }

    CanDoRT &= CanDoAliasSetRT;
    MayNeedRTCheck |= NeedsAliasSetRTCheck;
    ++ASId;
  }

1236 | |
1237 | // If the pointers that we would use for the bounds comparison have different |
1238 | // address spaces, assume the values aren't directly comparable, so we can't |
1239 | // use them for the runtime check. We also have to assume they could |
1240 | // overlap. In the future there should be metadata for whether address spaces |
1241 | // are disjoint. |
1242 | unsigned NumPointers = RtCheck.Pointers.size(); |
1243 | for (unsigned i = 0; i < NumPointers; ++i) { |
1244 | for (unsigned j = i + 1; j < NumPointers; ++j) { |
1245 | // Only need to check pointers between two different dependency sets. |
1246 | if (RtCheck.Pointers[i].DependencySetId == |
1247 | RtCheck.Pointers[j].DependencySetId) |
1248 | continue; |
1249 | // Only need to check pointers in the same alias set. |
1250 | if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId) |
1251 | continue; |
1252 | |
1253 | Value *PtrI = RtCheck.Pointers[i].PointerValue; |
1254 | Value *PtrJ = RtCheck.Pointers[j].PointerValue; |
1255 | |
1256 | unsigned ASi = PtrI->getType()->getPointerAddressSpace(); |
1257 | unsigned ASj = PtrJ->getType()->getPointerAddressSpace(); |
1258 | if (ASi != ASj) { |
1259 | LLVM_DEBUG( |
1260 | dbgs() << "LAA: Runtime check would require comparison between" |
1261 | " different address spaces\n" ); |
1262 | return false; |
1263 | } |
1264 | } |
1265 | } |
1266 | |
1267 | if (MayNeedRTCheck && CanDoRT) |
1268 | RtCheck.generateChecks(DepCands, UseDependencies: IsDepCheckNeeded); |
1269 | |
1270 | LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks() |
1271 | << " pointer comparisons.\n" ); |
1272 | |
1273 | // If we can do run-time checks, but there are no checks, no runtime checks |
1274 | // are needed. This can happen when all pointers point to the same underlying |
1275 | // object for example. |
1276 | RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck; |
1277 | |
1278 | bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT; |
1279 | if (!CanDoRTIfNeeded) |
1280 | RtCheck.reset(); |
1281 | return CanDoRTIfNeeded; |
1282 | } |
1283 | |
1284 | void AccessAnalysis::processMemAccesses() { |
1285 | // We process the set twice: first we process read-write pointers, last we |
1286 | // process read-only pointers. This allows us to skip dependence tests for |
1287 | // read-only pointers. |
1288 | |
1289 | LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n" ); |
1290 | LLVM_DEBUG(dbgs() << " AST: " ; AST.dump()); |
1291 | LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n" ); |
1292 | LLVM_DEBUG({ |
1293 | for (auto A : Accesses) |
1294 | dbgs() << "\t" << *A.first.getPointer() << " (" |
1295 | << (A.first.getInt() |
1296 | ? "write" |
1297 | : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only" |
1298 | : "read" )) |
1299 | << ")\n" ; |
1300 | }); |
1301 | |
1302 | // The AliasSetTracker has nicely partitioned our pointers by metadata |
1303 | // compatibility and potential for underlying-object overlap. As a result, we |
1304 | // only need to check for potential pointer dependencies within each alias |
1305 | // set. |
1306 | for (const auto &AS : AST) { |
// Note that both the alias-set tracker and the alias sets themselves use
// ordered collections internally, so the iteration order here is
// deterministic.
1310 | auto ASPointers = AS.getPointers(); |
1311 | |
1312 | bool SetHasWrite = false; |
1313 | |
// Map of underlying objects to the last access encountered.
1315 | typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap; |
1316 | UnderlyingObjToAccessMap ObjToLastAccess; |
1317 | |
// Set of accesses to check after all writes have been processed.
1319 | PtrAccessMap DeferredAccesses; |
1320 | |
1321 | // Iterate over each alias set twice, once to process read/write pointers, |
1322 | // and then to process read-only pointers. |
1323 | for (int SetIteration = 0; SetIteration < 2; ++SetIteration) { |
1324 | bool UseDeferred = SetIteration > 0; |
1325 | PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses; |
1326 | |
1327 | for (const Value *Ptr_ : ASPointers) { |
1328 | Value *Ptr = const_cast<Value *>(Ptr_); |
1329 | |
1330 | // For a single memory access in AliasSetTracker, Accesses may contain |
1331 | // both read and write, and they both need to be handled for CheckDeps. |
1332 | for (const auto &AC : S) { |
1333 | if (AC.first.getPointer() != Ptr) |
1334 | continue; |
1335 | |
1336 | bool IsWrite = AC.first.getInt(); |
1337 | |
1338 | // If we're using the deferred access set, then it contains only |
1339 | // reads. |
1340 | bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite; |
1341 | if (UseDeferred && !IsReadOnlyPtr) |
1342 | continue; |
1343 | // Otherwise, the pointer must be in the PtrAccessSet, either as a |
1344 | // read or a write. |
1345 | assert(((IsReadOnlyPtr && UseDeferred) || IsWrite || |
1346 | S.count(MemAccessInfo(Ptr, false))) && |
1347 | "Alias-set pointer not in the access set?" ); |
1348 | |
1349 | MemAccessInfo Access(Ptr, IsWrite); |
1350 | DepCands.insert(Data: Access); |
1351 | |
// Memorize read-only pointers for later processing and skip them in
// the first round (they need to be checked after we have seen all
// write pointers). Note: we also mark pointers that are not
// consecutive as "read-only" pointers (so that we check
// "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
1357 | if (!UseDeferred && IsReadOnlyPtr) { |
// We only use the pointer keys; the types in the vector values don't
// matter.
1360 | DeferredAccesses.insert(KV: {Access, {}}); |
1361 | continue; |
1362 | } |
1363 | |
// If this is a write, check other reads and writes for conflicts. If
// this is a read, only check other writes for conflicts (but only if
// there is no other write to the ptr; this is an optimization to
// catch "a[i] = a[i] + " without having to do a dependence check).
1368 | if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) { |
1369 | CheckDeps.push_back(Elt: Access); |
1370 | IsRTCheckAnalysisNeeded = true; |
1371 | } |
1372 | |
1373 | if (IsWrite) |
1374 | SetHasWrite = true; |
1375 | |
1376 | // Create sets of pointers connected by a shared alias set and |
1377 | // underlying object. |
1378 | typedef SmallVector<const Value *, 16> ValueVector; |
1379 | ValueVector TempObjects; |
1380 | |
1381 | UnderlyingObjects[Ptr] = {}; |
1382 | SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr]; |
1383 | ::getUnderlyingObjects(V: Ptr, Objects&: UOs, LI); |
1384 | LLVM_DEBUG(dbgs() |
1385 | << "Underlying objects for pointer " << *Ptr << "\n" ); |
1386 | for (const Value *UnderlyingObj : UOs) { |
// nullptr never aliases anything; don't join sets for pointers that
// have "null" in their UnderlyingObjects list.
1389 | if (isa<ConstantPointerNull>(Val: UnderlyingObj) && |
1390 | !NullPointerIsDefined( |
1391 | F: TheLoop->getHeader()->getParent(), |
1392 | AS: UnderlyingObj->getType()->getPointerAddressSpace())) |
1393 | continue; |
1394 | |
1395 | UnderlyingObjToAccessMap::iterator Prev = |
1396 | ObjToLastAccess.find(Val: UnderlyingObj); |
1397 | if (Prev != ObjToLastAccess.end()) |
1398 | DepCands.unionSets(V1: Access, V2: Prev->second); |
1399 | |
1400 | ObjToLastAccess[UnderlyingObj] = Access; |
1401 | LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n" ); |
1402 | } |
1403 | } |
1404 | } |
1405 | } |
1406 | } |
1407 | } |
1408 | |
1409 | /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping, |
1410 | /// i.e. monotonically increasing/decreasing. |
1411 | static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR, |
1412 | PredicatedScalarEvolution &PSE, const Loop *L) { |
1413 | |
1414 | // FIXME: This should probably only return true for NUW. |
1415 | if (AR->getNoWrapFlags(Mask: SCEV::NoWrapMask)) |
1416 | return true; |
1417 | |
1418 | if (PSE.hasNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW)) |
1419 | return true; |
1420 | |
1421 | // Scalar evolution does not propagate the non-wrapping flags to values that |
1422 | // are derived from a non-wrapping induction variable because non-wrapping |
1423 | // could be flow-sensitive. |
1424 | // |
1425 | // Look through the potentially overflowing instruction to try to prove |
1426 | // non-wrapping for the *specific* value of Ptr. |
1427 | |
1428 | // The arithmetic implied by an inbounds GEP can't overflow. |
1429 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1430 | if (!GEP || !GEP->isInBounds()) |
1431 | return false; |
1432 | |
1433 | // Make sure there is only one non-const index and analyze that. |
1434 | Value *NonConstIndex = nullptr; |
1435 | for (Value *Index : GEP->indices()) |
1436 | if (!isa<ConstantInt>(Val: Index)) { |
1437 | if (NonConstIndex) |
1438 | return false; |
1439 | NonConstIndex = Index; |
1440 | } |
1441 | if (!NonConstIndex) |
1442 | // The recurrence is on the pointer, ignore for now. |
1443 | return false; |
1444 | |
1445 | // The index in GEP is signed. It is non-wrapping if it's derived from a NSW |
1446 | // AddRec using a NSW operation. |
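// For illustration, a hypothetical IR pattern this matches (names are
// invented for the example, not taken from the original comments):
//   %iv  = phi i64 ...                                  ; {0,+,1}<nsw>
//   %idx = add nsw i64 %iv, 7       ; OBO with a constant second operand
//   %gep = getelementptr inbounds i32, ptr %p, i64 %idx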
1447 | if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(Val: NonConstIndex)) |
1448 | if (OBO->hasNoSignedWrap() && |
// Assume the other operand is constant so that the AddRec can be
// easily found.
1451 | isa<ConstantInt>(Val: OBO->getOperand(i_nocapture: 1))) { |
1452 | auto *OpScev = PSE.getSCEV(V: OBO->getOperand(i_nocapture: 0)); |
1453 | |
1454 | if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(Val: OpScev)) |
1455 | return OpAR->getLoop() == L && OpAR->getNoWrapFlags(Mask: SCEV::FlagNSW); |
1456 | } |
1457 | |
1458 | return false; |
1459 | } |
1460 | |
1461 | /// Check whether the access through \p Ptr has a constant stride. |
1462 | std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, |
1463 | Type *AccessTy, Value *Ptr, |
1464 | const Loop *Lp, |
1465 | const DenseMap<Value *, const SCEV *> &StridesMap, |
1466 | bool Assume, bool ShouldCheckWrap) { |
1467 | Type *Ty = Ptr->getType(); |
1468 | assert(Ty->isPointerTy() && "Unexpected non-ptr" ); |
1469 | |
1470 | if (isa<ScalableVectorType>(Val: AccessTy)) { |
1471 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy |
1472 | << "\n" ); |
1473 | return std::nullopt; |
1474 | } |
1475 | |
1476 | const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, PtrToStride: StridesMap, Ptr); |
1477 | |
1478 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: PtrScev); |
1479 | if (Assume && !AR) |
1480 | AR = PSE.getAsAddRec(V: Ptr); |
1481 | |
1482 | if (!AR) { |
1483 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr |
1484 | << " SCEV: " << *PtrScev << "\n" ); |
1485 | return std::nullopt; |
1486 | } |
1487 | |
1488 | // The access function must stride over the innermost loop. |
1489 | if (Lp != AR->getLoop()) { |
1490 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " |
1491 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1492 | return std::nullopt; |
1493 | } |
1494 | |
1495 | // Check the step is constant. |
1496 | const SCEV *Step = AR->getStepRecurrence(SE&: *PSE.getSE()); |
1497 | |
1498 | // Calculate the pointer stride and check if it is constant. |
1499 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Step); |
1500 | if (!C) { |
1501 | LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr |
1502 | << " SCEV: " << *AR << "\n" ); |
1503 | return std::nullopt; |
1504 | } |
1505 | |
1506 | auto &DL = Lp->getHeader()->getModule()->getDataLayout(); |
1507 | TypeSize AllocSize = DL.getTypeAllocSize(Ty: AccessTy); |
1508 | int64_t Size = AllocSize.getFixedValue(); |
1509 | const APInt &APStepVal = C->getAPInt(); |
1510 | |
1511 | // Huge step value - give up. |
1512 | if (APStepVal.getBitWidth() > 64) |
1513 | return std::nullopt; |
1514 | |
1515 | int64_t StepVal = APStepVal.getSExtValue(); |
1516 | |
1517 | // Strided access. |
1518 | int64_t Stride = StepVal / Size; |
1519 | int64_t Rem = StepVal % Size; |
1520 | if (Rem) |
1521 | return std::nullopt; |
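// Worked example (illustrative): with i32 elements (AllocSize = 4), a step
// of 8 bytes yields Stride = 2, while a step of 6 bytes leaves Rem = 2 and
// is rejected above.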
1522 | |
1523 | if (!ShouldCheckWrap) |
1524 | return Stride; |
1525 | |
1526 | // The address calculation must not wrap. Otherwise, a dependence could be |
1527 | // inverted. |
1528 | if (isNoWrapAddRec(Ptr, AR, PSE, L: Lp)) |
1529 | return Stride; |
1530 | |
// An inbounds getelementptr that is an AddRec with a unit stride
// cannot wrap by definition. If it did, the result would be poison
// and any memory access dependent on it would be immediate UB
// when executed.
1535 | if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr); |
1536 | GEP && GEP->isInBounds() && (Stride == 1 || Stride == -1)) |
1537 | return Stride; |
1538 | |
// If the null pointer is undefined, then an access sequence which would
// otherwise access it can be assumed not to wrap in the unsigned sense.
// Note that this assumes the object in memory is aligned to the natural
// alignment.
1542 | unsigned AddrSpace = Ty->getPointerAddressSpace(); |
1543 | if (!NullPointerIsDefined(F: Lp->getHeader()->getParent(), AS: AddrSpace) && |
1544 | (Stride == 1 || Stride == -1)) |
1545 | return Stride; |
1546 | |
1547 | if (Assume) { |
1548 | PSE.setNoOverflow(V: Ptr, Flags: SCEVWrapPredicate::IncrementNUSW); |
1549 | LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" |
1550 | << "LAA: Pointer: " << *Ptr << "\n" |
1551 | << "LAA: SCEV: " << *AR << "\n" |
1552 | << "LAA: Added an overflow assumption\n" ); |
1553 | return Stride; |
1554 | } |
1555 | LLVM_DEBUG( |
1556 | dbgs() << "LAA: Bad stride - Pointer may wrap in the address space " |
1557 | << *Ptr << " SCEV: " << *AR << "\n" ); |
1558 | return std::nullopt; |
1559 | } |
1560 | |
1561 | std::optional<int> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA, |
1562 | Type *ElemTyB, Value *PtrB, |
1563 | const DataLayout &DL, |
1564 | ScalarEvolution &SE, bool StrictCheck, |
1565 | bool CheckType) { |
1566 | assert(PtrA && PtrB && "Expected non-nullptr pointers." ); |
1567 | |
1568 | // Make sure that A and B are different pointers. |
1569 | if (PtrA == PtrB) |
1570 | return 0; |
1571 | |
1572 | // Make sure that the element types are the same if required. |
1573 | if (CheckType && ElemTyA != ElemTyB) |
1574 | return std::nullopt; |
1575 | |
1576 | unsigned ASA = PtrA->getType()->getPointerAddressSpace(); |
1577 | unsigned ASB = PtrB->getType()->getPointerAddressSpace(); |
1578 | |
1579 | // Check that the address spaces match. |
1580 | if (ASA != ASB) |
1581 | return std::nullopt; |
1582 | unsigned IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1583 | |
1584 | APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0); |
1585 | Value *PtrA1 = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA); |
1586 | Value *PtrB1 = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetB); |
1587 | |
1588 | int Val; |
1589 | if (PtrA1 == PtrB1) { |
1590 | // Retrieve the address space again as pointer stripping now tracks through |
1591 | // `addrspacecast`. |
1592 | ASA = cast<PointerType>(Val: PtrA1->getType())->getAddressSpace(); |
1593 | ASB = cast<PointerType>(Val: PtrB1->getType())->getAddressSpace(); |
1594 | // Check that the address spaces match and that the pointers are valid. |
1595 | if (ASA != ASB) |
1596 | return std::nullopt; |
1597 | |
1598 | IdxWidth = DL.getIndexSizeInBits(AS: ASA); |
1599 | OffsetA = OffsetA.sextOrTrunc(width: IdxWidth); |
1600 | OffsetB = OffsetB.sextOrTrunc(width: IdxWidth); |
1601 | |
1602 | OffsetB -= OffsetA; |
1603 | Val = OffsetB.getSExtValue(); |
1604 | } else { |
1605 | // Otherwise compute the distance with SCEV between the base pointers. |
1606 | const SCEV *PtrSCEVA = SE.getSCEV(V: PtrA); |
1607 | const SCEV *PtrSCEVB = SE.getSCEV(V: PtrB); |
1608 | const auto *Diff = |
1609 | dyn_cast<SCEVConstant>(Val: SE.getMinusSCEV(LHS: PtrSCEVB, RHS: PtrSCEVA)); |
1610 | if (!Diff) |
1611 | return std::nullopt; |
1612 | Val = Diff->getAPInt().getSExtValue(); |
1613 | } |
1614 | int Size = DL.getTypeStoreSize(Ty: ElemTyA); |
1615 | int Dist = Val / Size; |
1616 | |
// Ensure that the calculated distance matches the type-based one after all
// bitcasts have been stripped from the provided pointers.
1619 | if (!StrictCheck || Dist * Size == Val) |
1620 | return Dist; |
1621 | return std::nullopt; |
1622 | } |
1623 | |
1624 | bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, |
1625 | const DataLayout &DL, ScalarEvolution &SE, |
1626 | SmallVectorImpl<unsigned> &SortedIndices) { |
1627 | assert(llvm::all_of( |
1628 | VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
1629 | "Expected list of pointer operands." ); |
1630 | // Walk over the pointers, and map each of them to an offset relative to |
1631 | // first pointer in the array. |
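// Illustrative sketch (hypothetical input): for i32 pointers at element
// offsets {0, 2, 1} relative to VL[0], the sorted order is VL[0], VL[2],
// VL[1], so SortedIndices becomes {0, 2, 1}.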
1632 | Value *Ptr0 = VL[0]; |
1633 | |
1634 | using DistOrdPair = std::pair<int64_t, int>; |
1635 | auto Compare = llvm::less_first(); |
1636 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
1637 | Offsets.emplace(args: 0, args: 0); |
1638 | int Cnt = 1; |
1639 | bool IsConsecutive = true; |
1640 | for (auto *Ptr : VL.drop_front()) { |
1641 | std::optional<int> Diff = getPointersDiff(ElemTyA: ElemTy, PtrA: Ptr0, ElemTyB: ElemTy, PtrB: Ptr, DL, SE, |
1642 | /*StrictCheck=*/true); |
1643 | if (!Diff) |
1644 | return false; |
1645 | |
// Check whether a pointer with the same offset was already seen.
1647 | int64_t Offset = *Diff; |
1648 | auto Res = Offsets.emplace(args&: Offset, args&: Cnt); |
1649 | if (!Res.second) |
1650 | return false; |
1651 | // Consecutive order if the inserted element is the last one. |
1652 | IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end(); |
1653 | ++Cnt; |
1654 | } |
1655 | SortedIndices.clear(); |
1656 | if (!IsConsecutive) { |
1657 | // Fill SortedIndices array only if it is non-consecutive. |
1658 | SortedIndices.resize(N: VL.size()); |
1659 | Cnt = 0; |
1660 | for (const std::pair<int64_t, int> &Pair : Offsets) { |
1661 | SortedIndices[Cnt] = Pair.second; |
1662 | ++Cnt; |
1663 | } |
1664 | } |
1665 | return true; |
1666 | } |
1667 | |
1668 | /// Returns true if the memory operations \p A and \p B are consecutive. |
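/// For illustration (hypothetical accesses, not from the original comment):
/// two i32 loads of a[i] and a[i+1] have a pointer distance of exactly one
/// element and are therefore consecutive.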
1669 | bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL, |
1670 | ScalarEvolution &SE, bool CheckType) { |
1671 | Value *PtrA = getLoadStorePointerOperand(V: A); |
1672 | Value *PtrB = getLoadStorePointerOperand(V: B); |
1673 | if (!PtrA || !PtrB) |
1674 | return false; |
1675 | Type *ElemTyA = getLoadStoreType(I: A); |
1676 | Type *ElemTyB = getLoadStoreType(I: B); |
1677 | std::optional<int> Diff = |
1678 | getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE, |
1679 | /*StrictCheck=*/true, CheckType); |
1680 | return Diff && *Diff == 1; |
1681 | } |
1682 | |
1683 | void MemoryDepChecker::addAccess(StoreInst *SI) { |
1684 | visitPointers(StartPtr: SI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1685 | AddPointer: [this, SI](Value *Ptr) { |
1686 | Accesses[MemAccessInfo(Ptr, true)].push_back(x: AccessIdx); |
1687 | InstMap.push_back(Elt: SI); |
1688 | ++AccessIdx; |
1689 | }); |
1690 | } |
1691 | |
1692 | void MemoryDepChecker::addAccess(LoadInst *LI) { |
1693 | visitPointers(StartPtr: LI->getPointerOperand(), InnermostLoop: *InnermostLoop, |
1694 | AddPointer: [this, LI](Value *Ptr) { |
1695 | Accesses[MemAccessInfo(Ptr, false)].push_back(x: AccessIdx); |
1696 | InstMap.push_back(Elt: LI); |
1697 | ++AccessIdx; |
1698 | }); |
1699 | } |
1700 | |
1701 | MemoryDepChecker::VectorizationSafetyStatus |
1702 | MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) { |
1703 | switch (Type) { |
1704 | case NoDep: |
1705 | case Forward: |
1706 | case BackwardVectorizable: |
1707 | return VectorizationSafetyStatus::Safe; |
1708 | |
1709 | case Unknown: |
1710 | return VectorizationSafetyStatus::PossiblySafeWithRtChecks; |
1711 | case ForwardButPreventsForwarding: |
1712 | case Backward: |
1713 | case BackwardVectorizableButPreventsForwarding: |
1714 | case IndirectUnsafe: |
1715 | return VectorizationSafetyStatus::Unsafe; |
1716 | } |
1717 | llvm_unreachable("unexpected DepType!" ); |
1718 | } |
1719 | |
1720 | bool MemoryDepChecker::Dependence::isBackward() const { |
1721 | switch (Type) { |
1722 | case NoDep: |
1723 | case Forward: |
1724 | case ForwardButPreventsForwarding: |
1725 | case Unknown: |
1726 | case IndirectUnsafe: |
1727 | return false; |
1728 | |
1729 | case BackwardVectorizable: |
1730 | case Backward: |
1731 | case BackwardVectorizableButPreventsForwarding: |
1732 | return true; |
1733 | } |
1734 | llvm_unreachable("unexpected DepType!" ); |
1735 | } |
1736 | |
1737 | bool MemoryDepChecker::Dependence::isPossiblyBackward() const { |
1738 | return isBackward() || Type == Unknown; |
1739 | } |
1740 | |
1741 | bool MemoryDepChecker::Dependence::isForward() const { |
1742 | switch (Type) { |
1743 | case Forward: |
1744 | case ForwardButPreventsForwarding: |
1745 | return true; |
1746 | |
1747 | case NoDep: |
1748 | case Unknown: |
1749 | case BackwardVectorizable: |
1750 | case Backward: |
1751 | case BackwardVectorizableButPreventsForwarding: |
1752 | case IndirectUnsafe: |
1753 | return false; |
1754 | } |
1755 | llvm_unreachable("unexpected DepType!" ); |
1756 | } |
1757 | |
1758 | bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance, |
1759 | uint64_t TypeByteSize) { |
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making vectorized code run a lot slower.
1764 | // a[i] = a[i-3] ^ a[i-8]; |
1765 | // The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and |
1766 | // hence on your typical architecture store-load forwarding does not take |
1767 | // place. Vectorizing in such cases does not make sense. |
1768 | // Store-load forwarding distance. |
1769 | |
1770 | // After this many iterations store-to-load forwarding conflicts should not |
1771 | // cause any slowdowns. |
1772 | const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize; |
1773 | // Maximum vector factor. |
1774 | uint64_t MaxVFWithoutSLForwardIssues = std::min( |
1775 | a: VectorizerParams::MaxVectorWidth * TypeByteSize, b: MinDepDistBytes); |
1776 | |
1777 | // Compute the smallest VF at which the store and load would be misaligned. |
1778 | for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues; |
1779 | VF *= 2) { |
// If the number of vector iterations between the store and the load is
// small, we could incur conflicts.
1782 | if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { |
1783 | MaxVFWithoutSLForwardIssues = (VF >> 1); |
1784 | break; |
1785 | } |
1786 | } |
1787 | |
1788 | if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) { |
1789 | LLVM_DEBUG( |
1790 | dbgs() << "LAA: Distance " << Distance |
1791 | << " that could cause a store-load forwarding conflict\n" ); |
1792 | return true; |
1793 | } |
1794 | |
1795 | if (MaxVFWithoutSLForwardIssues < MinDepDistBytes && |
1796 | MaxVFWithoutSLForwardIssues != |
1797 | VectorizerParams::MaxVectorWidth * TypeByteSize) |
1798 | MinDepDistBytes = MaxVFWithoutSLForwardIssues; |
1799 | return false; |
1800 | } |
1801 | |
1802 | void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) { |
1803 | if (Status < S) |
1804 | Status = S; |
1805 | } |
1806 | |
/// Given a dependence-distance \p Dist between two
/// memory accesses that have the same stride, whose absolute value is given
/// in \p Stride, and that have the same type size \p TypeByteSize,
/// in a loop whose backedge-taken count is \p BackedgeTakenCount, check if it is
1811 | /// possible to prove statically that the dependence distance is larger |
1812 | /// than the range that the accesses will travel through the execution of |
1813 | /// the loop. If so, return true; false otherwise. This is useful for |
1814 | /// example in loops such as the following (PR31098): |
1815 | /// for (i = 0; i < D; ++i) { |
1816 | /// = out[i]; |
1817 | /// out[i+D] = |
1818 | /// } |
1819 | static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE, |
1820 | const SCEV &BackedgeTakenCount, |
1821 | const SCEV &Dist, uint64_t Stride, |
1822 | uint64_t TypeByteSize) { |
1823 | |
1824 | // If we can prove that |
1825 | // (**) |Dist| > BackedgeTakenCount * Step |
1826 | // where Step is the absolute stride of the memory accesses in bytes, |
1827 | // then there is no dependence. |
1828 | // |
1829 | // Rationale: |
1830 | // We basically want to check if the absolute distance (|Dist/Step|) |
1831 | // is >= the loop iteration count (or > BackedgeTakenCount). |
// This is equivalent to the Strong SIV Test (Practical Dependence Testing,
// Section 4.2.1). Note that for vectorization it is sufficient to prove
// that the dependence distance is >= VF; this is checked elsewhere.
1835 | // But in some cases we can prune dependence distances early, and |
1836 | // even before selecting the VF, and without a runtime test, by comparing |
1837 | // the distance against the loop iteration count. Since the vectorized code |
1838 | // will be executed only if LoopCount >= VF, proving distance >= LoopCount |
1839 | // also guarantees that distance >= VF. |
1840 | // |
1841 | const uint64_t ByteStride = Stride * TypeByteSize; |
1842 | const SCEV *Step = SE.getConstant(Ty: BackedgeTakenCount.getType(), V: ByteStride); |
1843 | const SCEV *Product = SE.getMulExpr(LHS: &BackedgeTakenCount, RHS: Step); |
1844 | |
1845 | const SCEV *CastedDist = &Dist; |
1846 | const SCEV *CastedProduct = Product; |
1847 | uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Ty: Dist.getType()); |
1848 | uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Ty: Product->getType()); |
1849 | |
1850 | // The dependence distance can be positive/negative, so we sign extend Dist; |
1851 | // The multiplication of the absolute stride in bytes and the |
1852 | // backedgeTakenCount is non-negative, so we zero extend Product. |
1853 | if (DistTypeSizeBits > ProductTypeSizeBits) |
1854 | CastedProduct = SE.getZeroExtendExpr(Op: Product, Ty: Dist.getType()); |
1855 | else |
1856 | CastedDist = SE.getNoopOrSignExtend(V: &Dist, Ty: Product->getType()); |
1857 | |
1858 | // Is Dist - (BackedgeTakenCount * Step) > 0 ? |
1859 | // (If so, then we have proven (**) because |Dist| >= Dist) |
1860 | const SCEV *Minus = SE.getMinusSCEV(LHS: CastedDist, RHS: CastedProduct); |
1861 | if (SE.isKnownPositive(S: Minus)) |
1862 | return true; |
1863 | |
1864 | // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ? |
1865 | // (If so, then we have proven (**) because |Dist| >= -1*Dist) |
1866 | const SCEV *NegDist = SE.getNegativeSCEV(V: CastedDist); |
1867 | Minus = SE.getMinusSCEV(LHS: NegDist, RHS: CastedProduct); |
1868 | if (SE.isKnownPositive(S: Minus)) |
1869 | return true; |
1870 | |
1871 | return false; |
1872 | } |
1873 | |
/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance and \p TypeByteSize is the type size
/// in bytes.
1877 | /// |
1878 | /// \returns true if they are independent. |
1879 | static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride, |
1880 | uint64_t TypeByteSize) { |
1881 | assert(Stride > 1 && "The stride must be greater than 1" ); |
1882 | assert(TypeByteSize > 0 && "The type size in byte must be non-zero" ); |
1883 | assert(Distance > 0 && "The distance must be non-zero" ); |
1884 | |
// Skip if the distance is not a multiple of the type byte size.
1886 | if (Distance % TypeByteSize) |
1887 | return false; |
1888 | |
1889 | uint64_t ScaledDist = Distance / TypeByteSize; |
1890 | |
// No dependence if the scaled distance is not a multiple of the stride.
1892 | // E.g. |
1893 | // for (i = 0; i < 1024 ; i += 4) |
1894 | // A[i+2] = A[i] + 1; |
1895 | // |
1896 | // Two accesses in memory (scaled distance is 2, stride is 4): |
1897 | // | A[0] | | | | A[4] | | | | |
1898 | // | | | A[2] | | | | A[6] | | |
1899 | // |
1900 | // E.g. |
1901 | // for (i = 0; i < 1024 ; i += 3) |
1902 | // A[i+4] = A[i] + 1; |
1903 | // |
1904 | // Two accesses in memory (scaled distance is 4, stride is 3): |
1905 | // | A[0] | | | A[3] | | | A[6] | | | |
1906 | // | | | | | A[4] | | | A[7] | | |
1907 | return ScaledDist % Stride; |
1908 | } |
1909 | |
1910 | /// Returns true if any of the underlying objects has a loop varying address, |
1911 | /// i.e. may change in \p L. |
1912 | static bool |
1913 | isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects, |
1914 | ScalarEvolution &SE, const Loop *L) { |
1915 | return any_of(Range&: UnderlyingObjects, P: [&SE, L](const Value *UO) { |
1916 | return !SE.isLoopInvariant(S: SE.getSCEV(V: const_cast<Value *>(UO)), L); |
1917 | }); |
1918 | } |
1919 | |
// Get the dependence distance, stride, type size and whether the accesses are
// writes for the dependence between A and B. Outlined to a helper to limit
// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and
// others. Returns either the dependence result, if it could already be
// determined, or a tuple with (Distance, Stride, TypeSize, AIsWrite,
// BIsWrite).
1926 | static std::variant<MemoryDepChecker::Dependence::DepType, |
1927 | std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>> |
1928 | getDependenceDistanceStrideAndSize( |
1929 | const AccessAnalysis::MemAccessInfo &A, Instruction *AInst, |
1930 | const AccessAnalysis::MemAccessInfo &B, Instruction *BInst, |
1931 | const DenseMap<Value *, const SCEV *> &Strides, |
1932 | const DenseMap<Value *, SmallVector<const Value *, 16>> &UnderlyingObjects, |
1933 | PredicatedScalarEvolution &PSE, const Loop *InnermostLoop) { |
1934 | auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); |
1935 | auto &SE = *PSE.getSE(); |
1936 | auto [APtr, AIsWrite] = A; |
1937 | auto [BPtr, BIsWrite] = B; |
1938 | |
1939 | // Two reads are independent. |
1940 | if (!AIsWrite && !BIsWrite) |
1941 | return MemoryDepChecker::Dependence::NoDep; |
1942 | |
1943 | Type *ATy = getLoadStoreType(I: AInst); |
1944 | Type *BTy = getLoadStoreType(I: BInst); |
1945 | |
1946 | // We cannot check pointers in different address spaces. |
1947 | if (APtr->getType()->getPointerAddressSpace() != |
1948 | BPtr->getType()->getPointerAddressSpace()) |
1949 | return MemoryDepChecker::Dependence::Unknown; |
1950 | |
1951 | int64_t StrideAPtr = |
1952 | getPtrStride(PSE, AccessTy: ATy, Ptr: APtr, Lp: InnermostLoop, StridesMap: Strides, Assume: true).value_or(u: 0); |
1953 | int64_t StrideBPtr = |
1954 | getPtrStride(PSE, AccessTy: BTy, Ptr: BPtr, Lp: InnermostLoop, StridesMap: Strides, Assume: true).value_or(u: 0); |
1955 | |
1956 | const SCEV *Src = PSE.getSCEV(V: APtr); |
1957 | const SCEV *Sink = PSE.getSCEV(V: BPtr); |
1958 | |
1959 | // If the induction step is negative we have to invert source and sink of the |
1960 | // dependence when measuring the distance between them. We should not swap |
1961 | // AIsWrite with BIsWrite, as their uses expect them in program order. |
1962 | if (StrideAPtr < 0) { |
1963 | std::swap(a&: Src, b&: Sink); |
1964 | std::swap(a&: AInst, b&: BInst); |
1965 | } |
1966 | |
1967 | const SCEV *Dist = SE.getMinusSCEV(LHS: Sink, RHS: Src); |
1968 | |
1969 | LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << "Sink Scev: " << *Sink |
1970 | << "(Induction step: " << StrideAPtr << ")\n" ); |
1971 | LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst |
1972 | << ": " << *Dist << "\n" ); |
1973 | |
// We need accesses where the addresses of the accessed underlying objects do
// not change within the loop.
1976 | if (isLoopVariantIndirectAddress(UnderlyingObjects: UnderlyingObjects.find(Val: APtr)->second, SE, |
1977 | L: InnermostLoop) || |
1978 | isLoopVariantIndirectAddress(UnderlyingObjects: UnderlyingObjects.find(Val: BPtr)->second, SE, |
1979 | L: InnermostLoop)) |
1980 | return MemoryDepChecker::Dependence::IndirectUnsafe; |
1981 | |
1982 | // Need accesses with constant stride. We don't want to vectorize |
1983 | // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap |
1984 | // in the address space. |
1985 | if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) { |
1986 | LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n" ); |
1987 | return MemoryDepChecker::Dependence::Unknown; |
1988 | } |
1989 | |
1990 | uint64_t TypeByteSize = DL.getTypeAllocSize(Ty: ATy); |
1991 | bool HasSameSize = |
1992 | DL.getTypeStoreSizeInBits(Ty: ATy) == DL.getTypeStoreSizeInBits(Ty: BTy); |
1993 | if (!HasSameSize) |
1994 | TypeByteSize = 0; |
1995 | uint64_t Stride = std::abs(i: StrideAPtr); |
1996 | return std::make_tuple(args&: Dist, args&: Stride, args&: TypeByteSize, args&: AIsWrite, args&: BIsWrite); |
1997 | } |
1998 | |
1999 | MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent( |
2000 | const MemAccessInfo &A, unsigned AIdx, const MemAccessInfo &B, |
2001 | unsigned BIdx, const DenseMap<Value *, const SCEV *> &Strides, |
2002 | const DenseMap<Value *, SmallVector<const Value *, 16>> |
2003 | &UnderlyingObjects) { |
2004 | assert(AIdx < BIdx && "Must pass arguments in program order" ); |
2005 | |
// Get the dependence distance, stride, type size and which accesses are
// writes for the dependence between A and B.
2008 | auto Res = getDependenceDistanceStrideAndSize( |
2009 | A, AInst: InstMap[AIdx], B, BInst: InstMap[BIdx], Strides, UnderlyingObjects, PSE, |
2010 | InnermostLoop); |
2011 | if (std::holds_alternative<Dependence::DepType>(v: Res)) |
2012 | return std::get<Dependence::DepType>(v&: Res); |
2013 | |
2014 | const auto &[Dist, Stride, TypeByteSize, AIsWrite, BIsWrite] = |
2015 | std::get<std::tuple<const SCEV *, uint64_t, uint64_t, bool, bool>>(v&: Res); |
2016 | bool HasSameSize = TypeByteSize > 0; |
2017 | |
2018 | ScalarEvolution &SE = *PSE.getSE(); |
2019 | auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); |
2020 | if (!isa<SCEVCouldNotCompute>(Val: Dist) && HasSameSize && |
2021 | isSafeDependenceDistance(DL, SE, BackedgeTakenCount: *(PSE.getBackedgeTakenCount()), Dist: *Dist, |
2022 | Stride, TypeByteSize)) |
2023 | return Dependence::NoDep; |
2024 | |
2025 | const SCEVConstant *C = dyn_cast<SCEVConstant>(Val: Dist); |
2026 | if (!C) { |
2027 | LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n" ); |
2028 | FoundNonConstantDistanceDependence = true; |
2029 | return Dependence::Unknown; |
2030 | } |
2031 | |
2032 | const APInt &Val = C->getAPInt(); |
2033 | int64_t Distance = Val.getSExtValue(); |
2034 | |
2035 | // Attempt to prove strided accesses independent. |
2036 | if (std::abs(i: Distance) > 0 && Stride > 1 && HasSameSize && |
2037 | areStridedAccessesIndependent(Distance: std::abs(i: Distance), Stride, TypeByteSize)) { |
2038 | LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n" ); |
2039 | return Dependence::NoDep; |
2040 | } |
2041 | |
2042 | // Negative distances are not plausible dependencies. |
2043 | if (Val.isNegative()) { |
2044 | bool IsTrueDataDependence = (AIsWrite && !BIsWrite); |
2045 | // There is no need to update MaxSafeVectorWidthInBits after call to |
2046 | // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, |
2047 | // since a forward dependency will allow vectorization using any width. |
2048 | if (IsTrueDataDependence && EnableForwardingConflictDetection && |
2049 | (!HasSameSize || couldPreventStoreLoadForward(Distance: Val.abs().getZExtValue(), |
2050 | TypeByteSize))) { |
2051 | LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n" ); |
2052 | return Dependence::ForwardButPreventsForwarding; |
2053 | } |
2054 | |
2055 | LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n" ); |
2056 | return Dependence::Forward; |
2057 | } |
2058 | |
2059 | // Write to the same location with the same size. |
2060 | if (Val == 0) { |
2061 | if (HasSameSize) |
2062 | return Dependence::Forward; |
2063 | LLVM_DEBUG( |
2064 | dbgs() << "LAA: Zero dependence difference but different type sizes\n" ); |
2065 | return Dependence::Unknown; |
2066 | } |
2067 | |
2068 | assert(Val.isStrictlyPositive() && "Expect a positive value" ); |
2069 | |
2070 | if (!HasSameSize) { |
2071 | LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " |
2072 | "different type sizes\n" ); |
2073 | return Dependence::Unknown; |
2074 | } |
2075 | |
2076 | // Bail out early if passed-in parameters make vectorization not feasible. |
2077 | unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? |
2078 | VectorizerParams::VectorizationFactor : 1); |
2079 | unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? |
2080 | VectorizerParams::VectorizationInterleave : 1); |
2081 | // The minimum number of iterations for a vectorized/unrolled version. |
2082 | unsigned MinNumIter = std::max(a: ForcedFactor * ForcedUnroll, b: 2U); |
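// (Illustrative: a forced vectorization factor of 4 with a forced
// interleave count of 2 gives MinNumIter = 8.)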
2083 | |
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration in
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
// TypeByteSize (no need to add the last gap distance).
2088 | // |
2089 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2090 | // foo(int *A) { |
2091 | // int *B = (int *)((char *)A + 14); |
2092 | // for (i = 0 ; i < 1024 ; i += 2) |
2093 | // B[i] = A[i] + 1; |
2094 | // } |
2095 | // |
2096 | // Two accesses in memory (stride is 2): |
2097 | // | A[0] | | A[2] | | A[4] | | A[6] | | |
2098 | // | B[0] | | B[2] | | B[4] | |
2099 | // |
2100 | // Distance needs for vectorizing iterations except the last iteration: |
2101 | // 4 * 2 * (MinNumIter - 1). Distance needs for the last iteration: 4. |
2102 | // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4. |
2103 | // |
2104 | // If MinNumIter is 2, it is vectorizable as the minimum distance needed is |
2105 | // 12, which is less than distance. |
2106 | // |
2107 | // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), |
2108 | // the minimum distance needed is 28, which is greater than distance. It is |
2109 | // not safe to do vectorization. |
2110 | uint64_t MinDistanceNeeded = |
2111 | TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize; |
2112 | if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) { |
2113 | LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance " |
2114 | << Distance << '\n'); |
2115 | return Dependence::Backward; |
2116 | } |
2117 | |
// Unsafe if the minimum distance needed is greater than the smallest
// recorded dependence distance.
2120 | if (MinDistanceNeeded > MinDepDistBytes) { |
2121 | LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least " |
2122 | << MinDistanceNeeded << " size in bytes\n" ); |
2123 | return Dependence::Backward; |
2124 | } |
2125 | |
2126 | // Positive distance bigger than max vectorization factor. |
2127 | // FIXME: Should use max factor instead of max distance in bytes, which could |
2128 | // not handle different types. |
2129 | // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. |
2130 | // void foo (int *A, char *B) { |
2131 | // for (unsigned i = 0; i < 1024; i++) { |
2132 | // A[i+2] = A[i] + 1; |
2133 | // B[i+2] = B[i] + 1; |
2134 | // } |
2135 | // } |
2136 | // |
// This case is currently unsafe according to the max safe distance. If we
// analyze the two accesses on array B, the max safe dependence distance
// is 2. Then we analyze the accesses on array A, where the minimum distance
// needed is 8. Since 8 is greater than 2, vectorization is forbidden, even
// though both A and B could actually be vectorized with a factor of 2.
2142 | MinDepDistBytes = |
2143 | std::min(a: static_cast<uint64_t>(Distance), b: MinDepDistBytes); |
2144 | |
2145 | bool IsTrueDataDependence = (!AIsWrite && BIsWrite); |
2146 | uint64_t MinDepDistBytesOld = MinDepDistBytes; |
2147 | if (IsTrueDataDependence && EnableForwardingConflictDetection && |
2148 | couldPreventStoreLoadForward(Distance, TypeByteSize)) { |
2149 | // Sanity check that we didn't update MinDepDistBytes when calling |
2150 | // couldPreventStoreLoadForward |
2151 | assert(MinDepDistBytes == MinDepDistBytesOld && |
2152 | "An update to MinDepDistBytes requires an update to " |
2153 | "MaxSafeVectorWidthInBits" ); |
2154 | (void)MinDepDistBytesOld; |
2155 | return Dependence::BackwardVectorizableButPreventsForwarding; |
2156 | } |
2157 | |
2158 | // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits |
2159 | // since there is a backwards dependency. |
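// Worked example (illustrative): MinDepDistBytes = 16 with TypeByteSize = 4
// and Stride = 2 gives MaxVF = 2, i.e. MaxSafeVectorWidthInBits <= 64.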
2160 | uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * Stride); |
2161 | LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue() |
2162 | << " with max VF = " << MaxVF << '\n'); |
2163 | uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; |
2164 | MaxSafeVectorWidthInBits = std::min(a: MaxSafeVectorWidthInBits, b: MaxVFInBits); |
2165 | return Dependence::BackwardVectorizable; |
2166 | } |
2167 | |
2168 | bool MemoryDepChecker::areDepsSafe( |
2169 | DepCandidates &AccessSets, MemAccessInfoList &CheckDeps, |
2170 | const DenseMap<Value *, const SCEV *> &Strides, |
2171 | const DenseMap<Value *, SmallVector<const Value *, 16>> |
2172 | &UnderlyingObjects) { |
2173 | |
2174 | MinDepDistBytes = -1; |
2175 | SmallPtrSet<MemAccessInfo, 8> Visited; |
2176 | for (MemAccessInfo CurAccess : CheckDeps) { |
2177 | if (Visited.count(Ptr: CurAccess)) |
2178 | continue; |
2179 | |
2180 | // Get the relevant memory access set. |
2181 | EquivalenceClasses<MemAccessInfo>::iterator I = |
2182 | AccessSets.findValue(V: AccessSets.getLeaderValue(V: CurAccess)); |
2183 | |
2184 | // Check accesses within this set. |
2185 | EquivalenceClasses<MemAccessInfo>::member_iterator AI = |
2186 | AccessSets.member_begin(I); |
2187 | EquivalenceClasses<MemAccessInfo>::member_iterator AE = |
2188 | AccessSets.member_end(); |
2189 | |
2190 | // Check every access pair. |
2191 | while (AI != AE) { |
2192 | Visited.insert(Ptr: *AI); |
2193 | bool AIIsWrite = AI->getInt(); |
// Check loads only against the next members of the equivalence class,
// but check stores also against other stores to the same address in
// the same equivalence class.
2196 | EquivalenceClasses<MemAccessInfo>::member_iterator OI = |
2197 | (AIIsWrite ? AI : std::next(x: AI)); |
2198 | while (OI != AE) { |
2199 | // Check every accessing instruction pair in program order. |
2200 | for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(), |
2201 | I1E = Accesses[*AI].end(); I1 != I1E; ++I1) |
// Scan all accesses of the other member, but when a member is checked
// against itself, scan only the accesses that follow I1.
2204 | for (std::vector<unsigned>::iterator |
2205 | I2 = (OI == AI ? std::next(x: I1) : Accesses[*OI].begin()), |
2206 | I2E = (OI == AI ? I1E : Accesses[*OI].end()); |
2207 | I2 != I2E; ++I2) { |
2208 | auto A = std::make_pair(x: &*AI, y&: *I1); |
2209 | auto B = std::make_pair(x: &*OI, y&: *I2); |
2210 | |
2211 | assert(*I1 != *I2); |
2212 | if (*I1 > *I2) |
2213 | std::swap(x&: A, y&: B); |
2214 | |
2215 | Dependence::DepType Type = |
2216 | isDependent(A: *A.first, AIdx: A.second, B: *B.first, BIdx: B.second, Strides, |
2217 | UnderlyingObjects); |
2218 | mergeInStatus(S: Dependence::isSafeForVectorization(Type)); |
2219 | |
2220 | // Gather dependences unless we accumulated MaxDependences |
2221 | // dependences. In that case return as soon as we find the first |
2222 | // unsafe dependence. This puts a limit on this quadratic |
2223 | // algorithm. |
2224 | if (RecordDependences) { |
2225 | if (Type != Dependence::NoDep) |
2226 | Dependences.push_back(Elt: Dependence(A.second, B.second, Type)); |
2227 | |
2228 | if (Dependences.size() >= MaxDependences) { |
2229 | RecordDependences = false; |
2230 | Dependences.clear(); |
2231 | LLVM_DEBUG(dbgs() |
2232 | << "Too many dependences, stopped recording\n" ); |
2233 | } |
2234 | } |
2235 | if (!RecordDependences && !isSafeForVectorization()) |
2236 | return false; |
2237 | } |
2238 | ++OI; |
2239 | } |
2240 | AI++; |
2241 | } |
2242 | } |
2243 | |
2244 | LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n" ); |
2245 | return isSafeForVectorization(); |
2246 | } |
2247 | |
2248 | SmallVector<Instruction *, 4> |
2249 | MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const { |
2250 | MemAccessInfo Access(Ptr, isWrite); |
2251 | auto &IndexVector = Accesses.find(Val: Access)->second; |
2252 | |
2253 | SmallVector<Instruction *, 4> Insts; |
2254 | transform(Range: IndexVector, |
2255 | d_first: std::back_inserter(x&: Insts), |
2256 | F: [&](unsigned Idx) { return this->InstMap[Idx]; }); |
2257 | return Insts; |
2258 | } |
2259 | |
2260 | const char *MemoryDepChecker::Dependence::DepName[] = { |
2261 | "NoDep" , |
2262 | "Unknown" , |
2263 | "IndidrectUnsafe" , |
2264 | "Forward" , |
2265 | "ForwardButPreventsForwarding" , |
2266 | "Backward" , |
2267 | "BackwardVectorizable" , |
2268 | "BackwardVectorizableButPreventsForwarding" }; |
2269 | |
2270 | void MemoryDepChecker::Dependence::print( |
2271 | raw_ostream &OS, unsigned Depth, |
2272 | const SmallVectorImpl<Instruction *> &Instrs) const { |
2273 | OS.indent(NumSpaces: Depth) << DepName[Type] << ":\n" ; |
2274 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Source] << " -> \n" ; |
2275 | OS.indent(NumSpaces: Depth + 2) << *Instrs[Destination] << "\n" ; |
2276 | } |
2277 | |
2278 | bool LoopAccessInfo::canAnalyzeLoop() { |
2279 | // We need to have a loop header. |
2280 | LLVM_DEBUG(dbgs() << "LAA: Found a loop in " |
2281 | << TheLoop->getHeader()->getParent()->getName() << ": " |
2282 | << TheLoop->getHeader()->getName() << '\n'); |
2283 | |
2284 | // We can only analyze innermost loops. |
2285 | if (!TheLoop->isInnermost()) { |
2286 | LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n" ); |
2287 | recordAnalysis(RemarkName: "NotInnerMostLoop" ) << "loop is not the innermost loop" ; |
2288 | return false; |
2289 | } |
2290 | |
2291 | // We must have a single backedge. |
2292 | if (TheLoop->getNumBackEdges() != 1) { |
2293 | LLVM_DEBUG( |
2294 | dbgs() << "LAA: loop control flow is not understood by analyzer\n" ); |
2295 | recordAnalysis(RemarkName: "CFGNotUnderstood" ) |
2296 | << "loop control flow is not understood by analyzer" ; |
2297 | return false; |
2298 | } |
2299 | |
2300 | // ScalarEvolution needs to be able to find the exit count. |
2301 | const SCEV *ExitCount = PSE->getBackedgeTakenCount(); |
2302 | if (isa<SCEVCouldNotCompute>(Val: ExitCount)) { |
2303 | recordAnalysis(RemarkName: "CantComputeNumberOfIterations" ) |
2304 | << "could not determine number of loop iterations" ; |
2305 | LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n" ); |
2306 | return false; |
2307 | } |
2308 | |
2309 | return true; |
2310 | } |
2311 | |
2312 | void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI, |
2313 | const TargetLibraryInfo *TLI, |
2314 | DominatorTree *DT) { |
2315 | // Holds the Load and Store instructions. |
2316 | SmallVector<LoadInst *, 16> Loads; |
2317 | SmallVector<StoreInst *, 16> Stores; |
2318 | SmallPtrSet<MDNode *, 8> LoopAliasScopes; |
2319 | |
2320 | // Holds all the different accesses in the loop. |
2321 | unsigned NumReads = 0; |
2322 | unsigned NumReadWrites = 0; |
2323 | |
2324 | bool HasComplexMemInst = false; |
2325 | |
2326 | // A runtime check is only legal to insert if there are no convergent calls. |
2327 | HasConvergentOp = false; |
2328 | |
2329 | PtrRtChecking->Pointers.clear(); |
2330 | PtrRtChecking->Need = false; |
2331 | |
2332 | const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); |
2333 | |
2334 | const bool EnableMemAccessVersioningOfLoop = |
2335 | EnableMemAccessVersioning && |
2336 | !TheLoop->getHeader()->getParent()->hasOptSize(); |
2337 | |
2338 | // Traverse blocks in fixed RPOT order, regardless of their storage in the |
2339 | // loop info, as it may be arbitrary. |
2340 | LoopBlocksRPO RPOT(TheLoop); |
2341 | RPOT.perform(LI); |
2342 | for (BasicBlock *BB : RPOT) { |
2343 | // Scan the BB and collect legal loads and stores. Also detect any |
2344 | // convergent instructions. |
2345 | for (Instruction &I : *BB) { |
2346 | if (auto *Call = dyn_cast<CallBase>(Val: &I)) { |
2347 | if (Call->isConvergent()) |
2348 | HasConvergentOp = true; |
2349 | } |
2350 | |
// If both a non-vectorizable memory instruction and a convergent
// operation have been found in this loop, there is no reason to
// continue the search.
2353 | if (HasComplexMemInst && HasConvergentOp) { |
2354 | CanVecMem = false; |
2355 | return; |
2356 | } |
2357 | |
2358 | // Avoid hitting recordAnalysis multiple times. |
2359 | if (HasComplexMemInst) |
2360 | continue; |
2361 | |
2362 | // Record alias scopes defined inside the loop. |
2363 | if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I)) |
2364 | for (Metadata *Op : Decl->getScopeList()->operands()) |
2365 | LoopAliasScopes.insert(Ptr: cast<MDNode>(Val: Op)); |
2366 | |
2367 | // Many math library functions read the rounding mode. We will only |
2368 | // vectorize a loop if it contains known function calls that don't set |
2369 | // the flag. Therefore, it is safe to ignore this read from memory. |
2370 | auto *Call = dyn_cast<CallInst>(Val: &I); |
2371 | if (Call && getVectorIntrinsicIDForCall(CI: Call, TLI)) |
2372 | continue; |
2373 | |
2374 | // If this is a load, save it. If this instruction can read from memory |
2375 | // but is not a load, then we quit. Notice that we don't handle function |
2376 | // calls that read or write. |
2377 | if (I.mayReadFromMemory()) { |
2378 | // If the function has an explicit vectorized counterpart, we can safely |
2379 | // assume that it can be vectorized. |
2380 | if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && |
2381 | !VFDatabase::getMappings(CI: *Call).empty()) |
2382 | continue; |
2383 | |
2384 | auto *Ld = dyn_cast<LoadInst>(Val: &I); |
2385 | if (!Ld) { |
2386 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: Ld) |
2387 | << "instruction cannot be vectorized" ; |
2388 | HasComplexMemInst = true; |
2389 | continue; |
2390 | } |
2391 | if (!Ld->isSimple() && !IsAnnotatedParallel) { |
2392 | recordAnalysis(RemarkName: "NonSimpleLoad" , Instr: Ld) |
2393 | << "read with atomic ordering or volatile read" ; |
2394 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n" ); |
2395 | HasComplexMemInst = true; |
2396 | continue; |
2397 | } |
2398 | NumLoads++; |
2399 | Loads.push_back(Elt: Ld); |
2400 | DepChecker->addAccess(LI: Ld); |
2401 | if (EnableMemAccessVersioningOfLoop) |
2402 | collectStridedAccess(LoadOrStoreInst: Ld); |
2403 | continue; |
2404 | } |
2405 | |
2406 | // Save 'store' instructions. Abort if other instructions write to memory. |
2407 | if (I.mayWriteToMemory()) { |
2408 | auto *St = dyn_cast<StoreInst>(Val: &I); |
2409 | if (!St) { |
2410 | recordAnalysis(RemarkName: "CantVectorizeInstruction" , Instr: St) |
2411 | << "instruction cannot be vectorized" ; |
2412 | HasComplexMemInst = true; |
2413 | continue; |
2414 | } |
2415 | if (!St->isSimple() && !IsAnnotatedParallel) { |
2416 | recordAnalysis(RemarkName: "NonSimpleStore" , Instr: St) |
2417 | << "write with atomic ordering or volatile write" ; |
2418 | LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n" ); |
2419 | HasComplexMemInst = true; |
2420 | continue; |
2421 | } |
2422 | NumStores++; |
2423 | Stores.push_back(Elt: St); |
2424 | DepChecker->addAccess(SI: St); |
2425 | if (EnableMemAccessVersioningOfLoop) |
2426 | collectStridedAccess(LoadOrStoreInst: St); |
2427 | } |
2428 | } // Next instr. |
2429 | } // Next block. |
2430 | |
2431 | if (HasComplexMemInst) { |
2432 | CanVecMem = false; |
2433 | return; |
2434 | } |
2435 | |
2436 | // Now we have two lists that hold the loads and the stores. |
2437 | // Next, we find the pointers that they use. |
2438 | |
2439 | // Check if we see any stores. If there are no stores, then we don't |
2440 | // care if the pointers are *restrict*. |
2441 | if (!Stores.size()) { |
2442 | LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n" ); |
2443 | CanVecMem = true; |
2444 | return; |
2445 | } |
2446 | |
2447 | MemoryDepChecker::DepCandidates DependentAccesses; |
2448 | AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE, |
2449 | LoopAliasScopes); |
2450 | |
2451 | // Holds the analyzed pointers. We don't want to call getUnderlyingObjects |
2452 | // multiple times on the same object. If the ptr is accessed twice, once |
2453 | // for read and once for write, it will only appear once (on the write |
2454 | // list). This is okay, since we are going to check for conflicts between |
2455 | // writes and between reads and writes, but not between reads and reads. |
2456 | SmallSet<std::pair<Value *, Type *>, 16> Seen; |
2457 | |
2458 | // Record uniform store addresses to identify if we have multiple stores |
2459 | // to the same address. |
2460 | SmallPtrSet<Value *, 16> UniformStores; |
2461 | |
2462 | for (StoreInst *ST : Stores) { |
2463 | Value *Ptr = ST->getPointerOperand(); |
2464 | |
2465 | if (isInvariant(V: Ptr)) { |
2466 | // Record store instructions to loop invariant addresses |
2467 | StoresToInvariantAddresses.push_back(Elt: ST); |
2468 | HasDependenceInvolvingLoopInvariantAddress |= |
2469 | !UniformStores.insert(Ptr).second; |
2470 | } |
2471 | |
2472 | // If we did *not* see this pointer before, insert it to the read-write |
2473 | // list. At this phase it is only a 'write' list. |
2474 | Type *AccessTy = getLoadStoreType(I: ST); |
2475 | if (Seen.insert(V: {Ptr, AccessTy}).second) { |
2476 | ++NumReadWrites; |
2477 | |
2478 | MemoryLocation Loc = MemoryLocation::get(SI: ST); |
2479 | // The TBAA metadata could have a control dependency on the predication |
2480 | // condition, so we cannot rely on it when determining whether or not we |
2481 | // need runtime pointer checks. |
2482 | if (blockNeedsPredication(BB: ST->getParent(), TheLoop, DT)) |
2483 | Loc.AATags.TBAA = nullptr; |
2484 | |
2485 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2486 | AddPointer: [&Accesses, AccessTy, Loc](Value *Ptr) { |
2487 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2488 | Accesses.addStore(Loc&: NewLoc, AccessTy); |
2489 | }); |
2490 | } |
2491 | } |
2492 | |
2493 | if (IsAnnotatedParallel) { |
2494 | LLVM_DEBUG( |
2495 | dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " |
2496 | << "checks.\n" ); |
2497 | CanVecMem = true; |
2498 | return; |
2499 | } |
2500 | |
2501 | for (LoadInst *LD : Loads) { |
2502 | Value *Ptr = LD->getPointerOperand(); |
2503 | // If we did *not* see this pointer before, insert it to the |
2504 | // read list. If we *did* see it before, then it is already in |
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
2508 | // If the address of i is unknown (for example A[B[i]]) then we may |
2509 | // read a few words, modify, and write a few words, and some of the |
2510 | // words may be written to the same address. |
2511 | bool IsReadOnlyPtr = false; |
2512 | Type *AccessTy = getLoadStoreType(I: LD); |
2513 | if (Seen.insert(V: {Ptr, AccessTy}).second || |
2514 | !getPtrStride(PSE&: *PSE, AccessTy: LD->getType(), Ptr, Lp: TheLoop, StridesMap: SymbolicStrides).value_or(u: 0)) { |
2515 | ++NumReads; |
2516 | IsReadOnlyPtr = true; |
2517 | } |
2518 | |
// See if there is an unsafe dependency between a load from a uniform address
// and a store to the same uniform address.
2521 | if (UniformStores.count(Ptr)) { |
2522 | LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform " |
2523 | "load and uniform store to the same address!\n" ); |
2524 | HasDependenceInvolvingLoopInvariantAddress = true; |
2525 | } |
2526 | |
2527 | MemoryLocation Loc = MemoryLocation::get(LI: LD); |
2528 | // The TBAA metadata could have a control dependency on the predication |
2529 | // condition, so we cannot rely on it when determining whether or not we |
2530 | // need runtime pointer checks. |
2531 | if (blockNeedsPredication(BB: LD->getParent(), TheLoop, DT)) |
2532 | Loc.AATags.TBAA = nullptr; |
2533 | |
2534 | visitPointers(StartPtr: const_cast<Value *>(Loc.Ptr), InnermostLoop: *TheLoop, |
2535 | AddPointer: [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) { |
2536 | MemoryLocation NewLoc = Loc.getWithNewPtr(NewPtr: Ptr); |
2537 | Accesses.addLoad(Loc&: NewLoc, AccessTy, IsReadOnly: IsReadOnlyPtr); |
2538 | }); |
2539 | } |
2540 | |
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
2543 | if (NumReadWrites == 1 && NumReads == 0) { |
2544 | LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n" ); |
2545 | CanVecMem = true; |
2546 | return; |
2547 | } |

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  Value *UncomputablePtr = nullptr;
  bool CanDoRTIfNeeded =
      Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
                               SymbolicStrides, UncomputablePtr, false);
  if (!CanDoRTIfNeeded) {
    auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
    recordAnalysis("CantIdentifyArrayBounds", I)
        << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    CanVecMem = false;
    return;
  }

  LLVM_DEBUG(
      dbgs() << "LAA: May be able to perform a memory runtime check if needed.\n");

  CanVecMem = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    CanVecMem = DepChecker->areDepsSafe(
        DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides,
        Accesses.getUnderlyingObjects());

    if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks(*DepChecker);

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      auto *SE = PSE->getSE();
      UncomputablePtr = nullptr;
      CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
          *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);

      // Check that we found the bounds for the pointer.
      if (!CanDoRTIfNeeded) {
        auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
        recordAnalysis("CantCheckMemDepsAtRunTime", I)
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        CanVecMem = false;
        return;
      }

      CanVecMem = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    CanVecMem = false;
    return;
  }

  if (CanVecMem)
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
  else
    emitUnsafeDependenceRemark();
}

void LoopAccessInfo::emitUnsafeDependenceRemark() {
  auto Deps = getDepChecker().getDependences();
  if (!Deps)
    return;
  auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
    return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
           MemoryDepChecker::VectorizationSafetyStatus::Safe;
  });
  if (Found == Deps->end())
    return;
  MemoryDepChecker::Dependence Dep = *Found;

  LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");

  // Emit remark for first unsafe dependence.
  bool HasForcedDistribution = false;
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
  if (Value) {
    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  const std::string Info =
      HasForcedDistribution
          ? "unsafe dependent memory operations in loop."
          : "unsafe dependent memory operations in loop. Use "
            "#pragma clang loop distribute(enable) to allow loop distribution "
            "to attempt to isolate the offending operations into a separate "
            "loop";
  OptimizationRemarkAnalysis &R =
      recordAnalysis("UnsafeDep", Dep.getDestination(*this)) << Info;

  switch (Dep.Type) {
  case MemoryDepChecker::Dependence::NoDep:
  case MemoryDepChecker::Dependence::Forward:
  case MemoryDepChecker::Dependence::BackwardVectorizable:
    llvm_unreachable("Unexpected dependence");
  case MemoryDepChecker::Dependence::Backward:
    R << "\nBackward loop carried data dependence.";
    break;
  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
    R << "\nForward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
    R << "\nBackward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::IndirectUnsafe:
    R << "\nUnsafe indirect dependence.";
    break;
  case MemoryDepChecker::Dependence::Unknown:
    R << "\nUnknown data dependence.";
    break;
  }

  if (Instruction *I = Dep.getSource(*this)) {
    DebugLoc SourceLoc = I->getDebugLoc();
    if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
      SourceLoc = DD->getDebugLoc();
    if (SourceLoc)
      R << " Memory location is the same as accessed at "
        << ore::NV("Location", SourceLoc);
  }
}

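// A block inside the loop that does not dominate the latch executes only
// conditionally. For example (illustrative):
//   for (i = 0; i < n; ++i)
//     if (c[i])
//       a[i] = 0; // This store's block needs predication.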
bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                           DominatorTree *DT) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
                                                           Instruction *I) {
  assert(!Report && "Multiple reports generated");

  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back
    // to using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName,
                                                        DL, CodeRegion);
  return *Report;
}

bool LoopAccessInfo::isInvariant(Value *V) const {
  auto *SE = PSE->getSE();
  // TODO: Is this really what we want? Even without FP SCEV, we may want some
  // trivially loop-invariant FP values to be considered invariant.
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);
  return SE->isLoopInvariant(S, TheLoop);
}

/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
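/// For example (illustrative), with %S = type { i32 } and a GEP
///   getelementptr %S, ptr %p, i64 %i, i32 0
/// the trailing zero index can be peeled off because %S and the GEP's result
/// element type have the same allocation size, so operand 1 (%i) is the
/// induction operand to check.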
static unsigned getGEPInductionOperand(const GetElementPtrInst *Gep) {
  const DataLayout &DL = Gep->getModule()->getDataLayout();
  unsigned LastOperand = Gep->getNumOperands() - 1;
  TypeSize GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());

  // Walk backwards and try to peel off zeros.
  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
    // Find the type we're currently indexing into.
    gep_type_iterator GEPTI = gep_type_begin(Gep);
    std::advance(GEPTI, LastOperand - 2);

    // If it's a type with the same allocation size as the result of the GEP we
    // can peel off the zero index.
    TypeSize ElemSize = GEPTI.isStruct()
                            ? DL.getTypeAllocSize(GEPTI.getIndexedType())
                            : GEPTI.getSequentialElementStride(DL);
    if (ElemSize != GEPAllocSize)
      break;
    --LastOperand;
  }

  return LastOperand;
}

/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
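/// For example (illustrative), for "getelementptr i32, ptr %a, i64 %i" with a
/// loop-invariant base %a and loop-variant index %i, this returns %i; if any
/// index other than the induction operand were loop-variant, the original
/// pointer would be returned instead.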
static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return Ptr;

  unsigned InductionOperand = getGEPInductionOperand(GEP);

  // Check that all of the gep indices are uniform except for our induction
  // operand.
  for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
    if (i != InductionOperand &&
        !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
      return Ptr;
  return GEP->getOperand(InductionOperand);
}

/// If a value has only one user that is a CastInst, return it.
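/// For example (illustrative), if "%e = sext i32 %stride to i64" is the only
/// cast of %stride to the requested type, %e is returned; if there were two
/// such casts, nullptr would be returned.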
static Value *getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
  Value *UniqueCast = nullptr;
  for (User *U : Ptr->users()) {
    CastInst *CI = dyn_cast<CastInst>(U);
    if (CI && CI->getType() == Ty) {
      if (!UniqueCast)
        UniqueCast = CI;
      else
        return nullptr;
    }
  }
  return UniqueCast;
}

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
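/// For example (illustrative), in
///   for (i = 0; i < n; ++i)
///     a[i * stride] = x;
/// the access to a[] has the loop-invariant symbolic stride "stride", which a
/// caller may then speculate to be one under a runtime check.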
static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
                                        Loop *Lp) {
  auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy || PtrTy->isAggregateType())
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually the index
  // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
  // analyzing the pointer, otherwise we are analyzing the index.
  Value *OrigPtr = Ptr;

  // The size of the pointer access.
  int64_t PtrAccessSize = 1;

  Ptr = stripGetElementPtr(Ptr, SE, Lp);
  const SCEV *V = SE->getSCEV(Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(V))
      V = C->getOperand();

  const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
  if (!S)
    return nullptr;

  // If the pointer is invariant then there is no stride and it makes no
  // sense to add it here.
  if (Lp != S->getLoop())
    return nullptr;

  V = S->getStepRecurrence(*SE);
  if (!V)
    return nullptr;

  // Strip off the size of access multiplication if we are still analyzing the
  // pointer.
  if (OrigPtr == Ptr) {
    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
      if (M->getOperand(0)->getSCEVType() != scConstant)
        return nullptr;

      const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();

      // Huge step value - give up.
      if (APStepVal.getBitWidth() > 64)
        return nullptr;

      int64_t StepVal = APStepVal.getSExtValue();
      if (PtrAccessSize != StepVal)
        return nullptr;
      V = M->getOperand(1);
    }
  }

  // Note that the restrictions after this loop-invariant check are only
  // profitability restrictions.
  if (!SE->isLoopInvariant(V, Lp))
    return nullptr;

  // Look for the loop invariant symbolic value.
  const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
  if (!U) {
    const auto *C = dyn_cast<SCEVIntegralCastExpr>(V);
    if (!C)
      return nullptr;
    U = dyn_cast<SCEVUnknown>(C->getOperand());
    if (!U)
      return nullptr;

    // Match legacy behavior - this is not needed for correctness.
    if (!getUniqueCastUse(U->getValue(), Lp, V->getType()))
      return nullptr;
  }

  return V;
}

void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(MemAccess);
  if (!Ptr)
    return;

  // Note: getStrideFromPointer is a *profitability* heuristic. We
  // could broaden the scope of values returned here - to anything
  // which happens to be loop invariant and contributes to the
  // computation of an interesting IV - but we chose not to as we
  // don't have a cost model here, and broadening the scope exposes
  // far too many unprofitable cases.
  const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  if (!StrideExpr)
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");

  if (!SpeculateUnitStride) {
    LLVM_DEBUG(dbgs() << " Chose not to due to -laa-speculate-unit-stride\n");
    return;
  }

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense for the
  // users of this analysis (such as the vectorizer) to trigger it, based on
  // their specific cost considerations; for example, in cases where stride
  // versioning does not help resolve memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

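  // For example (illustrative): with a backedge-taken count of 3 (trip count
  // 4) and a stride known to be at least 4, the Stride == 1 assumption would
  // leave a loop that executes at most once, so versioning is not worthwhile.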
  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();

  // Match the types so we can compare the stride and the BETakenCount.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
  const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = BETakenCount;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
  const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
  // Since TripCount == BackEdgeTakenCount + 1, checking:
  // "Stride >= TripCount" is equivalent to checking:
  // Stride - BETakenCount > 0
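  // E.g. (illustrative): Stride == 5 and BETakenCount == 3 give TripCount == 4
  // and StrideMinusBETaken == 2 > 0, i.e. Stride >= TripCount.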
  if (SE->isKnownPositive(StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  // Strip back off the integer cast, and check that our result is a
  // SCEVUnknown as we expect.
  const SCEV *StrideBase = StrideExpr;
  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
    StrideBase = C->getOperand();
  SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
}

LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI)
    : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
      PtrRtChecking(nullptr),
      DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
  if (canAnalyzeLoop()) {
    analyzeLoop(AA, LI, TLI, DT);
  }
}

void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    OS.indent(Depth) << "Memory dependences are safe";
    const MemoryDepChecker &DC = getDepChecker();
    if (!DC.isSafeForAnyVectorWidth())
      OS << " with a maximum safe vector width of "
         << DC.getMaxSafeVectorWidthInBits() << " bits";
    if (PtrRtChecking->Need)
      OS << " with run-time checks";
    OS << "\n";
  }

  if (HasConvergentOp)
    OS.indent(Depth) << "Has convergent operation in loop\n";

  if (Report)
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

  if (auto *Dependences = DepChecker->getDependences()) {
    OS.indent(Depth) << "Dependences:\n";
    for (const auto &Dep : *Dependences) {
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
      OS << "\n";
    }
  } else
    OS.indent(Depth) << "Too many dependences, not recorded\n";

  // List the pairs of accesses that need run-time checks to prove
  // independence.
  PtrRtChecking->print(OS, Depth);
  OS << "\n";

  OS.indent(Depth) << "Non vectorizable stores to invariant address were "
                   << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
                   << "found in loop.\n";

  OS.indent(Depth) << "SCEV assumptions:\n";
  PSE->getPredicate().print(OS, Depth);

  OS << "\n";

  OS.indent(Depth) << "Expressions re-written:\n";
  PSE->print(OS, Depth);
}

const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
  auto I = LoopAccessInfoMap.insert({&L, nullptr});

  if (I.second)
    I.first->second =
        std::make_unique<LoopAccessInfo>(&L, &SE, TLI, &AA, &DT, &LI);

  return *I.first->second;
}

bool LoopAccessInfoManager::invalidate(
    Function &F, const PreservedAnalyses &PA,
    FunctionAnalysisManager::Invalidator &Inv) {
  // Check whether our analysis is preserved.
  auto PAC = PA.getChecker<LoopAccessAnalysis>();
  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
    // If not, give up now.
    return true;

  // Check whether the analyses we depend on became invalid for any reason.
  // Skip checking TargetLibraryAnalysis as it is immutable and can't become
  // invalid.
  return Inv.invalidate<AAManager>(F, PA) ||
         Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
         Inv.invalidate<LoopAnalysis>(F, PA) ||
         Inv.invalidate<DominatorTreeAnalysis>(F, PA);
}

LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
                                              FunctionAnalysisManager &FAM) {
  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
  auto &AA = FAM.getResult<AAManager>(F);
  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  auto &LI = FAM.getResult<LoopAnalysis>(F);
  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  return LoopAccessInfoManager(SE, AA, DT, LI, &TLI);
}

AnalysisKey LoopAccessAnalysis::Key;