LoadStoreVectorizer.cpp source code [llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp]

1	//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass merges loads/stores to/from sequential memory addresses into vector
10	// loads/stores. Although there's nothing GPU-specific in here, this pass is
11	// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
12	//
13	// (For simplicity below we talk about loads only, but everything also applies
14	// to stores.)
15	//
16	// This pass is intended to be run late in the pipeline, after other
17	// vectorization opportunities have been exploited. So the assumption here is
18	// that immediately following our new vector load we'll need to extract out the
19	// individual elements of the load, so we can operate on them individually.
20	//
21	// On CPUs this transformation is usually not beneficial, because extracting the
22	// elements of a vector register is expensive on most architectures. It's
23	// usually better just to load each element individually into its own scalar
24	// register.
25	//
26	// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
27	// "vector load" loads directly into a series of scalar registers. In effect,
28	// extracting the elements of the vector is free. It's therefore always
29	// beneficial to vectorize a sequence of loads on these architectures.
30	//
31	// Vectorizing (perhaps a better name might be "coalescing") loads can have
32	// large performance impacts on GPU kernels, and opportunities for vectorizing
33	// are common in GPU code. This pass tries very hard to find such
34	// opportunities; its runtime is quadratic in the number of loads in a BB.
35	//
36	// Some CPU architectures, such as ARM, have instructions that load into
37	// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
38	// could use this pass (with some modifications), but currently it implements
39	// its own pass to do something similar to what we do here.
40	//
41	// Overview of the algorithm and terminology in this pass:
42	//
43	// - Break up each basic block into pseudo-BBs, composed of instructions which
44	// are guaranteed to transfer control to their successors.
45	// - Within a single pseudo-BB, find all loads, and group them into
46	// "equivalence classes" according to getUnderlyingObject() and loaded
47	// element size. Do the same for stores.
48	// - For each equivalence class, greedily build "chains". Each chain has a
49	// leader instruction, and every other member of the chain has a known
50	// constant offset from the first instr in the chain.
51	// - Break up chains so that they contain only contiguous accesses of legal
52	// size with no intervening may-alias instrs.
53	// - Convert each chain to vector instructions.
54	//
55	// The O(n^2) behavior of this pass comes from initially building the chains.
56	// In the worst case we have to compare each new instruction to all of those
57	// that came before. To limit this, we only calculate the offset to the leaders
58	// of the N most recently-used chains.
59
60	#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
61	#include "llvm/ADT/APInt.h"
62	#include "llvm/ADT/ArrayRef.h"
63	#include "llvm/ADT/DenseMap.h"
64	#include "llvm/ADT/MapVector.h"
65	#include "llvm/ADT/PostOrderIterator.h"
66	#include "llvm/ADT/STLExtras.h"
67	#include "llvm/ADT/Sequence.h"
68	#include "llvm/ADT/SmallPtrSet.h"
69	#include "llvm/ADT/SmallVector.h"
70	#include "llvm/ADT/Statistic.h"
71	#include "llvm/ADT/iterator_range.h"
72	#include "llvm/Analysis/AliasAnalysis.h"
73	#include "llvm/Analysis/AssumptionCache.h"
74	#include "llvm/Analysis/MemoryLocation.h"
75	#include "llvm/Analysis/ScalarEvolution.h"
76	#include "llvm/Analysis/TargetTransformInfo.h"
77	#include "llvm/Analysis/ValueTracking.h"
78	#include "llvm/Analysis/VectorUtils.h"
79	#include "llvm/IR/Attributes.h"
80	#include "llvm/IR/BasicBlock.h"
81	#include "llvm/IR/ConstantRange.h"
82	#include "llvm/IR/Constants.h"
83	#include "llvm/IR/DataLayout.h"
84	#include "llvm/IR/DerivedTypes.h"
85	#include "llvm/IR/Dominators.h"
86	#include "llvm/IR/Function.h"
87	#include "llvm/IR/GetElementPtrTypeIterator.h"
88	#include "llvm/IR/IRBuilder.h"
89	#include "llvm/IR/InstrTypes.h"
90	#include "llvm/IR/Instruction.h"
91	#include "llvm/IR/Instructions.h"
92	#include "llvm/IR/LLVMContext.h"
93	#include "llvm/IR/Module.h"
94	#include "llvm/IR/Type.h"
95	#include "llvm/IR/Value.h"
96	#include "llvm/InitializePasses.h"
97	#include "llvm/Pass.h"
98	#include "llvm/Support/Alignment.h"
99	#include "llvm/Support/Casting.h"
100	#include "llvm/Support/Debug.h"
101	#include "llvm/Support/KnownBits.h"
102	#include "llvm/Support/MathExtras.h"
103	#include "llvm/Support/ModRef.h"
104	#include "llvm/Support/raw_ostream.h"
105	#include "llvm/Transforms/Utils/Local.h"
106	#include <algorithm>
107	#include <cassert>
108	#include <cstdint>
109	#include <cstdlib>
110	#include <iterator>
111	#include <numeric>
112	#include <optional>
113	#include <tuple>
114	#include <type_traits>
115	#include <utility>
116	#include <vector>
117
118	using namespace llvm;
119
120	#define DEBUG_TYPE "load-store-vectorizer"
121
122	STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
123	STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
124
125	namespace {
126
127	// Equivalence class key, the initial tuple by which we group loads/stores.
128	// Loads/stores with different EqClassKeys are never merged.
129	//
130	// (We could in theory remove element-size from the this tuple. We'd just need
131	// to fix up the vector packing/unpacking code.)
132	using EqClassKey =
133	std::tuple<const Value * / result of getUnderlyingObject() /,
134	unsigned / AddrSpace /,
135	unsigned / Load/Store element size bits /,
136	char / IsLoad; char b/c bool can't be a DenseMap key /
137	>;
138	[[maybe_unused]] llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
139	const EqClassKey &K) {
140	const auto &[UnderlyingObject, AddrSpace, ElementSize, IsLoad] = K;
141	return OS << (IsLoad ? "load" : "store") << " of " << *UnderlyingObject
142	<< " of element size " << ElementSize << " bits in addrspace "
143	<< AddrSpace;
144	}
145
146	// A Chain is a set of instructions such that:
147	// - All instructions have the same equivalence class, so in particular all are
148	// loads, or all are stores.
149	// - We know the address accessed by the i'th chain elem relative to the
150	// chain's leader instruction, which is the first instr of the chain in BB
151	// order.
152	//
153	// Chains have two canonical orderings:
154	// - BB order, sorted by Instr->comesBefore.
155	// - Offset order, sorted by OffsetFromLeader.
156	// This pass switches back and forth between these orders.
157	struct ChainElem {
158	Instruction *Inst;
159	APInt OffsetFromLeader;
160	};
161	using Chain = SmallVector<ChainElem, `1`>;
162
163	void sortChainInBBOrder(Chain &C) {
164	sort(C, Comp: [](auto &A, auto &B) { return A.Inst->comesBefore(B.Inst); });
165	}
166
167	void sortChainInOffsetOrder(Chain &C) {
168	sort(C, Comp: [](const auto &A, const auto &B) {
169	if (A.OffsetFromLeader != B.OffsetFromLeader)
170	return A.OffsetFromLeader.slt(B.OffsetFromLeader);
171	return A.Inst->comesBefore(B.Inst); // stable tiebreaker
172	});
173	}
174
175	[[maybe_unused]] void dumpChain(ArrayRef<ChainElem> C) {
176	for (const auto &E : C) {
177	dbgs() << " " << *E.Inst << " (offset " << E.OffsetFromLeader << ")\n";
178	}
179	}
180
181	using EquivalenceClassMap =
182	MapVector<EqClassKey, SmallVector<Instruction *, `8`>>;
183
184	// FIXME: Assuming stack alignment of 4 is always good enough
185	constexpr unsigned StackAdjustedAlignment = `4`;
186
187	Instruction propagateMetadata(Instruction I, const Chain &C) {
188	SmallVector<Value *, `8`> Values;
189	for (const ChainElem &E : C)
190	Values.push_back(Elt: E.Inst);
191	return propagateMetadata(I, VL: Values);
192	}
193
194	bool isInvariantLoad(const Instruction *I) {
195	const LoadInst *LI = dyn_cast<LoadInst>(Val: I);
196	return LI != nullptr && LI->hasMetadata(KindID: LLVMContext::MD_invariant_load);
197	}
198
199	/// Reorders the instructions that I depends on (the instructions defining its
200	/// operands), to ensure they dominate I.
201	void reorder(Instruction *I) {
202	SmallPtrSet<Instruction *, `16`> InstructionsToMove;
203	SmallVector<Instruction *, `16`> Worklist;
204
205	Worklist.push_back(Elt: I);
206	while (!Worklist.empty()) {
207	Instruction *IW = Worklist.pop_back_val();
208	int NumOperands = IW->getNumOperands();
209	for (int i = `0`; i < NumOperands; i++) {
210	Instruction *IM = dyn_cast<Instruction>(Val: IW->getOperand(i));
211	if (!IM \|\| IM->getOpcode() == Instruction::PHI)
212	continue;
213
214	// If IM is in another BB, no need to move it, because this pass only
215	// vectorizes instructions within one BB.
216	if (IM->getParent() != I->getParent())
217	continue;
218
219	if (!IM->comesBefore(Other: I)) {
220	InstructionsToMove.insert(Ptr: IM);
221	Worklist.push_back(Elt: IM);
222	}
223	}
224	}
225
226	// All instructions to move should follow I. Start from I, not from begin().
227	for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;) {
228	Instruction IM = &(BBI ++);
229	if (!InstructionsToMove.count(Ptr: IM))
230	continue;
231	IM->moveBefore(MovePos: I);
232	}
233	}
234
235	class Vectorizer {
236	Function &F;
237	AliasAnalysis &AA;
238	AssumptionCache &AC;
239	DominatorTree &DT;
240	ScalarEvolution &SE;
241	TargetTransformInfo &TTI;
242	const DataLayout &DL;
243	IRBuilder<> Builder;
244
245	// We could erase instrs right after vectorizing them, but that can mess up
246	// our BB iterators, and also can make the equivalence class keys point to
247	// freed memory. This is fixable, but it's simpler just to wait until we're
248	// done with the BB and erase all at once.
249	SmallVector<Instruction *, `128`> ToErase;
250
251	public:
252	Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
253	DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
254	: F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
255	DL(F.getParent()->getDataLayout()), Builder (SE.getContext()) {}
256
257	bool run();
258
259	private:
260	static const unsigned MaxDepth = `3`;
261
262	/// Runs the vectorizer on a "pseudo basic block", which is a range of
263	/// instructions [Begin, End) within one BB all of which have
264	/// isGuaranteedToTransferExecutionToSuccessor(I) == true.
265	bool runOnPseudoBB(BasicBlock::iterator Begin, BasicBlock::iterator End);
266
267	/// Runs the vectorizer on one equivalence class, i.e. one set of loads/stores
268	/// in the same BB with the same value for getUnderlyingObject() etc.
269	bool runOnEquivalenceClass(const EqClassKey &EqClassKey,
270	ArrayRef<Instruction *> EqClass);
271
272	/// Runs the vectorizer on one chain, i.e. a subset of an equivalence class
273	/// where all instructions access a known, constant offset from the first
274	/// instruction.
275	bool runOnChain(Chain &C);
276
277	/// Splits the chain into subchains of instructions which read/write a
278	/// contiguous block of memory. Discards any length-1 subchains (because
279	/// there's nothing to vectorize in there).
280	std::vector<Chain> splitChainByContiguity(Chain &C);
281
282	/// Splits the chain into subchains where it's safe to hoist loads up to the
283	/// beginning of the sub-chain and it's safe to sink loads up to the end of
284	/// the sub-chain. Discards any length-1 subchains.
285	std::vector<Chain> splitChainByMayAliasInstrs(Chain &C);
286
287	/// Splits the chain into subchains that make legal, aligned accesses.
288	/// Discards any length-1 subchains.
289	std::vector<Chain> splitChainByAlignment(Chain &C);
290
291	/// Converts the instrs in the chain into a single vectorized load or store.
292	/// Adds the old scalar loads/stores to ToErase.
293	bool vectorizeChain(Chain &C);
294
295	/// Tries to compute the offset in bytes PtrB - PtrA.
296	std::optional<APInt> getConstantOffset(Value PtrA, Value PtrB,
297	Instruction *ContextInst,
298	unsigned Depth = `0`);
299	std::optional<APInt> getConstantOffsetComplexAddrs(Value PtrA, Value PtrB,
300	Instruction *ContextInst,
301	unsigned Depth);
302	std::optional<APInt> getConstantOffsetSelects(Value PtrA, Value PtrB,
303	Instruction *ContextInst,
304	unsigned Depth);
305
306	/// Gets the element type of the vector that the chain will load or store.
307	/// This is nontrivial because the chain may contain elements of different
308	/// types; e.g. it's legal to have a chain that contains both i32 and float.
309	Type getChainElemTy(const* Chain &C);
310
311	/// Determines whether ChainElem can be moved up (if IsLoad) or down (if
312	/// !IsLoad) to ChainBegin -- i.e. there are no intervening may-alias
313	/// instructions.
314	///
315	/// The map ChainElemOffsets must contain all of the elements in
316	/// [ChainBegin, ChainElem] and their offsets from some arbitrary base
317	/// address. It's ok if it contains additional entries.
318	template <bool IsLoadChain>
319	bool isSafeToMove(
320	Instruction ChainElem, Instruction ChainBegin,
321	const DenseMap<Instruction , APInt /OffsetFromLeader/*> &ChainOffsets);
322
323	/// Collects loads and stores grouped by "equivalence class", where:
324	/// - all elements in an eq class are a load or all are a store,
325	/// - they all load/store the same element size (it's OK to have e.g. i8 and
326	/// <4 x i8> in the same class, but not i32 and <4 x i8>), and
327	/// - they all have the same value for getUnderlyingObject().
328	EquivalenceClassMap collectEquivalenceClasses(BasicBlock::iterator Begin,
329	BasicBlock::iterator End);
330
331	/// Partitions Instrs into "chains" where every instruction has a known
332	/// constant offset from the first instr in the chain.
333	///
334	/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
335	/// in the chain is the leader, and an instr touches distance 0 from itself.
336	std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
337	};
338
339	class LoadStoreVectorizerLegacyPass : public FunctionPass {
340	public:
341	static char ID;
342
343	LoadStoreVectorizerLegacyPass() : FunctionPass (ID) {
344	initializeLoadStoreVectorizerLegacyPassPass(
345	*PassRegistry::getPassRegistry());
346	}
347
348	bool runOnFunction(Function &F) override;
349
350	StringRef getPassName() const override {
351	return "GPU Load and Store Vectorizer";
352	}
353
354	void getAnalysisUsage(AnalysisUsage &AU) const override {
355	AU.addRequired<AAResultsWrapperPass>();
356	AU.addRequired<AssumptionCacheTracker>();
357	AU.addRequired<ScalarEvolutionWrapperPass>();
358	AU.addRequired<DominatorTreeWrapperPass>();
359	AU.addRequired<TargetTransformInfoWrapperPass>();
360	AU.setPreservesCFG();
361	}
362	};
363
364	} // end anonymous namespace
365
366	char LoadStoreVectorizerLegacyPass::ID = `0`;
367
368	INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
369	"Vectorize load and Store instructions", false, false)
370	INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
371	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker);
372	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
373	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
374	INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
375	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
376	INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
377	"Vectorize load and store instructions", false, false)
378
379	Pass *llvm::createLoadStoreVectorizerPass() {
380	return new LoadStoreVectorizerLegacyPass ();
381	}
382
383	bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
384	// Don't vectorize when the attribute NoImplicitFloat is used.
385	if (skipFunction(F) \|\| F.hasFnAttribute(Attribute::NoImplicitFloat))
386	return false;
387
388	AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
389	DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
390	ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
391	TargetTransformInfo &TTI =
392	getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
393
394	AssumptionCache &AC =
395	getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
396
397	return Vectorizer (F, AA, AC, DT, SE, TTI).run();
398	}
399
400	PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
401	FunctionAnalysisManager &AM) {
402	// Don't vectorize when the attribute NoImplicitFloat is used.
403	if (F.hasFnAttribute(Attribute::NoImplicitFloat))
404	return PreservedAnalyses::all();
405
406	AliasAnalysis &AA = AM.getResult<AAManager>(IR&: F);
407	DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
408	ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
409	TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
410	AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
411
412	bool Changed = Vectorizer (F, AA, AC, DT, SE, TTI).run();
413	PreservedAnalyses PA;
414	PA.preserveSet<CFGAnalyses>();
415	return Changed ? PA : PreservedAnalyses::all();
416	}
417
418	bool Vectorizer::run() {
419	bool Changed = false;
420	// Break up the BB if there are any instrs which aren't guaranteed to transfer
421	// execution to their successor.
422	//
423	// Consider, for example:
424	//
425	// def assert_arr_len(int n) { if (n < 2) exit(); }
426	//
427	// load arr[0]
428	// call assert_array_len(arr.length)
429	// load arr[1]
430	//
431	// Even though assert_arr_len does not read or write any memory, we can't
432	// speculate the second load before the call. More info at
433	// https://github.com/llvm/llvm-project/issues/52950.
434	for (BasicBlock *BB : post_order(G: &F)) {
435	// BB must at least have a terminator.
436	assert(!BB->empty());
437
438	SmallVector<BasicBlock::iterator, `8`> Barriers;
439	Barriers.push_back(Elt: BB->begin());
440	for (Instruction &I : *BB)
441	if (!isGuaranteedToTransferExecutionToSuccessor(I: &I))
442	Barriers.push_back(Elt: I.getIterator());
443	Barriers.push_back(Elt: BB->end());
444
445	for (auto It = Barriers.begin(), End = std::prev(x: Barriers.end()); It != End;
446	++It)
447	Changed \|= runOnPseudoBB(Begin: It, End: std::next(x: It));
448
449	for (Instruction *I : ToErase) {
450	auto *PtrOperand = getLoadStorePointerOperand(V: I);
451	if (I->use_empty())
452	I->eraseFromParent();
453	RecursivelyDeleteTriviallyDeadInstructions(V: PtrOperand);
454	}
455	ToErase.clear();
456	}
457
458	return Changed;
459	}
460
461	bool Vectorizer::runOnPseudoBB(BasicBlock::iterator Begin,
462	BasicBlock::iterator End) {
463	LLVM_DEBUG({
464	dbgs() << "LSV: Running on pseudo-BB [" << *Begin << " ... ";
465	if (End != Begin ->getParent()->end())
466	dbgs() << *End;
467	else
468	dbgs() << "<BB end>";
469	dbgs() << ")\n";
470	});
471
472	bool Changed = false;
473	for (const auto &[EqClassKey, EqClass] :
474	collectEquivalenceClasses(Begin, End))
475	Changed \|= runOnEquivalenceClass(EqClassKey, EqClass);
476
477	return Changed;
478	}
479
480	bool Vectorizer::runOnEquivalenceClass(const EqClassKey &EqClassKey,
481	ArrayRef<Instruction *> EqClass) {
482	bool Changed = false;
483
484	LLVM_DEBUG({
485	dbgs() << "LSV: Running on equivalence class of size " << EqClass.size()
486	<< " keyed on " << EqClassKey << ":\n";
487	for (Instruction *I : EqClass)
488	dbgs() << " " << *I << "\n";
489	});
490
491	std::vector<Chain> Chains = gatherChains(Instrs: EqClass);
492	LLVM_DEBUG(dbgs() << "LSV: Got " << Chains.size()
493	<< " nontrivial chains.\n";);
494	for (Chain &C : Chains)
495	Changed \|= runOnChain(C);
496	return Changed;
497	}
498
499	bool Vectorizer::runOnChain(Chain &C) {
500	LLVM_DEBUG({
501	dbgs() << "LSV: Running on chain with " << C.size() << " instructions:\n";
502	dumpChain(C);
503	});
504
505	// Split up the chain into increasingly smaller chains, until we can finally
506	// vectorize the chains.
507	//
508	// (Don't be scared by the depth of the loop nest here. These operations are
509	// all at worst O(n lg n) in the number of instructions, and splitting chains
510	// doesn't change the number of instrs. So the whole loop nest is O(n lg n).)
511	bool Changed = false;
512	for (auto &C : splitChainByMayAliasInstrs(C))
513	for (auto &C : splitChainByContiguity(C))
514	for (auto &C : splitChainByAlignment(C))
515	Changed \|= vectorizeChain(C);
516	return Changed;
517	}
518
519	std::vector<Chain> Vectorizer::splitChainByMayAliasInstrs(Chain &C) {
520	if (C.empty())
521	return {};
522
523	sortChainInBBOrder(C);
524
525	LLVM_DEBUG({
526	dbgs() << "LSV: splitChainByMayAliasInstrs considering chain:\n";
527	dumpChain(C);
528	});
529
530	// We know that elements in the chain with nonverlapping offsets can't
531	// alias, but AA may not be smart enough to figure this out. Use a
532	// hashtable.
533	DenseMap<Instruction , APInt /OffsetFromLeader/*> ChainOffsets;
534	for (const auto &E : C)
535	ChainOffsets.insert(KV: {&*E.Inst, E.OffsetFromLeader});
536
537	// Loads get hoisted up to the first load in the chain. Stores get sunk
538	// down to the last store in the chain. Our algorithm for loads is:
539	//
540	// - Take the first element of the chain. This is the start of a new chain.
541	// - Take the next element of `Chain` and check for may-alias instructions
542	// up to the start of NewChain. If no may-alias instrs, add it to
543	// NewChain. Otherwise, start a new NewChain.
544	//
545	// For stores it's the same except in the reverse direction.
546	//
547	// We expect IsLoad to be an std::bool_constant.
548	auto Impl = [&](auto IsLoad) {
549	// MSVC is unhappy if IsLoad is a capture, so pass it as an arg.
550	auto [ChainBegin, ChainEnd] = [&](auto IsLoad) {
551	if constexpr (IsLoad())
552	return std::make_pair(x: C.begin(), y: C.end());
553	else
554	return std::make_pair(x: C.rbegin(), y: C.rend());
555	}(IsLoad);
556	assert(ChainBegin != ChainEnd);
557
558	std::vector<Chain> Chains;
559	SmallVector<ChainElem, `1`> NewChain;
560	NewChain.push_back(*ChainBegin);
561	for (auto ChainIt = std::next(ChainBegin); ChainIt != ChainEnd; ++ChainIt) {
562	if (isSafeToMove<IsLoad>(ChainIt->Inst, NewChain.front().Inst,
563	ChainOffsets)) {
564	LLVM_DEBUG(dbgs() << "LSV: No intervening may-alias instrs; can merge "
565	<< ChainIt->Inst << " into " << ChainBegin->Inst
566	<< "\n");
567	NewChain.push_back(*ChainIt);
568	} else {
569	LLVM_DEBUG(
570	dbgs() << "LSV: Found intervening may-alias instrs; cannot merge "
571	<< ChainIt->Inst << " into " << ChainBegin->Inst << "\n");
572	if (NewChain.size() > `1`) {
573	LLVM_DEBUG({
574	dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
575	dumpChain(NewChain);
576	});
577	Chains.push_back(x: std::move(NewChain));
578	}
579
580	// Start a new chain.
581	NewChain = SmallVector<ChainElem, `1`>({*ChainIt});
582	}
583	}
584	if (NewChain.size() > `1`) {
585	LLVM_DEBUG({
586	dbgs() << "LSV: got nontrivial chain without aliasing instrs:\n";
587	dumpChain(NewChain);
588	});
589	Chains.push_back(x: std::move(NewChain));
590	}
591	return Chains;
592	};
593
594	if (isa<LoadInst>(Val: C [`0`].Inst))
595	return Impl (/IsLoad=/std::bool_constant<true>());
596
597	assert(isa<StoreInst>(C[`0`].Inst));
598	return Impl (/IsLoad=/std::bool_constant<false>());
599	}
600
601	std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
602	if (C.empty())
603	return {};
604
605	sortChainInOffsetOrder(C);
606
607	LLVM_DEBUG({
608	dbgs() << "LSV: splitChainByContiguity considering chain:\n";
609	dumpChain(C);
610	});
611
612	std::vector<Chain> Ret;
613	Ret.push_back(x: {C.front()});
614
615	for (auto It = std::next(x: C.begin()), End = C.end(); It != End; ++It) {
616	// `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
617	auto &CurChain = Ret.back();
618	const ChainElem &Prev = CurChain.back();
619	unsigned SzBits = DL.getTypeSizeInBits(Ty: getLoadStoreType(I: &*Prev.Inst));
620	assert(SzBits % `8` == `0` && "Non-byte sizes should have been filtered out by "
621	"collectEquivalenceClass");
622	APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / `8`;
623
624	// Add this instruction to the end of the current chain, or start a new one.
625	bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
626	LLVM_DEBUG(dbgs() << "LSV: Instructions are "
627	<< (AreContiguous ? "" : "not ") << "contiguous: "
628	<< *Prev.Inst << " (ends at offset " << PrevReadEnd
629	<< ") -> " << *It->Inst << " (starts at offset "
630	<< It->OffsetFromLeader << ")\n");
631	if (AreContiguous)
632	CurChain.push_back(Elt: *It);
633	else
634	Ret.push_back(x: {*It});
635	}
636
637	// Filter out length-1 chains, these are uninteresting.
638	llvm::erase_if(C&: Ret, P: [](const auto &Chain) { return Chain.size() <= `1`; });
639	return Ret;
640	}
641
642	Type Vectorizer::getChainElemTy(const* Chain &C) {
643	assert(!C.empty());
644	// The rules are:
645	// - If there are any pointer types in the chain, use an integer type.
646	// - Prefer an integer type if it appears in the chain.
647	// - Otherwise, use the first type in the chain.
648	//
649	// The rule about pointer types is a simplification when we merge e.g. a load
650	// of a ptr and a double. There's no direct conversion from a ptr to a
651	// double; it requires a ptrtoint followed by a bitcast.
652	//
653	// It's unclear to me if the other rules have any practical effect, but we do
654	// it to match this pass's previous behavior.
655	if (any_of(Range: C, P: [](const ChainElem &E) {
656	return getLoadStoreType(I: E.Inst)->getScalarType()->isPointerTy();
657	})) {
658	return Type::getIntNTy(
659	C&: F.getContext(),
660	N: DL.getTypeSizeInBits(Ty: getLoadStoreType(I: C [`0`].Inst)->getScalarType()));
661	}
662
663	for (const ChainElem &E : C)
664	if (Type *T = getLoadStoreType(I: E.Inst)->getScalarType(); T->isIntegerTy())
665	return T;
666	return getLoadStoreType(I: C [`0`].Inst)->getScalarType();
667	}
668
669	std::vector<Chain> Vectorizer::splitChainByAlignment(Chain &C) {
670	// We use a simple greedy algorithm.
671	// - Given a chain of length N, find all prefixes that
672	// (a) are not longer than the max register length, and
673	// (b) are a power of 2.
674	// - Starting from the longest prefix, try to create a vector of that length.
675	// - If one of them works, great. Repeat the algorithm on any remaining
676	// elements in the chain.
677	// - If none of them work, discard the first element and repeat on a chain
678	// of length N-1.
679	if (C.empty())
680	return {};
681
682	sortChainInOffsetOrder(C);
683
684	LLVM_DEBUG({
685	dbgs() << "LSV: splitChainByAlignment considering chain:\n";
686	dumpChain(C);
687	});
688
689	bool IsLoadChain = isa<LoadInst>(Val: C [`0`].Inst);
690	auto getVectorFactor = [&](unsigned VF, unsigned LoadStoreSize,
691	unsigned ChainSizeBytes, VectorType *VecTy) {
692	return IsLoadChain ? TTI.getLoadVectorFactor(VF, LoadSize: LoadStoreSize,
693	ChainSizeInBytes: ChainSizeBytes, VecTy)
694	: TTI.getStoreVectorFactor(VF, StoreSize: LoadStoreSize,
695	ChainSizeInBytes: ChainSizeBytes, VecTy);
696	};
697
698	#ifndef NDEBUG
699	for (const auto &E : C) {
700	Type *Ty = getLoadStoreType(I: E.Inst)->getScalarType();
701	assert(isPowerOf2_32(DL.getTypeSizeInBits(Ty)) &&
702	"Should have filtered out non-power-of-two elements in "
703	"collectEquivalenceClasses.");
704	}
705	#endif
706
707	unsigned AS = getLoadStoreAddressSpace(I: C [`0`].Inst);
708	unsigned VecRegBytes = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS) / `8`;
709
710	std::vector<Chain> Ret;
711	for (unsigned CBegin = `0`; CBegin < C.size(); ++CBegin) {
712	// Find candidate chains of size not greater than the largest vector reg.
713	// These chains are over the closed interval [CBegin, CEnd].
714	SmallVector<std::pair<unsigned /CEnd/, unsigned /SizeBytes/>, `8`>
715	CandidateChains;
716	for (unsigned CEnd = CBegin + `1`, Size = C.size(); CEnd < Size; ++CEnd) {
717	APInt Sz = C [CEnd].OffsetFromLeader +
718	DL.getTypeStoreSize(Ty: getLoadStoreType(I: C [CEnd].Inst)) -
719	C [CBegin].OffsetFromLeader;
720	if (Sz.sgt(RHS: VecRegBytes))
721	break;
722	CandidateChains.push_back(
723	Elt: {CEnd, static_cast<unsigned>(Sz.getLimitedValue())});
724	}
725
726	// Consider the longest chain first.
727	for (auto It = CandidateChains.rbegin(), End = CandidateChains.rend();
728	It != End; ++It) {
729	auto [CEnd, SizeBytes] = *It;
730	LLVM_DEBUG(
731	dbgs() << "LSV: splitChainByAlignment considering candidate chain ["
732	<< C[CBegin].Inst << " ... " << C[CEnd].Inst << "]\n");
733
734	Type *VecElemTy = getChainElemTy(C);
735	// Note, VecElemTy is a power of 2, but might be less than one byte. For
736	// example, we can vectorize 2 x <2 x i4> to <4 x i4>, and in this case
737	// VecElemTy would be i4.
738	unsigned VecElemBits = DL.getTypeSizeInBits(Ty: VecElemTy);
739
740	// SizeBytes and VecElemBits are powers of 2, so they divide evenly.
741	assert((`8` * SizeBytes) % VecElemBits == `0`);
742	unsigned NumVecElems = `8` * SizeBytes / VecElemBits;
743	FixedVectorType *VecTy = FixedVectorType::get(ElementType: VecElemTy, NumElts: NumVecElems);
744	unsigned VF = `8` * VecRegBytes / VecElemBits;
745
746	// Check that TTI is happy with this vectorization factor.
747	unsigned TargetVF = getVectorFactor (VF, VecElemBits,
748	VecElemBits * NumVecElems / `8`, VecTy);
749	if (TargetVF != VF && TargetVF < NumVecElems) {
750	LLVM_DEBUG(
751	dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
752	"because TargetVF="
753	<< TargetVF << " != VF=" << VF
754	<< " and TargetVF < NumVecElems=" << NumVecElems << "\n");
755	continue;
756	}
757
758	// Is a load/store with this alignment allowed by TTI and at least as fast
759	// as an unvectorized load/store?
760	//
761	// TTI and F are passed as explicit captures to WAR an MSVC misparse (??).
762	auto IsAllowedAndFast = [&, SizeBytes = SizeBytes, &TTI = TTI,
763	&F = F](Align Alignment) {
764	if (Alignment.value() % SizeBytes == `0`)
765	return true;
766	unsigned VectorizedSpeed = `0`;
767	bool AllowsMisaligned = TTI.allowsMisalignedMemoryAccesses(
768	Context&: F.getContext(), BitWidth: SizeBytes * `8`, AddressSpace: AS, Alignment, Fast: &VectorizedSpeed);
769	if (!AllowsMisaligned) {
770	LLVM_DEBUG(dbgs()
771	<< "LSV: Access of " << SizeBytes << "B in addrspace "
772	<< AS << " with alignment " << Alignment.value()
773	<< " is misaligned, and therefore can't be vectorized.\n");
774	return false;
775	}
776
777	unsigned ElementwiseSpeed = `0`;
778	(TTI).allowsMisalignedMemoryAccesses(Context&: (F).getContext(), BitWidth: VecElemBits, AddressSpace: AS,
779	Alignment, Fast: &ElementwiseSpeed);
780	if (VectorizedSpeed < ElementwiseSpeed) {
781	LLVM_DEBUG(dbgs()
782	<< "LSV: Access of " << SizeBytes << "B in addrspace "
783	<< AS << " with alignment " << Alignment.value()
784	<< " has relative speed " << VectorizedSpeed
785	<< ", which is lower than the elementwise speed of "
786	<< ElementwiseSpeed
787	<< ". Therefore this access won't be vectorized.\n");
788	return false;
789	}
790	return true;
791	};
792
793	// If we're loading/storing from an alloca, align it if possible.
794	//
795	// FIXME: We eagerly upgrade the alignment, regardless of whether TTI
796	// tells us this is beneficial. This feels a bit odd, but it matches
797	// existing tests. This isn't so* bad, because at most we align to 4*
798	// bytes (current value of StackAdjustedAlignment).
799	//
800	// FIXME: We will upgrade the alignment of the alloca even if it turns out
801	// we can't vectorize for some other reason.
802	Value *PtrOperand = getLoadStorePointerOperand(V: C [CBegin].Inst);
803	bool IsAllocaAccess = AS == DL.getAllocaAddrSpace() &&
804	isa<AllocaInst>(Val: PtrOperand->stripPointerCasts());
805	Align Alignment = getLoadStoreAlignment(I: C [CBegin].Inst);
806	Align PrefAlign = Align (StackAdjustedAlignment);
807	if (IsAllocaAccess && Alignment.value() % SizeBytes != `0` &&
808	IsAllowedAndFast (PrefAlign)) {
809	Align NewAlign = getOrEnforceKnownAlignment(
810	V: PtrOperand, PrefAlign, DL, CxtI: C [CBegin].Inst, AC: nullptr, DT: &DT);
811	if (NewAlign >= Alignment) {
812	LLVM_DEBUG(dbgs()
813	<< "LSV: splitByChain upgrading alloca alignment from "
814	<< Alignment.value() << " to " << NewAlign.value()
815	<< "\n");
816	Alignment = NewAlign;
817	}
818	}
819
820	if (!IsAllowedAndFast (Alignment)) {
821	LLVM_DEBUG(
822	dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
823	"because its alignment is not AllowedAndFast: "
824	<< Alignment.value() << "\n");
825	continue;
826	}
827
828	if ((IsLoadChain &&
829	!TTI.isLegalToVectorizeLoadChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS)) \|\|
830	(!IsLoadChain &&
831	!TTI.isLegalToVectorizeStoreChain(ChainSizeInBytes: SizeBytes, Alignment, AddrSpace: AS))) {
832	LLVM_DEBUG(
833	dbgs() << "LSV: splitChainByAlignment discarding candidate chain "
834	"because !isLegalToVectorizeLoad/StoreChain.");
835	continue;
836	}
837
838	// Hooray, we can vectorize this chain!
839	Chain &NewChain = Ret.emplace_back();
840	for (unsigned I = CBegin; I <= CEnd; ++I)
841	NewChain.push_back(Elt: C [I]);
842	CBegin = CEnd; // Skip over the instructions we've added to the chain.
843	break;
844	}
845	}
846	return Ret;
847	}
848
849	bool Vectorizer::vectorizeChain(Chain &C) {
850	if (C.size() < `2`)
851	return false;
852
853	sortChainInOffsetOrder(C);
854
855	LLVM_DEBUG({
856	dbgs() << "LSV: Vectorizing chain of " << C.size() << " instructions:\n";
857	dumpChain(C);
858	});
859
860	Type *VecElemTy = getChainElemTy(C);
861	bool IsLoadChain = isa<LoadInst>(Val: C [`0`].Inst);
862	unsigned AS = getLoadStoreAddressSpace(I: C [`0`].Inst);
863	unsigned ChainBytes = std::accumulate(
864	first: C.begin(), last: C.end(), init: `0u`, binary_op: [&](unsigned Bytes, const ChainElem &E) {
865	return Bytes + DL.getTypeStoreSize(Ty: getLoadStoreType(I: E.Inst));
866	});
867	assert(ChainBytes % DL.getTypeStoreSize(VecElemTy) == `0`);
868	// VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
869	// than 1 byte (e.g. VecTy == <32 x i1>).
870	Type *VecTy = FixedVectorType::get(
871	ElementType: VecElemTy, NumElts: `8` * ChainBytes / DL.getTypeSizeInBits(Ty: VecElemTy));
872
873	Align Alignment = getLoadStoreAlignment(I: C [`0`].Inst);
874	// If this is a load/store of an alloca, we might have upgraded the alloca's
875	// alignment earlier. Get the new alignment.
876	if (AS == DL.getAllocaAddrSpace()) {
877	Alignment = std::max(
878	a: Alignment,
879	b: getOrEnforceKnownAlignment(V: getLoadStorePointerOperand(V: C [`0`].Inst),
880	PrefAlign: MaybeAlign (), DL, CxtI: C [`0`].Inst, AC: nullptr, DT: &DT));
881	}
882
883	// All elements of the chain must have the same scalar-type size.
884	#ifndef NDEBUG
885	for (const ChainElem &E : C)
886	assert(DL.getTypeStoreSize(getLoadStoreType(E.Inst)->getScalarType()) ==
887	DL.getTypeStoreSize(VecElemTy));
888	#endif
889
890	Instruction *VecInst;
891	if (IsLoadChain) {
892	// Loads get hoisted to the location of the first load in the chain. We may
893	// also need to hoist the (transitive) operands of the loads.
894	Builder.SetInsertPoint(
895	llvm::min_element(Range&: C, C: [](const auto &A, const auto &B) {
896	return A.Inst->comesBefore(B.Inst);
897	})->Inst);
898
899	// Chain is in offset order, so C[0] is the instr with the lowest offset,
900	// i.e. the root of the vector.
901	VecInst = Builder.CreateAlignedLoad(Ty: VecTy,
902	Ptr: getLoadStorePointerOperand(V: C [`0`].Inst),
903	Align: Alignment);
904
905	unsigned VecIdx = `0`;
906	for (const ChainElem &E : C) {
907	Instruction *I = E.Inst;
908	Value *V;
909	Type *T = getLoadStoreType(I);
910	if (auto *VT = dyn_cast<FixedVectorType>(Val: T)) {
911	auto Mask = llvm::to_vector<`8`>(
912	Range: llvm::seq<int>(Begin: VecIdx, End: VecIdx + VT->getNumElements()));
913	V = Builder.CreateShuffleVector(V: VecInst, Mask, Name: I->getName());
914	VecIdx += VT->getNumElements();
915	} else {
916	V = Builder.CreateExtractElement(Vec: VecInst, Idx: Builder.getInt32(C: VecIdx),
917	Name: I->getName());
918	++VecIdx;
919	}
920	if (V->getType() != I->getType())
921	V = Builder.CreateBitOrPointerCast(V, DestTy: I->getType());
922	I->replaceAllUsesWith(V);
923	}
924
925	// Finally, we need to reorder the instrs in the BB so that the (transitive)
926	// operands of VecInst appear before it. To see why, suppose we have
927	// vectorized the following code:
928	//
929	// ptr1 = gep a, 1
930	// load1 = load i32 ptr1
931	// ptr0 = gep a, 0
932	// load0 = load i32 ptr0
933	//
934	// We will put the vectorized load at the location of the earliest load in
935	// the BB, i.e. load1. We get:
936	//
937	// ptr1 = gep a, 1
938	// loadv = load <2 x i32> ptr0
939	// load0 = extractelement loadv, 0
940	// load1 = extractelement loadv, 1
941	// ptr0 = gep a, 0
942	//
943	// Notice that loadv uses ptr0, which is defined after* it!*
944	reorder(I: VecInst);
945	} else {
946	// Stores get sunk to the location of the last store in the chain.
947	Builder.SetInsertPoint(llvm::max_element(Range&: C, C: [](auto &A, auto &B) {
948	return A.Inst->comesBefore(B.Inst);
949	})->Inst);
950
951	// Build the vector to store.
952	Value *Vec = PoisonValue::get(T: VecTy);
953	unsigned VecIdx = `0`;
954	auto InsertElem = [&](Value *V) {
955	if (V->getType() != VecElemTy)
956	V = Builder.CreateBitOrPointerCast(V, DestTy: VecElemTy);
957	Vec = Builder.CreateInsertElement(Vec, NewElt: V, Idx: Builder.getInt32(C: VecIdx++));
958	};
959	for (const ChainElem &E : C) {
960	auto I = cast<StoreInst>(Val: E.Inst);
961	if (FixedVectorType *VT =
962	dyn_cast<FixedVectorType>(Val: getLoadStoreType(I))) {
963	for (int J = `0`, JE = VT->getNumElements(); J < JE; ++J) {
964	InsertElem (Builder.CreateExtractElement(Vec: I->getValueOperand(),
965	Idx: Builder.getInt32(C: J)));
966	}
967	} else {
968	InsertElem (I->getValueOperand());
969	}
970	}
971
972	// Chain is in offset order, so C[0] is the instr with the lowest offset,
973	// i.e. the root of the vector.
974	VecInst = Builder.CreateAlignedStore(
975	Val: Vec,
976	Ptr: getLoadStorePointerOperand(V: C [`0`].Inst),
977	Align: Alignment);
978	}
979
980	propagateMetadata(I: VecInst, C);
981
982	for (const ChainElem &E : C)
983	ToErase.push_back(Elt: E.Inst);
984
985	++NumVectorInstructions;
986	NumScalarsVectorized += C.size();
987	return true;
988	}
989
990	template <bool IsLoadChain>
991	bool Vectorizer::isSafeToMove(
992	Instruction ChainElem, Instruction ChainBegin,
993	const DenseMap<Instruction , APInt /OffsetFromLeader/*> &ChainOffsets) {
994	LLVM_DEBUG(dbgs() << "LSV: isSafeToMove(" << *ChainElem << " -> "
995	<< *ChainBegin << ")\n");
996
997	assert(isa<LoadInst>(ChainElem) == IsLoadChain);
998	if (ChainElem == ChainBegin)
999	return true;
1000
1001	// Invariant loads can always be reordered; by definition they are not
1002	// clobbered by stores.
1003	if (isInvariantLoad(I: ChainElem))
1004	return true;
1005
1006	auto BBIt = std::next([&] {
1007	if constexpr (IsLoadChain)
1008	return BasicBlock::reverse_iterator (ChainElem);
1009	else
1010	return BasicBlock::iterator (ChainElem);
1011	}());
1012	auto BBItEnd = std::next([&] {
1013	if constexpr (IsLoadChain)
1014	return BasicBlock::reverse_iterator (ChainBegin);
1015	else
1016	return BasicBlock::iterator (ChainBegin);
1017	}());
1018
1019	const APInt &ChainElemOffset = ChainOffsets.at(Val: ChainElem);
1020	const unsigned ChainElemSize =
1021	DL.getTypeStoreSize(Ty: getLoadStoreType(I: ChainElem));
1022
1023	for (; BBIt != BBItEnd; ++BBIt) {
1024	Instruction I = &BBIt;
1025
1026	if (!I->mayReadOrWriteMemory())
1027	continue;
1028
1029	// Loads can be reordered with other loads.
1030	if (IsLoadChain && isa<LoadInst>(Val: I))
1031	continue;
1032
1033	// Stores can be sunk below invariant loads.
1034	if (!IsLoadChain && isInvariantLoad(I))
1035	continue;
1036
1037	// If I is in the chain, we can tell whether it aliases ChainIt by checking
1038	// what offset ChainIt accesses. This may be better than AA is able to do.
1039	//
1040	// We should really only have duplicate offsets for stores (the duplicate
1041	// loads should be CSE'ed), but in case we have a duplicate load, we'll
1042	// split the chain so we don't have to handle this case specially.
1043	if (auto OffsetIt = ChainOffsets.find(Val: I); OffsetIt != ChainOffsets.end()) {
1044	// I and ChainElem overlap if:
1045	// - I and ChainElem have the same offset, OR
1046	// - I's offset is less than ChainElem's, but I touches past the
1047	// beginning of ChainElem, OR
1048	// - ChainElem's offset is less than I's, but ChainElem touches past the
1049	// beginning of I.
1050	const APInt &IOffset = OffsetIt ->second;
1051	unsigned IElemSize = DL.getTypeStoreSize(Ty: getLoadStoreType(I));
1052	if (IOffset == ChainElemOffset \|\|
1053	(IOffset.sle(RHS: ChainElemOffset) &&
1054	(IOffset + IElemSize).sgt(RHS: ChainElemOffset)) \|\|
1055	(ChainElemOffset.sle(RHS: IOffset) &&
1056	(ChainElemOffset + ChainElemSize).sgt(RHS: OffsetIt ->second))) {
1057	LLVM_DEBUG({
1058	// Double check that AA also sees this alias. If not, we probably
1059	// have a bug.
1060	ModRefInfo MR = AA.getModRefInfo(I, MemoryLocation::get(ChainElem));
1061	assert(IsLoadChain ? isModSet(MR) : isModOrRefSet(MR));
1062	dbgs() << "LSV: Found alias in chain: " << *I << "\n";
1063	});
1064	return false; // We found an aliasing instruction; bail.
1065	}
1066
1067	continue; // We're confident there's no alias.
1068	}
1069
1070	LLVM_DEBUG(dbgs() << "LSV: Querying AA for " << *I << "\n");
1071	ModRefInfo MR = AA.getModRefInfo(I, OptLoc: MemoryLocation::get(Inst: ChainElem));
1072	if (IsLoadChain ? isModSet(MRI: MR) : isModOrRefSet(MRI: MR)) {
1073	LLVM_DEBUG(dbgs() << "LSV: Found alias in chain:\n"
1074	<< " Aliasing instruction:\n"
1075	<< " " << *I << `'\n'`
1076	<< " Aliased instruction and pointer:\n"
1077	<< " " << *ChainElem << `'\n'`
1078	<< " " << *getLoadStorePointerOperand(ChainElem)
1079	<< `'\n'`);
1080
1081	return false;
1082	}
1083	}
1084	return true;
1085	}
1086
1087	static bool checkNoWrapFlags(Instruction I, bool* Signed) {
1088	BinaryOperator *BinOpI = cast<BinaryOperator>(Val: I);
1089	return (Signed && BinOpI->hasNoSignedWrap()) \|\|
1090	(!Signed && BinOpI->hasNoUnsignedWrap());
1091	}
1092
1093	static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
1094	unsigned MatchingOpIdxA, Instruction *AddOpB,
1095	unsigned MatchingOpIdxB, bool Signed) {
1096	LLVM_DEBUG(dbgs() << "LSV: checkIfSafeAddSequence IdxDiff=" << IdxDiff
1097	<< ", AddOpA=" << *AddOpA << ", MatchingOpIdxA="
1098	<< MatchingOpIdxA << ", AddOpB=" << *AddOpB
1099	<< ", MatchingOpIdxB=" << MatchingOpIdxB
1100	<< ", Signed=" << Signed << "\n");
1101	// If both OpA and OpB are adds with NSW/NUW and with one of the operands
1102	// being the same, we can guarantee that the transformation is safe if we can
1103	// prove that OpA won't overflow when Ret added to the other operand of OpA.
1104	// For example:
1105	// %tmp7 = add nsw i32 %tmp2, %v0
1106	// %tmp8 = sext i32 %tmp7 to i64
1107	// ...
1108	// %tmp11 = add nsw i32 %v0, 1
1109	// %tmp12 = add nsw i32 %tmp2, %tmp11
1110	// %tmp13 = sext i32 %tmp12 to i64
1111	//
1112	// Both %tmp7 and %tmp12 have the nsw flag and the first operand is %tmp2.
1113	// It's guaranteed that adding 1 to %tmp7 won't overflow because %tmp11 adds
1114	// 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
1115	assert(AddOpA->getOpcode() == Instruction::Add &&
1116	AddOpB->getOpcode() == Instruction::Add &&
1117	checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
1118	if (AddOpA->getOperand(i: MatchingOpIdxA) ==
1119	AddOpB->getOperand(i: MatchingOpIdxB)) {
1120	Value *OtherOperandA = AddOpA->getOperand(i: MatchingOpIdxA == `1` ? `0` : `1`);
1121	Value *OtherOperandB = AddOpB->getOperand(i: MatchingOpIdxB == `1` ? `0` : `1`);
1122	Instruction *OtherInstrA = dyn_cast<Instruction>(Val: OtherOperandA);
1123	Instruction *OtherInstrB = dyn_cast<Instruction>(Val: OtherOperandB);
1124	// Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
1125	if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
1126	checkNoWrapFlags(I: OtherInstrB, Signed) &&
1127	isa<ConstantInt>(Val: OtherInstrB->getOperand(i: `1`))) {
1128	int64_t CstVal =
1129	cast<ConstantInt>(Val: OtherInstrB->getOperand(i: `1`))->getSExtValue();
1130	if (OtherInstrB->getOperand(i: `0`) == OtherOperandA &&
1131	IdxDiff.getSExtValue() == CstVal)
1132	return true;
1133	}
1134	// Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
1135	if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
1136	checkNoWrapFlags(I: OtherInstrA, Signed) &&
1137	isa<ConstantInt>(Val: OtherInstrA->getOperand(i: `1`))) {
1138	int64_t CstVal =
1139	cast<ConstantInt>(Val: OtherInstrA->getOperand(i: `1`))->getSExtValue();
1140	if (OtherInstrA->getOperand(i: `0`) == OtherOperandB &&
1141	IdxDiff.getSExtValue() == -CstVal)
1142	return true;
1143	}
1144	// Match `x +nsw/nuw (y +nsw/nuw c)` and
1145	// `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
1146	if (OtherInstrA && OtherInstrB &&
1147	OtherInstrA->getOpcode() == Instruction::Add &&
1148	OtherInstrB->getOpcode() == Instruction::Add &&
1149	checkNoWrapFlags(I: OtherInstrA, Signed) &&
1150	checkNoWrapFlags(I: OtherInstrB, Signed) &&
1151	isa<ConstantInt>(Val: OtherInstrA->getOperand(i: `1`)) &&
1152	isa<ConstantInt>(Val: OtherInstrB->getOperand(i: `1`))) {
1153	int64_t CstValA =
1154	cast<ConstantInt>(Val: OtherInstrA->getOperand(i: `1`))->getSExtValue();
1155	int64_t CstValB =
1156	cast<ConstantInt>(Val: OtherInstrB->getOperand(i: `1`))->getSExtValue();
1157	if (OtherInstrA->getOperand(i: `0`) == OtherInstrB->getOperand(i: `0`) &&
1158	IdxDiff.getSExtValue() == (CstValB - CstValA))
1159	return true;
1160	}
1161	}
1162	return false;
1163	}
1164
1165	std::optional<APInt> Vectorizer::getConstantOffsetComplexAddrs(
1166	Value PtrA, Value PtrB, Instruction ContextInst, unsigned* Depth) {
1167	LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs PtrA=" << *PtrA
1168	<< " PtrB=" << PtrB << " ContextInst=" << ContextInst
1169	<< " Depth=" << Depth << "\n");
1170	auto *GEPA = dyn_cast<GetElementPtrInst>(Val: PtrA);
1171	auto *GEPB = dyn_cast<GetElementPtrInst>(Val: PtrB);
1172	if (!GEPA \|\| !GEPB)
1173	return getConstantOffsetSelects(PtrA, PtrB, ContextInst, Depth);
1174
1175	// Look through GEPs after checking they're the same except for the last
1176	// index.
1177	if (GEPA->getNumOperands() != GEPB->getNumOperands() \|\|
1178	GEPA->getPointerOperand() != GEPB->getPointerOperand())
1179	return std::nullopt;
1180	gep_type_iterator GTIA = gep_type_begin(GEP: GEPA);
1181	gep_type_iterator GTIB = gep_type_begin(GEP: GEPB);
1182	for (unsigned I = `0`, E = GEPA->getNumIndices() - `1`; I < E; ++I) {
1183	if (GTIA.getOperand() != GTIB.getOperand())
1184	return std::nullopt;
1185	++GTIA;
1186	++GTIB;
1187	}
1188
1189	Instruction *OpA = dyn_cast<Instruction>(Val: GTIA.getOperand());
1190	Instruction *OpB = dyn_cast<Instruction>(Val: GTIB.getOperand());
1191	if (!OpA \|\| !OpB \|\| OpA->getOpcode() != OpB->getOpcode() \|\|
1192	OpA->getType() != OpB->getType())
1193	return std::nullopt;
1194
1195	uint64_t Stride = GTIA.getSequentialElementStride(DL);
1196
1197	// Only look through a ZExt/SExt.
1198	if (!isa<SExtInst>(Val: OpA) && !isa<ZExtInst>(Val: OpA))
1199	return std::nullopt;
1200
1201	bool Signed = isa<SExtInst>(Val: OpA);
1202
1203	// At this point A could be a function parameter, i.e. not an instruction
1204	Value *ValA = OpA->getOperand(i: `0`);
1205	OpB = dyn_cast<Instruction>(Val: OpB->getOperand(i: `0`));
1206	if (!OpB \|\| ValA->getType() != OpB->getType())
1207	return std::nullopt;
1208
1209	const SCEV *OffsetSCEVA = SE.getSCEV(V: ValA);
1210	const SCEV *OffsetSCEVB = SE.getSCEV(V: OpB);
1211	const SCEV *IdxDiffSCEV = SE.getMinusSCEV(LHS: OffsetSCEVB, RHS: OffsetSCEVA);
1212	if (IdxDiffSCEV == SE.getCouldNotCompute())
1213	return std::nullopt;
1214
1215	ConstantRange IdxDiffRange = SE.getSignedRange(S: IdxDiffSCEV);
1216	if (!IdxDiffRange.isSingleElement())
1217	return std::nullopt;
1218	APInt IdxDiff = *IdxDiffRange.getSingleElement();
1219
1220	LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetComplexAddrs IdxDiff=" << IdxDiff
1221	<< "\n");
1222
1223	// Now we need to prove that adding IdxDiff to ValA won't overflow.
1224	bool Safe = false;
1225
1226	// First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
1227	// ValA, we're okay.
1228	if (OpB->getOpcode() == Instruction::Add &&
1229	isa<ConstantInt>(Val: OpB->getOperand(i: `1`)) &&
1230	IdxDiff.sle(RHS: cast<ConstantInt>(Val: OpB->getOperand(i: `1`))->getSExtValue()) &&
1231	checkNoWrapFlags(I: OpB, Signed))
1232	Safe = true;
1233
1234	// Second attempt: check if we have eligible add NSW/NUW instruction
1235	// sequences.
1236	OpA = dyn_cast<Instruction>(Val: ValA);
1237	if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
1238	OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(I: OpA, Signed) &&
1239	checkNoWrapFlags(I: OpB, Signed)) {
1240	// In the checks below a matching operand in OpA and OpB is an operand which
1241	// is the same in those two instructions. Below we account for possible
1242	// orders of the operands of these add instructions.
1243	for (unsigned MatchingOpIdxA : {`0`, `1`})
1244	for (unsigned MatchingOpIdxB : {`0`, `1`})
1245	if (!Safe)
1246	Safe = checkIfSafeAddSequence(IdxDiff, AddOpA: OpA, MatchingOpIdxA, AddOpB: OpB,
1247	MatchingOpIdxB, Signed);
1248	}
1249
1250	unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
1251
1252	// Third attempt:
1253	//
1254	// Assuming IdxDiff is positive: If all set bits of IdxDiff or any higher
1255	// order bit other than the sign bit are known to be zero in ValA, we can add
1256	// Diff to it while guaranteeing no overflow of any sort.
1257	//
1258	// If IdxDiff is negative, do the same, but swap ValA and ValB.
1259	if (!Safe) {
1260	// When computing known bits, use the GEPs as context instructions, since
1261	// they likely are in the same BB as the load/store.
1262	KnownBits Known(BitWidth);
1263	computeKnownBits(V: (IdxDiff.sge(RHS: `0`) ? ValA : OpB), Known, DL, Depth: `0`, AC: &AC,
1264	CxtI: ContextInst, DT: &DT);
1265	APInt BitsAllowedToBeSet = Known.Zero.zext(width: IdxDiff.getBitWidth());
1266	if (Signed)
1267	BitsAllowedToBeSet.clearBit(BitPosition: BitWidth - `1`);
1268	if (BitsAllowedToBeSet.ult(RHS: IdxDiff.abs()))
1269	return std::nullopt;
1270	Safe = true;
1271	}
1272
1273	if (Safe)
1274	return IdxDiff * Stride;
1275	return std::nullopt;
1276	}
1277
1278	std::optional<APInt> Vectorizer::getConstantOffsetSelects(
1279	Value PtrA, Value PtrB, Instruction ContextInst, unsigned* Depth) {
1280	if (Depth++ == MaxDepth)
1281	return std::nullopt;
1282
1283	if (auto *SelectA = dyn_cast<SelectInst>(Val: PtrA)) {
1284	if (auto *SelectB = dyn_cast<SelectInst>(Val: PtrB)) {
1285	if (SelectA->getCondition() != SelectB->getCondition())
1286	return std::nullopt;
1287	LLVM_DEBUG(dbgs() << "LSV: getConstantOffsetSelects, PtrA=" << *PtrA
1288	<< ", PtrB=" << *PtrB << ", ContextInst="
1289	<< *ContextInst << ", Depth=" << Depth << "\n");
1290	std::optional<APInt> TrueDiff = getConstantOffset(
1291	PtrA: SelectA->getTrueValue(), PtrB: SelectB->getTrueValue(), ContextInst, Depth);
1292	if (!TrueDiff.has_value())
1293	return std::nullopt;
1294	std::optional<APInt> FalseDiff =
1295	getConstantOffset(PtrA: SelectA->getFalseValue(), PtrB: SelectB->getFalseValue(),
1296	ContextInst, Depth);
1297	if (TrueDiff == FalseDiff)
1298	return TrueDiff;
1299	}
1300	}
1301	return std::nullopt;
1302	}
1303
1304	EquivalenceClassMap
1305	Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
1306	BasicBlock::iterator End) {
1307	EquivalenceClassMap Ret;
1308
1309	auto getUnderlyingObject = [](const Value Ptr) -> const* Value * {
1310	const Value *ObjPtr = llvm::getUnderlyingObject(V: Ptr);
1311	if (const auto *Sel = dyn_cast<SelectInst>(Val: ObjPtr)) {
1312	// The select's themselves are distinct instructions even if they share
1313	// the same condition and evaluate to consecutive pointers for true and
1314	// false values of the condition. Therefore using the select's themselves
1315	// for grouping instructions would put consecutive accesses into different
1316	// lists and they won't be even checked for being consecutive, and won't
1317	// be vectorized.
1318	return Sel->getCondition();
1319	}
1320	return ObjPtr;
1321	};
1322
1323	for (Instruction &I : make_range(x: Begin, y: End)) {
1324	auto *LI = dyn_cast<LoadInst>(Val: &I);
1325	auto *SI = dyn_cast<StoreInst>(Val: &I);
1326	if (!LI && !SI)
1327	continue;
1328
1329	if ((LI && !LI->isSimple()) \|\| (SI && !SI->isSimple()))
1330	continue;
1331
1332	if ((LI && !TTI.isLegalToVectorizeLoad(LI)) \|\|
1333	(SI && !TTI.isLegalToVectorizeStore(SI)))
1334	continue;
1335
1336	Type *Ty = getLoadStoreType(I: &I);
1337	if (!VectorType::isValidElementType(ElemTy: Ty->getScalarType()))
1338	continue;
1339
1340	// Skip weird non-byte sizes. They probably aren't worth the effort of
1341	// handling correctly.
1342	unsigned TySize = DL.getTypeSizeInBits(Ty);
1343	if ((TySize % `8`) != `0`)
1344	continue;
1345
1346	// Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
1347	// functions are currently using an integer type for the vectorized
1348	// load/store, and does not support casting between the integer type and a
1349	// vector of pointers (e.g. i64 to <2 x i16>)*
1350	if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
1351	continue;
1352
1353	Value *Ptr = getLoadStorePointerOperand(V: &I);
1354	unsigned AS = Ptr->getType()->getPointerAddressSpace();
1355	unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AddrSpace: AS);
1356
1357	unsigned VF = VecRegSize / TySize;
1358	VectorType *VecTy = dyn_cast<VectorType>(Val: Ty);
1359
1360	// Only handle power-of-two sized elements.
1361	if ((!VecTy && !isPowerOf2_32(Value: DL.getTypeSizeInBits(Ty))) \|\|
1362	(VecTy && !isPowerOf2_32(Value: DL.getTypeSizeInBits(Ty: VecTy->getScalarType()))))
1363	continue;
1364
1365	// No point in looking at these if they're too big to vectorize.
1366	if (TySize > VecRegSize / `2` \|\|
1367	(VecTy && TTI.getLoadVectorFactor(VF, LoadSize: TySize, ChainSizeInBytes: TySize / `8`, VecTy) == `0`))
1368	continue;
1369
1370	Ret [{getUnderlyingObject (Ptr), AS,
1371	DL.getTypeSizeInBits(Ty: getLoadStoreType(I: &I)->getScalarType()),
1372	/IsLoad=/LI != nullptr}]
1373	.push_back(Elt: &I);
1374	}
1375
1376	return Ret;
1377	}
1378
1379	std::vector<Chain> Vectorizer::gatherChains(ArrayRef<Instruction *> Instrs) {
1380	if (Instrs.empty())
1381	return {};
1382
1383	unsigned AS = getLoadStoreAddressSpace(I: Instrs [`0`]);
1384	unsigned ASPtrBits = DL.getIndexSizeInBits(AS);
1385
1386	#ifndef NDEBUG
1387	// Check that Instrs is in BB order and all have the same addr space.
1388	for (size_t I = `1`; I < Instrs.size(); ++I) {
1389	assert(Instrs[I - `1`]->comesBefore(Instrs[I]));
1390	assert(getLoadStoreAddressSpace(Instrs[I]) == AS);
1391	}
1392	#endif
1393
1394	// Machinery to build an MRU-hashtable of Chains.
1395	//
1396	// (Ideally this could be done with MapVector, but as currently implemented,
1397	// moving an element to the front of a MapVector is O(n).)
1398	struct InstrListElem : ilist_node<InstrListElem>,
1399	std::pair<Instruction *, Chain> {
1400	explicit InstrListElem(Instruction *I)
1401	: std::pair<Instruction *, Chain>(I, {}) {}
1402	};
1403	struct InstrListElemDenseMapInfo {
1404	using PtrInfo = DenseMapInfo<InstrListElem *>;
1405	using IInfo = DenseMapInfo<Instruction *>;
1406	static InstrListElem getEmptyKey() { return* PtrInfo::getEmptyKey(); }
1407	static InstrListElem *getTombstoneKey() {
1408	return PtrInfo::getTombstoneKey();
1409	}
1410	static unsigned getHashValue(const InstrListElem *E) {
1411	return IInfo::getHashValue(PtrVal: E->first);
1412	}
1413	static bool isEqual(const InstrListElem A, const* InstrListElem *B) {
1414	if (A == getEmptyKey() \|\| B == getEmptyKey())
1415	return A == getEmptyKey() && B == getEmptyKey();
1416	if (A == getTombstoneKey() \|\| B == getTombstoneKey())
1417	return A == getTombstoneKey() && B == getTombstoneKey();
1418	return IInfo::isEqual(LHS: A->first, RHS: B->first);
1419	}
1420	};
1421	SpecificBumpPtrAllocator<InstrListElem> Allocator;
1422	simple_ilist<InstrListElem> MRU;
1423	DenseSet<InstrListElem *, InstrListElemDenseMapInfo> Chains;
1424
1425	// Compare each instruction in `instrs` to leader of the N most recently-used
1426	// chains. This limits the O(n^2) behavior of this pass while also allowing
1427	// us to build arbitrarily long chains.
1428	for (Instruction *I : Instrs) {
1429	constexpr int MaxChainsToTry = `64`;
1430
1431	bool MatchFound = false;
1432	auto ChainIter = MRU.begin();
1433	for (size_t J = `0`; J < MaxChainsToTry && ChainIter != MRU.end();
1434	++J, ++ChainIter) {
1435	std::optional<APInt> Offset = getConstantOffset(
1436	PtrA: getLoadStorePointerOperand(V: ChainIter ->first),
1437	PtrB: getLoadStorePointerOperand(V: I),
1438	/ContextInst=/
1439	(ChainIter ->first->comesBefore(Other: I) ? I : ChainIter ->first));
1440	if (Offset.has_value()) {
1441	// `Offset` might not have the expected number of bits, if e.g. AS has a
1442	// different number of bits than opaque pointers.
1443	ChainIter ->second.push_back(Elt: ChainElem{.Inst: I, .OffsetFromLeader: Offset.value()});
1444	// Move ChainIter to the front of the MRU list.
1445	MRU.remove(N&: *ChainIter);
1446	MRU.push_front(Node&: *ChainIter);
1447	MatchFound = true;
1448	break;
1449	}
1450	}
1451
1452	if (!MatchFound) {
1453	APInt ZeroOffset(ASPtrBits, `0`);
1454	InstrListElem E = new* (Allocator.Allocate()) InstrListElem(I);
1455	E->second.push_back(Elt: ChainElem{.Inst: I, .OffsetFromLeader: ZeroOffset});
1456	MRU.push_front(Node&: *E);
1457	Chains.insert(V: E);
1458	}
1459	}
1460
1461	std::vector<Chain> Ret;
1462	Ret.reserve(n: Chains.size());
1463	// Iterate over MRU rather than Chains so the order is deterministic.
1464	for (auto &E : MRU)
1465	if (E.second.size() > `1`)
1466	Ret.push_back(x: std::move(E.second));
1467	return Ret;
1468	}
1469
1470	std::optional<APInt> Vectorizer::getConstantOffset(Value PtrA, Value PtrB,
1471	Instruction *ContextInst,
1472	unsigned Depth) {
1473	LLVM_DEBUG(dbgs() << "LSV: getConstantOffset, PtrA=" << *PtrA
1474	<< ", PtrB=" << PtrB << ", ContextInst= " << ContextInst
1475	<< ", Depth=" << Depth << "\n");
1476	// We'll ultimately return a value of this bit width, even if computations
1477	// happen in a different width.
1478	unsigned OrigBitWidth = DL.getIndexTypeSizeInBits(Ty: PtrA->getType());
1479	APInt OffsetA(OrigBitWidth, `0`);
1480	APInt OffsetB(OrigBitWidth, `0`);
1481	PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetA);
1482	PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, Offset&: OffsetB);
1483	unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(Ty: PtrA->getType());
1484	if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(Ty: PtrB->getType()))
1485	return std::nullopt;
1486
1487	// If we have to shrink the pointer, stripAndAccumulateInBoundsConstantOffsets
1488	// should properly handle a possible overflow and the value should fit into
1489	// the smallest data type used in the cast/gep chain.
1490	assert(OffsetA.getSignificantBits() <= NewPtrBitWidth &&
1491	OffsetB.getSignificantBits() <= NewPtrBitWidth);
1492
1493	OffsetA = OffsetA.sextOrTrunc(width: NewPtrBitWidth);
1494	OffsetB = OffsetB.sextOrTrunc(width: NewPtrBitWidth);
1495	if (PtrA == PtrB)
1496	return (OffsetB - OffsetA).sextOrTrunc(width: OrigBitWidth);
1497
1498	// Try to compute B - A.
1499	const SCEV *DistScev = SE.getMinusSCEV(LHS: SE.getSCEV(V: PtrB), RHS: SE.getSCEV(V: PtrA));
1500	if (DistScev != SE.getCouldNotCompute()) {
1501	LLVM_DEBUG(dbgs() << "LSV: SCEV PtrB - PtrA =" << *DistScev << "\n");
1502	ConstantRange DistRange = SE.getSignedRange(S: DistScev);
1503	if (DistRange.isSingleElement()) {
1504	// Handle index width (the width of Dist) != pointer width (the width of
1505	// the Offsets at this point).*
1506	APInt Dist = DistRange.getSingleElement()->sextOrTrunc(width: NewPtrBitWidth);
1507	return (OffsetB - OffsetA + Dist).sextOrTrunc(width: OrigBitWidth);
1508	}
1509	}
1510	std::optional<APInt> Diff =
1511	getConstantOffsetComplexAddrs(PtrA, PtrB, ContextInst, Depth);
1512	if (Diff.has_value())
1513	return (OffsetB - OffsetA + Diff ->sext(width: OffsetB.getBitWidth()))
1514	.sextOrTrunc(width: OrigBitWidth);
1515	return std::nullopt;
1516	}
1517

source code of llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp