InterleavedLoadCombinePass.cpp source code [llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp]

1	//===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// \file
10	//
// This file defines the interleaved-load-combine pass. The pass searches for
// ShuffleVectorInst instructions that perform interleaving loads. If a
// matching pattern is found, it adds a combined load and further instructions
// in a pattern that is detectable by InterleavedAccessPass. The old
// instructions are left dead to be removed later. The pass is specifically
// designed to be executed just before InterleavedAccessPass to find any
// left-over instances that are not detected within former passes.
18	//
19	//===----------------------------------------------------------------------===//
20
21	#include "llvm/ADT/Statistic.h"
22	#include "llvm/Analysis/MemorySSA.h"
23	#include "llvm/Analysis/MemorySSAUpdater.h"
24	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
25	#include "llvm/Analysis/TargetTransformInfo.h"
26	#include "llvm/CodeGen/InterleavedLoadCombine.h"
27	#include "llvm/CodeGen/Passes.h"
28	#include "llvm/CodeGen/TargetLowering.h"
29	#include "llvm/CodeGen/TargetPassConfig.h"
30	#include "llvm/CodeGen/TargetSubtargetInfo.h"
31	#include "llvm/IR/DataLayout.h"
32	#include "llvm/IR/Dominators.h"
33	#include "llvm/IR/Function.h"
34	#include "llvm/IR/IRBuilder.h"
35	#include "llvm/IR/Instructions.h"
36	#include "llvm/IR/Module.h"
37	#include "llvm/InitializePasses.h"
38	#include "llvm/Pass.h"
39	#include "llvm/Support/Debug.h"
40	#include "llvm/Support/ErrorHandling.h"
41	#include "llvm/Support/raw_ostream.h"
42	#include "llvm/Target/TargetMachine.h"
43
44	#include <algorithm>
45	#include <cassert>
46	#include <list>
47
48	using namespace llvm;
49
50	#define DEBUG_TYPE "interleaved-load-combine"
51
52	namespace {
53
54	/// Statistic counter
55	STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
56
57	/// Option to disable the pass
58	static cl::opt<bool> DisableInterleavedLoadCombine(
59	"disable-" DEBUG_TYPE, cl::init(Val: false), cl::Hidden,
60	cl::desc ("Disable combining of interleaved loads"));
61
62	struct VectorInfo;
63
64	struct InterleavedLoadCombineImpl {
65	public:
66	InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
67	const TargetMachine &TM)
68	: F(F), DT(DT), MSSA(MSSA),
69	TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
70	TTI(TM.getTargetTransformInfo(F)) {}
71
72	/// Scan the function for interleaved load candidates and execute the
73	/// replacement if applicable.
74	bool run();
75
76	private:
77	/// Function this pass is working on
78	Function &F;
79
80	/// Dominator Tree Analysis
81	DominatorTree &DT;
82
83	/// Memory Alias Analyses
84	MemorySSA &MSSA;
85
86	/// Target Lowering Information
87	const TargetLowering &TLI;
88
89	/// Target Transform Information
90	const TargetTransformInfo TTI;
91
92	/// Find the instruction in sets LIs that dominates all others, return nullptr
93	/// if there is none.
94	LoadInst findFirstLoad(const* std::set<LoadInst *> &LIs);
95
96	/// Replace interleaved load candidates. It does additional
97	/// analyses if this makes sense. Returns true on success and false
98	/// of nothing has been changed.
99	bool combine(std::list<VectorInfo> &InterleavedLoad,
100	OptimizationRemarkEmitter &ORE);
101
102	/// Given a set of VectorInfo containing candidates for a given interleave
103	/// factor, find a set that represents a 'factor' interleaved load.
104	bool findPattern(std::list<VectorInfo> &Candidates,
105	std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
106	const DataLayout &DL);
107	}; // InterleavedLoadCombine
108
109	/// First Order Polynomial on an n-Bit Integer Value
110	///
111	/// Polynomial(Value) = Value B + A + E2^(n-e)
112	///
113	/// A and B are the coefficients. E2^(n-e) is an error within 'e' most*
114	/// significant bits. It is introduced if an exact computation cannot be proven
115	/// (e.q. division by 2).
116	///
117	/// As part of this optimization multiple loads will be combined. It necessary
118	/// to prove that loads are within some relative offset to each other. This
119	/// class is used to prove relative offsets of values loaded from memory.
120	///
121	/// Representing an integer in this form is sound since addition in two's
122	/// complement is associative (trivial) and multiplication distributes over the
123	/// addition (see Proof(1) in Polynomial::mul). Further, both operations
124	/// commute.
125	//
126	// Example:
127	// declare @fn(i64 %IDX, <4 x float> %PTR) {*
128	// %Pa1 = add i64 %IDX, 2
129	// %Pa2 = lshr i64 %Pa1, 1
130	// %Pa3 = getelementptr inbounds <4 x float>, <4 x float> %PTR, i64 %Pa2*
131	// %Va = load <4 x float>, <4 x float> %Pa3*
132	//
133	// %Pb1 = add i64 %IDX, 4
134	// %Pb2 = lshr i64 %Pb1, 1
135	// %Pb3 = getelementptr inbounds <4 x float>, <4 x float> %PTR, i64 %Pb2*
136	// %Vb = load <4 x float>, <4 x float> %Pb3*
137	// ... }
138	//
139	// The goal is to prove that two loads load consecutive addresses.
140	//
141	// In this case the polynomials are constructed by the following
142	// steps.
143	//
144	// The number tag #e specifies the error bits.
145	//
146	// Pa_0 = %IDX #0
147	// Pa_1 = %IDX + 2 #0 \| add 2
148	// Pa_2 = %IDX/2 + 1 #1 \| lshr 1
149	// Pa_3 = %IDX/2 + 1 #1 \| GEP, step signext to i64
150	// Pa_4 = (%IDX/2)16 + 16 #0 \| GEP, multiply index by sizeof(4) for floats*
151	// Pa_5 = (%IDX/2)16 + 16 #0 \| GEP, add offset of leading components*
152	//
153	// Pb_0 = %IDX #0
154	// Pb_1 = %IDX + 4 #0 \| add 2
155	// Pb_2 = %IDX/2 + 2 #1 \| lshr 1
156	// Pb_3 = %IDX/2 + 2 #1 \| GEP, step signext to i64
157	// Pb_4 = (%IDX/2)16 + 32 #0 \| GEP, multiply index by sizeof(4) for floats*
158	// Pb_5 = (%IDX/2)16 + 16 #0 \| GEP, add offset of leading components*
159	//
160	// Pb_5 - Pa_5 = 16 #0 \| subtract to get the offset
161	//
162	// Remark: %PTR is not maintained within this class. So in this instance the
163	// offset of 16 can only be assumed if the pointers are equal.
164	//
165	class Polynomial {
166	/// Operations on B
167	enum BOps {
168	LShr,
169	Mul,
170	SExt,
171	Trunc,
172	};
173
174	/// Number of Error Bits e
175	unsigned ErrorMSBs = (unsigned)-`1`;
176
177	/// Value
178	Value V = nullptr*;
179
180	/// Coefficient B
181	SmallVector<std::pair<BOps, APInt>, `4`> B;
182
183	/// Coefficient A
184	APInt A;
185
186	public:
187	Polynomial(Value *V) : V(V) {
188	IntegerType *Ty = dyn_cast<IntegerType>(Val: V->getType());
189	if (Ty) {
190	ErrorMSBs = `0`;
191	this->V = V;
192	A = APInt (Ty->getBitWidth(), `0`);
193	}
194	}
195
196	Polynomial(const APInt &A, unsigned ErrorMSBs = `0`)
197	: ErrorMSBs(ErrorMSBs), A (A) {}
198
199	Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = `0`)
200	: ErrorMSBs(ErrorMSBs), A (BitWidth, A) {}
201
202	Polynomial() = default;
203
204	/// Increment and clamp the number of undefined bits.
205	void incErrorMSBs(unsigned amt) {
206	if (ErrorMSBs == (unsigned)-`1`)
207	return;
208
209	ErrorMSBs += amt;
210	if (ErrorMSBs > A.getBitWidth())
211	ErrorMSBs = A.getBitWidth();
212	}
213
214	/// Decrement and clamp the number of undefined bits.
215	void decErrorMSBs(unsigned amt) {
216	if (ErrorMSBs == (unsigned)-`1`)
217	return;
218
219	if (ErrorMSBs > amt)
220	ErrorMSBs -= amt;
221	else
222	ErrorMSBs = `0`;
223	}
224
225	/// Apply an add on the polynomial
226	Polynomial &add(const APInt &C) {
227	// Note: Addition is associative in two's complement even when in case of
228	// signed overflow.
229	//
230	// Error bits can only propagate into higher significant bits. As these are
231	// already regarded as undefined, there is no change.
232	//
233	// Theorem: Adding a constant to a polynomial does not change the error
234	// term.
235	//
236	// Proof:
237	//
238	// Since the addition is associative and commutes:
239	//
240	// (B + A + E2^(n-e)) + C = B + (A + C) + E2^(n-e)
241	// [qed]
242
243	if (C.getBitWidth() != A.getBitWidth()) {
244	ErrorMSBs = (unsigned)-`1`;
245	return *this;
246	}
247
248	A += C;
249	return *this;
250	}
251
252	/// Apply a multiplication onto the polynomial.
253	Polynomial &mul(const APInt &C) {
254	// Note: Multiplication distributes over the addition
255	//
256	// Theorem: Multiplication distributes over the addition
257	//
258	// Proof(1):
259	//
260	// (B+A)C =-*
261	// = (B + A) + (B + A) + .. {C Times}
262	// addition is associative and commutes, hence
263	// = B + B + .. {C Times} .. + A + A + .. {C times}
264	// = BC + AC
265	// (see (function add) for signed values and overflows)
266	// [qed]
267	//
268	// Theorem: If C has c trailing zeros, errors bits in A or B are shifted out
269	// to the left.
270	//
271	// Proof(2):
272	//
273	// Let B' and A' be the n-Bit inputs with some unknown errors EA,
274	// EB at e leading bits. B' and A' can be written down as:
275	//
276	// B' = B + 2^(n-e)EB*
277	// A' = A + 2^(n-e)EA*
278	//
279	// Let C' be an input with c trailing zero bits. C' can be written as
280	//
281	// C' = C2^c*
282	//
283	// Therefore we can compute the result by using distributivity and
284	// commutativity.
285	//
286	// (B'C' + A'C') = [B + 2^(n-e)EB] * C' + [A + 2^(n-e)EA] C' =*
287	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] C' =*
288	// = (B'+A') C' =*
289	// = [B + 2^(n-e)EB + A + 2^(n-e)EA] C' =*
290	// = [B + A + 2^(n-e)EB + 2^(n-e)EA] C' =*
291	// = (B + A) C' + [2^(n-e)EB + 2^(n-e)EA)] * C' =*
292	// = (B + A) C' + [2^(n-e)EB + 2^(n-e)EA)] * C2^c =
293	// = (B + A) C' + C(EB + EA)2^(n-e)2^c =
294	//
295	// Let EC be the final error with EC = C(EB + EA)*
296	//
297	// = (B + A)C' + EC2^(n-e)2^c =*
298	// = (B + A)C' + EC2^(n-(e-c))
299	//
300	// Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
301	// less error bits than the input. c bits are shifted out to the left.
302	// [qed]
303
304	if (C.getBitWidth() != A.getBitWidth()) {
305	ErrorMSBs = (unsigned)-`1`;
306	return *this;
307	}
308
309	// Multiplying by one is a no-op.
310	if (C.isOne()) {
311	return *this;
312	}
313
314	// Multiplying by zero removes the coefficient B and defines all bits.
315	if (C.isZero()) {
316	ErrorMSBs = `0`;
317	deleteB();
318	}
319
320	// See Proof(2): Trailing zero bits indicate a left shift. This removes
321	// leading bits from the result even if they are undefined.
322	decErrorMSBs(amt: C.countr_zero());
323
324	A *= C;
325	pushBOperation(Op: Mul, C);
326	return *this;
327	}
328
329	/// Apply a logical shift right on the polynomial
330	Polynomial &lshr(const APInt &C) {
331	// Theorem(1): (B + A + E2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'2^(n-e')
332	// where
333	// e' = e + 1,
334	// E is a e-bit number,
335	// E' is a e'-bit number,
336	// holds under the following precondition:
337	// pre(1): A % 2 = 0
338	// pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
339	// where >> expresses a logical shift to the right, with adding zeros.
340	//
341	// We need to show that for every, E there is a E'
342	//
343	// B = b_h 2^(n-1) + b_m * 2 + b_l*
344	// A = a_h 2^(n-1) + a_m * 2 (pre(1))*
345	//
346	// where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
347	//
348	// Let X = (B + A + E2^(n-e)) >> 1*
349	// Let Y = (B >> 1) + (A >> 1) + E2^(n-e) >> 1*
350	//
351	// X = [B + A + E2^(n-e)] >> 1 =*
352	// = [ b_h 2^(n-1) + b_m * 2 + b_l +*
353	// + a_h 2^(n-1) + a_m * 2 +*
354	// + E 2^(n-e) ] >> 1 =*
355	//
356	// The sum is built by putting the overflow of [a_m + b+n] into the term
357	// 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
358	// this bit is discarded. This is expressed by % 2.
359	//
360	// The bit in position 0 cannot overflow into the term (b_m + a_m).
361	//
362	// = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-1) +*
363	// + ((b_m + a_m) % 2^(n-2)) 2 +*
364	// + b_l + E 2^(n-e) ] >> 1 =*
365	//
366	// The shift is computed by dividing the terms by 2 and by cutting off
367	// b_l.
368	//
369	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
370	// + ((b_m + a_m) % 2^(n-2)) +
371	// + E 2^(n-(e+1)) =*
372	//
373	// by the definition in the Theorem e+1 = e'
374	//
375	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
376	// + ((b_m + a_m) % 2^(n-2)) +
377	// + E 2^(n-e') =*
378	//
379	// Compute Y by applying distributivity first
380	//
381	// Y = (B >> 1) + (A >> 1) + E2^(n-e') =*
382	// = (b_h 2^(n-1) + b_m * 2 + b_l) >> 1 +*
383	// + (a_h 2^(n-1) + a_m * 2) >> 1 +*
384	// + E 2^(n-e) >> 1 =*
385	//
386	// Again, the shift is computed by dividing the terms by 2 and by cutting
387	// off b_l.
388	//
389	// = b_h 2^(n-2) + b_m +*
390	// + a_h 2^(n-2) + a_m +*
391	// + E 2^(n-(e+1)) =*
392	//
393	// Again, the sum is built by putting the overflow of [a_m + b+n] into
394	// the term 2^(n-1). But this time there is room for a second bit in the
395	// term 2^(n-2) we add this bit to a new term and denote it o_h in a
396	// second step.
397	//
398	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) 2^(n-1) +*
399	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
400	// + ((b_m + a_m) % 2^(n-2)) +
401	// + E 2^(n-(e+1)) =*
402	//
403	// Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
404	// Further replace e+1 by e'.
405	//
406	// = o_h 2^(n-1) +*
407	// + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
408	// + ((b_m + a_m) % 2^(n-2)) +
409	// + E 2^(n-e') =*
410	//
411	// Move o_h into the error term and construct E'. To ensure that there is
412	// no 2^x with negative x, this step requires pre(2) (e < n).
413	//
414	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
415	// + ((b_m + a_m) % 2^(n-2)) +
416	// + o_h 2^(e'-1) * 2^(n-e') + \| pre(2), move 2^(e'-1)*
417	// \| out of the old exponent
418	// + E 2^(n-e') =*
419	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
420	// + ((b_m + a_m) % 2^(n-2)) +
421	// + [o_h 2^(e'-1) + E] * 2^(n-e') + \| move 2^(e'-1) out of*
422	// \| the old exponent
423	//
424	// Let E' = o_h 2^(e'-1) + E*
425	//
426	// = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) 2^(n-2) +*
427	// + ((b_m + a_m) % 2^(n-2)) +
428	// + E' 2^(n-e')*
429	//
430	// Because X and Y are distinct only in there error terms and E' can be
431	// constructed as shown the theorem holds.
432	// [qed]
433	//
434	// For completeness in case of the case e=n it is also required to show that
435	// distributivity can be applied.
436	//
437	// In this case Theorem(1) transforms to (the pre-condition on A can also be
438	// dropped)
439	//
440	// Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
441	// where
442	// A, B, E, E' are two's complement numbers with the same bit
443	// width
444	//
445	// Let A + B + E = X
446	// Let (B >> 1) + (A >> 1) = Y
447	//
448	// Therefore we need to show that for every X and Y there is an E' which
449	// makes the equation
450	//
451	// X = Y + E'
452	//
453	// hold. This is trivially the case for E' = X - Y.
454	//
455	// [qed]
456	//
457	// Remark: Distributing lshr with and arbitrary number n can be expressed as
458	// ((((B + A) lshr 1) lshr 1) ... ) {n times}.
459	// This construction induces n additional error bits at the left.
460
461	if (C.getBitWidth() != A.getBitWidth()) {
462	ErrorMSBs = (unsigned)-`1`;
463	return *this;
464	}
465
466	if (C.isZero())
467	return *this;
468
469	// Test if the result will be zero
470	unsigned shiftAmt = C.getZExtValue();
471	if (shiftAmt >= C.getBitWidth())
472	return mul(C: APInt (C.getBitWidth(), `0`));
473
474	// The proof that shiftAmt LSBs are zero for at least one summand is only
475	// possible for the constant number.
476	//
477	// If this can be proven add shiftAmt to the error counter
478	// `ErrorMSBs`. Otherwise set all bits as undefined.
479	if (A.countr_zero() < shiftAmt)
480	ErrorMSBs = A.getBitWidth();
481	else
482	incErrorMSBs(amt: shiftAmt);
483
484	// Apply the operation.
485	pushBOperation(Op: LShr, C);
486	A = A.lshr(shiftAmt);
487
488	return *this;
489	}
490
491	/// Apply a sign-extend or truncate operation on the polynomial.
492	Polynomial &sextOrTrunc(unsigned n) {
493	if (n < A.getBitWidth()) {
494	// Truncate: Clearly undefined Bits on the MSB side are removed
495	// if there are any.
496	decErrorMSBs(amt: A.getBitWidth() - n);
497	A = A.trunc(width: n);
498	pushBOperation(Op: Trunc, C: APInt (sizeof(n) * `8`, n));
499	}
500	if (n > A.getBitWidth()) {
501	// Extend: Clearly extending first and adding later is different
502	// to adding first and extending later in all extended bits.
503	incErrorMSBs(amt: n - A.getBitWidth());
504	A = A.sext(width: n);
505	pushBOperation(Op: SExt, C: APInt (sizeof(n) * `8`, n));
506	}
507
508	return *this;
509	}
510
511	/// Test if there is a coefficient B.
512	bool isFirstOrder() const { return V != nullptr; }
513
514	/// Test coefficient B of two Polynomials are equal.
515	bool isCompatibleTo(const Polynomial &o) const {
516	// The polynomial use different bit width.
517	if (A.getBitWidth() != o.A.getBitWidth())
518	return false;
519
520	// If neither Polynomial has the Coefficient B.
521	if (!isFirstOrder() && !o.isFirstOrder())
522	return true;
523
524	// The index variable is different.
525	if (V != o.V)
526	return false;
527
528	// Check the operations.
529	if (B.size() != o.B.size())
530	return false;
531
532	auto *ob = o.B.begin();
533	for (const auto &b : B) {
534	if (b != *ob)
535	return false;
536	ob++;
537	}
538
539	return true;
540	}
541
542	/// Subtract two polynomials, return an undefined polynomial if
543	/// subtraction is not possible.
544	Polynomial operator-(const Polynomial &o) const {
545	// Return an undefined polynomial if incompatible.
546	if (!isCompatibleTo(o))
547	return Polynomial ();
548
549	// If the polynomials are compatible (meaning they have the same
550	// coefficient on B), B is eliminated. Thus a polynomial solely
551	// containing A is returned
552	return Polynomial (A - o.A, std::max(a: ErrorMSBs, b: o.ErrorMSBs));
553	}
554
555	/// Subtract a constant from a polynomial,
556	Polynomial operator-(uint64_t C) const {
557	Polynomial Result(*this);
558	Result.A -= C;
559	return Result;
560	}
561
562	/// Add a constant to a polynomial,
563	Polynomial operator+(uint64_t C) const {
564	Polynomial Result(*this);
565	Result.A += C;
566	return Result;
567	}
568
569	/// Returns true if it can be proven that two Polynomials are equal.
570	bool isProvenEqualTo(const Polynomial &o) {
571	// Subtract both polynomials and test if it is fully defined and zero.
572	Polynomial r = *this - o;
573	return (r.ErrorMSBs == `0`) && (!r.isFirstOrder()) && (r.A.isZero());
574	}
575
576	/// Print the polynomial into a stream.
577	void print(raw_ostream &OS) const {
578	OS << "[{#ErrBits:" << ErrorMSBs << "} ";
579
580	if (V) {
581	for (auto b : B)
582	OS << "(";
583	OS << "(" << *V << ") ";
584
585	for (auto b : B) {
586	switch (b.first) {
587	case LShr:
588	OS << "LShr ";
589	break;
590	case Mul:
591	OS << "Mul ";
592	break;
593	case SExt:
594	OS << "SExt ";
595	break;
596	case Trunc:
597	OS << "Trunc ";
598	break;
599	}
600
601	OS << b.second << ") ";
602	}
603	}
604
605	OS << "+ " << A << "]";
606	}
607
608	private:
609	void deleteB() {
610	V = nullptr;
611	B.clear();
612	}
613
614	void pushBOperation(const BOps Op, const APInt &C) {
615	if (isFirstOrder()) {
616	B.push_back(Elt: std::make_pair(x: Op, y: C));
617	return;
618	}
619	}
620	};
621
622	#ifndef NDEBUG
623	static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &S) {
624	S.print(OS);
625	return OS;
626	}
627	#endif
628
629	/// VectorInfo stores abstract the following information for each vector
630	/// element:
631	///
632	/// 1) The memory address loaded into the element as Polynomial
633	/// 2) a set of load instruction necessary to construct the vector,
634	/// 3) a set of all other instructions that are necessary to create the vector and
635	/// 4) a pointer value that can be used as relative base for all elements.
636	struct VectorInfo {
637	private:
638	VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
639	llvm_unreachable(
640	"Copying VectorInfo is neither implemented nor necessary,");
641	}
642
643	public:
644	/// Information of a Vector Element
645	struct ElementInfo {
646	/// Offset Polynomial.
647	Polynomial Ofs;
648
649	/// The Load Instruction used to Load the entry. LI is null if the pointer
650	/// of the load instruction does not point on to the entry
651	LoadInst *LI;
652
653	ElementInfo(Polynomial Offset = Polynomial (), LoadInst LI = nullptr*)
654	: Ofs (Offset), LI(LI) {}
655	};
656
657	/// Basic-block the load instructions are within
658	BasicBlock BB = nullptr*;
659
660	/// Pointer value of all participation load instructions
661	Value PV = nullptr*;
662
663	/// Participating load instructions
664	std::set<LoadInst *> LIs;
665
666	/// Participating instructions
667	std::set<Instruction *> Is;
668
669	/// Final shuffle-vector instruction
670	ShuffleVectorInst SVI = nullptr*;
671
672	/// Information of the offset for each vector element
673	ElementInfo *EI;
674
675	/// Vector Type
676	FixedVectorType *const VTy;
677
678	VectorInfo(FixedVectorType *VTy) : VTy(VTy) {
679	EI = new ElementInfo[VTy->getNumElements()];
680	}
681
682	VectorInfo &operator=(const VectorInfo &other) = delete;
683
684	virtual ~VectorInfo() { delete[] EI; }
685
686	unsigned getDimension() const { return VTy->getNumElements(); }
687
688	/// Test if the VectorInfo can be part of an interleaved load with the
689	/// specified factor.
690	///
691	/// \param Factor of the interleave
692	/// \param DL Targets Datalayout
693	///
694	/// \returns true if this is possible and false if not
695	bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
696	unsigned Size = DL.getTypeAllocSize(Ty: VTy->getElementType());
697	for (unsigned i = `1`; i < getDimension(); i++) {
698	if (!EI[i].Ofs.isProvenEqualTo(o: EI[`0`].Ofs + i * Factor * Size)) {
699	return false;
700	}
701	}
702	return true;
703	}
704
705	/// Recursively computes the vector information stored in V.
706	///
707	/// This function delegates the work to specialized implementations
708	///
709	/// \param V Value to operate on
710	/// \param Result Result of the computation
711	///
712	/// \returns false if no sensible information can be gathered.
713	static bool compute(Value V, VectorInfo &Result, const* DataLayout &DL) {
714	ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Val: V);
715	if (SVI)
716	return computeFromSVI(SVI, Result, DL);
717	LoadInst *LI = dyn_cast<LoadInst>(Val: V);
718	if (LI)
719	return computeFromLI(LI, Result, DL);
720	BitCastInst *BCI = dyn_cast<BitCastInst>(Val: V);
721	if (BCI)
722	return computeFromBCI(BCI, Result, DL);
723	return false;
724	}
725
726	/// BitCastInst specialization to compute the vector information.
727	///
728	/// \param BCI BitCastInst to operate on
729	/// \param Result Result of the computation
730	///
731	/// \returns false if no sensible information can be gathered.
732	static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
733	const DataLayout &DL) {
734	Instruction *Op = dyn_cast<Instruction>(Val: BCI->getOperand(i_nocapture: `0`));
735
736	if (!Op)
737	return false;
738
739	FixedVectorType *VTy = dyn_cast<FixedVectorType>(Val: Op->getType());
740	if (!VTy)
741	return false;
742
743	// We can only cast from large to smaller vectors
744	if (Result.VTy->getNumElements() % VTy->getNumElements())
745	return false;
746
747	unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
748	unsigned NewSize = DL.getTypeAllocSize(Ty: Result.VTy->getElementType());
749	unsigned OldSize = DL.getTypeAllocSize(Ty: VTy->getElementType());
750
751	if (NewSize * Factor != OldSize)
752	return false;
753
754	VectorInfo Old(VTy);
755	if (!compute(V: Op, Result&: Old, DL))
756	return false;
757
758	for (unsigned i = `0`; i < Result.VTy->getNumElements(); i += Factor) {
759	for (unsigned j = `0`; j < Factor; j++) {
760	Result.EI[i + j] =
761	ElementInfo (Old.EI[i / Factor].Ofs + j * NewSize,
762	j == `0` ? Old.EI[i / Factor].LI : nullptr);
763	}
764	}
765
766	Result.BB = Old.BB;
767	Result.PV = Old.PV;
768	Result.LIs.insert(first: Old.LIs.begin(), last: Old.LIs.end());
769	Result.Is.insert(first: Old.Is.begin(), last: Old.Is.end());
770	Result.Is.insert(x: BCI);
771	Result.SVI = nullptr;
772
773	return true;
774	}
775
776	/// ShuffleVectorInst specialization to compute vector information.
777	///
778	/// \param SVI ShuffleVectorInst to operate on
779	/// \param Result Result of the computation
780	///
781	/// Compute the left and the right side vector information and merge them by
782	/// applying the shuffle operation. This function also ensures that the left
783	/// and right side have compatible loads. This means that all loads are with
784	/// in the same basic block and are based on the same pointer.
785	///
786	/// \returns false if no sensible information can be gathered.
787	static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
788	const DataLayout &DL) {
789	FixedVectorType *ArgTy =
790	cast<FixedVectorType>(Val: SVI->getOperand(i_nocapture: `0`)->getType());
791
792	// Compute the left hand vector information.
793	VectorInfo LHS(ArgTy);
794	if (!compute(V: SVI->getOperand(i_nocapture: `0`), Result&: LHS, DL))
795	LHS.BB = nullptr;
796
797	// Compute the right hand vector information.
798	VectorInfo RHS(ArgTy);
799	if (!compute(V: SVI->getOperand(i_nocapture: `1`), Result&: RHS, DL))
800	RHS.BB = nullptr;
801
802	// Neither operand produced sensible results?
803	if (!LHS.BB && !RHS.BB)
804	return false;
805	// Only RHS produced sensible results?
806	else if (!LHS.BB) {
807	Result.BB = RHS.BB;
808	Result.PV = RHS.PV;
809	}
810	// Only LHS produced sensible results?
811	else if (!RHS.BB) {
812	Result.BB = LHS.BB;
813	Result.PV = LHS.PV;
814	}
815	// Both operands produced sensible results?
816	else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
817	Result.BB = LHS.BB;
818	Result.PV = LHS.PV;
819	}
820	// Both operands produced sensible results but they are incompatible.
821	else {
822	return false;
823	}
824
825	// Merge and apply the operation on the offset information.
826	if (LHS.BB) {
827	Result.LIs.insert(first: LHS.LIs.begin(), last: LHS.LIs.end());
828	Result.Is.insert(first: LHS.Is.begin(), last: LHS.Is.end());
829	}
830	if (RHS.BB) {
831	Result.LIs.insert(first: RHS.LIs.begin(), last: RHS.LIs.end());
832	Result.Is.insert(first: RHS.Is.begin(), last: RHS.Is.end());
833	}
834	Result.Is.insert(x: SVI);
835	Result.SVI = SVI;
836
837	int j = `0`;
838	for (int i : SVI->getShuffleMask()) {
839	assert((i < `2` * (signed)ArgTy->getNumElements()) &&
840	"Invalid ShuffleVectorInst (index out of bounds)");
841
842	if (i < `0`)
843	Result.EI[j] = ElementInfo ();
844	else if (i < (signed)ArgTy->getNumElements()) {
845	if (LHS.BB)
846	Result.EI[j] = LHS.EI[i];
847	else
848	Result.EI[j] = ElementInfo ();
849	} else {
850	if (RHS.BB)
851	Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
852	else
853	Result.EI[j] = ElementInfo ();
854	}
855	j++;
856	}
857
858	return true;
859	}
860
861	/// LoadInst specialization to compute vector information.
862	///
863	/// This function also acts as abort condition to the recursion.
864	///
865	/// \param LI LoadInst to operate on
866	/// \param Result Result of the computation
867	///
868	/// \returns false if no sensible information can be gathered.
869	static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
870	const DataLayout &DL) {
871	Value *BasePtr;
872	Polynomial Offset;
873
874	if (LI->isVolatile())
875	return false;
876
877	if (LI->isAtomic())
878	return false;
879
880	// Get the base polynomial
881	computePolynomialFromPointer(Ptr&: *LI->getPointerOperand(), Result&: Offset, BasePtr, DL);
882
883	Result.BB = LI->getParent();
884	Result.PV = BasePtr;
885	Result.LIs.insert(x: LI);
886	Result.Is.insert(x: LI);
887
888	for (unsigned i = `0`; i < Result.getDimension(); i++) {
889	Value *Idx[`2`] = {
890	ConstantInt::get(Ty: Type::getInt32Ty(C&: LI->getContext()), V: `0`),
891	ConstantInt::get(Ty: Type::getInt32Ty(C&: LI->getContext()), V: i),
892	};
893	int64_t Ofs = DL.getIndexedOffsetInType(ElemTy: Result.VTy, Indices: ArrayRef(Idx, `2`));
894	Result.EI[i] = ElementInfo (Offset + Ofs, i == `0` ? LI : nullptr);
895	}
896
897	return true;
898	}
899
900	/// Recursively compute polynomial of a value.
901	///
902	/// \param BO Input binary operation
903	/// \param Result Result polynomial
904	static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
905	Value *LHS = BO.getOperand(i_nocapture: `0`);
906	Value *RHS = BO.getOperand(i_nocapture: `1`);
907
908	// Find the RHS Constant if any
909	ConstantInt *C = dyn_cast<ConstantInt>(Val: RHS);
910	if ((!C) && BO.isCommutative()) {
911	C = dyn_cast<ConstantInt>(Val: LHS);
912	if (C)
913	std::swap(a&: LHS, b&: RHS);
914	}
915
916	switch (BO.getOpcode()) {
917	case Instruction::Add:
918	if (!C)
919	break;
920
921	computePolynomial(V&: *LHS, Result);
922	Result.add(C: C->getValue());
923	return;
924
925	case Instruction::LShr:
926	if (!C)
927	break;
928
929	computePolynomial(V&: *LHS, Result);
930	Result.lshr(C: C->getValue());
931	return;
932
933	default:
934	break;
935	}
936
937	Result = Polynomial (&BO);
938	}
939
940	/// Recursively compute polynomial of a value
941	///
942	/// \param V input value
943	/// \param Result result polynomial
944	static void computePolynomial(Value &V, Polynomial &Result) {
945	if (auto *BO = dyn_cast<BinaryOperator>(Val: &V))
946	computePolynomialBinOp(BO&: *BO, Result);
947	else
948	Result = Polynomial (&V);
949	}
950
951	/// Compute the Polynomial representation of a Pointer type.
952	///
953	/// \param Ptr input pointer value
954	/// \param Result result polynomial
955	/// \param BasePtr pointer the polynomial is based on
956	/// \param DL Datalayout of the target machine
957	static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
958	Value *&BasePtr,
959	const DataLayout &DL) {
960	// Not a pointer type? Return an undefined polynomial
961	PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr.getType());
962	if (!PtrTy) {
963	Result = Polynomial ();
964	BasePtr = nullptr;
965	return;
966	}
967	unsigned PointerBits =
968	DL.getIndexSizeInBits(AS: PtrTy->getPointerAddressSpace());
969
970	/// Skip pointer casts. Return Zero polynomial otherwise
971	if (isa<CastInst>(Val: &Ptr)) {
972	CastInst &CI = *cast<CastInst>(Val: &Ptr);
973	switch (CI.getOpcode()) {
974	case Instruction::BitCast:
975	computePolynomialFromPointer(Ptr&: *CI.getOperand(i_nocapture: `0`), Result, BasePtr, DL);
976	break;
977	default:
978	BasePtr = &Ptr;
979	Polynomial (PointerBits, `0`);
980	break;
981	}
982	}
983	/// Resolve GetElementPtrInst.
984	else if (isa<GetElementPtrInst>(Val: &Ptr)) {
985	GetElementPtrInst &GEP = *cast<GetElementPtrInst>(Val: &Ptr);
986
987	APInt BaseOffset(PointerBits, `0`);
988
989	// Check if we can compute the Offset with accumulateConstantOffset
990	if (GEP.accumulateConstantOffset(DL, Offset&: BaseOffset)) {
991	Result = Polynomial (BaseOffset);
992	BasePtr = GEP.getPointerOperand();
993	return;
994	} else {
995	// Otherwise we allow that the last index operand of the GEP is
996	// non-constant.
997	unsigned idxOperand, e;
998	SmallVector<Value *, `4`> Indices;
999	for (idxOperand = `1`, e = GEP.getNumOperands(); idxOperand < e;
1000	idxOperand++) {
1001	ConstantInt *IDX = dyn_cast<ConstantInt>(Val: GEP.getOperand(i_nocapture: idxOperand));
1002	if (!IDX)
1003	break;
1004	Indices.push_back(Elt: IDX);
1005	}
1006
1007	// It must also be the last operand.
1008	if (idxOperand + `1` != e) {
1009	Result = Polynomial ();
1010	BasePtr = nullptr;
1011	return;
1012	}
1013
1014	// Compute the polynomial of the index operand.
1015	computePolynomial(V&: *GEP.getOperand(i_nocapture: idxOperand), Result);
1016
1017	// Compute base offset from zero based index, excluding the last
1018	// variable operand.
1019	BaseOffset =
1020	DL.getIndexedOffsetInType(ElemTy: GEP.getSourceElementType(), Indices);
1021
1022	// Apply the operations of GEP to the polynomial.
1023	unsigned ResultSize = DL.getTypeAllocSize(Ty: GEP.getResultElementType());
1024	Result.sextOrTrunc(n: PointerBits);
1025	Result.mul(C: APInt (PointerBits, ResultSize));
1026	Result.add(C: BaseOffset);
1027	BasePtr = GEP.getPointerOperand();
1028	}
1029	}
1030	// All other instructions are handled by using the value as base pointer and
1031	// a zero polynomial.
1032	else {
1033	BasePtr = &Ptr;
1034	Polynomial (DL.getIndexSizeInBits(AS: PtrTy->getPointerAddressSpace()), `0`);
1035	}
1036	}
1037
1038	#ifndef NDEBUG
1039	void print(raw_ostream &OS) const {
1040	if (PV)
1041	OS << *PV;
1042	else
1043	OS << "(none)";
1044	OS << " + ";
1045	for (unsigned i = `0`; i < getDimension(); i++)
1046	OS << ((i == `0`) ? "[" : ", ") << EI[i].Ofs;
1047	OS << "]";
1048	}
1049	#endif
1050	};
1051
1052	} // anonymous namespace
1053
1054	bool InterleavedLoadCombineImpl::findPattern(
1055	std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1056	unsigned Factor, const DataLayout &DL) {
1057	for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1058	unsigned i;
1059	// Try to find an interleaved load using the front of Worklist as first line
1060	unsigned Size = DL.getTypeAllocSize(Ty: C0 ->VTy->getElementType());
1061
1062	// List containing iterators pointing to the VectorInfos of the candidates
1063	std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1064
1065	for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C ++) {
1066	if (C ->VTy != C0 ->VTy)
1067	continue;
1068	if (C ->BB != C0 ->BB)
1069	continue;
1070	if (C ->PV != C0 ->PV)
1071	continue;
1072
1073	// Check the current value matches any of factor - 1 remaining lines
1074	for (i = `1`; i < Factor; i++) {
1075	if (C ->EI[`0`].Ofs.isProvenEqualTo(o: C0 ->EI[`0`].Ofs + i * Size)) {
1076	Res [i] = C;
1077	}
1078	}
1079
1080	for (i = `1`; i < Factor; i++) {
1081	if (Res [i] == Candidates.end())
1082	break;
1083	}
1084	if (i == Factor) {
1085	Res [`0`] = C0;
1086	break;
1087	}
1088	}
1089
1090	if (Res [`0`] != Candidates.end()) {
1091	// Move the result into the output
1092	for (unsigned i = `0`; i < Factor; i++) {
1093	InterleavedLoad.splice(position: InterleavedLoad.end(), x&: Candidates, i: Res [i]);
1094	}
1095
1096	return true;
1097	}
1098	}
1099	return false;
1100	}
1101
1102	LoadInst *
1103	InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
1104	assert(!LIs.empty() && "No load instructions given.");
1105
1106	// All LIs are within the same BB. Select the first for a reference.
1107	BasicBlock BB = (LIs.begin())->getParent();
1108	BasicBlock::iterator FLI = llvm::find_if(
1109	Range&: BB, P: [&LIs](Instruction &I) -> bool* { return is_contained(Range: LIs, Element: &I); });
1110	assert(FLI != BB->end());
1111
1112	return cast<LoadInst>(Val&: FLI);
1113	}
1114
1115	bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1116	OptimizationRemarkEmitter &ORE) {
1117	LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
1118
1119	// The insertion point is the LoadInst which loads the first values. The
1120	// following tests are used to proof that the combined load can be inserted
1121	// just before InsertionPoint.
1122	LoadInst *InsertionPoint = InterleavedLoad.front().EI[`0`].LI;
1123
1124	// Test if the offset is computed
1125	if (!InsertionPoint)
1126	return false;
1127
1128	std::set<LoadInst *> LIs;
1129	std::set<Instruction *> Is;
1130	std::set<Instruction *> SVIs;
1131
1132	InstructionCost InterleavedCost;
1133	InstructionCost InstructionCost = `0`;
1134	const TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency;
1135
1136	// Get the interleave factor
1137	unsigned Factor = InterleavedLoad.size();
1138
1139	// Merge all input sets used in analysis
1140	for (auto &VI : InterleavedLoad) {
1141	// Generate a set of all load instructions to be combined
1142	LIs.insert(first: VI.LIs.begin(), last: VI.LIs.end());
1143
1144	// Generate a set of all instructions taking part in load
1145	// interleaved. This list excludes the instructions necessary for the
1146	// polynomial construction.
1147	Is.insert(first: VI.Is.begin(), last: VI.Is.end());
1148
1149	// Generate the set of the final ShuffleVectorInst.
1150	SVIs.insert(x: VI.SVI);
1151	}
1152
1153	// There is nothing to combine.
1154	if (LIs.size() < `2`)
1155	return false;
1156
1157	// Test if all participating instruction will be dead after the
1158	// transformation. If intermediate results are used, no performance gain can
1159	// be expected. Also sum the cost of the Instructions beeing left dead.
1160	for (const auto &I : Is) {
1161	// Compute the old cost
1162	InstructionCost += TTI.getInstructionCost(U: I, CostKind);
1163
1164	// The final SVIs are allowed not to be dead, all uses will be replaced
1165	if (SVIs.find(x: I) != SVIs.end())
1166	continue;
1167
1168	// If there are users outside the set to be eliminated, we abort the
1169	// transformation. No gain can be expected.
1170	for (auto *U : I->users()) {
1171	if (Is.find(x: dyn_cast<Instruction>(Val: U)) == Is.end())
1172	return false;
1173	}
1174	}
1175
1176	// We need to have a valid cost in order to proceed.
1177	if (!InstructionCost.isValid())
1178	return false;
1179
1180	// We know that all LoadInst are within the same BB. This guarantees that
1181	// either everything or nothing is loaded.
1182	LoadInst *First = findFirstLoad(LIs);
1183
1184	// To be safe that the loads can be combined, iterate over all loads and test
1185	// that the corresponding defining access dominates first LI. This guarantees
1186	// that there are no aliasing stores in between the loads.
1187	auto FMA = MSSA.getMemoryAccess(I: First);
1188	for (auto *LI : LIs) {
1189	auto MADef = MSSA.getMemoryAccess(I: LI)->getDefiningAccess();
1190	if (!MSSA.dominates(A: MADef, B: FMA))
1191	return false;
1192	}
1193	assert(!LIs.empty() && "There are no LoadInst to combine");
1194
1195	// It is necessary that insertion point dominates all final ShuffleVectorInst.
1196	for (auto &VI : InterleavedLoad) {
1197	if (!DT.dominates(Def: InsertionPoint, User: VI.SVI))
1198	return false;
1199	}
1200
1201	// All checks are done. Add instructions detectable by InterleavedAccessPass
1202	// The old instruction will are left dead.
1203	IRBuilder<> Builder(InsertionPoint);
1204	Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1205	unsigned ElementsPerSVI =
1206	cast<FixedVectorType>(Val: InterleavedLoad.front().SVI->getType())
1207	->getNumElements();
1208	FixedVectorType ILTy = FixedVectorType::get(ElementType: ETy, NumElts: Factor ElementsPerSVI);
1209
1210	auto Indices = llvm::to_vector<`4`>(Range: llvm::seq<unsigned>(Begin: `0`, End: Factor));
1211	InterleavedCost = TTI.getInterleavedMemoryOpCost(
1212	Opcode: Instruction::Load, VecTy: ILTy, Factor, Indices, Alignment: InsertionPoint->getAlign(),
1213	AddressSpace: InsertionPoint->getPointerAddressSpace(), CostKind);
1214
1215	if (InterleavedCost >= InstructionCost) {
1216	return false;
1217	}
1218
1219	// Create the wide load and update the MemorySSA.
1220	auto Ptr = InsertionPoint->getPointerOperand();
1221	auto LI = Builder.CreateAlignedLoad(Ty: ILTy, Ptr, Align: InsertionPoint->getAlign(),
1222	Name: "interleaved.wide.load");
1223	auto MSSAU = MemorySSAUpdater (&MSSA);
1224	MemoryUse *MSSALoad = cast<MemoryUse>(Val: MSSAU.createMemoryAccessBefore(
1225	I: LI, Definition: nullptr, InsertPt: MSSA.getMemoryAccess(I: InsertionPoint)));
1226	MSSAU.insertUse(Use: MSSALoad, /RenameUses=/ true);
1227
1228	// Create the final SVIs and replace all uses.
1229	int i = `0`;
1230	for (auto &VI : InterleavedLoad) {
1231	SmallVector<int, `4`> Mask;
1232	for (unsigned j = `0`; j < ElementsPerSVI; j++)
1233	Mask.push_back(Elt: i + j * Factor);
1234
1235	Builder.SetInsertPoint(VI.SVI);
1236	auto SVI = Builder.CreateShuffleVector(V: LI, Mask, Name: "interleaved.shuffle");
1237	VI.SVI->replaceAllUsesWith(V: SVI);
1238	i++;
1239	}
1240
1241	NumInterleavedLoadCombine ++;
1242	ORE.emit(RemarkBuilder: [&]() {
1243	return OptimizationRemark (DEBUG_TYPE, "Combined Interleaved Load", LI)
1244	<< "Load interleaved combined with factor "
1245	<< ore::NV ("Factor", Factor);
1246	});
1247
1248	return true;
1249	}
1250
1251	bool InterleavedLoadCombineImpl::run() {
1252	OptimizationRemarkEmitter ORE(&F);
1253	bool changed = false;
1254	unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1255
1256	auto &DL = F.getParent()->getDataLayout();
1257
1258	// Start with the highest factor to avoid combining and recombining.
1259	for (unsigned Factor = MaxFactor; Factor >= `2`; Factor--) {
1260	std::list<VectorInfo> Candidates;
1261
1262	for (BasicBlock &BB : F) {
1263	for (Instruction &I : BB) {
1264	if (auto SVI = dyn_cast<ShuffleVectorInst>(Val: &I)) {
1265	// We don't support scalable vectors in this pass.
1266	if (isa<ScalableVectorType>(Val: SVI->getType()))
1267	continue;
1268
1269	Candidates.emplace_back(args: cast<FixedVectorType>(Val: SVI->getType()));
1270
1271	if (!VectorInfo::computeFromSVI(SVI, Result&: Candidates.back(), DL)) {
1272	Candidates.pop_back();
1273	continue;
1274	}
1275
1276	if (!Candidates.back().isInterleaved(Factor, DL)) {
1277	Candidates.pop_back();
1278	}
1279	}
1280	}
1281	}
1282
1283	std::list<VectorInfo> InterleavedLoad;
1284	while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1285	if (combine(InterleavedLoad, ORE)) {
1286	changed = true;
1287	} else {
1288	// Remove the first element of the Interleaved Load but put the others
1289	// back on the list and continue searching
1290	Candidates.splice(position: Candidates.begin(), x&: InterleavedLoad,
1291	first: std::next(x: InterleavedLoad.begin()),
1292	last: InterleavedLoad.end());
1293	}
1294	InterleavedLoad.clear();
1295	}
1296	}
1297
1298	return changed;
1299	}
1300
1301	namespace {
1302	/// This pass combines interleaved loads into a pattern detectable by
1303	/// InterleavedAccessPass.
1304	struct InterleavedLoadCombine : public FunctionPass {
1305	static char ID;
1306
1307	InterleavedLoadCombine() : FunctionPass (ID) {
1308	initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
1309	}
1310
1311	StringRef getPassName() const override {
1312	return "Interleaved Load Combine Pass";
1313	}
1314
1315	bool runOnFunction(Function &F) override {
1316	if (DisableInterleavedLoadCombine)
1317	return false;
1318
1319	auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1320	if (!TPC)
1321	return false;
1322
1323	LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
1324	<< "\n");
1325
1326	return InterleavedLoadCombineImpl (
1327	F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1328	getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1329	TPC->getTM<TargetMachine>())
1330	.run();
1331	}
1332
1333	void getAnalysisUsage(AnalysisUsage &AU) const override {
1334	AU.addRequired<MemorySSAWrapperPass>();
1335	AU.addRequired<DominatorTreeWrapperPass>();
1336	FunctionPass::getAnalysisUsage(AU);
1337	}
1338
1339	private:
1340	};
1341	} // anonymous namespace
1342
1343	PreservedAnalyses
1344	InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
1345
1346	auto &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
1347	auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(IR&: F).getMSSA();
1348	bool Changed = InterleavedLoadCombineImpl (F, DT, MemSSA, *TM).run();
1349	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
1350	}
1351
1352	char InterleavedLoadCombine::ID = `0`;
1353
1354	INITIALIZE_PASS_BEGIN(
1355	InterleavedLoadCombine, DEBUG_TYPE,
1356	"Combine interleaved loads into wide loads and shufflevector instructions",
1357	false, false)
1358	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1359	INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1360	INITIALIZE_PASS_END(
1361	InterleavedLoadCombine, DEBUG_TYPE,
1362	"Combine interleaved loads into wide loads and shufflevector instructions",
1363	false, false)
1364
1365	FunctionPass *
1366	llvm::createInterleavedLoadCombinePass() {
1367	auto P = new InterleavedLoadCombine ();
1368	return P;
1369	}
1370

source code of llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp