VectorCombine.cpp source code [llvm/lib/Transforms/Vectorize/VectorCombine.cpp]

//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//
14
15	#include "llvm/Transforms/Vectorize/VectorCombine.h"
16	#include "llvm/ADT/DenseMap.h"
17	#include "llvm/ADT/ScopeExit.h"
18	#include "llvm/ADT/Statistic.h"
19	#include "llvm/Analysis/AssumptionCache.h"
20	#include "llvm/Analysis/BasicAliasAnalysis.h"
21	#include "llvm/Analysis/GlobalsModRef.h"
22	#include "llvm/Analysis/Loads.h"
23	#include "llvm/Analysis/TargetTransformInfo.h"
24	#include "llvm/Analysis/ValueTracking.h"
25	#include "llvm/Analysis/VectorUtils.h"
26	#include "llvm/IR/Dominators.h"
27	#include "llvm/IR/Function.h"
28	#include "llvm/IR/IRBuilder.h"
29	#include "llvm/IR/PatternMatch.h"
30	#include "llvm/Support/CommandLine.h"
31	#include "llvm/Transforms/Utils/Local.h"
32	#include "llvm/Transforms/Utils/LoopUtils.h"
33	#include <numeric>
34	#include <queue>
35
36	#define DEBUG_TYPE "vector-combine"
37	#include "llvm/Transforms/Utils/InstructionWorklist.h"
38
39	using namespace llvm;
40	using namespace llvm::PatternMatch;
41
42	STATISTIC(NumVecLoad, "Number of vector loads formed");
43	STATISTIC(NumVecCmp, "Number of vector compares formed");
44	STATISTIC(NumVecBO, "Number of vector binops formed");
45	STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
46	STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
47	STATISTIC(NumScalarBO, "Number of scalar binops formed");
48	STATISTIC(NumScalarCmp, "Number of scalar compares formed");
49
50	static cl::opt<bool> DisableVectorCombine(
51	"disable-vector-combine", cl::init(Val: false), cl::Hidden,
52	cl::desc ("Disable all vector combine transforms"));
53
54	static cl::opt<bool> DisableBinopExtractShuffle(
55	"disable-binop-extract-shuffle", cl::init(Val: false), cl::Hidden,
56	cl::desc ("Disable binop extract to shuffle transforms"));
57
58	static cl::opt<unsigned> MaxInstrsToScan(
59	"vector-combine-max-scan-instrs", cl::init(Val: `30`), cl::Hidden,
60	cl::desc ("Max number of instructions to scan for vector combining."));
61
62	static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
63
64	namespace {
65	class VectorCombine {
66	public:
67	VectorCombine(Function &F, const TargetTransformInfo &TTI,
68	const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
69	const DataLayout DL, bool* TryEarlyFoldsOnly)
70	: F(F), Builder (F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
71	TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
72
73	bool run();
74
75	private:
76	Function &F;
77	IRBuilder<> Builder;
78	const TargetTransformInfo &TTI;
79	const DominatorTree &DT;
80	AAResults &AA;
81	AssumptionCache &AC;
82	const DataLayout *DL;
83
84	/// If true, only perform beneficial early IR transforms. Do not introduce new
85	/// vector operations.
86	bool TryEarlyFoldsOnly;
87
88	InstructionWorklist Worklist;
89
90	// TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
91	// parameter. That should be updated to specific sub-classes because the
92	// run loop was changed to dispatch on opcode.
93	bool vectorizeLoadInsert(Instruction &I);
94	bool widenSubvectorLoad(Instruction &I);
95	ExtractElementInst getShuffleExtract(ExtractElementInst Ext0,
96	ExtractElementInst *Ext1,
97	unsigned PreferredExtractIndex) const;
98	bool isExtractExtractCheap(ExtractElementInst Ext0, ExtractElementInst Ext1,
99	const Instruction &I,
100	ExtractElementInst *&ConvertToShuffle,
101	unsigned PreferredExtractIndex);
102	void foldExtExtCmp(ExtractElementInst Ext0, ExtractElementInst Ext1,
103	Instruction &I);
104	void foldExtExtBinop(ExtractElementInst Ext0, ExtractElementInst Ext1,
105	Instruction &I);
106	bool foldExtractExtract(Instruction &I);
107	bool foldInsExtFNeg(Instruction &I);
108	bool foldBitcastShuffle(Instruction &I);
109	bool scalarizeBinopOrCmp(Instruction &I);
110	bool scalarizeVPIntrinsic(Instruction &I);
111	bool foldExtractedCmps(Instruction &I);
112	bool foldSingleElementStore(Instruction &I);
113	bool scalarizeLoadExtract(Instruction &I);
114	bool foldShuffleOfBinops(Instruction &I);
115	bool foldShuffleOfCastops(Instruction &I);
116	bool foldShuffleOfShuffles(Instruction &I);
117	bool foldShuffleFromReductions(Instruction &I);
118	bool foldTruncFromReductions(Instruction &I);
119	bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
120
121	void replaceValue(Value &Old, Value &New) {
122	Old.replaceAllUsesWith(V: &New);
123	if (auto *NewI = dyn_cast<Instruction>(Val: &New)) {
124	New.takeName(V: &Old);
125	Worklist.pushUsersToWorkList(I&: *NewI);
126	Worklist.pushValue(V: NewI);
127	}
128	Worklist.pushValue(V: &Old);
129	}
130
131	void eraseInstruction(Instruction &I) {
132	for (Value *Op : I.operands())
133	Worklist.pushValue(V: Op);
134	Worklist.remove(I: &I);
135	I.eraseFromParent();
136	}
137	};
138	} // namespace
139
140	/// Return the source operand of a potentially bitcasted value. If there is no
141	/// bitcast, return the input value itself.
142	static Value peekThroughBitcasts(Value V) {
143	while (auto *BitCast = dyn_cast<BitCastInst>(Val: V))
144	V = BitCast->getOperand(i_nocapture: `0`);
145	return V;
146	}
147
148	static bool canWidenLoad(LoadInst Load, const* TargetTransformInfo &TTI) {
149	// Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
150	// The widened load may load data from dirty regions or create data races
151	// non-existent in the source.
152	if (!Load \|\| !Load->isSimple() \|\| !Load->hasOneUse() \|\|
153	Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) \|\|
154	mustSuppressSpeculation(LI: *Load))
155	return false;
156
157	// We are potentially transforming byte-sized (8-bit) memory accesses, so make
158	// sure we have all of our type-based constraints in place for this target.
159	Type *ScalarTy = Load->getType()->getScalarType();
160	uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
161	unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
162	if (!ScalarSize \|\| !MinVectorSize \|\| MinVectorSize % ScalarSize != `0` \|\|
163	ScalarSize % `8` != `0`)
164	return false;
165
166	return true;
167	}
168
169	bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
170	// Match insert into fixed vector of scalar value.
171	// TODO: Handle non-zero insert index.
172	Value *Scalar;
173	if (!match(V: &I, P: m_InsertElt(Val: m_Undef(), Elt: m_Value(V&: Scalar), Idx: m_ZeroInt())) \|\|
174	!Scalar->hasOneUse())
175	return false;
176
177	// Optionally match an extract from another vector.
178	Value *X;
179	bool HasExtract = match(V: Scalar, P: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ZeroInt()));
180	if (!HasExtract)
181	X = Scalar;
182
183	auto *Load = dyn_cast<LoadInst>(Val: X);
184	if (!canWidenLoad(Load, TTI))
185	return false;
186
187	Type *ScalarTy = Scalar->getType();
188	uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
189	unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
190
191	// Check safety of replacing the scalar load with a larger vector load.
192	// We use minimal alignment (maximum flexibility) because we only care about
193	// the dereferenceable region. When calculating cost and creating a new op,
194	// we may use a larger value based on alignment attributes.
195	Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
196	assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
197
198	unsigned MinVecNumElts = MinVectorSize / ScalarSize;
199	auto MinVecTy = VectorType::get(ElementType: ScalarTy, NumElements: MinVecNumElts, Scalable: false*);
200	unsigned OffsetEltIndex = `0`;
201	Align Alignment = Load->getAlign();
202	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC,
203	DT: &DT)) {
204	// It is not safe to load directly from the pointer, but we can still peek
205	// through gep offsets and check if it safe to load from a base address with
206	// updated alignment. If it is, we can shuffle the element(s) into place
207	// after loading.
208	unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(Ty: SrcPtr->getType());
209	APInt Offset(OffsetBitWidth, `0`);
210	SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL: *DL, Offset);
211
212	// We want to shuffle the result down from a high element of a vector, so
213	// the offset must be positive.
214	if (Offset.isNegative())
215	return false;
216
217	// The offset must be a multiple of the scalar element to shuffle cleanly
218	// in the element's size.
219	uint64_t ScalarSizeInBytes = ScalarSize / `8`;
220	if (Offset.urem(RHS: ScalarSizeInBytes) != `0`)
221	return false;
222
223	// If we load MinVecNumElts, will our target element still be loaded?
224	OffsetEltIndex = Offset.udiv(RHS: ScalarSizeInBytes).getZExtValue();
225	if (OffsetEltIndex >= MinVecNumElts)
226	return false;
227
228	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC,
229	DT: &DT))
230	return false;
231
232	// Update alignment with offset value. Note that the offset could be negated
233	// to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
234	// negation does not change the result of the alignment calculation.
235	Alignment = commonAlignment(A: Alignment, Offset: Offset.getZExtValue());
236	}
237
238	// Original pattern: insertelt undef, load [free casts of] PtrOp, 0
239	// Use the greater of the alignment on the load or its source pointer.
240	Alignment = std::max(a: SrcPtr->getPointerAlignment(DL: *DL), b: Alignment);
241	Type *LoadTy = Load->getType();
242	unsigned AS = Load->getPointerAddressSpace();
243	InstructionCost OldCost =
244	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS);
245	APInt DemandedElts = APInt::getOneBitSet(numBits: MinVecNumElts, BitNo: `0`);
246	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
247	OldCost +=
248	TTI.getScalarizationOverhead(Ty: MinVecTy, DemandedElts,
249	/ Insert / true, Extract: HasExtract, CostKind);
250
251	// New pattern: load VecPtr
252	InstructionCost NewCost =
253	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: MinVecTy, Alignment, AddressSpace: AS);
254	// Optionally, we are shuffling the loaded vector element(s) into place.
255	// For the mask set everything but element 0 to undef to prevent poison from
256	// propagating from the extra loaded memory. This will also optionally
257	// shrink/grow the vector from the loaded size to the output size.
258	// We assume this operation has no cost in codegen if there was no offset.
259	// Note that we could use freeze to avoid poison problems, but then we might
260	// still need a shuffle to change the vector size.
261	auto *Ty = cast<FixedVectorType>(Val: I.getType());
262	unsigned OutputNumElts = Ty->getNumElements();
263	SmallVector<int, `16`> Mask(OutputNumElts, PoisonMaskElem);
264	assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
265	Mask [`0`] = OffsetEltIndex;
266	if (OffsetEltIndex)
267	NewCost += TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: MinVecTy, Mask);
268
269	// We can aggressively convert to the vector form because the backend can
270	// invert this transform if it does not result in a performance win.
271	if (OldCost < NewCost \|\| !NewCost.isValid())
272	return false;
273
274	// It is safe and potentially profitable to load a vector directly:
275	// inselt undef, load Scalar, 0 --> load VecPtr
276	IRBuilder<> Builder(Load);
277	Value *CastedPtr =
278	Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS));
279	Value *VecLd = Builder.CreateAlignedLoad(Ty: MinVecTy, Ptr: CastedPtr, Align: Alignment);
280	VecLd = Builder.CreateShuffleVector(V: VecLd, Mask);
281
282	replaceValue(Old&: I, New&: *VecLd);
283	++NumVecLoad;
284	return true;
285	}
286
287	/// If we are loading a vector and then inserting it into a larger vector with
288	/// undefined elements, try to load the larger vector and eliminate the insert.
289	/// This removes a shuffle in IR and may allow combining of other loaded values.
290	bool VectorCombine::widenSubvectorLoad(Instruction &I) {
291	// Match subvector insert of fixed vector.
292	auto *Shuf = cast<ShuffleVectorInst>(Val: &I);
293	if (!Shuf->isIdentityWithPadding())
294	return false;
295
296	// Allow a non-canonical shuffle mask that is choosing elements from op1.
297	unsigned NumOpElts =
298	cast<FixedVectorType>(Val: Shuf->getOperand(i_nocapture: `0`)->getType())->getNumElements();
299	unsigned OpIndex = any_of(Range: Shuf->getShuffleMask(), P: [&NumOpElts](int M) {
300	return M >= (int)(NumOpElts);
301	});
302
303	auto *Load = dyn_cast<LoadInst>(Val: Shuf->getOperand(i_nocapture: OpIndex));
304	if (!canWidenLoad(Load, TTI))
305	return false;
306
307	// We use minimal alignment (maximum flexibility) because we only care about
308	// the dereferenceable region. When calculating cost and creating a new op,
309	// we may use a larger value based on alignment attributes.
310	auto *Ty = cast<FixedVectorType>(Val: I.getType());
311	Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
312	assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
313	Align Alignment = Load->getAlign();
314	if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty, Alignment: Align (`1`), DL: *DL, ScanFrom: Load, AC: &AC, DT: &DT))
315	return false;
316
317	Alignment = std::max(a: SrcPtr->getPointerAlignment(DL: *DL), b: Alignment);
318	Type *LoadTy = Load->getType();
319	unsigned AS = Load->getPointerAddressSpace();
320
321	// Original pattern: insert_subvector (load PtrOp)
322	// This conservatively assumes that the cost of a subvector insert into an
323	// undef value is 0. We could add that cost if the cost model accurately
324	// reflects the real cost of that operation.
325	InstructionCost OldCost =
326	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS);
327
328	// New pattern: load PtrOp
329	InstructionCost NewCost =
330	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS);
331
332	// We can aggressively convert to the vector form because the backend can
333	// invert this transform if it does not result in a performance win.
334	if (OldCost < NewCost \|\| !NewCost.isValid())
335	return false;
336
337	IRBuilder<> Builder(Load);
338	Value *CastedPtr =
339	Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS));
340	Value *VecLd = Builder.CreateAlignedLoad(Ty, Ptr: CastedPtr, Align: Alignment);
341	replaceValue(Old&: I, New&: *VecLd);
342	++NumVecLoad;
343	return true;
344	}
345
346	/// Determine which, if any, of the inputs should be replaced by a shuffle
347	/// followed by extract from a different index.
348	ExtractElementInst *VectorCombine::getShuffleExtract(
349	ExtractElementInst Ext0, ExtractElementInst Ext1,
350	unsigned PreferredExtractIndex = InvalidIndex) const {
351	auto *Index0C = dyn_cast<ConstantInt>(Val: Ext0->getIndexOperand());
352	auto *Index1C = dyn_cast<ConstantInt>(Val: Ext1->getIndexOperand());
353	assert(Index0C && Index1C && "Expected constant extract indexes");
354
355	unsigned Index0 = Index0C->getZExtValue();
356	unsigned Index1 = Index1C->getZExtValue();
357
358	// If the extract indexes are identical, no shuffle is needed.
359	if (Index0 == Index1)
360	return nullptr;
361
362	Type *VecTy = Ext0->getVectorOperand()->getType();
363	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
364	assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
365	InstructionCost Cost0 =
366	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0);
367	InstructionCost Cost1 =
368	TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1);
369
370	// If both costs are invalid no shuffle is needed
371	if (!Cost0.isValid() && !Cost1.isValid())
372	return nullptr;
373
374	// We are extracting from 2 different indexes, so one operand must be shuffled
375	// before performing a vector operation and/or extract. The more expensive
376	// extract will be replaced by a shuffle.
377	if (Cost0 > Cost1)
378	return Ext0;
379	if (Cost1 > Cost0)
380	return Ext1;
381
382	// If the costs are equal and there is a preferred extract index, shuffle the
383	// opposite operand.
384	if (PreferredExtractIndex == Index0)
385	return Ext1;
386	if (PreferredExtractIndex == Index1)
387	return Ext0;
388
389	// Otherwise, replace the extract with the higher index.
390	return Index0 > Index1 ? Ext0 : Ext1;
391	}
392
393	/// Compare the relative costs of 2 extracts followed by scalar operation vs.
394	/// vector operation(s) followed by extract. Return true if the existing
395	/// instructions are cheaper than a vector alternative. Otherwise, return false
396	/// and if one of the extracts should be transformed to a shufflevector, set
397	/// \p ConvertToShuffle to that extract instruction.
398	bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
399	ExtractElementInst *Ext1,
400	const Instruction &I,
401	ExtractElementInst *&ConvertToShuffle,
402	unsigned PreferredExtractIndex) {
403	auto *Ext0IndexC = dyn_cast<ConstantInt>(Val: Ext0->getOperand(i_nocapture: `1`));
404	auto *Ext1IndexC = dyn_cast<ConstantInt>(Val: Ext1->getOperand(i_nocapture: `1`));
405	assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
406
407	unsigned Opcode = I.getOpcode();
408	Type *ScalarTy = Ext0->getType();
409	auto *VecTy = cast<VectorType>(Val: Ext0->getOperand(i_nocapture: `0`)->getType());
410	InstructionCost ScalarOpCost, VectorOpCost;
411
412	// Get cost estimates for scalar and vector versions of the operation.
413	bool IsBinOp = Instruction::isBinaryOp(Opcode);
414	if (IsBinOp) {
415	ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy);
416	VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy);
417	} else {
418	assert((Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) &&
419	"Expected a compare");
420	CmpInst::Predicate Pred = cast<CmpInst>(Val: I).getPredicate();
421	ScalarOpCost = TTI.getCmpSelInstrCost(
422	Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred);
423	VectorOpCost = TTI.getCmpSelInstrCost(
424	Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred);
425	}
426
427	// Get cost estimates for the extract elements. These costs will factor into
428	// both sequences.
429	unsigned Ext0Index = Ext0IndexC->getZExtValue();
430	unsigned Ext1Index = Ext1IndexC->getZExtValue();
431	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
432
433	InstructionCost Extract0Cost =
434	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Ext0Index);
435	InstructionCost Extract1Cost =
436	TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Ext1Index);
437
438	// A more expensive extract will always be replaced by a splat shuffle.
439	// For example, if Ext0 is more expensive:
440	// opcode (extelt V0, Ext0), (ext V1, Ext1) -->
441	// extelt (opcode (splat V0, Ext0), V1), Ext1
442	// TODO: Evaluate whether that always results in lowest cost. Alternatively,
443	// check the cost of creating a broadcast shuffle and shuffling both
444	// operands to element 0.
445	InstructionCost CheapExtractCost = std::min(a: Extract0Cost, b: Extract1Cost);
446
447	// Extra uses of the extracts mean that we include those costs in the
448	// vector total because those instructions will not be eliminated.
449	InstructionCost OldCost, NewCost;
450	if (Ext0->getOperand(i_nocapture: `0`) == Ext1->getOperand(i_nocapture: `0`) && Ext0Index == Ext1Index) {
451	// Handle a special case. If the 2 extracts are identical, adjust the
452	// formulas to account for that. The extra use charge allows for either the
453	// CSE'd pattern or an unoptimized form with identical values:
454	// opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
455	bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(N: `2`)
456	: !Ext0->hasOneUse() \|\| !Ext1->hasOneUse();
457	OldCost = CheapExtractCost + ScalarOpCost;
458	NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
459	} else {
460	// Handle the general case. Each extract is actually a different value:
461	// opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
462	OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
463	NewCost = VectorOpCost + CheapExtractCost +
464	!Ext0->hasOneUse() * Extract0Cost +
465	!Ext1->hasOneUse() * Extract1Cost;
466	}
467
468	ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
469	if (ConvertToShuffle) {
470	if (IsBinOp && DisableBinopExtractShuffle)
471	return true;
472
473	// If we are extracting from 2 different indexes, then one operand must be
474	// shuffled before performing the vector operation. The shuffle mask is
475	// poison except for 1 lane that is being translated to the remaining
476	// extraction lane. Therefore, it is a splat shuffle. Ex:
477	// ShufMask = { poison, poison, 0, poison }
478	// TODO: The cost model has an option for a "broadcast" shuffle
479	// (splat-from-element-0), but no option for a more general splat.
480	NewCost +=
481	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy);
482	}
483
484	// Aggressively form a vector op if the cost is equal because the transform
485	// may enable further optimization.
486	// Codegen can reverse this transform (scalarize) if it was not profitable.
487	return OldCost < NewCost;
488	}
489
490	/// Create a shuffle that translates (shifts) 1 element from the input vector
491	/// to a new element location.
492	static Value createShiftShuffle(Value Vec, unsigned OldIndex,
493	unsigned NewIndex, IRBuilder<> &Builder) {
494	// The shuffle mask is poison except for 1 lane that is being translated
495	// to the new element index. Example for OldIndex == 2 and NewIndex == 0:
496	// ShufMask = { 2, poison, poison, poison }
497	auto *VecTy = cast<FixedVectorType>(Val: Vec->getType());
498	SmallVector<int, `32`> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
499	ShufMask [NewIndex] = OldIndex;
500	return Builder.CreateShuffleVector(V: Vec, Mask: ShufMask, Name: "shift");
501	}
502
503	/// Given an extract element instruction with constant index operand, shuffle
504	/// the source vector (shift the scalar element) to a NewIndex for extraction.
505	/// Return null if the input can be constant folded, so that we are not creating
506	/// unnecessary instructions.
507	static ExtractElementInst translateExtract(ExtractElementInst ExtElt,
508	unsigned NewIndex,
509	IRBuilder<> &Builder) {
510	// Shufflevectors can only be created for fixed-width vectors.
511	if (!isa<FixedVectorType>(Val: ExtElt->getOperand(i_nocapture: `0`)->getType()))
512	return nullptr;
513
514	// If the extract can be constant-folded, this code is unsimplified. Defer
515	// to other passes to handle that.
516	Value *X = ExtElt->getVectorOperand();
517	Value *C = ExtElt->getIndexOperand();
518	assert(isa<ConstantInt>(C) && "Expected a constant index operand");
519	if (isa<Constant>(Val: X))
520	return nullptr;
521
522	Value *Shuf = createShiftShuffle(Vec: X, OldIndex: cast<ConstantInt>(Val: C)->getZExtValue(),
523	NewIndex, Builder);
524	return cast<ExtractElementInst>(Val: Builder.CreateExtractElement(Vec: Shuf, Idx: NewIndex));
525	}
526
527	/// Try to reduce extract element costs by converting scalar compares to vector
528	/// compares followed by extract.
529	/// cmp (ext0 V0, C), (ext1 V1, C)
530	void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
531	ExtractElementInst *Ext1, Instruction &I) {
532	assert(isa<CmpInst>(&I) && "Expected a compare");
533	assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
534	cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
535	"Expected matching constant extract indexes");
536
537	// cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
538	++NumVecCmp;
539	CmpInst::Predicate Pred = cast<CmpInst>(Val: &I)->getPredicate();
540	Value V0 = Ext0->getVectorOperand(), V1 = Ext1->getVectorOperand();
541	Value *VecCmp = Builder.CreateCmp(Pred, LHS: V0, RHS: V1);
542	Value *NewExt = Builder.CreateExtractElement(Vec: VecCmp, Idx: Ext0->getIndexOperand());
543	replaceValue(Old&: I, New&: *NewExt);
544	}
545
546	/// Try to reduce extract element costs by converting scalar binops to vector
547	/// binops followed by extract.
548	/// bo (ext0 V0, C), (ext1 V1, C)
549	void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
550	ExtractElementInst *Ext1, Instruction &I) {
551	assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
552	assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
553	cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
554	"Expected matching constant extract indexes");
555
556	// bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
557	++NumVecBO;
558	Value V0 = Ext0->getVectorOperand(), V1 = Ext1->getVectorOperand();
559	Value *VecBO =
560	Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val: &I)->getOpcode(), LHS: V0, RHS: V1);
561
562	// All IR flags are safe to back-propagate because any potential poison
563	// created in unused vector elements is discarded by the extract.
564	if (auto *VecBOInst = dyn_cast<Instruction>(Val: VecBO))
565	VecBOInst->copyIRFlags(V: &I);
566
567	Value *NewExt = Builder.CreateExtractElement(Vec: VecBO, Idx: Ext0->getIndexOperand());
568	replaceValue(Old&: I, New&: *NewExt);
569	}
570
571	/// Match an instruction with extracted vector operands.
572	bool VectorCombine::foldExtractExtract(Instruction &I) {
573	// It is not safe to transform things like div, urem, etc. because we may
574	// create undefined behavior when executing those on unknown vector elements.
575	if (!isSafeToSpeculativelyExecute(I: &I))
576	return false;
577
578	Instruction I0, I1;
579	CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
580	if (!match(V: &I, P: m_Cmp(Pred, L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))) &&
581	!match(V: &I, P: m_BinOp(L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))))
582	return false;
583
584	Value V0, V1;
585	uint64_t C0, C1;
586	if (!match(V: I0, P: m_ExtractElt(Val: m_Value(V&: V0), Idx: m_ConstantInt(V&: C0))) \|\|
587	!match(V: I1, P: m_ExtractElt(Val: m_Value(V&: V1), Idx: m_ConstantInt(V&: C1))) \|\|
588	V0->getType() != V1->getType())
589	return false;
590
591	// If the scalar value 'I' is going to be re-inserted into a vector, then try
592	// to create an extract to that same element. The extract/insert can be
593	// reduced to a "select shuffle".
594	// TODO: If we add a larger pattern match that starts from an insert, this
595	// probably becomes unnecessary.
596	auto *Ext0 = cast<ExtractElementInst>(Val: I0);
597	auto *Ext1 = cast<ExtractElementInst>(Val: I1);
598	uint64_t InsertIndex = InvalidIndex;
599	if (I.hasOneUse())
600	match(V: I.user_back(),
601	P: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ConstantInt(V&: InsertIndex)));
602
603	ExtractElementInst *ExtractToChange;
604	if (isExtractExtractCheap(Ext0, Ext1, I, ConvertToShuffle&: ExtractToChange, PreferredExtractIndex: InsertIndex))
605	return false;
606
607	if (ExtractToChange) {
608	unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
609	ExtractElementInst *NewExtract =
610	translateExtract(ExtElt: ExtractToChange, NewIndex: CheapExtractIdx, Builder);
611	if (!NewExtract)
612	return false;
613	if (ExtractToChange == Ext0)
614	Ext0 = NewExtract;
615	else
616	Ext1 = NewExtract;
617	}
618
619	if (Pred != CmpInst::BAD_ICMP_PREDICATE)
620	foldExtExtCmp(Ext0, Ext1, I);
621	else
622	foldExtExtBinop(Ext0, Ext1, I);
623
624	Worklist.push(I: Ext0);
625	Worklist.push(I: Ext1);
626	return true;
627	}
628
629	/// Try to replace an extract + scalar fneg + insert with a vector fneg +
630	/// shuffle.
631	bool VectorCombine::foldInsExtFNeg(Instruction &I) {
632	// Match an insert (op (extract)) pattern.
633	Value *DestVec;
634	uint64_t Index;
635	Instruction *FNeg;
636	if (!match(V: &I, P: m_InsertElt(Val: m_Value(V&: DestVec), Elt: m_OneUse(SubPattern: m_Instruction(I&: FNeg)),
637	Idx: m_ConstantInt(V&: Index))))
638	return false;
639
640	// Note: This handles the canonical fneg instruction and "fsub -0.0, X".
641	Value *SrcVec;
642	Instruction *Extract;
643	if (!match(V: FNeg, P: m_FNeg(X: m_CombineAnd(
644	L: m_Instruction(I&: Extract),
645	R: m_ExtractElt(Val: m_Value(V&: SrcVec), Idx: m_SpecificInt(V: Index))))))
646	return false;
647
648	// TODO: We could handle this with a length-changing shuffle.
649	auto *VecTy = cast<FixedVectorType>(Val: I.getType());
650	if (SrcVec->getType() != VecTy)
651	return false;
652
653	// Ignore bogus insert/extract index.
654	unsigned NumElts = VecTy->getNumElements();
655	if (Index >= NumElts)
656	return false;
657
658	// We are inserting the negated element into the same lane that we extracted
659	// from. This is equivalent to a select-shuffle that chooses all but the
660	// negated element from the destination vector.
661	SmallVector<int> Mask(NumElts);
662	std::iota(first: Mask.begin(), last: Mask.end(), value: `0`);
663	Mask [Index] = Index + NumElts;
664
665	Type *ScalarTy = VecTy->getScalarType();
666	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
667	InstructionCost OldCost =
668	TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: ScalarTy) +
669	TTI.getVectorInstrCost(I, Val: VecTy, CostKind, Index);
670
671	// If the extract has one use, it will be eliminated, so count it in the
672	// original cost. If it has more than one use, ignore the cost because it will
673	// be the same before/after.
674	if (Extract->hasOneUse())
675	OldCost += TTI.getVectorInstrCost(I: *Extract, Val: VecTy, CostKind, Index);
676
677	InstructionCost NewCost =
678	TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: VecTy) +
679	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Select, Tp: VecTy, Mask);
680
681	if (NewCost > OldCost)
682	return false;
683
684	// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
685	// shuffle DestVec, (fneg SrcVec), Mask
686	Value *VecFNeg = Builder.CreateFNegFMF(V: SrcVec, FMFSource: FNeg);
687	Value *Shuf = Builder.CreateShuffleVector(V1: DestVec, V2: VecFNeg, Mask);
688	replaceValue(Old&: I, New&: *Shuf);
689	return true;
690	}
691
692	/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
693	/// destination type followed by shuffle. This can enable further transforms by
694	/// moving bitcasts or shuffles together.
695	bool VectorCombine::foldBitcastShuffle(Instruction &I) {
696	Value V0, V1;
697	ArrayRef<int> Mask;
698	if (!match(V: &I, P: m_BitCast(Op: m_OneUse(
699	SubPattern: m_Shuffle(v1: m_Value(V&: V0), v2: m_Value(V&: V1), mask: m_Mask (Mask))))))
700	return false;
701
702	// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
703	// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
704	// mask for scalable type is a splat or not.
705	// 2) Disallow non-vector casts.
706	// TODO: We could allow any shuffle.
707	auto *DestTy = dyn_cast<FixedVectorType>(Val: I.getType());
708	auto *SrcTy = dyn_cast<FixedVectorType>(Val: V0->getType());
709	if (!DestTy \|\| !SrcTy)
710	return false;
711
712	unsigned DestEltSize = DestTy->getScalarSizeInBits();
713	unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
714	if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != `0`)
715	return false;
716
717	bool IsUnary = isa<UndefValue>(Val: V1);
718
719	// For binary shuffles, only fold bitcast(shuffle(X,Y))
720	// if it won't increase the number of bitcasts.
721	if (!IsUnary) {
722	auto *BCTy0 = dyn_cast<FixedVectorType>(Val: peekThroughBitcasts(V: V0)->getType());
723	auto *BCTy1 = dyn_cast<FixedVectorType>(Val: peekThroughBitcasts(V: V1)->getType());
724	if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
725	!(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
726	return false;
727	}
728
729	SmallVector<int, `16`> NewMask;
730	if (DestEltSize <= SrcEltSize) {
731	// The bitcast is from wide to narrow/equal elements. The shuffle mask can
732	// always be expanded to the equivalent form choosing narrower elements.
733	assert(SrcEltSize % DestEltSize == `0` && "Unexpected shuffle mask");
734	unsigned ScaleFactor = SrcEltSize / DestEltSize;
735	narrowShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask);
736	} else {
737	// The bitcast is from narrow elements to wide elements. The shuffle mask
738	// must choose consecutive elements to allow casting first.
739	assert(DestEltSize % SrcEltSize == `0` && "Unexpected shuffle mask");
740	unsigned ScaleFactor = DestEltSize / SrcEltSize;
741	if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask))
742	return false;
743	}
744
745	// Bitcast the shuffle src - keep its original width but using the destination
746	// scalar type.
747	unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
748	auto *NewShuffleTy =
749	FixedVectorType::get(ElementType: DestTy->getScalarType(), NumElts: NumSrcElts);
750	auto *OldShuffleTy =
751	FixedVectorType::get(ElementType: SrcTy->getScalarType(), NumElts: Mask.size());
752	unsigned NumOps = IsUnary ? `1` : `2`;
753
754	// The new shuffle must not cost more than the old shuffle.
755	TargetTransformInfo::TargetCostKind CK =
756	TargetTransformInfo::TCK_RecipThroughput;
757	TargetTransformInfo::ShuffleKind SK =
758	IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
759	: TargetTransformInfo::SK_PermuteTwoSrc;
760
761	InstructionCost DestCost =
762	TTI.getShuffleCost(Kind: SK, Tp: NewShuffleTy, Mask: NewMask, CostKind: CK) +
763	(NumOps * TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: NewShuffleTy, Src: SrcTy,
764	CCH: TargetTransformInfo::CastContextHint::None,
765	CostKind: CK));
766	InstructionCost SrcCost =
767	TTI.getShuffleCost(Kind: SK, Tp: SrcTy, Mask, CostKind: CK) +
768	TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: DestTy, Src: OldShuffleTy,
769	CCH: TargetTransformInfo::CastContextHint::None, CostKind: CK);
770	if (DestCost > SrcCost \|\| !DestCost.isValid())
771	return false;
772
773	// bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
774	++NumShufOfBitcast;
775	Value *CastV0 = Builder.CreateBitCast(V: peekThroughBitcasts(V: V0), DestTy: NewShuffleTy);
776	Value *CastV1 = Builder.CreateBitCast(V: peekThroughBitcasts(V: V1), DestTy: NewShuffleTy);
777	Value *Shuf = Builder.CreateShuffleVector(V1: CastV0, V2: CastV1, Mask: NewMask);
778	replaceValue(Old&: I, New&: *Shuf);
779	return true;
780	}
781
782	/// VP Intrinsics whose vector operands are both splat values may be simplified
783	/// into the scalar version of the operation and the result splatted. This
784	/// can lead to scalarization down the line.
785	bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
786	if (!isa<VPIntrinsic>(Val: I))
787	return false;
788	VPIntrinsic &VPI = cast<VPIntrinsic>(Val&: I);
789	Value *Op0 = VPI.getArgOperand(i: `0`);
790	Value *Op1 = VPI.getArgOperand(i: `1`);
791
792	if (!isSplatValue(V: Op0) \|\| !isSplatValue(V: Op1))
793	return false;
794
795	// Check getSplatValue early in this function, to avoid doing unnecessary
796	// work.
797	Value *ScalarOp0 = getSplatValue(V: Op0);
798	Value *ScalarOp1 = getSplatValue(V: Op1);
799	if (!ScalarOp0 \|\| !ScalarOp1)
800	return false;
801
802	// For the binary VP intrinsics supported here, the result on disabled lanes
803	// is a poison value. For now, only do this simplification if all lanes
804	// are active.
805	// TODO: Relax the condition that all lanes are active by using insertelement
806	// on inactive lanes.
807	auto IsAllTrueMask = [](Value *MaskVal) {
808	if (Value *SplattedVal = getSplatValue(V: MaskVal))
809	if (auto *ConstValue = dyn_cast<Constant>(Val: SplattedVal))
810	return ConstValue->isAllOnesValue();
811	return false;
812	};
813	if (!IsAllTrueMask (VPI.getArgOperand(i: `2`)))
814	return false;
815
816	// Check to make sure we support scalarization of the intrinsic
817	Intrinsic::ID IntrID = VPI.getIntrinsicID();
818	if (!VPBinOpIntrinsic::isVPBinOp(ID: IntrID))
819	return false;
820
821	// Calculate cost of splatting both operands into vectors and the vector
822	// intrinsic
823	VectorType *VecTy = cast<VectorType>(Val: VPI.getType());
824	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
825	SmallVector<int> Mask;
826	if (auto *FVTy = dyn_cast<FixedVectorType>(Val: VecTy))
827	Mask.resize(N: FVTy->getNumElements(), NV: `0`);
828	InstructionCost SplatCost =
829	TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: `0`) +
830	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy, Mask);
831
832	// Calculate the cost of the VP Intrinsic
833	SmallVector<Type *, `4`> Args;
834	for (Value *V : VPI.args())
835	Args.push_back(Elt: V->getType());
836	IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
837	InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind);
838	InstructionCost OldCost = `2` * SplatCost + VectorOpCost;
839
840	// Determine scalar opcode
841	std::optional<unsigned> FunctionalOpcode =
842	VPI.getFunctionalOpcode();
843	std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
844	if (!FunctionalOpcode) {
845	ScalarIntrID = VPI.getFunctionalIntrinsicID();
846	if (!ScalarIntrID)
847	return false;
848	}
849
850	// Calculate cost of scalarizing
851	InstructionCost ScalarOpCost = `0`;
852	if (ScalarIntrID) {
853	IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
854	ScalarOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind);
855	} else {
856	ScalarOpCost =
857	TTI.getArithmeticInstrCost(Opcode: *FunctionalOpcode, Ty: VecTy->getScalarType());
858	}
859
860	// The existing splats may be kept around if other instructions use them.
861	InstructionCost CostToKeepSplats =
862	(SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
863	InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
864
865	LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
866	<< "\n");
867	LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
868	<< ", Cost of scalarizing:" << NewCost << "\n");
869
870	// We want to scalarize unless the vector variant actually has lower cost.
871	if (OldCost < NewCost \|\| !NewCost.isValid())
872	return false;
873
874	// Scalarize the intrinsic
875	ElementCount EC = cast<VectorType>(Val: Op0->getType())->getElementCount();
876	Value *EVL = VPI.getArgOperand(i: `3`);
877
878	// If the VP op might introduce UB or poison, we can scalarize it provided
879	// that we know the EVL > 0: If the EVL is zero, then the original VP op
880	// becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
881	// scalarizing it.
882	bool SafeToSpeculate;
883	if (ScalarIntrID)
884	SafeToSpeculate = Intrinsic::getAttributes(C&: I.getContext(), id: *ScalarIntrID)
885	.hasFnAttr(Attribute::AttrKind::Speculatable);
886	else
887	SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
888	Opcode: FunctionalOpcode, Inst: &VPI, CtxI: nullptr*, AC: &AC, DT: &DT);
889	if (!SafeToSpeculate &&
890	!isKnownNonZero(V: EVL, Q: SimplifyQuery (*DL, &DT, &AC, &VPI)))
891	return false;
892
893	Value *ScalarVal =
894	ScalarIntrID
895	? Builder.CreateIntrinsic(RetTy: VecTy->getScalarType(), ID: *ScalarIntrID,
896	Args: {ScalarOp0, ScalarOp1})
897	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)(*FunctionalOpcode),
898	LHS: ScalarOp0, RHS: ScalarOp1);
899
900	replaceValue(Old&: VPI, New&: *Builder.CreateVectorSplat(EC, V: ScalarVal));
901	return true;
902	}
903
904	/// Match a vector binop or compare instruction with at least one inserted
905	/// scalar operand and convert to scalar binop/cmp followed by insertelement.
906	bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
907	CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
908	Value Ins0, Ins1;
909	if (!match(V: &I, P: m_BinOp(L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))) &&
910	!match(V: &I, P: m_Cmp(Pred, L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))))
911	return false;
912
913	// Do not convert the vector condition of a vector select into a scalar
914	// condition. That may cause problems for codegen because of differences in
915	// boolean formats and register-file transfers.
916	// TODO: Can we account for that in the cost model?
917	bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
918	if (IsCmp)
919	for (User *U : I.users())
920	if (match(V: U, P: m_Select(C: m_Specific(V: &I), L: m_Value(), R: m_Value())))
921	return false;
922
923	// Match against one or both scalar values being inserted into constant
924	// vectors:
925	// vec_op VecC0, (inselt VecC1, V1, Index)
926	// vec_op (inselt VecC0, V0, Index), VecC1
927	// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
928	// TODO: Deal with mismatched index constants and variable indexes?
929	Constant VecC0 = nullptr, VecC1 = nullptr;
930	Value V0 = nullptr, V1 = nullptr;
931	uint64_t Index0 = `0`, Index1 = `0`;
932	if (!match(V: Ins0, P: m_InsertElt(Val: m_Constant(C&: VecC0), Elt: m_Value(V&: V0),
933	Idx: m_ConstantInt(V&: Index0))) &&
934	!match(V: Ins0, P: m_Constant(C&: VecC0)))
935	return false;
936	if (!match(V: Ins1, P: m_InsertElt(Val: m_Constant(C&: VecC1), Elt: m_Value(V&: V1),
937	Idx: m_ConstantInt(V&: Index1))) &&
938	!match(V: Ins1, P: m_Constant(C&: VecC1)))
939	return false;
940
941	bool IsConst0 = !V0;
942	bool IsConst1 = !V1;
943	if (IsConst0 && IsConst1)
944	return false;
945	if (!IsConst0 && !IsConst1 && Index0 != Index1)
946	return false;
947
948	// Bail for single insertion if it is a load.
949	// TODO: Handle this once getVectorInstrCost can cost for load/stores.
950	auto *I0 = dyn_cast_or_null<Instruction>(Val: V0);
951	auto *I1 = dyn_cast_or_null<Instruction>(Val: V1);
952	if ((IsConst0 && I1 && I1->mayReadFromMemory()) \|\|
953	(IsConst1 && I0 && I0->mayReadFromMemory()))
954	return false;
955
956	uint64_t Index = IsConst0 ? Index1 : Index0;
957	Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
958	Type *VecTy = I.getType();
959	assert(VecTy->isVectorTy() &&
960	(IsConst0 \|\| IsConst1 \|\| V0->getType() == V1->getType()) &&
961	(ScalarTy->isIntegerTy() \|\| ScalarTy->isFloatingPointTy() \|\|
962	ScalarTy->isPointerTy()) &&
963	"Unexpected types for insert element into binop or cmp");
964
965	unsigned Opcode = I.getOpcode();
966	InstructionCost ScalarOpCost, VectorOpCost;
967	if (IsCmp) {
968	CmpInst::Predicate Pred = cast<CmpInst>(Val&: I).getPredicate();
969	ScalarOpCost = TTI.getCmpSelInstrCost(
970	Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred);
971	VectorOpCost = TTI.getCmpSelInstrCost(
972	Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred);
973	} else {
974	ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy);
975	VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy);
976	}
977
978	// Get cost estimate for the insert element. This cost will factor into
979	// both sequences.
980	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
981	InstructionCost InsertCost = TTI.getVectorInstrCost(
982	Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index);
983	InstructionCost OldCost =
984	(IsConst0 ? `0` : InsertCost) + (IsConst1 ? `0` : InsertCost) + VectorOpCost;
985	InstructionCost NewCost = ScalarOpCost + InsertCost +
986	(IsConst0 ? `0` : !Ins0->hasOneUse() * InsertCost) +
987	(IsConst1 ? `0` : !Ins1->hasOneUse() * InsertCost);
988
989	// We want to scalarize unless the vector variant actually has lower cost.
990	if (OldCost < NewCost \|\| !NewCost.isValid())
991	return false;
992
993	// vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
994	// inselt NewVecC, (scalar_op V0, V1), Index
995	if (IsCmp)
996	++NumScalarCmp;
997	else
998	++NumScalarBO;
999
1000	// For constant cases, extract the scalar element, this should constant fold.
1001	if (IsConst0)
1002	V0 = ConstantExpr::getExtractElement(Vec: VecC0, Idx: Builder.getInt64(C: Index));
1003	if (IsConst1)
1004	V1 = ConstantExpr::getExtractElement(Vec: VecC1, Idx: Builder.getInt64(C: Index));
1005
1006	Value *Scalar =
1007	IsCmp ? Builder.CreateCmp(Pred, LHS: V0, RHS: V1)
1008	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: V0, RHS: V1);
1009
1010	Scalar->setName(I.getName() + ".scalar");
1011
1012	// All IR flags are safe to back-propagate. There is no potential for extra
1013	// poison to be created by the scalar instruction.
1014	if (auto *ScalarInst = dyn_cast<Instruction>(Val: Scalar))
1015	ScalarInst->copyIRFlags(V: &I);
1016
1017	// Fold the vector constants in the original vectors into a new base vector.
1018	Value *NewVecC =
1019	IsCmp ? Builder.CreateCmp(Pred, LHS: VecC0, RHS: VecC1)
1020	: Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: VecC0, RHS: VecC1);
1021	Value *Insert = Builder.CreateInsertElement(Vec: NewVecC, NewElt: Scalar, Idx: Index);
1022	replaceValue(Old&: I, New&: *Insert);
1023	return true;
1024	}
1025
1026	/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1027	/// a vector into vector operations followed by extract. Note: The SLP pass
1028	/// may miss this pattern because of implementation problems.
1029	bool VectorCombine::foldExtractedCmps(Instruction &I) {
1030	// We are looking for a scalar binop of booleans.
1031	// binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1032	if (!I.isBinaryOp() \|\| !I.getType()->isIntegerTy(Bitwidth: `1`))
1033	return false;
1034
1035	// The compare predicates should match, and each compare should have a
1036	// constant operand.
1037	// TODO: Relax the one-use constraints.
1038	Value B0 = I.getOperand(i: `0`), B1 = I.getOperand(i: `1`);
1039	Instruction I0, I1;
1040	Constant C0, C1;
1041	CmpInst::Predicate P0, P1;
1042	if (!match(V: B0, P: m_OneUse(SubPattern: m_Cmp(Pred&: P0, L: m_Instruction(I&: I0), R: m_Constant(C&: C0)))) \|\|
1043	!match(V: B1, P: m_OneUse(SubPattern: m_Cmp(Pred&: P1, L: m_Instruction(I&: I1), R: m_Constant(C&: C1)))) \|\|
1044	P0 != P1)
1045	return false;
1046
1047	// The compare operands must be extracts of the same vector with constant
1048	// extract indexes.
1049	// TODO: Relax the one-use constraints.
1050	Value *X;
1051	uint64_t Index0, Index1;
1052	if (!match(V: I0, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ConstantInt(V&: Index0)))) \|\|
1053	!match(V: I1, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Specific(V: X), Idx: m_ConstantInt(V&: Index1)))))
1054	return false;
1055
1056	auto *Ext0 = cast<ExtractElementInst>(Val: I0);
1057	auto *Ext1 = cast<ExtractElementInst>(Val: I1);
1058	ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
1059	if (!ConvertToShuf)
1060	return false;
1061
1062	// The original scalar pattern is:
1063	// binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1064	CmpInst::Predicate Pred = P0;
1065	unsigned CmpOpcode = CmpInst::isFPPredicate(P: Pred) ? Instruction::FCmp
1066	: Instruction::ICmp;
1067	auto *VecTy = dyn_cast<FixedVectorType>(Val: X->getType());
1068	if (!VecTy)
1069	return false;
1070
1071	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1072	InstructionCost OldCost =
1073	TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0);
1074	OldCost += TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1);
1075	OldCost +=
1076	TTI.getCmpSelInstrCost(Opcode: CmpOpcode, ValTy: I0->getType(),
1077	CondTy: CmpInst::makeCmpResultType(opnd_type: I0->getType()), VecPred: Pred) *
1078	`2`;
1079	OldCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: I.getType());
1080
1081	// The proposed vector pattern is:
1082	// vcmp = cmp Pred X, VecC
1083	// ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1084	int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1085	int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1086	auto *CmpTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: X->getType()));
1087	InstructionCost NewCost = TTI.getCmpSelInstrCost(
1088	Opcode: CmpOpcode, ValTy: X->getType(), CondTy: CmpInst::makeCmpResultType(opnd_type: X->getType()), VecPred: Pred);
1089	SmallVector<int, `32`> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1090	ShufMask [CheapIndex] = ExpensiveIndex;
1091	NewCost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: CmpTy,
1092	Mask: ShufMask);
1093	NewCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: CmpTy);
1094	NewCost += TTI.getVectorInstrCost(I: *Ext0, Val: CmpTy, CostKind, Index: CheapIndex);
1095
1096	// Aggressively form vector ops if the cost is equal because the transform
1097	// may enable further optimization.
1098	// Codegen can reverse this transform (scalarize) if it was not profitable.
1099	if (OldCost < NewCost \|\| !NewCost.isValid())
1100	return false;
1101
1102	// Create a vector constant from the 2 scalar constants.
1103	SmallVector<Constant *, `32`> CmpC(VecTy->getNumElements(),
1104	PoisonValue::get(T: VecTy->getElementType()));
1105	CmpC [Index0] = C0;
1106	CmpC [Index1] = C1;
1107	Value *VCmp = Builder.CreateCmp(Pred, LHS: X, RHS: ConstantVector::get(V: CmpC));
1108
1109	Value *Shuf = createShiftShuffle(Vec: VCmp, OldIndex: ExpensiveIndex, NewIndex: CheapIndex, Builder);
1110	Value *VecLogic = Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val&: I).getOpcode(),
1111	LHS: VCmp, RHS: Shuf);
1112	Value *NewExt = Builder.CreateExtractElement(Vec: VecLogic, Idx: CheapIndex);
1113	replaceValue(Old&: I, New&: *NewExt);
1114	++NumVecCmpBO;
1115	return true;
1116	}
1117
1118	// Check if memory loc modified between two instrs in the same BB
1119	static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1120	BasicBlock::iterator End,
1121	const MemoryLocation &Loc, AAResults &AA) {
1122	unsigned NumScanned = `0`;
1123	return std::any_of(first: Begin, last: End, pred: [&](const Instruction &Instr) {
1124	return isModSet(MRI: AA.getModRefInfo(I: &Instr, OptLoc: Loc)) \|\|
1125	++NumScanned > MaxInstrsToScan;
1126	});
1127	}
1128
1129	namespace {
1130	/// Helper class to indicate whether a vector index can be safely scalarized and
1131	/// if a freeze needs to be inserted.
1132	class ScalarizationResult {
1133	enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1134
1135	StatusTy Status;
1136	Value *ToFreeze;
1137
1138	ScalarizationResult(StatusTy Status, Value ToFreeze = nullptr*)
1139	: Status(Status), ToFreeze(ToFreeze) {}
1140
1141	public:
1142	ScalarizationResult(const ScalarizationResult &Other) = default;
1143	~ScalarizationResult() {
1144	assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1145	}
1146
1147	static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1148	static ScalarizationResult safe() { return {StatusTy::Safe}; }
1149	static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1150	return {StatusTy::SafeWithFreeze, ToFreeze};
1151	}
1152
1153	/// Returns true if the index can be scalarize without requiring a freeze.
1154	bool isSafe() const { return Status == StatusTy::Safe; }
1155	/// Returns true if the index cannot be scalarized.
1156	bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1157	/// Returns true if the index can be scalarize, but requires inserting a
1158	/// freeze.
1159	bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1160
1161	/// Reset the state of Unsafe and clear ToFreze if set.
1162	void discard() {
1163	ToFreeze = nullptr;
1164	Status = StatusTy::Unsafe;
1165	}
1166
1167	/// Freeze the ToFreeze and update the use in \p User to use it.
1168	void freeze(IRBuilder<> &Builder, Instruction &UserI) {
1169	assert(isSafeWithFreeze() &&
1170	"should only be used when freezing is required");
1171	assert(is_contained(ToFreeze->users(), &UserI) &&
1172	"UserI must be a user of ToFreeze");
1173	IRBuilder<>::InsertPointGuard Guard(Builder);
1174	Builder.SetInsertPoint(cast<Instruction>(Val: &UserI));
1175	Value *Frozen =
1176	Builder.CreateFreeze(V: ToFreeze, Name: ToFreeze->getName() + ".frozen");
1177	for (Use &U : make_early_inc_range(Range: (UserI.operands())))
1178	if (U.get() == ToFreeze)
1179	U.set(Frozen);
1180
1181	ToFreeze = nullptr;
1182	}
1183	};
1184	} // namespace
1185
1186	/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1187	/// Idx. \p Idx must access a valid vector element.
1188	static ScalarizationResult canScalarizeAccess(VectorType VecTy, Value Idx,
1189	Instruction *CtxI,
1190	AssumptionCache &AC,
1191	const DominatorTree &DT) {
1192	// We do checks for both fixed vector types and scalable vector types.
1193	// This is the number of elements of fixed vector types,
1194	// or the minimum number of elements of scalable vector types.
1195	uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1196
1197	if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) {
1198	if (C->getValue().ult(RHS: NumElements))
1199	return ScalarizationResult::safe();
1200	return ScalarizationResult::unsafe();
1201	}
1202
1203	unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1204	APInt Zero(IntWidth, `0`);
1205	APInt MaxElts(IntWidth, NumElements);
1206	ConstantRange ValidIndices(Zero, MaxElts);
1207	ConstantRange IdxRange(IntWidth, true);
1208
1209	if (isGuaranteedNotToBePoison(V: Idx, AC: &AC)) {
1210	if (ValidIndices.contains(CR: computeConstantRange(V: Idx, / ForSigned / false,
1211	UseInstrInfo: true, AC: &AC, CtxI, DT: &DT)))
1212	return ScalarizationResult::safe();
1213	return ScalarizationResult::unsafe();
1214	}
1215
1216	// If the index may be poison, check if we can insert a freeze before the
1217	// range of the index is restricted.
1218	Value *IdxBase;
1219	ConstantInt *CI;
1220	if (match(V: Idx, P: m_And(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) {
1221	IdxRange = IdxRange.binaryAnd(Other: CI->getValue());
1222	} else if (match(V: Idx, P: m_URem(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) {
1223	IdxRange = IdxRange.urem(Other: CI->getValue());
1224	}
1225
1226	if (ValidIndices.contains(CR: IdxRange))
1227	return ScalarizationResult::safeWithFreeze(ToFreeze: IdxBase);
1228	return ScalarizationResult::unsafe();
1229	}
1230
1231	/// The memory operation on a vector of \p ScalarType had alignment of
1232	/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1233	/// alignment that will be valid for the memory operation on a single scalar
1234	/// element of the same type with index \p Idx.
1235	static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1236	Type ScalarType, Value Idx,
1237	const DataLayout &DL) {
1238	if (auto *C = dyn_cast<ConstantInt>(Val: Idx))
1239	return commonAlignment(A: VectorAlignment,
1240	Offset: C->getZExtValue() * DL.getTypeStoreSize(Ty: ScalarType));
1241	return commonAlignment(A: VectorAlignment, Offset: DL.getTypeStoreSize(Ty: ScalarType));
1242	}
1243
1244	// Combine patterns like:
1245	// %0 = load <4 x i32>, <4 x i32> %a*
1246	// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1247	// store <4 x i32> %1, <4 x i32> %a*
1248	// to:
1249	// %0 = bitcast <4 x i32>* %a to i32*
1250	// %1 = getelementptr inbounds i32, i32 %0, i64 0, i64 1*
1251	// store i32 %b, i32 %1*
1252	bool VectorCombine::foldSingleElementStore(Instruction &I) {
1253	auto *SI = cast<StoreInst>(Val: &I);
1254	if (!SI->isSimple() \|\| !isa<VectorType>(Val: SI->getValueOperand()->getType()))
1255	return false;
1256
1257	// TODO: Combine more complicated patterns (multiple insert) by referencing
1258	// TargetTransformInfo.
1259	Instruction *Source;
1260	Value *NewElement;
1261	Value *Idx;
1262	if (!match(V: SI->getValueOperand(),
1263	P: m_InsertElt(Val: m_Instruction(I&: Source), Elt: m_Value(V&: NewElement),
1264	Idx: m_Value(V&: Idx))))
1265	return false;
1266
1267	if (auto *Load = dyn_cast<LoadInst>(Val: Source)) {
1268	auto VecTy = cast<VectorType>(Val: SI->getValueOperand()->getType());
1269	Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1270	// Don't optimize for atomic/volatile load or store. Ensure memory is not
1271	// modified between, vector type matches store size, and index is inbounds.
1272	if (!Load->isSimple() \|\| Load->getParent() != SI->getParent() \|\|
1273	!DL->typeSizeEqualsStoreSize(Ty: Load->getType()->getScalarType()) \|\|
1274	SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1275	return false;
1276
1277	auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, CtxI: Load, AC, DT);
1278	if (ScalarizableIdx.isUnsafe() \|\|
1279	isMemModifiedBetween(Begin: Load->getIterator(), End: SI->getIterator(),
1280	Loc: MemoryLocation::get(SI), AA))
1281	return false;
1282
1283	if (ScalarizableIdx.isSafeWithFreeze())
1284	ScalarizableIdx.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx));
1285	Value *GEP = Builder.CreateInBoundsGEP(
1286	Ty: SI->getValueOperand()->getType(), Ptr: SI->getPointerOperand(),
1287	IdxList: {ConstantInt::get(Ty: Idx->getType(), V: `0`), Idx});
1288	StoreInst *NSI = Builder.CreateStore(Val: NewElement, Ptr: GEP);
1289	NSI->copyMetadata(SrcInst: *SI);
1290	Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1291	VectorAlignment: std::max(a: SI->getAlign(), b: Load->getAlign()), ScalarType: NewElement->getType(), Idx,
1292	DL: *DL);
1293	NSI->setAlignment(ScalarOpAlignment);
1294	replaceValue(Old&: I, New&: *NSI);
1295	eraseInstruction(I);
1296	return true;
1297	}
1298
1299	return false;
1300	}
1301
1302	/// Try to scalarize vector loads feeding extractelement instructions.
1303	bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1304	Value *Ptr;
1305	if (!match(V: &I, P: m_Load(Op: m_Value(V&: Ptr))))
1306	return false;
1307
1308	auto *VecTy = cast<VectorType>(Val: I.getType());
1309	auto *LI = cast<LoadInst>(Val: &I);
1310	if (LI->isVolatile() \|\| !DL->typeSizeEqualsStoreSize(Ty: VecTy->getScalarType()))
1311	return false;
1312
1313	InstructionCost OriginalCost =
1314	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: LI->getAlign(),
1315	AddressSpace: LI->getPointerAddressSpace());
1316	InstructionCost ScalarizedCost = `0`;
1317
1318	Instruction *LastCheckedInst = LI;
1319	unsigned NumInstChecked = `0`;
1320	DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1321	auto FailureGuard = make_scope_exit(F: [&]() {
1322	// If the transform is aborted, discard the ScalarizationResults.
1323	for (auto &Pair : NeedFreeze)
1324	Pair.second.discard();
1325	});
1326
1327	// Check if all users of the load are extracts with no memory modifications
1328	// between the load and the extract. Compute the cost of both the original
1329	// code and the scalarized version.
1330	for (User *U : LI->users()) {
1331	auto *UI = dyn_cast<ExtractElementInst>(Val: U);
1332	if (!UI \|\| UI->getParent() != LI->getParent())
1333	return false;
1334
1335	// Check if any instruction between the load and the extract may modify
1336	// memory.
1337	if (LastCheckedInst->comesBefore(Other: UI)) {
1338	for (Instruction &I :
1339	make_range(x: std::next(x: LI->getIterator()), y: UI->getIterator())) {
1340	// Bail out if we reached the check limit or the instruction may write
1341	// to memory.
1342	if (NumInstChecked == MaxInstrsToScan \|\| I.mayWriteToMemory())
1343	return false;
1344	NumInstChecked++;
1345	}
1346	LastCheckedInst = UI;
1347	}
1348
1349	auto ScalarIdx = canScalarizeAccess(VecTy, Idx: UI->getOperand(i_nocapture: `1`), CtxI: &I, AC, DT);
1350	if (ScalarIdx.isUnsafe())
1351	return false;
1352	if (ScalarIdx.isSafeWithFreeze()) {
1353	NeedFreeze.try_emplace(Key: UI, Args&: ScalarIdx);
1354	ScalarIdx.discard();
1355	}
1356
1357	auto *Index = dyn_cast<ConstantInt>(Val: UI->getOperand(i_nocapture: `1`));
1358	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1359	OriginalCost +=
1360	TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
1361	Index: Index ? Index->getZExtValue() : -`1`);
1362	ScalarizedCost +=
1363	TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy->getElementType(),
1364	Alignment: Align (`1`), AddressSpace: LI->getPointerAddressSpace());
1365	ScalarizedCost += TTI.getAddressComputationCost(Ty: VecTy->getElementType());
1366	}
1367
1368	if (ScalarizedCost >= OriginalCost)
1369	return false;
1370
1371	// Replace extracts with narrow scalar loads.
1372	for (User *U : LI->users()) {
1373	auto *EI = cast<ExtractElementInst>(Val: U);
1374	Value *Idx = EI->getOperand(i_nocapture: `1`);
1375
1376	// Insert 'freeze' for poison indexes.
1377	auto It = NeedFreeze.find(Val: EI);
1378	if (It != NeedFreeze.end())
1379	It ->second.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx));
1380
1381	Builder.SetInsertPoint(EI);
1382	Value *GEP =
1383	Builder.CreateInBoundsGEP(Ty: VecTy, Ptr, IdxList: {Builder.getInt32(C: `0`), Idx});
1384	auto *NewLoad = cast<LoadInst>(Val: Builder.CreateLoad(
1385	Ty: VecTy->getElementType(), Ptr: GEP, Name: EI->getName() + ".scalar"));
1386
1387	Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1388	VectorAlignment: LI->getAlign(), ScalarType: VecTy->getElementType(), Idx, DL: *DL);
1389	NewLoad->setAlignment(ScalarOpAlignment);
1390
1391	replaceValue(Old&: EI, New&: NewLoad);
1392	}
1393
1394	FailureGuard.release();
1395	return true;
1396	}
1397
1398	/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
1399	bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
1400	BinaryOperator B0, B1;
1401	ArrayRef<int> OldMask;
1402	if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_BinOp(I&: B0)), v2: m_OneUse(SubPattern: m_BinOp(I&: B1)),
1403	mask: m_Mask (OldMask))))
1404	return false;
1405
1406	// Don't introduce poison into div/rem.
1407	if (any_of(Range&: OldMask, P: [](int M) { return M == PoisonMaskElem; }) &&
1408	B0->isIntDivRem())
1409	return false;
1410
1411	// TODO: Add support for addlike etc.
1412	Instruction::BinaryOps Opcode = B0->getOpcode();
1413	if (Opcode != B1->getOpcode())
1414	return false;
1415
1416	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1417	auto *BinOpTy = dyn_cast<FixedVectorType>(Val: B0->getType());
1418	if (!ShuffleDstTy \|\| !BinOpTy)
1419	return false;
1420
1421	unsigned NumSrcElts = BinOpTy->getNumElements();
1422
1423	// If we have something like "add X, Y" and "add Z, X", swap ops to match.
1424	Value X = B0->getOperand(i_nocapture: `0`), Y = B0->getOperand(i_nocapture: `1`);
1425	Value Z = B1->getOperand(i_nocapture: `0`), W = B1->getOperand(i_nocapture: `1`);
1426	if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W &&
1427	(X == W \|\| Y == Z))
1428	std::swap(a&: X, b&: Y);
1429
1430	auto ConvertToUnary = [NumSrcElts](int &M) {
1431	if (M >= (int)NumSrcElts)
1432	M -= NumSrcElts;
1433	};
1434
1435	SmallVector<int> NewMask0(OldMask.begin(), OldMask.end());
1436	TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
1437	if (X == Z) {
1438	llvm::for_each(Range&: NewMask0, F: ConvertToUnary);
1439	SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
1440	Z = PoisonValue::get(T: BinOpTy);
1441	}
1442
1443	SmallVector<int> NewMask1(OldMask.begin(), OldMask.end());
1444	TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
1445	if (Y == W) {
1446	llvm::for_each(Range&: NewMask1, F: ConvertToUnary);
1447	SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
1448	W = PoisonValue::get(T: BinOpTy);
1449	}
1450
1451	// Try to replace a binop with a shuffle if the shuffle is not costly.
1452	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1453
1454	InstructionCost OldCost =
1455	TTI.getArithmeticInstrCost(Opcode: B0->getOpcode(), Ty: BinOpTy, CostKind) +
1456	TTI.getArithmeticInstrCost(Opcode: B1->getOpcode(), Ty: BinOpTy, CostKind) +
1457	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: BinOpTy,
1458	Mask: OldMask, CostKind, Index: `0`, SubTp: nullptr, Args: {B0, B1}, CxtI: &I);
1459
1460	InstructionCost NewCost =
1461	TTI.getShuffleCost(Kind: SK0, Tp: BinOpTy, Mask: NewMask0, CostKind, Index: `0`, SubTp: nullptr, Args: {X, Z}) +
1462	TTI.getShuffleCost(Kind: SK1, Tp: BinOpTy, Mask: NewMask1, CostKind, Index: `0`, SubTp: nullptr, Args: {Y, W}) +
1463	TTI.getArithmeticInstrCost(Opcode, Ty: ShuffleDstTy, CostKind);
1464
1465	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
1466	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1467	<< "\n");
1468	if (NewCost >= OldCost)
1469	return false;
1470
1471	Value *Shuf0 = Builder.CreateShuffleVector(V1: X, V2: Z, Mask: NewMask0);
1472	Value *Shuf1 = Builder.CreateShuffleVector(V1: Y, V2: W, Mask: NewMask1);
1473	Value *NewBO = Builder.CreateBinOp(Opc: Opcode, LHS: Shuf0, RHS: Shuf1);
1474
1475	// Intersect flags from the old binops.
1476	if (auto *NewInst = dyn_cast<Instruction>(Val: NewBO)) {
1477	NewInst->copyIRFlags(V: B0);
1478	NewInst->andIRFlags(V: B1);
1479	}
1480
1481	Worklist.pushValue(V: Shuf0);
1482	Worklist.pushValue(V: Shuf1);
1483	replaceValue(Old&: I, New&: *NewBO);
1484	return true;
1485	}
1486
1487	/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
1488	/// into "castop (shuffle)".
1489	bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
1490	Value V0, V1;
1491	ArrayRef<int> OldMask;
1492	if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_Value(V&: V0)), v2: m_OneUse(SubPattern: m_Value(V&: V1)),
1493	mask: m_Mask (OldMask))))
1494	return false;
1495
1496	auto *C0 = dyn_cast<CastInst>(Val: V0);
1497	auto *C1 = dyn_cast<CastInst>(Val: V1);
1498	if (!C0 \|\| !C1)
1499	return false;
1500
1501	Instruction::CastOps Opcode = C0->getOpcode();
1502	if (C0->getSrcTy() != C1->getSrcTy())
1503	return false;
1504
1505	// Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
1506	if (Opcode != C1->getOpcode()) {
1507	if (match(V: C0, P: m_SExtLike(Op: m_Value())) && match(V: C1, P: m_SExtLike(Op: m_Value())))
1508	Opcode = Instruction::SExt;
1509	else
1510	return false;
1511	}
1512
1513	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1514	auto *CastDstTy = dyn_cast<FixedVectorType>(Val: C0->getDestTy());
1515	auto *CastSrcTy = dyn_cast<FixedVectorType>(Val: C0->getSrcTy());
1516	if (!ShuffleDstTy \|\| !CastDstTy \|\| !CastSrcTy)
1517	return false;
1518
1519	unsigned NumSrcElts = CastSrcTy->getNumElements();
1520	unsigned NumDstElts = CastDstTy->getNumElements();
1521	assert((NumDstElts == NumSrcElts \|\| Opcode == Instruction::BitCast) &&
1522	"Only bitcasts expected to alter src/dst element counts");
1523
1524	// Check for bitcasting of unscalable vector types.
1525	// e.g. <32 x i40> -> <40 x i32>
1526	if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != `0` &&
1527	(NumDstElts % NumSrcElts) != `0`)
1528	return false;
1529
1530	SmallVector<int, `16`> NewMask;
1531	if (NumSrcElts >= NumDstElts) {
1532	// The bitcast is from wide to narrow/equal elements. The shuffle mask can
1533	// always be expanded to the equivalent form choosing narrower elements.
1534	assert(NumSrcElts % NumDstElts == `0` && "Unexpected shuffle mask");
1535	unsigned ScaleFactor = NumSrcElts / NumDstElts;
1536	narrowShuffleMaskElts(Scale: ScaleFactor, Mask: OldMask, ScaledMask&: NewMask);
1537	} else {
1538	// The bitcast is from narrow elements to wide elements. The shuffle mask
1539	// must choose consecutive elements to allow casting first.
1540	assert(NumDstElts % NumSrcElts == `0` && "Unexpected shuffle mask");
1541	unsigned ScaleFactor = NumDstElts / NumSrcElts;
1542	if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask: OldMask, ScaledMask&: NewMask))
1543	return false;
1544	}
1545
1546	auto *NewShuffleDstTy =
1547	FixedVectorType::get(ElementType: CastSrcTy->getScalarType(), NumElts: NewMask.size());
1548
1549	// Try to replace a castop with a shuffle if the shuffle is not costly.
1550	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1551
1552	InstructionCost OldCost =
1553	TTI.getCastInstrCost(Opcode: C0->getOpcode(), Dst: CastDstTy, Src: CastSrcTy,
1554	CCH: TTI::CastContextHint::None, CostKind) +
1555	TTI.getCastInstrCost(Opcode: C1->getOpcode(), Dst: CastDstTy, Src: CastSrcTy,
1556	CCH: TTI::CastContextHint::None, CostKind);
1557	OldCost +=
1558	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: CastDstTy,
1559	Mask: OldMask, CostKind, Index: `0`, SubTp: nullptr, Args: std::nullopt, CxtI: &I);
1560
1561	InstructionCost NewCost = TTI.getShuffleCost(
1562	Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: CastSrcTy, Mask: NewMask, CostKind);
1563	NewCost += TTI.getCastInstrCost(Opcode, Dst: ShuffleDstTy, Src: NewShuffleDstTy,
1564	CCH: TTI::CastContextHint::None, CostKind);
1565
1566	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
1567	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1568	<< "\n");
1569	if (NewCost > OldCost)
1570	return false;
1571
1572	Value *Shuf = Builder.CreateShuffleVector(V1: C0->getOperand(i_nocapture: `0`),
1573	V2: C1->getOperand(i_nocapture: `0`), Mask: NewMask);
1574	Value *Cast = Builder.CreateCast(Op: Opcode, V: Shuf, DestTy: ShuffleDstTy);
1575
1576	// Intersect flags from the old casts.
1577	if (auto *NewInst = dyn_cast<Instruction>(Val: Cast)) {
1578	NewInst->copyIRFlags(V: C0);
1579	NewInst->andIRFlags(V: C1);
1580	}
1581
1582	Worklist.pushValue(V: Shuf);
1583	replaceValue(Old&: I, New&: *Cast);
1584	return true;
1585	}
1586
1587	/// Try to convert "shuffle (shuffle x, undef), (shuffle y, undef)"
1588	/// into "shuffle x, y".
1589	bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
1590	Value V0, V1;
1591	UndefValue U0, U1;
1592	ArrayRef<int> OuterMask, InnerMask0, InnerMask1;
1593	if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V&: V0), v2: m_UndefValue(U&: U0),
1594	mask: m_Mask (InnerMask0))),
1595	v2: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V&: V1), v2: m_UndefValue(U&: U1),
1596	mask: m_Mask (InnerMask1))),
1597	mask: m_Mask (OuterMask))))
1598	return false;
1599
1600	auto *ShufI0 = dyn_cast<Instruction>(Val: I.getOperand(i: `0`));
1601	auto *ShufI1 = dyn_cast<Instruction>(Val: I.getOperand(i: `1`));
1602	auto *ShuffleDstTy = dyn_cast<FixedVectorType>(Val: I.getType());
1603	auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(Val: V0->getType());
1604	auto *ShuffleImmTy = dyn_cast<FixedVectorType>(Val: I.getOperand(i: `0`)->getType());
1605	if (!ShuffleDstTy \|\| !ShuffleSrcTy \|\| !ShuffleImmTy \|\|
1606	V0->getType() != V1->getType())
1607	return false;
1608
1609	unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
1610	unsigned NumImmElts = ShuffleImmTy->getNumElements();
1611
1612	// Bail if either inner masks reference a RHS undef arg.
1613	if ((!isa<PoisonValue>(Val: U0) &&
1614	any_of(Range&: InnerMask0, P: [&](int M) { return M >= (int)NumSrcElts; })) \|\|
1615	(!isa<PoisonValue>(Val: U1) &&
1616	any_of(Range&: InnerMask1, P: [&](int M) { return M >= (int)NumSrcElts; })))
1617	return false;
1618
1619	// Merge shuffles - replace index to the RHS poison arg with PoisonMaskElem,
1620	SmallVector<int, `16`> NewMask(OuterMask.begin(), OuterMask.end());
1621	for (int &M : NewMask) {
1622	if (`0` <= M && M < (int)NumImmElts) {
1623	M = (InnerMask0 [M] >= (int)NumSrcElts) ? PoisonMaskElem : InnerMask0 [M];
1624	} else if (M >= (int)NumImmElts) {
1625	if (InnerMask1 [M - NumImmElts] >= (int)NumSrcElts)
1626	M = PoisonMaskElem;
1627	else
1628	M = InnerMask1 [M - NumImmElts] + (V0 == V1 ? `0` : NumSrcElts);
1629	}
1630	}
1631
1632	// Have we folded to an Identity shuffle?
1633	if (ShuffleVectorInst::isIdentityMask(Mask: NewMask, NumSrcElts)) {
1634	replaceValue(Old&: I, New&: *V0);
1635	return true;
1636	}
1637
1638	// Try to merge the shuffles if the new shuffle is not costly.
1639	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1640
1641	InstructionCost OldCost =
1642	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleSrcTy,
1643	Mask: InnerMask0, CostKind, Index: `0`, SubTp: nullptr, Args: {V0, U0}, CxtI: ShufI0) +
1644	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleSrcTy,
1645	Mask: InnerMask1, CostKind, Index: `0`, SubTp: nullptr, Args: {V1, U1}, CxtI: ShufI1) +
1646	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: ShuffleImmTy,
1647	Mask: OuterMask, CostKind, Index: `0`, SubTp: nullptr, Args: {ShufI0, ShufI1}, CxtI: &I);
1648
1649	InstructionCost NewCost =
1650	TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: ShuffleSrcTy,
1651	Mask: NewMask, CostKind, Index: `0`, SubTp: nullptr, Args: {V0, V1});
1652
1653	LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
1654	<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1655	<< "\n");
1656	if (NewCost > OldCost)
1657	return false;
1658
1659	// Clear unused sources to poison.
1660	if (none_of(Range&: NewMask, P: [&](int M) { return `0` <= M && M < (int)NumSrcElts; }))
1661	V0 = PoisonValue::get(T: ShuffleSrcTy);
1662	if (none_of(Range&: NewMask, P: [&](int M) { return (int)NumSrcElts <= M; }))
1663	V1 = PoisonValue::get(T: ShuffleSrcTy);
1664
1665	Value *Shuf = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask: NewMask);
1666	replaceValue(Old&: I, New&: *Shuf);
1667	return true;
1668	}
1669
1670	/// Given a commutative reduction, the order of the input lanes does not alter
1671	/// the results. We can use this to remove certain shuffles feeding the
1672	/// reduction, removing the need to shuffle at all.
1673	bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
1674	auto *II = dyn_cast<IntrinsicInst>(Val: &I);
1675	if (!II)
1676	return false;
1677	switch (II->getIntrinsicID()) {
1678	case Intrinsic::vector_reduce_add:
1679	case Intrinsic::vector_reduce_mul:
1680	case Intrinsic::vector_reduce_and:
1681	case Intrinsic::vector_reduce_or:
1682	case Intrinsic::vector_reduce_xor:
1683	case Intrinsic::vector_reduce_smin:
1684	case Intrinsic::vector_reduce_smax:
1685	case Intrinsic::vector_reduce_umin:
1686	case Intrinsic::vector_reduce_umax:
1687	break;
1688	default:
1689	return false;
1690	}
1691
1692	// Find all the inputs when looking through operations that do not alter the
1693	// lane order (binops, for example). Currently we look for a single shuffle,
1694	// and can ignore splat values.
1695	std::queue<Value *> Worklist;
1696	SmallPtrSet<Value *, `4`> Visited;
1697	ShuffleVectorInst Shuffle = nullptr*;
1698	if (auto *Op = dyn_cast<Instruction>(Val: I.getOperand(i: `0`)))
1699	Worklist.push(x: Op);
1700
1701	while (!Worklist.empty()) {
1702	Value *CV = Worklist.front();
1703	Worklist.pop();
1704	if (Visited.contains(Ptr: CV))
1705	continue;
1706
1707	// Splats don't change the order, so can be safely ignored.
1708	if (isSplatValue(V: CV))
1709	continue;
1710
1711	Visited.insert(Ptr: CV);
1712
1713	if (auto *CI = dyn_cast<Instruction>(Val: CV)) {
1714	if (CI->isBinaryOp()) {
1715	for (auto *Op : CI->operand_values())
1716	Worklist.push(x: Op);
1717	continue;
1718	} else if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: CI)) {
1719	if (Shuffle && Shuffle != SV)
1720	return false;
1721	Shuffle = SV;
1722	continue;
1723	}
1724	}
1725
1726	// Anything else is currently an unknown node.
1727	return false;
1728	}
1729
1730	if (!Shuffle)
1731	return false;
1732
1733	// Check all uses of the binary ops and shuffles are also included in the
1734	// lane-invariant operations (Visited should be the list of lanewise
1735	// instructions, including the shuffle that we found).
1736	for (auto *V : Visited)
1737	for (auto *U : V->users())
1738	if (!Visited.contains(Ptr: U) && U != &I)
1739	return false;
1740
1741	FixedVectorType *VecType =
1742	dyn_cast<FixedVectorType>(Val: II->getOperand(i_nocapture: `0`)->getType());
1743	if (!VecType)
1744	return false;
1745	FixedVectorType *ShuffleInputType =
1746	dyn_cast<FixedVectorType>(Val: Shuffle->getOperand(i_nocapture: `0`)->getType());
1747	if (!ShuffleInputType)
1748	return false;
1749	unsigned NumInputElts = ShuffleInputType->getNumElements();
1750
1751	// Find the mask from sorting the lanes into order. This is most likely to
1752	// become a identity or concat mask. Undef elements are pushed to the end.
1753	SmallVector<int> ConcatMask;
1754	Shuffle->getShuffleMask(Result&: ConcatMask);
1755	sort(C&: ConcatMask, Comp: [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
1756	// In the case of a truncating shuffle it's possible for the mask
1757	// to have an index greater than the size of the resulting vector.
1758	// This requires special handling.
1759	bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts;
1760	bool UsesSecondVec =
1761	any_of(Range&: ConcatMask, P: [&](int M) { return M >= (int)NumInputElts; });
1762
1763	FixedVectorType *VecTyForCost =
1764	(UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType;
1765	InstructionCost OldCost = TTI.getShuffleCost(
1766	Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
1767	Tp: VecTyForCost, Mask: Shuffle->getShuffleMask());
1768	InstructionCost NewCost = TTI.getShuffleCost(
1769	Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc,
1770	Tp: VecTyForCost, Mask: ConcatMask);
1771
1772	LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
1773	<< "\n");
1774	LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
1775	<< "\n");
1776	if (NewCost < OldCost) {
1777	Builder.SetInsertPoint(Shuffle);
1778	Value *NewShuffle = Builder.CreateShuffleVector(
1779	V1: Shuffle->getOperand(i_nocapture: `0`), V2: Shuffle->getOperand(i_nocapture: `1`), Mask: ConcatMask);
1780	LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
1781	replaceValue(Old&: Shuffle, New&: NewShuffle);
1782	}
1783
1784	// See if we can re-use foldSelectShuffle, getting it to reduce the size of
1785	// the shuffle into a nicer order, as it can ignore the order of the shuffles.
1786	return foldSelectShuffle(I&: Shuffle, FromReduction: true*);
1787	}
1788
1789	/// Determine if its more efficient to fold:
1790	/// reduce(trunc(x)) -> trunc(reduce(x)).
1791	bool VectorCombine::foldTruncFromReductions(Instruction &I) {
1792	auto *II = dyn_cast<IntrinsicInst>(Val: &I);
1793	if (!II)
1794	return false;
1795
1796	Intrinsic::ID IID = II->getIntrinsicID();
1797	switch (IID) {
1798	case Intrinsic::vector_reduce_add:
1799	case Intrinsic::vector_reduce_mul:
1800	case Intrinsic::vector_reduce_and:
1801	case Intrinsic::vector_reduce_or:
1802	case Intrinsic::vector_reduce_xor:
1803	break;
1804	default:
1805	return false;
1806	}
1807
1808	unsigned ReductionOpc = getArithmeticReductionInstruction(RdxID: IID);
1809	Value *ReductionSrc = I.getOperand(i: `0`);
1810
1811	Value *TruncSrc;
1812	if (!match(V: ReductionSrc, P: m_OneUse(SubPattern: m_Trunc(Op: m_Value(V&: TruncSrc)))))
1813	return false;
1814
1815	auto *Trunc = cast<CastInst>(Val: ReductionSrc);
1816	auto *TruncSrcTy = cast<VectorType>(Val: TruncSrc->getType());
1817	auto *ReductionSrcTy = cast<VectorType>(Val: ReductionSrc->getType());
1818	Type *ResultTy = I.getType();
1819
1820	TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1821	InstructionCost OldCost =
1822	TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: ReductionSrcTy, Src: TruncSrcTy,
1823	CCH: TTI::CastContextHint::None, CostKind, I: Trunc) +
1824	TTI.getArithmeticReductionCost(Opcode: ReductionOpc, Ty: ReductionSrcTy, FMF: std::nullopt,
1825	CostKind);
1826	InstructionCost NewCost =
1827	TTI.getArithmeticReductionCost(Opcode: ReductionOpc, Ty: TruncSrcTy, FMF: std::nullopt,
1828	CostKind) +
1829	TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: ResultTy,
1830	Src: ReductionSrcTy->getScalarType(),
1831	CCH: TTI::CastContextHint::None, CostKind);
1832
1833	if (OldCost <= NewCost \|\| !NewCost.isValid())
1834	return false;
1835
1836	Value *NewReduction = Builder.CreateIntrinsic(
1837	RetTy: TruncSrcTy->getScalarType(), ID: II->getIntrinsicID(), Args: {TruncSrc});
1838	Value *NewTruncation = Builder.CreateTrunc(V: NewReduction, DestTy: ResultTy);
1839	replaceValue(Old&: I, New&: *NewTruncation);
1840	return true;
1841	}
1842
1843	/// This method looks for groups of shuffles acting on binops, of the form:
1844	/// %x = shuffle ...
1845	/// %y = shuffle ...
1846	/// %a = binop %x, %y
1847	/// %b = binop %x, %y
1848	/// shuffle %a, %b, selectmask
1849	/// We may, especially if the shuffle is wider than legal, be able to convert
1850	/// the shuffle to a form where only parts of a and b need to be computed. On
1851	/// architectures with no obvious "select" shuffle, this can reduce the total
1852	/// number of operations if the target reports them as cheaper.
1853	bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
1854	auto *SVI = cast<ShuffleVectorInst>(Val: &I);
1855	auto *VT = cast<FixedVectorType>(Val: I.getType());
1856	auto *Op0 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: `0`));
1857	auto *Op1 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: `1`));
1858	if (!Op0 \|\| !Op1 \|\| Op0 == Op1 \|\| !Op0->isBinaryOp() \|\| !Op1->isBinaryOp() \|\|
1859	VT != Op0->getType())
1860	return false;
1861
1862	auto *SVI0A = dyn_cast<Instruction>(Val: Op0->getOperand(i: `0`));
1863	auto *SVI0B = dyn_cast<Instruction>(Val: Op0->getOperand(i: `1`));
1864	auto *SVI1A = dyn_cast<Instruction>(Val: Op1->getOperand(i: `0`));
1865	auto *SVI1B = dyn_cast<Instruction>(Val: Op1->getOperand(i: `1`));
1866	SmallPtrSet<Instruction *, `4`> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
1867	auto checkSVNonOpUses = [&](Instruction *I) {
1868	if (!I \|\| I->getOperand(i: `0`)->getType() != VT)
1869	return true;
1870	return any_of(Range: I->users(), P: [&](User *U) {
1871	return U != Op0 && U != Op1 &&
1872	!(isa<ShuffleVectorInst>(Val: U) &&
1873	(InputShuffles.contains(Ptr: cast<Instruction>(Val: U)) \|\|
1874	isInstructionTriviallyDead(I: cast<Instruction>(Val: U))));
1875	});
1876	};
1877	if (checkSVNonOpUses (SVI0A) \|\| checkSVNonOpUses (SVI0B) \|\|
1878	checkSVNonOpUses (SVI1A) \|\| checkSVNonOpUses (SVI1B))
1879	return false;
1880
1881	// Collect all the uses that are shuffles that we can transform together. We
1882	// may not have a single shuffle, but a group that can all be transformed
1883	// together profitably.
1884	SmallVector<ShuffleVectorInst *> Shuffles;
1885	auto collectShuffles = [&](Instruction *I) {
1886	for (auto *U : I->users()) {
1887	auto *SV = dyn_cast<ShuffleVectorInst>(Val: U);
1888	if (!SV \|\| SV->getType() != VT)
1889	return false;
1890	if ((SV->getOperand(i_nocapture: `0`) != Op0 && SV->getOperand(i_nocapture: `0`) != Op1) \|\|
1891	(SV->getOperand(i_nocapture: `1`) != Op0 && SV->getOperand(i_nocapture: `1`) != Op1))
1892	return false;
1893	if (!llvm::is_contained(Range&: Shuffles, Element: SV))
1894	Shuffles.push_back(Elt: SV);
1895	}
1896	return true;
1897	};
1898	if (!collectShuffles (Op0) \|\| !collectShuffles (Op1))
1899	return false;
1900	// From a reduction, we need to be processing a single shuffle, otherwise the
1901	// other uses will not be lane-invariant.
1902	if (FromReduction && Shuffles.size() > `1`)
1903	return false;
1904
1905	// Add any shuffle uses for the shuffles we have found, to include them in our
1906	// cost calculations.
1907	if (!FromReduction) {
1908	for (ShuffleVectorInst *SV : Shuffles) {
1909	for (auto *U : SV->users()) {
1910	ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(Val: U);
1911	if (SSV && isa<UndefValue>(Val: SSV->getOperand(i_nocapture: `1`)) && SSV->getType() == VT)
1912	Shuffles.push_back(Elt: SSV);
1913	}
1914	}
1915	}
1916
1917	// For each of the output shuffles, we try to sort all the first vector
1918	// elements to the beginning, followed by the second array elements at the
1919	// end. If the binops are legalized to smaller vectors, this may reduce total
1920	// number of binops. We compute the ReconstructMask mask needed to convert
1921	// back to the original lane order.
1922	SmallVector<std::pair<int, int>> V1, V2;
1923	SmallVector<SmallVector<int>> OrigReconstructMasks;
1924	int MaxV1Elt = `0`, MaxV2Elt = `0`;
1925	unsigned NumElts = VT->getNumElements();
1926	for (ShuffleVectorInst *SVN : Shuffles) {
1927	SmallVector<int> Mask;
1928	SVN->getShuffleMask(Result&: Mask);
1929
1930	// Check the operands are the same as the original, or reversed (in which
1931	// case we need to commute the mask).
1932	Value *SVOp0 = SVN->getOperand(i_nocapture: `0`);
1933	Value *SVOp1 = SVN->getOperand(i_nocapture: `1`);
1934	if (isa<UndefValue>(Val: SVOp1)) {
1935	auto *SSV = cast<ShuffleVectorInst>(Val: SVOp0);
1936	SVOp0 = SSV->getOperand(i_nocapture: `0`);
1937	SVOp1 = SSV->getOperand(i_nocapture: `1`);
1938	for (unsigned I = `0`, E = Mask.size(); I != E; I++) {
1939	if (Mask [I] >= static_cast<int>(SSV->getShuffleMask().size()))
1940	return false;
1941	Mask [I] = Mask [I] < `0` ? Mask [I] : SSV->getMaskValue(Elt: Mask [I]);
1942	}
1943	}
1944	if (SVOp0 == Op1 && SVOp1 == Op0) {
1945	std::swap(a&: SVOp0, b&: SVOp1);
1946	ShuffleVectorInst::commuteShuffleMask(Mask, InVecNumElts: NumElts);
1947	}
1948	if (SVOp0 != Op0 \|\| SVOp1 != Op1)
1949	return false;
1950
1951	// Calculate the reconstruction mask for this shuffle, as the mask needed to
1952	// take the packed values from Op0/Op1 and reconstructing to the original
1953	// order.
1954	SmallVector<int> ReconstructMask;
1955	for (unsigned I = `0`; I < Mask.size(); I++) {
1956	if (Mask [I] < `0`) {
1957	ReconstructMask.push_back(Elt: -`1`);
1958	} else if (Mask [I] < static_cast<int>(NumElts)) {
1959	MaxV1Elt = std::max(a: MaxV1Elt, b: Mask [I]);
1960	auto It = find_if(Range&: V1, P: [&](const std::pair<int, int> &A) {
1961	return Mask [I] == A.first;
1962	});
1963	if (It != V1.end())
1964	ReconstructMask.push_back(Elt: It - V1.begin());
1965	else {
1966	ReconstructMask.push_back(Elt: V1.size());
1967	V1.emplace_back(Args&: Mask [I], Args: V1.size());
1968	}
1969	} else {
1970	MaxV2Elt = std::max<int>(a: MaxV2Elt, b: Mask [I] - NumElts);
1971	auto It = find_if(Range&: V2, P: [&](const std::pair<int, int> &A) {
1972	return Mask [I] - static_cast<int>(NumElts) == A.first;
1973	});
1974	if (It != V2.end())
1975	ReconstructMask.push_back(Elt: NumElts + It - V2.begin());
1976	else {
1977	ReconstructMask.push_back(Elt: NumElts + V2.size());
1978	V2.emplace_back(Args: Mask [I] - NumElts, Args: NumElts + V2.size());
1979	}
1980	}
1981	}
1982
1983	// For reductions, we know that the lane ordering out doesn't alter the
1984	// result. In-order can help simplify the shuffle away.
1985	if (FromReduction)
1986	sort(C&: ReconstructMask);
1987	OrigReconstructMasks.push_back(Elt: std::move(ReconstructMask));
1988	}
1989
1990	// If the Maximum element used from V1 and V2 are not larger than the new
1991	// vectors, the vectors are already packes and performing the optimization
1992	// again will likely not help any further. This also prevents us from getting
1993	// stuck in a cycle in case the costs do not also rule it out.
1994	if (V1.empty() \|\| V2.empty() \|\|
1995	(MaxV1Elt == static_cast<int>(V1.size()) - `1` &&
1996	MaxV2Elt == static_cast<int>(V2.size()) - `1`))
1997	return false;
1998
1999	// GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
2000	// shuffle of another shuffle, or not a shuffle (that is treated like a
2001	// identity shuffle).
2002	auto GetBaseMaskValue = [&](Instruction I, int* M) {
2003	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2004	if (!SV)
2005	return M;
2006	if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`)))
2007	if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: `0`)))
2008	if (InputShuffles.contains(Ptr: SSV))
2009	return SSV->getMaskValue(Elt: SV->getMaskValue(Elt: M));
2010	return SV->getMaskValue(Elt: M);
2011	};
2012
2013	// Attempt to sort the inputs my ascending mask values to make simpler input
2014	// shuffles and push complex shuffles down to the uses. We sort on the first
2015	// of the two input shuffle orders, to try and get at least one input into a
2016	// nice order.
2017	auto SortBase = [&](Instruction A, std::pair<int, int*> X,
2018	std::pair<int, int> Y) {
2019	int MXA = GetBaseMaskValue (A, X.first);
2020	int MYA = GetBaseMaskValue (A, Y.first);
2021	return MXA < MYA;
2022	};
2023	stable_sort(Range&: V1, C: [&](std::pair<int, int> A, std::pair<int, int> B) {
2024	return SortBase (SVI0A, A, B);
2025	});
2026	stable_sort(Range&: V2, C: [&](std::pair<int, int> A, std::pair<int, int> B) {
2027	return SortBase (SVI1A, A, B);
2028	});
2029	// Calculate our ReconstructMasks from the OrigReconstructMasks and the
2030	// modified order of the input shuffles.
2031	SmallVector<SmallVector<int>> ReconstructMasks;
2032	for (const auto &Mask : OrigReconstructMasks) {
2033	SmallVector<int> ReconstructMask;
2034	for (int M : Mask) {
2035	auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
2036	auto It = find_if(Range: V, P: [M](auto A) { return A.second == M; });
2037	assert(It != V.end() && "Expected all entries in Mask");
2038	return std::distance(first: V.begin(), last: It);
2039	};
2040	if (M < `0`)
2041	ReconstructMask.push_back(Elt: -`1`);
2042	else if (M < static_cast<int>(NumElts)) {
2043	ReconstructMask.push_back(Elt: FindIndex (V1, M));
2044	} else {
2045	ReconstructMask.push_back(Elt: NumElts + FindIndex (V2, M));
2046	}
2047	}
2048	ReconstructMasks.push_back(Elt: std::move(ReconstructMask));
2049	}
2050
2051	// Calculate the masks needed for the new input shuffles, which get padded
2052	// with undef
2053	SmallVector<int> V1A, V1B, V2A, V2B;
2054	for (unsigned I = `0`; I < V1.size(); I++) {
2055	V1A.push_back(Elt: GetBaseMaskValue (SVI0A, V1 [I].first));
2056	V1B.push_back(Elt: GetBaseMaskValue (SVI0B, V1 [I].first));
2057	}
2058	for (unsigned I = `0`; I < V2.size(); I++) {
2059	V2A.push_back(Elt: GetBaseMaskValue (SVI1A, V2 [I].first));
2060	V2B.push_back(Elt: GetBaseMaskValue (SVI1B, V2 [I].first));
2061	}
2062	while (V1A.size() < NumElts) {
2063	V1A.push_back(Elt: PoisonMaskElem);
2064	V1B.push_back(Elt: PoisonMaskElem);
2065	}
2066	while (V2A.size() < NumElts) {
2067	V2A.push_back(Elt: PoisonMaskElem);
2068	V2B.push_back(Elt: PoisonMaskElem);
2069	}
2070
2071	auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
2072	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2073	if (!SV)
2074	return C;
2075	return C + TTI.getShuffleCost(Kind: isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`))
2076	? TTI::SK_PermuteSingleSrc
2077	: TTI::SK_PermuteTwoSrc,
2078	Tp: VT, Mask: SV->getShuffleMask());
2079	};
2080	auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
2081	return C + TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, Tp: VT, Mask);
2082	};
2083
2084	// Get the costs of the shuffles + binops before and after with the new
2085	// shuffle masks.
2086	InstructionCost CostBefore =
2087	TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: VT) +
2088	TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: VT);
2089	CostBefore += std::accumulate(first: Shuffles.begin(), last: Shuffles.end(),
2090	init: InstructionCost (`0`), binary_op: AddShuffleCost);
2091	CostBefore += std::accumulate(first: InputShuffles.begin(), last: InputShuffles.end(),
2092	init: InstructionCost (`0`), binary_op: AddShuffleCost);
2093
2094	// The new binops will be unused for lanes past the used shuffle lengths.
2095	// These types attempt to get the correct cost for that from the target.
2096	FixedVectorType *Op0SmallVT =
2097	FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V1.size());
2098	FixedVectorType *Op1SmallVT =
2099	FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V2.size());
2100	InstructionCost CostAfter =
2101	TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: Op0SmallVT) +
2102	TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: Op1SmallVT);
2103	CostAfter += std::accumulate(first: ReconstructMasks.begin(), last: ReconstructMasks.end(),
2104	init: InstructionCost (`0`), binary_op: AddShuffleMaskCost);
2105	std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
2106	CostAfter +=
2107	std::accumulate(first: OutputShuffleMasks.begin(), last: OutputShuffleMasks.end(),
2108	init: InstructionCost (`0`), binary_op: AddShuffleMaskCost);
2109
2110	LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
2111	LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
2112	<< " vs CostAfter: " << CostAfter << "\n");
2113	if (CostBefore <= CostAfter)
2114	return false;
2115
2116	// The cost model has passed, create the new instructions.
2117	auto GetShuffleOperand = [&](Instruction I, unsigned* Op) -> Value * {
2118	auto *SV = dyn_cast<ShuffleVectorInst>(Val: I);
2119	if (!SV)
2120	return I;
2121	if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: `1`)))
2122	if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: `0`)))
2123	if (InputShuffles.contains(Ptr: SSV))
2124	return SSV->getOperand(i_nocapture: Op);
2125	return SV->getOperand(i_nocapture: Op);
2126	};
2127	Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
2128	Value *NSV0A = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI0A, `0`),
2129	V2: GetShuffleOperand (SVI0A, `1`), Mask: V1A);
2130	Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
2131	Value *NSV0B = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI0B, `0`),
2132	V2: GetShuffleOperand (SVI0B, `1`), Mask: V1B);
2133	Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
2134	Value *NSV1A = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI1A, `0`),
2135	V2: GetShuffleOperand (SVI1A, `1`), Mask: V2A);
2136	Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
2137	Value *NSV1B = Builder.CreateShuffleVector(V1: GetShuffleOperand (SVI1B, `0`),
2138	V2: GetShuffleOperand (SVI1B, `1`), Mask: V2B);
2139	Builder.SetInsertPoint(Op0);
2140	Value *NOp0 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op0->getOpcode(),
2141	LHS: NSV0A, RHS: NSV0B);
2142	if (auto *I = dyn_cast<Instruction>(Val: NOp0))
2143	I->copyIRFlags(V: Op0, IncludeWrapFlags: true);
2144	Builder.SetInsertPoint(Op1);
2145	Value *NOp1 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op1->getOpcode(),
2146	LHS: NSV1A, RHS: NSV1B);
2147	if (auto *I = dyn_cast<Instruction>(Val: NOp1))
2148	I->copyIRFlags(V: Op1, IncludeWrapFlags: true);
2149
2150	for (int S = `0`, E = ReconstructMasks.size(); S != E; S++) {
2151	Builder.SetInsertPoint(Shuffles [S]);
2152	Value *NSV = Builder.CreateShuffleVector(V1: NOp0, V2: NOp1, Mask: ReconstructMasks [S]);
2153	replaceValue(Old&: Shuffles [S], New&: NSV);
2154	}
2155
2156	Worklist.pushValue(V: NSV0A);
2157	Worklist.pushValue(V: NSV0B);
2158	Worklist.pushValue(V: NSV1A);
2159	Worklist.pushValue(V: NSV1B);
2160	for (auto *S : Shuffles)
2161	Worklist.add(I: S);
2162	return true;
2163	}
2164
2165	/// This is the entry point for all transforms. Pass manager differences are
2166	/// handled in the callers of this function.
2167	bool VectorCombine::run() {
2168	if (DisableVectorCombine)
2169	return false;
2170
2171	// Don't attempt vectorization if the target does not support vectors.
2172	if (!TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(/Vector/ true)))
2173	return false;
2174
2175	bool MadeChange = false;
2176	auto FoldInst = [this, &MadeChange](Instruction &I) {
2177	Builder.SetInsertPoint(&I);
2178	bool IsFixedVectorType = isa<FixedVectorType>(Val: I.getType());
2179	auto Opcode = I.getOpcode();
2180
2181	// These folds should be beneficial regardless of when this pass is run
2182	// in the optimization pipeline.
2183	// The type checking is for run-time efficiency. We can avoid wasting time
2184	// dispatching to folding functions if there's no chance of matching.
2185	if (IsFixedVectorType) {
2186	switch (Opcode) {
2187	case Instruction::InsertElement:
2188	MadeChange \|= vectorizeLoadInsert(I);
2189	break;
2190	case Instruction::ShuffleVector:
2191	MadeChange \|= widenSubvectorLoad(I);
2192	break;
2193	default:
2194	break;
2195	}
2196	}
2197
2198	// This transform works with scalable and fixed vectors
2199	// TODO: Identify and allow other scalable transforms
2200	if (isa<VectorType>(Val: I.getType())) {
2201	MadeChange \|= scalarizeBinopOrCmp(I);
2202	MadeChange \|= scalarizeLoadExtract(I);
2203	MadeChange \|= scalarizeVPIntrinsic(I);
2204	}
2205
2206	if (Opcode == Instruction::Store)
2207	MadeChange \|= foldSingleElementStore(I);
2208
2209	// If this is an early pipeline invocation of this pass, we are done.
2210	if (TryEarlyFoldsOnly)
2211	return;
2212
2213	// Otherwise, try folds that improve codegen but may interfere with
2214	// early IR canonicalizations.
2215	// The type checking is for run-time efficiency. We can avoid wasting time
2216	// dispatching to folding functions if there's no chance of matching.
2217	if (IsFixedVectorType) {
2218	switch (Opcode) {
2219	case Instruction::InsertElement:
2220	MadeChange \|= foldInsExtFNeg(I);
2221	break;
2222	case Instruction::ShuffleVector:
2223	MadeChange \|= foldShuffleOfBinops(I);
2224	MadeChange \|= foldShuffleOfCastops(I);
2225	MadeChange \|= foldShuffleOfShuffles(I);
2226	MadeChange \|= foldSelectShuffle(I);
2227	break;
2228	case Instruction::BitCast:
2229	MadeChange \|= foldBitcastShuffle(I);
2230	break;
2231	}
2232	} else {
2233	switch (Opcode) {
2234	case Instruction::Call:
2235	MadeChange \|= foldShuffleFromReductions(I);
2236	MadeChange \|= foldTruncFromReductions(I);
2237	break;
2238	case Instruction::ICmp:
2239	case Instruction::FCmp:
2240	MadeChange \|= foldExtractExtract(I);
2241	break;
2242	default:
2243	if (Instruction::isBinaryOp(Opcode)) {
2244	MadeChange \|= foldExtractExtract(I);
2245	MadeChange \|= foldExtractedCmps(I);
2246	}
2247	break;
2248	}
2249	}
2250	};
2251
2252	for (BasicBlock &BB : F) {
2253	// Ignore unreachable basic blocks.
2254	if (!DT.isReachableFromEntry(A: &BB))
2255	continue;
2256	// Use early increment range so that we can erase instructions in loop.
2257	for (Instruction &I : make_early_inc_range(Range&: BB)) {
2258	if (I.isDebugOrPseudoInst())
2259	continue;
2260	FoldInst (I);
2261	}
2262	}
2263
2264	while (!Worklist.isEmpty()) {
2265	Instruction *I = Worklist.removeOne();
2266	if (!I)
2267	continue;
2268
2269	if (isInstructionTriviallyDead(I)) {
2270	eraseInstruction(I&: *I);
2271	continue;
2272	}
2273
2274	FoldInst (*I);
2275	}
2276
2277	return MadeChange;
2278	}
2279
2280	PreservedAnalyses VectorCombinePass::run(Function &F,
2281	FunctionAnalysisManager &FAM) {
2282	auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F);
2283	TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F);
2284	DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F);
2285	AAResults &AA = FAM.getResult<AAManager>(IR&: F);
2286	const DataLayout *DL = &F.getParent()->getDataLayout();
2287	VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
2288	if (!Combiner.run())
2289	return PreservedAnalyses::all();
2290	PreservedAnalyses PA;
2291	PA.preserveSet<CFGAnalyses>();
2292	return PA;
2293	}
2294

// Source: llvm/lib/Transforms/Vectorize/VectorCombine.cpp