VPlanSLP.cpp source code [llvm/lib/Transforms/Vectorize/VPlanSLP.cpp]

1	//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// This file implements SLP analysis based on VPlan. The analysis is based on
9	/// the ideas described in
10	///
11	/// Look-ahead SLP: auto-vectorization in the presence of commutative
12	/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
13	/// Luís F. W. Góes
14	///
15	//===----------------------------------------------------------------------===//
16
17	#include "VPlan.h"
18	#include "VPlanValue.h"
19	#include "llvm/ADT/DenseMap.h"
20	#include "llvm/ADT/SmallVector.h"
21	#include "llvm/Analysis/VectorUtils.h"
22	#include "llvm/IR/Instruction.h"
23	#include "llvm/IR/Instructions.h"
24	#include "llvm/IR/Type.h"
25	#include "llvm/IR/Value.h"
26	#include "llvm/Support/Casting.h"
27	#include "llvm/Support/Debug.h"
28	#include "llvm/Support/ErrorHandling.h"
29	#include "llvm/Support/raw_ostream.h"
30	#include <algorithm>
31	#include <cassert>
32	#include <optional>
33	#include <utility>
34
35	using namespace llvm;
36
37	#define DEBUG_TYPE "vplan-slp"
38
39	// Number of levels to look ahead when re-ordering multi node operands.
40	static unsigned LookaheadMaxDepth = `5`;
41
42	VPInstruction *VPlanSlp::markFailed() {
43	// FIXME: Currently this is used to signal we hit instructions we cannot
44	// trivially SLP'ize.
45	CompletelySLP = false;
46	return nullptr;
47	}
48
49	void VPlanSlp::addCombined(ArrayRef<VPValue > Operands, VPInstruction New) {
50	if (all_of(Range&: Operands, P: [](VPValue *V) {
51	return cast<VPInstruction>(Val: V)->getUnderlyingInstr();
52	})) {
53	unsigned BundleSize = `0`;
54	for (VPValue *V : Operands) {
55	Type *T = cast<VPInstruction>(Val: V)->getUnderlyingInstr()->getType();
56	assert(!T->isVectorTy() && "Only scalar types supported for now");
57	BundleSize += T->getScalarSizeInBits();
58	}
59	WidestBundleBits = std::max(a: WidestBundleBits, b: BundleSize);
60	}
61
62	auto Res = BundleToCombined.try_emplace(Key: to_vector<`4`>(Range&: Operands), Args&: New);
63	assert(Res.second &&
64	"Already created a combined instruction for the operand bundle");
65	(void)Res;
66	}
67
68	bool VPlanSlp::areVectorizable(ArrayRef<VPValue > Operands) const* {
69	// Currently we only support VPInstructions.
70	if (!all_of(Range&: Operands, P: [](VPValue *Op) {
71	return Op && isa<VPInstruction>(Val: Op) &&
72	cast<VPInstruction>(Val: Op)->getUnderlyingInstr();
73	})) {
74	LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
75	return false;
76	}
77
78	// Check if opcodes and type width agree for all instructions in the bundle.
79	// FIXME: Differing widths/opcodes can be handled by inserting additional
80	// instructions.
81	// FIXME: Deal with non-primitive types.
82	const Instruction *OriginalInstr =
83	cast<VPInstruction>(Val: Operands [`0`])->getUnderlyingInstr();
84	unsigned Opcode = OriginalInstr->getOpcode();
85	unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
86	if (!all_of(Range&: Operands, P: [Opcode, Width](VPValue *Op) {
87	const Instruction *I = cast<VPInstruction>(Val: Op)->getUnderlyingInstr();
88	return I->getOpcode() == Opcode &&
89	I->getType()->getPrimitiveSizeInBits() == Width;
90	})) {
91	LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
92	return false;
93	}
94
95	// For now, all operands must be defined in the same BB.
96	if (any_of(Range&: Operands, P: [this](VPValue *Op) {
97	return cast<VPInstruction>(Val: Op)->getParent() != &this->BB;
98	})) {
99	LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
100	return false;
101	}
102
103	if (any_of(Range&: Operands,
104	P: [](VPValue Op) { return* Op->hasMoreThanOneUniqueUser(); })) {
105	LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
106	return false;
107	}
108
109	// For loads, check that there are no instructions writing to memory in
110	// between them.
111	// TODO: we only have to forbid instructions writing to memory that could
112	// interfere with any of the loads in the bundle
113	if (Opcode == Instruction::Load) {
114	unsigned LoadsSeen = `0`;
115	VPBasicBlock *Parent = cast<VPInstruction>(Val: Operands [`0`])->getParent();
116	for (auto &I : *Parent) {
117	auto *VPI = dyn_cast<VPInstruction>(Val: &I);
118	if (!VPI)
119	break;
120	if (VPI->getOpcode() == Instruction::Load &&
121	llvm::is_contained(Range&: Operands, Element: VPI))
122	LoadsSeen++;
123
124	if (LoadsSeen == Operands.size())
125	break;
126	if (LoadsSeen > `0` && VPI->mayWriteToMemory()) {
127	LLVM_DEBUG(
128	dbgs() << "VPSLP: instruction modifying memory between loads\n");
129	return false;
130	}
131	}
132
133	if (!all_of(Range&: Operands, P: [](VPValue *Op) {
134	return cast<LoadInst>(Val: cast<VPInstruction>(Val: Op)->getUnderlyingInstr())
135	->isSimple();
136	})) {
137	LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
138	return false;
139	}
140	}
141
142	if (Opcode == Instruction::Store)
143	if (!all_of(Range&: Operands, P: [](VPValue *Op) {
144	return cast<StoreInst>(Val: cast<VPInstruction>(Val: Op)->getUnderlyingInstr())
145	->isSimple();
146	})) {
147	LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
148	return false;
149	}
150
151	return true;
152	}
153
154	static SmallVector<VPValue , `4`> getOperands(ArrayRef<VPValue > Values,
155	unsigned OperandIndex) {
156	SmallVector<VPValue *, `4`> Operands;
157	for (VPValue *V : Values) {
158	// Currently we only support VPInstructions.
159	auto *U = cast<VPInstruction>(Val: V);
160	Operands.push_back(Elt: U->getOperand(N: OperandIndex));
161	}
162	return Operands;
163	}
164
165	static bool areCommutative(ArrayRef<VPValue *> Values) {
166	return Instruction::isCommutative(
167	Opcode: cast<VPInstruction>(Val: Values [`0`])->getOpcode());
168	}
169
170	static SmallVector<SmallVector<VPValue *, `4`>, `4`>
171	getOperands(ArrayRef<VPValue *> Values) {
172	SmallVector<SmallVector<VPValue *, `4`>, `4`> Result;
173	auto *VPI = cast<VPInstruction>(Val: Values [`0`]);
174
175	switch (VPI->getOpcode()) {
176	case Instruction::Load:
177	llvm_unreachable("Loads terminate a tree, no need to get operands");
178	case Instruction::Store:
179	Result.push_back(Elt: getOperands(Values, OperandIndex: `0`));
180	break;
181	default:
182	for (unsigned I = `0`, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
183	Result.push_back(Elt: getOperands(Values, OperandIndex: I));
184	break;
185	}
186
187	return Result;
188	}
189
190	/// Returns the opcode of Values or ~0 if they do not all agree.
191	static std::optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
192	unsigned Opcode = cast<VPInstruction>(Val: Values [`0`])->getOpcode();
193	if (any_of(Range&: Values, P: [Opcode](VPValue *V) {
194	return cast<VPInstruction>(Val: V)->getOpcode() != Opcode;
195	}))
196	return std::nullopt;
197	return {Opcode};
198	}
199
200	/// Returns true if A and B access sequential memory if they are loads or
201	/// stores or if they have identical opcodes otherwise.
202	static bool areConsecutiveOrMatch(VPInstruction A, VPInstruction B,
203	VPInterleavedAccessInfo &IAI) {
204	if (A->getOpcode() != B->getOpcode())
205	return false;
206
207	if (A->getOpcode() != Instruction::Load &&
208	A->getOpcode() != Instruction::Store)
209	return true;
210	auto *GA = IAI.getInterleaveGroup(Instr: A);
211	auto *GB = IAI.getInterleaveGroup(Instr: B);
212
213	return GA && GB && GA == GB && GA->getIndex(Instr: A) + `1` == GB->getIndex(Instr: B);
214	}
215
216	/// Implements getLAScore from Listing 7 in the paper.
217	/// Traverses and compares operands of V1 and V2 to MaxLevel.
218	static unsigned getLAScore(VPValue V1, VPValue V2, unsigned MaxLevel,
219	VPInterleavedAccessInfo &IAI) {
220	auto *I1 = dyn_cast<VPInstruction>(Val: V1);
221	auto *I2 = dyn_cast<VPInstruction>(Val: V2);
222	// Currently we only support VPInstructions.
223	if (!I1 \|\| !I2)
224	return `0`;
225
226	if (MaxLevel == `0`)
227	return (unsigned)areConsecutiveOrMatch(A: I1, B: I2, IAI);
228
229	unsigned Score = `0`;
230	for (unsigned I = `0`, EV1 = I1->getNumOperands(); I < EV1; ++I)
231	for (unsigned J = `0`, EV2 = I2->getNumOperands(); J < EV2; ++J)
232	Score +=
233	getLAScore(V1: I1->getOperand(N: I), V2: I2->getOperand(N: J), MaxLevel: MaxLevel - `1`, IAI);
234	return Score;
235	}
236
237	std::pair<VPlanSlp::OpMode, VPValue *>
238	VPlanSlp::getBest(OpMode Mode, VPValue *Last,
239	SmallPtrSetImpl<VPValue *> &Candidates,
240	VPInterleavedAccessInfo &IAI) {
241	assert((Mode == OpMode::Load \|\| Mode == OpMode::Opcode) &&
242	"Currently we only handle load and commutative opcodes");
243	LLVM_DEBUG(dbgs() << " getBest\n");
244
245	SmallVector<VPValue *, `4`> BestCandidates;
246	LLVM_DEBUG(dbgs() << " Candidates for "
247	<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
248	for (auto *Candidate : Candidates) {
249	auto *LastI = cast<VPInstruction>(Val: Last);
250	auto *CandidateI = cast<VPInstruction>(Val: Candidate);
251	if (areConsecutiveOrMatch(A: LastI, B: CandidateI, IAI)) {
252	LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
253	<< " ");
254	BestCandidates.push_back(Elt: Candidate);
255	}
256	}
257	LLVM_DEBUG(dbgs() << "\n");
258
259	if (BestCandidates.empty())
260	return {OpMode::Failed, nullptr};
261
262	if (BestCandidates.size() == `1`)
263	return {Mode, BestCandidates [`0`]};
264
265	VPValue Best = nullptr*;
266	unsigned BestScore = `0`;
267	for (unsigned Depth = `1`; Depth < LookaheadMaxDepth; Depth++) {
268	unsigned PrevScore = ~`0u`;
269	bool AllSame = true;
270
271	// FIXME: Avoid visiting the same operands multiple times.
272	for (auto *Candidate : BestCandidates) {
273	unsigned Score = getLAScore(V1: Last, V2: Candidate, MaxLevel: Depth, IAI);
274	if (PrevScore == ~`0u`)
275	PrevScore = Score;
276	if (PrevScore != Score)
277	AllSame = false;
278	PrevScore = Score;
279
280	if (Score > BestScore) {
281	BestScore = Score;
282	Best = Candidate;
283	}
284	}
285	if (!AllSame)
286	break;
287	}
288	LLVM_DEBUG(dbgs() << "Found best "
289	<< *cast<VPInstruction>(Best)->getUnderlyingInstr()
290	<< "\n");
291	Candidates.erase(Ptr: Best);
292
293	return {Mode, Best};
294	}
295
296	SmallVector<VPlanSlp::MultiNodeOpTy, `4`> VPlanSlp::reorderMultiNodeOps() {
297	SmallVector<MultiNodeOpTy, `4`> FinalOrder;
298	SmallVector<OpMode, `4`> Mode;
299	FinalOrder.reserve(N: MultiNodeOps.size());
300	Mode.reserve(N: MultiNodeOps.size());
301
302	LLVM_DEBUG(dbgs() << "Reordering multinode\n");
303
304	for (auto &Operands : MultiNodeOps) {
305	FinalOrder.push_back(Elt: {Operands.first, {Operands.second [`0`]}});
306	if (cast<VPInstruction>(Val: Operands.second [`0`])->getOpcode() ==
307	Instruction::Load)
308	Mode.push_back(Elt: OpMode::Load);
309	else
310	Mode.push_back(Elt: OpMode::Opcode);
311	}
312
313	for (unsigned Lane = `1`, E = MultiNodeOps [`0`].second.size(); Lane < E; ++Lane) {
314	LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
315	SmallPtrSet<VPValue *, `4`> Candidates;
316	LLVM_DEBUG(dbgs() << " Candidates ");
317	for (auto Ops : MultiNodeOps) {
318	LLVM_DEBUG(
319	dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
320	<< " ");
321	Candidates.insert(Ptr: Ops.second [Lane]);
322	}
323	LLVM_DEBUG(dbgs() << "\n");
324
325	for (unsigned Op = `0`, E = MultiNodeOps.size(); Op < E; ++Op) {
326	LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
327	if (Mode [Op] == OpMode::Failed)
328	continue;
329
330	VPValue *Last = FinalOrder [Op].second [Lane - `1`];
331	std::pair<OpMode, VPValue *> Res =
332	getBest(Mode: Mode [Op], Last, Candidates, IAI);
333	if (Res.second)
334	FinalOrder [Op].second.push_back(Elt: Res.second);
335	else
336	// TODO: handle this case
337	FinalOrder [Op].second.push_back(Elt: markFailed());
338	}
339	}
340
341	return FinalOrder;
342	}
343
344	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
345	void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
346	dbgs() << " Ops: ";
347	for (auto *Op : Values) {
348	if (auto *VPInstr = cast_or_null<VPInstruction>(Val: Op))
349	if (auto *Instr = VPInstr->getUnderlyingInstr()) {
350	dbgs() << *Instr << " \| ";
351	continue;
352	}
353	dbgs() << " nullptr \| ";
354	}
355	dbgs() << "\n";
356	}
357	#endif
358
359	VPInstruction VPlanSlp::buildGraph(ArrayRef<VPValue > Values) {
360	assert(!Values.empty() && "Need some operands!");
361
362	// If we already visited this instruction bundle, re-use the existing node
363	auto I = BundleToCombined.find(Val: to_vector<`4`>(Range&: Values));
364	if (I != BundleToCombined.end()) {
365	#ifndef NDEBUG
366	// Check that the resulting graph is a tree. If we re-use a node, this means
367	// its values have multiple users. We only allow this, if all users of each
368	// value are the same instruction.
369	for (auto *V : Values) {
370	auto UI = V->user_begin();
371	auto FirstUser = UI++;
372	while (UI != V->user_end()) {
373	assert(*UI == FirstUser && "Currently we only support SLP trees.");
374	UI++;
375	}
376	}
377	#endif
378	return I ->second;
379	}
380
381	// Dump inputs
382	LLVM_DEBUG({
383	dbgs() << "buildGraph: ";
384	dumpBundle(Values);
385	});
386
387	if (!areVectorizable(Operands: Values))
388	return markFailed();
389
390	assert(getOpcode(Values) && "Opcodes for all values must match");
391	unsigned ValuesOpcode = *getOpcode(Values);
392
393	SmallVector<VPValue *, `4`> CombinedOperands;
394	if (areCommutative(Values)) {
395	bool MultiNodeRoot = !MultiNodeActive;
396	MultiNodeActive = true;
397	for (auto &Operands : getOperands(Values)) {
398	LLVM_DEBUG({
399	dbgs() << " Visiting Commutative";
400	dumpBundle(Operands);
401	});
402
403	auto OperandsOpcode = getOpcode(Values: Operands);
404	if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
405	LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
406	CombinedOperands.push_back(Elt: buildGraph(Values: Operands));
407	} else {
408	LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
409	// Create dummy VPInstruction, which will we replace later by the
410	// re-ordered operand.
411	VPInstruction Op = new* VPInstruction (`0`, {});
412	CombinedOperands.push_back(Elt: Op);
413	MultiNodeOps.emplace_back(Args&: Op, Args&: Operands);
414	}
415	}
416
417	if (MultiNodeRoot) {
418	LLVM_DEBUG(dbgs() << "Reorder \n");
419	MultiNodeActive = false;
420
421	auto FinalOrder = reorderMultiNodeOps();
422
423	MultiNodeOps.clear();
424	for (auto &Ops : FinalOrder) {
425	VPInstruction *NewOp = buildGraph(Values: Ops.second);
426	Ops.first->replaceAllUsesWith(New: NewOp);
427	for (unsigned i = `0`; i < CombinedOperands.size(); i++)
428	if (CombinedOperands [i] == Ops.first)
429	CombinedOperands [i] = NewOp;
430	delete Ops.first;
431	Ops.first = NewOp;
432	}
433	LLVM_DEBUG(dbgs() << "Found final order\n");
434	}
435	} else {
436	LLVM_DEBUG(dbgs() << " NonCommuntative\n");
437	if (ValuesOpcode == Instruction::Load)
438	for (VPValue *V : Values)
439	CombinedOperands.push_back(Elt: cast<VPInstruction>(Val: V)->getOperand(N: `0`));
440	else
441	for (auto &Operands : getOperands(Values))
442	CombinedOperands.push_back(Elt: buildGraph(Values: Operands));
443	}
444
445	unsigned Opcode;
446	switch (ValuesOpcode) {
447	case Instruction::Load:
448	Opcode = VPInstruction::SLPLoad;
449	break;
450	case Instruction::Store:
451	Opcode = VPInstruction::SLPStore;
452	break;
453	default:
454	Opcode = ValuesOpcode;
455	break;
456	}
457
458	if (!CompletelySLP)
459	return markFailed();
460
461	assert(CombinedOperands.size() > `0` && "Need more some operands");
462	auto *Inst = cast<VPInstruction>(Val: Values [`0`])->getUnderlyingInstr();
463	auto VPI = new* VPInstruction (Opcode, CombinedOperands, Inst->getDebugLoc());
464
465	LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
466	<< *cast<VPInstruction>(Values[`0`]) << "\n");
467	addCombined(Operands: Values, New: VPI);
468	return VPI;
469	}
470

source code of llvm/lib/Transforms/Vectorize/VPlanSLP.cpp