ARMLowOverheadLoops.cpp source code [llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp]

1	//===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	/// \file
9	/// Finalize v8.1-m low-overhead loops by converting the associated pseudo
10	/// instructions into machine operations.
11	/// The expectation is that the loop contains three pseudo instructions:
12	/// - t2LoopStart - placed in the preheader or pre-preheader. The do-loop
13	/// form should be in the preheader, whereas the while form should be in the
14	/// preheader's only predecessor.
15	/// - t2LoopDec - placed within the loop body.
16	/// - t2LoopEnd - the loop latch terminator.
17	///
18	/// In addition to this, we also look for the presence of the VCTP instruction,
19	/// which determines whether we can generate the tail-predicated low-overhead
20	/// loop form.
21	///
22	/// Assumptions and Dependencies:
23	/// Low-overhead loops are constructed and executed using a setup instruction:
24	/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP.
25	/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range
26	/// but fixed polarity: WLS can only branch forwards and LE can only branch
27	/// backwards. These restrictions mean that this pass is dependent upon block
28	/// layout and block sizes, which is why it's the last pass to run. The same is
29	/// true for ConstantIslands, but this pass does not increase the size of the
30	/// basic blocks, nor does it change the CFG. Instructions are mainly removed
31	/// during the transform and pseudo instructions are replaced by real ones. In
32	/// some cases, when we have to revert to a 'normal' loop, we have to introduce
33	/// multiple instructions for a single pseudo (see RevertWhile and
34	/// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd
35	/// are defined to be as large as this maximum sequence of replacement
36	/// instructions.
37	///
38	/// A note on VPR.P0 (the lane mask):
39	/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
40	/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
41	/// They will simply "and" the result of their calculation with the current
42	/// value of VPR.P0. You can think of it like this:
43	/// \verbatim
44	/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
45	/// VPR.P0 &= Value
46	/// else
47	/// VPR.P0 = Value
48	/// \endverbatim
49	/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
50	/// fall in the "VPT active" case, so we can consider that every VPR write by
51	/// one of those instructions is actually an "and".
52	//===----------------------------------------------------------------------===//
53
54	#include "ARM.h"
55	#include "ARMBaseInstrInfo.h"
56	#include "ARMBaseRegisterInfo.h"
57	#include "ARMBasicBlockInfo.h"
58	#include "ARMSubtarget.h"
59	#include "MVETailPredUtils.h"
60	#include "Thumb2InstrInfo.h"
61	#include "llvm/ADT/SetOperations.h"
62	#include "llvm/ADT/SetVector.h"
63	#include "llvm/CodeGen/LivePhysRegs.h"
64	#include "llvm/CodeGen/MachineFrameInfo.h"
65	#include "llvm/CodeGen/MachineFunctionPass.h"
66	#include "llvm/CodeGen/MachineLoopInfo.h"
67	#include "llvm/CodeGen/MachineLoopUtils.h"
68	#include "llvm/CodeGen/MachineRegisterInfo.h"
69	#include "llvm/CodeGen/Passes.h"
70	#include "llvm/CodeGen/ReachingDefAnalysis.h"
71	#include "llvm/MC/MCInstrDesc.h"
72
73	using namespace llvm;
74
75	#define DEBUG_TYPE "arm-low-overhead-loops"
76	#define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass"
77
78	static cl::opt<bool>
79	DisableTailPredication("arm-loloops-disable-tailpred", cl::Hidden,
80	cl::desc ("Disable tail-predication in the ARM LowOverheadLoop pass"),
81	cl::init(Val: false));
82
83	static cl::opt<bool>
84	DisableOmitDLS("arm-disable-omit-dls", cl::Hidden,
85	cl::desc ("Disable omitting 'dls lr, lr' instructions"),
86	cl::init(Val: false));
87
88	static bool isVectorPredicated(MachineInstr *MI) {
89	int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
90	return PIdx != -`1` && MI->getOperand(i: PIdx + `1`).getReg() == ARM::VPR;
91	}
92
93	static bool isVectorPredicate(MachineInstr *MI) {
94	return MI->findRegisterDefOperandIdx(ARM::Reg: VPR, /TRI=/nullptr) != -`1`;
95	}
96
97	static bool hasVPRUse(MachineInstr &MI) {
98	return MI.findRegisterUseOperandIdx(ARM::Reg: VPR, /TRI=/nullptr) != -`1`;
99	}
100
101	static bool isDomainMVE(MachineInstr *MI) {
102	uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
103	return Domain == ARMII::DomainMVE;
104	}
105
106	static int getVecSize(const MachineInstr &MI) {
107	const MCInstrDesc &MCID = MI.getDesc();
108	uint64_t Flags = MCID.TSFlags;
109	return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift;
110	}
111
112	static bool shouldInspect(MachineInstr &MI) {
113	if (MI.isDebugInstr())
114	return false;
115	return isDomainMVE(MI: &MI) \|\| isVectorPredicate(MI: &MI) \|\| hasVPRUse(MI);
116	}
117
118	namespace {
119
120	using InstSet = SmallPtrSetImpl<MachineInstr *>;
121
122	class PostOrderLoopTraversal {
123	MachineLoop &ML;
124	MachineLoopInfo &MLI;
125	SmallPtrSet<MachineBasicBlock*, `4`> Visited;
126	SmallVector<MachineBasicBlock*, `4`> Order;
127
128	public:
129	PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
130	: ML(ML), MLI(MLI) { }
131
132	const SmallVectorImpl<MachineBasicBlock> &getOrder() const* {
133	return Order;
134	}
135
136	// Visit all the blocks within the loop, as well as exit blocks and any
137	// blocks properly dominating the header.
138	void ProcessLoop() {
139	std::function<void(MachineBasicBlock)> Search = [this*, &Search]
140	(MachineBasicBlock MBB) -> void* {
141	if (Visited.count(Ptr: MBB))
142	return;
143
144	Visited.insert(Ptr: MBB);
145	for (auto *Succ : MBB->successors()) {
146	if (!ML.contains(BB: Succ))
147	continue;
148	Search (Succ);
149	}
150	Order.push_back(Elt: MBB);
151	};
152
153	// Insert exit blocks.
154	SmallVector<MachineBasicBlock*, `2`> ExitBlocks;
155	ML.getExitBlocks(ExitBlocks);
156	append_range(C&: Order, R&: ExitBlocks);
157
158	// Then add the loop body.
159	Search (ML.getHeader());
160
161	// Then try the preheader and its predecessors.
162	std::function<void(MachineBasicBlock*)> GetPredecessor =
163	[this, &GetPredecessor] (MachineBasicBlock MBB) -> void* {
164	Order.push_back(Elt: MBB);
165	if (MBB->pred_size() == `1`)
166	GetPredecessor (*MBB->pred_begin());
167	};
168
169	if (auto *Preheader = ML.getLoopPreheader())
170	GetPredecessor (Preheader);
171	else if (auto Preheader = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true*))
172	GetPredecessor (Preheader);
173	}
174	};
175
176	struct PredicatedMI {
177	MachineInstr MI = nullptr*;
178	SetVector<MachineInstr*> Predicates;
179
180	public:
181	PredicatedMI(MachineInstr I, SetVector<MachineInstr > &Preds) : MI(I) {
182	assert(I && "Instruction must not be null!");
183	Predicates.insert(Start: Preds.begin(), End: Preds.end());
184	}
185	};
186
187	// Represent the current state of the VPR and hold all instances which
188	// represent a VPT block, which is a list of instructions that begins with a
189	// VPT/VPST and has a maximum of four proceeding instructions. All
190	// instructions within the block are predicated upon the vpr and we allow
191	// instructions to define the vpr within in the block too.
192	class VPTState {
193	friend struct LowOverheadLoop;
194
195	SmallVector<MachineInstr *, `4`> Insts;
196
197	static SmallVector<VPTState, `4`> Blocks;
198	static SetVector<MachineInstr *> CurrentPredicates;
199	static std::map<MachineInstr *,
200	std::unique_ptr<PredicatedMI>> PredicatedInsts;
201
202	static void CreateVPTBlock(MachineInstr *MI) {
203	assert((CurrentPredicates.size() \|\| MI->getParent()->isLiveIn(ARM::VPR))
204	&& "Can't begin VPT without predicate");
205	Blocks.emplace_back(Args&: MI);
206	// The execution of MI is predicated upon the current set of instructions
207	// that are AND'ed together to form the VPR predicate value. In the case
208	// that MI is a VPT, CurrentPredicates will also just be MI.
209	PredicatedInsts.emplace(
210	args&: MI, args: std::make_unique<PredicatedMI>(args&: MI, args&: CurrentPredicates));
211	}
212
213	static void reset() {
214	Blocks.clear();
215	PredicatedInsts.clear();
216	CurrentPredicates.clear();
217	}
218
219	static void addInst(MachineInstr *MI) {
220	Blocks.back().insert(MI);
221	PredicatedInsts.emplace(
222	args&: MI, args: std::make_unique<PredicatedMI>(args&: MI, args&: CurrentPredicates));
223	}
224
225	static void addPredicate(MachineInstr *MI) {
226	LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI);
227	CurrentPredicates.insert(X: MI);
228	}
229
230	static void resetPredicate(MachineInstr *MI) {
231	LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI);
232	CurrentPredicates.clear();
233	CurrentPredicates.insert(X: MI);
234	}
235
236	public:
237	// Have we found an instruction within the block which defines the vpr? If
238	// so, not all the instructions in the block will have the same predicate.
239	static bool hasUniformPredicate(VPTState &Block) {
240	return getDivergent(Block) == nullptr;
241	}
242
243	// If it exists, return the first internal instruction which modifies the
244	// VPR.
245	static MachineInstr *getDivergent(VPTState &Block) {
246	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
247	for (unsigned i = `1`; i < Insts.size(); ++i) {
248	MachineInstr *Next = Insts [i];
249	if (isVectorPredicate(MI: Next))
250	return Next; // Found an instruction altering the vpr.
251	}
252	return nullptr;
253	}
254
255	// Return whether the given instruction is predicated upon a VCTP.
256	static bool isPredicatedOnVCTP(MachineInstr MI, bool* Exclusive = false) {
257	SetVector<MachineInstr *> &Predicates = PredicatedInsts [MI]->Predicates;
258	if (Exclusive && Predicates.size() != `1`)
259	return false;
260	return llvm::any_of(Range&: Predicates, P: isVCTP);
261	}
262
263	// Is the VPST, controlling the block entry, predicated upon a VCTP.
264	static bool isEntryPredicatedOnVCTP(VPTState &Block,
265	bool Exclusive = false) {
266	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
267	return isPredicatedOnVCTP(MI: Insts.front(), Exclusive);
268	}
269
270	// If this block begins with a VPT, we can check whether it's using
271	// at least one predicated input(s), as well as possible loop invariant
272	// which would result in it being implicitly predicated.
273	static bool hasImplicitlyValidVPT(VPTState &Block,
274	ReachingDefAnalysis &RDA) {
275	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
276	MachineInstr *VPT = Insts.front();
277	assert(isVPTOpcode(VPT->getOpcode()) &&
278	"Expected VPT block to begin with VPT/VPST");
279
280	if (VPT->getOpcode() == ARM::MVE_VPST)
281	return false;
282
283	auto IsOperandPredicated = [&](MachineInstr MI, unsigned* Idx) {
284	MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx));
285	return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op);
286	};
287
288	auto IsOperandInvariant = [&](MachineInstr MI, unsigned* Idx) {
289	MachineOperand &MO = MI->getOperand(i: Idx);
290	if (!MO.isReg() \|\| !MO.getReg())
291	return true;
292
293	SmallPtrSet<MachineInstr *, `2`> Defs;
294	RDA.getGlobalReachingDefs(MI, PhysReg: MO.getReg(), Defs);
295	if (Defs.empty())
296	return true;
297
298	for (auto *Def : Defs)
299	if (Def->getParent() == VPT->getParent())
300	return false;
301	return true;
302	};
303
304	// Check that at least one of the operands is directly predicated on a
305	// vctp and allow an invariant value too.
306	return (IsOperandPredicated(VPT, `1`) \|\| IsOperandPredicated(VPT, `2`)) &&
307	(IsOperandPredicated(VPT, `1`) \|\| IsOperandInvariant(VPT, `1`)) &&
308	(IsOperandPredicated(VPT, `2`) \|\| IsOperandInvariant(VPT, `2`));
309	}
310
311	static bool isValid(ReachingDefAnalysis &RDA) {
312	// All predication within the loop should be based on vctp. If the block
313	// isn't predicated on entry, check whether the vctp is within the block
314	// and that all other instructions are then predicated on it.
315	for (auto &Block : Blocks) {
316	if (isEntryPredicatedOnVCTP(Block, Exclusive: false) \|\|
317	hasImplicitlyValidVPT(Block, RDA))
318	continue;
319
320	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
321	// We don't know how to convert a block with just a VPT;VCTP into
322	// anything valid once we remove the VCTP. For now just bail out.
323	assert(isVPTOpcode(Insts.front()->getOpcode()) &&
324	"Expected VPT block to start with a VPST or VPT!");
325	if (Insts.size() == `2` && Insts.front()->getOpcode() != ARM::MVE_VPST &&
326	isVCTP(MI: Insts.back()))
327	return false;
328
329	for (auto *MI : Insts) {
330	// Check that any internal VCTPs are 'Then' predicated.
331	if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then)
332	return false;
333	// Skip other instructions that build up the predicate.
334	if (MI->getOpcode() == ARM::MVE_VPST \|\| isVectorPredicate(MI))
335	continue;
336	// Check that any other instructions are predicated upon a vctp.
337	// TODO: We could infer when VPTs are implicitly predicated on the
338	// vctp (when the operands are predicated).
339	if (!isPredicatedOnVCTP(MI)) {
340	LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI);
341	return false;
342	}
343	}
344	}
345	return true;
346	}
347
348	VPTState(MachineInstr *MI) { Insts.push_back(Elt: MI); }
349
350	void insert(MachineInstr *MI) {
351	Insts.push_back(Elt: MI);
352	// VPT/VPST + 4 predicated instructions.
353	assert(Insts.size() <= `5` && "Too many instructions in VPT block!");
354	}
355
356	bool containsVCTP() const {
357	return llvm::any_of(Range: Insts, P: isVCTP);
358	}
359
360	unsigned size() const { return Insts.size(); }
361	SmallVectorImpl<MachineInstr > &getInsts() { return* Insts; }
362	};
363
364	struct LowOverheadLoop {
365
366	MachineLoop &ML;
367	MachineBasicBlock Preheader = nullptr*;
368	MachineLoopInfo &MLI;
369	ReachingDefAnalysis &RDA;
370	const TargetRegisterInfo &TRI;
371	const ARMBaseInstrInfo &TII;
372	MachineFunction MF = nullptr*;
373	MachineBasicBlock::iterator StartInsertPt;
374	MachineBasicBlock StartInsertBB = nullptr*;
375	MachineInstr Start = nullptr*;
376	MachineInstr Dec = nullptr*;
377	MachineInstr End = nullptr*;
378	MachineOperand TPNumElements;
379	SmallVector<MachineInstr *, `4`> VCTPs;
380	SmallPtrSet<MachineInstr *, `4`> ToRemove;
381	SmallPtrSet<MachineInstr *, `4`> BlockMasksToRecompute;
382	SmallPtrSet<MachineInstr *, `4`> DoubleWidthResultInstrs;
383	SmallPtrSet<MachineInstr *, `4`> VMOVCopies;
384	bool Revert = false;
385	bool CannotTailPredicate = false;
386
387	LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
388	ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
389	const ARMBaseInstrInfo &TII)
390	: ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII),
391	TPNumElements(MachineOperand::CreateImm(Val: `0`)) {
392	MF = ML.getHeader()->getParent();
393	if (auto *MBB = ML.getLoopPreheader())
394	Preheader = MBB;
395	else if (auto MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true*))
396	Preheader = MBB;
397	VPTState::reset();
398	}
399
400	// If this is an MVE instruction, check that we know how to use tail
401	// predication with it. Record VPT blocks and return whether the
402	// instruction is valid for tail predication.
403	bool ValidateMVEInst(MachineInstr *MI);
404
405	void AnalyseMVEInst(MachineInstr *MI) {
406	CannotTailPredicate = !ValidateMVEInst(MI);
407	}
408
409	bool IsTailPredicationLegal() const {
410	// For now, let's keep things really simple and only support a single
411	// block for tail predication.
412	return !Revert && FoundAllComponents() && !VCTPs.empty() &&
413	!CannotTailPredicate && ML.getNumBlocks() == `1`;
414	}
415
416	// Given that MI is a VCTP, check that is equivalent to any other VCTPs
417	// found.
418	bool AddVCTP(MachineInstr *MI);
419
420	// Check that the predication in the loop will be equivalent once we
421	// perform the conversion. Also ensure that we can provide the number
422	// of elements to the loop start instruction.
423	bool ValidateTailPredicate();
424
425	// Check that any values available outside of the loop will be the same
426	// after tail predication conversion.
427	bool ValidateLiveOuts();
428
429	// Check the branch targets are within range and we satisfy our
430	// restrictions.
431	void Validate(ARMBasicBlockUtils *BBUtils);
432
433	bool FoundAllComponents() const {
434	return Start && Dec && End;
435	}
436
437	SmallVectorImpl<VPTState> &getVPTBlocks() {
438	return VPTState::Blocks;
439	}
440
441	// Return the operand for the loop start instruction. This will be the loop
442	// iteration count, or the number of elements if we're tail predicating.
443	MachineOperand &getLoopStartOperand() {
444	if (IsTailPredicationLegal())
445	return TPNumElements;
446	return Start->getOperand(i: `1`);
447	}
448
449	unsigned getStartOpcode() const {
450	bool IsDo = isDoLoopStart(MI: *Start);
451	if (!IsTailPredicationLegal())
452	return IsDo ? ARM::t2DLS : ARM::t2WLS;
453
454	return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo);
455	}
456
457	void dump() const {
458	if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start;
459	if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec;
460	if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;
461	if (!VCTPs.empty()) {
462	dbgs() << "ARM Loops: Found VCTP(s):\n";
463	for (auto *MI : VCTPs)
464	dbgs() << " - " << *MI;
465	}
466	if (!FoundAllComponents())
467	dbgs() << "ARM Loops: Not a low-overhead loop.\n";
468	else if (!(Start && Dec && End))
469	dbgs() << "ARM Loops: Failed to find all loop components.\n";
470	}
471	};
472
473	class ARMLowOverheadLoops : public MachineFunctionPass {
474	MachineFunction MF = nullptr*;
475	MachineLoopInfo MLI = nullptr*;
476	ReachingDefAnalysis RDA = nullptr*;
477	const ARMBaseInstrInfo TII = nullptr*;
478	MachineRegisterInfo MRI = nullptr*;
479	const TargetRegisterInfo TRI = nullptr*;
480	std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr;
481
482	public:
483	static char ID;
484
485	ARMLowOverheadLoops() : MachineFunctionPass (ID) { }
486
487	void getAnalysisUsage(AnalysisUsage &AU) const override {
488	AU.setPreservesCFG();
489	AU.addRequired<MachineLoopInfo>();
490	AU.addRequired<ReachingDefAnalysis>();
491	MachineFunctionPass::getAnalysisUsage(AU);
492	}
493
494	bool runOnMachineFunction(MachineFunction &MF) override;
495
496	MachineFunctionProperties getRequiredProperties() const override {
497	return MachineFunctionProperties ().set(
498	MachineFunctionProperties::Property::NoVRegs).set(
499	MachineFunctionProperties::Property::TracksLiveness);
500	}
501
502	StringRef getPassName() const override {
503	return ARM_LOW_OVERHEAD_LOOPS_NAME;
504	}
505
506	private:
507	bool ProcessLoop(MachineLoop *ML);
508
509	bool RevertNonLoops();
510
511	void RevertWhile(MachineInstr MI) const*;
512	void RevertDo(MachineInstr MI) const*;
513
514	bool RevertLoopDec(MachineInstr MI) const*;
515
516	void RevertLoopEnd(MachineInstr MI, bool* SkipCmp = false) const;
517
518	void RevertLoopEndDec(MachineInstr MI) const*;
519
520	void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
521
522	MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
523
524	void Expand(LowOverheadLoop &LoLoop);
525
526	void IterationCountDCE(LowOverheadLoop &LoLoop);
527	};
528	}
529
530	char ARMLowOverheadLoops::ID = `0`;
531
532	SmallVector<VPTState, `4`> VPTState::Blocks;
533	SetVector<MachineInstr *> VPTState::CurrentPredicates;
534	std::map<MachineInstr *,
535	std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts;
536
537	INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
538	false, false)
539
540	static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA,
541	InstSet &ToRemove, InstSet &Ignore) {
542
543	// Check that we can remove all of Killed without having to modify any IT
544	// blocks.
545	auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) {
546	// Collect the dead code and the MBBs in which they reside.
547	SmallPtrSet<MachineBasicBlock*, `2`> BasicBlocks;
548	for (auto *Dead : Killed)
549	BasicBlocks.insert(Ptr: Dead->getParent());
550
551	// Collect IT blocks in all affected basic blocks.
552	std::map<MachineInstr , SmallPtrSet<MachineInstr , `2`>> ITBlocks;
553	for (auto *MBB : BasicBlocks) {
554	for (auto &IT : *MBB) {
555	if (IT.getOpcode() != ARM::t2IT)
556	continue;
557	RDA.getReachingLocalUses(MI: &IT, PhysReg: MCRegister::from(ARM::Val: ITSTATE),
558	Uses&: ITBlocks [&IT]);
559	}
560	}
561
562	// If we're removing all of the instructions within an IT block, then
563	// also remove the IT instruction.
564	SmallPtrSet<MachineInstr *, `2`> ModifiedITs;
565	SmallPtrSet<MachineInstr *, `2`> RemoveITs;
566	for (auto *Dead : Killed) {
567	if (MachineOperand *MO =
568	Dead->findRegisterUseOperand(ARM::ITSTATE, /TRI=/nullptr)) {
569	MachineInstr IT = RDA.getMIOperand(MI: Dead, MO&: MO);
570	RemoveITs.insert(Ptr: IT);
571	auto &CurrentBlock = ITBlocks [IT];
572	CurrentBlock.erase(Dead);
573	if (CurrentBlock.empty())
574	ModifiedITs.erase(Ptr: IT);
575	else
576	ModifiedITs.insert(Ptr: IT);
577	}
578	}
579	if (!ModifiedITs.empty())
580	return false;
581	Killed.insert(I: RemoveITs.begin(), E: RemoveITs.end());
582	return true;
583	};
584
585	SmallPtrSet<MachineInstr *, `2`> Uses;
586	if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore))
587	return false;
588
589	if (WontCorruptITs (Uses, RDA)) {
590	ToRemove.insert(I: Uses.begin(), E: Uses.end());
591	LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI
592	<< " - can also remove:\n";
593	for (auto *Use : Uses)
594	dbgs() << " - " << *Use);
595
596	SmallPtrSet<MachineInstr*, `4`> Killed;
597	RDA.collectKilledOperands(MI, Dead&: Killed);
598	if (WontCorruptITs (Killed, RDA)) {
599	ToRemove.insert(I: Killed.begin(), E: Killed.end());
600	LLVM_DEBUG(for (auto *Dead : Killed)
601	dbgs() << " - " << *Dead);
602	}
603	return true;
604	}
605	return false;
606	}
607
608	bool LowOverheadLoop::ValidateTailPredicate() {
609	if (!IsTailPredicationLegal()) {
610	LLVM_DEBUG(if (VCTPs.empty())
611	dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n";
612	dbgs() << "ARM Loops: Tail-predication is not valid.\n");
613	return false;
614	}
615
616	assert(!VCTPs.empty() && "VCTP instruction expected but is not set");
617	assert(ML.getBlocks().size() == `1` &&
618	"Shouldn't be processing a loop with more than one block");
619
620	if (DisableTailPredication) {
621	LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n");
622	return false;
623	}
624
625	if (!VPTState::isValid(RDA)) {
626	LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n");
627	return false;
628	}
629
630	if (!ValidateLiveOuts()) {
631	LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n");
632	return false;
633	}
634
635	// For tail predication, we need to provide the number of elements, instead
636	// of the iteration count, to the loop start instruction. The number of
637	// elements is provided to the vctp instruction, so we need to check that
638	// we can use this register at InsertPt.
639	MachineInstr *VCTP = VCTPs.back();
640	if (Start->getOpcode() == ARM::t2DoLoopStartTP \|\|
641	Start->getOpcode() == ARM::t2WhileLoopStartTP) {
642	TPNumElements = Start->getOperand(i: `2`);
643	StartInsertPt = Start;
644	StartInsertBB = Start->getParent();
645	} else {
646	TPNumElements = VCTP->getOperand(i: `1`);
647	MCRegister NumElements = TPNumElements.getReg().asMCReg();
648
649	// If the register is defined within loop, then we can't perform TP.
650	// TODO: Check whether this is just a mov of a register that would be
651	// available.
652	if (RDA.hasLocalDefBefore(MI: VCTP, PhysReg: NumElements)) {
653	LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
654	return false;
655	}
656
657	// The element count register maybe defined after InsertPt, in which case we
658	// need to try to move either InsertPt or the def so that the [w\|d]lstp can
659	// use the value.
660
661	if (StartInsertPt != StartInsertBB->end() &&
662	!RDA.isReachingDefLiveOut(MI: &*StartInsertPt, PhysReg: NumElements)) {
663	if (auto *ElemDef =
664	RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, PhysReg: NumElements)) {
665	if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) {
666	ElemDef->removeFromParent();
667	StartInsertBB->insert(I: StartInsertPt, MI: ElemDef);
668	LLVM_DEBUG(dbgs()
669	<< "ARM Loops: Moved element count def: " << *ElemDef);
670	} else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) {
671	StartInsertPt ->removeFromParent();
672	StartInsertBB->insertAfter(I: MachineBasicBlock::iterator (ElemDef),
673	MI: &*StartInsertPt);
674	LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
675	} else {
676	// If we fail to move an instruction and the element count is provided
677	// by a mov, use the mov operand if it will have the same value at the
678	// insertion point
679	MachineOperand Operand = ElemDef->getOperand(i: `1`);
680	if (isMovRegOpcode(Opc: ElemDef->getOpcode()) &&
681	RDA.getUniqueReachingMIDef(MI: ElemDef, PhysReg: Operand.getReg().asMCReg()) ==
682	RDA.getUniqueReachingMIDef(MI: &*StartInsertPt,
683	PhysReg: Operand.getReg().asMCReg())) {
684	TPNumElements = Operand;
685	NumElements = TPNumElements.getReg();
686	} else {
687	LLVM_DEBUG(dbgs()
688	<< "ARM Loops: Unable to move element count to loop "
689	<< "start instruction.\n");
690	return false;
691	}
692	}
693	}
694	}
695
696	// Especially in the case of while loops, InsertBB may not be the
697	// preheader, so we need to check that the register isn't redefined
698	// before entering the loop.
699	auto CannotProvideElements = [this](MachineBasicBlock *MBB,
700	MCRegister NumElements) {
701	if (MBB->empty())
702	return false;
703	// NumElements is redefined in this block.
704	if (RDA.hasLocalDefBefore(MI: &MBB->back(), PhysReg: NumElements))
705	return true;
706
707	// Don't continue searching up through multiple predecessors.
708	if (MBB->pred_size() > `1`)
709	return true;
710
711	return false;
712	};
713
714	// Search backwards for a def, until we get to InsertBB.
715	MachineBasicBlock *MBB = Preheader;
716	while (MBB && MBB != StartInsertBB) {
717	if (CannotProvideElements (MBB, NumElements)) {
718	LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n");
719	return false;
720	}
721	MBB = *MBB->pred_begin();
722	}
723	}
724
725	// Could inserting the [W\|D]LSTP cause some unintended affects? In a perfect
726	// world the [w\|d]lstp instruction would be last instruction in the preheader
727	// and so it would only affect instructions within the loop body. But due to
728	// scheduling, and/or the logic in this pass (above), the insertion point can
729	// be moved earlier. So if the Loop Start isn't the last instruction in the
730	// preheader, and if the initial element count is smaller than the vector
731	// width, the Loop Start instruction will immediately generate one or more
732	// false lane mask which can, incorrectly, affect the proceeding MVE
733	// instructions in the preheader.
734	if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) {
735	LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W\|D]LSTP\n");
736	return false;
737	}
738
739	// For any DoubleWidthResultInstrs we found whilst scanning instructions, they
740	// need to compute an output size that is smaller than the VCTP mask operates
741	// on. The VecSize of the DoubleWidthResult is the larger vector size - the
742	// size it extends into, so any VCTP VecSize <= is valid.
743	unsigned VCTPVecSize = getVecSize(MI: *VCTP);
744	for (MachineInstr *MI : DoubleWidthResultInstrs) {
745	unsigned InstrVecSize = getVecSize(MI: *MI);
746	if (InstrVecSize > VCTPVecSize) {
747	LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP "
748	<< "VecSize:\n" << *MI);
749	return false;
750	}
751	}
752
753	// Check that the value change of the element count is what we expect and
754	// that the predication will be equivalent. For this we need:
755	// NumElements = NumElements - VectorWidth. The sub will be a sub immediate
756	// and we can also allow register copies within the chain too.
757	auto IsValidSub = [](MachineInstr MI, int* ExpectedVecWidth) {
758	return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth;
759	};
760
761	MachineBasicBlock *MBB = VCTP->getParent();
762	// Remove modifications to the element count since they have no purpose in a
763	// tail predicated loop. Explicitly refer to the vctp operand no matter which
764	// register NumElements has been assigned to, since that is what the
765	// modifications will be using
766	if (auto *Def = RDA.getUniqueReachingMIDef(
767	MI: &MBB->back(), PhysReg: VCTP->getOperand(i: `1`).getReg().asMCReg())) {
768	SmallPtrSet<MachineInstr*, `2`> ElementChain;
769	SmallPtrSet<MachineInstr*, `2`> Ignore;
770	unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode());
771
772	Ignore.insert(I: VCTPs.begin(), E: VCTPs.end());
773
774	if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) {
775	bool FoundSub = false;
776
777	for (auto *MI : ElementChain) {
778	if (isMovRegOpcode(Opc: MI->getOpcode()))
779	continue;
780
781	if (isSubImmOpcode(Opc: MI->getOpcode())) {
782	if (FoundSub \|\| !IsValidSub (MI, ExpectedVectorWidth)) {
783	LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
784	" count: " << *MI);
785	return false;
786	}
787	FoundSub = true;
788	} else {
789	LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element"
790	" count: " << *MI);
791	return false;
792	}
793	}
794	ToRemove.insert(I: ElementChain.begin(), E: ElementChain.end());
795	}
796	}
797
798	// If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we
799	// can also remove any extra instructions in the preheader, which often
800	// includes a now unused MOV.
801	if ((Start->getOpcode() == ARM::t2DoLoopStartTP \|\|
802	Start->getOpcode() == ARM::t2WhileLoopStartTP) &&
803	Preheader && !Preheader->empty() &&
804	!RDA.hasLocalDefBefore(MI: VCTP, PhysReg: VCTP->getOperand(i: `1`).getReg())) {
805	if (auto *Def = RDA.getUniqueReachingMIDef(
806	MI: &Preheader->back(), PhysReg: VCTP->getOperand(i: `1`).getReg().asMCReg())) {
807	SmallPtrSet<MachineInstr*, `2`> Ignore;
808	Ignore.insert(I: VCTPs.begin(), E: VCTPs.end());
809	TryRemove(MI: Def, RDA, ToRemove, Ignore);
810	}
811	}
812
813	return true;
814	}
815
816	static bool isRegInClass(const MachineOperand &MO,
817	const TargetRegisterClass *Class) {
818	return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg());
819	}
820
821	// MVE 'narrowing' operate on half a lane, reading from half and writing
822	// to half, which are referred to has the top and bottom half. The other
823	// half retains its previous value.
824	static bool retainsPreviousHalfElement(const MachineInstr &MI) {
825	const MCInstrDesc &MCID = MI.getDesc();
826	uint64_t Flags = MCID.TSFlags;
827	return (Flags & ARMII::RetainsPreviousHalfElement) != `0`;
828	}
829
830	// Some MVE instructions read from the top/bottom halves of their operand(s)
831	// and generate a vector result with result elements that are double the
832	// width of the input.
833	static bool producesDoubleWidthResult(const MachineInstr &MI) {
834	const MCInstrDesc &MCID = MI.getDesc();
835	uint64_t Flags = MCID.TSFlags;
836	return (Flags & ARMII::DoubleWidthResult) != `0`;
837	}
838
839	static bool isHorizontalReduction(const MachineInstr &MI) {
840	const MCInstrDesc &MCID = MI.getDesc();
841	uint64_t Flags = MCID.TSFlags;
842	return (Flags & ARMII::HorizontalReduction) != `0`;
843	}
844
845	// Can this instruction generate a non-zero result when given only zeroed
846	// operands? This allows us to know that, given operands with false bytes
847	// zeroed by masked loads, that the result will also contain zeros in those
848	// bytes.
849	static bool canGenerateNonZeros(const MachineInstr &MI) {
850
851	// Check for instructions which can write into a larger element size,
852	// possibly writing into a previous zero'd lane.
853	if (producesDoubleWidthResult(MI))
854	return true;
855
856	switch (MI.getOpcode()) {
857	default:
858	break;
859	// FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
860	// fp16 -> fp32 vector conversions.
861	// Instructions that perform a NOT will generate 1s from 0s.
862	case ARM::MVE_VMVN:
863	case ARM::MVE_VORN:
864	// Count leading zeros will do just that!
865	case ARM::MVE_VCLZs8:
866	case ARM::MVE_VCLZs16:
867	case ARM::MVE_VCLZs32:
868	return true;
869	}
870	return false;
871	}
872
873	// Look at its register uses to see if it only can only receive zeros
874	// into its false lanes which would then produce zeros. Also check that
875	// the output register is also defined by an FalseLanesZero instruction
876	// so that if tail-predication happens, the lanes that aren't updated will
877	// still be zeros.
878	static bool producesFalseLanesZero(MachineInstr &MI,
879	const TargetRegisterClass *QPRs,
880	const ReachingDefAnalysis &RDA,
881	InstSet &FalseLanesZero) {
882	if (canGenerateNonZeros(MI))
883	return false;
884
885	bool isPredicated = isVectorPredicated(MI: &MI);
886	// Predicated loads will write zeros to the falsely predicated bytes of the
887	// destination register.
888	if (MI.mayLoad())
889	return isPredicated;
890
891	auto IsZeroInit = [](MachineInstr *Def) {
892	return !isVectorPredicated(Def) &&
893	Def->getOpcode() == ARM::MVE_VMOVimmi32 &&
894	Def->getOperand(`1`).getImm() == `0`;
895	};
896
897	bool AllowScalars = isHorizontalReduction(MI);
898	for (auto &MO : MI.operands()) {
899	if (!MO.isReg() \|\| !MO.getReg())
900	continue;
901	if (!isRegInClass(MO, Class: QPRs) && AllowScalars)
902	continue;
903	// Skip the lr predicate reg
904	int PIdx = llvm::findFirstVPTPredOperandIdx(MI);
905	if (PIdx != -`1` && (int)MO.getOperandNo() == PIdx + `2`)
906	continue;
907
908	// Check that this instruction will produce zeros in its false lanes:
909	// - If it only consumes false lanes zero or constant 0 (vmov #0)
910	// - If it's predicated, it only matters that it's def register already has
911	// false lane zeros, so we can ignore the uses.
912	SmallPtrSet<MachineInstr *, `2`> Defs;
913	RDA.getGlobalReachingDefs(MI: &MI, PhysReg: MO.getReg(), Defs);
914	if (Defs.empty())
915	return false;
916	for (auto *Def : Defs) {
917	if (Def == &MI \|\| FalseLanesZero.count(Ptr: Def) \|\| IsZeroInit(Def))
918	continue;
919	if (MO.isUse() && isPredicated)
920	continue;
921	return false;
922	}
923	}
924	LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
925	return true;
926	}
927
928	bool LowOverheadLoop::ValidateLiveOuts() {
929	// We want to find out if the tail-predicated version of this loop will
930	// produce the same values as the loop in its original form. For this to
931	// be true, the newly inserted implicit predication must not change the
932	// the (observable) results.
933	// We're doing this because many instructions in the loop will not be
934	// predicated and so the conversion from VPT predication to tail-predication
935	// can result in different values being produced; due to the tail-predication
936	// preventing many instructions from updating their falsely predicated
937	// lanes. This analysis assumes that all the instructions perform lane-wise
938	// operations and don't perform any exchanges.
939	// A masked load, whether through VPT or tail predication, will write zeros
940	// to any of the falsely predicated bytes. So, from the loads, we know that
941	// the false lanes are zeroed and here we're trying to track that those false
942	// lanes remain zero, or where they change, the differences are masked away
943	// by their user(s).
944	// All MVE stores have to be predicated, so we know that any predicate load
945	// operands, or stored results are equivalent already. Other explicitly
946	// predicated instructions will perform the same operation in the original
947	// loop and the tail-predicated form too. Because of this, we can insert
948	// loads, stores and other predicated instructions into our Predicated
949	// set and build from there.
950	const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::i: MQPRRegClassID);
951	SetVector<MachineInstr *> FalseLanesUnknown;
952	SmallPtrSet<MachineInstr *, `4`> FalseLanesZero;
953	SmallPtrSet<MachineInstr *, `4`> Predicated;
954	MachineBasicBlock *Header = ML.getHeader();
955
956	LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n");
957
958	for (auto &MI : *Header) {
959	if (!shouldInspect(MI))
960	continue;
961
962	if (isVCTP(MI: &MI) \|\| isVPTOpcode(Opc: MI.getOpcode()))
963	continue;
964
965	bool isPredicated = isVectorPredicated(MI: &MI);
966	bool retainsOrReduces =
967	retainsPreviousHalfElement(MI) \|\| isHorizontalReduction(MI);
968
969	if (isPredicated)
970	Predicated.insert(Ptr: &MI);
971	if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero))
972	FalseLanesZero.insert(Ptr: &MI);
973	else if (MI.getNumDefs() == `0`)
974	continue;
975	else if (!isPredicated && retainsOrReduces) {
976	LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI);
977	return false;
978	} else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy)
979	FalseLanesUnknown.insert(X: &MI);
980	}
981
982	LLVM_DEBUG({
983	dbgs() << " Predicated:\n";
984	for (auto *I : Predicated)
985	dbgs() << " " << *I;
986	dbgs() << " FalseLanesZero:\n";
987	for (auto *I : FalseLanesZero)
988	dbgs() << " " << *I;
989	dbgs() << " FalseLanesUnknown:\n";
990	for (auto *I : FalseLanesUnknown)
991	dbgs() << " " << *I;
992	});
993
994	auto HasPredicatedUsers = [this](MachineInstr MI, const* MachineOperand &MO,
995	SmallPtrSetImpl<MachineInstr *> &Predicated) {
996	SmallPtrSet<MachineInstr *, `2`> Uses;
997	RDA.getGlobalUses(MI, PhysReg: MO.getReg().asMCReg(), Uses);
998	for (auto *Use : Uses) {
999	if (Use != MI && !Predicated.count(Ptr: Use))
1000	return false;
1001	}
1002	return true;
1003	};
1004
1005	// Visit the unknowns in reverse so that we can start at the values being
1006	// stored and then we can work towards the leaves, hopefully adding more
1007	// instructions to Predicated. Successfully terminating the loop means that
1008	// all the unknown values have to found to be masked by predicated user(s).
1009	// For any unpredicated values, we store them in NonPredicated so that we
1010	// can later check whether these form a reduction.
1011	SmallPtrSet<MachineInstr*, `2`> NonPredicated;
1012	for (auto *MI : reverse(C&: FalseLanesUnknown)) {
1013	for (auto &MO : MI->operands()) {
1014	if (!isRegInClass(MO, Class: QPRs) \|\| !MO.isDef())
1015	continue;
1016	if (!HasPredicatedUsers (MI, MO, Predicated)) {
1017	LLVM_DEBUG(dbgs() << " Found an unknown def of : "
1018	<< TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
1019	NonPredicated.insert(Ptr: MI);
1020	break;
1021	}
1022	}
1023	// Any unknown false lanes have been masked away by the user(s).
1024	if (!NonPredicated.contains(Ptr: MI))
1025	Predicated.insert(Ptr: MI);
1026	}
1027
1028	SmallPtrSet<MachineInstr *, `2`> LiveOutMIs;
1029	SmallVector<MachineBasicBlock *, `2`> ExitBlocks;
1030	ML.getExitBlocks(ExitBlocks);
1031	assert(ML.getNumBlocks() == `1` && "Expected single block loop!");
1032	assert(ExitBlocks.size() == `1` && "Expected a single exit block");
1033	MachineBasicBlock *ExitBB = ExitBlocks.front();
1034	for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
1035	// TODO: Instead of blocking predication, we could move the vctp to the exit
1036	// block and calculate it's operand there in or the preheader.
1037	if (RegMask.PhysReg == ARM::VPR) {
1038	LLVM_DEBUG(dbgs() << " VPR is live in to the exit block.");
1039	return false;
1040	}
1041	// Check Q-regs that are live in the exit blocks. We don't collect scalars
1042	// because they won't be affected by lane predication.
1043	if (QPRs->contains(Reg: RegMask.PhysReg))
1044	if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, PhysReg: RegMask.PhysReg))
1045	LiveOutMIs.insert(Ptr: MI);
1046	}
1047
1048	// We've already validated that any VPT predication within the loop will be
1049	// equivalent when we perform the predication transformation; so we know that
1050	// any VPT predicated instruction is predicated upon VCTP. Any live-out
1051	// instruction needs to be predicated, so check this here. The instructions
1052	// in NonPredicated have been found to be a reduction that we can ensure its
1053	// legality. Any MQPRCopy found will need to validate its input as if it was
1054	// live out.
1055	SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end());
1056	while (!Worklist.empty()) {
1057	MachineInstr *MI = Worklist.pop_back_val();
1058	if (MI->getOpcode() == ARM::MQPRCopy) {
1059	VMOVCopies.insert(Ptr: MI);
1060	MachineInstr *CopySrc =
1061	RDA.getUniqueReachingMIDef(MI, PhysReg: MI->getOperand(i: `1`).getReg());
1062	if (CopySrc)
1063	Worklist.push_back(Elt: CopySrc);
1064	} else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) {
1065	LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI);
1066	VMOVCopies.clear();
1067	return false;
1068	}
1069	}
1070
1071	return true;
1072	}
1073
1074	void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) {
1075	if (Revert)
1076	return;
1077
1078	// Check branch target ranges: WLS[TP] can only branch forwards and LE[TP]
1079	// can only jump back.
1080	auto ValidateRanges = [](MachineInstr Start, MachineInstr End,
1081	ARMBasicBlockUtils *BBUtils, MachineLoop &ML) {
1082	MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd
1083	? End->getOperand(i: `1`).getMBB()
1084	: End->getOperand(i: `2`).getMBB();
1085	// TODO Maybe there's cases where the target doesn't have to be the header,
1086	// but for now be safe and revert.
1087	if (TgtBB != ML.getHeader()) {
1088	LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
1089	return false;
1090	}
1091
1092	// The WLS and LE instructions have 12-bits for the label offset. WLS
1093	// requires a positive offset, while LE uses negative.
1094	if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) \|\|
1095	!BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: `4094`)) {
1096	LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
1097	return false;
1098	}
1099
1100	if (isWhileLoopStart(MI: *Start)) {
1101	MachineBasicBlock TargetBB = getWhileLoopStartTargetBB(MI: Start);
1102	if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) \|\|
1103	!BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: `4094`)) {
1104	LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n");
1105	return false;
1106	}
1107	}
1108	return true;
1109	};
1110
1111	StartInsertPt = MachineBasicBlock::iterator (Start);
1112	StartInsertBB = Start->getParent();
1113	LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at "
1114	<< *StartInsertPt);
1115
1116	Revert = !ValidateRanges (Start, End, BBUtils, ML);
1117	CannotTailPredicate = !ValidateTailPredicate();
1118	}
1119
1120	bool LowOverheadLoop::AddVCTP(MachineInstr *MI) {
1121	LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI);
1122	if (VCTPs.empty()) {
1123	VCTPs.push_back(Elt: MI);
1124	return true;
1125	}
1126
1127	// If we find another VCTP, check whether it uses the same value as the main VCTP.
1128	// If it does, store it in the VCTPs set, else refuse it.
1129	MachineInstr *Prev = VCTPs.back();
1130	if (!Prev->getOperand(i: `1`).isIdenticalTo(Other: MI->getOperand(i: `1`)) \|\|
1131	!RDA.hasSameReachingDef(A: Prev, B: MI, PhysReg: MI->getOperand(i: `1`).getReg().asMCReg())) {
1132	LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
1133	"definition from the main VCTP");
1134	return false;
1135	}
1136	VCTPs.push_back(Elt: MI);
1137	return true;
1138	}
1139
1140	static bool ValidateMVEStore(MachineInstr MI, MachineLoop ML) {
1141
1142	auto GetFrameIndex = [](MachineMemOperand *Operand) {
1143	const PseudoSourceValue *PseudoValue = Operand->getPseudoValue();
1144	if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) {
1145	if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) {
1146	return FS->getFrameIndex();
1147	}
1148	}
1149	return -`1`;
1150	};
1151
1152	auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
1153	switch (I->getOpcode()) {
1154	case ARM::MVE_VSTRWU32:
1155	case ARM::MVE_VLDRWU32: {
1156	return I->getOperand(`1`).getReg() == ARM::SP &&
1157	I->memoperands().size() == `1` &&
1158	GetFrameIndex(I->memoperands().front()) >= `0`;
1159	}
1160	default:
1161	return false;
1162	}
1163	};
1164
1165	// An unpredicated vector register spill is allowed if all of the uses of the
1166	// stack slot are within the loop
1167	if (MI->getOpcode() != ARM::MVE_VSTRWU32 \|\| !IsStackOp(MI))
1168	return false;
1169
1170	// Search all blocks after the loop for accesses to the same stack slot.
1171	// ReachingDefAnalysis doesn't work for sp as it relies on registers being
1172	// live-out (which sp never is) to know what blocks to look in
1173	if (MI->memoperands().size() == `0`)
1174	return false;
1175	int FI = GetFrameIndex (MI->memoperands().front());
1176
1177	auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo();
1178	if (FI == -`1` \|\| !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI))
1179	return false;
1180
1181	SmallVector<MachineBasicBlock *> Frontier;
1182	ML->getExitBlocks(ExitBlocks&: Frontier);
1183	SmallPtrSet<MachineBasicBlock *, `4`> Visited{MI->getParent()};
1184	unsigned Idx = `0`;
1185	while (Idx < Frontier.size()) {
1186	MachineBasicBlock *BB = Frontier [Idx];
1187	bool LookAtSuccessors = true;
1188	for (auto &I : *BB) {
1189	if (!IsStackOp(&I) \|\| I.memoperands().size() == `0`)
1190	continue;
1191	if (GetFrameIndex (I.memoperands().front()) != FI)
1192	continue;
1193	// If this block has a store to the stack slot before any loads then we
1194	// can ignore the block
1195	if (I.getOpcode() == ARM::MVE_VSTRWU32) {
1196	LookAtSuccessors = false;
1197	break;
1198	}
1199	// If the store and the load are using the same stack slot then the
1200	// store isn't valid for tail predication
1201	if (I.getOpcode() == ARM::MVE_VLDRWU32)
1202	return false;
1203	}
1204
1205	if (LookAtSuccessors) {
1206	for (auto *Succ : BB->successors()) {
1207	if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ))
1208	Frontier.push_back(Elt: Succ);
1209	}
1210	}
1211	Visited.insert(Ptr: BB);
1212	Idx++;
1213	}
1214
1215	return true;
1216	}
1217
1218	bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
1219	if (CannotTailPredicate)
1220	return false;
1221
1222	if (!shouldInspect(MI&: *MI))
1223	return true;
1224
1225	if (MI->getOpcode() == ARM::MVE_VPSEL \|\|
1226	MI->getOpcode() == ARM::MVE_VPNOT) {
1227	// TODO: Allow VPSEL and VPNOT, we currently cannot because:
1228	// 1) It will use the VPR as a predicate operand, but doesn't have to be
1229	// instead a VPT block, which means we can assert while building up
1230	// the VPT block because we don't find another VPT or VPST to being a new
1231	// one.
1232	// 2) VPSEL still requires a VPR operand even after tail predicating,
1233	// which means we can't remove it unless there is another
1234	// instruction, such as vcmp, that can provide the VPR def.
1235	return false;
1236	}
1237
1238	// Record all VCTPs and check that they're equivalent to one another.
1239	if (isVCTP(MI) && !AddVCTP(MI))
1240	return false;
1241
1242	// Inspect uses first so that any instructions that alter the VPR don't
1243	// alter the predicate upon themselves.
1244	const MCInstrDesc &MCID = MI->getDesc();
1245	bool IsUse = false;
1246	unsigned LastOpIdx = MI->getNumOperands() - `1`;
1247	for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) {
1248	const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index());
1249	if (!MO.isReg() \|\| !MO.isUse() \|\| MO.getReg() != ARM::VPR)
1250	continue;
1251
1252	if (ARM::isVpred(op: Op.value().OperandType)) {
1253	VPTState::addInst(MI);
1254	IsUse = true;
1255	} else if (MI->getOpcode() != ARM::MVE_VPST) {
1256	LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI);
1257	return false;
1258	}
1259	}
1260
1261	// If we find an instruction that has been marked as not valid for tail
1262	// predication, only allow the instruction if it's contained within a valid
1263	// VPT block.
1264	bool RequiresExplicitPredication =
1265	(MCID.TSFlags & ARMII::ValidForTailPredication) == `0`;
1266	if (isDomainMVE(MI) && RequiresExplicitPredication) {
1267	if (MI->getOpcode() == ARM::MQPRCopy)
1268	return true;
1269	if (!IsUse && producesDoubleWidthResult(MI: *MI)) {
1270	DoubleWidthResultInstrs.insert(Ptr: MI);
1271	return true;
1272	}
1273
1274	LLVM_DEBUG(if (!IsUse) dbgs()
1275	<< "ARM Loops: Can't tail predicate: " << *MI);
1276	return IsUse;
1277	}
1278
1279	// If the instruction is already explicitly predicated, then the conversion
1280	// will be fine, but ensure that all store operations are predicated.
1281	if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML))
1282	return IsUse;
1283
1284	// If this instruction defines the VPR, update the predicate for the
1285	// proceeding instructions.
1286	if (isVectorPredicate(MI)) {
1287	// Clear the existing predicate when we're not in VPT Active state,
1288	// otherwise we add to it.
1289	if (!isVectorPredicated(MI))
1290	VPTState::resetPredicate(MI);
1291	else
1292	VPTState::addPredicate(MI);
1293	}
1294
1295	// Finally once the predicate has been modified, we can start a new VPT
1296	// block if necessary.
1297	if (isVPTOpcode(Opc: MI->getOpcode()))
1298	VPTState::CreateVPTBlock(MI);
1299
1300	return true;
1301	}
1302
1303	bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
1304	const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>();
1305	if (!ST.hasLOB())
1306	return false;
1307
1308	MF = &mf;
1309	LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n");
1310
1311	MLI = &getAnalysis<MachineLoopInfo>();
1312	RDA = &getAnalysis<ReachingDefAnalysis>();
1313	MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
1314	MRI = &MF->getRegInfo();
1315	TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo());
1316	TRI = ST.getRegisterInfo();
1317	BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils (*MF));
1318	BBUtils ->computeAllBlockSizes();
1319	BBUtils ->adjustBBOffsetsAfter(MBB: &MF->front());
1320
1321	bool Changed = false;
1322	for (auto ML : MLI) {
1323	if (ML->isOutermost())
1324	Changed \|= ProcessLoop(ML);
1325	}
1326	Changed \|= RevertNonLoops();
1327	return Changed;
1328	}
1329
1330	bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
1331
1332	bool Changed = false;
1333
1334	// Process inner loops first.
1335	for (MachineLoop L : ML)
1336	Changed \|= ProcessLoop(ML: L);
1337
1338	LLVM_DEBUG({
1339	dbgs() << "ARM Loops: Processing loop containing:\n";
1340	if (auto *Preheader = ML->getLoopPreheader())
1341	dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1342	else if (auto Preheader = MLI->findLoopPreheader(ML, true, true*))
1343	dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n";
1344	for (auto *MBB : ML->getBlocks())
1345	dbgs() << " - Block: " << printMBBReference(*MBB) << "\n";
1346	});
1347
1348	// Search the given block for a loop start instruction. If one isn't found,
1349	// and there's only one predecessor block, search that one too.
1350	std::function<MachineInstr(MachineBasicBlock)> SearchForStart =
1351	[&SearchForStart](MachineBasicBlock MBB) -> MachineInstr {
1352	for (auto &MI : *MBB) {
1353	if (isLoopStart(MI))
1354	return &MI;
1355	}
1356	if (MBB->pred_size() == `1`)
1357	return SearchForStart (*MBB->pred_begin());
1358	return nullptr;
1359	};
1360
1361	LowOverheadLoop LoLoop(ML, MLI, RDA, TRI, *TII);
1362	// Search the preheader for the start intrinsic.
1363	// FIXME: I don't see why we shouldn't be supporting multiple predecessors
1364	// with potentially multiple set.loop.iterations, so we need to enable this.
1365	if (LoLoop.Preheader)
1366	LoLoop.Start = SearchForStart (LoLoop.Preheader);
1367	else
1368	return Changed;
1369
1370	// Find the low-overhead loop components and decide whether or not to fall
1371	// back to a normal loop. Also look for a vctp instructions and decide
1372	// whether we can convert that predicate using tail predication.
1373	for (auto *MBB : reverse(C: ML->getBlocks())) {
1374	for (auto &MI : *MBB) {
1375	if (MI.isDebugValue())
1376	continue;
1377	else if (MI.getOpcode() == ARM::t2LoopDec)
1378	LoLoop.Dec = &MI;
1379	else if (MI.getOpcode() == ARM::t2LoopEnd)
1380	LoLoop.End = &MI;
1381	else if (MI.getOpcode() == ARM::t2LoopEndDec)
1382	LoLoop.End = LoLoop.Dec = &MI;
1383	else if (isLoopStart(MI))
1384	LoLoop.Start = &MI;
1385	else if (MI.getDesc().isCall()) {
1386	// TODO: Though the call will require LE to execute again, does this
1387	// mean we should revert? Always executing LE hopefully should be
1388	// faster than performing a sub,cmp,br or even subs,br.
1389	LoLoop.Revert = true;
1390	LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n");
1391	} else {
1392	// Record VPR defs and build up their corresponding vpt blocks.
1393	// Check we know how to tail predicate any mve instructions.
1394	LoLoop.AnalyseMVEInst(MI: &MI);
1395	}
1396	}
1397	}
1398
1399	LLVM_DEBUG(LoLoop.dump());
1400	if (!LoLoop.FoundAllComponents()) {
1401	LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n");
1402	return Changed;
1403	}
1404
1405	assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart &&
1406	"Expected t2WhileLoopStart to be removed before regalloc!");
1407
1408	// Check that the only instruction using LoopDec is LoopEnd. This can only
1409	// happen when the Dec and End are separate, not a single t2LoopEndDec.
1410	// TODO: Check for copy chains that really have no effect.
1411	if (LoLoop.Dec != LoLoop.End) {
1412	SmallPtrSet<MachineInstr *, `2`> Uses;
1413	RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses);
1414	if (Uses.size() > `1` \|\| !Uses.count(Ptr: LoLoop.End)) {
1415	LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
1416	LoLoop.Revert = true;
1417	}
1418	}
1419	LoLoop.Validate(BBUtils: BBUtils.get());
1420	Expand(LoLoop);
1421	return true;
1422	}
1423
1424	// WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a
1425	// beq that branches to the exit branch.
1426	// TODO: We could also try to generate a cbz if the value in LR is also in
1427	// another low register.
1428	void ARMLowOverheadLoops::RevertWhile(MachineInstr MI) const* {
1429	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI);
1430	MachineBasicBlock DestBB = getWhileLoopStartTargetBB(MI: MI);
1431	unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, `254`) ?
1432	ARM::tBcc : ARM::t2Bcc;
1433
1434	RevertWhileLoopStartLR(MI, TII, BrOpc);
1435	}
1436
1437	void ARMLowOverheadLoops::RevertDo(MachineInstr MI) const* {
1438	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI);
1439	RevertDoLoopStart(MI, TII);
1440	}
1441
1442	bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr MI) const* {
1443	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
1444	MachineBasicBlock *MBB = MI->getParent();
1445	SmallPtrSet<MachineInstr*, `1`> Ignore;
1446	for (auto I = MachineBasicBlock::iterator (MI), E = MBB->end(); I != E; ++I) {
1447	if (I->getOpcode() == ARM::t2LoopEnd) {
1448	Ignore.insert(Ptr: &*I);
1449	break;
1450	}
1451	}
1452
1453	// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
1454	bool SetFlags =
1455	RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore);
1456
1457	llvm::RevertLoopDec(MI, TII, SetFlags);
1458	return SetFlags;
1459	}
1460
1461	// Generate a subs, or sub and cmp, and a branch instead of an LE.
1462	void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr MI, bool* SkipCmp) const {
1463	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI);
1464
1465	MachineBasicBlock *DestBB = MI->getOperand(i: `1`).getMBB();
1466	unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, `254`) ?
1467	ARM::tBcc : ARM::t2Bcc;
1468
1469	llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp);
1470	}
1471
1472	// Generate a subs, or sub and cmp, and a branch instead of an LE.
1473	void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr MI) const* {
1474	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI);
1475	assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!");
1476	MachineBasicBlock *MBB = MI->getParent();
1477
1478	MachineInstrBuilder MIB =
1479	BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
1480	MIB.addDef(ARM::LR);
1481	MIB.add(MO: MI->getOperand(i: `1`));
1482	MIB.addImm(Val: `1`);
1483	MIB.addImm(Val: ARMCC::AL);
1484	MIB.addReg(ARM::NoRegister);
1485	MIB.addReg(ARM::CPSR);
1486	MIB ->getOperand(i: `5`).setIsDef(true);
1487
1488	MachineBasicBlock *DestBB = MI->getOperand(i: `2`).getMBB();
1489	unsigned BrOpc =
1490	BBUtils->isBBInRange(MI, DestBB, `254`) ? ARM::tBcc : ARM::t2Bcc;
1491
1492	// Create bne
1493	MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc));
1494	MIB.add(MO: MI->getOperand(i: `2`)); // branch target
1495	MIB.addImm(Val: ARMCC::NE); // condition code
1496	MIB.addReg(ARM::CPSR);
1497
1498	MI->eraseFromParent();
1499	}
1500
1501	// Perform dead code elimation on the loop iteration count setup expression.
1502	// If we are tail-predicating, the number of elements to be processed is the
1503	// operand of the VCTP instruction in the vector body, see getCount(), which is
1504	// register $r3 in this example:
1505	//
1506	// $lr = big-itercount-expression
1507	// ..
1508	// $lr = t2DoLoopStart renamable $lr
1509	// vector.body:
1510	// ..
1511	// $vpr = MVE_VCTP32 renamable $r3
1512	// renamable $lr = t2LoopDec killed renamable $lr, 1
1513	// t2LoopEnd renamable $lr, %vector.body
1514	// tB %end
1515	//
1516	// What we would like achieve here is to replace the do-loop start pseudo
1517	// instruction t2DoLoopStart with:
1518	//
1519	// $lr = MVE_DLSTP_32 killed renamable $r3
1520	//
1521	// Thus, $r3 which defines the number of elements, is written to $lr,
1522	// and then we want to delete the whole chain that used to define $lr,
1523	// see the comment below how this chain could look like.
1524	//
1525	void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
1526	if (!LoLoop.IsTailPredicationLegal())
1527	return;
1528
1529	LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
1530
1531	MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: `1`);
1532	if (!Def) {
1533	LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
1534	return;
1535	}
1536
1537	// Collect and remove the users of iteration count.
1538	SmallPtrSet<MachineInstr*, `4`> Killed = { LoLoop.Start, LoLoop.Dec,
1539	LoLoop.End };
1540	if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed))
1541	LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
1542	}
1543
1544	MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
1545	LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
1546	// When using tail-predication, try to delete the dead code that was used to
1547	// calculate the number of loop iterations.
1548	IterationCountDCE(LoLoop);
1549
1550	MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt;
1551	MachineInstr *Start = LoLoop.Start;
1552	MachineBasicBlock *MBB = LoLoop.StartInsertBB;
1553	unsigned Opc = LoLoop.getStartOpcode();
1554	MachineOperand &Count = LoLoop.getLoopStartOperand();
1555
1556	// A DLS lr, lr we needn't emit
1557	MachineInstr* NewStart;
1558	if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() &&
1559	Count.getReg() == ARM::LR) {
1560	LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr");
1561	NewStart = nullptr;
1562	} else {
1563	MachineInstrBuilder MIB =
1564	BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc));
1565
1566	MIB.addDef(ARM::LR);
1567	MIB.add(MO: Count);
1568	if (isWhileLoopStart(MI: *Start))
1569	MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start));
1570
1571	LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
1572	NewStart = &*MIB;
1573	}
1574
1575	LoLoop.ToRemove.insert(Ptr: Start);
1576	return NewStart;
1577	}
1578
1579	void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
1580	auto RemovePredicate = [](MachineInstr *MI) {
1581	if (MI->isDebugInstr())
1582	return;
1583	LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI);
1584	int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI);
1585	assert(PIdx >= `1` && "Trying to unpredicate a non-predicated instruction");
1586	assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then &&
1587	"Expected Then predicate!");
1588	MI->getOperand(i: PIdx).setImm(ARMVCC::None);
1589	MI->getOperand(i: PIdx + `1`).setReg(`0`);
1590	};
1591
1592	for (auto &Block : LoLoop.getVPTBlocks()) {
1593	SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts();
1594
1595	auto ReplaceVCMPWithVPT = [&](MachineInstr &TheVCMP, MachineInstr At) {
1596	assert(TheVCMP && "Replacing a removed or non-existent VCMP");
1597	// Replace the VCMP with a VPT
1598	MachineInstrBuilder MIB =
1599	BuildMI(*At->getParent(), At, At->getDebugLoc(),
1600	TII->get(VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode())));
1601	MIB.addImm(Val: ARMVCC::Then);
1602	// Register one
1603	MIB.add(MO: TheVCMP->getOperand(i: `1`));
1604	// Register two
1605	MIB.add(MO: TheVCMP->getOperand(i: `2`));
1606	// The comparison code, e.g. ge, eq, lt
1607	MIB.add(MO: TheVCMP->getOperand(i: `3`));
1608	LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB);
1609	LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
1610	LoLoop.ToRemove.insert(Ptr: TheVCMP);
1611	TheVCMP = nullptr;
1612	};
1613
1614	if (VPTState::isEntryPredicatedOnVCTP(Block, /exclusive/ Exclusive: true)) {
1615	MachineInstr *VPST = Insts.front();
1616	if (VPTState::hasUniformPredicate(Block)) {
1617	// A vpt block starting with VPST, is only predicated upon vctp and has no
1618	// internal vpr defs:
1619	// - Remove vpst.
1620	// - Unpredicate the remaining instructions.
1621	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1622	for (unsigned i = `1`; i < Insts.size(); ++i)
1623	RemovePredicate (Insts [i]);
1624	} else {
1625	// The VPT block has a non-uniform predicate but it uses a vpst and its
1626	// entry is guarded only by a vctp, which means we:
1627	// - Need to remove the original vpst.
1628	// - Then need to unpredicate any following instructions, until
1629	// we come across the divergent vpr def.
1630	// - Insert a new vpst to predicate the instruction(s) that following
1631	// the divergent vpr def.
1632	MachineInstr *Divergent = VPTState::getDivergent(Block);
1633	MachineBasicBlock *MBB = Divergent->getParent();
1634	auto DivergentNext = ++MachineBasicBlock::iterator (Divergent);
1635	while (DivergentNext != MBB->end() && DivergentNext ->isDebugInstr())
1636	++DivergentNext;
1637
1638	bool DivergentNextIsPredicated =
1639	DivergentNext != MBB->end() &&
1640	getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None;
1641
1642	for (auto I = ++MachineBasicBlock::iterator (VPST), E = DivergentNext;
1643	I != E; ++I)
1644	RemovePredicate (&*I);
1645
1646	// Check if the instruction defining vpr is a vcmp so it can be combined
1647	// with the VPST This should be the divergent instruction
1648	MachineInstr *VCMP =
1649	VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != `0` ? Divergent : nullptr;
1650
1651	if (DivergentNextIsPredicated) {
1652	// Insert a VPST at the divergent only if the next instruction
1653	// would actually use it. A VCMP following a VPST can be
1654	// merged into a VPT so do that instead if the VCMP exists.
1655	if (!VCMP) {
1656	// Create a VPST (with a null mask for now, we'll recompute it
1657	// later)
1658	MachineInstrBuilder MIB =
1659	BuildMI(*Divergent->getParent(), Divergent,
1660	Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST));
1661	MIB.addImm(Val: `0`);
1662	LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
1663	LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr());
1664	} else {
1665	// No RDA checks are necessary here since the VPST would have been
1666	// directly after the VCMP
1667	ReplaceVCMPWithVPT (VCMP, VCMP);
1668	}
1669	}
1670	}
1671	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1672	LoLoop.ToRemove.insert(Ptr: VPST);
1673	} else if (Block.containsVCTP()) {
1674	// The vctp will be removed, so either the entire block will be dead or
1675	// the block mask of the vp(s)t will need to be recomputed.
1676	MachineInstr *VPST = Insts.front();
1677	if (Block.size() == `2`) {
1678	assert(VPST->getOpcode() == ARM::MVE_VPST &&
1679	"Found a VPST in an otherwise empty vpt block");
1680	LoLoop.ToRemove.insert(Ptr: VPST);
1681	} else
1682	LoLoop.BlockMasksToRecompute.insert(Ptr: VPST);
1683	} else if (Insts.front()->getOpcode() == ARM::MVE_VPST) {
1684	// If this block starts with a VPST then attempt to merge it with the
1685	// preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT
1686	// block that no longer exists
1687	MachineInstr *VPST = Insts.front();
1688	auto Next = ++MachineBasicBlock::iterator (VPST);
1689	assert(getVPTInstrPredicate(*Next) != ARMVCC::None &&
1690	"The instruction after a VPST must be predicated");
1691	(void)Next;
1692	MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR);
1693	if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) &&
1694	!LoLoop.ToRemove.contains(Ptr: VprDef)) {
1695	MachineInstr *VCMP = VprDef;
1696	// The VCMP and VPST can only be merged if the VCMP's operands will have
1697	// the same values at the VPST.
1698	// If any of the instructions between the VCMP and VPST are predicated
1699	// then a different code path is expected to have merged the VCMP and
1700	// VPST already.
1701	if (std::none_of(first: ++MachineBasicBlock::iterator (VCMP),
1702	last: MachineBasicBlock::iterator (VPST), pred: hasVPRUse) &&
1703	RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: `1`).getReg()) &&
1704	RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: `2`).getReg())) {
1705	ReplaceVCMPWithVPT (VCMP, VPST);
1706	LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST);
1707	LoLoop.ToRemove.insert(Ptr: VPST);
1708	}
1709	}
1710	}
1711	}
1712
1713	LoLoop.ToRemove.insert(I: LoLoop.VCTPs.begin(), E: LoLoop.VCTPs.end());
1714	}
1715
1716	void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
1717
1718	// Combine the LoopDec and LoopEnd instructions into LE(TP).
1719	auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) {
1720	MachineInstr *End = LoLoop.End;
1721	MachineBasicBlock *MBB = End->getParent();
1722	unsigned Opc = LoLoop.IsTailPredicationLegal() ?
1723	ARM::MVE_LETP : ARM::t2LEUpdate;
1724	MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(),
1725	TII->get(Opc));
1726	MIB.addDef(ARM::LR);
1727	unsigned Off = LoLoop.Dec == LoLoop.End ? `1` : `0`;
1728	MIB.add(MO: End->getOperand(i: Off + `0`));
1729	MIB.add(MO: End->getOperand(i: Off + `1`));
1730	LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
1731	LoLoop.ToRemove.insert(Ptr: LoLoop.Dec);
1732	LoLoop.ToRemove.insert(Ptr: End);
1733	return &*MIB;
1734	};
1735
1736	// TODO: We should be able to automatically remove these branches before we
1737	// get here - probably by teaching analyzeBranch about the pseudo
1738	// instructions.
1739	// If there is an unconditional branch, after I, that just branches to the
1740	// next block, remove it.
1741	auto RemoveDeadBranch = [](MachineInstr *I) {
1742	MachineBasicBlock *BB = I->getParent();
1743	MachineInstr *Terminator = &BB->instr_back();
1744	if (Terminator->isUnconditionalBranch() && I != Terminator) {
1745	MachineBasicBlock *Succ = Terminator->getOperand(i: `0`).getMBB();
1746	if (BB->isLayoutSuccessor(MBB: Succ)) {
1747	LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator);
1748	Terminator->eraseFromParent();
1749	}
1750	}
1751	};
1752
1753	// And VMOVCopies need to become 2xVMOVD for tail predication to be valid.
1754	// Anything other MQPRCopy can be converted to MVE_VORR later on.
1755	auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, `4`> &VMOVCopies) {
1756	for (auto *MI : VMOVCopies) {
1757	LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI);
1758	assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1759	MachineBasicBlock *MBB = MI->getParent();
1760	Register Dst = MI->getOperand(i: `0`).getReg();
1761	Register Src = MI->getOperand(i: `1`).getReg();
1762	auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1763	ARM::D0 + (Dst - ARM::Q0) * `2`)
1764	.addReg(ARM::D0 + (Src - ARM::Q0) * `2`)
1765	.add(predOps(ARMCC::AL));
1766	(void)MIB1;
1767	LLVM_DEBUG(dbgs() << " into " << *MIB1);
1768	auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD),
1769	ARM::D0 + (Dst - ARM::Q0) * `2` + `1`)
1770	.addReg(ARM::D0 + (Src - ARM::Q0) * `2` + `1`)
1771	.add(predOps(ARMCC::AL));
1772	LLVM_DEBUG(dbgs() << " and " << *MIB2);
1773	(void)MIB2;
1774	MI->eraseFromParent();
1775	}
1776	};
1777
1778	if (LoLoop.Revert) {
1779	if (isWhileLoopStart(MI: *LoLoop.Start))
1780	RevertWhile(MI: LoLoop.Start);
1781	else
1782	RevertDo(MI: LoLoop.Start);
1783	if (LoLoop.Dec == LoLoop.End)
1784	RevertLoopEndDec(MI: LoLoop.End);
1785	else
1786	RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec));
1787	} else {
1788	ExpandVMOVCopies (LoLoop.VMOVCopies);
1789	LoLoop.Start = ExpandLoopStart(LoLoop);
1790	if (LoLoop.Start)
1791	RemoveDeadBranch (LoLoop.Start);
1792	LoLoop.End = ExpandLoopEnd(LoLoop);
1793	RemoveDeadBranch (LoLoop.End);
1794	if (LoLoop.IsTailPredicationLegal())
1795	ConvertVPTBlocks(LoLoop);
1796	for (auto *I : LoLoop.ToRemove) {
1797	LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
1798	I->eraseFromParent();
1799	}
1800	for (auto *I : LoLoop.BlockMasksToRecompute) {
1801	LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
1802	recomputeVPTBlockMask(Instr&: *I);
1803	LLVM_DEBUG(dbgs() << " ... done: " << *I);
1804	}
1805	}
1806
1807	PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
1808	DFS.ProcessLoop();
1809	const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
1810	fullyRecomputeLiveIns(MBBs: PostOrder);
1811
1812	for (auto *MBB : reverse(C: PostOrder))
1813	recomputeLivenessFlags(MBB&: *MBB);
1814
1815	// We've moved, removed and inserted new instructions, so update RDA.
1816	RDA->reset();
1817	}
1818
1819	bool ARMLowOverheadLoops::RevertNonLoops() {
1820	LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n");
1821	bool Changed = false;
1822
1823	for (auto &MBB : *MF) {
1824	SmallVector<MachineInstr*, `4`> Starts;
1825	SmallVector<MachineInstr*, `4`> Decs;
1826	SmallVector<MachineInstr*, `4`> Ends;
1827	SmallVector<MachineInstr *, `4`> EndDecs;
1828	SmallVector<MachineInstr *, `4`> MQPRCopies;
1829
1830	for (auto &I : MBB) {
1831	if (isLoopStart(MI: I))
1832	Starts.push_back(Elt: &I);
1833	else if (I.getOpcode() == ARM::t2LoopDec)
1834	Decs.push_back(Elt: &I);
1835	else if (I.getOpcode() == ARM::t2LoopEnd)
1836	Ends.push_back(Elt: &I);
1837	else if (I.getOpcode() == ARM::t2LoopEndDec)
1838	EndDecs.push_back(Elt: &I);
1839	else if (I.getOpcode() == ARM::MQPRCopy)
1840	MQPRCopies.push_back(Elt: &I);
1841	}
1842
1843	if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() &&
1844	MQPRCopies.empty())
1845	continue;
1846
1847	Changed = true;
1848
1849	for (auto *Start : Starts) {
1850	if (isWhileLoopStart(MI: *Start))
1851	RevertWhile(MI: Start);
1852	else
1853	RevertDo(MI: Start);
1854	}
1855	for (auto *Dec : Decs)
1856	RevertLoopDec(MI: Dec);
1857
1858	for (auto *End : Ends)
1859	RevertLoopEnd(MI: End);
1860	for (auto *End : EndDecs)
1861	RevertLoopEndDec(MI: End);
1862	for (auto *MI : MQPRCopies) {
1863	LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI);
1864	assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!");
1865	MachineBasicBlock *MBB = MI->getParent();
1866	auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR),
1867	MI->getOperand(`0`).getReg())
1868	.add(MI->getOperand(`1`))
1869	.add(MI->getOperand(`1`));
1870	addUnpredicatedMveVpredROp(MIB, MI->getOperand(i: `0`).getReg());
1871	MI->eraseFromParent();
1872	}
1873	}
1874	return Changed;
1875	}
1876
1877	FunctionPass *llvm::createARMLowOverheadLoopsPass() {
1878	return new ARMLowOverheadLoops ();
1879	}
1880

source code of llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp