1 | //===-- ARMLowOverheadLoops.cpp - CodeGen Low-overhead Loops ---*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Finalize v8.1-m low-overhead loops by converting the associated pseudo |
10 | /// instructions into machine operations. |
11 | /// The expectation is that the loop contains three pseudo instructions: |
12 | /// - t2*LoopStart - placed in the preheader or pre-preheader. The do-loop |
13 | /// form should be in the preheader, whereas the while form should be in the |
14 | /// preheader's only predecessor. |
15 | /// - t2LoopDec - placed within the loop body. |
16 | /// - t2LoopEnd - the loop latch terminator. |
17 | /// |
18 | /// In addition to this, we also look for the presence of the VCTP instruction, |
19 | /// which determines whether we can generate the tail-predicated low-overhead |
20 | /// loop form. |
21 | /// |
22 | /// Assumptions and Dependencies: |
23 | /// Low-overhead loops are constructed and executed using a setup instruction: |
24 | /// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. |
25 | /// WLS(TP) and LE(TP) are branching instructions with a (large) limited range |
26 | /// but fixed polarity: WLS can only branch forwards and LE can only branch |
27 | /// backwards. These restrictions mean that this pass is dependent upon block |
28 | /// layout and block sizes, which is why it's the last pass to run. The same is |
29 | /// true for ConstantIslands, but this pass does not increase the size of the |
30 | /// basic blocks, nor does it change the CFG. Instructions are mainly removed |
31 | /// during the transform and pseudo instructions are replaced by real ones. In |
32 | /// some cases, when we have to revert to a 'normal' loop, we have to introduce |
33 | /// multiple instructions for a single pseudo (see RevertWhile and |
34 | /// RevertLoopEnd). To handle this situation, t2WhileLoopStartLR and t2LoopEnd |
35 | /// are defined to be as large as this maximum sequence of replacement |
36 | /// instructions. |
37 | /// |
38 | /// A note on VPR.P0 (the lane mask): |
39 | /// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a |
40 | /// "VPT Active" context (which includes low-overhead loops and vpt blocks). |
41 | /// They will simply "and" the result of their calculation with the current |
42 | /// value of VPR.P0. You can think of it like this: |
43 | /// \verbatim |
44 | /// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs |
45 | /// VPR.P0 &= Value |
46 | /// else |
47 | /// VPR.P0 = Value |
48 | /// \endverbatim |
49 | /// When we're inside the low-overhead loop (between DLSTP and LETP), we always |
50 | /// fall in the "VPT active" case, so we can consider that all VPR writes by |
51 | /// one of those instructions are actually an "and". |
52 | //===----------------------------------------------------------------------===// |
53 | |
54 | #include "ARM.h" |
55 | #include "ARMBaseInstrInfo.h" |
56 | #include "ARMBaseRegisterInfo.h" |
57 | #include "ARMBasicBlockInfo.h" |
58 | #include "ARMSubtarget.h" |
59 | #include "MVETailPredUtils.h" |
60 | #include "Thumb2InstrInfo.h" |
61 | #include "llvm/ADT/SetOperations.h" |
62 | #include "llvm/ADT/SetVector.h" |
63 | #include "llvm/CodeGen/LivePhysRegs.h" |
64 | #include "llvm/CodeGen/MachineFrameInfo.h" |
65 | #include "llvm/CodeGen/MachineFunctionPass.h" |
66 | #include "llvm/CodeGen/MachineLoopInfo.h" |
67 | #include "llvm/CodeGen/MachineLoopUtils.h" |
68 | #include "llvm/CodeGen/MachineRegisterInfo.h" |
69 | #include "llvm/CodeGen/Passes.h" |
70 | #include "llvm/CodeGen/ReachingDefAnalysis.h" |
71 | #include "llvm/MC/MCInstrDesc.h" |
72 | |
73 | using namespace llvm; |
74 | |
75 | #define DEBUG_TYPE "arm-low-overhead-loops" |
76 | #define ARM_LOW_OVERHEAD_LOOPS_NAME "ARM Low Overhead Loops pass" |
77 | |
78 | static cl::opt<bool> |
79 | DisableTailPredication("arm-loloops-disable-tailpred" , cl::Hidden, |
80 | cl::desc("Disable tail-predication in the ARM LowOverheadLoop pass" ), |
81 | cl::init(Val: false)); |
82 | |
83 | static cl::opt<bool> |
84 | DisableOmitDLS("arm-disable-omit-dls" , cl::Hidden, |
85 | cl::desc("Disable omitting 'dls lr, lr' instructions" ), |
86 | cl::init(Val: false)); |
87 | |
88 | static bool isVectorPredicated(MachineInstr *MI) { |
89 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
90 | return PIdx != -1 && MI->getOperand(i: PIdx + 1).getReg() == ARM::VPR; |
91 | } |
92 | |
93 | static bool isVectorPredicate(MachineInstr *MI) { |
94 | return MI->findRegisterDefOperandIdx(ARM::Reg: VPR, /*TRI=*/nullptr) != -1; |
95 | } |
96 | |
97 | static bool hasVPRUse(MachineInstr &MI) { |
98 | return MI.findRegisterUseOperandIdx(ARM::Reg: VPR, /*TRI=*/nullptr) != -1; |
99 | } |
100 | |
101 | static bool isDomainMVE(MachineInstr *MI) { |
102 | uint64_t Domain = MI->getDesc().TSFlags & ARMII::DomainMask; |
103 | return Domain == ARMII::DomainMVE; |
104 | } |
105 | |
106 | static int getVecSize(const MachineInstr &MI) { |
107 | const MCInstrDesc &MCID = MI.getDesc(); |
108 | uint64_t Flags = MCID.TSFlags; |
109 | return (Flags & ARMII::VecSize) >> ARMII::VecSizeShift; |
110 | } |
111 | |
112 | static bool shouldInspect(MachineInstr &MI) { |
113 | if (MI.isDebugInstr()) |
114 | return false; |
115 | return isDomainMVE(MI: &MI) || isVectorPredicate(MI: &MI) || hasVPRUse(MI); |
116 | } |
117 | |
118 | namespace { |
119 | |
120 | using InstSet = SmallPtrSetImpl<MachineInstr *>; |
121 | |
122 | class PostOrderLoopTraversal { |
123 | MachineLoop &ML; |
124 | MachineLoopInfo &MLI; |
125 | SmallPtrSet<MachineBasicBlock*, 4> Visited; |
126 | SmallVector<MachineBasicBlock*, 4> Order; |
127 | |
128 | public: |
129 | PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) |
130 | : ML(ML), MLI(MLI) { } |
131 | |
132 | const SmallVectorImpl<MachineBasicBlock*> &getOrder() const { |
133 | return Order; |
134 | } |
135 | |
136 | // Visit all the blocks within the loop, as well as exit blocks and any |
137 | // blocks properly dominating the header. |
138 | void ProcessLoop() { |
139 | std::function<void(MachineBasicBlock*)> Search = [this, &Search] |
140 | (MachineBasicBlock *MBB) -> void { |
141 | if (Visited.count(Ptr: MBB)) |
142 | return; |
143 | |
144 | Visited.insert(Ptr: MBB); |
145 | for (auto *Succ : MBB->successors()) { |
146 | if (!ML.contains(BB: Succ)) |
147 | continue; |
148 | Search(Succ); |
149 | } |
150 | Order.push_back(Elt: MBB); |
151 | }; |
152 | |
153 | // Insert exit blocks. |
154 | SmallVector<MachineBasicBlock*, 2> ExitBlocks; |
155 | ML.getExitBlocks(ExitBlocks); |
156 | append_range(C&: Order, R&: ExitBlocks); |
157 | |
158 | // Then add the loop body. |
159 | Search(ML.getHeader()); |
160 | |
161 | // Then try the preheader and its predecessors. |
162 | std::function<void(MachineBasicBlock*)> GetPredecessor = |
163 | [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { |
164 | Order.push_back(Elt: MBB); |
165 | if (MBB->pred_size() == 1) |
166 | GetPredecessor(*MBB->pred_begin()); |
167 | }; |
168 | |
169 | if (auto * = ML.getLoopPreheader()) |
170 | GetPredecessor(Preheader); |
171 | else if (auto * = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
172 | GetPredecessor(Preheader); |
173 | } |
174 | }; |
175 | |
176 | struct PredicatedMI { |
177 | MachineInstr *MI = nullptr; |
178 | SetVector<MachineInstr*> Predicates; |
179 | |
180 | public: |
181 | PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) { |
182 | assert(I && "Instruction must not be null!" ); |
183 | Predicates.insert(Start: Preds.begin(), End: Preds.end()); |
184 | } |
185 | }; |
186 | |
187 | // Represent the current state of the VPR and hold all instances which |
188 | // represent a VPT block, which is a list of instructions that begins with a |
189 | // VPT/VPST and has a maximum of four proceeding instructions. All |
190 | // instructions within the block are predicated upon the vpr and we allow |
191 | // instructions to define the vpr within in the block too. |
192 | class VPTState { |
193 | friend struct LowOverheadLoop; |
194 | |
195 | SmallVector<MachineInstr *, 4> Insts; |
196 | |
197 | static SmallVector<VPTState, 4> Blocks; |
198 | static SetVector<MachineInstr *> CurrentPredicates; |
199 | static std::map<MachineInstr *, |
200 | std::unique_ptr<PredicatedMI>> PredicatedInsts; |
201 | |
202 | static void CreateVPTBlock(MachineInstr *MI) { |
203 | assert((CurrentPredicates.size() || MI->getParent()->isLiveIn(ARM::VPR)) |
204 | && "Can't begin VPT without predicate" ); |
205 | Blocks.emplace_back(Args&: MI); |
206 | // The execution of MI is predicated upon the current set of instructions |
207 | // that are AND'ed together to form the VPR predicate value. In the case |
208 | // that MI is a VPT, CurrentPredicates will also just be MI. |
209 | PredicatedInsts.emplace( |
210 | args&: MI, args: std::make_unique<PredicatedMI>(args&: MI, args&: CurrentPredicates)); |
211 | } |
212 | |
213 | static void reset() { |
214 | Blocks.clear(); |
215 | PredicatedInsts.clear(); |
216 | CurrentPredicates.clear(); |
217 | } |
218 | |
219 | static void addInst(MachineInstr *MI) { |
220 | Blocks.back().insert(MI); |
221 | PredicatedInsts.emplace( |
222 | args&: MI, args: std::make_unique<PredicatedMI>(args&: MI, args&: CurrentPredicates)); |
223 | } |
224 | |
225 | static void addPredicate(MachineInstr *MI) { |
226 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VPT Predicate: " << *MI); |
227 | CurrentPredicates.insert(X: MI); |
228 | } |
229 | |
230 | static void resetPredicate(MachineInstr *MI) { |
231 | LLVM_DEBUG(dbgs() << "ARM Loops: Resetting VPT Predicate: " << *MI); |
232 | CurrentPredicates.clear(); |
233 | CurrentPredicates.insert(X: MI); |
234 | } |
235 | |
236 | public: |
237 | // Have we found an instruction within the block which defines the vpr? If |
238 | // so, not all the instructions in the block will have the same predicate. |
239 | static bool hasUniformPredicate(VPTState &Block) { |
240 | return getDivergent(Block) == nullptr; |
241 | } |
242 | |
243 | // If it exists, return the first internal instruction which modifies the |
244 | // VPR. |
245 | static MachineInstr *getDivergent(VPTState &Block) { |
246 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
247 | for (unsigned i = 1; i < Insts.size(); ++i) { |
248 | MachineInstr *Next = Insts[i]; |
249 | if (isVectorPredicate(MI: Next)) |
250 | return Next; // Found an instruction altering the vpr. |
251 | } |
252 | return nullptr; |
253 | } |
254 | |
255 | // Return whether the given instruction is predicated upon a VCTP. |
256 | static bool isPredicatedOnVCTP(MachineInstr *MI, bool Exclusive = false) { |
257 | SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates; |
258 | if (Exclusive && Predicates.size() != 1) |
259 | return false; |
260 | return llvm::any_of(Range&: Predicates, P: isVCTP); |
261 | } |
262 | |
263 | // Is the VPST, controlling the block entry, predicated upon a VCTP. |
264 | static bool isEntryPredicatedOnVCTP(VPTState &Block, |
265 | bool Exclusive = false) { |
266 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
267 | return isPredicatedOnVCTP(MI: Insts.front(), Exclusive); |
268 | } |
269 | |
270 | // If this block begins with a VPT, we can check whether it's using |
271 | // at least one predicated input(s), as well as possible loop invariant |
272 | // which would result in it being implicitly predicated. |
273 | static bool hasImplicitlyValidVPT(VPTState &Block, |
274 | ReachingDefAnalysis &RDA) { |
275 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
276 | MachineInstr *VPT = Insts.front(); |
277 | assert(isVPTOpcode(VPT->getOpcode()) && |
278 | "Expected VPT block to begin with VPT/VPST" ); |
279 | |
280 | if (VPT->getOpcode() == ARM::MVE_VPST) |
281 | return false; |
282 | |
283 | auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { |
284 | MachineInstr *Op = RDA.getMIOperand(MI, MO&: MI->getOperand(i: Idx)); |
285 | return Op && PredicatedInsts.count(x: Op) && isPredicatedOnVCTP(MI: Op); |
286 | }; |
287 | |
288 | auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { |
289 | MachineOperand &MO = MI->getOperand(i: Idx); |
290 | if (!MO.isReg() || !MO.getReg()) |
291 | return true; |
292 | |
293 | SmallPtrSet<MachineInstr *, 2> Defs; |
294 | RDA.getGlobalReachingDefs(MI, PhysReg: MO.getReg(), Defs); |
295 | if (Defs.empty()) |
296 | return true; |
297 | |
298 | for (auto *Def : Defs) |
299 | if (Def->getParent() == VPT->getParent()) |
300 | return false; |
301 | return true; |
302 | }; |
303 | |
304 | // Check that at least one of the operands is directly predicated on a |
305 | // vctp and allow an invariant value too. |
306 | return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && |
307 | (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && |
308 | (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); |
309 | } |
310 | |
311 | static bool isValid(ReachingDefAnalysis &RDA) { |
312 | // All predication within the loop should be based on vctp. If the block |
313 | // isn't predicated on entry, check whether the vctp is within the block |
314 | // and that all other instructions are then predicated on it. |
315 | for (auto &Block : Blocks) { |
316 | if (isEntryPredicatedOnVCTP(Block, Exclusive: false) || |
317 | hasImplicitlyValidVPT(Block, RDA)) |
318 | continue; |
319 | |
320 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
321 | // We don't know how to convert a block with just a VPT;VCTP into |
322 | // anything valid once we remove the VCTP. For now just bail out. |
323 | assert(isVPTOpcode(Insts.front()->getOpcode()) && |
324 | "Expected VPT block to start with a VPST or VPT!" ); |
325 | if (Insts.size() == 2 && Insts.front()->getOpcode() != ARM::MVE_VPST && |
326 | isVCTP(MI: Insts.back())) |
327 | return false; |
328 | |
329 | for (auto *MI : Insts) { |
330 | // Check that any internal VCTPs are 'Then' predicated. |
331 | if (isVCTP(MI) && getVPTInstrPredicate(MI: *MI) != ARMVCC::Then) |
332 | return false; |
333 | // Skip other instructions that build up the predicate. |
334 | if (MI->getOpcode() == ARM::MVE_VPST || isVectorPredicate(MI)) |
335 | continue; |
336 | // Check that any other instructions are predicated upon a vctp. |
337 | // TODO: We could infer when VPTs are implicitly predicated on the |
338 | // vctp (when the operands are predicated). |
339 | if (!isPredicatedOnVCTP(MI)) { |
340 | LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *MI); |
341 | return false; |
342 | } |
343 | } |
344 | } |
345 | return true; |
346 | } |
347 | |
348 | VPTState(MachineInstr *MI) { Insts.push_back(Elt: MI); } |
349 | |
350 | void insert(MachineInstr *MI) { |
351 | Insts.push_back(Elt: MI); |
352 | // VPT/VPST + 4 predicated instructions. |
353 | assert(Insts.size() <= 5 && "Too many instructions in VPT block!" ); |
354 | } |
355 | |
356 | bool containsVCTP() const { |
357 | return llvm::any_of(Range: Insts, P: isVCTP); |
358 | } |
359 | |
360 | unsigned size() const { return Insts.size(); } |
361 | SmallVectorImpl<MachineInstr *> &getInsts() { return Insts; } |
362 | }; |
363 | |
364 | struct LowOverheadLoop { |
365 | |
366 | MachineLoop &ML; |
367 | MachineBasicBlock * = nullptr; |
368 | MachineLoopInfo &MLI; |
369 | ReachingDefAnalysis &RDA; |
370 | const TargetRegisterInfo &TRI; |
371 | const ARMBaseInstrInfo &TII; |
372 | MachineFunction *MF = nullptr; |
373 | MachineBasicBlock::iterator StartInsertPt; |
374 | MachineBasicBlock *StartInsertBB = nullptr; |
375 | MachineInstr *Start = nullptr; |
376 | MachineInstr *Dec = nullptr; |
377 | MachineInstr *End = nullptr; |
378 | MachineOperand TPNumElements; |
379 | SmallVector<MachineInstr *, 4> VCTPs; |
380 | SmallPtrSet<MachineInstr *, 4> ToRemove; |
381 | SmallPtrSet<MachineInstr *, 4> BlockMasksToRecompute; |
382 | SmallPtrSet<MachineInstr *, 4> DoubleWidthResultInstrs; |
383 | SmallPtrSet<MachineInstr *, 4> VMOVCopies; |
384 | bool Revert = false; |
385 | bool CannotTailPredicate = false; |
386 | |
387 | LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, |
388 | ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, |
389 | const ARMBaseInstrInfo &TII) |
390 | : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), |
391 | TPNumElements(MachineOperand::CreateImm(Val: 0)) { |
392 | MF = ML.getHeader()->getParent(); |
393 | if (auto *MBB = ML.getLoopPreheader()) |
394 | Preheader = MBB; |
395 | else if (auto *MBB = MLI.findLoopPreheader(L: &ML, SpeculativePreheader: true, FindMultiLoopPreheader: true)) |
396 | Preheader = MBB; |
397 | VPTState::reset(); |
398 | } |
399 | |
400 | // If this is an MVE instruction, check that we know how to use tail |
401 | // predication with it. Record VPT blocks and return whether the |
402 | // instruction is valid for tail predication. |
403 | bool ValidateMVEInst(MachineInstr *MI); |
404 | |
405 | void AnalyseMVEInst(MachineInstr *MI) { |
406 | CannotTailPredicate = !ValidateMVEInst(MI); |
407 | } |
408 | |
409 | bool IsTailPredicationLegal() const { |
410 | // For now, let's keep things really simple and only support a single |
411 | // block for tail predication. |
412 | return !Revert && FoundAllComponents() && !VCTPs.empty() && |
413 | !CannotTailPredicate && ML.getNumBlocks() == 1; |
414 | } |
415 | |
416 | // Given that MI is a VCTP, check that is equivalent to any other VCTPs |
417 | // found. |
418 | bool AddVCTP(MachineInstr *MI); |
419 | |
420 | // Check that the predication in the loop will be equivalent once we |
421 | // perform the conversion. Also ensure that we can provide the number |
422 | // of elements to the loop start instruction. |
423 | bool ValidateTailPredicate(); |
424 | |
425 | // Check that any values available outside of the loop will be the same |
426 | // after tail predication conversion. |
427 | bool ValidateLiveOuts(); |
428 | |
429 | // Check the branch targets are within range and we satisfy our |
430 | // restrictions. |
431 | void Validate(ARMBasicBlockUtils *BBUtils); |
432 | |
433 | bool FoundAllComponents() const { |
434 | return Start && Dec && End; |
435 | } |
436 | |
437 | SmallVectorImpl<VPTState> &getVPTBlocks() { |
438 | return VPTState::Blocks; |
439 | } |
440 | |
441 | // Return the operand for the loop start instruction. This will be the loop |
442 | // iteration count, or the number of elements if we're tail predicating. |
443 | MachineOperand &getLoopStartOperand() { |
444 | if (IsTailPredicationLegal()) |
445 | return TPNumElements; |
446 | return Start->getOperand(i: 1); |
447 | } |
448 | |
449 | unsigned getStartOpcode() const { |
450 | bool IsDo = isDoLoopStart(MI: *Start); |
451 | if (!IsTailPredicationLegal()) |
452 | return IsDo ? ARM::t2DLS : ARM::t2WLS; |
453 | |
454 | return VCTPOpcodeToLSTP(Opcode: VCTPs.back()->getOpcode(), IsDoLoop: IsDo); |
455 | } |
456 | |
457 | void dump() const { |
458 | if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; |
459 | if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; |
460 | if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; |
461 | if (!VCTPs.empty()) { |
462 | dbgs() << "ARM Loops: Found VCTP(s):\n" ; |
463 | for (auto *MI : VCTPs) |
464 | dbgs() << " - " << *MI; |
465 | } |
466 | if (!FoundAllComponents()) |
467 | dbgs() << "ARM Loops: Not a low-overhead loop.\n" ; |
468 | else if (!(Start && Dec && End)) |
469 | dbgs() << "ARM Loops: Failed to find all loop components.\n" ; |
470 | } |
471 | }; |
472 | |
473 | class ARMLowOverheadLoops : public MachineFunctionPass { |
474 | MachineFunction *MF = nullptr; |
475 | MachineLoopInfo *MLI = nullptr; |
476 | ReachingDefAnalysis *RDA = nullptr; |
477 | const ARMBaseInstrInfo *TII = nullptr; |
478 | MachineRegisterInfo *MRI = nullptr; |
479 | const TargetRegisterInfo *TRI = nullptr; |
480 | std::unique_ptr<ARMBasicBlockUtils> BBUtils = nullptr; |
481 | |
482 | public: |
483 | static char ID; |
484 | |
485 | ARMLowOverheadLoops() : MachineFunctionPass(ID) { } |
486 | |
487 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
488 | AU.setPreservesCFG(); |
489 | AU.addRequired<MachineLoopInfo>(); |
490 | AU.addRequired<ReachingDefAnalysis>(); |
491 | MachineFunctionPass::getAnalysisUsage(AU); |
492 | } |
493 | |
494 | bool runOnMachineFunction(MachineFunction &MF) override; |
495 | |
496 | MachineFunctionProperties getRequiredProperties() const override { |
497 | return MachineFunctionProperties().set( |
498 | MachineFunctionProperties::Property::NoVRegs).set( |
499 | MachineFunctionProperties::Property::TracksLiveness); |
500 | } |
501 | |
502 | StringRef getPassName() const override { |
503 | return ARM_LOW_OVERHEAD_LOOPS_NAME; |
504 | } |
505 | |
506 | private: |
507 | bool ProcessLoop(MachineLoop *ML); |
508 | |
509 | bool RevertNonLoops(); |
510 | |
511 | void RevertWhile(MachineInstr *MI) const; |
512 | void RevertDo(MachineInstr *MI) const; |
513 | |
514 | bool RevertLoopDec(MachineInstr *MI) const; |
515 | |
516 | void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; |
517 | |
518 | void RevertLoopEndDec(MachineInstr *MI) const; |
519 | |
520 | void ConvertVPTBlocks(LowOverheadLoop &LoLoop); |
521 | |
522 | MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); |
523 | |
524 | void Expand(LowOverheadLoop &LoLoop); |
525 | |
526 | void IterationCountDCE(LowOverheadLoop &LoLoop); |
527 | }; |
528 | } |
529 | |
530 | char ARMLowOverheadLoops::ID = 0; |
531 | |
532 | SmallVector<VPTState, 4> VPTState::Blocks; |
533 | SetVector<MachineInstr *> VPTState::CurrentPredicates; |
534 | std::map<MachineInstr *, |
535 | std::unique_ptr<PredicatedMI>> VPTState::PredicatedInsts; |
536 | |
537 | INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, |
538 | false, false) |
539 | |
540 | static bool TryRemove(MachineInstr *MI, ReachingDefAnalysis &RDA, |
541 | InstSet &ToRemove, InstSet &Ignore) { |
542 | |
543 | // Check that we can remove all of Killed without having to modify any IT |
544 | // blocks. |
545 | auto WontCorruptITs = [](InstSet &Killed, ReachingDefAnalysis &RDA) { |
546 | // Collect the dead code and the MBBs in which they reside. |
547 | SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks; |
548 | for (auto *Dead : Killed) |
549 | BasicBlocks.insert(Ptr: Dead->getParent()); |
550 | |
551 | // Collect IT blocks in all affected basic blocks. |
552 | std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks; |
553 | for (auto *MBB : BasicBlocks) { |
554 | for (auto &IT : *MBB) { |
555 | if (IT.getOpcode() != ARM::t2IT) |
556 | continue; |
557 | RDA.getReachingLocalUses(MI: &IT, PhysReg: MCRegister::from(ARM::Val: ITSTATE), |
558 | Uses&: ITBlocks[&IT]); |
559 | } |
560 | } |
561 | |
562 | // If we're removing all of the instructions within an IT block, then |
563 | // also remove the IT instruction. |
564 | SmallPtrSet<MachineInstr *, 2> ModifiedITs; |
565 | SmallPtrSet<MachineInstr *, 2> RemoveITs; |
566 | for (auto *Dead : Killed) { |
567 | if (MachineOperand *MO = |
568 | Dead->findRegisterUseOperand(ARM::ITSTATE, /*TRI=*/nullptr)) { |
569 | MachineInstr *IT = RDA.getMIOperand(MI: Dead, MO&: *MO); |
570 | RemoveITs.insert(Ptr: IT); |
571 | auto &CurrentBlock = ITBlocks[IT]; |
572 | CurrentBlock.erase(Dead); |
573 | if (CurrentBlock.empty()) |
574 | ModifiedITs.erase(Ptr: IT); |
575 | else |
576 | ModifiedITs.insert(Ptr: IT); |
577 | } |
578 | } |
579 | if (!ModifiedITs.empty()) |
580 | return false; |
581 | Killed.insert(I: RemoveITs.begin(), E: RemoveITs.end()); |
582 | return true; |
583 | }; |
584 | |
585 | SmallPtrSet<MachineInstr *, 2> Uses; |
586 | if (!RDA.isSafeToRemove(MI, ToRemove&: Uses, Ignore)) |
587 | return false; |
588 | |
589 | if (WontCorruptITs(Uses, RDA)) { |
590 | ToRemove.insert(I: Uses.begin(), E: Uses.end()); |
591 | LLVM_DEBUG(dbgs() << "ARM Loops: Able to remove: " << *MI |
592 | << " - can also remove:\n" ; |
593 | for (auto *Use : Uses) |
594 | dbgs() << " - " << *Use); |
595 | |
596 | SmallPtrSet<MachineInstr*, 4> Killed; |
597 | RDA.collectKilledOperands(MI, Dead&: Killed); |
598 | if (WontCorruptITs(Killed, RDA)) { |
599 | ToRemove.insert(I: Killed.begin(), E: Killed.end()); |
600 | LLVM_DEBUG(for (auto *Dead : Killed) |
601 | dbgs() << " - " << *Dead); |
602 | } |
603 | return true; |
604 | } |
605 | return false; |
606 | } |
607 | |
608 | bool LowOverheadLoop::ValidateTailPredicate() { |
609 | if (!IsTailPredicationLegal()) { |
610 | LLVM_DEBUG(if (VCTPs.empty()) |
611 | dbgs() << "ARM Loops: Didn't find a VCTP instruction.\n" ; |
612 | dbgs() << "ARM Loops: Tail-predication is not valid.\n" ); |
613 | return false; |
614 | } |
615 | |
616 | assert(!VCTPs.empty() && "VCTP instruction expected but is not set" ); |
617 | assert(ML.getBlocks().size() == 1 && |
618 | "Shouldn't be processing a loop with more than one block" ); |
619 | |
620 | if (DisableTailPredication) { |
621 | LLVM_DEBUG(dbgs() << "ARM Loops: tail-predication is disabled\n" ); |
622 | return false; |
623 | } |
624 | |
625 | if (!VPTState::isValid(RDA)) { |
626 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid VPT state.\n" ); |
627 | return false; |
628 | } |
629 | |
630 | if (!ValidateLiveOuts()) { |
631 | LLVM_DEBUG(dbgs() << "ARM Loops: Invalid live outs.\n" ); |
632 | return false; |
633 | } |
634 | |
635 | // For tail predication, we need to provide the number of elements, instead |
636 | // of the iteration count, to the loop start instruction. The number of |
637 | // elements is provided to the vctp instruction, so we need to check that |
638 | // we can use this register at InsertPt. |
639 | MachineInstr *VCTP = VCTPs.back(); |
640 | if (Start->getOpcode() == ARM::t2DoLoopStartTP || |
641 | Start->getOpcode() == ARM::t2WhileLoopStartTP) { |
642 | TPNumElements = Start->getOperand(i: 2); |
643 | StartInsertPt = Start; |
644 | StartInsertBB = Start->getParent(); |
645 | } else { |
646 | TPNumElements = VCTP->getOperand(i: 1); |
647 | MCRegister NumElements = TPNumElements.getReg().asMCReg(); |
648 | |
649 | // If the register is defined within loop, then we can't perform TP. |
650 | // TODO: Check whether this is just a mov of a register that would be |
651 | // available. |
652 | if (RDA.hasLocalDefBefore(MI: VCTP, PhysReg: NumElements)) { |
653 | LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n" ); |
654 | return false; |
655 | } |
656 | |
657 | // The element count register maybe defined after InsertPt, in which case we |
658 | // need to try to move either InsertPt or the def so that the [w|d]lstp can |
659 | // use the value. |
660 | |
661 | if (StartInsertPt != StartInsertBB->end() && |
662 | !RDA.isReachingDefLiveOut(MI: &*StartInsertPt, PhysReg: NumElements)) { |
663 | if (auto *ElemDef = |
664 | RDA.getLocalLiveOutMIDef(MBB: StartInsertBB, PhysReg: NumElements)) { |
665 | if (RDA.isSafeToMoveForwards(From: ElemDef, To: &*StartInsertPt)) { |
666 | ElemDef->removeFromParent(); |
667 | StartInsertBB->insert(I: StartInsertPt, MI: ElemDef); |
668 | LLVM_DEBUG(dbgs() |
669 | << "ARM Loops: Moved element count def: " << *ElemDef); |
670 | } else if (RDA.isSafeToMoveBackwards(From: &*StartInsertPt, To: ElemDef)) { |
671 | StartInsertPt->removeFromParent(); |
672 | StartInsertBB->insertAfter(I: MachineBasicBlock::iterator(ElemDef), |
673 | MI: &*StartInsertPt); |
674 | LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); |
675 | } else { |
676 | // If we fail to move an instruction and the element count is provided |
677 | // by a mov, use the mov operand if it will have the same value at the |
678 | // insertion point |
679 | MachineOperand Operand = ElemDef->getOperand(i: 1); |
680 | if (isMovRegOpcode(Opc: ElemDef->getOpcode()) && |
681 | RDA.getUniqueReachingMIDef(MI: ElemDef, PhysReg: Operand.getReg().asMCReg()) == |
682 | RDA.getUniqueReachingMIDef(MI: &*StartInsertPt, |
683 | PhysReg: Operand.getReg().asMCReg())) { |
684 | TPNumElements = Operand; |
685 | NumElements = TPNumElements.getReg(); |
686 | } else { |
687 | LLVM_DEBUG(dbgs() |
688 | << "ARM Loops: Unable to move element count to loop " |
689 | << "start instruction.\n" ); |
690 | return false; |
691 | } |
692 | } |
693 | } |
694 | } |
695 | |
696 | // Especially in the case of while loops, InsertBB may not be the |
697 | // preheader, so we need to check that the register isn't redefined |
698 | // before entering the loop. |
699 | auto CannotProvideElements = [this](MachineBasicBlock *MBB, |
700 | MCRegister NumElements) { |
701 | if (MBB->empty()) |
702 | return false; |
703 | // NumElements is redefined in this block. |
704 | if (RDA.hasLocalDefBefore(MI: &MBB->back(), PhysReg: NumElements)) |
705 | return true; |
706 | |
707 | // Don't continue searching up through multiple predecessors. |
708 | if (MBB->pred_size() > 1) |
709 | return true; |
710 | |
711 | return false; |
712 | }; |
713 | |
714 | // Search backwards for a def, until we get to InsertBB. |
715 | MachineBasicBlock *MBB = Preheader; |
716 | while (MBB && MBB != StartInsertBB) { |
717 | if (CannotProvideElements(MBB, NumElements)) { |
718 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to provide element count.\n" ); |
719 | return false; |
720 | } |
721 | MBB = *MBB->pred_begin(); |
722 | } |
723 | } |
724 | |
725 | // Could inserting the [W|D]LSTP cause some unintended affects? In a perfect |
726 | // world the [w|d]lstp instruction would be last instruction in the preheader |
727 | // and so it would only affect instructions within the loop body. But due to |
728 | // scheduling, and/or the logic in this pass (above), the insertion point can |
729 | // be moved earlier. So if the Loop Start isn't the last instruction in the |
730 | // preheader, and if the initial element count is smaller than the vector |
731 | // width, the Loop Start instruction will immediately generate one or more |
732 | // false lane mask which can, incorrectly, affect the proceeding MVE |
733 | // instructions in the preheader. |
734 | if (std::any_of(first: StartInsertPt, last: StartInsertBB->end(), pred: shouldInspect)) { |
735 | LLVM_DEBUG(dbgs() << "ARM Loops: Instruction blocks [W|D]LSTP\n" ); |
736 | return false; |
737 | } |
738 | |
739 | // For any DoubleWidthResultInstrs we found whilst scanning instructions, they |
740 | // need to compute an output size that is smaller than the VCTP mask operates |
741 | // on. The VecSize of the DoubleWidthResult is the larger vector size - the |
742 | // size it extends into, so any VCTP VecSize <= is valid. |
743 | unsigned VCTPVecSize = getVecSize(MI: *VCTP); |
744 | for (MachineInstr *MI : DoubleWidthResultInstrs) { |
745 | unsigned InstrVecSize = getVecSize(MI: *MI); |
746 | if (InstrVecSize > VCTPVecSize) { |
747 | LLVM_DEBUG(dbgs() << "ARM Loops: Double width result larger than VCTP " |
748 | << "VecSize:\n" << *MI); |
749 | return false; |
750 | } |
751 | } |
752 | |
753 | // Check that the value change of the element count is what we expect and |
754 | // that the predication will be equivalent. For this we need: |
755 | // NumElements = NumElements - VectorWidth. The sub will be a sub immediate |
756 | // and we can also allow register copies within the chain too. |
757 | auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) { |
758 | return -getAddSubImmediate(MI&: *MI) == ExpectedVecWidth; |
759 | }; |
760 | |
761 | MachineBasicBlock *MBB = VCTP->getParent(); |
762 | // Remove modifications to the element count since they have no purpose in a |
763 | // tail predicated loop. Explicitly refer to the vctp operand no matter which |
764 | // register NumElements has been assigned to, since that is what the |
765 | // modifications will be using |
766 | if (auto *Def = RDA.getUniqueReachingMIDef( |
767 | MI: &MBB->back(), PhysReg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
768 | SmallPtrSet<MachineInstr*, 2> ElementChain; |
769 | SmallPtrSet<MachineInstr*, 2> Ignore; |
770 | unsigned ExpectedVectorWidth = getTailPredVectorWidth(Opcode: VCTP->getOpcode()); |
771 | |
772 | Ignore.insert(I: VCTPs.begin(), E: VCTPs.end()); |
773 | |
774 | if (TryRemove(MI: Def, RDA, ToRemove&: ElementChain, Ignore)) { |
775 | bool FoundSub = false; |
776 | |
777 | for (auto *MI : ElementChain) { |
778 | if (isMovRegOpcode(Opc: MI->getOpcode())) |
779 | continue; |
780 | |
781 | if (isSubImmOpcode(Opc: MI->getOpcode())) { |
782 | if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth)) { |
783 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
784 | " count: " << *MI); |
785 | return false; |
786 | } |
787 | FoundSub = true; |
788 | } else { |
789 | LLVM_DEBUG(dbgs() << "ARM Loops: Unexpected instruction in element" |
790 | " count: " << *MI); |
791 | return false; |
792 | } |
793 | } |
794 | ToRemove.insert(I: ElementChain.begin(), E: ElementChain.end()); |
795 | } |
796 | } |
797 | |
798 | // If we converted the LoopStart to a t2DoLoopStartTP/t2WhileLoopStartTP, we |
799 | // can also remove any extra instructions in the preheader, which often |
800 | // includes a now unused MOV. |
801 | if ((Start->getOpcode() == ARM::t2DoLoopStartTP || |
802 | Start->getOpcode() == ARM::t2WhileLoopStartTP) && |
803 | Preheader && !Preheader->empty() && |
804 | !RDA.hasLocalDefBefore(MI: VCTP, PhysReg: VCTP->getOperand(i: 1).getReg())) { |
805 | if (auto *Def = RDA.getUniqueReachingMIDef( |
806 | MI: &Preheader->back(), PhysReg: VCTP->getOperand(i: 1).getReg().asMCReg())) { |
807 | SmallPtrSet<MachineInstr*, 2> Ignore; |
808 | Ignore.insert(I: VCTPs.begin(), E: VCTPs.end()); |
809 | TryRemove(MI: Def, RDA, ToRemove, Ignore); |
810 | } |
811 | } |
812 | |
813 | return true; |
814 | } |
815 | |
816 | static bool isRegInClass(const MachineOperand &MO, |
817 | const TargetRegisterClass *Class) { |
818 | return MO.isReg() && MO.getReg() && Class->contains(Reg: MO.getReg()); |
819 | } |
820 | |
821 | // MVE 'narrowing' operate on half a lane, reading from half and writing |
822 | // to half, which are referred to has the top and bottom half. The other |
823 | // half retains its previous value. |
824 | static bool retainsPreviousHalfElement(const MachineInstr &MI) { |
825 | const MCInstrDesc &MCID = MI.getDesc(); |
826 | uint64_t Flags = MCID.TSFlags; |
827 | return (Flags & ARMII::RetainsPreviousHalfElement) != 0; |
828 | } |
829 | |
830 | // Some MVE instructions read from the top/bottom halves of their operand(s) |
831 | // and generate a vector result with result elements that are double the |
832 | // width of the input. |
833 | static bool producesDoubleWidthResult(const MachineInstr &MI) { |
834 | const MCInstrDesc &MCID = MI.getDesc(); |
835 | uint64_t Flags = MCID.TSFlags; |
836 | return (Flags & ARMII::DoubleWidthResult) != 0; |
837 | } |
838 | |
839 | static bool isHorizontalReduction(const MachineInstr &MI) { |
840 | const MCInstrDesc &MCID = MI.getDesc(); |
841 | uint64_t Flags = MCID.TSFlags; |
842 | return (Flags & ARMII::HorizontalReduction) != 0; |
843 | } |
844 | |
845 | // Can this instruction generate a non-zero result when given only zeroed |
846 | // operands? This allows us to know that, given operands with false bytes |
847 | // zeroed by masked loads, that the result will also contain zeros in those |
848 | // bytes. |
849 | static bool canGenerateNonZeros(const MachineInstr &MI) { |
850 | |
851 | // Check for instructions which can write into a larger element size, |
852 | // possibly writing into a previous zero'd lane. |
853 | if (producesDoubleWidthResult(MI)) |
854 | return true; |
855 | |
856 | switch (MI.getOpcode()) { |
857 | default: |
858 | break; |
859 | // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow |
860 | // fp16 -> fp32 vector conversions. |
861 | // Instructions that perform a NOT will generate 1s from 0s. |
862 | case ARM::MVE_VMVN: |
863 | case ARM::MVE_VORN: |
864 | // Count leading zeros will do just that! |
865 | case ARM::MVE_VCLZs8: |
866 | case ARM::MVE_VCLZs16: |
867 | case ARM::MVE_VCLZs32: |
868 | return true; |
869 | } |
870 | return false; |
871 | } |
872 | |
873 | // Look at its register uses to see if it only can only receive zeros |
874 | // into its false lanes which would then produce zeros. Also check that |
875 | // the output register is also defined by an FalseLanesZero instruction |
876 | // so that if tail-predication happens, the lanes that aren't updated will |
877 | // still be zeros. |
878 | static bool producesFalseLanesZero(MachineInstr &MI, |
879 | const TargetRegisterClass *QPRs, |
880 | const ReachingDefAnalysis &RDA, |
881 | InstSet &FalseLanesZero) { |
882 | if (canGenerateNonZeros(MI)) |
883 | return false; |
884 | |
885 | bool isPredicated = isVectorPredicated(MI: &MI); |
886 | // Predicated loads will write zeros to the falsely predicated bytes of the |
887 | // destination register. |
888 | if (MI.mayLoad()) |
889 | return isPredicated; |
890 | |
891 | auto IsZeroInit = [](MachineInstr *Def) { |
892 | return !isVectorPredicated(Def) && |
893 | Def->getOpcode() == ARM::MVE_VMOVimmi32 && |
894 | Def->getOperand(1).getImm() == 0; |
895 | }; |
896 | |
897 | bool AllowScalars = isHorizontalReduction(MI); |
898 | for (auto &MO : MI.operands()) { |
899 | if (!MO.isReg() || !MO.getReg()) |
900 | continue; |
901 | if (!isRegInClass(MO, Class: QPRs) && AllowScalars) |
902 | continue; |
903 | // Skip the lr predicate reg |
904 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI); |
905 | if (PIdx != -1 && (int)MO.getOperandNo() == PIdx + 2) |
906 | continue; |
907 | |
908 | // Check that this instruction will produce zeros in its false lanes: |
909 | // - If it only consumes false lanes zero or constant 0 (vmov #0) |
910 | // - If it's predicated, it only matters that it's def register already has |
911 | // false lane zeros, so we can ignore the uses. |
912 | SmallPtrSet<MachineInstr *, 2> Defs; |
913 | RDA.getGlobalReachingDefs(MI: &MI, PhysReg: MO.getReg(), Defs); |
914 | if (Defs.empty()) |
915 | return false; |
916 | for (auto *Def : Defs) { |
917 | if (Def == &MI || FalseLanesZero.count(Ptr: Def) || IsZeroInit(Def)) |
918 | continue; |
919 | if (MO.isUse() && isPredicated) |
920 | continue; |
921 | return false; |
922 | } |
923 | } |
924 | LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); |
925 | return true; |
926 | } |
927 | |
928 | bool LowOverheadLoop::ValidateLiveOuts() { |
929 | // We want to find out if the tail-predicated version of this loop will |
930 | // produce the same values as the loop in its original form. For this to |
931 | // be true, the newly inserted implicit predication must not change the |
932 | // the (observable) results. |
933 | // We're doing this because many instructions in the loop will not be |
934 | // predicated and so the conversion from VPT predication to tail-predication |
935 | // can result in different values being produced; due to the tail-predication |
936 | // preventing many instructions from updating their falsely predicated |
937 | // lanes. This analysis assumes that all the instructions perform lane-wise |
938 | // operations and don't perform any exchanges. |
939 | // A masked load, whether through VPT or tail predication, will write zeros |
940 | // to any of the falsely predicated bytes. So, from the loads, we know that |
941 | // the false lanes are zeroed and here we're trying to track that those false |
942 | // lanes remain zero, or where they change, the differences are masked away |
943 | // by their user(s). |
944 | // All MVE stores have to be predicated, so we know that any predicate load |
945 | // operands, or stored results are equivalent already. Other explicitly |
946 | // predicated instructions will perform the same operation in the original |
947 | // loop and the tail-predicated form too. Because of this, we can insert |
948 | // loads, stores and other predicated instructions into our Predicated |
949 | // set and build from there. |
950 | const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::i: MQPRRegClassID); |
951 | SetVector<MachineInstr *> FalseLanesUnknown; |
952 | SmallPtrSet<MachineInstr *, 4> FalseLanesZero; |
953 | SmallPtrSet<MachineInstr *, 4> Predicated; |
954 | MachineBasicBlock * = ML.getHeader(); |
955 | |
956 | LLVM_DEBUG(dbgs() << "ARM Loops: Validating Live outs\n" ); |
957 | |
958 | for (auto &MI : *Header) { |
959 | if (!shouldInspect(MI)) |
960 | continue; |
961 | |
962 | if (isVCTP(MI: &MI) || isVPTOpcode(Opc: MI.getOpcode())) |
963 | continue; |
964 | |
965 | bool isPredicated = isVectorPredicated(MI: &MI); |
966 | bool retainsOrReduces = |
967 | retainsPreviousHalfElement(MI) || isHorizontalReduction(MI); |
968 | |
969 | if (isPredicated) |
970 | Predicated.insert(Ptr: &MI); |
971 | if (producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) |
972 | FalseLanesZero.insert(Ptr: &MI); |
973 | else if (MI.getNumDefs() == 0) |
974 | continue; |
975 | else if (!isPredicated && retainsOrReduces) { |
976 | LLVM_DEBUG(dbgs() << " Unpredicated instruction that retainsOrReduces: " << MI); |
977 | return false; |
978 | } else if (!isPredicated && MI.getOpcode() != ARM::MQPRCopy) |
979 | FalseLanesUnknown.insert(X: &MI); |
980 | } |
981 | |
982 | LLVM_DEBUG({ |
983 | dbgs() << " Predicated:\n" ; |
984 | for (auto *I : Predicated) |
985 | dbgs() << " " << *I; |
986 | dbgs() << " FalseLanesZero:\n" ; |
987 | for (auto *I : FalseLanesZero) |
988 | dbgs() << " " << *I; |
989 | dbgs() << " FalseLanesUnknown:\n" ; |
990 | for (auto *I : FalseLanesUnknown) |
991 | dbgs() << " " << *I; |
992 | }); |
993 | |
994 | auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, |
995 | SmallPtrSetImpl<MachineInstr *> &Predicated) { |
996 | SmallPtrSet<MachineInstr *, 2> Uses; |
997 | RDA.getGlobalUses(MI, PhysReg: MO.getReg().asMCReg(), Uses); |
998 | for (auto *Use : Uses) { |
999 | if (Use != MI && !Predicated.count(Ptr: Use)) |
1000 | return false; |
1001 | } |
1002 | return true; |
1003 | }; |
1004 | |
1005 | // Visit the unknowns in reverse so that we can start at the values being |
1006 | // stored and then we can work towards the leaves, hopefully adding more |
1007 | // instructions to Predicated. Successfully terminating the loop means that |
1008 | // all the unknown values have to found to be masked by predicated user(s). |
1009 | // For any unpredicated values, we store them in NonPredicated so that we |
1010 | // can later check whether these form a reduction. |
1011 | SmallPtrSet<MachineInstr*, 2> NonPredicated; |
1012 | for (auto *MI : reverse(C&: FalseLanesUnknown)) { |
1013 | for (auto &MO : MI->operands()) { |
1014 | if (!isRegInClass(MO, Class: QPRs) || !MO.isDef()) |
1015 | continue; |
1016 | if (!HasPredicatedUsers(MI, MO, Predicated)) { |
1017 | LLVM_DEBUG(dbgs() << " Found an unknown def of : " |
1018 | << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); |
1019 | NonPredicated.insert(Ptr: MI); |
1020 | break; |
1021 | } |
1022 | } |
1023 | // Any unknown false lanes have been masked away by the user(s). |
1024 | if (!NonPredicated.contains(Ptr: MI)) |
1025 | Predicated.insert(Ptr: MI); |
1026 | } |
1027 | |
1028 | SmallPtrSet<MachineInstr *, 2> LiveOutMIs; |
1029 | SmallVector<MachineBasicBlock *, 2> ExitBlocks; |
1030 | ML.getExitBlocks(ExitBlocks); |
1031 | assert(ML.getNumBlocks() == 1 && "Expected single block loop!" ); |
1032 | assert(ExitBlocks.size() == 1 && "Expected a single exit block" ); |
1033 | MachineBasicBlock *ExitBB = ExitBlocks.front(); |
1034 | for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) { |
1035 | // TODO: Instead of blocking predication, we could move the vctp to the exit |
1036 | // block and calculate it's operand there in or the preheader. |
1037 | if (RegMask.PhysReg == ARM::VPR) { |
1038 | LLVM_DEBUG(dbgs() << " VPR is live in to the exit block." ); |
1039 | return false; |
1040 | } |
1041 | // Check Q-regs that are live in the exit blocks. We don't collect scalars |
1042 | // because they won't be affected by lane predication. |
1043 | if (QPRs->contains(Reg: RegMask.PhysReg)) |
1044 | if (auto *MI = RDA.getLocalLiveOutMIDef(MBB: Header, PhysReg: RegMask.PhysReg)) |
1045 | LiveOutMIs.insert(Ptr: MI); |
1046 | } |
1047 | |
1048 | // We've already validated that any VPT predication within the loop will be |
1049 | // equivalent when we perform the predication transformation; so we know that |
1050 | // any VPT predicated instruction is predicated upon VCTP. Any live-out |
1051 | // instruction needs to be predicated, so check this here. The instructions |
1052 | // in NonPredicated have been found to be a reduction that we can ensure its |
1053 | // legality. Any MQPRCopy found will need to validate its input as if it was |
1054 | // live out. |
1055 | SmallVector<MachineInstr *> Worklist(LiveOutMIs.begin(), LiveOutMIs.end()); |
1056 | while (!Worklist.empty()) { |
1057 | MachineInstr *MI = Worklist.pop_back_val(); |
1058 | if (MI->getOpcode() == ARM::MQPRCopy) { |
1059 | VMOVCopies.insert(Ptr: MI); |
1060 | MachineInstr *CopySrc = |
1061 | RDA.getUniqueReachingMIDef(MI, PhysReg: MI->getOperand(i: 1).getReg()); |
1062 | if (CopySrc) |
1063 | Worklist.push_back(Elt: CopySrc); |
1064 | } else if (NonPredicated.count(Ptr: MI) && FalseLanesUnknown.contains(key: MI)) { |
1065 | LLVM_DEBUG(dbgs() << " Unable to handle live out: " << *MI); |
1066 | VMOVCopies.clear(); |
1067 | return false; |
1068 | } |
1069 | } |
1070 | |
1071 | return true; |
1072 | } |
1073 | |
1074 | void LowOverheadLoop::Validate(ARMBasicBlockUtils *BBUtils) { |
1075 | if (Revert) |
1076 | return; |
1077 | |
1078 | // Check branch target ranges: WLS[TP] can only branch forwards and LE[TP] |
1079 | // can only jump back. |
1080 | auto ValidateRanges = [](MachineInstr *Start, MachineInstr *End, |
1081 | ARMBasicBlockUtils *BBUtils, MachineLoop &ML) { |
1082 | MachineBasicBlock *TgtBB = End->getOpcode() == ARM::t2LoopEnd |
1083 | ? End->getOperand(i: 1).getMBB() |
1084 | : End->getOperand(i: 2).getMBB(); |
1085 | // TODO Maybe there's cases where the target doesn't have to be the header, |
1086 | // but for now be safe and revert. |
1087 | if (TgtBB != ML.getHeader()) { |
1088 | LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n" ); |
1089 | return false; |
1090 | } |
1091 | |
1092 | // The WLS and LE instructions have 12-bits for the label offset. WLS |
1093 | // requires a positive offset, while LE uses negative. |
1094 | if (BBUtils->getOffsetOf(MI: End) < BBUtils->getOffsetOf(MBB: ML.getHeader()) || |
1095 | !BBUtils->isBBInRange(MI: End, DestBB: ML.getHeader(), MaxDisp: 4094)) { |
1096 | LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n" ); |
1097 | return false; |
1098 | } |
1099 | |
1100 | if (isWhileLoopStart(MI: *Start)) { |
1101 | MachineBasicBlock *TargetBB = getWhileLoopStartTargetBB(MI: *Start); |
1102 | if (BBUtils->getOffsetOf(MI: Start) > BBUtils->getOffsetOf(MBB: TargetBB) || |
1103 | !BBUtils->isBBInRange(MI: Start, DestBB: TargetBB, MaxDisp: 4094)) { |
1104 | LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n" ); |
1105 | return false; |
1106 | } |
1107 | } |
1108 | return true; |
1109 | }; |
1110 | |
1111 | StartInsertPt = MachineBasicBlock::iterator(Start); |
1112 | StartInsertBB = Start->getParent(); |
1113 | LLVM_DEBUG(dbgs() << "ARM Loops: Will insert LoopStart at " |
1114 | << *StartInsertPt); |
1115 | |
1116 | Revert = !ValidateRanges(Start, End, BBUtils, ML); |
1117 | CannotTailPredicate = !ValidateTailPredicate(); |
1118 | } |
1119 | |
1120 | bool LowOverheadLoop::AddVCTP(MachineInstr *MI) { |
1121 | LLVM_DEBUG(dbgs() << "ARM Loops: Adding VCTP: " << *MI); |
1122 | if (VCTPs.empty()) { |
1123 | VCTPs.push_back(Elt: MI); |
1124 | return true; |
1125 | } |
1126 | |
1127 | // If we find another VCTP, check whether it uses the same value as the main VCTP. |
1128 | // If it does, store it in the VCTPs set, else refuse it. |
1129 | MachineInstr *Prev = VCTPs.back(); |
1130 | if (!Prev->getOperand(i: 1).isIdenticalTo(Other: MI->getOperand(i: 1)) || |
1131 | !RDA.hasSameReachingDef(A: Prev, B: MI, PhysReg: MI->getOperand(i: 1).getReg().asMCReg())) { |
1132 | LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching " |
1133 | "definition from the main VCTP" ); |
1134 | return false; |
1135 | } |
1136 | VCTPs.push_back(Elt: MI); |
1137 | return true; |
1138 | } |
1139 | |
1140 | static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) { |
1141 | |
1142 | auto GetFrameIndex = [](MachineMemOperand *Operand) { |
1143 | const PseudoSourceValue *PseudoValue = Operand->getPseudoValue(); |
1144 | if (PseudoValue && PseudoValue->kind() == PseudoSourceValue::FixedStack) { |
1145 | if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(Val: PseudoValue)) { |
1146 | return FS->getFrameIndex(); |
1147 | } |
1148 | } |
1149 | return -1; |
1150 | }; |
1151 | |
1152 | auto IsStackOp = [GetFrameIndex](MachineInstr *I) { |
1153 | switch (I->getOpcode()) { |
1154 | case ARM::MVE_VSTRWU32: |
1155 | case ARM::MVE_VLDRWU32: { |
1156 | return I->getOperand(1).getReg() == ARM::SP && |
1157 | I->memoperands().size() == 1 && |
1158 | GetFrameIndex(I->memoperands().front()) >= 0; |
1159 | } |
1160 | default: |
1161 | return false; |
1162 | } |
1163 | }; |
1164 | |
1165 | // An unpredicated vector register spill is allowed if all of the uses of the |
1166 | // stack slot are within the loop |
1167 | if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI)) |
1168 | return false; |
1169 | |
1170 | // Search all blocks after the loop for accesses to the same stack slot. |
1171 | // ReachingDefAnalysis doesn't work for sp as it relies on registers being |
1172 | // live-out (which sp never is) to know what blocks to look in |
1173 | if (MI->memoperands().size() == 0) |
1174 | return false; |
1175 | int FI = GetFrameIndex(MI->memoperands().front()); |
1176 | |
1177 | auto &FrameInfo = MI->getParent()->getParent()->getFrameInfo(); |
1178 | if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(ObjectIdx: FI)) |
1179 | return false; |
1180 | |
1181 | SmallVector<MachineBasicBlock *> Frontier; |
1182 | ML->getExitBlocks(ExitBlocks&: Frontier); |
1183 | SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()}; |
1184 | unsigned Idx = 0; |
1185 | while (Idx < Frontier.size()) { |
1186 | MachineBasicBlock *BB = Frontier[Idx]; |
1187 | bool LookAtSuccessors = true; |
1188 | for (auto &I : *BB) { |
1189 | if (!IsStackOp(&I) || I.memoperands().size() == 0) |
1190 | continue; |
1191 | if (GetFrameIndex(I.memoperands().front()) != FI) |
1192 | continue; |
1193 | // If this block has a store to the stack slot before any loads then we |
1194 | // can ignore the block |
1195 | if (I.getOpcode() == ARM::MVE_VSTRWU32) { |
1196 | LookAtSuccessors = false; |
1197 | break; |
1198 | } |
1199 | // If the store and the load are using the same stack slot then the |
1200 | // store isn't valid for tail predication |
1201 | if (I.getOpcode() == ARM::MVE_VLDRWU32) |
1202 | return false; |
1203 | } |
1204 | |
1205 | if (LookAtSuccessors) { |
1206 | for (auto *Succ : BB->successors()) { |
1207 | if (!Visited.contains(Ptr: Succ) && !is_contained(Range&: Frontier, Element: Succ)) |
1208 | Frontier.push_back(Elt: Succ); |
1209 | } |
1210 | } |
1211 | Visited.insert(Ptr: BB); |
1212 | Idx++; |
1213 | } |
1214 | |
1215 | return true; |
1216 | } |
1217 | |
1218 | bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) { |
1219 | if (CannotTailPredicate) |
1220 | return false; |
1221 | |
1222 | if (!shouldInspect(MI&: *MI)) |
1223 | return true; |
1224 | |
1225 | if (MI->getOpcode() == ARM::MVE_VPSEL || |
1226 | MI->getOpcode() == ARM::MVE_VPNOT) { |
1227 | // TODO: Allow VPSEL and VPNOT, we currently cannot because: |
1228 | // 1) It will use the VPR as a predicate operand, but doesn't have to be |
1229 | // instead a VPT block, which means we can assert while building up |
1230 | // the VPT block because we don't find another VPT or VPST to being a new |
1231 | // one. |
1232 | // 2) VPSEL still requires a VPR operand even after tail predicating, |
1233 | // which means we can't remove it unless there is another |
1234 | // instruction, such as vcmp, that can provide the VPR def. |
1235 | return false; |
1236 | } |
1237 | |
1238 | // Record all VCTPs and check that they're equivalent to one another. |
1239 | if (isVCTP(MI) && !AddVCTP(MI)) |
1240 | return false; |
1241 | |
1242 | // Inspect uses first so that any instructions that alter the VPR don't |
1243 | // alter the predicate upon themselves. |
1244 | const MCInstrDesc &MCID = MI->getDesc(); |
1245 | bool IsUse = false; |
1246 | unsigned LastOpIdx = MI->getNumOperands() - 1; |
1247 | for (const auto &Op : enumerate(First: reverse(C: MCID.operands()))) { |
1248 | const MachineOperand &MO = MI->getOperand(i: LastOpIdx - Op.index()); |
1249 | if (!MO.isReg() || !MO.isUse() || MO.getReg() != ARM::VPR) |
1250 | continue; |
1251 | |
1252 | if (ARM::isVpred(op: Op.value().OperandType)) { |
1253 | VPTState::addInst(MI); |
1254 | IsUse = true; |
1255 | } else if (MI->getOpcode() != ARM::MVE_VPST) { |
1256 | LLVM_DEBUG(dbgs() << "ARM Loops: Found instruction using vpr: " << *MI); |
1257 | return false; |
1258 | } |
1259 | } |
1260 | |
1261 | // If we find an instruction that has been marked as not valid for tail |
1262 | // predication, only allow the instruction if it's contained within a valid |
1263 | // VPT block. |
1264 | bool RequiresExplicitPredication = |
1265 | (MCID.TSFlags & ARMII::ValidForTailPredication) == 0; |
1266 | if (isDomainMVE(MI) && RequiresExplicitPredication) { |
1267 | if (MI->getOpcode() == ARM::MQPRCopy) |
1268 | return true; |
1269 | if (!IsUse && producesDoubleWidthResult(MI: *MI)) { |
1270 | DoubleWidthResultInstrs.insert(Ptr: MI); |
1271 | return true; |
1272 | } |
1273 | |
1274 | LLVM_DEBUG(if (!IsUse) dbgs() |
1275 | << "ARM Loops: Can't tail predicate: " << *MI); |
1276 | return IsUse; |
1277 | } |
1278 | |
1279 | // If the instruction is already explicitly predicated, then the conversion |
1280 | // will be fine, but ensure that all store operations are predicated. |
1281 | if (MI->mayStore() && !ValidateMVEStore(MI, ML: &ML)) |
1282 | return IsUse; |
1283 | |
1284 | // If this instruction defines the VPR, update the predicate for the |
1285 | // proceeding instructions. |
1286 | if (isVectorPredicate(MI)) { |
1287 | // Clear the existing predicate when we're not in VPT Active state, |
1288 | // otherwise we add to it. |
1289 | if (!isVectorPredicated(MI)) |
1290 | VPTState::resetPredicate(MI); |
1291 | else |
1292 | VPTState::addPredicate(MI); |
1293 | } |
1294 | |
1295 | // Finally once the predicate has been modified, we can start a new VPT |
1296 | // block if necessary. |
1297 | if (isVPTOpcode(Opc: MI->getOpcode())) |
1298 | VPTState::CreateVPTBlock(MI); |
1299 | |
1300 | return true; |
1301 | } |
1302 | |
1303 | bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { |
1304 | const ARMSubtarget &ST = mf.getSubtarget<ARMSubtarget>(); |
1305 | if (!ST.hasLOB()) |
1306 | return false; |
1307 | |
1308 | MF = &mf; |
1309 | LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n" ); |
1310 | |
1311 | MLI = &getAnalysis<MachineLoopInfo>(); |
1312 | RDA = &getAnalysis<ReachingDefAnalysis>(); |
1313 | MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); |
1314 | MRI = &MF->getRegInfo(); |
1315 | TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); |
1316 | TRI = ST.getRegisterInfo(); |
1317 | BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); |
1318 | BBUtils->computeAllBlockSizes(); |
1319 | BBUtils->adjustBBOffsetsAfter(MBB: &MF->front()); |
1320 | |
1321 | bool Changed = false; |
1322 | for (auto *ML : *MLI) { |
1323 | if (ML->isOutermost()) |
1324 | Changed |= ProcessLoop(ML); |
1325 | } |
1326 | Changed |= RevertNonLoops(); |
1327 | return Changed; |
1328 | } |
1329 | |
1330 | bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { |
1331 | |
1332 | bool Changed = false; |
1333 | |
1334 | // Process inner loops first. |
1335 | for (MachineLoop *L : *ML) |
1336 | Changed |= ProcessLoop(ML: L); |
1337 | |
1338 | LLVM_DEBUG({ |
1339 | dbgs() << "ARM Loops: Processing loop containing:\n" ; |
1340 | if (auto * = ML->getLoopPreheader()) |
1341 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1342 | else if (auto * = MLI->findLoopPreheader(ML, true, true)) |
1343 | dbgs() << " - Preheader: " << printMBBReference(*Preheader) << "\n" ; |
1344 | for (auto *MBB : ML->getBlocks()) |
1345 | dbgs() << " - Block: " << printMBBReference(*MBB) << "\n" ; |
1346 | }); |
1347 | |
1348 | // Search the given block for a loop start instruction. If one isn't found, |
1349 | // and there's only one predecessor block, search that one too. |
1350 | std::function<MachineInstr*(MachineBasicBlock*)> SearchForStart = |
1351 | [&SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { |
1352 | for (auto &MI : *MBB) { |
1353 | if (isLoopStart(MI)) |
1354 | return &MI; |
1355 | } |
1356 | if (MBB->pred_size() == 1) |
1357 | return SearchForStart(*MBB->pred_begin()); |
1358 | return nullptr; |
1359 | }; |
1360 | |
1361 | LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII); |
1362 | // Search the preheader for the start intrinsic. |
1363 | // FIXME: I don't see why we shouldn't be supporting multiple predecessors |
1364 | // with potentially multiple set.loop.iterations, so we need to enable this. |
1365 | if (LoLoop.Preheader) |
1366 | LoLoop.Start = SearchForStart(LoLoop.Preheader); |
1367 | else |
1368 | return Changed; |
1369 | |
1370 | // Find the low-overhead loop components and decide whether or not to fall |
1371 | // back to a normal loop. Also look for a vctp instructions and decide |
1372 | // whether we can convert that predicate using tail predication. |
1373 | for (auto *MBB : reverse(C: ML->getBlocks())) { |
1374 | for (auto &MI : *MBB) { |
1375 | if (MI.isDebugValue()) |
1376 | continue; |
1377 | else if (MI.getOpcode() == ARM::t2LoopDec) |
1378 | LoLoop.Dec = &MI; |
1379 | else if (MI.getOpcode() == ARM::t2LoopEnd) |
1380 | LoLoop.End = &MI; |
1381 | else if (MI.getOpcode() == ARM::t2LoopEndDec) |
1382 | LoLoop.End = LoLoop.Dec = &MI; |
1383 | else if (isLoopStart(MI)) |
1384 | LoLoop.Start = &MI; |
1385 | else if (MI.getDesc().isCall()) { |
1386 | // TODO: Though the call will require LE to execute again, does this |
1387 | // mean we should revert? Always executing LE hopefully should be |
1388 | // faster than performing a sub,cmp,br or even subs,br. |
1389 | LoLoop.Revert = true; |
1390 | LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n" ); |
1391 | } else { |
1392 | // Record VPR defs and build up their corresponding vpt blocks. |
1393 | // Check we know how to tail predicate any mve instructions. |
1394 | LoLoop.AnalyseMVEInst(MI: &MI); |
1395 | } |
1396 | } |
1397 | } |
1398 | |
1399 | LLVM_DEBUG(LoLoop.dump()); |
1400 | if (!LoLoop.FoundAllComponents()) { |
1401 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find loop start, update, end\n" ); |
1402 | return Changed; |
1403 | } |
1404 | |
1405 | assert(LoLoop.Start->getOpcode() != ARM::t2WhileLoopStart && |
1406 | "Expected t2WhileLoopStart to be removed before regalloc!" ); |
1407 | |
1408 | // Check that the only instruction using LoopDec is LoopEnd. This can only |
1409 | // happen when the Dec and End are separate, not a single t2LoopEndDec. |
1410 | // TODO: Check for copy chains that really have no effect. |
1411 | if (LoLoop.Dec != LoLoop.End) { |
1412 | SmallPtrSet<MachineInstr *, 2> Uses; |
1413 | RDA->getReachingLocalUses(LoLoop.Dec, MCRegister::from(ARM::LR), Uses); |
1414 | if (Uses.size() > 1 || !Uses.count(Ptr: LoLoop.End)) { |
1415 | LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n" ); |
1416 | LoLoop.Revert = true; |
1417 | } |
1418 | } |
1419 | LoLoop.Validate(BBUtils: BBUtils.get()); |
1420 | Expand(LoLoop); |
1421 | return true; |
1422 | } |
1423 | |
1424 | // WhileLoopStart holds the exit block, so produce a cmp lr, 0 and then a |
1425 | // beq that branches to the exit branch. |
1426 | // TODO: We could also try to generate a cbz if the value in LR is also in |
1427 | // another low register. |
1428 | void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { |
1429 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp: " << *MI); |
1430 | MachineBasicBlock *DestBB = getWhileLoopStartTargetBB(MI: *MI); |
1431 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? |
1432 | ARM::tBcc : ARM::t2Bcc; |
1433 | |
1434 | RevertWhileLoopStartLR(MI, TII, BrOpc); |
1435 | } |
1436 | |
1437 | void ARMLowOverheadLoops::RevertDo(MachineInstr *MI) const { |
1438 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to mov: " << *MI); |
1439 | RevertDoLoopStart(MI, TII); |
1440 | } |
1441 | |
1442 | bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const { |
1443 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); |
1444 | MachineBasicBlock *MBB = MI->getParent(); |
1445 | SmallPtrSet<MachineInstr*, 1> Ignore; |
1446 | for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) { |
1447 | if (I->getOpcode() == ARM::t2LoopEnd) { |
1448 | Ignore.insert(Ptr: &*I); |
1449 | break; |
1450 | } |
1451 | } |
1452 | |
1453 | // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. |
1454 | bool SetFlags = |
1455 | RDA->isSafeToDefRegAt(MI, MCRegister::from(ARM::CPSR), Ignore); |
1456 | |
1457 | llvm::RevertLoopDec(MI, TII, SetFlags); |
1458 | return SetFlags; |
1459 | } |
1460 | |
1461 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1462 | void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const { |
1463 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to cmp, br: " << *MI); |
1464 | |
1465 | MachineBasicBlock *DestBB = MI->getOperand(i: 1).getMBB(); |
1466 | unsigned BrOpc = BBUtils->isBBInRange(MI, DestBB, 254) ? |
1467 | ARM::tBcc : ARM::t2Bcc; |
1468 | |
1469 | llvm::RevertLoopEnd(MI, TII, BrOpc, SkipCmp); |
1470 | } |
1471 | |
1472 | // Generate a subs, or sub and cmp, and a branch instead of an LE. |
1473 | void ARMLowOverheadLoops::RevertLoopEndDec(MachineInstr *MI) const { |
1474 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to subs, br: " << *MI); |
1475 | assert(MI->getOpcode() == ARM::t2LoopEndDec && "Expected a t2LoopEndDec!" ); |
1476 | MachineBasicBlock *MBB = MI->getParent(); |
1477 | |
1478 | MachineInstrBuilder MIB = |
1479 | BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); |
1480 | MIB.addDef(ARM::LR); |
1481 | MIB.add(MO: MI->getOperand(i: 1)); |
1482 | MIB.addImm(Val: 1); |
1483 | MIB.addImm(Val: ARMCC::AL); |
1484 | MIB.addReg(ARM::NoRegister); |
1485 | MIB.addReg(ARM::CPSR); |
1486 | MIB->getOperand(i: 5).setIsDef(true); |
1487 | |
1488 | MachineBasicBlock *DestBB = MI->getOperand(i: 2).getMBB(); |
1489 | unsigned BrOpc = |
1490 | BBUtils->isBBInRange(MI, DestBB, 254) ? ARM::tBcc : ARM::t2Bcc; |
1491 | |
1492 | // Create bne |
1493 | MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(BrOpc)); |
1494 | MIB.add(MO: MI->getOperand(i: 2)); // branch target |
1495 | MIB.addImm(Val: ARMCC::NE); // condition code |
1496 | MIB.addReg(ARM::CPSR); |
1497 | |
1498 | MI->eraseFromParent(); |
1499 | } |
1500 | |
1501 | // Perform dead code elimation on the loop iteration count setup expression. |
1502 | // If we are tail-predicating, the number of elements to be processed is the |
1503 | // operand of the VCTP instruction in the vector body, see getCount(), which is |
1504 | // register $r3 in this example: |
1505 | // |
1506 | // $lr = big-itercount-expression |
1507 | // .. |
1508 | // $lr = t2DoLoopStart renamable $lr |
1509 | // vector.body: |
1510 | // .. |
1511 | // $vpr = MVE_VCTP32 renamable $r3 |
1512 | // renamable $lr = t2LoopDec killed renamable $lr, 1 |
1513 | // t2LoopEnd renamable $lr, %vector.body |
1514 | // tB %end |
1515 | // |
1516 | // What we would like achieve here is to replace the do-loop start pseudo |
1517 | // instruction t2DoLoopStart with: |
1518 | // |
1519 | // $lr = MVE_DLSTP_32 killed renamable $r3 |
1520 | // |
1521 | // Thus, $r3 which defines the number of elements, is written to $lr, |
1522 | // and then we want to delete the whole chain that used to define $lr, |
1523 | // see the comment below how this chain could look like. |
1524 | // |
1525 | void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { |
1526 | if (!LoLoop.IsTailPredicationLegal()) |
1527 | return; |
1528 | |
1529 | LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n" ); |
1530 | |
1531 | MachineInstr *Def = RDA->getMIOperand(MI: LoLoop.Start, Idx: 1); |
1532 | if (!Def) { |
1533 | LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n" ); |
1534 | return; |
1535 | } |
1536 | |
1537 | // Collect and remove the users of iteration count. |
1538 | SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec, |
1539 | LoLoop.End }; |
1540 | if (!TryRemove(MI: Def, RDA&: *RDA, ToRemove&: LoLoop.ToRemove, Ignore&: Killed)) |
1541 | LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n" ); |
1542 | } |
1543 | |
1544 | MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { |
1545 | LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n" ); |
1546 | // When using tail-predication, try to delete the dead code that was used to |
1547 | // calculate the number of loop iterations. |
1548 | IterationCountDCE(LoLoop); |
1549 | |
1550 | MachineBasicBlock::iterator InsertPt = LoLoop.StartInsertPt; |
1551 | MachineInstr *Start = LoLoop.Start; |
1552 | MachineBasicBlock *MBB = LoLoop.StartInsertBB; |
1553 | unsigned Opc = LoLoop.getStartOpcode(); |
1554 | MachineOperand &Count = LoLoop.getLoopStartOperand(); |
1555 | |
1556 | // A DLS lr, lr we needn't emit |
1557 | MachineInstr* NewStart; |
1558 | if (!DisableOmitDLS && Opc == ARM::t2DLS && Count.isReg() && |
1559 | Count.getReg() == ARM::LR) { |
1560 | LLVM_DEBUG(dbgs() << "ARM Loops: Didn't insert start: DLS lr, lr" ); |
1561 | NewStart = nullptr; |
1562 | } else { |
1563 | MachineInstrBuilder MIB = |
1564 | BuildMI(*MBB, InsertPt, Start->getDebugLoc(), TII->get(Opc)); |
1565 | |
1566 | MIB.addDef(ARM::LR); |
1567 | MIB.add(MO: Count); |
1568 | if (isWhileLoopStart(MI: *Start)) |
1569 | MIB.addMBB(MBB: getWhileLoopStartTargetBB(MI: *Start)); |
1570 | |
1571 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); |
1572 | NewStart = &*MIB; |
1573 | } |
1574 | |
1575 | LoLoop.ToRemove.insert(Ptr: Start); |
1576 | return NewStart; |
1577 | } |
1578 | |
1579 | void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { |
1580 | auto RemovePredicate = [](MachineInstr *MI) { |
1581 | if (MI->isDebugInstr()) |
1582 | return; |
1583 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing predicate from: " << *MI); |
1584 | int PIdx = llvm::findFirstVPTPredOperandIdx(MI: *MI); |
1585 | assert(PIdx >= 1 && "Trying to unpredicate a non-predicated instruction" ); |
1586 | assert(MI->getOperand(PIdx).getImm() == ARMVCC::Then && |
1587 | "Expected Then predicate!" ); |
1588 | MI->getOperand(i: PIdx).setImm(ARMVCC::None); |
1589 | MI->getOperand(i: PIdx + 1).setReg(0); |
1590 | }; |
1591 | |
1592 | for (auto &Block : LoLoop.getVPTBlocks()) { |
1593 | SmallVectorImpl<MachineInstr *> &Insts = Block.getInsts(); |
1594 | |
1595 | auto ReplaceVCMPWithVPT = [&](MachineInstr *&TheVCMP, MachineInstr *At) { |
1596 | assert(TheVCMP && "Replacing a removed or non-existent VCMP" ); |
1597 | // Replace the VCMP with a VPT |
1598 | MachineInstrBuilder MIB = |
1599 | BuildMI(*At->getParent(), At, At->getDebugLoc(), |
1600 | TII->get(VCMPOpcodeToVPT(Opcode: TheVCMP->getOpcode()))); |
1601 | MIB.addImm(Val: ARMVCC::Then); |
1602 | // Register one |
1603 | MIB.add(MO: TheVCMP->getOperand(i: 1)); |
1604 | // Register two |
1605 | MIB.add(MO: TheVCMP->getOperand(i: 2)); |
1606 | // The comparison code, e.g. ge, eq, lt |
1607 | MIB.add(MO: TheVCMP->getOperand(i: 3)); |
1608 | LLVM_DEBUG(dbgs() << "ARM Loops: Combining with VCMP to VPT: " << *MIB); |
1609 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1610 | LoLoop.ToRemove.insert(Ptr: TheVCMP); |
1611 | TheVCMP = nullptr; |
1612 | }; |
1613 | |
1614 | if (VPTState::isEntryPredicatedOnVCTP(Block, /*exclusive*/ Exclusive: true)) { |
1615 | MachineInstr *VPST = Insts.front(); |
1616 | if (VPTState::hasUniformPredicate(Block)) { |
1617 | // A vpt block starting with VPST, is only predicated upon vctp and has no |
1618 | // internal vpr defs: |
1619 | // - Remove vpst. |
1620 | // - Unpredicate the remaining instructions. |
1621 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1622 | for (unsigned i = 1; i < Insts.size(); ++i) |
1623 | RemovePredicate(Insts[i]); |
1624 | } else { |
1625 | // The VPT block has a non-uniform predicate but it uses a vpst and its |
1626 | // entry is guarded only by a vctp, which means we: |
1627 | // - Need to remove the original vpst. |
1628 | // - Then need to unpredicate any following instructions, until |
1629 | // we come across the divergent vpr def. |
1630 | // - Insert a new vpst to predicate the instruction(s) that following |
1631 | // the divergent vpr def. |
1632 | MachineInstr *Divergent = VPTState::getDivergent(Block); |
1633 | MachineBasicBlock *MBB = Divergent->getParent(); |
1634 | auto DivergentNext = ++MachineBasicBlock::iterator(Divergent); |
1635 | while (DivergentNext != MBB->end() && DivergentNext->isDebugInstr()) |
1636 | ++DivergentNext; |
1637 | |
1638 | bool DivergentNextIsPredicated = |
1639 | DivergentNext != MBB->end() && |
1640 | getVPTInstrPredicate(MI: *DivergentNext) != ARMVCC::None; |
1641 | |
1642 | for (auto I = ++MachineBasicBlock::iterator(VPST), E = DivergentNext; |
1643 | I != E; ++I) |
1644 | RemovePredicate(&*I); |
1645 | |
1646 | // Check if the instruction defining vpr is a vcmp so it can be combined |
1647 | // with the VPST This should be the divergent instruction |
1648 | MachineInstr *VCMP = |
1649 | VCMPOpcodeToVPT(Opcode: Divergent->getOpcode()) != 0 ? Divergent : nullptr; |
1650 | |
1651 | if (DivergentNextIsPredicated) { |
1652 | // Insert a VPST at the divergent only if the next instruction |
1653 | // would actually use it. A VCMP following a VPST can be |
1654 | // merged into a VPT so do that instead if the VCMP exists. |
1655 | if (!VCMP) { |
1656 | // Create a VPST (with a null mask for now, we'll recompute it |
1657 | // later) |
1658 | MachineInstrBuilder MIB = |
1659 | BuildMI(*Divergent->getParent(), Divergent, |
1660 | Divergent->getDebugLoc(), TII->get(ARM::MVE_VPST)); |
1661 | MIB.addImm(Val: 0); |
1662 | LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); |
1663 | LoLoop.BlockMasksToRecompute.insert(Ptr: MIB.getInstr()); |
1664 | } else { |
1665 | // No RDA checks are necessary here since the VPST would have been |
1666 | // directly after the VCMP |
1667 | ReplaceVCMPWithVPT(VCMP, VCMP); |
1668 | } |
1669 | } |
1670 | } |
1671 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1672 | LoLoop.ToRemove.insert(Ptr: VPST); |
1673 | } else if (Block.containsVCTP()) { |
1674 | // The vctp will be removed, so either the entire block will be dead or |
1675 | // the block mask of the vp(s)t will need to be recomputed. |
1676 | MachineInstr *VPST = Insts.front(); |
1677 | if (Block.size() == 2) { |
1678 | assert(VPST->getOpcode() == ARM::MVE_VPST && |
1679 | "Found a VPST in an otherwise empty vpt block" ); |
1680 | LoLoop.ToRemove.insert(Ptr: VPST); |
1681 | } else |
1682 | LoLoop.BlockMasksToRecompute.insert(Ptr: VPST); |
1683 | } else if (Insts.front()->getOpcode() == ARM::MVE_VPST) { |
1684 | // If this block starts with a VPST then attempt to merge it with the |
1685 | // preceeding un-merged VCMP into a VPT. This VCMP comes from a VPT |
1686 | // block that no longer exists |
1687 | MachineInstr *VPST = Insts.front(); |
1688 | auto Next = ++MachineBasicBlock::iterator(VPST); |
1689 | assert(getVPTInstrPredicate(*Next) != ARMVCC::None && |
1690 | "The instruction after a VPST must be predicated" ); |
1691 | (void)Next; |
1692 | MachineInstr *VprDef = RDA->getUniqueReachingMIDef(VPST, ARM::VPR); |
1693 | if (VprDef && VCMPOpcodeToVPT(Opcode: VprDef->getOpcode()) && |
1694 | !LoLoop.ToRemove.contains(Ptr: VprDef)) { |
1695 | MachineInstr *VCMP = VprDef; |
1696 | // The VCMP and VPST can only be merged if the VCMP's operands will have |
1697 | // the same values at the VPST. |
1698 | // If any of the instructions between the VCMP and VPST are predicated |
1699 | // then a different code path is expected to have merged the VCMP and |
1700 | // VPST already. |
1701 | if (std::none_of(first: ++MachineBasicBlock::iterator(VCMP), |
1702 | last: MachineBasicBlock::iterator(VPST), pred: hasVPRUse) && |
1703 | RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: 1).getReg()) && |
1704 | RDA->hasSameReachingDef(A: VCMP, B: VPST, PhysReg: VCMP->getOperand(i: 2).getReg())) { |
1705 | ReplaceVCMPWithVPT(VCMP, VPST); |
1706 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *VPST); |
1707 | LoLoop.ToRemove.insert(Ptr: VPST); |
1708 | } |
1709 | } |
1710 | } |
1711 | } |
1712 | |
1713 | LoLoop.ToRemove.insert(I: LoLoop.VCTPs.begin(), E: LoLoop.VCTPs.end()); |
1714 | } |
1715 | |
1716 | void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { |
1717 | |
1718 | // Combine the LoopDec and LoopEnd instructions into LE(TP). |
1719 | auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { |
1720 | MachineInstr *End = LoLoop.End; |
1721 | MachineBasicBlock *MBB = End->getParent(); |
1722 | unsigned Opc = LoLoop.IsTailPredicationLegal() ? |
1723 | ARM::MVE_LETP : ARM::t2LEUpdate; |
1724 | MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), |
1725 | TII->get(Opc)); |
1726 | MIB.addDef(ARM::LR); |
1727 | unsigned Off = LoLoop.Dec == LoLoop.End ? 1 : 0; |
1728 | MIB.add(MO: End->getOperand(i: Off + 0)); |
1729 | MIB.add(MO: End->getOperand(i: Off + 1)); |
1730 | LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); |
1731 | LoLoop.ToRemove.insert(Ptr: LoLoop.Dec); |
1732 | LoLoop.ToRemove.insert(Ptr: End); |
1733 | return &*MIB; |
1734 | }; |
1735 | |
1736 | // TODO: We should be able to automatically remove these branches before we |
1737 | // get here - probably by teaching analyzeBranch about the pseudo |
1738 | // instructions. |
1739 | // If there is an unconditional branch, after I, that just branches to the |
1740 | // next block, remove it. |
1741 | auto RemoveDeadBranch = [](MachineInstr *I) { |
1742 | MachineBasicBlock *BB = I->getParent(); |
1743 | MachineInstr *Terminator = &BB->instr_back(); |
1744 | if (Terminator->isUnconditionalBranch() && I != Terminator) { |
1745 | MachineBasicBlock *Succ = Terminator->getOperand(i: 0).getMBB(); |
1746 | if (BB->isLayoutSuccessor(MBB: Succ)) { |
1747 | LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); |
1748 | Terminator->eraseFromParent(); |
1749 | } |
1750 | } |
1751 | }; |
1752 | |
1753 | // And VMOVCopies need to become 2xVMOVD for tail predication to be valid. |
1754 | // Anything other MQPRCopy can be converted to MVE_VORR later on. |
1755 | auto ExpandVMOVCopies = [this](SmallPtrSet<MachineInstr *, 4> &VMOVCopies) { |
1756 | for (auto *MI : VMOVCopies) { |
1757 | LLVM_DEBUG(dbgs() << "Converting copy to VMOVD: " << *MI); |
1758 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1759 | MachineBasicBlock *MBB = MI->getParent(); |
1760 | Register Dst = MI->getOperand(i: 0).getReg(); |
1761 | Register Src = MI->getOperand(i: 1).getReg(); |
1762 | auto MIB1 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD), |
1763 | ARM::D0 + (Dst - ARM::Q0) * 2) |
1764 | .addReg(ARM::D0 + (Src - ARM::Q0) * 2) |
1765 | .add(predOps(ARMCC::AL)); |
1766 | (void)MIB1; |
1767 | LLVM_DEBUG(dbgs() << " into " << *MIB1); |
1768 | auto MIB2 = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::VMOVD), |
1769 | ARM::D0 + (Dst - ARM::Q0) * 2 + 1) |
1770 | .addReg(ARM::D0 + (Src - ARM::Q0) * 2 + 1) |
1771 | .add(predOps(ARMCC::AL)); |
1772 | LLVM_DEBUG(dbgs() << " and " << *MIB2); |
1773 | (void)MIB2; |
1774 | MI->eraseFromParent(); |
1775 | } |
1776 | }; |
1777 | |
1778 | if (LoLoop.Revert) { |
1779 | if (isWhileLoopStart(MI: *LoLoop.Start)) |
1780 | RevertWhile(MI: LoLoop.Start); |
1781 | else |
1782 | RevertDo(MI: LoLoop.Start); |
1783 | if (LoLoop.Dec == LoLoop.End) |
1784 | RevertLoopEndDec(MI: LoLoop.End); |
1785 | else |
1786 | RevertLoopEnd(MI: LoLoop.End, SkipCmp: RevertLoopDec(MI: LoLoop.Dec)); |
1787 | } else { |
1788 | ExpandVMOVCopies(LoLoop.VMOVCopies); |
1789 | LoLoop.Start = ExpandLoopStart(LoLoop); |
1790 | if (LoLoop.Start) |
1791 | RemoveDeadBranch(LoLoop.Start); |
1792 | LoLoop.End = ExpandLoopEnd(LoLoop); |
1793 | RemoveDeadBranch(LoLoop.End); |
1794 | if (LoLoop.IsTailPredicationLegal()) |
1795 | ConvertVPTBlocks(LoLoop); |
1796 | for (auto *I : LoLoop.ToRemove) { |
1797 | LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I); |
1798 | I->eraseFromParent(); |
1799 | } |
1800 | for (auto *I : LoLoop.BlockMasksToRecompute) { |
1801 | LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I); |
1802 | recomputeVPTBlockMask(Instr&: *I); |
1803 | LLVM_DEBUG(dbgs() << " ... done: " << *I); |
1804 | } |
1805 | } |
1806 | |
1807 | PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); |
1808 | DFS.ProcessLoop(); |
1809 | const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder(); |
1810 | fullyRecomputeLiveIns(MBBs: PostOrder); |
1811 | |
1812 | for (auto *MBB : reverse(C: PostOrder)) |
1813 | recomputeLivenessFlags(MBB&: *MBB); |
1814 | |
1815 | // We've moved, removed and inserted new instructions, so update RDA. |
1816 | RDA->reset(); |
1817 | } |
1818 | |
1819 | bool ARMLowOverheadLoops::RevertNonLoops() { |
1820 | LLVM_DEBUG(dbgs() << "ARM Loops: Reverting any remaining pseudos...\n" ); |
1821 | bool Changed = false; |
1822 | |
1823 | for (auto &MBB : *MF) { |
1824 | SmallVector<MachineInstr*, 4> Starts; |
1825 | SmallVector<MachineInstr*, 4> Decs; |
1826 | SmallVector<MachineInstr*, 4> Ends; |
1827 | SmallVector<MachineInstr *, 4> EndDecs; |
1828 | SmallVector<MachineInstr *, 4> MQPRCopies; |
1829 | |
1830 | for (auto &I : MBB) { |
1831 | if (isLoopStart(MI: I)) |
1832 | Starts.push_back(Elt: &I); |
1833 | else if (I.getOpcode() == ARM::t2LoopDec) |
1834 | Decs.push_back(Elt: &I); |
1835 | else if (I.getOpcode() == ARM::t2LoopEnd) |
1836 | Ends.push_back(Elt: &I); |
1837 | else if (I.getOpcode() == ARM::t2LoopEndDec) |
1838 | EndDecs.push_back(Elt: &I); |
1839 | else if (I.getOpcode() == ARM::MQPRCopy) |
1840 | MQPRCopies.push_back(Elt: &I); |
1841 | } |
1842 | |
1843 | if (Starts.empty() && Decs.empty() && Ends.empty() && EndDecs.empty() && |
1844 | MQPRCopies.empty()) |
1845 | continue; |
1846 | |
1847 | Changed = true; |
1848 | |
1849 | for (auto *Start : Starts) { |
1850 | if (isWhileLoopStart(MI: *Start)) |
1851 | RevertWhile(MI: Start); |
1852 | else |
1853 | RevertDo(MI: Start); |
1854 | } |
1855 | for (auto *Dec : Decs) |
1856 | RevertLoopDec(MI: Dec); |
1857 | |
1858 | for (auto *End : Ends) |
1859 | RevertLoopEnd(MI: End); |
1860 | for (auto *End : EndDecs) |
1861 | RevertLoopEndDec(MI: End); |
1862 | for (auto *MI : MQPRCopies) { |
1863 | LLVM_DEBUG(dbgs() << "Converting copy to VORR: " << *MI); |
1864 | assert(MI->getOpcode() == ARM::MQPRCopy && "Only expected MQPRCOPY!" ); |
1865 | MachineBasicBlock *MBB = MI->getParent(); |
1866 | auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::MVE_VORR), |
1867 | MI->getOperand(0).getReg()) |
1868 | .add(MI->getOperand(1)) |
1869 | .add(MI->getOperand(1)); |
1870 | addUnpredicatedMveVpredROp(MIB, MI->getOperand(i: 0).getReg()); |
1871 | MI->eraseFromParent(); |
1872 | } |
1873 | } |
1874 | return Changed; |
1875 | } |
1876 | |
1877 | FunctionPass *llvm::createARMLowOverheadLoopsPass() { |
1878 | return new ARMLowOverheadLoops(); |
1879 | } |
1880 | |