1//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of the TargetInstrInfo class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64InstrInfo.h"
14#include "AArch64ExpandImm.h"
15#include "AArch64FrameLowering.h"
16#include "AArch64MachineFunctionInfo.h"
17#include "AArch64PointerAuth.h"
18#include "AArch64Subtarget.h"
19#include "MCTargetDesc/AArch64AddressingModes.h"
20#include "Utils/AArch64BaseInfo.h"
21#include "llvm/ADT/ArrayRef.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/CodeGen/LivePhysRegs.h"
25#include "llvm/CodeGen/MachineBasicBlock.h"
26#include "llvm/CodeGen/MachineCombinerPattern.h"
27#include "llvm/CodeGen/MachineFrameInfo.h"
28#include "llvm/CodeGen/MachineFunction.h"
29#include "llvm/CodeGen/MachineInstr.h"
30#include "llvm/CodeGen/MachineInstrBuilder.h"
31#include "llvm/CodeGen/MachineMemOperand.h"
32#include "llvm/CodeGen/MachineModuleInfo.h"
33#include "llvm/CodeGen/MachineOperand.h"
34#include "llvm/CodeGen/MachineRegisterInfo.h"
35#include "llvm/CodeGen/RegisterScavenging.h"
36#include "llvm/CodeGen/StackMaps.h"
37#include "llvm/CodeGen/TargetRegisterInfo.h"
38#include "llvm/CodeGen/TargetSubtargetInfo.h"
39#include "llvm/IR/DebugInfoMetadata.h"
40#include "llvm/IR/DebugLoc.h"
41#include "llvm/IR/GlobalValue.h"
42#include "llvm/MC/MCAsmInfo.h"
43#include "llvm/MC/MCInst.h"
44#include "llvm/MC/MCInstBuilder.h"
45#include "llvm/MC/MCInstrDesc.h"
46#include "llvm/Support/Casting.h"
47#include "llvm/Support/CodeGen.h"
48#include "llvm/Support/CommandLine.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/LEB128.h"
51#include "llvm/Support/MathExtras.h"
52#include "llvm/Target/TargetMachine.h"
53#include "llvm/Target/TargetOptions.h"
54#include <cassert>
55#include <cstdint>
56#include <iterator>
57#include <utility>
58
59using namespace llvm;
60
61#define GET_INSTRINFO_CTOR_DTOR
62#include "AArch64GenInstrInfo.inc"
63
static cl::opt<unsigned> TBZDisplacementBits(
    "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
    cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned> CBZDisplacementBits(
    "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
    cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));

static cl::opt<unsigned>
    BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                        cl::desc("Restrict range of Bcc instructions (DEBUG)"));

static cl::opt<unsigned>
    BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                      cl::desc("Restrict range of B instructions (DEBUG)"));
79
80AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
81 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
82 AArch64::CATCHRET),
83 RI(STI.getTargetTriple()), Subtarget(STI) {}
84
/// GetInstSize - Return the number of bytes of code the specified
/// instruction may occupy. This is an upper bound.
87unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
88 const MachineBasicBlock &MBB = *MI.getParent();
89 const MachineFunction *MF = MBB.getParent();
90 const Function &F = MF->getFunction();
91 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
92
93 {
94 auto Op = MI.getOpcode();
95 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR)
96 return getInlineAsmLength(MI.getOperand(i: 0).getSymbolName(), *MAI);
97 }
98
99 // Meta-instructions emit no code.
100 if (MI.isMetaInstruction())
101 return 0;
102
103 // FIXME: We currently only handle pseudoinstructions that don't get expanded
104 // before the assembly printer.
105 unsigned NumBytes = 0;
106 const MCInstrDesc &Desc = MI.getDesc();
107
  // The size should preferably be set in
  // llvm/lib/Target/AArch64/AArch64InstrInfo.td (the default case).
  // The specific cases below handle instructions of variable size.
111 switch (Desc.getOpcode()) {
112 default:
113 if (Desc.getSize())
114 return Desc.getSize();
115
116 // Anything not explicitly designated otherwise (i.e. pseudo-instructions
117 // with fixed constant size but not specified in .td file) is a normal
118 // 4-byte insn.
119 NumBytes = 4;
120 break;
121 case TargetOpcode::STACKMAP:
122 // The upper bound for a stackmap intrinsic is the full length of its shadow
123 NumBytes = StackMapOpers(&MI).getNumPatchBytes();
124 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
125 break;
126 case TargetOpcode::PATCHPOINT:
127 // The size of the patchpoint intrinsic is the number of bytes requested
128 NumBytes = PatchPointOpers(&MI).getNumPatchBytes();
129 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
130 break;
131 case TargetOpcode::STATEPOINT:
132 NumBytes = StatepointOpers(&MI).getNumPatchBytes();
133 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
134 // No patch bytes means a normal call inst is emitted
135 if (NumBytes == 0)
136 NumBytes = 4;
137 break;
138 case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
139 // If `patchable-function-entry` is set, PATCHABLE_FUNCTION_ENTER
140 // instructions are expanded to the specified number of NOPs. Otherwise,
141 // they are expanded to 36-byte XRay sleds.
142 NumBytes =
143 F.getFnAttributeAsParsedInteger(Kind: "patchable-function-entry", Default: 9) * 4;
144 break;
145 case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
146 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
147 // An XRay sled can be 4 bytes of alignment plus a 32-byte block.
148 NumBytes = 36;
149 break;
150 case TargetOpcode::PATCHABLE_EVENT_CALL:
151 // EVENT_CALL XRay sleds are exactly 6 instructions long (no alignment).
152 NumBytes = 24;
153 break;
154
155 case AArch64::SPACE:
156 NumBytes = MI.getOperand(i: 1).getImm();
157 break;
158 case TargetOpcode::BUNDLE:
159 NumBytes = getInstBundleLength(MI);
160 break;
161 }
162
163 return NumBytes;
164}
165
166unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
167 unsigned Size = 0;
168 MachineBasicBlock::const_instr_iterator I = MI.getIterator();
169 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
170 while (++I != E && I->isInsideBundle()) {
171 assert(!I->isBundle() && "No nested bundle!");
172 Size += getInstSizeInBytes(MI: *I);
173 }
174 return Size;
175}
176
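// parseCondBranch() below encodes the parsed condition into Cond. For
// reference, a sketch of the layouts it produces:
//   Bcc:        Cond = { <cc> }
//   CBZ/CBNZ:   Cond = { -1, <opcode>, <reg> }
//   TBZ/TBNZ:   Cond = { -1, <opcode>, <reg>, <bit #> }
// instantiateCondBranch() and insertSelect() rely on these layouts.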
177static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
178 SmallVectorImpl<MachineOperand> &Cond) {
179 // Block ends with fall-through condbranch.
180 switch (LastInst->getOpcode()) {
181 default:
182 llvm_unreachable("Unknown branch instruction?");
183 case AArch64::Bcc:
184 Target = LastInst->getOperand(i: 1).getMBB();
185 Cond.push_back(Elt: LastInst->getOperand(i: 0));
186 break;
187 case AArch64::CBZW:
188 case AArch64::CBZX:
189 case AArch64::CBNZW:
190 case AArch64::CBNZX:
191 Target = LastInst->getOperand(i: 1).getMBB();
192 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
193 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
194 Cond.push_back(Elt: LastInst->getOperand(i: 0));
195 break;
196 case AArch64::TBZW:
197 case AArch64::TBZX:
198 case AArch64::TBNZW:
199 case AArch64::TBNZX:
200 Target = LastInst->getOperand(i: 2).getMBB();
201 Cond.push_back(Elt: MachineOperand::CreateImm(Val: -1));
202 Cond.push_back(Elt: MachineOperand::CreateImm(Val: LastInst->getOpcode()));
203 Cond.push_back(Elt: LastInst->getOperand(i: 0));
204 Cond.push_back(Elt: LastInst->getOperand(i: 1));
205 }
206}
207
208static unsigned getBranchDisplacementBits(unsigned Opc) {
209 switch (Opc) {
210 default:
211 llvm_unreachable("unexpected opcode!");
212 case AArch64::B:
213 return BDisplacementBits;
214 case AArch64::TBNZW:
215 case AArch64::TBZW:
216 case AArch64::TBNZX:
217 case AArch64::TBZX:
218 return TBZDisplacementBits;
219 case AArch64::CBNZW:
220 case AArch64::CBZW:
221 case AArch64::CBNZX:
222 case AArch64::CBZX:
223 return CBZDisplacementBits;
224 case AArch64::Bcc:
225 return BCCDisplacementBits;
226 }
227}
228
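// For example, with the default settings above the branch ranges work out
// roughly as follows (offsets are counted in instructions, i.e. BrOffset / 4):
//   TB[N]Z : 14-bit signed word offset -> +/- 32 KiB
//   CB[N]Z : 19-bit signed word offset -> +/-  1 MiB
//   Bcc    : 19-bit signed word offset -> +/-  1 MiB
//   B      : 26-bit signed word offset -> +/- 128 MiB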
229bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp,
230 int64_t BrOffset) const {
231 unsigned Bits = getBranchDisplacementBits(Opc: BranchOp);
  assert(Bits >= 3 && "max branch displacement must be enough to jump "
                      "over conditional branch expansion");
234 return isIntN(N: Bits, x: BrOffset / 4);
235}
236
237MachineBasicBlock *
238AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
239 switch (MI.getOpcode()) {
240 default:
241 llvm_unreachable("unexpected opcode!");
242 case AArch64::B:
243 return MI.getOperand(i: 0).getMBB();
244 case AArch64::TBZW:
245 case AArch64::TBNZW:
246 case AArch64::TBZX:
247 case AArch64::TBNZX:
248 return MI.getOperand(i: 2).getMBB();
249 case AArch64::CBZW:
250 case AArch64::CBNZW:
251 case AArch64::CBZX:
252 case AArch64::CBNZX:
253 case AArch64::Bcc:
254 return MI.getOperand(i: 1).getMBB();
255 }
256}
257
258void AArch64InstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
259 MachineBasicBlock &NewDestBB,
260 MachineBasicBlock &RestoreBB,
261 const DebugLoc &DL,
262 int64_t BrOffset,
263 RegScavenger *RS) const {
264 assert(RS && "RegScavenger required for long branching");
265 assert(MBB.empty() &&
266 "new block should be inserted for expanding unconditional branch");
267 assert(MBB.pred_size() == 1);
268 assert(RestoreBB.empty() &&
269 "restore block should be inserted for restoring clobbered registers");
270
271 auto buildIndirectBranch = [&](Register Reg, MachineBasicBlock &DestBB) {
272 // Offsets outside of the signed 33-bit range are not supported for ADRP +
273 // ADD.
274 if (!isInt<33>(x: BrOffset))
275 report_fatal_error(
276 reason: "Branch offsets outside of the signed 33-bit range not supported");
277
278 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADRP), Reg)
279 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGE);
280 BuildMI(MBB, MBB.end(), DL, get(AArch64::ADDXri), Reg)
281 .addReg(Reg)
282 .addSym(DestBB.getSymbol(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
283 .addImm(0);
284 BuildMI(MBB, MBB.end(), DL, get(AArch64::BR)).addReg(Reg);
285 };
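  // For a destination within ADRP+ADD range, the lambda above emits a
  // sequence along these lines (register chosen by the caller; sketch only):
  //   adrp x16, <DestBB>             ; page address of the destination
  //   add  x16, x16, :lo12:<DestBB>  ; low 12 bits of the destination
  //   br   x16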
286
287 RS->enterBasicBlockEnd(MBB);
288 // If X16 is unused, we can rely on the linker to insert a range extension
289 // thunk if NewDestBB is out of range of a single B instruction.
290 constexpr Register Reg = AArch64::X16;
291 if (!RS->isRegUsed(Reg)) {
292 insertUnconditionalBranch(MBB, &NewDestBB, DL);
293 RS->setRegUsed(Reg);
294 return;
295 }
296
297 // If there's a free register and it's worth inflating the code size,
298 // manually insert the indirect branch.
299 Register Scavenged = RS->FindUnusedReg(RC: &AArch64::GPR64RegClass);
300 if (Scavenged != AArch64::NoRegister &&
301 MBB.getSectionID() == MBBSectionID::ColdSectionID) {
302 buildIndirectBranch(Scavenged, NewDestBB);
303 RS->setRegUsed(Reg: Scavenged);
304 return;
305 }
306
307 // Note: Spilling X16 briefly moves the stack pointer, making it incompatible
308 // with red zones.
309 AArch64FunctionInfo *AFI = MBB.getParent()->getInfo<AArch64FunctionInfo>();
310 if (!AFI || AFI->hasRedZone().value_or(u: true))
311 report_fatal_error(
312 reason: "Unable to insert indirect branch inside function that has red zone");
313
314 // Otherwise, spill X16 and defer range extension to the linker.
315 BuildMI(MBB, MBB.end(), DL, get(AArch64::STRXpre))
316 .addReg(AArch64::SP, RegState::Define)
317 .addReg(Reg)
318 .addReg(AArch64::SP)
319 .addImm(-16);
320
321 BuildMI(MBB, MBB.end(), DL, get(AArch64::B)).addMBB(&RestoreBB);
322
323 BuildMI(RestoreBB, RestoreBB.end(), DL, get(AArch64::LDRXpost))
324 .addReg(AArch64::SP, RegState::Define)
325 .addReg(Reg, RegState::Define)
326 .addReg(AArch64::SP)
327 .addImm(16);
328}
329
330// Branch analysis.
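//
// As an illustration of the cases handled below, a block ending in
//   b.lt %bb.true
//   b    %bb.false
// is reported with TBB = %bb.true, FBB = %bb.false and Cond = { LT }, while a
// block ending in a lone "cbz x0, %bb.target" that falls through is reported
// with TBB = %bb.target, no FBB, and Cond = { -1, CBZX, x0 }.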
331bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
332 MachineBasicBlock *&TBB,
333 MachineBasicBlock *&FBB,
334 SmallVectorImpl<MachineOperand> &Cond,
335 bool AllowModify) const {
336 // If the block has no terminators, it just falls into the block after it.
337 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
338 if (I == MBB.end())
339 return false;
340
341 // Skip over SpeculationBarrierEndBB terminators
342 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
343 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
344 --I;
345 }
346
347 if (!isUnpredicatedTerminator(*I))
348 return false;
349
350 // Get the last instruction in the block.
351 MachineInstr *LastInst = &*I;
352
353 // If there is only one terminator instruction, process it.
354 unsigned LastOpc = LastInst->getOpcode();
355 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
356 if (isUncondBranchOpcode(Opc: LastOpc)) {
357 TBB = LastInst->getOperand(i: 0).getMBB();
358 return false;
359 }
360 if (isCondBranchOpcode(Opc: LastOpc)) {
361 // Block ends with fall-through condbranch.
362 parseCondBranch(LastInst, Target&: TBB, Cond);
363 return false;
364 }
365 return true; // Can't handle indirect branch.
366 }
367
368 // Get the instruction before it if it is a terminator.
369 MachineInstr *SecondLastInst = &*I;
370 unsigned SecondLastOpc = SecondLastInst->getOpcode();
371
372 // If AllowModify is true and the block ends with two or more unconditional
373 // branches, delete all but the first unconditional branch.
374 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc)) {
375 while (isUncondBranchOpcode(Opc: SecondLastOpc)) {
376 LastInst->eraseFromParent();
377 LastInst = SecondLastInst;
378 LastOpc = LastInst->getOpcode();
379 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
        // Return now; the only terminator is an unconditional branch.
381 TBB = LastInst->getOperand(i: 0).getMBB();
382 return false;
383 }
384 SecondLastInst = &*I;
385 SecondLastOpc = SecondLastInst->getOpcode();
386 }
387 }
388
  // If we're allowed to modify and the block ends in an unconditional branch
  // which could simply fall through, remove the branch. (Note: This case only
  // matters when we can't understand the whole sequence; otherwise it's also
  // handled by BranchFolding.cpp.)
393 if (AllowModify && isUncondBranchOpcode(Opc: LastOpc) &&
394 MBB.isLayoutSuccessor(MBB: getBranchDestBlock(MI: *LastInst))) {
395 LastInst->eraseFromParent();
396 LastInst = SecondLastInst;
397 LastOpc = LastInst->getOpcode();
398 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
399 assert(!isUncondBranchOpcode(LastOpc) &&
400 "unreachable unconditional branches removed above");
401
402 if (isCondBranchOpcode(Opc: LastOpc)) {
403 // Block ends with fall-through condbranch.
404 parseCondBranch(LastInst, Target&: TBB, Cond);
405 return false;
406 }
407 return true; // Can't handle indirect branch.
408 }
409 SecondLastInst = &*I;
410 SecondLastOpc = SecondLastInst->getOpcode();
411 }
412
413 // If there are three terminators, we don't know what sort of block this is.
414 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
415 return true;
416
417 // If the block ends with a B and a Bcc, handle it.
418 if (isCondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
419 parseCondBranch(LastInst: SecondLastInst, Target&: TBB, Cond);
420 FBB = LastInst->getOperand(i: 0).getMBB();
421 return false;
422 }
423
424 // If the block ends with two unconditional branches, handle it. The second
425 // one is not executed, so remove it.
426 if (isUncondBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
427 TBB = SecondLastInst->getOperand(i: 0).getMBB();
428 I = LastInst;
429 if (AllowModify)
430 I->eraseFromParent();
431 return false;
432 }
433
434 // ...likewise if it ends with an indirect branch followed by an unconditional
435 // branch.
436 if (isIndirectBranchOpcode(Opc: SecondLastOpc) && isUncondBranchOpcode(Opc: LastOpc)) {
437 I = LastInst;
438 if (AllowModify)
439 I->eraseFromParent();
440 return true;
441 }
442
443 // Otherwise, can't handle this.
444 return true;
445}
446
447bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
448 MachineBranchPredicate &MBP,
449 bool AllowModify) const {
  // For the moment, handle only a block which ends with a cb(n)z followed by
  // a fall-through. Why this? Because it is a common form.
  // TODO: Should we handle b.cc?
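  //
  // For example (illustrative registers), a block ending in
  //   cbnz x0, %bb.true
  //   ; fall through to the next block
  // is reported as LHS = x0, RHS = #0, Predicate = PRED_NE, with TrueDest set
  // to %bb.true and FalseDest set to the fall-through block.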
453
454 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
455 if (I == MBB.end())
456 return true;
457
458 // Skip over SpeculationBarrierEndBB terminators
459 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
460 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
461 --I;
462 }
463
464 if (!isUnpredicatedTerminator(*I))
465 return true;
466
467 // Get the last instruction in the block.
468 MachineInstr *LastInst = &*I;
469 unsigned LastOpc = LastInst->getOpcode();
470 if (!isCondBranchOpcode(Opc: LastOpc))
471 return true;
472
473 switch (LastOpc) {
474 default:
475 return true;
476 case AArch64::CBZW:
477 case AArch64::CBZX:
478 case AArch64::CBNZW:
479 case AArch64::CBNZX:
480 break;
481 };
482
483 MBP.TrueDest = LastInst->getOperand(i: 1).getMBB();
484 assert(MBP.TrueDest && "expected!");
485 MBP.FalseDest = MBB.getNextNode();
486
487 MBP.ConditionDef = nullptr;
488 MBP.SingleUseCondition = false;
489
490 MBP.LHS = LastInst->getOperand(i: 0);
491 MBP.RHS = MachineOperand::CreateImm(Val: 0);
492 MBP.Predicate = LastOpc == AArch64::CBNZX ? MachineBranchPredicate::PRED_NE
493 : MachineBranchPredicate::PRED_EQ;
494 return false;
495}
496
497bool AArch64InstrInfo::reverseBranchCondition(
498 SmallVectorImpl<MachineOperand> &Cond) const {
499 if (Cond[0].getImm() != -1) {
500 // Regular Bcc
501 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
502 Cond[0].setImm(AArch64CC::getInvertedCondCode(Code: CC));
503 } else {
504 // Folded compare-and-branch
505 switch (Cond[1].getImm()) {
506 default:
507 llvm_unreachable("Unknown conditional branch!");
508 case AArch64::CBZW:
509 Cond[1].setImm(AArch64::CBNZW);
510 break;
511 case AArch64::CBNZW:
512 Cond[1].setImm(AArch64::CBZW);
513 break;
514 case AArch64::CBZX:
515 Cond[1].setImm(AArch64::CBNZX);
516 break;
517 case AArch64::CBNZX:
518 Cond[1].setImm(AArch64::CBZX);
519 break;
520 case AArch64::TBZW:
521 Cond[1].setImm(AArch64::TBNZW);
522 break;
523 case AArch64::TBNZW:
524 Cond[1].setImm(AArch64::TBZW);
525 break;
526 case AArch64::TBZX:
527 Cond[1].setImm(AArch64::TBNZX);
528 break;
529 case AArch64::TBNZX:
530 Cond[1].setImm(AArch64::TBZX);
531 break;
532 }
533 }
534
535 return false;
536}
537
538unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB,
539 int *BytesRemoved) const {
540 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
541 if (I == MBB.end())
542 return 0;
543
544 if (!isUncondBranchOpcode(Opc: I->getOpcode()) &&
545 !isCondBranchOpcode(Opc: I->getOpcode()))
546 return 0;
547
548 // Remove the branch.
549 I->eraseFromParent();
550
551 I = MBB.end();
552
553 if (I == MBB.begin()) {
554 if (BytesRemoved)
555 *BytesRemoved = 4;
556 return 1;
557 }
558 --I;
559 if (!isCondBranchOpcode(Opc: I->getOpcode())) {
560 if (BytesRemoved)
561 *BytesRemoved = 4;
562 return 1;
563 }
564
565 // Remove the branch.
566 I->eraseFromParent();
567 if (BytesRemoved)
568 *BytesRemoved = 8;
569
570 return 2;
571}
572
573void AArch64InstrInfo::instantiateCondBranch(
574 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB,
575 ArrayRef<MachineOperand> Cond) const {
576 if (Cond[0].getImm() != -1) {
577 // Regular Bcc
578 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
579 } else {
580 // Folded compare-and-branch
581 // Note that we use addOperand instead of addReg to keep the flags.
582 const MachineInstrBuilder MIB =
583 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
584 if (Cond.size() > 3)
585 MIB.addImm(Val: Cond[3].getImm());
586 MIB.addMBB(MBB: TBB);
587 }
588}
589
590unsigned AArch64InstrInfo::insertBranch(
591 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
592 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
593 // Shouldn't be a fall through.
594 assert(TBB && "insertBranch must not be told to insert a fallthrough");
595
596 if (!FBB) {
597 if (Cond.empty()) // Unconditional branch?
598 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
599 else
600 instantiateCondBranch(MBB, DL, TBB, Cond);
601
602 if (BytesAdded)
603 *BytesAdded = 4;
604
605 return 1;
606 }
607
608 // Two-way conditional branch.
609 instantiateCondBranch(MBB, DL, TBB, Cond);
610 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
611
612 if (BytesAdded)
613 *BytesAdded = 8;
614
615 return 2;
616}
617
618// Find the original register that VReg is copied from.
619static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
620 while (Register::isVirtualRegister(Reg: VReg)) {
621 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
622 if (!DefMI->isFullCopy())
623 return VReg;
624 VReg = DefMI->getOperand(i: 1).getReg();
625 }
626 return VReg;
627}
628
// Determine if VReg is defined by an instruction that can be folded into a
// csel instruction. If so, return the folded opcode and, if NewVReg is
// non-null, the replacement source register.
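//
// Illustrative example (register numbers are arbitrary): the pair
//   add  w8, w0, #1
//   csel w9, w8, w1, eq
// can be folded into the single instruction
//   csinc w9, w1, w0, ne      ; w9 = ne ? w1 : (w0 + 1)
// which is what canFoldIntoCSel() enables insertSelect() to do.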
632static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
633 unsigned *NewVReg = nullptr) {
634 VReg = removeCopies(MRI, VReg);
635 if (!Register::isVirtualRegister(Reg: VReg))
636 return 0;
637
638 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
639 const MachineInstr *DefMI = MRI.getVRegDef(Reg: VReg);
640 unsigned Opc = 0;
641 unsigned SrcOpNum = 0;
642 switch (DefMI->getOpcode()) {
643 case AArch64::ADDSXri:
644 case AArch64::ADDSWri:
645 // if NZCV is used, do not fold.
646 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
647 true) == -1)
648 return 0;
649 // fall-through to ADDXri and ADDWri.
650 [[fallthrough]];
651 case AArch64::ADDXri:
652 case AArch64::ADDWri:
653 // add x, 1 -> csinc.
654 if (!DefMI->getOperand(i: 2).isImm() || DefMI->getOperand(i: 2).getImm() != 1 ||
655 DefMI->getOperand(i: 3).getImm() != 0)
656 return 0;
657 SrcOpNum = 1;
658 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
659 break;
660
661 case AArch64::ORNXrr:
662 case AArch64::ORNWrr: {
663 // not x -> csinv, represented as orn dst, xzr, src.
664 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
665 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
666 return 0;
667 SrcOpNum = 2;
668 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
669 break;
670 }
671
672 case AArch64::SUBSXrr:
673 case AArch64::SUBSWrr:
674 // if NZCV is used, do not fold.
675 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
676 true) == -1)
677 return 0;
678 // fall-through to SUBXrr and SUBWrr.
679 [[fallthrough]];
680 case AArch64::SUBXrr:
681 case AArch64::SUBWrr: {
682 // neg x -> csneg, represented as sub dst, xzr, src.
683 unsigned ZReg = removeCopies(MRI, VReg: DefMI->getOperand(i: 1).getReg());
684 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
685 return 0;
686 SrcOpNum = 2;
687 Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
688 break;
689 }
690 default:
691 return 0;
692 }
693 assert(Opc && SrcOpNum && "Missing parameters");
694
695 if (NewVReg)
696 *NewVReg = DefMI->getOperand(i: SrcOpNum).getReg();
697 return Opc;
698}
699
700bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
701 ArrayRef<MachineOperand> Cond,
702 Register DstReg, Register TrueReg,
703 Register FalseReg, int &CondCycles,
704 int &TrueCycles,
705 int &FalseCycles) const {
706 // Check register classes.
707 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
708 const TargetRegisterClass *RC =
709 RI.getCommonSubClass(MRI.getRegClass(Reg: TrueReg), MRI.getRegClass(Reg: FalseReg));
710 if (!RC)
711 return false;
712
713 // Also need to check the dest regclass, in case we're trying to optimize
714 // something like:
715 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
716 if (!RI.getCommonSubClass(RC, MRI.getRegClass(Reg: DstReg)))
717 return false;
718
719 // Expanding cbz/tbz requires an extra cycle of latency on the condition.
720 unsigned ExtraCondLat = Cond.size() != 1;
721
722 // GPRs are handled by csel.
723 // FIXME: Fold in x+1, -x, and ~x when applicable.
724 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
725 AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
726 // Single-cycle csel, csinc, csinv, and csneg.
727 CondCycles = 1 + ExtraCondLat;
728 TrueCycles = FalseCycles = 1;
729 if (canFoldIntoCSel(MRI, VReg: TrueReg))
730 TrueCycles = 0;
731 else if (canFoldIntoCSel(MRI, VReg: FalseReg))
732 FalseCycles = 0;
733 return true;
734 }
735
736 // Scalar floating point is handled by fcsel.
737 // FIXME: Form fabs, fmin, and fmax when applicable.
738 if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
739 AArch64::FPR32RegClass.hasSubClassEq(RC)) {
740 CondCycles = 5 + ExtraCondLat;
741 TrueCycles = FalseCycles = 2;
742 return true;
743 }
744
745 // Can't do vectors.
746 return false;
747}
748
749void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
750 MachineBasicBlock::iterator I,
751 const DebugLoc &DL, Register DstReg,
752 ArrayRef<MachineOperand> Cond,
753 Register TrueReg, Register FalseReg) const {
754 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
755
756 // Parse the condition code, see parseCondBranch() above.
757 AArch64CC::CondCode CC;
758 switch (Cond.size()) {
759 default:
760 llvm_unreachable("Unknown condition opcode in Cond");
761 case 1: // b.cc
762 CC = AArch64CC::CondCode(Cond[0].getImm());
763 break;
764 case 3: { // cbz/cbnz
765 // We must insert a compare against 0.
766 bool Is64Bit;
767 switch (Cond[1].getImm()) {
768 default:
769 llvm_unreachable("Unknown branch opcode in Cond");
770 case AArch64::CBZW:
771 Is64Bit = false;
772 CC = AArch64CC::EQ;
773 break;
774 case AArch64::CBZX:
775 Is64Bit = true;
776 CC = AArch64CC::EQ;
777 break;
778 case AArch64::CBNZW:
779 Is64Bit = false;
780 CC = AArch64CC::NE;
781 break;
782 case AArch64::CBNZX:
783 Is64Bit = true;
784 CC = AArch64CC::NE;
785 break;
786 }
787 Register SrcReg = Cond[2].getReg();
788 if (Is64Bit) {
789 // cmp reg, #0 is actually subs xzr, reg, #0.
790 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
791 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
792 .addReg(SrcReg)
793 .addImm(0)
794 .addImm(0);
795 } else {
796 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
797 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
798 .addReg(SrcReg)
799 .addImm(0)
800 .addImm(0);
801 }
802 break;
803 }
804 case 4: { // tbz/tbnz
805 // We must insert a tst instruction.
806 switch (Cond[1].getImm()) {
807 default:
808 llvm_unreachable("Unknown branch opcode in Cond");
809 case AArch64::TBZW:
810 case AArch64::TBZX:
811 CC = AArch64CC::EQ;
812 break;
813 case AArch64::TBNZW:
814 case AArch64::TBNZX:
815 CC = AArch64CC::NE;
816 break;
817 }
818 // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
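    // For instance, a TBNZ on bit 3 of a W register is lowered here to
    //   ands wzr, w<src>, #0x8
    // and the csel built below then uses the NE condition.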
819 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
820 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
821 .addReg(Cond[2].getReg())
822 .addImm(
823 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
824 else
825 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
826 .addReg(Cond[2].getReg())
827 .addImm(
828 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
829 break;
830 }
831 }
832
833 unsigned Opc = 0;
834 const TargetRegisterClass *RC = nullptr;
835 bool TryFold = false;
836 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
837 RC = &AArch64::GPR64RegClass;
838 Opc = AArch64::CSELXr;
839 TryFold = true;
840 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
841 RC = &AArch64::GPR32RegClass;
842 Opc = AArch64::CSELWr;
843 TryFold = true;
844 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
845 RC = &AArch64::FPR64RegClass;
846 Opc = AArch64::FCSELDrrr;
847 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
848 RC = &AArch64::FPR32RegClass;
849 Opc = AArch64::FCSELSrrr;
850 }
851 assert(RC && "Unsupported regclass");
852
853 // Try folding simple instructions into the csel.
854 if (TryFold) {
855 unsigned NewVReg = 0;
856 unsigned FoldedOpc = canFoldIntoCSel(MRI, VReg: TrueReg, NewVReg: &NewVReg);
857 if (FoldedOpc) {
      // The folded opcodes csinc, csinv and csneg apply the operation to
      // FalseReg, so we need to invert the condition.
860 CC = AArch64CC::getInvertedCondCode(Code: CC);
861 TrueReg = FalseReg;
862 } else
863 FoldedOpc = canFoldIntoCSel(MRI, VReg: FalseReg, NewVReg: &NewVReg);
864
865 // Fold the operation. Leave any dead instructions for DCE to clean up.
866 if (FoldedOpc) {
867 FalseReg = NewVReg;
868 Opc = FoldedOpc;
      // This extends the live range of NewVReg.
870 MRI.clearKillFlags(Reg: NewVReg);
871 }
872 }
873
  // Pull all virtual registers into the appropriate class.
875 MRI.constrainRegClass(Reg: TrueReg, RC);
876 MRI.constrainRegClass(Reg: FalseReg, RC);
877
878 // Insert the csel.
879 BuildMI(MBB, I, DL, get(Opc), DstReg)
880 .addReg(TrueReg)
881 .addReg(FalseReg)
882 .addImm(CC);
883}
884
885// Return true if Imm can be loaded into a register by a "cheap" sequence of
886// instructions. For now, "cheap" means at most two instructions.
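//
// For instance, a constant such as 0x0000ffff0000ffff is cheap (it is
// encodable as a single ORR with a logical immediate), whereas an arbitrary
// 64-bit constant like 0x123456789abcdef0 needs a MOVZ plus three MOVKs and
// is therefore not considered cheap here.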
887static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) {
888 if (BitSize == 32)
889 return true;
890
891 assert(BitSize == 64 && "Only bit sizes of 32 or 64 allowed");
892 uint64_t Imm = static_cast<uint64_t>(MI.getOperand(i: 1).getImm());
893 SmallVector<AArch64_IMM::ImmInsnModel, 4> Is;
894 AArch64_IMM::expandMOVImm(Imm, BitSize, Insn&: Is);
895
896 return Is.size() <= 2;
897}
898
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in the future.
901bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
902 if (Subtarget.hasExynosCheapAsMoveHandling()) {
903 if (isExynosCheapAsMove(MI))
904 return true;
905 return MI.isAsCheapAsAMove();
906 }
907
908 switch (MI.getOpcode()) {
909 default:
910 return MI.isAsCheapAsAMove();
911
912 case AArch64::ADDWrs:
913 case AArch64::ADDXrs:
914 case AArch64::SUBWrs:
915 case AArch64::SUBXrs:
916 return Subtarget.hasALULSLFast() && MI.getOperand(i: 3).getImm() <= 4;
917
918 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
919 // ORRXri, it is as cheap as MOV.
920 // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
921 case AArch64::MOVi32imm:
922 return isCheapImmediate(MI, BitSize: 32);
923 case AArch64::MOVi64imm:
924 return isCheapImmediate(MI, BitSize: 64);
925 }
926}
927
928bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) {
929 switch (MI.getOpcode()) {
930 default:
931 return false;
932
933 case AArch64::ADDWrs:
934 case AArch64::ADDXrs:
935 case AArch64::ADDSWrs:
936 case AArch64::ADDSXrs: {
937 unsigned Imm = MI.getOperand(i: 3).getImm();
938 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
939 if (ShiftVal == 0)
940 return true;
941 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
942 }
943
944 case AArch64::ADDWrx:
945 case AArch64::ADDXrx:
946 case AArch64::ADDXrx64:
947 case AArch64::ADDSWrx:
948 case AArch64::ADDSXrx:
949 case AArch64::ADDSXrx64: {
950 unsigned Imm = MI.getOperand(i: 3).getImm();
951 switch (AArch64_AM::getArithExtendType(Imm)) {
952 default:
953 return false;
954 case AArch64_AM::UXTB:
955 case AArch64_AM::UXTH:
956 case AArch64_AM::UXTW:
957 case AArch64_AM::UXTX:
958 return AArch64_AM::getArithShiftValue(Imm) <= 4;
959 }
960 }
961
962 case AArch64::SUBWrs:
963 case AArch64::SUBSWrs: {
964 unsigned Imm = MI.getOperand(i: 3).getImm();
965 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
966 return ShiftVal == 0 ||
967 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
968 }
969
970 case AArch64::SUBXrs:
971 case AArch64::SUBSXrs: {
972 unsigned Imm = MI.getOperand(i: 3).getImm();
973 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
974 return ShiftVal == 0 ||
975 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
976 }
977
978 case AArch64::SUBWrx:
979 case AArch64::SUBXrx:
980 case AArch64::SUBXrx64:
981 case AArch64::SUBSWrx:
982 case AArch64::SUBSXrx:
983 case AArch64::SUBSXrx64: {
984 unsigned Imm = MI.getOperand(i: 3).getImm();
985 switch (AArch64_AM::getArithExtendType(Imm)) {
986 default:
987 return false;
988 case AArch64_AM::UXTB:
989 case AArch64_AM::UXTH:
990 case AArch64_AM::UXTW:
991 case AArch64_AM::UXTX:
992 return AArch64_AM::getArithShiftValue(Imm) == 0;
993 }
994 }
995
996 case AArch64::LDRBBroW:
997 case AArch64::LDRBBroX:
998 case AArch64::LDRBroW:
999 case AArch64::LDRBroX:
1000 case AArch64::LDRDroW:
1001 case AArch64::LDRDroX:
1002 case AArch64::LDRHHroW:
1003 case AArch64::LDRHHroX:
1004 case AArch64::LDRHroW:
1005 case AArch64::LDRHroX:
1006 case AArch64::LDRQroW:
1007 case AArch64::LDRQroX:
1008 case AArch64::LDRSBWroW:
1009 case AArch64::LDRSBWroX:
1010 case AArch64::LDRSBXroW:
1011 case AArch64::LDRSBXroX:
1012 case AArch64::LDRSHWroW:
1013 case AArch64::LDRSHWroX:
1014 case AArch64::LDRSHXroW:
1015 case AArch64::LDRSHXroX:
1016 case AArch64::LDRSWroW:
1017 case AArch64::LDRSWroX:
1018 case AArch64::LDRSroW:
1019 case AArch64::LDRSroX:
1020 case AArch64::LDRWroW:
1021 case AArch64::LDRWroX:
1022 case AArch64::LDRXroW:
1023 case AArch64::LDRXroX:
1024 case AArch64::PRFMroW:
1025 case AArch64::PRFMroX:
1026 case AArch64::STRBBroW:
1027 case AArch64::STRBBroX:
1028 case AArch64::STRBroW:
1029 case AArch64::STRBroX:
1030 case AArch64::STRDroW:
1031 case AArch64::STRDroX:
1032 case AArch64::STRHHroW:
1033 case AArch64::STRHHroX:
1034 case AArch64::STRHroW:
1035 case AArch64::STRHroX:
1036 case AArch64::STRQroW:
1037 case AArch64::STRQroX:
1038 case AArch64::STRSroW:
1039 case AArch64::STRSroX:
1040 case AArch64::STRWroW:
1041 case AArch64::STRWroX:
1042 case AArch64::STRXroW:
1043 case AArch64::STRXroX: {
1044 unsigned IsSigned = MI.getOperand(i: 3).getImm();
1045 return !IsSigned;
1046 }
1047 }
1048}
1049
1050bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
1051 unsigned Opc = MI.getOpcode();
1052 switch (Opc) {
1053 default:
1054 return false;
1055 case AArch64::SEH_StackAlloc:
1056 case AArch64::SEH_SaveFPLR:
1057 case AArch64::SEH_SaveFPLR_X:
1058 case AArch64::SEH_SaveReg:
1059 case AArch64::SEH_SaveReg_X:
1060 case AArch64::SEH_SaveRegP:
1061 case AArch64::SEH_SaveRegP_X:
1062 case AArch64::SEH_SaveFReg:
1063 case AArch64::SEH_SaveFReg_X:
1064 case AArch64::SEH_SaveFRegP:
1065 case AArch64::SEH_SaveFRegP_X:
1066 case AArch64::SEH_SetFP:
1067 case AArch64::SEH_AddFP:
1068 case AArch64::SEH_Nop:
1069 case AArch64::SEH_PrologEnd:
1070 case AArch64::SEH_EpilogStart:
1071 case AArch64::SEH_EpilogEnd:
1072 case AArch64::SEH_PACSignLR:
1073 case AArch64::SEH_SaveAnyRegQP:
1074 case AArch64::SEH_SaveAnyRegQPX:
1075 return true;
1076 }
1077}
1078
1079bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
1080 Register &SrcReg, Register &DstReg,
1081 unsigned &SubIdx) const {
1082 switch (MI.getOpcode()) {
1083 default:
1084 return false;
1085 case AArch64::SBFMXri: // aka sxtw
1086 case AArch64::UBFMXri: // aka uxtw
1087 // Check for the 32 -> 64 bit extension case, these instructions can do
1088 // much more.
1089 if (MI.getOperand(i: 2).getImm() != 0 || MI.getOperand(i: 3).getImm() != 31)
1090 return false;
1091 // This is a signed or unsigned 32 -> 64 bit extension.
1092 SrcReg = MI.getOperand(i: 1).getReg();
1093 DstReg = MI.getOperand(i: 0).getReg();
1094 SubIdx = AArch64::sub_32;
1095 return true;
1096 }
1097}
1098
1099bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
1100 const MachineInstr &MIa, const MachineInstr &MIb) const {
1101 const TargetRegisterInfo *TRI = &getRegisterInfo();
1102 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
1103 int64_t OffsetA = 0, OffsetB = 0;
1104 TypeSize WidthA(0, false), WidthB(0, false);
1105 bool OffsetAIsScalable = false, OffsetBIsScalable = false;
1106
1107 assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
1108 assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
1109
1110 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() ||
1111 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1112 return false;
1113
  // Retrieve the base register, the offset from the base, and the width.
  // Width is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8).
  // If the bases are identical, and the offset of the lower memory access plus
  // its width does not exceed the offset of the higher memory access, then the
  // memory accesses are disjoint.
1119 // If OffsetAIsScalable and OffsetBIsScalable are both true, they
1120 // are assumed to have the same scale (vscale).
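  //
  // For example, two stores off the same base register at offsets #0 and #8,
  // each 8 bytes wide, are disjoint because 0 + 8 <= 8; the same stores at
  // offsets #0 and #4 would not be.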
1121 if (getMemOperandWithOffsetWidth(MI: MIa, BaseOp&: BaseOpA, Offset&: OffsetA, OffsetIsScalable&: OffsetAIsScalable,
1122 Width&: WidthA, TRI) &&
1123 getMemOperandWithOffsetWidth(MI: MIb, BaseOp&: BaseOpB, Offset&: OffsetB, OffsetIsScalable&: OffsetBIsScalable,
1124 Width&: WidthB, TRI)) {
1125 if (BaseOpA->isIdenticalTo(Other: *BaseOpB) &&
1126 OffsetAIsScalable == OffsetBIsScalable) {
1127 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1128 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1129 TypeSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1130 if (LowWidth.isScalable() == OffsetAIsScalable &&
1131 LowOffset + (int)LowWidth.getKnownMinValue() <= HighOffset)
1132 return true;
1133 }
1134 }
1135 return false;
1136}
1137
1138bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1139 const MachineBasicBlock *MBB,
1140 const MachineFunction &MF) const {
1141 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
1142 return true;
1143
1144 // Do not move an instruction that can be recognized as a branch target.
1145 if (hasBTISemantics(MI))
1146 return true;
1147
1148 switch (MI.getOpcode()) {
1149 case AArch64::HINT:
1150 // CSDB hints are scheduling barriers.
1151 if (MI.getOperand(i: 0).getImm() == 0x14)
1152 return true;
1153 break;
1154 case AArch64::DSB:
1155 case AArch64::ISB:
1156 // DSB and ISB also are scheduling barriers.
1157 return true;
1158 case AArch64::MSRpstatesvcrImm1:
1159 // SMSTART and SMSTOP are also scheduling barriers.
1160 return true;
1161 default:;
1162 }
1163 if (isSEHInstruction(MI))
1164 return true;
1165 auto Next = std::next(x: MI.getIterator());
1166 return Next != MBB->end() && Next->isCFIInstruction();
1167}
1168
1169/// analyzeCompare - For a comparison instruction, return the source registers
1170/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
1171/// Return true if the comparison instruction can be analyzed.
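///
/// For example, "subs wzr, w1, #42" (i.e. cmp w1, #42) is reported as
/// SrcReg = w1, SrcReg2 = 0, CmpMask = ~0, CmpValue = 42, while
/// "subs w0, w1, w2" is reported as SrcReg = w1, SrcReg2 = w2, CmpValue = 0.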
1172bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
1173 Register &SrcReg2, int64_t &CmpMask,
1174 int64_t &CmpValue) const {
1175 // The first operand can be a frame index where we'd normally expect a
1176 // register.
1177 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands");
1178 if (!MI.getOperand(i: 1).isReg())
1179 return false;
1180
1181 switch (MI.getOpcode()) {
1182 default:
1183 break;
1184 case AArch64::PTEST_PP:
1185 case AArch64::PTEST_PP_ANY:
1186 SrcReg = MI.getOperand(i: 0).getReg();
1187 SrcReg2 = MI.getOperand(i: 1).getReg();
1188 // Not sure about the mask and value for now...
1189 CmpMask = ~0;
1190 CmpValue = 0;
1191 return true;
1192 case AArch64::SUBSWrr:
1193 case AArch64::SUBSWrs:
1194 case AArch64::SUBSWrx:
1195 case AArch64::SUBSXrr:
1196 case AArch64::SUBSXrs:
1197 case AArch64::SUBSXrx:
1198 case AArch64::ADDSWrr:
1199 case AArch64::ADDSWrs:
1200 case AArch64::ADDSWrx:
1201 case AArch64::ADDSXrr:
1202 case AArch64::ADDSXrs:
1203 case AArch64::ADDSXrx:
1204 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1205 SrcReg = MI.getOperand(i: 1).getReg();
1206 SrcReg2 = MI.getOperand(i: 2).getReg();
1207 CmpMask = ~0;
1208 CmpValue = 0;
1209 return true;
1210 case AArch64::SUBSWri:
1211 case AArch64::ADDSWri:
1212 case AArch64::SUBSXri:
1213 case AArch64::ADDSXri:
1214 SrcReg = MI.getOperand(i: 1).getReg();
1215 SrcReg2 = 0;
1216 CmpMask = ~0;
1217 CmpValue = MI.getOperand(i: 2).getImm();
1218 return true;
1219 case AArch64::ANDSWri:
1220 case AArch64::ANDSXri:
    // ANDS does not use the same encoding scheme as the other xxxS
    // instructions.
1223 SrcReg = MI.getOperand(i: 1).getReg();
1224 SrcReg2 = 0;
1225 CmpMask = ~0;
1226 CmpValue = AArch64_AM::decodeLogicalImmediate(
1227 MI.getOperand(2).getImm(),
1228 MI.getOpcode() == AArch64::ANDSWri ? 32 : 64);
1229 return true;
1230 }
1231
1232 return false;
1233}
1234
1235static bool UpdateOperandRegClass(MachineInstr &Instr) {
1236 MachineBasicBlock *MBB = Instr.getParent();
1237 assert(MBB && "Can't get MachineBasicBlock here");
1238 MachineFunction *MF = MBB->getParent();
1239 assert(MF && "Can't get MachineFunction here");
1240 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
1241 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
1242 MachineRegisterInfo *MRI = &MF->getRegInfo();
1243
1244 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx;
1245 ++OpIdx) {
1246 MachineOperand &MO = Instr.getOperand(i: OpIdx);
1247 const TargetRegisterClass *OpRegCstraints =
1248 Instr.getRegClassConstraint(OpIdx, TII, TRI);
1249
1250 // If there's no constraint, there's nothing to do.
1251 if (!OpRegCstraints)
1252 continue;
1253 // If the operand is a frame index, there's nothing to do here.
1254 // A frame index operand will resolve correctly during PEI.
1255 if (MO.isFI())
1256 continue;
1257
1258 assert(MO.isReg() &&
1259 "Operand has register constraints without being a register!");
1260
1261 Register Reg = MO.getReg();
1262 if (Reg.isPhysical()) {
1263 if (!OpRegCstraints->contains(Reg))
1264 return false;
1265 } else if (!OpRegCstraints->hasSubClassEq(RC: MRI->getRegClass(Reg)) &&
1266 !MRI->constrainRegClass(Reg, RC: OpRegCstraints))
1267 return false;
1268 }
1269
1270 return true;
1271}
1272
1273/// Return the opcode that does not set flags when possible - otherwise
1274/// return the original opcode. The caller is responsible to do the actual
1275/// substitution and legality checking.
1276static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
  // Don't convert all compare instructions: for some of them the encoding of
  // the zero register in the destination would become the sp register.
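  // For example, "subs wzr, w0, #1" (i.e. cmp w0, #1) must keep its
  // flag-setting form: in the non-flag-setting SUBWri encoding, register 31
  // in the destination slot means WSP rather than WZR, so dropping the S
  // would effectively produce "sub wsp, w0, #1".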
1279 bool MIDefinesZeroReg = false;
1280 if (MI.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1281 MI.definesRegister(AArch64::XZR, /*TRI=*/nullptr))
1282 MIDefinesZeroReg = true;
1283
1284 switch (MI.getOpcode()) {
1285 default:
1286 return MI.getOpcode();
1287 case AArch64::ADDSWrr:
1288 return AArch64::ADDWrr;
1289 case AArch64::ADDSWri:
1290 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
1291 case AArch64::ADDSWrs:
1292 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
1293 case AArch64::ADDSWrx:
1294 return AArch64::ADDWrx;
1295 case AArch64::ADDSXrr:
1296 return AArch64::ADDXrr;
1297 case AArch64::ADDSXri:
1298 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
1299 case AArch64::ADDSXrs:
1300 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
1301 case AArch64::ADDSXrx:
1302 return AArch64::ADDXrx;
1303 case AArch64::SUBSWrr:
1304 return AArch64::SUBWrr;
1305 case AArch64::SUBSWri:
1306 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
1307 case AArch64::SUBSWrs:
1308 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
1309 case AArch64::SUBSWrx:
1310 return AArch64::SUBWrx;
1311 case AArch64::SUBSXrr:
1312 return AArch64::SUBXrr;
1313 case AArch64::SUBSXri:
1314 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
1315 case AArch64::SUBSXrs:
1316 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
1317 case AArch64::SUBSXrx:
1318 return AArch64::SUBXrx;
1319 }
1320}
1321
1322enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 };
1323
1324/// True when condition flags are accessed (either by writing or reading)
1325/// on the instruction trace starting at From and ending at To.
1326///
/// Note: If From and To are from different blocks it's assumed the condition
/// flags are accessed on the path.
1329static bool areCFlagsAccessedBetweenInstrs(
1330 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To,
1331 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) {
1332 // Early exit if To is at the beginning of the BB.
1333 if (To == To->getParent()->begin())
1334 return true;
1335
1336 // Check whether the instructions are in the same basic block
1337 // If not, assume the condition flags might get modified somewhere.
1338 if (To->getParent() != From->getParent())
1339 return true;
1340
1341 // From must be above To.
1342 assert(std::any_of(
1343 ++To.getReverse(), To->getParent()->rend(),
1344 [From](MachineInstr &MI) { return MI.getIterator() == From; }));
1345
1346 // We iterate backward starting at \p To until we hit \p From.
1347 for (const MachineInstr &Instr :
1348 instructionsWithoutDebug(It: ++To.getReverse(), End: From.getReverse())) {
1349 if (((AccessToCheck & AK_Write) &&
1350 Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
1351 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
1352 return true;
1353 }
1354 return false;
1355}
1356
1357/// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating
1358/// operation which could set the flags in an identical manner
1359bool AArch64InstrInfo::optimizePTestInstr(
1360 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg,
1361 const MachineRegisterInfo *MRI) const {
1362 auto *Mask = MRI->getUniqueVRegDef(Reg: MaskReg);
1363 auto *Pred = MRI->getUniqueVRegDef(Reg: PredReg);
1364 auto NewOp = Pred->getOpcode();
1365 bool OpChanged = false;
1366
1367 unsigned MaskOpcode = Mask->getOpcode();
1368 unsigned PredOpcode = Pred->getOpcode();
1369 bool PredIsPTestLike = isPTestLikeOpcode(Opc: PredOpcode);
1370 bool PredIsWhileLike = isWhileOpcode(Opc: PredOpcode);
1371
1372 if (isPTrueOpcode(Opc: MaskOpcode) && (PredIsPTestLike || PredIsWhileLike) &&
1373 getElementSizeForOpcode(Opc: MaskOpcode) ==
1374 getElementSizeForOpcode(Opc: PredOpcode) &&
1375 Mask->getOperand(i: 1).getImm() == 31) {
1376 // For PTEST(PTRUE_ALL, WHILE), if the element size matches, the PTEST is
1377 // redundant since WHILE performs an implicit PTEST with an all active
1378 // mask. Must be an all active predicate of matching element size.
1379
1380 // For PTEST(PTRUE_ALL, PTEST_LIKE), the PTEST is redundant if the
1381 // PTEST_LIKE instruction uses the same all active mask and the element
1382 // size matches. If the PTEST has a condition of any then it is always
1383 // redundant.
1384 if (PredIsPTestLike) {
1385 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1386 if (Mask != PTestLikeMask && PTest->getOpcode() != AArch64::PTEST_PP_ANY)
1387 return false;
1388 }
1389
    // Fallthrough to simply remove the PTEST.
1391 } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike) &&
1392 PTest->getOpcode() == AArch64::PTEST_PP_ANY) {
1393 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an
1394 // instruction that sets the flags as PTEST would. This is only valid when
1395 // the condition is any.
1396
    // Fallthrough to simply remove the PTEST.
1398 } else if (PredIsPTestLike) {
1399 // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
1400 // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
1401 // on 8-bit predicates like the PTEST. Otherwise, for instructions like
1402 // compare that also support 16/32/64-bit predicates, the implicit PTEST
1403 // performed by the compare could consider fewer lanes for these element
1404 // sizes.
1405 //
1406 // For example, consider
1407 //
1408 // ptrue p0.b ; P0=1111-1111-1111-1111
1409 // index z0.s, #0, #1 ; Z0=<0,1,2,3>
1410 // index z1.s, #1, #1 ; Z1=<1,2,3,4>
1411 // cmphi p1.s, p0/z, z1.s, z0.s ; P1=0001-0001-0001-0001
1412 // ; ^ last active
1413 // ptest p0, p1.b ; P1=0001-0001-0001-0001
1414 // ; ^ last active
1415 //
1416 // where the compare generates a canonical all active 32-bit predicate
1417 // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
1418 // active flag, whereas the PTEST instruction with the same mask doesn't.
1419 // For PTEST_ANY this doesn't apply as the flags in this case would be
1420 // identical regardless of element size.
1421 auto PTestLikeMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1422 uint64_t PredElementSize = getElementSizeForOpcode(Opc: PredOpcode);
1423 if ((Mask != PTestLikeMask) ||
1424 (PredElementSize != AArch64::ElementSizeB &&
1425 PTest->getOpcode() != AArch64::PTEST_PP_ANY))
1426 return false;
1427
    // Fallthrough to simply remove the PTEST.
1429 } else {
1430 // If OP in PTEST(PG, OP(PG, ...)) has a flag-setting variant change the
1431 // opcode so the PTEST becomes redundant.
1432 switch (PredOpcode) {
1433 case AArch64::AND_PPzPP:
1434 case AArch64::BIC_PPzPP:
1435 case AArch64::EOR_PPzPP:
1436 case AArch64::NAND_PPzPP:
1437 case AArch64::NOR_PPzPP:
1438 case AArch64::ORN_PPzPP:
1439 case AArch64::ORR_PPzPP:
1440 case AArch64::BRKA_PPzP:
1441 case AArch64::BRKPA_PPzPP:
1442 case AArch64::BRKB_PPzP:
1443 case AArch64::BRKPB_PPzPP:
1444 case AArch64::RDFFR_PPz: {
1445 // Check to see if our mask is the same. If not the resulting flag bits
1446 // may be different and we can't remove the ptest.
1447 auto *PredMask = MRI->getUniqueVRegDef(Reg: Pred->getOperand(i: 1).getReg());
1448 if (Mask != PredMask)
1449 return false;
1450 break;
1451 }
1452 case AArch64::BRKN_PPzP: {
1453 // BRKN uses an all active implicit mask to set flags unlike the other
1454 // flag-setting instructions.
1455 // PTEST(PTRUE_B(31), BRKN(PG, A, B)) -> BRKNS(PG, A, B).
1456 if ((MaskOpcode != AArch64::PTRUE_B) ||
1457 (Mask->getOperand(1).getImm() != 31))
1458 return false;
1459 break;
1460 }
1461 case AArch64::PTRUE_B:
1462 // PTEST(OP=PTRUE_B(A), OP) -> PTRUES_B(A)
1463 break;
1464 default:
1465 // Bail out if we don't recognize the input
1466 return false;
1467 }
1468
1469 NewOp = convertToFlagSettingOpc(Opc: PredOpcode);
1470 OpChanged = true;
1471 }
1472
1473 const TargetRegisterInfo *TRI = &getRegisterInfo();
1474
1475 // If another instruction between Pred and PTest accesses flags, don't remove
1476 // the ptest or update the earlier instruction to modify them.
1477 if (areCFlagsAccessedBetweenInstrs(From: Pred, To: PTest, TRI))
1478 return false;
1479
1480 // If we pass all the checks, it's safe to remove the PTEST and use the flags
1481 // as they are prior to PTEST. Sometimes this requires the tested PTEST
1482 // operand to be replaced with an equivalent instruction that also sets the
1483 // flags.
1484 Pred->setDesc(get(NewOp));
1485 PTest->eraseFromParent();
1486 if (OpChanged) {
1487 bool succeeded = UpdateOperandRegClass(Instr&: *Pred);
1488 (void)succeeded;
1489 assert(succeeded && "Operands have incompatible register classes!");
1490 Pred->addRegisterDefined(AArch64::NZCV, TRI);
1491 }
1492
1493 // Ensure that the flags def is live.
1494 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) {
1495 unsigned i = 0, e = Pred->getNumOperands();
1496 for (; i != e; ++i) {
1497 MachineOperand &MO = Pred->getOperand(i);
1498 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) {
1499 MO.setIsDead(false);
1500 break;
1501 }
1502 }
1503 }
1504 return true;
1505}
1506
/// Try to optimize a compare instruction. A compare instruction is an
/// instruction which produces AArch64::NZCV. It can be treated as a true
/// compare instruction when there are no uses of its destination register.
1511///
1512/// The following steps are tried in order:
1513/// 1. Convert CmpInstr into an unconditional version.
/// 2. Remove CmpInstr if there is an instruction above it that produces a
///    needed condition code, or one that can be converted into such an
///    instruction.
1517/// Only comparison with zero is supported.
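///
/// A sketch of step 2:
///   and  w8, w9, #0xff
///   cmp  w8, #0          ; subs wzr, w8, #0
///   b.ne ...
/// Here the compare can be removed by rewriting the AND as ANDS, which already
/// sets the N and Z flags exactly as the compare against zero would.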
1518bool AArch64InstrInfo::optimizeCompareInstr(
1519 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask,
1520 int64_t CmpValue, const MachineRegisterInfo *MRI) const {
1521 assert(CmpInstr.getParent());
1522 assert(MRI);
1523
1524 // Replace SUBSWrr with SUBWrr if NZCV is not used.
1525 int DeadNZCVIdx =
1526 CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
1527 if (DeadNZCVIdx != -1) {
1528 if (CmpInstr.definesRegister(AArch64::WZR, /*TRI=*/nullptr) ||
1529 CmpInstr.definesRegister(AArch64::XZR, /*TRI=*/nullptr)) {
1530 CmpInstr.eraseFromParent();
1531 return true;
1532 }
1533 unsigned Opc = CmpInstr.getOpcode();
1534 unsigned NewOpc = convertToNonFlagSettingOpc(MI: CmpInstr);
1535 if (NewOpc == Opc)
1536 return false;
1537 const MCInstrDesc &MCID = get(NewOpc);
1538 CmpInstr.setDesc(MCID);
1539 CmpInstr.removeOperand(OpNo: DeadNZCVIdx);
1540 bool succeeded = UpdateOperandRegClass(Instr&: CmpInstr);
1541 (void)succeeded;
1542 assert(succeeded && "Some operands reg class are incompatible!");
1543 return true;
1544 }
1545
1546 if (CmpInstr.getOpcode() == AArch64::PTEST_PP ||
1547 CmpInstr.getOpcode() == AArch64::PTEST_PP_ANY)
1548 return optimizePTestInstr(PTest: &CmpInstr, MaskReg: SrcReg, PredReg: SrcReg2, MRI);
1549
1550 if (SrcReg2 != 0)
1551 return false;
1552
1553 // CmpInstr is a Compare instruction if destination register is not used.
1554 if (!MRI->use_nodbg_empty(RegNo: CmpInstr.getOperand(i: 0).getReg()))
1555 return false;
1556
1557 if (CmpValue == 0 && substituteCmpToZero(CmpInstr, SrcReg, MRI: *MRI))
1558 return true;
1559 return (CmpValue == 0 || CmpValue == 1) &&
1560 removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, MRI: *MRI);
1561}
1562
1563/// Get opcode of S version of Instr.
1564/// If Instr is S version its opcode is returned.
1565/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version
1566/// or we are not interested in it.
1567static unsigned sForm(MachineInstr &Instr) {
1568 switch (Instr.getOpcode()) {
1569 default:
1570 return AArch64::INSTRUCTION_LIST_END;
1571
1572 case AArch64::ADDSWrr:
1573 case AArch64::ADDSWri:
1574 case AArch64::ADDSXrr:
1575 case AArch64::ADDSXri:
1576 case AArch64::SUBSWrr:
1577 case AArch64::SUBSWri:
1578 case AArch64::SUBSXrr:
1579 case AArch64::SUBSXri:
1580 return Instr.getOpcode();
1581
1582 case AArch64::ADDWrr:
1583 return AArch64::ADDSWrr;
1584 case AArch64::ADDWri:
1585 return AArch64::ADDSWri;
1586 case AArch64::ADDXrr:
1587 return AArch64::ADDSXrr;
1588 case AArch64::ADDXri:
1589 return AArch64::ADDSXri;
1590 case AArch64::ADCWr:
1591 return AArch64::ADCSWr;
1592 case AArch64::ADCXr:
1593 return AArch64::ADCSXr;
1594 case AArch64::SUBWrr:
1595 return AArch64::SUBSWrr;
1596 case AArch64::SUBWri:
1597 return AArch64::SUBSWri;
1598 case AArch64::SUBXrr:
1599 return AArch64::SUBSXrr;
1600 case AArch64::SUBXri:
1601 return AArch64::SUBSXri;
1602 case AArch64::SBCWr:
1603 return AArch64::SBCSWr;
1604 case AArch64::SBCXr:
1605 return AArch64::SBCSXr;
1606 case AArch64::ANDWri:
1607 return AArch64::ANDSWri;
1608 case AArch64::ANDXri:
1609 return AArch64::ANDSXri;
1610 }
1611}
1612
1613/// Check if AArch64::NZCV should be alive in successors of MBB.
1614static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) {
1615 for (auto *BB : MBB->successors())
1616 if (BB->isLiveIn(AArch64::NZCV))
1617 return true;
1618 return false;
1619}
1620
1621/// \returns The condition code operand index for \p Instr if it is a branch
1622/// or select and -1 otherwise.
1623static int
1624findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) {
1625 switch (Instr.getOpcode()) {
1626 default:
1627 return -1;
1628
1629 case AArch64::Bcc: {
1630 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1631 assert(Idx >= 2);
1632 return Idx - 2;
1633 }
1634
1635 case AArch64::CSINVWr:
1636 case AArch64::CSINVXr:
1637 case AArch64::CSINCWr:
1638 case AArch64::CSINCXr:
1639 case AArch64::CSELWr:
1640 case AArch64::CSELXr:
1641 case AArch64::CSNEGWr:
1642 case AArch64::CSNEGXr:
1643 case AArch64::FCSELSrrr:
1644 case AArch64::FCSELDrrr: {
1645 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
1646 assert(Idx >= 1);
1647 return Idx - 1;
1648 }
1649 }
1650}
1651
1652/// Find a condition code used by the instruction.
1653/// Returns AArch64CC::Invalid if either the instruction does not use condition
1654/// codes or we don't optimize CmpInstr in the presence of such instructions.
1655static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) {
1656 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr);
1657 return CCIdx >= 0 ? static_cast<AArch64CC::CondCode>(
1658 Instr.getOperand(i: CCIdx).getImm())
1659 : AArch64CC::Invalid;
1660}
1661
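/// Map a condition code to the set of NZCV flags it reads. For example,
/// AArch64CC::HI (unsigned higher) reads the C and Z flags, while
/// AArch64CC::LT (signed less than) reads the N and V flags.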
1662static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
1663 assert(CC != AArch64CC::Invalid);
1664 UsedNZCV UsedFlags;
1665 switch (CC) {
1666 default:
1667 break;
1668
1669 case AArch64CC::EQ: // Z set
1670 case AArch64CC::NE: // Z clear
1671 UsedFlags.Z = true;
1672 break;
1673
1674 case AArch64CC::HI: // Z clear and C set
1675 case AArch64CC::LS: // Z set or C clear
1676 UsedFlags.Z = true;
1677 [[fallthrough]];
1678 case AArch64CC::HS: // C set
1679 case AArch64CC::LO: // C clear
1680 UsedFlags.C = true;
1681 break;
1682
1683 case AArch64CC::MI: // N set
1684 case AArch64CC::PL: // N clear
1685 UsedFlags.N = true;
1686 break;
1687
1688 case AArch64CC::VS: // V set
1689 case AArch64CC::VC: // V clear
1690 UsedFlags.V = true;
1691 break;
1692
1693 case AArch64CC::GT: // Z clear, N and V the same
1694 case AArch64CC::LE: // Z set, N and V differ
1695 UsedFlags.Z = true;
1696 [[fallthrough]];
1697 case AArch64CC::GE: // N and V the same
1698 case AArch64CC::LT: // N and V differ
1699 UsedFlags.N = true;
1700 UsedFlags.V = true;
1701 break;
1702 }
1703 return UsedFlags;
1704}
1705
/// \returns The condition flags used after \p CmpInstr in its MachineBB if
/// the NZCV flags are not alive in the successors of the block containing
/// both \p CmpInstr and \p MI.
/// \returns std::nullopt otherwise.
///
/// Also collects the instructions using those flags in \p CCUseInstrs if
/// provided.
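///
/// For example (illustrative sequence, not taken from a real test):
/// \code
///   subs wzr, w8, #0        ; CmpInstr
///   csel w0, w1, w2, eq     ; reads Z
///   adds w3, w3, #1         ; redefines NZCV, the scan stops here
/// \endcode
/// would return a UsedNZCV value with only the Z flag set.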
1711std::optional<UsedNZCV>
1712llvm::examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr,
1713 const TargetRegisterInfo &TRI,
1714 SmallVectorImpl<MachineInstr *> *CCUseInstrs) {
1715 MachineBasicBlock *CmpParent = CmpInstr.getParent();
1716 if (MI.getParent() != CmpParent)
1717 return std::nullopt;
1718
1719 if (areCFlagsAliveInSuccessors(MBB: CmpParent))
1720 return std::nullopt;
1721
1722 UsedNZCV NZCVUsedAfterCmp;
1723 for (MachineInstr &Instr : instructionsWithoutDebug(
1724 It: std::next(x: CmpInstr.getIterator()), End: CmpParent->instr_end())) {
1725 if (Instr.readsRegister(AArch64::NZCV, &TRI)) {
1726 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
1727 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
1728 return std::nullopt;
1729 NZCVUsedAfterCmp |= getUsedNZCV(CC);
1730 if (CCUseInstrs)
1731 CCUseInstrs->push_back(Elt: &Instr);
1732 }
1733 if (Instr.modifiesRegister(AArch64::NZCV, &TRI))
1734 break;
1735 }
1736 return NZCVUsedAfterCmp;
1737}
1738
1739static bool isADDSRegImm(unsigned Opcode) {
1740 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri;
1741}
1742
1743static bool isSUBSRegImm(unsigned Opcode) {
1744 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri;
1745}
1746
1747/// Check if CmpInstr can be substituted by MI.
1748///
1749/// CmpInstr can be substituted:
1750/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1751/// - and, MI and CmpInstr are from the same MachineBB
1752/// - and, condition flags are not alive in successors of the CmpInstr parent
1753/// - and, if MI opcode is the S form there must be no defs of flags between
1754/// MI and CmpInstr
1755/// or if MI opcode is not the S form there must be neither defs of flags
1756/// nor uses of flags between MI and CmpInstr.
/// - and, the C flag is not used after CmpInstr
/// - and, the V flag is not used after CmpInstr, or MI has the no-signed-wrap
///        flag and therefore produces a poison value when signed overflow
///        occurs, so the difference in V cannot be observed.
1760static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr,
1761 const TargetRegisterInfo &TRI) {
  // NOTE: this assertion guarantees that MI.getOpcode() is an add or a
  // subtraction, in either its flag-setting or non-flag-setting form.
1764 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END);
1765
1766 const unsigned CmpOpcode = CmpInstr.getOpcode();
1767 if (!isADDSRegImm(Opcode: CmpOpcode) && !isSUBSRegImm(Opcode: CmpOpcode))
1768 return false;
1769
1770 assert((CmpInstr.getOperand(2).isImm() &&
1771 CmpInstr.getOperand(2).getImm() == 0) &&
1772 "Caller guarantees that CmpInstr compares with constant 0");
1773
1774 std::optional<UsedNZCV> NZVCUsed = examineCFlagsUse(MI, CmpInstr, TRI);
1775 if (!NZVCUsed || NZVCUsed->C)
1776 return false;
1777
1778 // CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0', and MI is either
1779 // '%vreg = add ...' or '%vreg = sub ...'.
1780 // Condition flag V is used to indicate signed overflow.
1781 // 1) MI and CmpInstr set N and V to the same value.
1782 // 2) If MI is add/sub with no-signed-wrap, it produces a poison value when
1783 // signed overflow occurs, so CmpInstr could still be simplified away.
1784 if (NZVCUsed->V && !MI.getFlag(Flag: MachineInstr::NoSWrap))
1785 return false;
1786
1787 AccessKind AccessToCheck = AK_Write;
1788 if (sForm(Instr&: MI) != MI.getOpcode())
1789 AccessToCheck = AK_All;
1790 return !areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck);
1791}
1792
1793/// Substitute an instruction comparing to zero with another instruction
1794/// which produces needed condition flags.
1795///
1796/// Return true on success.
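///
/// For example (illustrative):
/// \code
///   sub w8, w9, w10
///   cmp w8, #0
///   b.eq .Lexit
/// \endcode
/// becomes
/// \code
///   subs w8, w9, w10
///   b.eq .Lexit
/// \endcode
/// since the SUBS sets the Z flag exactly as the removed compare would.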
1797bool AArch64InstrInfo::substituteCmpToZero(
1798 MachineInstr &CmpInstr, unsigned SrcReg,
1799 const MachineRegisterInfo &MRI) const {
1800 // Get the unique definition of SrcReg.
1801 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
1802 if (!MI)
1803 return false;
1804
1805 const TargetRegisterInfo &TRI = getRegisterInfo();
1806
1807 unsigned NewOpc = sForm(Instr&: *MI);
1808 if (NewOpc == AArch64::INSTRUCTION_LIST_END)
1809 return false;
1810
1811 if (!canInstrSubstituteCmpInstr(MI&: *MI, CmpInstr, TRI))
1812 return false;
1813
1814 // Update the instruction to set NZCV.
1815 MI->setDesc(get(NewOpc));
1816 CmpInstr.eraseFromParent();
1817 bool succeeded = UpdateOperandRegClass(Instr&: *MI);
1818 (void)succeeded;
1819 assert(succeeded && "Some operands reg class are incompatible!");
1820 MI->addRegisterDefined(AArch64::NZCV, &TRI);
1821 return true;
1822}
1823
1824/// \returns True if \p CmpInstr can be removed.
1825///
1826/// \p IsInvertCC is true if, after removing \p CmpInstr, condition
1827/// codes used in \p CCUseInstrs must be inverted.
1828static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr,
1829 int CmpValue, const TargetRegisterInfo &TRI,
1830 SmallVectorImpl<MachineInstr *> &CCUseInstrs,
1831 bool &IsInvertCC) {
1832 assert((CmpValue == 0 || CmpValue == 1) &&
1833 "Only comparisons to 0 or 1 considered for removal!");
1834
1835 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>'
1836 unsigned MIOpc = MI.getOpcode();
1837 if (MIOpc == AArch64::CSINCWr) {
1838 if (MI.getOperand(1).getReg() != AArch64::WZR ||
1839 MI.getOperand(2).getReg() != AArch64::WZR)
1840 return false;
1841 } else if (MIOpc == AArch64::CSINCXr) {
1842 if (MI.getOperand(1).getReg() != AArch64::XZR ||
1843 MI.getOperand(2).getReg() != AArch64::XZR)
1844 return false;
1845 } else {
1846 return false;
1847 }
1848 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(Instr: MI);
1849 if (MICC == AArch64CC::Invalid)
1850 return false;
1851
1852 // NZCV needs to be defined
1853 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) != -1)
1854 return false;
1855
1856 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1'
1857 const unsigned CmpOpcode = CmpInstr.getOpcode();
1858 bool IsSubsRegImm = isSUBSRegImm(Opcode: CmpOpcode);
1859 if (CmpValue && !IsSubsRegImm)
1860 return false;
1861 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(Opcode: CmpOpcode))
1862 return false;
1863
1864 // MI conditions allowed: eq, ne, mi, pl
1865 UsedNZCV MIUsedNZCV = getUsedNZCV(CC: MICC);
1866 if (MIUsedNZCV.C || MIUsedNZCV.V)
1867 return false;
1868
1869 std::optional<UsedNZCV> NZCVUsedAfterCmp =
1870 examineCFlagsUse(MI, CmpInstr, TRI, CCUseInstrs: &CCUseInstrs);
  // Condition flags must not be used in the successors of CmpInstr's basic
  // block, and only the Z or N flags may be used after CmpInstr within its
  // basic block.
1873 if (!NZCVUsedAfterCmp || NZCVUsedAfterCmp->C || NZCVUsedAfterCmp->V)
1874 return false;
1875 // Z or N flag used after CmpInstr must correspond to the flag used in MI
1876 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) ||
1877 (MIUsedNZCV.N && NZCVUsedAfterCmp->Z))
1878 return false;
  // If CmpInstr is a comparison to zero, the MI condition is limited to eq/ne.
1880 if (MIUsedNZCV.N && !CmpValue)
1881 return false;
1882
1883 // There must be no defs of flags between MI and CmpInstr
1884 if (areCFlagsAccessedBetweenInstrs(From: &MI, To: &CmpInstr, TRI: &TRI, AccessToCheck: AK_Write))
1885 return false;
1886
1887 // Condition code is inverted in the following cases:
1888 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0'
1889 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1'
1890 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) ||
1891 (!CmpValue && MICC == AArch64CC::NE);
1892 return true;
1893}
1894
1895/// Remove comparison in csinc-cmp sequence
1896///
1897/// Examples:
1898/// 1. \code
1899/// csinc w9, wzr, wzr, ne
1900/// cmp w9, #0
1901/// b.eq
1902/// \endcode
1903/// to
1904/// \code
1905/// csinc w9, wzr, wzr, ne
1906/// b.ne
1907/// \endcode
1908///
1909/// 2. \code
1910/// csinc x2, xzr, xzr, mi
1911/// cmp x2, #1
1912/// b.pl
1913/// \endcode
1914/// to
1915/// \code
1916/// csinc x2, xzr, xzr, mi
1917/// b.pl
1918/// \endcode
1919///
1920/// \param CmpInstr comparison instruction
1921/// \return True when comparison removed
1922bool AArch64InstrInfo::removeCmpToZeroOrOne(
1923 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue,
1924 const MachineRegisterInfo &MRI) const {
1925 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: SrcReg);
1926 if (!MI)
1927 return false;
1928 const TargetRegisterInfo &TRI = getRegisterInfo();
1929 SmallVector<MachineInstr *, 4> CCUseInstrs;
1930 bool IsInvertCC = false;
1931 if (!canCmpInstrBeRemoved(MI&: *MI, CmpInstr, CmpValue, TRI, CCUseInstrs,
1932 IsInvertCC))
1933 return false;
1934 // Make transformation
1935 CmpInstr.eraseFromParent();
1936 if (IsInvertCC) {
1937 // Invert condition codes in CmpInstr CC users
1938 for (MachineInstr *CCUseInstr : CCUseInstrs) {
1939 int Idx = findCondCodeUseOperandIdxForBranchOrSelect(Instr: *CCUseInstr);
1940 assert(Idx >= 0 && "Unexpected instruction using CC.");
1941 MachineOperand &CCOperand = CCUseInstr->getOperand(i: Idx);
1942 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode(
1943 Code: static_cast<AArch64CC::CondCode>(CCOperand.getImm()));
1944 CCOperand.setImm(CCUse);
1945 }
1946 }
1947 return true;
1948}
1949
1950bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1951 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
1952 MI.getOpcode() != AArch64::CATCHRET)
1953 return false;
1954
1955 MachineBasicBlock &MBB = *MI.getParent();
1956 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
1957 auto TRI = Subtarget.getRegisterInfo();
1958 DebugLoc DL = MI.getDebugLoc();
1959
1960 if (MI.getOpcode() == AArch64::CATCHRET) {
1961 // Skip to the first instruction before the epilog.
1962 const TargetInstrInfo *TII =
1963 MBB.getParent()->getSubtarget().getInstrInfo();
1964 MachineBasicBlock *TargetMBB = MI.getOperand(i: 0).getMBB();
1965 auto MBBI = MachineBasicBlock::iterator(MI);
1966 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(x: MBBI);
1967 while (FirstEpilogSEH->getFlag(Flag: MachineInstr::FrameDestroy) &&
1968 FirstEpilogSEH != MBB.begin())
1969 FirstEpilogSEH = std::prev(x: FirstEpilogSEH);
1970 if (FirstEpilogSEH != MBB.begin())
1971 FirstEpilogSEH = std::next(x: FirstEpilogSEH);
1972 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP))
1973 .addReg(AArch64::X0, RegState::Define)
1974 .addMBB(TargetMBB);
1975 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri))
1976 .addReg(AArch64::X0, RegState::Define)
1977 .addReg(AArch64::X0)
1978 .addMBB(TargetMBB)
1979 .addImm(0);
1980 return true;
1981 }
1982
1983 Register Reg = MI.getOperand(i: 0).getReg();
1984 Module &M = *MBB.getParent()->getFunction().getParent();
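  // When the guard lives in a system register, the pseudo expands to a
  // sysreg read followed by a load, e.g. (illustrative, assuming guard
  // register SP_EL0 and an offset of 8):
  //   mrs x0, SP_EL0
  //   ldr x0, [x0, #8]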
1985 if (M.getStackProtectorGuard() == "sysreg") {
1986 const AArch64SysReg::SysReg *SrcReg =
1987 AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg());
1988 if (!SrcReg)
1989 report_fatal_error(reason: "Unknown SysReg for Stack Protector Guard Register");
1990
1991 // mrs xN, sysreg
1992 BuildMI(MBB, MI, DL, get(AArch64::MRS))
1993 .addDef(Reg, RegState::Renamable)
1994 .addImm(SrcReg->Encoding);
1995 int Offset = M.getStackProtectorGuardOffset();
1996 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) {
1997 // ldr xN, [xN, #offset]
1998 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
1999 .addDef(Reg)
2000 .addUse(Reg, RegState::Kill)
2001 .addImm(Offset / 8);
2002 } else if (Offset >= -256 && Offset <= 255) {
2003 // ldur xN, [xN, #offset]
2004 BuildMI(MBB, MI, DL, get(AArch64::LDURXi))
2005 .addDef(Reg)
2006 .addUse(Reg, RegState::Kill)
2007 .addImm(Offset);
2008 } else if (Offset >= -4095 && Offset <= 4095) {
2009 if (Offset > 0) {
2010 // add xN, xN, #offset
2011 BuildMI(MBB, MI, DL, get(AArch64::ADDXri))
2012 .addDef(Reg)
2013 .addUse(Reg, RegState::Kill)
2014 .addImm(Offset)
2015 .addImm(0);
2016 } else {
2017 // sub xN, xN, #offset
2018 BuildMI(MBB, MI, DL, get(AArch64::SUBXri))
2019 .addDef(Reg)
2020 .addUse(Reg, RegState::Kill)
2021 .addImm(-Offset)
2022 .addImm(0);
2023 }
2024 // ldr xN, [xN]
2025 BuildMI(MBB, MI, DL, get(AArch64::LDRXui))
2026 .addDef(Reg)
2027 .addUse(Reg, RegState::Kill)
2028 .addImm(0);
2029 } else {
      // Cases that are larger than +/- 4095 and either not a multiple of 8 or
      // larger than 32760.
2032 // It might be nice to use AArch64::MOVi32imm here, which would get
2033 // expanded in PreSched2 after PostRA, but our lone scratch Reg already
2034 // contains the MRS result. findScratchNonCalleeSaveRegister() in
2035 // AArch64FrameLowering might help us find such a scratch register
2036 // though. If we failed to find a scratch register, we could emit a
2037 // stream of add instructions to build up the immediate. Or, we could try
2038 // to insert a AArch64::MOVi32imm before register allocation so that we
2039 // didn't need to scavenge for a scratch register.
2040 report_fatal_error(reason: "Unable to encode Stack Protector Guard Offset");
2041 }
2042 MBB.erase(I: MI);
2043 return true;
2044 }
2045
2046 const GlobalValue *GV =
2047 cast<GlobalValue>(Val: (*MI.memoperands_begin())->getValue());
2048 const TargetMachine &TM = MBB.getParent()->getTarget();
2049 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
2050 const unsigned char MO_NC = AArch64II::MO_NC;
2051
2052 if ((OpFlags & AArch64II::MO_GOT) != 0) {
2053 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
2054 .addGlobalAddress(GV, 0, OpFlags);
2055 if (Subtarget.isTargetILP32()) {
2056 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2057 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2058 .addDef(Reg32, RegState::Dead)
2059 .addUse(Reg, RegState::Kill)
2060 .addImm(0)
2061 .addMemOperand(*MI.memoperands_begin())
2062 .addDef(Reg, RegState::Implicit);
2063 } else {
2064 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2065 .addReg(Reg, RegState::Kill)
2066 .addImm(0)
2067 .addMemOperand(*MI.memoperands_begin());
2068 }
2069 } else if (TM.getCodeModel() == CodeModel::Large) {
2070 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?");
2071 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
2072 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC)
2073 .addImm(0);
2074 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2075 .addReg(Reg, RegState::Kill)
2076 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC)
2077 .addImm(16);
2078 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2079 .addReg(Reg, RegState::Kill)
2080 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC)
2081 .addImm(32);
2082 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
2083 .addReg(Reg, RegState::Kill)
2084 .addGlobalAddress(GV, 0, AArch64II::MO_G3)
2085 .addImm(48);
2086 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2087 .addReg(Reg, RegState::Kill)
2088 .addImm(0)
2089 .addMemOperand(*MI.memoperands_begin());
2090 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2091 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg)
2092 .addGlobalAddress(GV, 0, OpFlags);
2093 } else {
2094 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
2095 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
2096 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
2097 if (Subtarget.isTargetILP32()) {
2098 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32);
2099 BuildMI(MBB, MI, DL, get(AArch64::LDRWui))
2100 .addDef(Reg32, RegState::Dead)
2101 .addUse(Reg, RegState::Kill)
2102 .addGlobalAddress(GV, 0, LoFlags)
2103 .addMemOperand(*MI.memoperands_begin())
2104 .addDef(Reg, RegState::Implicit);
2105 } else {
2106 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
2107 .addReg(Reg, RegState::Kill)
2108 .addGlobalAddress(GV, 0, LoFlags)
2109 .addMemOperand(*MI.memoperands_begin());
2110 }
2111 }
2112
2113 MBB.erase(I: MI);
2114
2115 return true;
2116}
2117
2118// Return true if this instruction simply sets its single destination register
2119// to zero. This is equivalent to a register rename of the zero-register.
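// For example 'movz w0, #0', 'and w0, wzr, #imm' and a COPY from WZR all
// qualify.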
2120bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) {
2121 switch (MI.getOpcode()) {
2122 default:
2123 break;
2124 case AArch64::MOVZWi:
2125 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
2126 if (MI.getOperand(i: 1).isImm() && MI.getOperand(i: 1).getImm() == 0) {
2127 assert(MI.getDesc().getNumOperands() == 3 &&
2128 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands");
2129 return true;
2130 }
2131 break;
2132 case AArch64::ANDWri: // and Rd, Rzr, #imm
2133 return MI.getOperand(1).getReg() == AArch64::WZR;
2134 case AArch64::ANDXri:
2135 return MI.getOperand(1).getReg() == AArch64::XZR;
2136 case TargetOpcode::COPY:
2137 return MI.getOperand(1).getReg() == AArch64::WZR;
2138 }
2139 return false;
2140}
2141
2142// Return true if this instruction simply renames a general register without
2143// modifying bits.
2144bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) {
2145 switch (MI.getOpcode()) {
2146 default:
2147 break;
2148 case TargetOpcode::COPY: {
    // GPR32 copies will be lowered to ORRXrs
2150 Register DstReg = MI.getOperand(i: 0).getReg();
2151 return (AArch64::GPR32RegClass.contains(DstReg) ||
2152 AArch64::GPR64RegClass.contains(DstReg));
2153 }
2154 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
2155 if (MI.getOperand(1).getReg() == AArch64::XZR) {
2156 assert(MI.getDesc().getNumOperands() == 4 &&
2157 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands");
2158 return true;
2159 }
2160 break;
2161 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
2162 if (MI.getOperand(i: 2).getImm() == 0) {
2163 assert(MI.getDesc().getNumOperands() == 4 &&
2164 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands");
2165 return true;
2166 }
2167 break;
2168 }
2169 return false;
2170}
2171
2172// Return true if this instruction simply renames a general register without
2173// modifying bits.
2174bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) {
2175 switch (MI.getOpcode()) {
2176 default:
2177 break;
2178 case TargetOpcode::COPY: {
2179 Register DstReg = MI.getOperand(i: 0).getReg();
2180 return AArch64::FPR128RegClass.contains(DstReg);
2181 }
2182 case AArch64::ORRv16i8:
2183 if (MI.getOperand(i: 1).getReg() == MI.getOperand(i: 2).getReg()) {
2184 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() &&
2185 "invalid ORRv16i8 operands");
2186 return true;
2187 }
2188 break;
2189 }
2190 return false;
2191}
2192
2193Register AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
2194 int &FrameIndex) const {
2195 switch (MI.getOpcode()) {
2196 default:
2197 break;
2198 case AArch64::LDRWui:
2199 case AArch64::LDRXui:
2200 case AArch64::LDRBui:
2201 case AArch64::LDRHui:
2202 case AArch64::LDRSui:
2203 case AArch64::LDRDui:
2204 case AArch64::LDRQui:
2205 case AArch64::LDR_PXI:
2206 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2207 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2208 FrameIndex = MI.getOperand(i: 1).getIndex();
2209 return MI.getOperand(i: 0).getReg();
2210 }
2211 break;
2212 }
2213
2214 return 0;
2215}
2216
2217Register AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
2218 int &FrameIndex) const {
2219 switch (MI.getOpcode()) {
2220 default:
2221 break;
2222 case AArch64::STRWui:
2223 case AArch64::STRXui:
2224 case AArch64::STRBui:
2225 case AArch64::STRHui:
2226 case AArch64::STRSui:
2227 case AArch64::STRDui:
2228 case AArch64::STRQui:
2229 case AArch64::STR_PXI:
2230 if (MI.getOperand(i: 0).getSubReg() == 0 && MI.getOperand(i: 1).isFI() &&
2231 MI.getOperand(i: 2).isImm() && MI.getOperand(i: 2).getImm() == 0) {
2232 FrameIndex = MI.getOperand(i: 1).getIndex();
2233 return MI.getOperand(i: 0).getReg();
2234 }
2235 break;
2236 }
2237 return 0;
2238}
2239
2240/// Check all MachineMemOperands for a hint to suppress pairing.
2241bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) {
2242 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2243 return MMO->getFlags() & MOSuppressPair;
2244 });
2245}
2246
2247/// Set a flag on the first MachineMemOperand to suppress pairing.
2248void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) {
2249 if (MI.memoperands_empty())
2250 return;
2251 (*MI.memoperands_begin())->setFlags(MOSuppressPair);
2252}
2253
2254/// Check all MachineMemOperands for a hint that the load/store is strided.
2255bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) {
2256 return llvm::any_of(Range: MI.memoperands(), P: [](MachineMemOperand *MMO) {
2257 return MMO->getFlags() & MOStridedAccess;
2258 });
2259}
2260
2261bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) {
2262 switch (Opc) {
2263 default:
2264 return false;
2265 case AArch64::STURSi:
2266 case AArch64::STRSpre:
2267 case AArch64::STURDi:
2268 case AArch64::STRDpre:
2269 case AArch64::STURQi:
2270 case AArch64::STRQpre:
2271 case AArch64::STURBBi:
2272 case AArch64::STURHHi:
2273 case AArch64::STURWi:
2274 case AArch64::STRWpre:
2275 case AArch64::STURXi:
2276 case AArch64::STRXpre:
2277 case AArch64::LDURSi:
2278 case AArch64::LDRSpre:
2279 case AArch64::LDURDi:
2280 case AArch64::LDRDpre:
2281 case AArch64::LDURQi:
2282 case AArch64::LDRQpre:
2283 case AArch64::LDURWi:
2284 case AArch64::LDRWpre:
2285 case AArch64::LDURXi:
2286 case AArch64::LDRXpre:
2287 case AArch64::LDRSWpre:
2288 case AArch64::LDURSWi:
2289 case AArch64::LDURHHi:
2290 case AArch64::LDURBBi:
2291 case AArch64::LDURSBWi:
2292 case AArch64::LDURSHWi:
2293 return true;
2294 }
2295}
2296
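// Map a scaled immediate-offset load/store opcode to its unscaled
// (LDUR/STUR-style) counterpart, e.g. AArch64::LDRXui -> AArch64::LDURXi.
// Returns an empty optional if there is no unscaled counterpart.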
2297std::optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) {
2298 switch (Opc) {
2299 default: return {};
2300 case AArch64::PRFMui: return AArch64::PRFUMi;
2301 case AArch64::LDRXui: return AArch64::LDURXi;
2302 case AArch64::LDRWui: return AArch64::LDURWi;
2303 case AArch64::LDRBui: return AArch64::LDURBi;
2304 case AArch64::LDRHui: return AArch64::LDURHi;
2305 case AArch64::LDRSui: return AArch64::LDURSi;
2306 case AArch64::LDRDui: return AArch64::LDURDi;
2307 case AArch64::LDRQui: return AArch64::LDURQi;
2308 case AArch64::LDRBBui: return AArch64::LDURBBi;
2309 case AArch64::LDRHHui: return AArch64::LDURHHi;
2310 case AArch64::LDRSBXui: return AArch64::LDURSBXi;
2311 case AArch64::LDRSBWui: return AArch64::LDURSBWi;
2312 case AArch64::LDRSHXui: return AArch64::LDURSHXi;
2313 case AArch64::LDRSHWui: return AArch64::LDURSHWi;
2314 case AArch64::LDRSWui: return AArch64::LDURSWi;
2315 case AArch64::STRXui: return AArch64::STURXi;
2316 case AArch64::STRWui: return AArch64::STURWi;
2317 case AArch64::STRBui: return AArch64::STURBi;
2318 case AArch64::STRHui: return AArch64::STURHi;
2319 case AArch64::STRSui: return AArch64::STURSi;
2320 case AArch64::STRDui: return AArch64::STURDi;
2321 case AArch64::STRQui: return AArch64::STURQi;
2322 case AArch64::STRBBui: return AArch64::STURBBi;
2323 case AArch64::STRHHui: return AArch64::STURHHi;
2324 }
2325}
2326
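// Return the operand index of the immediate offset for a load/store opcode.
// Most forms keep it at operand 2 (e.g. 'LDRXui $Rt, $Rn, $imm'), while the
// paired and SVE structured forms listed below carry it at operand 3
// (e.g. 'LDPXi $Rt, $Rt2, $Rn, $imm').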
2327unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
2328 switch (Opc) {
2329 default:
2330 return 2;
2331 case AArch64::LDPXi:
2332 case AArch64::LDPDi:
2333 case AArch64::STPXi:
2334 case AArch64::STPDi:
2335 case AArch64::LDNPXi:
2336 case AArch64::LDNPDi:
2337 case AArch64::STNPXi:
2338 case AArch64::STNPDi:
2339 case AArch64::LDPQi:
2340 case AArch64::STPQi:
2341 case AArch64::LDNPQi:
2342 case AArch64::STNPQi:
2343 case AArch64::LDPWi:
2344 case AArch64::LDPSi:
2345 case AArch64::STPWi:
2346 case AArch64::STPSi:
2347 case AArch64::LDNPWi:
2348 case AArch64::LDNPSi:
2349 case AArch64::STNPWi:
2350 case AArch64::STNPSi:
2351 case AArch64::LDG:
2352 case AArch64::STGPi:
2353
2354 case AArch64::LD1B_IMM:
2355 case AArch64::LD1B_H_IMM:
2356 case AArch64::LD1B_S_IMM:
2357 case AArch64::LD1B_D_IMM:
2358 case AArch64::LD1SB_H_IMM:
2359 case AArch64::LD1SB_S_IMM:
2360 case AArch64::LD1SB_D_IMM:
2361 case AArch64::LD1H_IMM:
2362 case AArch64::LD1H_S_IMM:
2363 case AArch64::LD1H_D_IMM:
2364 case AArch64::LD1SH_S_IMM:
2365 case AArch64::LD1SH_D_IMM:
2366 case AArch64::LD1W_IMM:
2367 case AArch64::LD1W_D_IMM:
2368 case AArch64::LD1SW_D_IMM:
2369 case AArch64::LD1D_IMM:
2370
2371 case AArch64::LD2B_IMM:
2372 case AArch64::LD2H_IMM:
2373 case AArch64::LD2W_IMM:
2374 case AArch64::LD2D_IMM:
2375 case AArch64::LD3B_IMM:
2376 case AArch64::LD3H_IMM:
2377 case AArch64::LD3W_IMM:
2378 case AArch64::LD3D_IMM:
2379 case AArch64::LD4B_IMM:
2380 case AArch64::LD4H_IMM:
2381 case AArch64::LD4W_IMM:
2382 case AArch64::LD4D_IMM:
2383
2384 case AArch64::ST1B_IMM:
2385 case AArch64::ST1B_H_IMM:
2386 case AArch64::ST1B_S_IMM:
2387 case AArch64::ST1B_D_IMM:
2388 case AArch64::ST1H_IMM:
2389 case AArch64::ST1H_S_IMM:
2390 case AArch64::ST1H_D_IMM:
2391 case AArch64::ST1W_IMM:
2392 case AArch64::ST1W_D_IMM:
2393 case AArch64::ST1D_IMM:
2394
2395 case AArch64::ST2B_IMM:
2396 case AArch64::ST2H_IMM:
2397 case AArch64::ST2W_IMM:
2398 case AArch64::ST2D_IMM:
2399 case AArch64::ST3B_IMM:
2400 case AArch64::ST3H_IMM:
2401 case AArch64::ST3W_IMM:
2402 case AArch64::ST3D_IMM:
2403 case AArch64::ST4B_IMM:
2404 case AArch64::ST4H_IMM:
2405 case AArch64::ST4W_IMM:
2406 case AArch64::ST4D_IMM:
2407
2408 case AArch64::LD1RB_IMM:
2409 case AArch64::LD1RB_H_IMM:
2410 case AArch64::LD1RB_S_IMM:
2411 case AArch64::LD1RB_D_IMM:
2412 case AArch64::LD1RSB_H_IMM:
2413 case AArch64::LD1RSB_S_IMM:
2414 case AArch64::LD1RSB_D_IMM:
2415 case AArch64::LD1RH_IMM:
2416 case AArch64::LD1RH_S_IMM:
2417 case AArch64::LD1RH_D_IMM:
2418 case AArch64::LD1RSH_S_IMM:
2419 case AArch64::LD1RSH_D_IMM:
2420 case AArch64::LD1RW_IMM:
2421 case AArch64::LD1RW_D_IMM:
2422 case AArch64::LD1RSW_IMM:
2423 case AArch64::LD1RD_IMM:
2424
2425 case AArch64::LDNT1B_ZRI:
2426 case AArch64::LDNT1H_ZRI:
2427 case AArch64::LDNT1W_ZRI:
2428 case AArch64::LDNT1D_ZRI:
2429 case AArch64::STNT1B_ZRI:
2430 case AArch64::STNT1H_ZRI:
2431 case AArch64::STNT1W_ZRI:
2432 case AArch64::STNT1D_ZRI:
2433
2434 case AArch64::LDNF1B_IMM:
2435 case AArch64::LDNF1B_H_IMM:
2436 case AArch64::LDNF1B_S_IMM:
2437 case AArch64::LDNF1B_D_IMM:
2438 case AArch64::LDNF1SB_H_IMM:
2439 case AArch64::LDNF1SB_S_IMM:
2440 case AArch64::LDNF1SB_D_IMM:
2441 case AArch64::LDNF1H_IMM:
2442 case AArch64::LDNF1H_S_IMM:
2443 case AArch64::LDNF1H_D_IMM:
2444 case AArch64::LDNF1SH_S_IMM:
2445 case AArch64::LDNF1SH_D_IMM:
2446 case AArch64::LDNF1W_IMM:
2447 case AArch64::LDNF1W_D_IMM:
2448 case AArch64::LDNF1SW_D_IMM:
2449 case AArch64::LDNF1D_IMM:
2450 return 3;
2451 case AArch64::ADDG:
2452 case AArch64::STGi:
2453 case AArch64::LDR_PXI:
2454 case AArch64::STR_PXI:
2455 return 2;
2456 }
2457}
2458
2459bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
2460 switch (MI.getOpcode()) {
2461 default:
2462 return false;
2463 // Scaled instructions.
2464 case AArch64::STRSui:
2465 case AArch64::STRDui:
2466 case AArch64::STRQui:
2467 case AArch64::STRXui:
2468 case AArch64::STRWui:
2469 case AArch64::LDRSui:
2470 case AArch64::LDRDui:
2471 case AArch64::LDRQui:
2472 case AArch64::LDRXui:
2473 case AArch64::LDRWui:
2474 case AArch64::LDRSWui:
2475 // Unscaled instructions.
2476 case AArch64::STURSi:
2477 case AArch64::STRSpre:
2478 case AArch64::STURDi:
2479 case AArch64::STRDpre:
2480 case AArch64::STURQi:
2481 case AArch64::STRQpre:
2482 case AArch64::STURWi:
2483 case AArch64::STRWpre:
2484 case AArch64::STURXi:
2485 case AArch64::STRXpre:
2486 case AArch64::LDURSi:
2487 case AArch64::LDRSpre:
2488 case AArch64::LDURDi:
2489 case AArch64::LDRDpre:
2490 case AArch64::LDURQi:
2491 case AArch64::LDRQpre:
2492 case AArch64::LDURWi:
2493 case AArch64::LDRWpre:
2494 case AArch64::LDURXi:
2495 case AArch64::LDRXpre:
2496 case AArch64::LDURSWi:
2497 case AArch64::LDRSWpre:
2498 return true;
2499 }
2500}
2501
2502bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
2503 switch (MI.getOpcode()) {
2504 default:
2505 assert((!MI.isCall() || !MI.isReturn()) &&
2506 "Unexpected instruction - was a new tail call opcode introduced?");
2507 return false;
2508 case AArch64::TCRETURNdi:
2509 case AArch64::TCRETURNri:
2510 case AArch64::TCRETURNrix16x17:
2511 case AArch64::TCRETURNrix17:
2512 case AArch64::TCRETURNrinotx16:
2513 case AArch64::TCRETURNriALL:
2514 return true;
2515 }
2516}
2517
2518unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
2519 switch (Opc) {
2520 default:
2521 llvm_unreachable("Opcode has no flag setting equivalent!");
2522 // 32-bit cases:
2523 case AArch64::ADDWri:
2524 return AArch64::ADDSWri;
2525 case AArch64::ADDWrr:
2526 return AArch64::ADDSWrr;
2527 case AArch64::ADDWrs:
2528 return AArch64::ADDSWrs;
2529 case AArch64::ADDWrx:
2530 return AArch64::ADDSWrx;
2531 case AArch64::ANDWri:
2532 return AArch64::ANDSWri;
2533 case AArch64::ANDWrr:
2534 return AArch64::ANDSWrr;
2535 case AArch64::ANDWrs:
2536 return AArch64::ANDSWrs;
2537 case AArch64::BICWrr:
2538 return AArch64::BICSWrr;
2539 case AArch64::BICWrs:
2540 return AArch64::BICSWrs;
2541 case AArch64::SUBWri:
2542 return AArch64::SUBSWri;
2543 case AArch64::SUBWrr:
2544 return AArch64::SUBSWrr;
2545 case AArch64::SUBWrs:
2546 return AArch64::SUBSWrs;
2547 case AArch64::SUBWrx:
2548 return AArch64::SUBSWrx;
2549 // 64-bit cases:
2550 case AArch64::ADDXri:
2551 return AArch64::ADDSXri;
2552 case AArch64::ADDXrr:
2553 return AArch64::ADDSXrr;
2554 case AArch64::ADDXrs:
2555 return AArch64::ADDSXrs;
2556 case AArch64::ADDXrx:
2557 return AArch64::ADDSXrx;
2558 case AArch64::ANDXri:
2559 return AArch64::ANDSXri;
2560 case AArch64::ANDXrr:
2561 return AArch64::ANDSXrr;
2562 case AArch64::ANDXrs:
2563 return AArch64::ANDSXrs;
2564 case AArch64::BICXrr:
2565 return AArch64::BICSXrr;
2566 case AArch64::BICXrs:
2567 return AArch64::BICSXrs;
2568 case AArch64::SUBXri:
2569 return AArch64::SUBSXri;
2570 case AArch64::SUBXrr:
2571 return AArch64::SUBSXrr;
2572 case AArch64::SUBXrs:
2573 return AArch64::SUBSXrs;
2574 case AArch64::SUBXrx:
2575 return AArch64::SUBSXrx;
2576 // SVE instructions:
2577 case AArch64::AND_PPzPP:
2578 return AArch64::ANDS_PPzPP;
2579 case AArch64::BIC_PPzPP:
2580 return AArch64::BICS_PPzPP;
2581 case AArch64::EOR_PPzPP:
2582 return AArch64::EORS_PPzPP;
2583 case AArch64::NAND_PPzPP:
2584 return AArch64::NANDS_PPzPP;
2585 case AArch64::NOR_PPzPP:
2586 return AArch64::NORS_PPzPP;
2587 case AArch64::ORN_PPzPP:
2588 return AArch64::ORNS_PPzPP;
2589 case AArch64::ORR_PPzPP:
2590 return AArch64::ORRS_PPzPP;
2591 case AArch64::BRKA_PPzP:
2592 return AArch64::BRKAS_PPzP;
2593 case AArch64::BRKPA_PPzPP:
2594 return AArch64::BRKPAS_PPzPP;
2595 case AArch64::BRKB_PPzP:
2596 return AArch64::BRKBS_PPzP;
2597 case AArch64::BRKPB_PPzPP:
2598 return AArch64::BRKPBS_PPzPP;
2599 case AArch64::BRKN_PPzP:
2600 return AArch64::BRKNS_PPzP;
2601 case AArch64::RDFFR_PPz:
2602 return AArch64::RDFFRS_PPz;
2603 case AArch64::PTRUE_B:
2604 return AArch64::PTRUES_B;
2605 }
2606}
2607
2608// Is this a candidate for ld/st merging or pairing? For example, we don't
2609// touch volatiles or load/stores that have a hint to avoid pair formation.
2610bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
2611
2612 bool IsPreLdSt = isPreLdSt(MI);
2613
2614 // If this is a volatile load/store, don't mess with it.
2615 if (MI.hasOrderedMemoryRef())
2616 return false;
2617
2618 // Make sure this is a reg/fi+imm (as opposed to an address reloc).
2619 // For Pre-inc LD/ST, the operand is shifted by one.
2620 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() ||
2621 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) &&
2622 "Expected a reg or frame index operand.");
2623
2624 // For Pre-indexed addressing quadword instructions, the third operand is the
2625 // immediate value.
2626 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(i: 3).isImm();
2627
2628 if (!MI.getOperand(i: 2).isImm() && !IsImmPreLdSt)
2629 return false;
2630
2631 // Can't merge/pair if the instruction modifies the base register.
2632 // e.g., ldr x0, [x0]
2633 // This case will never occur with an FI base.
2634 // However, if the instruction is an LDR<S,D,Q,W,X,SW>pre or
2635 // STR<S,D,Q,W,X>pre, it can be merged.
2636 // For example:
2637 // ldr q0, [x11, #32]!
2638 // ldr q1, [x11, #16]
2639 // to
2640 // ldp q0, q1, [x11, #32]!
2641 if (MI.getOperand(i: 1).isReg() && !IsPreLdSt) {
2642 Register BaseReg = MI.getOperand(i: 1).getReg();
2643 const TargetRegisterInfo *TRI = &getRegisterInfo();
2644 if (MI.modifiesRegister(Reg: BaseReg, TRI))
2645 return false;
2646 }
2647
2648 // Check if this load/store has a hint to avoid pair formation.
2649 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
2650 if (isLdStPairSuppressed(MI))
2651 return false;
2652
  // Do not pair any callee-save store/reload instructions in the
  // prologue/epilogue if the CFI information encoded the operations as separate
  // instructions, as that would cause the size of the actual prologue to differ
  // from the prologue size recorded in the Windows CFI.
2657 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo();
2658 bool NeedsWinCFI = MAI->usesWindowsCFI() &&
2659 MI.getMF()->getFunction().needsUnwindTableEntry();
2660 if (NeedsWinCFI && (MI.getFlag(Flag: MachineInstr::FrameSetup) ||
2661 MI.getFlag(Flag: MachineInstr::FrameDestroy)))
2662 return false;
2663
2664 // On some CPUs quad load/store pairs are slower than two single load/stores.
2665 if (Subtarget.isPaired128Slow()) {
2666 switch (MI.getOpcode()) {
2667 default:
2668 break;
2669 case AArch64::LDURQi:
2670 case AArch64::STURQi:
2671 case AArch64::LDRQui:
2672 case AArch64::STRQui:
2673 return false;
2674 }
2675 }
2676
2677 return true;
2678}
2679
2680bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
2681 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
2682 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
2683 const TargetRegisterInfo *TRI) const {
2684 if (!LdSt.mayLoadOrStore())
2685 return false;
2686
2687 const MachineOperand *BaseOp;
2688 TypeSize WidthN(0, false);
2689 if (!getMemOperandWithOffsetWidth(MI: LdSt, BaseOp, Offset, OffsetIsScalable,
2690 Width&: WidthN, TRI))
2691 return false;
  // The maximum vscale is 16 under AArch64; return the maximal extent for the
  // vector.
2694 Width = LocationSize::precise(Value: WidthN);
2695 BaseOps.push_back(Elt: BaseOp);
2696 return true;
2697}
2698
2699std::optional<ExtAddrMode>
2700AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
2701 const TargetRegisterInfo *TRI) const {
2702 const MachineOperand *Base; // Filled with the base operand of MI.
2703 int64_t Offset; // Filled with the offset of MI.
2704 bool OffsetIsScalable;
2705 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI))
2706 return std::nullopt;
2707
2708 if (!Base->isReg())
2709 return std::nullopt;
2710 ExtAddrMode AM;
2711 AM.BaseReg = Base->getReg();
2712 AM.Displacement = Offset;
2713 AM.ScaledReg = 0;
2714 AM.Scale = 0;
2715 return AM;
2716}
2717
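// Check whether AddrI, an address-computing instruction whose result Reg is
// used by the memory access MemI, can be folded into MemI's addressing mode,
// and if so describe the combined addressing mode in AM. For example
// (illustrative):
//   add x1, x0, #32
//   ldr x2, [x1, #8]
// folds to the addressing mode [x0, #40], allowing the load to be re-emitted
// as 'ldr x2, [x0, #40]'.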
2718bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI,
2719 Register Reg,
2720 const MachineInstr &AddrI,
2721 ExtAddrMode &AM) const {
2722 // Filter out instructions into which we cannot fold.
2723 unsigned NumBytes;
2724 int64_t OffsetScale = 1;
2725 switch (MemI.getOpcode()) {
2726 default:
2727 return false;
2728
2729 case AArch64::LDURQi:
2730 case AArch64::STURQi:
2731 NumBytes = 16;
2732 break;
2733
2734 case AArch64::LDURDi:
2735 case AArch64::STURDi:
2736 case AArch64::LDURXi:
2737 case AArch64::STURXi:
2738 NumBytes = 8;
2739 break;
2740
2741 case AArch64::LDURWi:
2742 case AArch64::LDURSWi:
2743 case AArch64::STURWi:
2744 NumBytes = 4;
2745 break;
2746
2747 case AArch64::LDURHi:
2748 case AArch64::STURHi:
2749 case AArch64::LDURHHi:
2750 case AArch64::STURHHi:
2751 case AArch64::LDURSHXi:
2752 case AArch64::LDURSHWi:
2753 NumBytes = 2;
2754 break;
2755
2756 case AArch64::LDRBroX:
2757 case AArch64::LDRBBroX:
2758 case AArch64::LDRSBXroX:
2759 case AArch64::LDRSBWroX:
2760 case AArch64::STRBroX:
2761 case AArch64::STRBBroX:
2762 case AArch64::LDURBi:
2763 case AArch64::LDURBBi:
2764 case AArch64::LDURSBXi:
2765 case AArch64::LDURSBWi:
2766 case AArch64::STURBi:
2767 case AArch64::STURBBi:
2768 case AArch64::LDRBui:
2769 case AArch64::LDRBBui:
2770 case AArch64::LDRSBXui:
2771 case AArch64::LDRSBWui:
2772 case AArch64::STRBui:
2773 case AArch64::STRBBui:
2774 NumBytes = 1;
2775 break;
2776
2777 case AArch64::LDRQroX:
2778 case AArch64::STRQroX:
2779 case AArch64::LDRQui:
2780 case AArch64::STRQui:
2781 NumBytes = 16;
2782 OffsetScale = 16;
2783 break;
2784
2785 case AArch64::LDRDroX:
2786 case AArch64::STRDroX:
2787 case AArch64::LDRXroX:
2788 case AArch64::STRXroX:
2789 case AArch64::LDRDui:
2790 case AArch64::STRDui:
2791 case AArch64::LDRXui:
2792 case AArch64::STRXui:
2793 NumBytes = 8;
2794 OffsetScale = 8;
2795 break;
2796
2797 case AArch64::LDRWroX:
2798 case AArch64::LDRSWroX:
2799 case AArch64::STRWroX:
2800 case AArch64::LDRWui:
2801 case AArch64::LDRSWui:
2802 case AArch64::STRWui:
2803 NumBytes = 4;
2804 OffsetScale = 4;
2805 break;
2806
2807 case AArch64::LDRHroX:
2808 case AArch64::STRHroX:
2809 case AArch64::LDRHHroX:
2810 case AArch64::STRHHroX:
2811 case AArch64::LDRSHXroX:
2812 case AArch64::LDRSHWroX:
2813 case AArch64::LDRHui:
2814 case AArch64::STRHui:
2815 case AArch64::LDRHHui:
2816 case AArch64::STRHHui:
2817 case AArch64::LDRSHXui:
2818 case AArch64::LDRSHWui:
2819 NumBytes = 2;
2820 OffsetScale = 2;
2821 break;
2822 }
2823
2824 // Check the fold operand is not the loaded/stored value.
2825 const MachineOperand &BaseRegOp = MemI.getOperand(i: 0);
2826 if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg)
2827 return false;
2828
2829 // Handle memory instructions with a [Reg, Reg] addressing mode.
2830 if (MemI.getOperand(i: 2).isReg()) {
2831 // Bail if the addressing mode already includes extension of the offset
2832 // register.
2833 if (MemI.getOperand(i: 3).getImm())
2834 return false;
2835
2836 // Check if we actually have a scaled offset.
2837 if (MemI.getOperand(i: 4).getImm() == 0)
2838 OffsetScale = 1;
2839
    // If the address-computing instruction feeds the base register, the
    // addressing mode must not have a scale; only then can we swap the base
    // and the scaled offset registers.
2843 if (MemI.getOperand(i: 1).getReg() == Reg && OffsetScale != 1)
2844 return false;
2845
2846 switch (AddrI.getOpcode()) {
2847 default:
2848 return false;
2849
2850 case AArch64::SBFMXri:
2851 // sxtw Xa, Wm
2852 // ldr Xd, [Xn, Xa, lsl #N]
2853 // ->
2854 // ldr Xd, [Xn, Wm, sxtw #N]
2855 if (AddrI.getOperand(i: 2).getImm() != 0 ||
2856 AddrI.getOperand(i: 3).getImm() != 31)
2857 return false;
2858
2859 AM.BaseReg = MemI.getOperand(i: 1).getReg();
2860 if (AM.BaseReg == Reg)
2861 AM.BaseReg = MemI.getOperand(i: 2).getReg();
2862 AM.ScaledReg = AddrI.getOperand(i: 1).getReg();
2863 AM.Scale = OffsetScale;
2864 AM.Displacement = 0;
2865 AM.Form = ExtAddrMode::Formula::SExtScaledReg;
2866 return true;
2867
2868 case TargetOpcode::SUBREG_TO_REG: {
2869 // mov Wa, Wm
2870 // ldr Xd, [Xn, Xa, lsl #N]
2871 // ->
2872 // ldr Xd, [Xn, Wm, uxtw #N]
2873
2874 // Zero-extension looks like an ORRWrs followed by a SUBREG_TO_REG.
2875 if (AddrI.getOperand(1).getImm() != 0 ||
2876 AddrI.getOperand(3).getImm() != AArch64::sub_32)
2877 return false;
2878
2879 const MachineRegisterInfo &MRI = AddrI.getMF()->getRegInfo();
2880 Register OffsetReg = AddrI.getOperand(i: 2).getReg();
2881 if (!OffsetReg.isVirtual() || !MRI.hasOneNonDBGUse(RegNo: OffsetReg))
2882 return false;
2883
2884 const MachineInstr &DefMI = *MRI.getVRegDef(Reg: OffsetReg);
2885 if (DefMI.getOpcode() != AArch64::ORRWrs ||
2886 DefMI.getOperand(1).getReg() != AArch64::WZR ||
2887 DefMI.getOperand(3).getImm() != 0)
2888 return false;
2889
2890 AM.BaseReg = MemI.getOperand(i: 1).getReg();
2891 if (AM.BaseReg == Reg)
2892 AM.BaseReg = MemI.getOperand(i: 2).getReg();
2893 AM.ScaledReg = DefMI.getOperand(i: 2).getReg();
2894 AM.Scale = OffsetScale;
2895 AM.Displacement = 0;
2896 AM.Form = ExtAddrMode::Formula::ZExtScaledReg;
2897 return true;
2898 }
2899 }
2900 }
2901
2902 // Handle memory instructions with a [Reg, #Imm] addressing mode.
2903
2904 // Check we are not breaking a potential conversion to an LDP.
2905 auto validateOffsetForLDP = [](unsigned NumBytes, int64_t OldOffset,
2906 int64_t NewOffset) -> bool {
2907 int64_t MinOffset, MaxOffset;
2908 switch (NumBytes) {
2909 default:
2910 return true;
2911 case 4:
2912 MinOffset = -256;
2913 MaxOffset = 252;
2914 break;
2915 case 8:
2916 MinOffset = -512;
2917 MaxOffset = 504;
2918 break;
2919 case 16:
2920 MinOffset = -1024;
2921 MaxOffset = 1008;
2922 break;
2923 }
2924 return OldOffset < MinOffset || OldOffset > MaxOffset ||
2925 (NewOffset >= MinOffset && NewOffset <= MaxOffset);
2926 };
2927 auto canFoldAddSubImmIntoAddrMode = [&](int64_t Disp) -> bool {
2928 int64_t OldOffset = MemI.getOperand(i: 2).getImm() * OffsetScale;
2929 int64_t NewOffset = OldOffset + Disp;
2930 if (!isLegalAddressingMode(NumBytes, Offset: NewOffset, /* Scale */ 0))
2931 return false;
2932 // If the old offset would fit into an LDP, but the new offset wouldn't,
2933 // bail out.
2934 if (!validateOffsetForLDP(NumBytes, OldOffset, NewOffset))
2935 return false;
2936 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
2937 AM.ScaledReg = 0;
2938 AM.Scale = 0;
2939 AM.Displacement = NewOffset;
2940 AM.Form = ExtAddrMode::Formula::Basic;
2941 return true;
2942 };
2943
2944 auto canFoldAddRegIntoAddrMode =
2945 [&](int64_t Scale,
2946 ExtAddrMode::Formula Form = ExtAddrMode::Formula::Basic) -> bool {
2947 if (MemI.getOperand(i: 2).getImm() != 0)
2948 return false;
2949 if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale))
2950 return false;
2951 AM.BaseReg = AddrI.getOperand(i: 1).getReg();
2952 AM.ScaledReg = AddrI.getOperand(i: 2).getReg();
2953 AM.Scale = Scale;
2954 AM.Displacement = 0;
2955 AM.Form = Form;
2956 return true;
2957 };
2958
2959 auto avoidSlowSTRQ = [&](const MachineInstr &MemI) {
2960 unsigned Opcode = MemI.getOpcode();
2961 return (Opcode == AArch64::STURQi || Opcode == AArch64::STRQui) &&
2962 Subtarget.isSTRQroSlow();
2963 };
2964
2965 int64_t Disp = 0;
2966 const bool OptSize = MemI.getMF()->getFunction().hasOptSize();
2967 switch (AddrI.getOpcode()) {
2968 default:
2969 return false;
2970
2971 case AArch64::ADDXri:
2972 // add Xa, Xn, #N
2973 // ldr Xd, [Xa, #M]
2974 // ->
2975 // ldr Xd, [Xn, #N'+M]
2976 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
2977 return canFoldAddSubImmIntoAddrMode(Disp);
2978
2979 case AArch64::SUBXri:
2980 // sub Xa, Xn, #N
2981 // ldr Xd, [Xa, #M]
2982 // ->
2983 // ldr Xd, [Xn, #N'+M]
2984 Disp = AddrI.getOperand(i: 2).getImm() << AddrI.getOperand(i: 3).getImm();
2985 return canFoldAddSubImmIntoAddrMode(-Disp);
2986
2987 case AArch64::ADDXrs: {
2988 // add Xa, Xn, Xm, lsl #N
2989 // ldr Xd, [Xa]
2990 // ->
2991 // ldr Xd, [Xn, Xm, lsl #N]
2992
2993 // Don't fold the add if the result would be slower, unless optimising for
2994 // size.
2995 unsigned Shift = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
2996 if (AArch64_AM::getShiftType(Imm: Shift) != AArch64_AM::ShiftExtendType::LSL)
2997 return false;
2998 Shift = AArch64_AM::getShiftValue(Imm: Shift);
2999 if (!OptSize) {
3000 if (Shift != 2 && Shift != 3 && Subtarget.hasAddrLSLSlow14())
3001 return false;
3002 if (avoidSlowSTRQ(MemI))
3003 return false;
3004 }
3005 return canFoldAddRegIntoAddrMode(1ULL << Shift);
3006 }
3007
3008 case AArch64::ADDXrr:
3009 // add Xa, Xn, Xm
3010 // ldr Xd, [Xa]
3011 // ->
3012 // ldr Xd, [Xn, Xm, lsl #0]
3013
3014 // Don't fold the add if the result would be slower, unless optimising for
3015 // size.
3016 if (!OptSize && avoidSlowSTRQ(MemI))
3017 return false;
3018 return canFoldAddRegIntoAddrMode(1);
3019
3020 case AArch64::ADDXrx:
3021 // add Xa, Xn, Wm, {s,u}xtw #N
3022 // ldr Xd, [Xa]
3023 // ->
3024 // ldr Xd, [Xn, Wm, {s,u}xtw #N]
3025
3026 // Don't fold the add if the result would be slower, unless optimising for
3027 // size.
3028 if (!OptSize && avoidSlowSTRQ(MemI))
3029 return false;
3030
3031 // Can fold only sign-/zero-extend of a word.
3032 unsigned Imm = static_cast<unsigned>(AddrI.getOperand(i: 3).getImm());
3033 AArch64_AM::ShiftExtendType Extend = AArch64_AM::getArithExtendType(Imm);
3034 if (Extend != AArch64_AM::UXTW && Extend != AArch64_AM::SXTW)
3035 return false;
3036
3037 return canFoldAddRegIntoAddrMode(
3038 1ULL << AArch64_AM::getArithShiftValue(Imm),
3039 (Extend == AArch64_AM::SXTW) ? ExtAddrMode::Formula::SExtScaledReg
3040 : ExtAddrMode::Formula::ZExtScaledReg);
3041 }
3042}
3043
3044// Given an opcode for an instruction with a [Reg, #Imm] addressing mode,
3045// return the opcode of an instruction performing the same operation, but using
3046// the [Reg, Reg] addressing mode.
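// For example, both AArch64::LDRXui and AArch64::LDURXi map to
// AArch64::LDRXroX.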
3047static unsigned regOffsetOpcode(unsigned Opcode) {
3048 switch (Opcode) {
3049 default:
3050 llvm_unreachable("Address folding not implemented for instruction");
3051
3052 case AArch64::LDURQi:
3053 case AArch64::LDRQui:
3054 return AArch64::LDRQroX;
3055 case AArch64::STURQi:
3056 case AArch64::STRQui:
3057 return AArch64::STRQroX;
3058 case AArch64::LDURDi:
3059 case AArch64::LDRDui:
3060 return AArch64::LDRDroX;
3061 case AArch64::STURDi:
3062 case AArch64::STRDui:
3063 return AArch64::STRDroX;
3064 case AArch64::LDURXi:
3065 case AArch64::LDRXui:
3066 return AArch64::LDRXroX;
3067 case AArch64::STURXi:
3068 case AArch64::STRXui:
3069 return AArch64::STRXroX;
3070 case AArch64::LDURWi:
3071 case AArch64::LDRWui:
3072 return AArch64::LDRWroX;
3073 case AArch64::LDURSWi:
3074 case AArch64::LDRSWui:
3075 return AArch64::LDRSWroX;
3076 case AArch64::STURWi:
3077 case AArch64::STRWui:
3078 return AArch64::STRWroX;
3079 case AArch64::LDURHi:
3080 case AArch64::LDRHui:
3081 return AArch64::LDRHroX;
3082 case AArch64::STURHi:
3083 case AArch64::STRHui:
3084 return AArch64::STRHroX;
3085 case AArch64::LDURHHi:
3086 case AArch64::LDRHHui:
3087 return AArch64::LDRHHroX;
3088 case AArch64::STURHHi:
3089 case AArch64::STRHHui:
3090 return AArch64::STRHHroX;
3091 case AArch64::LDURSHXi:
3092 case AArch64::LDRSHXui:
3093 return AArch64::LDRSHXroX;
3094 case AArch64::LDURSHWi:
3095 case AArch64::LDRSHWui:
3096 return AArch64::LDRSHWroX;
3097 case AArch64::LDURBi:
3098 case AArch64::LDRBui:
3099 return AArch64::LDRBroX;
3100 case AArch64::LDURBBi:
3101 case AArch64::LDRBBui:
3102 return AArch64::LDRBBroX;
3103 case AArch64::LDURSBXi:
3104 case AArch64::LDRSBXui:
3105 return AArch64::LDRSBXroX;
3106 case AArch64::LDURSBWi:
3107 case AArch64::LDRSBWui:
3108 return AArch64::LDRSBWroX;
3109 case AArch64::STURBi:
3110 case AArch64::STRBui:
3111 return AArch64::STRBroX;
3112 case AArch64::STURBBi:
3113 case AArch64::STRBBui:
3114 return AArch64::STRBBroX;
3115 }
3116}
3117
3118// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3119// the opcode of an instruction performing the same operation, but using the
3120// [Reg, #Imm] addressing mode with scaled offset.
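// For example, AArch64::LDURXi maps to AArch64::LDRXui with Scale set to 8.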
3121unsigned scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) {
3122 switch (Opcode) {
3123 default:
3124 llvm_unreachable("Address folding not implemented for instruction");
3125
3126 case AArch64::LDURQi:
3127 Scale = 16;
3128 return AArch64::LDRQui;
3129 case AArch64::STURQi:
3130 Scale = 16;
3131 return AArch64::STRQui;
3132 case AArch64::LDURDi:
3133 Scale = 8;
3134 return AArch64::LDRDui;
3135 case AArch64::STURDi:
3136 Scale = 8;
3137 return AArch64::STRDui;
3138 case AArch64::LDURXi:
3139 Scale = 8;
3140 return AArch64::LDRXui;
3141 case AArch64::STURXi:
3142 Scale = 8;
3143 return AArch64::STRXui;
3144 case AArch64::LDURWi:
3145 Scale = 4;
3146 return AArch64::LDRWui;
3147 case AArch64::LDURSWi:
3148 Scale = 4;
3149 return AArch64::LDRSWui;
3150 case AArch64::STURWi:
3151 Scale = 4;
3152 return AArch64::STRWui;
3153 case AArch64::LDURHi:
3154 Scale = 2;
3155 return AArch64::LDRHui;
3156 case AArch64::STURHi:
3157 Scale = 2;
3158 return AArch64::STRHui;
3159 case AArch64::LDURHHi:
3160 Scale = 2;
3161 return AArch64::LDRHHui;
3162 case AArch64::STURHHi:
3163 Scale = 2;
3164 return AArch64::STRHHui;
3165 case AArch64::LDURSHXi:
3166 Scale = 2;
3167 return AArch64::LDRSHXui;
3168 case AArch64::LDURSHWi:
3169 Scale = 2;
3170 return AArch64::LDRSHWui;
3171 case AArch64::LDURBi:
3172 Scale = 1;
3173 return AArch64::LDRBui;
3174 case AArch64::LDURBBi:
3175 Scale = 1;
3176 return AArch64::LDRBBui;
3177 case AArch64::LDURSBXi:
3178 Scale = 1;
3179 return AArch64::LDRSBXui;
3180 case AArch64::LDURSBWi:
3181 Scale = 1;
3182 return AArch64::LDRSBWui;
3183 case AArch64::STURBi:
3184 Scale = 1;
3185 return AArch64::STRBui;
3186 case AArch64::STURBBi:
3187 Scale = 1;
3188 return AArch64::STRBBui;
3189 case AArch64::LDRQui:
3190 case AArch64::STRQui:
3191 Scale = 16;
3192 return Opcode;
3193 case AArch64::LDRDui:
3194 case AArch64::STRDui:
3195 case AArch64::LDRXui:
3196 case AArch64::STRXui:
3197 Scale = 8;
3198 return Opcode;
3199 case AArch64::LDRWui:
3200 case AArch64::LDRSWui:
3201 case AArch64::STRWui:
3202 Scale = 4;
3203 return Opcode;
3204 case AArch64::LDRHui:
3205 case AArch64::STRHui:
3206 case AArch64::LDRHHui:
3207 case AArch64::STRHHui:
3208 case AArch64::LDRSHXui:
3209 case AArch64::LDRSHWui:
3210 Scale = 2;
3211 return Opcode;
3212 case AArch64::LDRBui:
3213 case AArch64::LDRBBui:
3214 case AArch64::LDRSBXui:
3215 case AArch64::LDRSBWui:
3216 case AArch64::STRBui:
3217 case AArch64::STRBBui:
3218 Scale = 1;
3219 return Opcode;
3220 }
3221}
3222
3223// Given an opcode for an instruction with a [Reg, #Imm] addressing mode, return
3224// the opcode of an instruction performing the same operation, but using the
3225// [Reg, #Imm] addressing mode with unscaled offset.
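// For example, AArch64::LDRXui maps to AArch64::LDURXi, while opcodes that
// are already unscaled are returned unchanged.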
3226unsigned unscaledOffsetOpcode(unsigned Opcode) {
3227 switch (Opcode) {
3228 default:
3229 llvm_unreachable("Address folding not implemented for instruction");
3230
3231 case AArch64::LDURQi:
3232 case AArch64::STURQi:
3233 case AArch64::LDURDi:
3234 case AArch64::STURDi:
3235 case AArch64::LDURXi:
3236 case AArch64::STURXi:
3237 case AArch64::LDURWi:
3238 case AArch64::LDURSWi:
3239 case AArch64::STURWi:
3240 case AArch64::LDURHi:
3241 case AArch64::STURHi:
3242 case AArch64::LDURHHi:
3243 case AArch64::STURHHi:
3244 case AArch64::LDURSHXi:
3245 case AArch64::LDURSHWi:
3246 case AArch64::LDURBi:
3247 case AArch64::STURBi:
3248 case AArch64::LDURBBi:
3249 case AArch64::STURBBi:
3250 case AArch64::LDURSBWi:
3251 case AArch64::LDURSBXi:
3252 return Opcode;
3253 case AArch64::LDRQui:
3254 return AArch64::LDURQi;
3255 case AArch64::STRQui:
3256 return AArch64::STURQi;
3257 case AArch64::LDRDui:
3258 return AArch64::LDURDi;
3259 case AArch64::STRDui:
3260 return AArch64::STURDi;
3261 case AArch64::LDRXui:
3262 return AArch64::LDURXi;
3263 case AArch64::STRXui:
3264 return AArch64::STURXi;
3265 case AArch64::LDRWui:
3266 return AArch64::LDURWi;
3267 case AArch64::LDRSWui:
3268 return AArch64::LDURSWi;
3269 case AArch64::STRWui:
3270 return AArch64::STURWi;
3271 case AArch64::LDRHui:
3272 return AArch64::LDURHi;
3273 case AArch64::STRHui:
3274 return AArch64::STURHi;
3275 case AArch64::LDRHHui:
3276 return AArch64::LDURHHi;
3277 case AArch64::STRHHui:
3278 return AArch64::STURHHi;
3279 case AArch64::LDRSHXui:
3280 return AArch64::LDURSHXi;
3281 case AArch64::LDRSHWui:
3282 return AArch64::LDURSHWi;
3283 case AArch64::LDRBBui:
3284 return AArch64::LDURBBi;
3285 case AArch64::LDRBui:
3286 return AArch64::LDURBi;
3287 case AArch64::STRBBui:
3288 return AArch64::STURBBi;
3289 case AArch64::STRBui:
3290 return AArch64::STURBi;
3291 case AArch64::LDRSBWui:
3292 return AArch64::LDURSBWi;
3293 case AArch64::LDRSBXui:
3294 return AArch64::LDURSBXi;
3295 }
3296}
3297
3298// Given the opcode of a memory load/store instruction, return the opcode of an
3299// instruction performing the same operation, but using
3300// the [Reg, Reg, {s,u}xtw #N] addressing mode with sign-/zero-extend of the
3301// offset register.
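// For example, AArch64::LDRXroX, AArch64::LDRXui and AArch64::LDURXi all map
// to AArch64::LDRXroW.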
3302static unsigned offsetExtendOpcode(unsigned Opcode) {
3303 switch (Opcode) {
3304 default:
3305 llvm_unreachable("Address folding not implemented for instruction");
3306
3307 case AArch64::LDRQroX:
3308 case AArch64::LDURQi:
3309 case AArch64::LDRQui:
3310 return AArch64::LDRQroW;
3311 case AArch64::STRQroX:
3312 case AArch64::STURQi:
3313 case AArch64::STRQui:
3314 return AArch64::STRQroW;
3315 case AArch64::LDRDroX:
3316 case AArch64::LDURDi:
3317 case AArch64::LDRDui:
3318 return AArch64::LDRDroW;
3319 case AArch64::STRDroX:
3320 case AArch64::STURDi:
3321 case AArch64::STRDui:
3322 return AArch64::STRDroW;
3323 case AArch64::LDRXroX:
3324 case AArch64::LDURXi:
3325 case AArch64::LDRXui:
3326 return AArch64::LDRXroW;
3327 case AArch64::STRXroX:
3328 case AArch64::STURXi:
3329 case AArch64::STRXui:
3330 return AArch64::STRXroW;
3331 case AArch64::LDRWroX:
3332 case AArch64::LDURWi:
3333 case AArch64::LDRWui:
3334 return AArch64::LDRWroW;
3335 case AArch64::LDRSWroX:
3336 case AArch64::LDURSWi:
3337 case AArch64::LDRSWui:
3338 return AArch64::LDRSWroW;
3339 case AArch64::STRWroX:
3340 case AArch64::STURWi:
3341 case AArch64::STRWui:
3342 return AArch64::STRWroW;
3343 case AArch64::LDRHroX:
3344 case AArch64::LDURHi:
3345 case AArch64::LDRHui:
3346 return AArch64::LDRHroW;
3347 case AArch64::STRHroX:
3348 case AArch64::STURHi:
3349 case AArch64::STRHui:
3350 return AArch64::STRHroW;
3351 case AArch64::LDRHHroX:
3352 case AArch64::LDURHHi:
3353 case AArch64::LDRHHui:
3354 return AArch64::LDRHHroW;
3355 case AArch64::STRHHroX:
3356 case AArch64::STURHHi:
3357 case AArch64::STRHHui:
3358 return AArch64::STRHHroW;
3359 case AArch64::LDRSHXroX:
3360 case AArch64::LDURSHXi:
3361 case AArch64::LDRSHXui:
3362 return AArch64::LDRSHXroW;
3363 case AArch64::LDRSHWroX:
3364 case AArch64::LDURSHWi:
3365 case AArch64::LDRSHWui:
3366 return AArch64::LDRSHWroW;
3367 case AArch64::LDRBroX:
3368 case AArch64::LDURBi:
3369 case AArch64::LDRBui:
3370 return AArch64::LDRBroW;
3371 case AArch64::LDRBBroX:
3372 case AArch64::LDURBBi:
3373 case AArch64::LDRBBui:
3374 return AArch64::LDRBBroW;
3375 case AArch64::LDRSBXroX:
3376 case AArch64::LDURSBXi:
3377 case AArch64::LDRSBXui:
3378 return AArch64::LDRSBXroW;
3379 case AArch64::LDRSBWroX:
3380 case AArch64::LDURSBWi:
3381 case AArch64::LDRSBWui:
3382 return AArch64::LDRSBWroW;
3383 case AArch64::STRBroX:
3384 case AArch64::STURBi:
3385 case AArch64::STRBui:
3386 return AArch64::STRBroW;
3387 case AArch64::STRBBroX:
3388 case AArch64::STURBBi:
3389 case AArch64::STRBBui:
3390 return AArch64::STRBBroW;
3391 }
3392}
3393
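// Re-emit the memory access MemI using the addressing mode AM, as computed
// by canFoldIntoAddrMode. For example (illustrative), with
//   AM = { BaseReg = x0, ScaledReg = x2, Scale = 8, Form = Basic }
// an 'ldr x3, [x1]' is rewritten as 'ldr x3, [x0, x2, lsl #3]'.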
3394MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
3395 const ExtAddrMode &AM) const {
3396
3397 const DebugLoc &DL = MemI.getDebugLoc();
3398 MachineBasicBlock &MBB = *MemI.getParent();
3399 MachineRegisterInfo &MRI = MemI.getMF()->getRegInfo();
3400
3401 if (AM.Form == ExtAddrMode::Formula::Basic) {
3402 if (AM.ScaledReg) {
3403 // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`.
3404 unsigned Opcode = regOffsetOpcode(Opcode: MemI.getOpcode());
3405 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3406 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3407 .addReg(MemI.getOperand(i: 0).getReg(),
3408 MemI.mayLoad() ? RegState::Define : 0)
3409 .addReg(AM.BaseReg)
3410 .addReg(AM.ScaledReg)
3411 .addImm(0)
3412 .addImm(AM.Scale > 1)
3413 .setMemRefs(MemI.memoperands())
3414 .setMIFlags(MemI.getFlags());
3415 return B.getInstr();
3416 }
3417
3418 assert(AM.ScaledReg == 0 && AM.Scale == 0 &&
3419 "Addressing mode not supported for folding");
3420
3421 // The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`.
3422 unsigned Scale = 1;
3423 unsigned Opcode = MemI.getOpcode();
3424 if (isInt<9>(x: AM.Displacement))
3425 Opcode = unscaledOffsetOpcode(Opcode);
3426 else
3427 Opcode = scaledOffsetOpcode(Opcode, Scale);
3428
3429 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3430 .addReg(MemI.getOperand(i: 0).getReg(),
3431 MemI.mayLoad() ? RegState::Define : 0)
3432 .addReg(AM.BaseReg)
3433 .addImm(AM.Displacement / Scale)
3434 .setMemRefs(MemI.memoperands())
3435 .setMIFlags(MemI.getFlags());
3436 return B.getInstr();
3437 }
3438
3439 if (AM.Form == ExtAddrMode::Formula::SExtScaledReg ||
3440 AM.Form == ExtAddrMode::Formula::ZExtScaledReg) {
3441 // The new instruction will be in the form `ldr Rt, [Xn, Wm, {s,u}xtw #N]`.
3442 assert(AM.ScaledReg && !AM.Displacement &&
3443 "Address offset can be a register or an immediate, but not both");
3444 unsigned Opcode = offsetExtendOpcode(Opcode: MemI.getOpcode());
3445 MRI.constrainRegClass(AM.BaseReg, &AArch64::GPR64spRegClass);
3446 // Make sure the offset register is in the correct register class.
3447 Register OffsetReg = AM.ScaledReg;
3448 const TargetRegisterClass *RC = MRI.getRegClass(Reg: OffsetReg);
3449 if (RC->hasSuperClassEq(&AArch64::GPR64RegClass)) {
3450 OffsetReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3451 BuildMI(MBB, MemI, DL, get(TargetOpcode::COPY), OffsetReg)
3452 .addReg(AM.ScaledReg, 0, AArch64::sub_32);
3453 }
3454 auto B = BuildMI(MBB, MemI, DL, get(Opcode))
3455 .addReg(MemI.getOperand(i: 0).getReg(),
3456 MemI.mayLoad() ? RegState::Define : 0)
3457 .addReg(AM.BaseReg)
3458 .addReg(OffsetReg)
3459 .addImm(AM.Form == ExtAddrMode::Formula::SExtScaledReg)
3460 .addImm(AM.Scale != 1)
3461 .setMemRefs(MemI.memoperands())
3462 .setMIFlags(MemI.getFlags());
3463
3464 return B.getInstr();
3465 }
3466
3467 llvm_unreachable(
3468 "Function must not be called with an addressing mode it can't handle");
3469}
3470
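// Decompose a base+immediate load/store into its base operand, byte offset
// and access width. The returned offset is the immediate multiplied by the
// opcode's scale (possibly a scalable quantity); returns false for addressing
// forms this function does not handle.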
3471bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
3472 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
3473 bool &OffsetIsScalable, TypeSize &Width,
3474 const TargetRegisterInfo *TRI) const {
3475 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3476 // Handle only loads/stores with base register followed by immediate offset.
3477 if (LdSt.getNumExplicitOperands() == 3) {
3478 // Non-paired instruction (e.g., ldr x1, [x0, #8]).
3479 if ((!LdSt.getOperand(i: 1).isReg() && !LdSt.getOperand(i: 1).isFI()) ||
3480 !LdSt.getOperand(i: 2).isImm())
3481 return false;
3482 } else if (LdSt.getNumExplicitOperands() == 4) {
3483 // Paired instruction (e.g., ldp x1, x2, [x0, #8]).
3484 if (!LdSt.getOperand(i: 1).isReg() ||
3485 (!LdSt.getOperand(i: 2).isReg() && !LdSt.getOperand(i: 2).isFI()) ||
3486 !LdSt.getOperand(i: 3).isImm())
3487 return false;
3488 } else
3489 return false;
3490
3491 // Get the scaling factor for the instruction and set the width for the
3492 // instruction.
3493 TypeSize Scale(0U, false);
3494 int64_t Dummy1, Dummy2;
3495
3496 // If this returns false, then it's an instruction we don't want to handle.
3497 if (!getMemOpInfo(Opcode: LdSt.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2))
3498 return false;
3499
3500 // Compute the offset. Offset is calculated as the immediate operand
3501 // multiplied by the scaling factor. Unscaled instructions have scaling factor
3502 // set to 1.
3503 if (LdSt.getNumExplicitOperands() == 3) {
3504 BaseOp = &LdSt.getOperand(i: 1);
3505 Offset = LdSt.getOperand(i: 2).getImm() * Scale.getKnownMinValue();
3506 } else {
3507 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
3508 BaseOp = &LdSt.getOperand(i: 2);
3509 Offset = LdSt.getOperand(i: 3).getImm() * Scale.getKnownMinValue();
3510 }
3511 OffsetIsScalable = Scale.isScalable();
3512
3513 if (!BaseOp->isReg() && !BaseOp->isFI())
3514 return false;
3515
3516 return true;
3517}
3518
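// Return the immediate offset operand of a base+immediate load/store; it is
// always the last explicit operand.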
3519MachineOperand &
3520AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
3521 assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
3522 MachineOperand &OfsOp = LdSt.getOperand(i: LdSt.getNumExplicitOperands() - 1);
3523 assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
3524 return OfsOp;
3525}
3526
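// For the given load/store opcode, return the offset scaling factor, the
// access width, and the legal [MinOffset, MaxOffset] range of the immediate.
// MinOffset/MaxOffset are in units of Scale; e.g. LDPXi has Scale = 8 and an
// immediate range of [-64, 63], i.e. byte offsets in [-512, 504]. Returns
// false for opcodes that are not handled here.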
3527bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
3528 TypeSize &Width, int64_t &MinOffset,
3529 int64_t &MaxOffset) {
3530 switch (Opcode) {
3531  // Not a memory operation, or not something we want to handle.
3532 default:
3533 Scale = TypeSize::getFixed(ExactSize: 0);
3534 Width = TypeSize::getFixed(ExactSize: 0);
3535 MinOffset = MaxOffset = 0;
3536 return false;
3537 case AArch64::STRWpost:
3538 case AArch64::LDRWpost:
3539 Width = TypeSize::getFixed(ExactSize: 32);
3540 Scale = TypeSize::getFixed(ExactSize: 4);
3541 MinOffset = -256;
3542 MaxOffset = 255;
3543 break;
3544 case AArch64::LDURQi:
3545 case AArch64::STURQi:
3546 Width = TypeSize::getFixed(ExactSize: 16);
3547 Scale = TypeSize::getFixed(ExactSize: 1);
3548 MinOffset = -256;
3549 MaxOffset = 255;
3550 break;
3551 case AArch64::PRFUMi:
3552 case AArch64::LDURXi:
3553 case AArch64::LDURDi:
3554 case AArch64::LDAPURXi:
3555 case AArch64::STURXi:
3556 case AArch64::STURDi:
3557 case AArch64::STLURXi:
3558 Width = TypeSize::getFixed(ExactSize: 8);
3559 Scale = TypeSize::getFixed(ExactSize: 1);
3560 MinOffset = -256;
3561 MaxOffset = 255;
3562 break;
3563 case AArch64::LDURWi:
3564 case AArch64::LDURSi:
3565 case AArch64::LDURSWi:
3566 case AArch64::LDAPURi:
3567 case AArch64::LDAPURSWi:
3568 case AArch64::STURWi:
3569 case AArch64::STURSi:
3570 case AArch64::STLURWi:
3571 Width = TypeSize::getFixed(ExactSize: 4);
3572 Scale = TypeSize::getFixed(ExactSize: 1);
3573 MinOffset = -256;
3574 MaxOffset = 255;
3575 break;
3576 case AArch64::LDURHi:
3577 case AArch64::LDURHHi:
3578 case AArch64::LDURSHXi:
3579 case AArch64::LDURSHWi:
3580 case AArch64::LDAPURHi:
3581 case AArch64::LDAPURSHWi:
3582 case AArch64::LDAPURSHXi:
3583 case AArch64::STURHi:
3584 case AArch64::STURHHi:
3585 case AArch64::STLURHi:
3586 Width = TypeSize::getFixed(ExactSize: 2);
3587 Scale = TypeSize::getFixed(ExactSize: 1);
3588 MinOffset = -256;
3589 MaxOffset = 255;
3590 break;
3591 case AArch64::LDURBi:
3592 case AArch64::LDURBBi:
3593 case AArch64::LDURSBXi:
3594 case AArch64::LDURSBWi:
3595 case AArch64::LDAPURBi:
3596 case AArch64::LDAPURSBWi:
3597 case AArch64::LDAPURSBXi:
3598 case AArch64::STURBi:
3599 case AArch64::STURBBi:
3600 case AArch64::STLURBi:
3601 Width = TypeSize::getFixed(ExactSize: 1);
3602 Scale = TypeSize::getFixed(ExactSize: 1);
3603 MinOffset = -256;
3604 MaxOffset = 255;
3605 break;
3606 case AArch64::LDPQi:
3607 case AArch64::LDNPQi:
3608 case AArch64::STPQi:
3609 case AArch64::STNPQi:
3610 Scale = TypeSize::getFixed(ExactSize: 16);
3611 Width = TypeSize::getFixed(ExactSize: 32);
3612 MinOffset = -64;
3613 MaxOffset = 63;
3614 break;
3615 case AArch64::LDRQui:
3616 case AArch64::STRQui:
3617 Scale = TypeSize::getFixed(ExactSize: 16);
3618 Width = TypeSize::getFixed(ExactSize: 16);
3619 MinOffset = 0;
3620 MaxOffset = 4095;
3621 break;
3622 case AArch64::LDPXi:
3623 case AArch64::LDPDi:
3624 case AArch64::LDNPXi:
3625 case AArch64::LDNPDi:
3626 case AArch64::STPXi:
3627 case AArch64::STPDi:
3628 case AArch64::STNPXi:
3629 case AArch64::STNPDi:
3630 Scale = TypeSize::getFixed(ExactSize: 8);
3631 Width = TypeSize::getFixed(ExactSize: 16);
3632 MinOffset = -64;
3633 MaxOffset = 63;
3634 break;
3635 case AArch64::PRFMui:
3636 case AArch64::LDRXui:
3637 case AArch64::LDRDui:
3638 case AArch64::STRXui:
3639 case AArch64::STRDui:
3640 Scale = TypeSize::getFixed(ExactSize: 8);
3641 Width = TypeSize::getFixed(ExactSize: 8);
3642 MinOffset = 0;
3643 MaxOffset = 4095;
3644 break;
3645 case AArch64::StoreSwiftAsyncContext:
3646 // Store is an STRXui, but there might be an ADDXri in the expansion too.
3647 Scale = TypeSize::getFixed(ExactSize: 1);
3648 Width = TypeSize::getFixed(ExactSize: 8);
3649 MinOffset = 0;
3650 MaxOffset = 4095;
3651 break;
3652 case AArch64::LDPWi:
3653 case AArch64::LDPSi:
3654 case AArch64::LDNPWi:
3655 case AArch64::LDNPSi:
3656 case AArch64::STPWi:
3657 case AArch64::STPSi:
3658 case AArch64::STNPWi:
3659 case AArch64::STNPSi:
3660 Scale = TypeSize::getFixed(ExactSize: 4);
3661 Width = TypeSize::getFixed(ExactSize: 8);
3662 MinOffset = -64;
3663 MaxOffset = 63;
3664 break;
3665 case AArch64::LDRWui:
3666 case AArch64::LDRSui:
3667 case AArch64::LDRSWui:
3668 case AArch64::STRWui:
3669 case AArch64::STRSui:
3670 Scale = TypeSize::getFixed(ExactSize: 4);
3671 Width = TypeSize::getFixed(ExactSize: 4);
3672 MinOffset = 0;
3673 MaxOffset = 4095;
3674 break;
3675 case AArch64::LDRHui:
3676 case AArch64::LDRHHui:
3677 case AArch64::LDRSHWui:
3678 case AArch64::LDRSHXui:
3679 case AArch64::STRHui:
3680 case AArch64::STRHHui:
3681 Scale = TypeSize::getFixed(ExactSize: 2);
3682 Width = TypeSize::getFixed(ExactSize: 2);
3683 MinOffset = 0;
3684 MaxOffset = 4095;
3685 break;
3686 case AArch64::LDRBui:
3687 case AArch64::LDRBBui:
3688 case AArch64::LDRSBWui:
3689 case AArch64::LDRSBXui:
3690 case AArch64::STRBui:
3691 case AArch64::STRBBui:
3692 Scale = TypeSize::getFixed(ExactSize: 1);
3693 Width = TypeSize::getFixed(ExactSize: 1);
3694 MinOffset = 0;
3695 MaxOffset = 4095;
3696 break;
3697 case AArch64::STPXpre:
3698 case AArch64::LDPXpost:
3699 case AArch64::STPDpre:
3700 case AArch64::LDPDpost:
3701 Scale = TypeSize::getFixed(ExactSize: 8);
3702 Width = TypeSize::getFixed(ExactSize: 8);
3703 MinOffset = -512;
3704 MaxOffset = 504;
3705 break;
3706 case AArch64::STPQpre:
3707 case AArch64::LDPQpost:
3708 Scale = TypeSize::getFixed(ExactSize: 16);
3709 Width = TypeSize::getFixed(ExactSize: 16);
3710 MinOffset = -1024;
3711 MaxOffset = 1008;
3712 break;
3713 case AArch64::STRXpre:
3714 case AArch64::STRDpre:
3715 case AArch64::LDRXpost:
3716 case AArch64::LDRDpost:
3717 Scale = TypeSize::getFixed(ExactSize: 1);
3718 Width = TypeSize::getFixed(ExactSize: 8);
3719 MinOffset = -256;
3720 MaxOffset = 255;
3721 break;
3722 case AArch64::STRQpre:
3723 case AArch64::LDRQpost:
3724 Scale = TypeSize::getFixed(ExactSize: 1);
3725 Width = TypeSize::getFixed(ExactSize: 16);
3726 MinOffset = -256;
3727 MaxOffset = 255;
3728 break;
3729 case AArch64::ADDG:
3730 Scale = TypeSize::getFixed(ExactSize: 16);
3731 Width = TypeSize::getFixed(ExactSize: 0);
3732 MinOffset = 0;
3733 MaxOffset = 63;
3734 break;
3735 case AArch64::TAGPstack:
3736 Scale = TypeSize::getFixed(ExactSize: 16);
3737 Width = TypeSize::getFixed(ExactSize: 0);
3738 // TAGP with a negative offset turns into SUBP, which has a maximum offset
3739 // of 63 (not 64!).
3740 MinOffset = -63;
3741 MaxOffset = 63;
3742 break;
3743 case AArch64::LDG:
3744 case AArch64::STGi:
3745 case AArch64::STZGi:
3746 Scale = TypeSize::getFixed(ExactSize: 16);
3747 Width = TypeSize::getFixed(ExactSize: 16);
3748 MinOffset = -256;
3749 MaxOffset = 255;
3750 break;
3751 case AArch64::STR_ZZZZXI:
3752 case AArch64::LDR_ZZZZXI:
3753 Scale = TypeSize::getScalable(MinimumSize: 16);
3754 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
3755 MinOffset = -256;
3756 MaxOffset = 252;
3757 break;
3758 case AArch64::STR_ZZZXI:
3759 case AArch64::LDR_ZZZXI:
3760 Scale = TypeSize::getScalable(MinimumSize: 16);
3761 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
3762 MinOffset = -256;
3763 MaxOffset = 253;
3764 break;
3765 case AArch64::STR_ZZXI:
3766 case AArch64::LDR_ZZXI:
3767 Scale = TypeSize::getScalable(MinimumSize: 16);
3768 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
3769 MinOffset = -256;
3770 MaxOffset = 254;
3771 break;
3772 case AArch64::LDR_PXI:
3773 case AArch64::STR_PXI:
3774 Scale = TypeSize::getScalable(MinimumSize: 2);
3775 Width = TypeSize::getScalable(MinimumSize: 2);
3776 MinOffset = -256;
3777 MaxOffset = 255;
3778 break;
3779 case AArch64::LDR_PPXI:
3780 case AArch64::STR_PPXI:
3781 Scale = TypeSize::getScalable(MinimumSize: 2);
3782 Width = TypeSize::getScalable(MinimumSize: 2 * 2);
3783 MinOffset = -256;
3784 MaxOffset = 254;
3785 break;
3786 case AArch64::LDR_ZXI:
3787 case AArch64::STR_ZXI:
3788 Scale = TypeSize::getScalable(MinimumSize: 16);
3789 Width = TypeSize::getScalable(MinimumSize: 16);
3790 MinOffset = -256;
3791 MaxOffset = 255;
3792 break;
3793 case AArch64::LD1B_IMM:
3794 case AArch64::LD1H_IMM:
3795 case AArch64::LD1W_IMM:
3796 case AArch64::LD1D_IMM:
3797 case AArch64::LDNT1B_ZRI:
3798 case AArch64::LDNT1H_ZRI:
3799 case AArch64::LDNT1W_ZRI:
3800 case AArch64::LDNT1D_ZRI:
3801 case AArch64::ST1B_IMM:
3802 case AArch64::ST1H_IMM:
3803 case AArch64::ST1W_IMM:
3804 case AArch64::ST1D_IMM:
3805 case AArch64::STNT1B_ZRI:
3806 case AArch64::STNT1H_ZRI:
3807 case AArch64::STNT1W_ZRI:
3808 case AArch64::STNT1D_ZRI:
3809 case AArch64::LDNF1B_IMM:
3810 case AArch64::LDNF1H_IMM:
3811 case AArch64::LDNF1W_IMM:
3812 case AArch64::LDNF1D_IMM:
3813    // A full vector's worth of data
3814 // Width = mbytes * elements
3815 Scale = TypeSize::getScalable(MinimumSize: 16);
3816 Width = TypeSize::getScalable(MinimumSize: 16);
3817 MinOffset = -8;
3818 MaxOffset = 7;
3819 break;
3820 case AArch64::LD2B_IMM:
3821 case AArch64::LD2H_IMM:
3822 case AArch64::LD2W_IMM:
3823 case AArch64::LD2D_IMM:
3824 case AArch64::ST2B_IMM:
3825 case AArch64::ST2H_IMM:
3826 case AArch64::ST2W_IMM:
3827 case AArch64::ST2D_IMM:
3828 Scale = TypeSize::getScalable(MinimumSize: 32);
3829 Width = TypeSize::getScalable(MinimumSize: 16 * 2);
3830 MinOffset = -8;
3831 MaxOffset = 7;
3832 break;
3833 case AArch64::LD3B_IMM:
3834 case AArch64::LD3H_IMM:
3835 case AArch64::LD3W_IMM:
3836 case AArch64::LD3D_IMM:
3837 case AArch64::ST3B_IMM:
3838 case AArch64::ST3H_IMM:
3839 case AArch64::ST3W_IMM:
3840 case AArch64::ST3D_IMM:
3841 Scale = TypeSize::getScalable(MinimumSize: 48);
3842 Width = TypeSize::getScalable(MinimumSize: 16 * 3);
3843 MinOffset = -8;
3844 MaxOffset = 7;
3845 break;
3846 case AArch64::LD4B_IMM:
3847 case AArch64::LD4H_IMM:
3848 case AArch64::LD4W_IMM:
3849 case AArch64::LD4D_IMM:
3850 case AArch64::ST4B_IMM:
3851 case AArch64::ST4H_IMM:
3852 case AArch64::ST4W_IMM:
3853 case AArch64::ST4D_IMM:
3854 Scale = TypeSize::getScalable(MinimumSize: 64);
3855 Width = TypeSize::getScalable(MinimumSize: 16 * 4);
3856 MinOffset = -8;
3857 MaxOffset = 7;
3858 break;
3859 case AArch64::LD1B_H_IMM:
3860 case AArch64::LD1SB_H_IMM:
3861 case AArch64::LD1H_S_IMM:
3862 case AArch64::LD1SH_S_IMM:
3863 case AArch64::LD1W_D_IMM:
3864 case AArch64::LD1SW_D_IMM:
3865 case AArch64::ST1B_H_IMM:
3866 case AArch64::ST1H_S_IMM:
3867 case AArch64::ST1W_D_IMM:
3868 case AArch64::LDNF1B_H_IMM:
3869 case AArch64::LDNF1SB_H_IMM:
3870 case AArch64::LDNF1H_S_IMM:
3871 case AArch64::LDNF1SH_S_IMM:
3872 case AArch64::LDNF1W_D_IMM:
3873 case AArch64::LDNF1SW_D_IMM:
3874    // A half vector's worth of data
3875 // Width = mbytes * elements
3876 Scale = TypeSize::getScalable(MinimumSize: 8);
3877 Width = TypeSize::getScalable(MinimumSize: 8);
3878 MinOffset = -8;
3879 MaxOffset = 7;
3880 break;
3881 case AArch64::LD1B_S_IMM:
3882 case AArch64::LD1SB_S_IMM:
3883 case AArch64::LD1H_D_IMM:
3884 case AArch64::LD1SH_D_IMM:
3885 case AArch64::ST1B_S_IMM:
3886 case AArch64::ST1H_D_IMM:
3887 case AArch64::LDNF1B_S_IMM:
3888 case AArch64::LDNF1SB_S_IMM:
3889 case AArch64::LDNF1H_D_IMM:
3890 case AArch64::LDNF1SH_D_IMM:
3891    // A quarter vector's worth of data
3892 // Width = mbytes * elements
3893 Scale = TypeSize::getScalable(MinimumSize: 4);
3894 Width = TypeSize::getScalable(MinimumSize: 4);
3895 MinOffset = -8;
3896 MaxOffset = 7;
3897 break;
3898 case AArch64::LD1B_D_IMM:
3899 case AArch64::LD1SB_D_IMM:
3900 case AArch64::ST1B_D_IMM:
3901 case AArch64::LDNF1B_D_IMM:
3902 case AArch64::LDNF1SB_D_IMM:
3903    // An eighth vector's worth of data
3904 // Width = mbytes * elements
3905 Scale = TypeSize::getScalable(MinimumSize: 2);
3906 Width = TypeSize::getScalable(MinimumSize: 2);
3907 MinOffset = -8;
3908 MaxOffset = 7;
3909 break;
3910 case AArch64::ST2Gi:
3911 case AArch64::STZ2Gi:
3912 Scale = TypeSize::getFixed(ExactSize: 16);
3913 Width = TypeSize::getFixed(ExactSize: 32);
3914 MinOffset = -256;
3915 MaxOffset = 255;
3916 break;
3917 case AArch64::STGPi:
3918 Scale = TypeSize::getFixed(ExactSize: 16);
3919 Width = TypeSize::getFixed(ExactSize: 16);
3920 MinOffset = -64;
3921 MaxOffset = 63;
3922 break;
3923 case AArch64::LD1RB_IMM:
3924 case AArch64::LD1RB_H_IMM:
3925 case AArch64::LD1RB_S_IMM:
3926 case AArch64::LD1RB_D_IMM:
3927 case AArch64::LD1RSB_H_IMM:
3928 case AArch64::LD1RSB_S_IMM:
3929 case AArch64::LD1RSB_D_IMM:
3930 Scale = TypeSize::getFixed(ExactSize: 1);
3931 Width = TypeSize::getFixed(ExactSize: 1);
3932 MinOffset = 0;
3933 MaxOffset = 63;
3934 break;
3935 case AArch64::LD1RH_IMM:
3936 case AArch64::LD1RH_S_IMM:
3937 case AArch64::LD1RH_D_IMM:
3938 case AArch64::LD1RSH_S_IMM:
3939 case AArch64::LD1RSH_D_IMM:
3940 Scale = TypeSize::getFixed(ExactSize: 2);
3941 Width = TypeSize::getFixed(ExactSize: 2);
3942 MinOffset = 0;
3943 MaxOffset = 63;
3944 break;
3945 case AArch64::LD1RW_IMM:
3946 case AArch64::LD1RW_D_IMM:
3947 case AArch64::LD1RSW_IMM:
3948 Scale = TypeSize::getFixed(ExactSize: 4);
3949 Width = TypeSize::getFixed(ExactSize: 4);
3950 MinOffset = 0;
3951 MaxOffset = 63;
3952 break;
3953 case AArch64::LD1RD_IMM:
3954 Scale = TypeSize::getFixed(ExactSize: 8);
3955 Width = TypeSize::getFixed(ExactSize: 8);
3956 MinOffset = 0;
3957 MaxOffset = 63;
3958 break;
3959 }
3960
3961 return true;
3962}
3963
3964// Memory access size in bytes per transferred register; this is also the
// scaling factor applied to the byte offsets of the unscaled forms.
3965int AArch64InstrInfo::getMemScale(unsigned Opc) {
3966 switch (Opc) {
3967 default:
3968 llvm_unreachable("Opcode has unknown scale!");
3969 case AArch64::LDRBBui:
3970 case AArch64::LDURBBi:
3971 case AArch64::LDRSBWui:
3972 case AArch64::LDURSBWi:
3973 case AArch64::STRBBui:
3974 case AArch64::STURBBi:
3975 return 1;
3976 case AArch64::LDRHHui:
3977 case AArch64::LDURHHi:
3978 case AArch64::LDRSHWui:
3979 case AArch64::LDURSHWi:
3980 case AArch64::STRHHui:
3981 case AArch64::STURHHi:
3982 return 2;
3983 case AArch64::LDRSui:
3984 case AArch64::LDURSi:
3985 case AArch64::LDRSpre:
3986 case AArch64::LDRSWui:
3987 case AArch64::LDURSWi:
3988 case AArch64::LDRSWpre:
3989 case AArch64::LDRWpre:
3990 case AArch64::LDRWui:
3991 case AArch64::LDURWi:
3992 case AArch64::STRSui:
3993 case AArch64::STURSi:
3994 case AArch64::STRSpre:
3995 case AArch64::STRWui:
3996 case AArch64::STURWi:
3997 case AArch64::STRWpre:
3998 case AArch64::LDPSi:
3999 case AArch64::LDPSWi:
4000 case AArch64::LDPWi:
4001 case AArch64::STPSi:
4002 case AArch64::STPWi:
4003 return 4;
4004 case AArch64::LDRDui:
4005 case AArch64::LDURDi:
4006 case AArch64::LDRDpre:
4007 case AArch64::LDRXui:
4008 case AArch64::LDURXi:
4009 case AArch64::LDRXpre:
4010 case AArch64::STRDui:
4011 case AArch64::STURDi:
4012 case AArch64::STRDpre:
4013 case AArch64::STRXui:
4014 case AArch64::STURXi:
4015 case AArch64::STRXpre:
4016 case AArch64::LDPDi:
4017 case AArch64::LDPXi:
4018 case AArch64::STPDi:
4019 case AArch64::STPXi:
4020 return 8;
4021 case AArch64::LDRQui:
4022 case AArch64::LDURQi:
4023 case AArch64::STRQui:
4024 case AArch64::STURQi:
4025 case AArch64::STRQpre:
4026 case AArch64::LDPQi:
4027 case AArch64::LDRQpre:
4028 case AArch64::STPQi:
4029 case AArch64::STGi:
4030 case AArch64::STZGi:
4031 case AArch64::ST2Gi:
4032 case AArch64::STZ2Gi:
4033 case AArch64::STGPi:
4034 return 16;
4035 }
4036}
4037
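// Return true if MI is a pre-indexed (writeback) load.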
4038bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) {
4039 switch (MI.getOpcode()) {
4040 default:
4041 return false;
4042 case AArch64::LDRWpre:
4043 case AArch64::LDRXpre:
4044 case AArch64::LDRSWpre:
4045 case AArch64::LDRSpre:
4046 case AArch64::LDRDpre:
4047 case AArch64::LDRQpre:
4048 return true;
4049 }
4050}
4051
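// Return true if MI is a pre-indexed (writeback) store.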
4052bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) {
4053 switch (MI.getOpcode()) {
4054 default:
4055 return false;
4056 case AArch64::STRWpre:
4057 case AArch64::STRXpre:
4058 case AArch64::STRSpre:
4059 case AArch64::STRDpre:
4060 case AArch64::STRQpre:
4061 return true;
4062 }
4063}
4064
4065bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) {
4066 return isPreLd(MI) || isPreSt(MI);
4067}
4068
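// Return true if MI is a paired load/store (LDP/STP/STGP).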
4069bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
4070 switch (MI.getOpcode()) {
4071 default:
4072 return false;
4073 case AArch64::LDPSi:
4074 case AArch64::LDPSWi:
4075 case AArch64::LDPDi:
4076 case AArch64::LDPQi:
4077 case AArch64::LDPWi:
4078 case AArch64::LDPXi:
4079 case AArch64::STPSi:
4080 case AArch64::STPDi:
4081 case AArch64::STPQi:
4082 case AArch64::STPWi:
4083 case AArch64::STPXi:
4084 case AArch64::STGPi:
4085 return true;
4086 }
4087}
4088
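// Return the base register operand of a load/store: operand 2 for paired and
// pre-indexed forms, operand 1 otherwise.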
4089const MachineOperand &AArch64InstrInfo::getLdStBaseOp(const MachineInstr &MI) {
4090 unsigned Idx =
4091 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
4092 : 1;
4093 return MI.getOperand(i: Idx);
4094}
4095
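// Return the immediate offset operand of a load/store: operand 3 for paired
// and pre-indexed forms, operand 2 otherwise.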
4096const MachineOperand &
4097AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) {
4098 unsigned Idx =
4099 AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
4100 : 2;
4101 return MI.getOperand(i: Idx);
4102}
4103
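// Return the register class assigned to the virtual register Reg in the
// enclosing function, or nullptr if it cannot be determined.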
4104static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
4105 Register Reg) {
4106 if (MI.getParent() == nullptr)
4107 return nullptr;
4108 const MachineFunction *MF = MI.getParent()->getParent();
4109 return MF ? MF->getRegInfo().getRegClassOrNull(Reg) : nullptr;
4110}
4111
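// Return true if any operand of MI is a 16-bit FPR (H register).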
4112bool AArch64InstrInfo::isHForm(const MachineInstr &MI) {
4113 auto IsHFPR = [&](const MachineOperand &Op) {
4114 if (!Op.isReg())
4115 return false;
4116 auto Reg = Op.getReg();
4117 if (Reg.isPhysical())
4118 return AArch64::FPR16RegClass.contains(Reg);
4119 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4120 return TRC == &AArch64::FPR16RegClass ||
4121 TRC == &AArch64::FPR16_loRegClass;
4122 };
4123 return llvm::any_of(Range: MI.operands(), P: IsHFPR);
4124}
4125
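// Return true if any operand of MI is a 128-bit FPR (Q register).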
4126bool AArch64InstrInfo::isQForm(const MachineInstr &MI) {
4127 auto IsQFPR = [&](const MachineOperand &Op) {
4128 if (!Op.isReg())
4129 return false;
4130 auto Reg = Op.getReg();
4131 if (Reg.isPhysical())
4132 return AArch64::FPR128RegClass.contains(Reg);
4133 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4134 return TRC == &AArch64::FPR128RegClass ||
4135 TRC == &AArch64::FPR128_loRegClass;
4136 };
4137 return llvm::any_of(Range: MI.operands(), P: IsQFPR);
4138}
4139
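// Return true if MI provides BTI landing-pad semantics: explicit BTI hints,
// PACI(A|B)SP (directly, as a HINT alias, or via the PAUTH_PROLOGUE pseudo),
// and trapping instructions such as BRK and HLT.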
4140bool AArch64InstrInfo::hasBTISemantics(const MachineInstr &MI) {
4141 switch (MI.getOpcode()) {
4142 case AArch64::BRK:
4143 case AArch64::HLT:
4144 case AArch64::PACIASP:
4145 case AArch64::PACIBSP:
4146 // Implicit BTI behavior.
4147 return true;
4148 case AArch64::PAUTH_PROLOGUE:
4149 // PAUTH_PROLOGUE expands to PACI(A|B)SP.
4150 return true;
4151 case AArch64::HINT: {
4152 unsigned Imm = MI.getOperand(i: 0).getImm();
4153 // Explicit BTI instruction.
4154 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38)
4155 return true;
4156 // PACI(A|B)SP instructions.
4157 if (Imm == 25 || Imm == 27)
4158 return true;
4159 return false;
4160 }
4161 default:
4162 return false;
4163 }
4164}
4165
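// Return true if any operand of MI is an FP/NEON register (B, H, S, D or Q
// form).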
4166bool AArch64InstrInfo::isFpOrNEON(const MachineInstr &MI) {
4167 auto IsFPR = [&](const MachineOperand &Op) {
4168 if (!Op.isReg())
4169 return false;
4170 auto Reg = Op.getReg();
4171 if (Reg.isPhysical())
4172 return AArch64::FPR128RegClass.contains(Reg) ||
4173 AArch64::FPR64RegClass.contains(Reg) ||
4174 AArch64::FPR32RegClass.contains(Reg) ||
4175 AArch64::FPR16RegClass.contains(Reg) ||
4176 AArch64::FPR8RegClass.contains(Reg);
4177
4178 const TargetRegisterClass *TRC = ::getRegClass(MI, Reg);
4179 return TRC == &AArch64::FPR128RegClass ||
4180 TRC == &AArch64::FPR128_loRegClass ||
4181 TRC == &AArch64::FPR64RegClass ||
4182 TRC == &AArch64::FPR64_loRegClass ||
4183 TRC == &AArch64::FPR32RegClass || TRC == &AArch64::FPR16RegClass ||
4184 TRC == &AArch64::FPR8RegClass;
4185 };
4186 return llvm::any_of(Range: MI.operands(), P: IsFPR);
4187}
4188
4189// Scale the unscaled offset. Returns false if the unscaled offset can't be
4190// scaled.
4191static bool scaleOffset(unsigned Opc, int64_t &Offset) {
4192 int Scale = AArch64InstrInfo::getMemScale(Opc);
4193
4194 // If the byte-offset isn't a multiple of the stride, we can't scale this
4195 // offset.
4196 if (Offset % Scale != 0)
4197 return false;
4198
4199 // Convert the byte-offset used by unscaled into an "element" offset used
4200 // by the scaled pair load/store instructions.
4201 Offset /= Scale;
4202 return true;
4203}
4204
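// Return true if the two load/store opcodes may form an LDP/STP pair:
// identical opcodes, the scaled and unscaled variants of the same access, or
// a mix of 32-bit zero- and sign-extending loads.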
4205static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
4206 if (FirstOpc == SecondOpc)
4207 return true;
4208 // We can also pair sign-ext and zero-ext instructions.
4209 switch (FirstOpc) {
4210 default:
4211 return false;
4212 case AArch64::STRSui:
4213 case AArch64::STURSi:
4214 return SecondOpc == AArch64::STRSui || SecondOpc == AArch64::STURSi;
4215 case AArch64::STRDui:
4216 case AArch64::STURDi:
4217 return SecondOpc == AArch64::STRDui || SecondOpc == AArch64::STURDi;
4218 case AArch64::STRQui:
4219 case AArch64::STURQi:
4220 return SecondOpc == AArch64::STRQui || SecondOpc == AArch64::STURQi;
4221 case AArch64::STRWui:
4222 case AArch64::STURWi:
4223 return SecondOpc == AArch64::STRWui || SecondOpc == AArch64::STURWi;
4224 case AArch64::STRXui:
4225 case AArch64::STURXi:
4226 return SecondOpc == AArch64::STRXui || SecondOpc == AArch64::STURXi;
4227 case AArch64::LDRSui:
4228 case AArch64::LDURSi:
4229 return SecondOpc == AArch64::LDRSui || SecondOpc == AArch64::LDURSi;
4230 case AArch64::LDRDui:
4231 case AArch64::LDURDi:
4232 return SecondOpc == AArch64::LDRDui || SecondOpc == AArch64::LDURDi;
4233 case AArch64::LDRQui:
4234 case AArch64::LDURQi:
4235 return SecondOpc == AArch64::LDRQui || SecondOpc == AArch64::LDURQi;
4236 case AArch64::LDRWui:
4237 case AArch64::LDURWi:
4238 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
4239 case AArch64::LDRSWui:
4240 case AArch64::LDURSWi:
4241 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
4242 case AArch64::LDRXui:
4243 case AArch64::LDURXi:
4244 return SecondOpc == AArch64::LDRXui || SecondOpc == AArch64::LDURXi;
4245 }
4246 // These instructions can't be paired based on their opcodes.
4247 return false;
4248}
4249
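// Decide whether two frame-index based memory operations address adjacent
// slots and may therefore be clustered; fixed stack objects are compared by
// their scaled object offsets plus the instruction offsets.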
4250static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
4251 int64_t Offset1, unsigned Opcode1, int FI2,
4252 int64_t Offset2, unsigned Opcode2) {
4253 // Accesses through fixed stack object frame indices may access a different
4254 // fixed stack slot. Check that the object offsets + offsets match.
4255 if (MFI.isFixedObjectIndex(ObjectIdx: FI1) && MFI.isFixedObjectIndex(ObjectIdx: FI2)) {
4256 int64_t ObjectOffset1 = MFI.getObjectOffset(ObjectIdx: FI1);
4257 int64_t ObjectOffset2 = MFI.getObjectOffset(ObjectIdx: FI2);
4258 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
4259 // Convert to scaled object offsets.
4260 int Scale1 = AArch64InstrInfo::getMemScale(Opc: Opcode1);
4261 if (ObjectOffset1 % Scale1 != 0)
4262 return false;
4263 ObjectOffset1 /= Scale1;
4264 int Scale2 = AArch64InstrInfo::getMemScale(Opc: Opcode2);
4265 if (ObjectOffset2 % Scale2 != 0)
4266 return false;
4267 ObjectOffset2 /= Scale2;
4268 ObjectOffset1 += Offset1;
4269 ObjectOffset2 += Offset2;
4270 return ObjectOffset1 + 1 == ObjectOffset2;
4271 }
4272
4273 return FI1 == FI2;
4274}
4275
4276/// Detect opportunities for ldp/stp formation.
4277///
4278/// Only called for LdSt for which getMemOperandWithOffset returns true.
4279bool AArch64InstrInfo::shouldClusterMemOps(
4280 ArrayRef<const MachineOperand *> BaseOps1, int64_t OpOffset1,
4281 bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2,
4282 int64_t OpOffset2, bool OffsetIsScalable2, unsigned ClusterSize,
4283 unsigned NumBytes) const {
4284 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
4285 const MachineOperand &BaseOp1 = *BaseOps1.front();
4286 const MachineOperand &BaseOp2 = *BaseOps2.front();
4287 const MachineInstr &FirstLdSt = *BaseOp1.getParent();
4288 const MachineInstr &SecondLdSt = *BaseOp2.getParent();
4289 if (BaseOp1.getType() != BaseOp2.getType())
4290 return false;
4291
4292 assert((BaseOp1.isReg() || BaseOp1.isFI()) &&
4293 "Only base registers and frame indices are supported.");
4294
4295 // Check for both base regs and base FI.
4296 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg())
4297 return false;
4298
4299 // Only cluster up to a single pair.
4300 if (ClusterSize > 2)
4301 return false;
4302
4303 if (!isPairableLdStInst(MI: FirstLdSt) || !isPairableLdStInst(MI: SecondLdSt))
4304 return false;
4305
4306 // Can we pair these instructions based on their opcodes?
4307 unsigned FirstOpc = FirstLdSt.getOpcode();
4308 unsigned SecondOpc = SecondLdSt.getOpcode();
4309 if (!canPairLdStOpc(FirstOpc, SecondOpc))
4310 return false;
4311
4312 // Can't merge volatiles or load/stores that have a hint to avoid pair
4313 // formation, for example.
4314 if (!isCandidateToMergeOrPair(MI: FirstLdSt) ||
4315 !isCandidateToMergeOrPair(MI: SecondLdSt))
4316 return false;
4317
4318 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate.
4319 int64_t Offset1 = FirstLdSt.getOperand(i: 2).getImm();
4320 if (hasUnscaledLdStOffset(Opc: FirstOpc) && !scaleOffset(Opc: FirstOpc, Offset&: Offset1))
4321 return false;
4322
4323 int64_t Offset2 = SecondLdSt.getOperand(i: 2).getImm();
4324 if (hasUnscaledLdStOffset(Opc: SecondOpc) && !scaleOffset(Opc: SecondOpc, Offset&: Offset2))
4325 return false;
4326
4327 // Pairwise instructions have a 7-bit signed offset field.
4328 if (Offset1 > 63 || Offset1 < -64)
4329 return false;
4330
4331  // The caller should already have ordered First/SecondLdSt by offset.
4332  // Note: this does not necessarily hold for non-equal frame index bases.
4333 if (BaseOp1.isFI()) {
4334 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
4335 "Caller should have ordered offsets.");
4336
4337 const MachineFrameInfo &MFI =
4338 FirstLdSt.getParent()->getParent()->getFrameInfo();
4339 return shouldClusterFI(MFI, FI1: BaseOp1.getIndex(), Offset1, Opcode1: FirstOpc,
4340 FI2: BaseOp2.getIndex(), Offset2, Opcode2: SecondOpc);
4341 }
4342
4343 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
4344
4345 return Offset1 + 1 == Offset2;
4346}
4347
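// Append Reg (restricted to SubIdx, if any) to MIB with the given register
// state; physical registers are narrowed to the concrete sub-register, while
// virtual registers keep the sub-register index on the operand.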
4348static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
4349 unsigned Reg, unsigned SubIdx,
4350 unsigned State,
4351 const TargetRegisterInfo *TRI) {
4352 if (!SubIdx)
4353 return MIB.addReg(RegNo: Reg, flags: State);
4354
4355 if (Register::isPhysicalRegister(Reg))
4356 return MIB.addReg(RegNo: TRI->getSubReg(Reg, Idx: SubIdx), flags: State);
4357 return MIB.addReg(RegNo: Reg, flags: State, SubReg: SubIdx);
4358}
4359
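// Return true if copying a register tuple in increasing sub-register order
// would overwrite a source sub-register before it has been read, i.e. the
// destination range overlaps the source range "ahead" of it (register
// encodings wrap mod 32).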
4360static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
4361 unsigned NumRegs) {
4362  // We really want the positive remainder mod 32 here; that happens to be
4363  // easily obtainable with a mask.
4364 return ((DestReg - SrcReg) & 0x1f) < NumRegs;
4365}
4366
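// Copy a NEON/SVE register tuple sub-register by sub-register using the given
// ORR opcode, iterating in reverse when a forward copy would clobber
// not-yet-read source sub-registers.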
4367void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
4368 MachineBasicBlock::iterator I,
4369 const DebugLoc &DL, MCRegister DestReg,
4370 MCRegister SrcReg, bool KillSrc,
4371 unsigned Opcode,
4372 ArrayRef<unsigned> Indices) const {
4373 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
4374 const TargetRegisterInfo *TRI = &getRegisterInfo();
4375 uint16_t DestEncoding = TRI->getEncodingValue(RegNo: DestReg);
4376 uint16_t SrcEncoding = TRI->getEncodingValue(RegNo: SrcReg);
4377 unsigned NumRegs = Indices.size();
4378
4379 int SubReg = 0, End = NumRegs, Incr = 1;
4380 if (forwardCopyWillClobberTuple(DestReg: DestEncoding, SrcReg: SrcEncoding, NumRegs)) {
4381 SubReg = NumRegs - 1;
4382 End = -1;
4383 Incr = -1;
4384 }
4385
4386 for (; SubReg != End; SubReg += Incr) {
4387 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4388 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
4389 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: 0, TRI);
4390 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
4391 }
4392}
4393
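// Copy a GPR register sequence (X/W pair) sub-register by sub-register using
// ORR with the zero register; such sequences are aligned and cannot partially
// overlap, so a forward copy is always safe.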
4394void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
4395 MachineBasicBlock::iterator I,
4396 DebugLoc DL, unsigned DestReg,
4397 unsigned SrcReg, bool KillSrc,
4398 unsigned Opcode, unsigned ZeroReg,
4399 llvm::ArrayRef<unsigned> Indices) const {
4400 const TargetRegisterInfo *TRI = &getRegisterInfo();
4401 unsigned NumRegs = Indices.size();
4402
4403#ifndef NDEBUG
4404 uint16_t DestEncoding = TRI->getEncodingValue(RegNo: DestReg);
4405 uint16_t SrcEncoding = TRI->getEncodingValue(RegNo: SrcReg);
4406 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 &&
4407 "GPR reg sequences should not be able to overlap");
4408#endif
4409
4410 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) {
4411 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode));
4412 AddSubReg(MIB, Reg: DestReg, SubIdx: Indices[SubReg], State: RegState::Define, TRI);
4413 MIB.addReg(RegNo: ZeroReg);
4414 AddSubReg(MIB, Reg: SrcReg, SubIdx: Indices[SubReg], State: getKillRegState(B: KillSrc), TRI);
4415 MIB.addImm(Val: 0);
4416 }
4417}
4418
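// Emit a register-to-register copy, selecting ORR, FMOV, ADD #0, MOVZ,
// MSR/MRS, or a per-sub-register tuple copy depending on the source and
// destination register classes.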
4419void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4420 MachineBasicBlock::iterator I,
4421 const DebugLoc &DL, MCRegister DestReg,
4422 MCRegister SrcReg, bool KillSrc) const {
4423 if (AArch64::GPR32spRegClass.contains(DestReg) &&
4424 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
4425 const TargetRegisterInfo *TRI = &getRegisterInfo();
4426
4427 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
4428 // If either operand is WSP, expand to ADD #0.
4429 if (Subtarget.hasZeroCycleRegMove()) {
4430 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
4431 MCRegister DestRegX = TRI->getMatchingSuperReg(
4432 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4433 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4434 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4435 // This instruction is reading and writing X registers. This may upset
4436 // the register scavenger and machine verifier, so we need to indicate
4437 // that we are reading an undefined value from SrcRegX, but a proper
4438 // value from SrcReg.
4439 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
4440 .addReg(SrcRegX, RegState::Undef)
4441 .addImm(0)
4442 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
4443 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4444 } else {
4445 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
4446 .addReg(SrcReg, getKillRegState(KillSrc))
4447 .addImm(0)
4448 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4449 }
4450 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
4451 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
4452 .addImm(0)
4453 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4454 } else {
4455 if (Subtarget.hasZeroCycleRegMove()) {
4456 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
4457 MCRegister DestRegX = TRI->getMatchingSuperReg(
4458 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4459 MCRegister SrcRegX = TRI->getMatchingSuperReg(
4460 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
4461 // This instruction is reading and writing X registers. This may upset
4462 // the register scavenger and machine verifier, so we need to indicate
4463 // that we are reading an undefined value from SrcRegX, but a proper
4464 // value from SrcReg.
4465 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
4466 .addReg(AArch64::XZR)
4467 .addReg(SrcRegX, RegState::Undef)
4468 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
4469 } else {
4470 // Otherwise, expand to ORR WZR.
4471 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
4472 .addReg(AArch64::WZR)
4473 .addReg(SrcReg, getKillRegState(KillSrc));
4474 }
4475 }
4476 return;
4477 }
4478
4479 // Copy a Predicate register by ORRing with itself.
4480 if (AArch64::PPRRegClass.contains(DestReg) &&
4481 AArch64::PPRRegClass.contains(SrcReg)) {
4482 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4483 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg)
4484 .addReg(SrcReg) // Pg
4485 .addReg(SrcReg)
4486 .addReg(SrcReg, getKillRegState(KillSrc));
4487 return;
4488 }
4489
4490 // Copy a predicate-as-counter register by ORRing with itself as if it
4491 // were a regular predicate (mask) register.
4492 bool DestIsPNR = AArch64::PNRRegClass.contains(DestReg);
4493 bool SrcIsPNR = AArch64::PNRRegClass.contains(SrcReg);
4494 if (DestIsPNR || SrcIsPNR) {
4495 assert((Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4496 "Unexpected predicate-as-counter register.");
4497 auto ToPPR = [](MCRegister R) -> MCRegister {
4498 return (R - AArch64::PN0) + AArch64::P0;
4499 };
4500 MCRegister PPRSrcReg = SrcIsPNR ? ToPPR(SrcReg) : SrcReg;
4501 MCRegister PPRDestReg = DestIsPNR ? ToPPR(DestReg) : DestReg;
4502
4503 if (PPRSrcReg != PPRDestReg) {
4504 auto NewMI = BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), PPRDestReg)
4505 .addReg(PPRSrcReg) // Pg
4506 .addReg(PPRSrcReg)
4507 .addReg(PPRSrcReg, getKillRegState(KillSrc));
4508 if (DestIsPNR)
4509 NewMI.addDef(DestReg, RegState::Implicit);
4510 }
4511 return;
4512 }
4513
4514 // Copy a Z register by ORRing with itself.
4515 if (AArch64::ZPRRegClass.contains(DestReg) &&
4516 AArch64::ZPRRegClass.contains(SrcReg)) {
4517 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4518 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg)
4519 .addReg(SrcReg)
4520 .addReg(SrcReg, getKillRegState(KillSrc));
4521 return;
4522 }
4523
4524 // Copy a Z register pair by copying the individual sub-registers.
4525 if ((AArch64::ZPR2RegClass.contains(DestReg) ||
4526 AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
4527 (AArch64::ZPR2RegClass.contains(SrcReg) ||
4528 AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
4529 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4530 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
4531 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4532 Indices);
4533 return;
4534 }
4535
4536 // Copy a Z register triple by copying the individual sub-registers.
4537 if (AArch64::ZPR3RegClass.contains(DestReg) &&
4538 AArch64::ZPR3RegClass.contains(SrcReg)) {
4539 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4540 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4541 AArch64::zsub2};
4542 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4543 Indices);
4544 return;
4545 }
4546
4547 // Copy a Z register quad by copying the individual sub-registers.
4548 if ((AArch64::ZPR4RegClass.contains(DestReg) ||
4549 AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
4550 (AArch64::ZPR4RegClass.contains(SrcReg) ||
4551 AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
4552 assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
4553 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
4554 AArch64::zsub2, AArch64::zsub3};
4555 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
4556 Indices);
4557 return;
4558 }
4559
4560 if (AArch64::GPR64spRegClass.contains(DestReg) &&
4561 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
4562 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
4563 // If either operand is SP, expand to ADD #0.
4564 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
4565 .addReg(SrcReg, getKillRegState(KillSrc))
4566 .addImm(0)
4567 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4568 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
4569 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
4570 .addImm(0)
4571 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
4572 } else {
4573 // Otherwise, expand to ORR XZR.
4574 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
4575 .addReg(AArch64::XZR)
4576 .addReg(SrcReg, getKillRegState(KillSrc));
4577 }
4578 return;
4579 }
4580
4581 // Copy a DDDD register quad by copying the individual sub-registers.
4582 if (AArch64::DDDDRegClass.contains(DestReg) &&
4583 AArch64::DDDDRegClass.contains(SrcReg)) {
4584 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4585 AArch64::dsub2, AArch64::dsub3};
4586 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4587 Indices);
4588 return;
4589 }
4590
4591 // Copy a DDD register triple by copying the individual sub-registers.
4592 if (AArch64::DDDRegClass.contains(DestReg) &&
4593 AArch64::DDDRegClass.contains(SrcReg)) {
4594 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1,
4595 AArch64::dsub2};
4596 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4597 Indices);
4598 return;
4599 }
4600
4601 // Copy a DD register pair by copying the individual sub-registers.
4602 if (AArch64::DDRegClass.contains(DestReg) &&
4603 AArch64::DDRegClass.contains(SrcReg)) {
4604 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1};
4605 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
4606 Indices);
4607 return;
4608 }
4609
4610 // Copy a QQQQ register quad by copying the individual sub-registers.
4611 if (AArch64::QQQQRegClass.contains(DestReg) &&
4612 AArch64::QQQQRegClass.contains(SrcReg)) {
4613 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4614 AArch64::qsub2, AArch64::qsub3};
4615 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4616 Indices);
4617 return;
4618 }
4619
4620 // Copy a QQQ register triple by copying the individual sub-registers.
4621 if (AArch64::QQQRegClass.contains(DestReg) &&
4622 AArch64::QQQRegClass.contains(SrcReg)) {
4623 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1,
4624 AArch64::qsub2};
4625 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4626 Indices);
4627 return;
4628 }
4629
4630 // Copy a QQ register pair by copying the individual sub-registers.
4631 if (AArch64::QQRegClass.contains(DestReg) &&
4632 AArch64::QQRegClass.contains(SrcReg)) {
4633 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1};
4634 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
4635 Indices);
4636 return;
4637 }
4638
4639 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) &&
4640 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) {
4641 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64};
4642 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs,
4643 AArch64::XZR, Indices);
4644 return;
4645 }
4646
4647 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) &&
4648 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) {
4649 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32};
4650 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs,
4651 AArch64::WZR, Indices);
4652 return;
4653 }
4654
4655 if (AArch64::FPR128RegClass.contains(DestReg) &&
4656 AArch64::FPR128RegClass.contains(SrcReg)) {
4657 if (Subtarget.hasSVEorSME() && !Subtarget.isNeonAvailable())
4658 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ))
4659 .addReg(AArch64::Z0 + (DestReg - AArch64::Q0), RegState::Define)
4660 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0))
4661 .addReg(AArch64::Z0 + (SrcReg - AArch64::Q0));
4662 else if (Subtarget.hasNEON())
4663 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
4664 .addReg(SrcReg)
4665 .addReg(SrcReg, getKillRegState(KillSrc));
4666 else {
4667 BuildMI(MBB, I, DL, get(AArch64::STRQpre))
4668 .addReg(AArch64::SP, RegState::Define)
4669 .addReg(SrcReg, getKillRegState(KillSrc))
4670 .addReg(AArch64::SP)
4671 .addImm(-16);
4672 BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
4673 .addReg(AArch64::SP, RegState::Define)
4674 .addReg(DestReg, RegState::Define)
4675 .addReg(AArch64::SP)
4676 .addImm(16);
4677 }
4678 return;
4679 }
4680
4681 if (AArch64::FPR64RegClass.contains(DestReg) &&
4682 AArch64::FPR64RegClass.contains(SrcReg)) {
4683 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
4684 .addReg(SrcReg, getKillRegState(KillSrc));
4685 return;
4686 }
4687
4688 if (AArch64::FPR32RegClass.contains(DestReg) &&
4689 AArch64::FPR32RegClass.contains(SrcReg)) {
4690 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4691 .addReg(SrcReg, getKillRegState(KillSrc));
4692 return;
4693 }
4694
4695 if (AArch64::FPR16RegClass.contains(DestReg) &&
4696 AArch64::FPR16RegClass.contains(SrcReg)) {
4697 DestReg =
4698 RI.getMatchingSuperReg(DestReg, AArch64::hsub, &AArch64::FPR32RegClass);
4699 SrcReg =
4700 RI.getMatchingSuperReg(SrcReg, AArch64::hsub, &AArch64::FPR32RegClass);
4701 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4702 .addReg(SrcReg, getKillRegState(KillSrc));
4703 return;
4704 }
4705
4706 if (AArch64::FPR8RegClass.contains(DestReg) &&
4707 AArch64::FPR8RegClass.contains(SrcReg)) {
4708 DestReg =
4709 RI.getMatchingSuperReg(DestReg, AArch64::bsub, &AArch64::FPR32RegClass);
4710 SrcReg =
4711 RI.getMatchingSuperReg(SrcReg, AArch64::bsub, &AArch64::FPR32RegClass);
4712 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
4713 .addReg(SrcReg, getKillRegState(KillSrc));
4714 return;
4715 }
4716
4717 // Copies between GPR64 and FPR64.
4718 if (AArch64::FPR64RegClass.contains(DestReg) &&
4719 AArch64::GPR64RegClass.contains(SrcReg)) {
4720 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
4721 .addReg(SrcReg, getKillRegState(KillSrc));
4722 return;
4723 }
4724 if (AArch64::GPR64RegClass.contains(DestReg) &&
4725 AArch64::FPR64RegClass.contains(SrcReg)) {
4726 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
4727 .addReg(SrcReg, getKillRegState(KillSrc));
4728 return;
4729 }
4730 // Copies between GPR32 and FPR32.
4731 if (AArch64::FPR32RegClass.contains(DestReg) &&
4732 AArch64::GPR32RegClass.contains(SrcReg)) {
4733 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
4734 .addReg(SrcReg, getKillRegState(KillSrc));
4735 return;
4736 }
4737 if (AArch64::GPR32RegClass.contains(DestReg) &&
4738 AArch64::FPR32RegClass.contains(SrcReg)) {
4739 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
4740 .addReg(SrcReg, getKillRegState(KillSrc));
4741 return;
4742 }
4743
4744 if (DestReg == AArch64::NZCV) {
4745 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
4746 BuildMI(MBB, I, DL, get(AArch64::MSR))
4747 .addImm(AArch64SysReg::NZCV)
4748 .addReg(SrcReg, getKillRegState(KillSrc))
4749 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
4750 return;
4751 }
4752
4753 if (SrcReg == AArch64::NZCV) {
4754 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
4755 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg)
4756 .addImm(AArch64SysReg::NZCV)
4757 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
4758 return;
4759 }
4760
4761#ifndef NDEBUG
4762 const TargetRegisterInfo &TRI = getRegisterInfo();
4763 errs() << TRI.getRegAsmName(Reg: DestReg) << " = COPY "
4764 << TRI.getRegAsmName(Reg: SrcReg) << "\n";
4765#endif
4766 llvm_unreachable("unimplemented reg-to-reg copy");
4767}
4768
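// Emit a store-pair (e.g. STPWi/STPXi) spilling both halves of the register
// sequence SrcReg to frame index FI.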
4769static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
4770 MachineBasicBlock &MBB,
4771 MachineBasicBlock::iterator InsertBefore,
4772 const MCInstrDesc &MCID,
4773 Register SrcReg, bool IsKill,
4774 unsigned SubIdx0, unsigned SubIdx1, int FI,
4775 MachineMemOperand *MMO) {
4776 Register SrcReg0 = SrcReg;
4777 Register SrcReg1 = SrcReg;
4778 if (SrcReg.isPhysical()) {
4779 SrcReg0 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx0);
4780 SubIdx0 = 0;
4781 SrcReg1 = TRI.getSubReg(Reg: SrcReg, Idx: SubIdx1);
4782 SubIdx1 = 0;
4783 }
4784 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
4785 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: IsKill), SubReg: SubIdx0)
4786 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: IsKill), SubReg: SubIdx1)
4787 .addFrameIndex(Idx: FI)
4788 .addImm(Val: 0)
4789 .addMemOperand(MMO);
4790}
4791
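// Spill SrcReg of register class RC to stack slot FI, choosing a scaled
// store, an ST1 multi-vector store, an SVE STR, or a store-pair depending on
// the class; scalable classes also switch the slot to the ScalableVector
// stack ID.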
4792void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
4793 MachineBasicBlock::iterator MBBI,
4794 Register SrcReg, bool isKill, int FI,
4795 const TargetRegisterClass *RC,
4796 const TargetRegisterInfo *TRI,
4797 Register VReg) const {
4798 MachineFunction &MF = *MBB.getParent();
4799 MachineFrameInfo &MFI = MF.getFrameInfo();
4800
4801 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4802 MachineMemOperand *MMO =
4803 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOStore,
4804 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
4805 unsigned Opc = 0;
4806 bool Offset = true;
4807 MCRegister PNRReg = MCRegister::NoRegister;
4808 unsigned StackID = TargetStackID::Default;
4809 switch (TRI->getSpillSize(RC: *RC)) {
4810 case 1:
4811 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4812 Opc = AArch64::STRBui;
4813 break;
4814 case 2: {
4815 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
4816 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4817 Opc = AArch64::STRHui;
4818 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
4819 assert(Subtarget.hasSVEorSME() &&
4820 "Unexpected register store without SVE store instructions");
4821 assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4822 "Unexpected register store without SVE2p1 or SME2");
4823 Opc = AArch64::STR_PXI;
4824 StackID = TargetStackID::ScalableVector;
4825 }
4826 break;
4827 }
4828 case 4:
4829 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
4830 Opc = AArch64::STRWui;
4831 if (SrcReg.isVirtual())
4832 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
4833 else
4834 assert(SrcReg != AArch64::WSP);
4835 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
4836 Opc = AArch64::STRSui;
4837 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
4838 Opc = AArch64::STR_PPXI;
4839 StackID = TargetStackID::ScalableVector;
4840 }
4841 break;
4842 case 8:
4843 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
4844 Opc = AArch64::STRXui;
4845 if (SrcReg.isVirtual())
4846 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
4847 else
4848 assert(SrcReg != AArch64::SP);
4849 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
4850 Opc = AArch64::STRDui;
4851 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
4852 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4853 get(AArch64::STPWi), SrcReg, isKill,
4854 AArch64::sube32, AArch64::subo32, FI, MMO);
4855 return;
4856 }
4857 break;
4858 case 16:
4859 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
4860 Opc = AArch64::STRQui;
4861 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
4862 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4863 Opc = AArch64::ST1Twov1d;
4864 Offset = false;
4865 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
4866 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI,
4867 get(AArch64::STPXi), SrcReg, isKill,
4868 AArch64::sube64, AArch64::subo64, FI, MMO);
4869 return;
4870 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
4871 assert(Subtarget.hasSVEorSME() &&
4872 "Unexpected register store without SVE store instructions");
4873 Opc = AArch64::STR_ZXI;
4874 StackID = TargetStackID::ScalableVector;
4875 }
4876 break;
4877 case 24:
4878 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
4879 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4880 Opc = AArch64::ST1Threev1d;
4881 Offset = false;
4882 }
4883 break;
4884 case 32:
4885 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
4886 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4887 Opc = AArch64::ST1Fourv1d;
4888 Offset = false;
4889 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
4890 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4891 Opc = AArch64::ST1Twov2d;
4892 Offset = false;
4893 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4894 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4895 assert(Subtarget.hasSVEorSME() &&
4896 "Unexpected register store without SVE store instructions");
4897 Opc = AArch64::STR_ZZXI;
4898 StackID = TargetStackID::ScalableVector;
4899 }
4900 break;
4901 case 48:
4902 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
4903 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4904 Opc = AArch64::ST1Threev2d;
4905 Offset = false;
4906 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
4907 assert(Subtarget.hasSVEorSME() &&
4908 "Unexpected register store without SVE store instructions");
4909 Opc = AArch64::STR_ZZZXI;
4910 StackID = TargetStackID::ScalableVector;
4911 }
4912 break;
4913 case 64:
4914 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
4915 assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
4916 Opc = AArch64::ST1Fourv2d;
4917 Offset = false;
4918 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4919 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
4920 assert(Subtarget.hasSVEorSME() &&
4921 "Unexpected register store without SVE store instructions");
4922 Opc = AArch64::STR_ZZZZXI;
4923 StackID = TargetStackID::ScalableVector;
4924 }
4925 break;
4926 }
4927 assert(Opc && "Unknown register class");
4928 MFI.setStackID(ObjectIdx: FI, ID: StackID);
4929
4930 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
4931 .addReg(SrcReg, getKillRegState(B: isKill))
4932 .addFrameIndex(FI);
4933
4934 if (Offset)
4935 MI.addImm(Val: 0);
4936 if (PNRReg.isValid())
4937 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
4938 MI.addMemOperand(MMO);
4939}
4940
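// Emit a load-pair (e.g. LDPWi/LDPXi) refilling both halves of the register
// sequence DestReg from frame index FI.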
4941static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
4942 MachineBasicBlock &MBB,
4943 MachineBasicBlock::iterator InsertBefore,
4944 const MCInstrDesc &MCID,
4945 Register DestReg, unsigned SubIdx0,
4946 unsigned SubIdx1, int FI,
4947 MachineMemOperand *MMO) {
4948 Register DestReg0 = DestReg;
4949 Register DestReg1 = DestReg;
4950 bool IsUndef = true;
4951 if (DestReg.isPhysical()) {
4952 DestReg0 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx0);
4953 SubIdx0 = 0;
4954 DestReg1 = TRI.getSubReg(Reg: DestReg, Idx: SubIdx1);
4955 SubIdx1 = 0;
4956 IsUndef = false;
4957 }
4958 BuildMI(BB&: MBB, I: InsertBefore, MIMD: DebugLoc(), MCID)
4959 .addReg(RegNo: DestReg0, flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx0)
4960 .addReg(RegNo: DestReg1, flags: RegState::Define | getUndefRegState(B: IsUndef), SubReg: SubIdx1)
4961 .addFrameIndex(Idx: FI)
4962 .addImm(Val: 0)
4963 .addMemOperand(MMO);
4964}
4965
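// Reload DestReg of register class RC from stack slot FI, mirroring the
// opcode selection in storeRegToStackSlot: a scaled load, an LD1 multi-vector
// load, an SVE LDR, or a load-pair; scalable classes also switch the slot to
// the ScalableVector stack ID.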
4966void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4967 MachineBasicBlock::iterator MBBI,
4968 Register DestReg, int FI,
4969 const TargetRegisterClass *RC,
4970 const TargetRegisterInfo *TRI,
4971 Register VReg) const {
4972 MachineFunction &MF = *MBB.getParent();
4973 MachineFrameInfo &MFI = MF.getFrameInfo();
4974 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
4975 MachineMemOperand *MMO =
4976 MF.getMachineMemOperand(PtrInfo, F: MachineMemOperand::MOLoad,
4977 Size: MFI.getObjectSize(ObjectIdx: FI), BaseAlignment: MFI.getObjectAlign(ObjectIdx: FI));
4978
4979 unsigned Opc = 0;
4980 bool Offset = true;
4981 unsigned StackID = TargetStackID::Default;
4982 Register PNRReg = MCRegister::NoRegister;
4983 switch (TRI->getSpillSize(RC: *RC)) {
4984 case 1:
4985 if (AArch64::FPR8RegClass.hasSubClassEq(RC))
4986 Opc = AArch64::LDRBui;
4987 break;
4988 case 2: {
4989 bool IsPNR = AArch64::PNRRegClass.hasSubClassEq(RC);
4990 if (AArch64::FPR16RegClass.hasSubClassEq(RC))
4991 Opc = AArch64::LDRHui;
4992 else if (IsPNR || AArch64::PPRRegClass.hasSubClassEq(RC)) {
4993 assert(Subtarget.hasSVEorSME() &&
4994 "Unexpected register load without SVE load instructions");
4995 assert((!IsPNR || Subtarget.hasSVE2p1() || Subtarget.hasSME2()) &&
4996 "Unexpected register load without SVE2p1 or SME2");
4997 if (IsPNR)
4998 PNRReg = DestReg;
4999 Opc = AArch64::LDR_PXI;
5000 StackID = TargetStackID::ScalableVector;
5001 }
5002 break;
5003 }
5004 case 4:
5005 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
5006 Opc = AArch64::LDRWui;
5007 if (DestReg.isVirtual())
5008 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
5009 else
5010 assert(DestReg != AArch64::WSP);
5011 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
5012 Opc = AArch64::LDRSui;
5013 else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) {
5014 Opc = AArch64::LDR_PPXI;
5015 StackID = TargetStackID::ScalableVector;
5016 }
5017 break;
5018 case 8:
5019 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
5020 Opc = AArch64::LDRXui;
5021 if (DestReg.isVirtual())
5022 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
5023 else
5024 assert(DestReg != AArch64::SP);
5025 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) {
5026 Opc = AArch64::LDRDui;
5027 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) {
5028 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5029 get(AArch64::LDPWi), DestReg, AArch64::sube32,
5030 AArch64::subo32, FI, MMO);
5031 return;
5032 }
5033 break;
5034 case 16:
5035 if (AArch64::FPR128RegClass.hasSubClassEq(RC))
5036 Opc = AArch64::LDRQui;
5037 else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
5038 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5039 Opc = AArch64::LD1Twov1d;
5040 Offset = false;
5041 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) {
5042 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI,
5043 get(AArch64::LDPXi), DestReg, AArch64::sube64,
5044 AArch64::subo64, FI, MMO);
5045 return;
5046 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
5047 assert(Subtarget.hasSVEorSME() &&
5048 "Unexpected register load without SVE load instructions");
5049 Opc = AArch64::LDR_ZXI;
5050 StackID = TargetStackID::ScalableVector;
5051 }
5052 break;
5053 case 24:
5054 if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
5055 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5056 Opc = AArch64::LD1Threev1d;
5057 Offset = false;
5058 }
5059 break;
5060 case 32:
5061 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
5062 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5063 Opc = AArch64::LD1Fourv1d;
5064 Offset = false;
5065 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
5066 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5067 Opc = AArch64::LD1Twov2d;
5068 Offset = false;
5069 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
5070 AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5071 assert(Subtarget.hasSVEorSME() &&
5072 "Unexpected register load without SVE load instructions");
5073 Opc = AArch64::LDR_ZZXI;
5074 StackID = TargetStackID::ScalableVector;
5075 }
5076 break;
5077 case 48:
5078 if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
5079 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5080 Opc = AArch64::LD1Threev2d;
5081 Offset = false;
5082 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
5083 assert(Subtarget.hasSVEorSME() &&
5084 "Unexpected register load without SVE load instructions");
5085 Opc = AArch64::LDR_ZZZXI;
5086 StackID = TargetStackID::ScalableVector;
5087 }
5088 break;
5089 case 64:
5090 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
5091 assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
5092 Opc = AArch64::LD1Fourv2d;
5093 Offset = false;
5094 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
5095 AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
5096 assert(Subtarget.hasSVEorSME() &&
5097 "Unexpected register load without SVE load instructions");
5098 Opc = AArch64::LDR_ZZZZXI;
5099 StackID = TargetStackID::ScalableVector;
5100 }
5101 break;
5102 }
5103
5104 assert(Opc && "Unknown register class");
5105 MFI.setStackID(ObjectIdx: FI, ID: StackID);
5106
5107 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
5108 .addReg(DestReg, getDefRegState(B: true))
5109 .addFrameIndex(FI);
5110 if (Offset)
5111 MI.addImm(Val: 0);
5112 if (PNRReg.isValid() && !PNRReg.isVirtual())
5113 MI.addDef(RegNo: PNRReg, Flags: RegState::Implicit);
5114 MI.addMemOperand(MMO);
5115
5116 if (PNRReg.isValid() && PNRReg.isVirtual())
5117 BuildMI(MBB, MBBI, DebugLoc(), get(TargetOpcode::COPY), PNRReg)
5118 .addReg(DestReg);
5119}
5120
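// Returns true if any instruction strictly between \p DefMI and \p UseMI
// (ignoring debug instructions) reads or modifies the NZCV flags.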
5121bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
5122 const MachineInstr &UseMI,
5123 const TargetRegisterInfo *TRI) {
5124 return any_of(Range: instructionsWithoutDebug(It: std::next(x: DefMI.getIterator()),
5125 End: UseMI.getIterator()),
5126 P: [TRI](const MachineInstr &I) {
5127 return I.modifiesRegister(AArch64::NZCV, TRI) ||
5128 I.readsRegister(AArch64::NZCV, TRI);
5129 });
5130}
5131
5132void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5133 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) {
5134 // The smallest scalable elements supported by scaled SVE addressing
5135 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5136 // byte offset must always be a multiple of 2.
5137 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5138
5139 // VGSized offsets are divided by '2', because the VG register holds the
5140 // number of 64-bit granules, as opposed to the 128-bit vector chunks in
5141 // terms of which the 'n' in e.g. MVT::nxv1i8 is modelled.
5142 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes.
5143 // VG = n * 2 and the dwarf offset must be VG * 8 bytes.
5144 ByteSized = Offset.getFixed();
5145 VGSized = Offset.getScalable() / 2;
5146}
5147
5148/// Decomposes the given frame offset into the parts needed to describe it:
5149/// a fixed number of bytes plus counts of SVE predicate and data vectors.
5150/// For non-scalable offsets only the byte part is non-zero.
5151void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5152 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors,
5153 int64_t &NumDataVectors) {
5154 // The smallest scalable elements supported by scaled SVE addressing
5155 // modes are predicates, which are 2 scalable bytes in size. So the scalable
5156 // byte offset must always be a multiple of 2.
5157 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset");
5158
5159 NumBytes = Offset.getFixed();
5160 NumDataVectors = 0;
5161 NumPredicateVectors = Offset.getScalable() / 2;
5162 // This method computes the offsets used to adjust the frame offset.
5163 // If materialising the offset would require more than two ADDPL
5164 // instructions, part of it is folded into NumDataVectors so that ADDVL
5165 // can be used for that part, reducing the number of ADDPL instructions.
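 // For example (illustrative numbers): a scalable offset of 160 bytes gives
 // NumPredicateVectors == 80, which is a multiple of 8 and so is folded to
 // NumDataVectors == 10 with no remainder (a single ADDVL); 140 bytes gives
 // 70, folded to 8 data vectors plus 6 predicate vectors (one ADDVL and one
 // ADDPL).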
5166 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
5167 NumPredicateVectors > 62) {
5168 NumDataVectors = NumPredicateVectors / 8;
5169 NumPredicateVectors -= NumDataVectors * 8;
5170 }
5171}
5172
5173// Convenience function to create a DWARF expression for
5174// Expr + NumBytes + NumVGScaledBytes * AArch64::VG
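// Roughly, the appended DWARF opcodes are (each half omitted when its value
// is zero):
//   DW_OP_consts <NumBytes>, DW_OP_plus,
//   DW_OP_consts <NumVGScaledBytes>, DW_OP_bregx <VG> 0, DW_OP_mul, DW_OP_plus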
5175static void appendVGScaledOffsetExpr(SmallVectorImpl<char> &Expr, int NumBytes,
5176 int NumVGScaledBytes, unsigned VG,
5177 llvm::raw_string_ostream &Comment) {
5178 uint8_t buffer[16];
5179
5180 if (NumBytes) {
5181 Expr.push_back(Elt: dwarf::DW_OP_consts);
5182 Expr.append(in_start: buffer, in_end: buffer + encodeSLEB128(Value: NumBytes, p: buffer));
5183 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_plus);
5184 Comment << (NumBytes < 0 ? " - " : " + ") << std::abs(x: NumBytes);
5185 }
5186
5187 if (NumVGScaledBytes) {
5188 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_consts);
5189 Expr.append(in_start: buffer, in_end: buffer + encodeSLEB128(Value: NumVGScaledBytes, p: buffer));
5190
5191 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_bregx);
5192 Expr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: VG, p: buffer));
5193 Expr.push_back(Elt: 0);
5194
5195 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_mul);
5196 Expr.push_back(Elt: (uint8_t)dwarf::DW_OP_plus);
5197
5198 Comment << (NumVGScaledBytes < 0 ? " - " : " + ")
5199 << std::abs(x: NumVGScaledBytes) << " * VG";
5200 }
5201}
5202
5203// Creates an MCCFIInstruction:
5204// { DW_CFA_def_cfa_expression, ULEB128 (sizeof expr), expr }
5205static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI,
5206 unsigned Reg,
5207 const StackOffset &Offset) {
5208 int64_t NumBytes, NumVGScaledBytes;
5209 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(Offset, ByteSized&: NumBytes,
5210 VGSized&: NumVGScaledBytes);
5211 std::string CommentBuffer;
5212 llvm::raw_string_ostream Comment(CommentBuffer);
5213
5214 if (Reg == AArch64::SP)
5215 Comment << "sp";
5216 else if (Reg == AArch64::FP)
5217 Comment << "fp";
5218 else
5219 Comment << printReg(Reg, TRI: &TRI);
5220
5221 // Build up the expression (Reg + NumBytes + NumVGScaledBytes * AArch64::VG)
5222 SmallString<64> Expr;
5223 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5224 Expr.push_back(Elt: (uint8_t)(dwarf::DW_OP_breg0 + DwarfReg));
5225 Expr.push_back(Elt: 0);
5226 appendVGScaledOffsetExpr(Expr, NumBytes, NumVGScaledBytes,
5227 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5228
5229 // Wrap this into a DW_CFA_def_cfa_expression.
5230 SmallString<64> DefCfaExpr;
5231 DefCfaExpr.push_back(Elt: dwarf::DW_CFA_def_cfa_expression);
5232 uint8_t buffer[16];
5233 DefCfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: Expr.size(), p: buffer));
5234 DefCfaExpr.append(RHS: Expr.str());
5235 return MCCFIInstruction::createEscape(L: nullptr, Vals: DefCfaExpr.str(), Loc: SMLoc(),
5236 Comment: Comment.str());
5237}
5238
5239MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
5240 unsigned FrameReg, unsigned Reg,
5241 const StackOffset &Offset,
5242 bool LastAdjustmentWasScalable) {
5243 if (Offset.getScalable())
5244 return createDefCFAExpression(TRI, Reg, Offset);
5245
5246 if (FrameReg == Reg && !LastAdjustmentWasScalable)
5247 return MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: int(Offset.getFixed()));
5248
5249 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5250 return MCCFIInstruction::cfiDefCfa(L: nullptr, Register: DwarfReg, Offset: (int)Offset.getFixed());
5251}
5252
5253MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
5254 unsigned Reg,
5255 const StackOffset &OffsetFromDefCFA) {
5256 int64_t NumBytes, NumVGScaledBytes;
5257 AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
5258 Offset: OffsetFromDefCFA, ByteSized&: NumBytes, VGSized&: NumVGScaledBytes);
5259
5260 unsigned DwarfReg = TRI.getDwarfRegNum(RegNum: Reg, isEH: true);
5261
5262 // Non-scalable offsets can use DW_CFA_offset directly.
5263 if (!NumVGScaledBytes)
5264 return MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: NumBytes);
5265
5266 std::string CommentBuffer;
5267 llvm::raw_string_ostream Comment(CommentBuffer);
5268 Comment << printReg(Reg, TRI: &TRI) << " @ cfa";
5269
5270 // Build up expression (NumBytes + NumVGScaledBytes * AArch64::VG)
5271 SmallString<64> OffsetExpr;
5272 appendVGScaledOffsetExpr(OffsetExpr, NumBytes, NumVGScaledBytes,
5273 TRI.getDwarfRegNum(AArch64::VG, true), Comment);
5274
5275 // Wrap this into DW_CFA_expression
5276 SmallString<64> CfaExpr;
5277 CfaExpr.push_back(Elt: dwarf::DW_CFA_expression);
5278 uint8_t buffer[16];
5279 CfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: DwarfReg, p: buffer));
5280 CfaExpr.append(in_start: buffer, in_end: buffer + encodeULEB128(Value: OffsetExpr.size(), p: buffer));
5281 CfaExpr.append(RHS: OffsetExpr.str());
5282
5283 return MCCFIInstruction::createEscape(L: nullptr, Vals: CfaExpr.str(), Loc: SMLoc(),
5284 Comment: Comment.str());
5285}
5286
5287// Helper function to emit a frame offset adjustment from a given
5288// pointer (SrcReg), storing the result into DestReg. This function is
5289// explicit in that it requires the caller to supply the opcode.
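// Illustrative example: with Opc == ADDXri and Offset == 0x100010, the loop
// below emits two instructions:
//   add Xd, Xn, #0x100, lsl #12
//   add Xd, Xd, #0x10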
5290static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
5291 MachineBasicBlock::iterator MBBI,
5292 const DebugLoc &DL, unsigned DestReg,
5293 unsigned SrcReg, int64_t Offset, unsigned Opc,
5294 const TargetInstrInfo *TII,
5295 MachineInstr::MIFlag Flag, bool NeedsWinCFI,
5296 bool *HasWinCFI, bool EmitCFAOffset,
5297 StackOffset CFAOffset, unsigned FrameReg) {
5298 int Sign = 1;
5299 unsigned MaxEncoding, ShiftSize;
5300 switch (Opc) {
5301 case AArch64::ADDXri:
5302 case AArch64::ADDSXri:
5303 case AArch64::SUBXri:
5304 case AArch64::SUBSXri:
5305 MaxEncoding = 0xfff;
5306 ShiftSize = 12;
5307 break;
5308 case AArch64::ADDVL_XXI:
5309 case AArch64::ADDPL_XXI:
5310 case AArch64::ADDSVL_XXI:
5311 case AArch64::ADDSPL_XXI:
5312 MaxEncoding = 31;
5313 ShiftSize = 0;
5314 if (Offset < 0) {
5315 MaxEncoding = 32;
5316 Sign = -1;
5317 Offset = -Offset;
5318 }
5319 break;
5320 default:
5321 llvm_unreachable("Unsupported opcode");
5322 }
5323
5324 // `Offset` can be in bytes or in "scalable bytes".
5325 int VScale = 1;
5326 if (Opc == AArch64::ADDVL_XXI || Opc == AArch64::ADDSVL_XXI)
5327 VScale = 16;
5328 else if (Opc == AArch64::ADDPL_XXI || Opc == AArch64::ADDSPL_XXI)
5329 VScale = 2;
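 // (ADDVL's immediate counts whole SVE vectors, VL == 16 bytes * vscale,
 // while ADDPL's counts predicate-sized units, PL == VL / 8 == 2 bytes *
 // vscale; hence the factors of 16 and 2 above.)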
5330
5331 // FIXME: If the offset won't fit in 24-bits, compute the offset into a
5332 // scratch register. If DestReg is a virtual register, use it as the
5333 // scratch register; otherwise, create a new virtual register (to be
5334 // replaced by the scavenger at the end of PEI). That case can be optimized
5335 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
5336 // register can be loaded with offset%8 and the add/sub can use an extending
5337 // instruction with LSL#3.
5338 // Currently the function handles any offsets but generates a poor sequence
5339 // of code.
5340 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
5341
5342 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
5343 Register TmpReg = DestReg;
5344 if (TmpReg == AArch64::XZR)
5345 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
5346 &AArch64::GPR64RegClass);
5347 do {
5348 uint64_t ThisVal = std::min<uint64_t>(a: Offset, b: MaxEncodableValue);
5349 unsigned LocalShiftSize = 0;
5350 if (ThisVal > MaxEncoding) {
5351 ThisVal = ThisVal >> ShiftSize;
5352 LocalShiftSize = ShiftSize;
5353 }
5354 assert((ThisVal >> ShiftSize) <= MaxEncoding &&
5355 "Encoding cannot handle value that big");
5356
5357 Offset -= ThisVal << LocalShiftSize;
5358 if (Offset == 0)
5359 TmpReg = DestReg;
5360 auto MBI = BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: Opc), DestReg: TmpReg)
5361 .addReg(RegNo: SrcReg)
5362 .addImm(Val: Sign * (int)ThisVal);
5363 if (ShiftSize)
5364 MBI = MBI.addImm(
5365 Val: AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: LocalShiftSize));
5366 MBI = MBI.setMIFlag(Flag);
5367
5368 auto Change =
5369 VScale == 1
5370 ? StackOffset::getFixed(Fixed: ThisVal << LocalShiftSize)
5371 : StackOffset::getScalable(Scalable: VScale * (ThisVal << LocalShiftSize));
5372 if (Sign == -1 || Opc == AArch64::SUBXri || Opc == AArch64::SUBSXri)
5373 CFAOffset += Change;
5374 else
5375 CFAOffset -= Change;
5376 if (EmitCFAOffset && DestReg == TmpReg) {
5377 MachineFunction &MF = *MBB.getParent();
5378 const TargetSubtargetInfo &STI = MF.getSubtarget();
5379 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
5380
5381 unsigned CFIIndex = MF.addFrameInst(
5382 Inst: createDefCFA(TRI, FrameReg, Reg: DestReg, Offset: CFAOffset, LastAdjustmentWasScalable: VScale != 1));
5383 BuildMI(BB&: MBB, I: MBBI, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::CFI_INSTRUCTION))
5384 .addCFIIndex(CFIIndex)
5385 .setMIFlags(Flag);
5386 }
5387
5388 if (NeedsWinCFI) {
5389 assert(Sign == 1 && "SEH directives should always have a positive sign");
5390 int Imm = (int)(ThisVal << LocalShiftSize);
5391 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) ||
5392 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) {
5393 if (HasWinCFI)
5394 *HasWinCFI = true;
5395 if (Imm == 0)
5396 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag);
5397 else
5398 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
5399 .addImm(Imm)
5400 .setMIFlag(Flag);
5401 assert(Offset == 0 && "Expected remaining offset to be zero to "
5402 "emit a single SEH directive");
5403 } else if (DestReg == AArch64::SP) {
5404 if (HasWinCFI)
5405 *HasWinCFI = true;
5406 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc");
5407 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
5408 .addImm(Imm)
5409 .setMIFlag(Flag);
5410 }
5411 }
5412
5413 SrcReg = TmpReg;
5414 } while (Offset);
5415}
5416
5417void llvm::emitFrameOffset(MachineBasicBlock &MBB,
5418 MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
5419 unsigned DestReg, unsigned SrcReg,
5420 StackOffset Offset, const TargetInstrInfo *TII,
5421 MachineInstr::MIFlag Flag, bool SetNZCV,
5422 bool NeedsWinCFI, bool *HasWinCFI,
5423 bool EmitCFAOffset, StackOffset CFAOffset,
5424 unsigned FrameReg) {
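 // Illustrative example: a StackOffset of (fixed: -32, scalable: -32) from SP
 // to SP is emitted as 'sub sp, sp, #32' followed by 'addvl sp, sp, #-2'
 // (or 'addsvl' in a locally-streaming body, see below), since one SVE data
 // vector is 16 scalable bytes.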
5425 // If a function is marked as arm_locally_streaming, then the runtime value of
5426 // vscale in the prologue/epilogue is different from the runtime value of vscale
5427 // in the function's body. To avoid having to consider multiple vscales,
5428 // we can use `addsvl` to allocate any scalable stack-slots, which under
5429 // most circumstances will be only locals, not callee-save slots.
5430 const Function &F = MBB.getParent()->getFunction();
5431 bool UseSVL = F.hasFnAttribute(Kind: "aarch64_pstate_sm_body");
5432
5433 int64_t Bytes, NumPredicateVectors, NumDataVectors;
5434 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets(
5435 Offset, NumBytes&: Bytes, NumPredicateVectors, NumDataVectors);
5436
5437 // First emit non-scalable frame offsets, or a simple 'mov'.
5438 if (Bytes || (!Offset && SrcReg != DestReg)) {
5439 assert((DestReg != AArch64::SP || Bytes % 8 == 0) &&
5440 "SP increment/decrement not 8-byte aligned");
5441 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri;
5442 if (Bytes < 0) {
5443 Bytes = -Bytes;
5444 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri;
5445 }
5446 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Offset: Bytes, Opc, TII, Flag,
5447 NeedsWinCFI, HasWinCFI, EmitCFAOffset, CFAOffset,
5448 FrameReg);
5449 CFAOffset += (Opc == AArch64::ADDXri || Opc == AArch64::ADDSXri)
5450 ? StackOffset::getFixed(-Bytes)
5451 : StackOffset::getFixed(Bytes);
5452 SrcReg = DestReg;
5453 FrameReg = DestReg;
5454 }
5455
5456 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
5457 "SetNZCV not supported with SVE vectors");
5458 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
5459 "WinCFI not supported with SVE vectors");
5460
5461 if (NumDataVectors) {
5462 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
5463 UseSVL ? AArch64::ADDSVL_XXI : AArch64::ADDVL_XXI,
5464 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5465 CFAOffset, FrameReg);
5466 CFAOffset += StackOffset::getScalable(Scalable: -NumDataVectors * 16);
5467 SrcReg = DestReg;
5468 }
5469
5470 if (NumPredicateVectors) {
5471 assert(DestReg != AArch64::SP && "Unaligned access to SP");
5472 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
5473 UseSVL ? AArch64::ADDSPL_XXI : AArch64::ADDPL_XXI,
5474 TII, Flag, NeedsWinCFI, nullptr, EmitCFAOffset,
5475 CFAOffset, FrameReg);
5476 }
5477}
5478
5479MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
5480 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
5481 MachineBasicBlock::iterator InsertPt, int FrameIndex,
5482 LiveIntervals *LIS, VirtRegMap *VRM) const {
5483 // This is a bit of a hack. Consider this instruction:
5484 //
5485 // %0 = COPY %sp; GPR64all:%0
5486 //
5487 // We explicitly chose GPR64all for the virtual register so such a copy might
5488 // be eliminated by RegisterCoalescer. However, that may not be possible, and
5489 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
5490 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
5491 //
5492 // To prevent that, we are going to constrain the %0 register class here.
5493 if (MI.isFullCopy()) {
5494 Register DstReg = MI.getOperand(i: 0).getReg();
5495 Register SrcReg = MI.getOperand(i: 1).getReg();
5496 if (SrcReg == AArch64::SP && DstReg.isVirtual()) {
5497 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
5498 return nullptr;
5499 }
5500 if (DstReg == AArch64::SP && SrcReg.isVirtual()) {
5501 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
5502 return nullptr;
5503 }
5504 // Nothing can be folded with a copy from/to NZCV.
5505 if (SrcReg == AArch64::NZCV || DstReg == AArch64::NZCV)
5506 return nullptr;
5507 }
5508
5509 // Handle the case where a copy is being spilled or filled but the source
5510 // and destination register classes don't match. For example:
5511 //
5512 // %0 = COPY %xzr; GPR64common:%0
5513 //
5514 // In this case we can still safely fold away the COPY and generate the
5515 // following spill code:
5516 //
5517 // STRXui %xzr, %stack.0
5518 //
5519 // This also eliminates spilled cross-register-class COPYs (e.g. between x and
5520 // d regs) of the same size. For example:
5521 //
5522 // %0 = COPY %1; GPR64:%0, FPR64:%1
5523 //
5524 // will be filled as
5525 //
5526 // LDRDui %0, fi<#0>
5527 //
5528 // instead of
5529 //
5530 // LDRXui %Temp, fi<#0>
5531 // %0 = FMOV %Temp
5532 //
5533 if (MI.isCopy() && Ops.size() == 1 &&
5534 // Make sure we're only folding the explicit COPY defs/uses.
5535 (Ops[0] == 0 || Ops[0] == 1)) {
5536 bool IsSpill = Ops[0] == 0;
5537 bool IsFill = !IsSpill;
5538 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
5539 const MachineRegisterInfo &MRI = MF.getRegInfo();
5540 MachineBasicBlock &MBB = *MI.getParent();
5541 const MachineOperand &DstMO = MI.getOperand(i: 0);
5542 const MachineOperand &SrcMO = MI.getOperand(i: 1);
5543 Register DstReg = DstMO.getReg();
5544 Register SrcReg = SrcMO.getReg();
5545 // This is slightly expensive to compute for physical regs since
5546 // getMinimalPhysRegClass is slow.
5547 auto getRegClass = [&](unsigned Reg) {
5548 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg)
5549 : TRI.getMinimalPhysRegClass(Reg);
5550 };
5551
5552 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
5553 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
5554 TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
5555 "Mismatched register size in non subreg COPY");
5556 if (IsSpill)
5557 storeRegToStackSlot(MBB, MBBI: InsertPt, SrcReg, isKill: SrcMO.isKill(), FI: FrameIndex,
5558 RC: getRegClass(SrcReg), TRI: &TRI, VReg: Register());
5559 else
5560 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex,
5561 RC: getRegClass(DstReg), TRI: &TRI, VReg: Register());
5562 return &*--InsertPt;
5563 }
5564
5565 // Handle cases like spilling def of:
5566 //
5567 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0
5568 //
5569 // where the physical register source can be widened and stored to the full
5570 // virtual reg destination stack slot, in this case producing:
5571 //
5572 // STRXui %xzr, %stack.0
5573 //
5574 if (IsSpill && DstMO.isUndef() && SrcReg == AArch64::WZR &&
5575 TRI.getRegSizeInBits(*getRegClass(DstReg)) == 64) {
5576 assert(SrcMO.getSubReg() == 0 &&
5577 "Unexpected subreg on physical register");
5578 storeRegToStackSlot(MBB, InsertPt, AArch64::XZR, SrcMO.isKill(),
5579 FrameIndex, &AArch64::GPR64RegClass, &TRI,
5580 Register());
5581 return &*--InsertPt;
5582 }
5583
5584 // Handle cases like filling use of:
5585 //
5586 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1
5587 //
5588 // where we can load the full virtual reg source stack slot, into the subreg
5589 // destination, in this case producing:
5590 //
5591 // LDRWui %0:sub_32<def,read-undef>, %stack.0
5592 //
5593 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
5594 const TargetRegisterClass *FillRC;
5595 switch (DstMO.getSubReg()) {
5596 default:
5597 FillRC = nullptr;
5598 break;
5599 case AArch64::sub_32:
5600 FillRC = &AArch64::GPR32RegClass;
5601 break;
5602 case AArch64::ssub:
5603 FillRC = &AArch64::FPR32RegClass;
5604 break;
5605 case AArch64::dsub:
5606 FillRC = &AArch64::FPR64RegClass;
5607 break;
5608 }
5609
5610 if (FillRC) {
5611 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
5612 TRI.getRegSizeInBits(*FillRC) &&
5613 "Mismatched regclass size on folded subreg COPY");
5614 loadRegFromStackSlot(MBB, MBBI: InsertPt, DestReg: DstReg, FI: FrameIndex, RC: FillRC, TRI: &TRI,
5615 VReg: Register());
5616 MachineInstr &LoadMI = *--InsertPt;
5617 MachineOperand &LoadDst = LoadMI.getOperand(i: 0);
5618 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load");
5619 LoadDst.setSubReg(DstMO.getSubReg());
5620 LoadDst.setIsUndef();
5621 return &LoadMI;
5622 }
5623 }
5624 }
5625
5626 // Cannot fold.
5627 return nullptr;
5628}
5629
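// Illustrative examples for the logic below, assuming an LDRXui (scale 8,
// immediate range [0, 4095]) whose unscaled counterpart is LDURXi (scale 1,
// range [-256, 255]): a byte offset of 40 is a multiple of 8 and in range, so
// the emittable offset is 5 and no offset remains; a byte offset of 4 is not
// a multiple of 8, so the instruction can be rewritten to LDURXi with an
// emittable offset of 4.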
5630int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
5631 StackOffset &SOffset,
5632 bool *OutUseUnscaledOp,
5633 unsigned *OutUnscaledOp,
5634 int64_t *EmittableOffset) {
5635 // Set output values in case of early exit.
5636 if (EmittableOffset)
5637 *EmittableOffset = 0;
5638 if (OutUseUnscaledOp)
5639 *OutUseUnscaledOp = false;
5640 if (OutUnscaledOp)
5641 *OutUnscaledOp = 0;
5642
5643 // Exit early for structured vector spills/fills as they can't take an
5644 // immediate offset.
5645 switch (MI.getOpcode()) {
5646 default:
5647 break;
5648 case AArch64::LD1Rv1d:
5649 case AArch64::LD1Rv2s:
5650 case AArch64::LD1Rv2d:
5651 case AArch64::LD1Rv4h:
5652 case AArch64::LD1Rv4s:
5653 case AArch64::LD1Rv8b:
5654 case AArch64::LD1Rv8h:
5655 case AArch64::LD1Rv16b:
5656 case AArch64::LD1Twov2d:
5657 case AArch64::LD1Threev2d:
5658 case AArch64::LD1Fourv2d:
5659 case AArch64::LD1Twov1d:
5660 case AArch64::LD1Threev1d:
5661 case AArch64::LD1Fourv1d:
5662 case AArch64::ST1Twov2d:
5663 case AArch64::ST1Threev2d:
5664 case AArch64::ST1Fourv2d:
5665 case AArch64::ST1Twov1d:
5666 case AArch64::ST1Threev1d:
5667 case AArch64::ST1Fourv1d:
5668 case AArch64::ST1i8:
5669 case AArch64::ST1i16:
5670 case AArch64::ST1i32:
5671 case AArch64::ST1i64:
5672 case AArch64::IRG:
5673 case AArch64::IRGstack:
5674 case AArch64::STGloop:
5675 case AArch64::STZGloop:
5676 return AArch64FrameOffsetCannotUpdate;
5677 }
5678
5679 // Get the min/max offset and the scale.
5680 TypeSize ScaleValue(0U, false), Width(0U, false);
5681 int64_t MinOff, MaxOff;
5682 if (!AArch64InstrInfo::getMemOpInfo(Opcode: MI.getOpcode(), Scale&: ScaleValue, Width, MinOffset&: MinOff,
5683 MaxOffset&: MaxOff))
5684 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5685
5686 // Construct the complete offset.
5687 bool IsMulVL = ScaleValue.isScalable();
5688 unsigned Scale = ScaleValue.getKnownMinValue();
5689 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed();
5690
5691 const MachineOperand &ImmOpnd =
5692 MI.getOperand(i: AArch64InstrInfo::getLoadStoreImmIdx(Opc: MI.getOpcode()));
5693 Offset += ImmOpnd.getImm() * Scale;
5694
5695 // If the offset doesn't match the scale, we rewrite the instruction to
5696 // use the unscaled instruction instead. Likewise, we do so if we have a
5697 // negative offset and there is an unscaled op to use.
5698 std::optional<unsigned> UnscaledOp =
5699 AArch64InstrInfo::getUnscaledLdSt(Opc: MI.getOpcode());
5700 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
5701 if (useUnscaledOp &&
5702 !AArch64InstrInfo::getMemOpInfo(Opcode: *UnscaledOp, Scale&: ScaleValue, Width, MinOffset&: MinOff,
5703 MaxOffset&: MaxOff))
5704 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
5705
5706 Scale = ScaleValue.getKnownMinValue();
5707 assert(IsMulVL == ScaleValue.isScalable() &&
5708 "Unscaled opcode has different value for scalable");
5709
5710 int64_t Remainder = Offset % Scale;
5711 assert(!(Remainder && useUnscaledOp) &&
5712 "Cannot have remainder when using unscaled op");
5713
5714 assert(MinOff < MaxOff && "Unexpected Min/Max offsets");
5715 int64_t NewOffset = Offset / Scale;
5716 if (MinOff <= NewOffset && NewOffset <= MaxOff)
5717 Offset = Remainder;
5718 else {
5719 NewOffset = NewOffset < 0 ? MinOff : MaxOff;
5720 Offset = Offset - NewOffset * Scale;
5721 }
5722
5723 if (EmittableOffset)
5724 *EmittableOffset = NewOffset;
5725 if (OutUseUnscaledOp)
5726 *OutUseUnscaledOp = useUnscaledOp;
5727 if (OutUnscaledOp && UnscaledOp)
5728 *OutUnscaledOp = *UnscaledOp;
5729
5730 if (IsMulVL)
5731 SOffset = StackOffset::get(Fixed: SOffset.getFixed(), Scalable: Offset);
5732 else
5733 SOffset = StackOffset::get(Fixed: Offset, Scalable: SOffset.getScalable());
5734 return AArch64FrameOffsetCanUpdate |
5735 (SOffset ? 0 : AArch64FrameOffsetIsLegal);
5736}
5737
5738bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
5739 unsigned FrameReg, StackOffset &Offset,
5740 const AArch64InstrInfo *TII) {
5741 unsigned Opcode = MI.getOpcode();
5742 unsigned ImmIdx = FrameRegIdx + 1;
5743
5744 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
5745 Offset += StackOffset::getFixed(Fixed: MI.getOperand(i: ImmIdx).getImm());
5746 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
5747 MI.getOperand(0).getReg(), FrameReg, Offset, TII,
5748 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
5749 MI.eraseFromParent();
5750 Offset = StackOffset();
5751 return true;
5752 }
5753
5754 int64_t NewOffset;
5755 unsigned UnscaledOp;
5756 bool UseUnscaledOp;
5757 int Status = isAArch64FrameOffsetLegal(MI, SOffset&: Offset, OutUseUnscaledOp: &UseUnscaledOp,
5758 OutUnscaledOp: &UnscaledOp, EmittableOffset: &NewOffset);
5759 if (Status & AArch64FrameOffsetCanUpdate) {
5760 if (Status & AArch64FrameOffsetIsLegal)
5761 // Replace the FrameIndex with FrameReg.
5762 MI.getOperand(i: FrameRegIdx).ChangeToRegister(Reg: FrameReg, isDef: false);
5763 if (UseUnscaledOp)
5764 MI.setDesc(TII->get(UnscaledOp));
5765
5766 MI.getOperand(i: ImmIdx).ChangeToImmediate(ImmVal: NewOffset);
5767 return !Offset;
5768 }
5769
5770 return false;
5771}
5772
5773void AArch64InstrInfo::insertNoop(MachineBasicBlock &MBB,
5774 MachineBasicBlock::iterator MI) const {
5775 DebugLoc DL;
5776 BuildMI(MBB, MI, DL, get(AArch64::HINT)).addImm(0);
5777}
5778
5779MCInst AArch64InstrInfo::getNop() const {
5780 return MCInstBuilder(AArch64::HINT).addImm(0);
5781}
5782
5783// AArch64 supports MachineCombiner.
5784bool AArch64InstrInfo::useMachineCombiner() const { return true; }
5785
5786// True when Opc sets the NZCV flags.
5787static bool isCombineInstrSettingFlag(unsigned Opc) {
5788 switch (Opc) {
5789 case AArch64::ADDSWrr:
5790 case AArch64::ADDSWri:
5791 case AArch64::ADDSXrr:
5792 case AArch64::ADDSXri:
5793 case AArch64::SUBSWrr:
5794 case AArch64::SUBSXrr:
5795 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5796 case AArch64::SUBSWri:
5797 case AArch64::SUBSXri:
5798 return true;
5799 default:
5800 break;
5801 }
5802 return false;
5803}
5804
5805// 32b Opcodes that can be combined with a MUL
5806static bool isCombineInstrCandidate32(unsigned Opc) {
5807 switch (Opc) {
5808 case AArch64::ADDWrr:
5809 case AArch64::ADDWri:
5810 case AArch64::SUBWrr:
5811 case AArch64::ADDSWrr:
5812 case AArch64::ADDSWri:
5813 case AArch64::SUBSWrr:
5814 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5815 case AArch64::SUBWri:
5816 case AArch64::SUBSWri:
5817 return true;
5818 default:
5819 break;
5820 }
5821 return false;
5822}
5823
5824// 64b Opcodes that can be combined with a MUL
5825static bool isCombineInstrCandidate64(unsigned Opc) {
5826 switch (Opc) {
5827 case AArch64::ADDXrr:
5828 case AArch64::ADDXri:
5829 case AArch64::SUBXrr:
5830 case AArch64::ADDSXrr:
5831 case AArch64::ADDSXri:
5832 case AArch64::SUBSXrr:
5833 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
5834 case AArch64::SUBXri:
5835 case AArch64::SUBSXri:
5836 case AArch64::ADDv8i8:
5837 case AArch64::ADDv16i8:
5838 case AArch64::ADDv4i16:
5839 case AArch64::ADDv8i16:
5840 case AArch64::ADDv2i32:
5841 case AArch64::ADDv4i32:
5842 case AArch64::SUBv8i8:
5843 case AArch64::SUBv16i8:
5844 case AArch64::SUBv4i16:
5845 case AArch64::SUBv8i16:
5846 case AArch64::SUBv2i32:
5847 case AArch64::SUBv4i32:
5848 return true;
5849 default:
5850 break;
5851 }
5852 return false;
5853}
5854
5855// FP Opcodes that can be combined with an FMUL.
5856static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
5857 switch (Inst.getOpcode()) {
5858 default:
5859 break;
5860 case AArch64::FADDHrr:
5861 case AArch64::FADDSrr:
5862 case AArch64::FADDDrr:
5863 case AArch64::FADDv4f16:
5864 case AArch64::FADDv8f16:
5865 case AArch64::FADDv2f32:
5866 case AArch64::FADDv2f64:
5867 case AArch64::FADDv4f32:
5868 case AArch64::FSUBHrr:
5869 case AArch64::FSUBSrr:
5870 case AArch64::FSUBDrr:
5871 case AArch64::FSUBv4f16:
5872 case AArch64::FSUBv8f16:
5873 case AArch64::FSUBv2f32:
5874 case AArch64::FSUBv2f64:
5875 case AArch64::FSUBv4f32:
5876 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
5877 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by
5878 // the target options or if FADD/FSUB has the contract fast-math flag.
5879 return Options.UnsafeFPMath ||
5880 Options.AllowFPOpFusion == FPOpFusion::Fast ||
5881 Inst.getFlag(Flag: MachineInstr::FmContract);
5882 return true;
5883 }
5884 return false;
5885}
5886
5887// Opcodes that can be combined with a MUL
5888static bool isCombineInstrCandidate(unsigned Opc) {
5889 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
5890}
5891
5892//
5893// Utility routine that checks whether \p MO is defined by a
5894// \p CombineOpc instruction in the basic block \p MBB.
5895static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO,
5896 unsigned CombineOpc, unsigned ZeroReg = 0,
5897 bool CheckZeroReg = false) {
5898 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5899 MachineInstr *MI = nullptr;
5900
5901 if (MO.isReg() && MO.getReg().isVirtual())
5902 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
5903 // And it needs to be in the trace (otherwise, it won't have a depth).
5904 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc)
5905 return false;
5906 // Its result must only be used by the instruction we combine with.
5907 if (!MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()))
5908 return false;
5909
5910 if (CheckZeroReg) {
5911 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
5912 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
5913 MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
5914 // The third input reg must be zero.
5915 if (MI->getOperand(i: 3).getReg() != ZeroReg)
5916 return false;
5917 }
5918
5919 if (isCombineInstrSettingFlag(CombineOpc) &&
5920 MI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) == -1)
5921 return false;
5922
5923 return true;
5924}
5925
5926//
5927// Is \p MO defined by an integer multiply, and can it be combined?
5928static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5929 unsigned MulOpc, unsigned ZeroReg) {
5930 return canCombine(MBB, MO, CombineOpc: MulOpc, ZeroReg, CheckZeroReg: true);
5931}
5932
5933//
5934// Is \p MO defined by a floating-point multiply, and can it be combined?
5935static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO,
5936 unsigned MulOpc) {
5937 return canCombine(MBB, MO, CombineOpc: MulOpc);
5938}
5939
5940// TODO: There are many more machine instruction opcodes to match:
5941// 1. Other data types (integer, vectors)
5942// 2. Other math / logic operations (xor, or)
5943// 3. Other forms of the same operation (intrinsics and other variants)
5944bool AArch64InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
5945 bool Invert) const {
5946 if (Invert)
5947 return false;
5948 switch (Inst.getOpcode()) {
5949 // == Floating-point types ==
5950 // -- Floating-point instructions --
5951 case AArch64::FADDHrr:
5952 case AArch64::FADDSrr:
5953 case AArch64::FADDDrr:
5954 case AArch64::FMULHrr:
5955 case AArch64::FMULSrr:
5956 case AArch64::FMULDrr:
5957 case AArch64::FMULX16:
5958 case AArch64::FMULX32:
5959 case AArch64::FMULX64:
5960 // -- Advanced SIMD instructions --
5961 case AArch64::FADDv4f16:
5962 case AArch64::FADDv8f16:
5963 case AArch64::FADDv2f32:
5964 case AArch64::FADDv4f32:
5965 case AArch64::FADDv2f64:
5966 case AArch64::FMULv4f16:
5967 case AArch64::FMULv8f16:
5968 case AArch64::FMULv2f32:
5969 case AArch64::FMULv4f32:
5970 case AArch64::FMULv2f64:
5971 case AArch64::FMULXv4f16:
5972 case AArch64::FMULXv8f16:
5973 case AArch64::FMULXv2f32:
5974 case AArch64::FMULXv4f32:
5975 case AArch64::FMULXv2f64:
5976 // -- SVE instructions --
5977 // Opcodes FMULX_ZZZ_? don't exist because there is no unpredicated FMULX
5978 // in the SVE instruction set (though there are predicated ones).
5979 case AArch64::FADD_ZZZ_H:
5980 case AArch64::FADD_ZZZ_S:
5981 case AArch64::FADD_ZZZ_D:
5982 case AArch64::FMUL_ZZZ_H:
5983 case AArch64::FMUL_ZZZ_S:
5984 case AArch64::FMUL_ZZZ_D:
5985 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath ||
5986 (Inst.getFlag(Flag: MachineInstr::MIFlag::FmReassoc) &&
5987 Inst.getFlag(Flag: MachineInstr::MIFlag::FmNsz));
5988
5989 // == Integer types ==
5990 // -- Base instructions --
5991 // Opcodes MULWrr and MULXrr don't exist because
5992 // `MUL <Wd>, <Wn>, <Wm>` and `MUL <Xd>, <Xn>, <Xm>` are aliases of
5993 // `MADD <Wd>, <Wn>, <Wm>, WZR` and `MADD <Xd>, <Xn>, <Xm>, XZR` respectively.
5994 // The machine-combiner does not support three-source-operand machine
5995 // instructions, so we cannot reassociate MULs.
5996 case AArch64::ADDWrr:
5997 case AArch64::ADDXrr:
5998 case AArch64::ANDWrr:
5999 case AArch64::ANDXrr:
6000 case AArch64::ORRWrr:
6001 case AArch64::ORRXrr:
6002 case AArch64::EORWrr:
6003 case AArch64::EORXrr:
6004 case AArch64::EONWrr:
6005 case AArch64::EONXrr:
6006 // -- Advanced SIMD instructions --
6007 // Opcodes MULv1i64 and MULv2i64 don't exist because there is no 64-bit MUL
6008 // in the Advanced SIMD instruction set.
6009 case AArch64::ADDv8i8:
6010 case AArch64::ADDv16i8:
6011 case AArch64::ADDv4i16:
6012 case AArch64::ADDv8i16:
6013 case AArch64::ADDv2i32:
6014 case AArch64::ADDv4i32:
6015 case AArch64::ADDv1i64:
6016 case AArch64::ADDv2i64:
6017 case AArch64::MULv8i8:
6018 case AArch64::MULv16i8:
6019 case AArch64::MULv4i16:
6020 case AArch64::MULv8i16:
6021 case AArch64::MULv2i32:
6022 case AArch64::MULv4i32:
6023 case AArch64::ANDv8i8:
6024 case AArch64::ANDv16i8:
6025 case AArch64::ORRv8i8:
6026 case AArch64::ORRv16i8:
6027 case AArch64::EORv8i8:
6028 case AArch64::EORv16i8:
6029 // -- SVE instructions --
6030 case AArch64::ADD_ZZZ_B:
6031 case AArch64::ADD_ZZZ_H:
6032 case AArch64::ADD_ZZZ_S:
6033 case AArch64::ADD_ZZZ_D:
6034 case AArch64::MUL_ZZZ_B:
6035 case AArch64::MUL_ZZZ_H:
6036 case AArch64::MUL_ZZZ_S:
6037 case AArch64::MUL_ZZZ_D:
6038 case AArch64::AND_ZZZ:
6039 case AArch64::ORR_ZZZ:
6040 case AArch64::EOR_ZZZ:
6041 return true;
6042
6043 default:
6044 return false;
6045 }
6046}
6047
6048/// Find instructions that can be turned into madd.
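/// For example (illustrative), an ADDWrr whose first source operand is
/// produced by a MADDWrrr with a WZR accumulator (i.e. a plain MUL) matches
/// MULADDW_OP1, so the machine combiner can later fuse the multiply and the
/// add into a single MADD.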
6049static bool getMaddPatterns(MachineInstr &Root,
6050 SmallVectorImpl<unsigned> &Patterns) {
6051 unsigned Opc = Root.getOpcode();
6052 MachineBasicBlock &MBB = *Root.getParent();
6053 bool Found = false;
6054
6055 if (!isCombineInstrCandidate(Opc))
6056 return false;
6057 if (isCombineInstrSettingFlag(Opc)) {
6058 int Cmp_NZCV =
6059 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true);
6060 // When NZCV is live, bail out.
6061 if (Cmp_NZCV == -1)
6062 return false;
6063 unsigned NewOpc = convertToNonFlagSettingOpc(MI: Root);
6064 // When the opcode can't change, bail out.
6065 // CHECKME: do we miss any cases for opcode conversion?
6066 if (NewOpc == Opc)
6067 return false;
6068 Opc = NewOpc;
6069 }
6070
6071 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg,
6072 unsigned Pattern) {
6073 if (canCombineWithMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode, ZeroReg)) {
6074 Patterns.push_back(Elt: Pattern);
6075 Found = true;
6076 }
6077 };
6078
6079 auto setVFound = [&](int Opcode, int Operand, unsigned Pattern) {
6080 if (canCombine(MBB, MO&: Root.getOperand(i: Operand), CombineOpc: Opcode)) {
6081 Patterns.push_back(Elt: Pattern);
6082 Found = true;
6083 }
6084 };
6085
6086 typedef AArch64MachineCombinerPattern MCP;
6087
6088 switch (Opc) {
6089 default:
6090 break;
6091 case AArch64::ADDWrr:
6092 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6093 "ADDWrr does not have register operands");
6094 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1);
6095 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2);
6096 break;
6097 case AArch64::ADDXrr:
6098 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1);
6099 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2);
6100 break;
6101 case AArch64::SUBWrr:
6102 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2);
6103 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1);
6104 break;
6105 case AArch64::SUBXrr:
6106 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2);
6107 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBX_OP1);
6108 break;
6109 case AArch64::ADDWri:
6110 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1);
6111 break;
6112 case AArch64::ADDXri:
6113 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1);
6114 break;
6115 case AArch64::SUBWri:
6116 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1);
6117 break;
6118 case AArch64::SUBXri:
6119 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
6120 break;
6121 case AArch64::ADDv8i8:
6122 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
6123 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
6124 break;
6125 case AArch64::ADDv16i8:
6126 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
6127 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
6128 break;
6129 case AArch64::ADDv4i16:
6130 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
6131 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
6132 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
6133 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
6134 break;
6135 case AArch64::ADDv8i16:
6136 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
6137 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
6138 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
6139 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
6140 break;
6141 case AArch64::ADDv2i32:
6142 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
6143 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
6144 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
6145 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
6146 break;
6147 case AArch64::ADDv4i32:
6148 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
6149 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
6150 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
6151 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
6152 break;
6153 case AArch64::SUBv8i8:
6154 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
6155 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
6156 break;
6157 case AArch64::SUBv16i8:
6158 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
6159 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
6160 break;
6161 case AArch64::SUBv4i16:
6162 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
6163 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
6164 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
6165 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
6166 break;
6167 case AArch64::SUBv8i16:
6168 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
6169 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
6170 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
6171 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
6172 break;
6173 case AArch64::SUBv2i32:
6174 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
6175 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
6176 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
6177 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
6178 break;
6179 case AArch64::SUBv4i32:
6180 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
6181 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
6182 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
6183 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
6184 break;
6185 }
6186 return Found;
6187}
6188/// Floating-Point Support
6189
6190/// Find floating-point instructions that can be turned into fmadd/fmsub/fmla/fmls.
6191static bool getFMAPatterns(MachineInstr &Root,
6192 SmallVectorImpl<unsigned> &Patterns) {
6193
6194 if (!isCombineInstrCandidateFP(Inst: Root))
6195 return false;
6196
6197 MachineBasicBlock &MBB = *Root.getParent();
6198 bool Found = false;
6199
6200 auto Match = [&](int Opcode, int Operand, unsigned Pattern) -> bool {
6201 if (canCombineWithFMUL(MBB, MO&: Root.getOperand(i: Operand), MulOpc: Opcode)) {
6202 Patterns.push_back(Elt: Pattern);
6203 return true;
6204 }
6205 return false;
6206 };
6207
6208 typedef AArch64MachineCombinerPattern MCP;
6209
6210 switch (Root.getOpcode()) {
6211 default:
6212 assert(false && "Unsupported FP instruction in combiner\n");
6213 break;
6214 case AArch64::FADDHrr:
6215 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6216 "FADDHrr does not have register operands");
6217
6218 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1);
6219 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2);
6220 break;
6221 case AArch64::FADDSrr:
6222 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
6223 "FADDSrr does not have register operands");
6224
6225 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) ||
6226 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1);
6227
6228 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) ||
6229 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2);
6230 break;
6231 case AArch64::FADDDrr:
6232 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) ||
6233 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1);
6234
6235 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) ||
6236 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2);
6237 break;
6238 case AArch64::FADDv4f16:
6239 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) ||
6240 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1);
6241
6242 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) ||
6243 Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2);
6244 break;
6245 case AArch64::FADDv8f16:
6246 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) ||
6247 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1);
6248
6249 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) ||
6250 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2);
6251 break;
6252 case AArch64::FADDv2f32:
6253 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) ||
6254 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1);
6255
6256 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) ||
6257 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2);
6258 break;
6259 case AArch64::FADDv2f64:
6260 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) ||
6261 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1);
6262
6263 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) ||
6264 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2);
6265 break;
6266 case AArch64::FADDv4f32:
6267 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) ||
6268 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1);
6269
6270 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) ||
6271 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2);
6272 break;
6273 case AArch64::FSUBHrr:
6274 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1);
6275 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2);
6276 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1);
6277 break;
6278 case AArch64::FSUBSrr:
6279 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1);
6280
6281 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) ||
6282 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2);
6283
6284 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1);
6285 break;
6286 case AArch64::FSUBDrr:
6287 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1);
6288
6289 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) ||
6290 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);
6291
6292 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
6293 break;
6294 case AArch64::FSUBv4f16:
6295 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) ||
6296 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);
6297
6298 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) ||
6299 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
6300 break;
6301 case AArch64::FSUBv8f16:
6302 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) ||
6303 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);
6304
6305 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) ||
6306 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
6307 break;
6308 case AArch64::FSUBv2f32:
6309 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) ||
6310 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2);
6311
6312 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) ||
6313 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1);
6314 break;
6315 case AArch64::FSUBv2f64:
6316 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) ||
6317 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2);
6318
6319 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) ||
6320 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1);
6321 break;
6322 case AArch64::FSUBv4f32:
6323 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) ||
6324 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2);
6325
6326 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) ||
6327 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1);
6328 break;
6329 }
6330 return Found;
6331}
6332
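// Find FMULs fed by a lane DUP that can be rewritten as indexed FMULs, e.g.
// (illustrative) FMULv2f32 %x, (DUPv2i32lane %q, lane) can later become
// FMULv2i32_indexed %x, %q, lane, saving the DUP.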
6333static bool getFMULPatterns(MachineInstr &Root,
6334 SmallVectorImpl<unsigned> &Patterns) {
6335 MachineBasicBlock &MBB = *Root.getParent();
6336 bool Found = false;
6337
6338 auto Match = [&](unsigned Opcode, int Operand, unsigned Pattern) -> bool {
6339 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6340 MachineOperand &MO = Root.getOperand(i: Operand);
6341 MachineInstr *MI = nullptr;
6342 if (MO.isReg() && MO.getReg().isVirtual())
6343 MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
6344 // Ignore No-op COPYs in FMUL(COPY(DUP(..)))
6345 if (MI && MI->getOpcode() == TargetOpcode::COPY &&
6346 MI->getOperand(i: 1).getReg().isVirtual())
6347 MI = MRI.getUniqueVRegDef(Reg: MI->getOperand(i: 1).getReg());
6348 if (MI && MI->getOpcode() == Opcode) {
6349 Patterns.push_back(Elt: Pattern);
6350 return true;
6351 }
6352 return false;
6353 };
6354
6355 typedef AArch64MachineCombinerPattern MCP;
6356
6357 switch (Root.getOpcode()) {
6358 default:
6359 return false;
6360 case AArch64::FMULv2f32:
6361 Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
6362 Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
6363 break;
6364 case AArch64::FMULv2f64:
6365 Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
6366 Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
6367 break;
6368 case AArch64::FMULv4f16:
6369 Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
6370 Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
6371 break;
6372 case AArch64::FMULv4f32:
6373 Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
6374 Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
6375 break;
6376 case AArch64::FMULv8f16:
6377 Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
6378 Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
6379 break;
6380 }
6381
6382 return Found;
6383}
6384
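// Match an FNEG of an FMADD (when both carry the contract and nsz fast-math
// flags) so that the pair can later be rewritten as a single FNMADD, which
// computes -(a * b + c).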
6385static bool getFNEGPatterns(MachineInstr &Root,
6386 SmallVectorImpl<unsigned> &Patterns) {
6387 unsigned Opc = Root.getOpcode();
6388 MachineBasicBlock &MBB = *Root.getParent();
6389 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6390
6391 auto Match = [&](unsigned Opcode, unsigned Pattern) -> bool {
6392 MachineOperand &MO = Root.getOperand(i: 1);
6393 MachineInstr *MI = MRI.getUniqueVRegDef(Reg: MO.getReg());
6394 if (MI != nullptr && (MI->getOpcode() == Opcode) &&
6395 MRI.hasOneNonDBGUse(RegNo: MI->getOperand(i: 0).getReg()) &&
6396 Root.getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
6397 Root.getFlag(Flag: MachineInstr::MIFlag::FmNsz) &&
6398 MI->getFlag(Flag: MachineInstr::MIFlag::FmContract) &&
6399 MI->getFlag(Flag: MachineInstr::MIFlag::FmNsz)) {
6400 Patterns.push_back(Elt: Pattern);
6401 return true;
6402 }
6403 return false;
6404 };
6405
6406 switch (Opc) {
6407 default:
6408 break;
6409 case AArch64::FNEGDr:
6410 return Match(AArch64::FMADDDrrr, AArch64MachineCombinerPattern::FNMADD);
6411 case AArch64::FNEGSr:
6412 return Match(AArch64::FMADDSrrr, AArch64MachineCombinerPattern::FNMADD);
6413 }
6414
6415 return false;
6416}
6417
6418/// Return true when a code sequence can improve throughput. It
6419/// should be called only for instructions in loops.
6420/// \param Pattern - combiner pattern
6421bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
6422 switch (Pattern) {
6423 default:
6424 break;
6425 case AArch64MachineCombinerPattern::FMULADDH_OP1:
6426 case AArch64MachineCombinerPattern::FMULADDH_OP2:
6427 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
6428 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
6429 case AArch64MachineCombinerPattern::FMULADDS_OP1:
6430 case AArch64MachineCombinerPattern::FMULADDS_OP2:
6431 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
6432 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
6433 case AArch64MachineCombinerPattern::FMULADDD_OP1:
6434 case AArch64MachineCombinerPattern::FMULADDD_OP2:
6435 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
6436 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
6437 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
6438 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
6439 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
6440 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
6441 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
6442 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
6443 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
6444 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
6445 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
6446 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
6447 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
6448 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
6449 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
6450 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
6451 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
6452 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
6453 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
6454 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
6455 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
6456 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
6457 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
6458 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
6459 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
6460 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
6461 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
6462 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
6463 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
6464 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1:
6465 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
6466 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1:
6467 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
6468 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
6469 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
6470 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
6471 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
6472 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
6473 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
6474 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
6475 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
6476 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
6477 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
6478 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
6479 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
6480 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
6481 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2:
6482 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
6483 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2:
6484 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
6485 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2:
6486 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
6487 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2:
6488 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
6489 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2:
6490 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
6491 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
6492 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
6493 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
6494 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
6495 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
6496 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
6497 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
6498 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
6499 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
6500 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
6501 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
6502 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
6503 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
6504 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
6505 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
6506 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
6507 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
6508 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
6509 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
6510 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
6511 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
6512 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
6513 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
6514 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
6515 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
6516 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
6517 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
6518 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
6519 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
6520 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
6521 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
6522 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
6523 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
6524 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
6525 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
6526 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
6527 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
6528 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
6529 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
6530 return true;
6531 } // end switch (Pattern)
6532 return false;
6533}
6534
6535/// Find other MI combine patterns.
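/// Currently this recognizes a subtract whose second operand is produced by
/// an add, e.g. (illustrative operands):
///   ADD I, B, C
///   SUB R, A, I
/// and records the SUBADD_OP1/SUBADD_OP2 patterns so the add can be folded
/// into two dependent subtracts (see genSubAdd2SubSub below).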
6536static bool getMiscPatterns(MachineInstr &Root,
6537 SmallVectorImpl<unsigned> &Patterns) {
6538 // A - (B + C) ==> (A - B) - C or (A - C) - B
6539 unsigned Opc = Root.getOpcode();
6540 MachineBasicBlock &MBB = *Root.getParent();
6541
6542 switch (Opc) {
6543 case AArch64::SUBWrr:
6544 case AArch64::SUBSWrr:
6545 case AArch64::SUBXrr:
6546 case AArch64::SUBSXrr:
6547 // Found candidate root.
6548 break;
6549 default:
6550 return false;
6551 }
6552
6553 if (isCombineInstrSettingFlag(Opc) &&
6554 Root.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr, true) ==
6555 -1)
6556 return false;
6557
6558 if (canCombine(MBB, Root.getOperand(2), AArch64::ADDWrr) ||
6559 canCombine(MBB, Root.getOperand(2), AArch64::ADDSWrr) ||
6560 canCombine(MBB, Root.getOperand(2), AArch64::ADDXrr) ||
6561 canCombine(MBB, Root.getOperand(2), AArch64::ADDSXrr)) {
6562 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP1);
6563 Patterns.push_back(Elt: AArch64MachineCombinerPattern::SUBADD_OP2);
6564 return true;
6565 }
6566
6567 return false;
6568}
6569
6570CombinerObjective
6571AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const {
6572 switch (Pattern) {
6573 case AArch64MachineCombinerPattern::SUBADD_OP1:
6574 case AArch64MachineCombinerPattern::SUBADD_OP2:
6575 return CombinerObjective::MustReduceDepth;
6576 default:
6577 return TargetInstrInfo::getCombinerObjective(Pattern);
6578 }
6579}
6580
/// Return true when there is potentially a faster code sequence for an
/// instruction chain ending in \p Root. All potential patterns are listed in
/// the \p Patterns vector. Patterns should be sorted in priority order, since
/// the pattern evaluator stops checking as soon as it finds a faster sequence.
6586bool AArch64InstrInfo::getMachineCombinerPatterns(
6587 MachineInstr &Root, SmallVectorImpl<unsigned> &Patterns,
6588 bool DoRegPressureReduce) const {
6589 // Integer patterns
6590 if (getMaddPatterns(Root, Patterns))
6591 return true;
6592 // Floating point patterns
6593 if (getFMULPatterns(Root, Patterns))
6594 return true;
6595 if (getFMAPatterns(Root, Patterns))
6596 return true;
6597 if (getFNEGPatterns(Root, Patterns))
6598 return true;
6599
6600 // Other patterns
6601 if (getMiscPatterns(Root, Patterns))
6602 return true;
6603
6604 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
6605 DoRegPressureReduce);
6606}
6607
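// FMAInstKind describes the operand order used by genFusedMultiply below:
// Default emits the scalar MADD/FMADD operand order (mul operands first, then
// the addend), while Indexed and Accumulator place the accumulator first and
// Indexed additionally appends the lane immediate of the original multiply.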
6608enum class FMAInstKind { Default, Indexed, Accumulator };
6609/// genFusedMultiply - Generate fused multiply instructions.
6610/// This function supports both integer and floating point instructions.
6611/// A typical example:
6612/// F|MUL I=A,B,0
6613/// F|ADD R,I,C
6614/// ==> F|MADD R,A,B,C
6615/// \param MF Containing MachineFunction
6616/// \param MRI Register information
6617/// \param TII Target information
6618/// \param Root is the F|ADD instruction
6619/// \param [out] InsInstrs is a vector of machine instructions and will
6620/// contain the generated madd instruction
6621/// \param IdxMulOpd is index of operand in Root that is the result of
6622/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
/// \param RC Register class of operands
/// \param kind The kind of FMA instruction (addressing mode) to be generated
6626/// \param ReplacedAddend is the result register from the instruction
6627/// replacing the non-combined operand, if any.
6628static MachineInstr *
6629genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
6630 const TargetInstrInfo *TII, MachineInstr &Root,
6631 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd,
6632 unsigned MaddOpc, const TargetRegisterClass *RC,
6633 FMAInstKind kind = FMAInstKind::Default,
6634 const Register *ReplacedAddend = nullptr) {
6635 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6636
6637 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
6638 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
6639 Register ResultReg = Root.getOperand(i: 0).getReg();
6640 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
6641 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
6642 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
6643 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
6644
6645 Register SrcReg2;
6646 bool Src2IsKill;
6647 if (ReplacedAddend) {
// If we just generated a new addend, this is its only use.
6649 SrcReg2 = *ReplacedAddend;
6650 Src2IsKill = true;
6651 } else {
6652 SrcReg2 = Root.getOperand(i: IdxOtherOpd).getReg();
6653 Src2IsKill = Root.getOperand(i: IdxOtherOpd).isKill();
6654 }
6655
6656 if (ResultReg.isVirtual())
6657 MRI.constrainRegClass(Reg: ResultReg, RC);
6658 if (SrcReg0.isVirtual())
6659 MRI.constrainRegClass(Reg: SrcReg0, RC);
6660 if (SrcReg1.isVirtual())
6661 MRI.constrainRegClass(Reg: SrcReg1, RC);
6662 if (SrcReg2.isVirtual())
6663 MRI.constrainRegClass(Reg: SrcReg2, RC);
6664
6665 MachineInstrBuilder MIB;
6666 if (kind == FMAInstKind::Default)
6667 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6668 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6669 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6670 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill));
6671 else if (kind == FMAInstKind::Indexed)
6672 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6673 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill))
6674 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6675 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6676 .addImm(Val: MUL->getOperand(i: 3).getImm());
6677 else if (kind == FMAInstKind::Accumulator)
6678 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6679 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill))
6680 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6681 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill));
6682 else
6683 assert(false && "Invalid FMA instruction kind \n");
// Insert the MADD (MADD, FMA, FMS, FMLA, FMLS)
6685 InsInstrs.push_back(Elt: MIB);
6686 return MUL;
6687}
6688
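/// genFNegatedMAD - Fold the negation of an FMADD result into a single
/// FNMADD, e.g. (illustrative, scalar double):
///   FMADD I, A, B, C
///   FNEG  R, I
///   ==> FNMADD R, A, B, C
/// Returns the matched FMADD so it can be marked for deletion, or nullptr if
/// the register class is neither FPR32 nor FPR64.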
6689static MachineInstr *
6690genFNegatedMAD(MachineFunction &MF, MachineRegisterInfo &MRI,
6691 const TargetInstrInfo *TII, MachineInstr &Root,
6692 SmallVectorImpl<MachineInstr *> &InsInstrs) {
6693 MachineInstr *MAD = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 1).getReg());
6694
6695 unsigned Opc = 0;
6696 const TargetRegisterClass *RC = MRI.getRegClass(Reg: MAD->getOperand(i: 0).getReg());
6697 if (AArch64::FPR32RegClass.hasSubClassEq(RC))
6698 Opc = AArch64::FNMADDSrrr;
6699 else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
6700 Opc = AArch64::FNMADDDrrr;
6701 else
6702 return nullptr;
6703
6704 Register ResultReg = Root.getOperand(i: 0).getReg();
6705 Register SrcReg0 = MAD->getOperand(i: 1).getReg();
6706 Register SrcReg1 = MAD->getOperand(i: 2).getReg();
6707 Register SrcReg2 = MAD->getOperand(i: 3).getReg();
6708 bool Src0IsKill = MAD->getOperand(i: 1).isKill();
6709 bool Src1IsKill = MAD->getOperand(i: 2).isKill();
6710 bool Src2IsKill = MAD->getOperand(i: 3).isKill();
6711 if (ResultReg.isVirtual())
6712 MRI.constrainRegClass(Reg: ResultReg, RC);
6713 if (SrcReg0.isVirtual())
6714 MRI.constrainRegClass(Reg: SrcReg0, RC);
6715 if (SrcReg1.isVirtual())
6716 MRI.constrainRegClass(Reg: SrcReg1, RC);
6717 if (SrcReg2.isVirtual())
6718 MRI.constrainRegClass(Reg: SrcReg2, RC);
6719
6720 MachineInstrBuilder MIB =
6721 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: Opc), DestReg: ResultReg)
6722 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6723 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6724 .addReg(RegNo: SrcReg2, flags: getKillRegState(B: Src2IsKill));
6725 InsInstrs.push_back(Elt: MIB);
6726
6727 return MAD;
6728}
6729
6730/// Fold (FMUL x (DUP y lane)) into (FMUL_indexed x y lane)
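/// For example (illustrative operands):
///   DUP  D, Y, lane
///   FMUL R, X, D
///   ==> FMUL_indexed R, X, Y, lane
/// A COPY between the DUP and the FMUL is looked through.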
6731static MachineInstr *
6732genIndexedMultiply(MachineInstr &Root,
6733 SmallVectorImpl<MachineInstr *> &InsInstrs,
6734 unsigned IdxDupOp, unsigned MulOpc,
6735 const TargetRegisterClass *RC, MachineRegisterInfo &MRI) {
6736 assert(((IdxDupOp == 1) || (IdxDupOp == 2)) &&
6737 "Invalid index of FMUL operand");
6738
6739 MachineFunction &MF = *Root.getMF();
6740 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6741
6742 MachineInstr *Dup =
6743 MF.getRegInfo().getUniqueVRegDef(Reg: Root.getOperand(i: IdxDupOp).getReg());
6744
6745 if (Dup->getOpcode() == TargetOpcode::COPY)
6746 Dup = MRI.getUniqueVRegDef(Reg: Dup->getOperand(i: 1).getReg());
6747
6748 Register DupSrcReg = Dup->getOperand(i: 1).getReg();
6749 MRI.clearKillFlags(Reg: DupSrcReg);
6750 MRI.constrainRegClass(Reg: DupSrcReg, RC);
6751
6752 unsigned DupSrcLane = Dup->getOperand(i: 2).getImm();
6753
6754 unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
6755 MachineOperand &MulOp = Root.getOperand(i: IdxMulOp);
6756
6757 Register ResultReg = Root.getOperand(i: 0).getReg();
6758
6759 MachineInstrBuilder MIB;
6760 MIB = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MulOpc), DestReg: ResultReg)
6761 .add(MO: MulOp)
6762 .addReg(RegNo: DupSrcReg)
6763 .addImm(Val: DupSrcLane);
6764
6765 InsInstrs.push_back(Elt: MIB);
6766 return &Root;
6767}
6768
6769/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
6770/// instructions.
6771///
6772/// \see genFusedMultiply
6773static MachineInstr *genFusedMultiplyAcc(
6774 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6775 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6776 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6777 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6778 kind: FMAInstKind::Accumulator);
6779}
6780
6781/// genNeg - Helper to generate an intermediate negation of the second operand
6782/// of Root
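/// The negated value is later used as the replaced addend of an accumulating
/// multiply, e.g. (illustrative, for a MULSUB*_OP1 pattern):
///   MUL I, A, B
///   SUB R, I, C
///   ==> NEG V, C
///   ==> MLA R, V, A, B   // R = -C + A*B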
6783static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
6784 const TargetInstrInfo *TII, MachineInstr &Root,
6785 SmallVectorImpl<MachineInstr *> &InsInstrs,
6786 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
6787 unsigned MnegOpc, const TargetRegisterClass *RC) {
6788 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
6789 MachineInstrBuilder MIB =
6790 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MnegOpc), DestReg: NewVR)
6791 .add(MO: Root.getOperand(i: 2));
6792 InsInstrs.push_back(Elt: MIB);
6793
6794 assert(InstrIdxForVirtReg.empty());
6795 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
6796
6797 return NewVR;
6798}
6799
6800/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
6801/// instructions with an additional negation of the accumulator
6802static MachineInstr *genFusedMultiplyAccNeg(
6803 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6804 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6805 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6806 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6807 assert(IdxMulOpd == 1);
6808
6809 Register NewVR =
6810 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6811 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6812 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
6813}
6814
/// genFusedMultiplyIdx - Helper to generate fused multiply indexed
/// instructions.
6817///
6818/// \see genFusedMultiply
6819static MachineInstr *genFusedMultiplyIdx(
6820 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6821 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6822 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
6823 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6824 kind: FMAInstKind::Indexed);
6825}
6826
/// genFusedMultiplyIdxNeg - Helper to generate fused multiply indexed
/// instructions with an additional negation of the accumulator
6829static MachineInstr *genFusedMultiplyIdxNeg(
6830 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
6831 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
6832 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
6833 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
6834 assert(IdxMulOpd == 1);
6835
6836 Register NewVR =
6837 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
6838
6839 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
6840 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
6841}
6842
6843/// genMaddR - Generate madd instruction and combine mul and add using
6844/// an extra virtual register
/// Example - an ADD immediate needs to be materialized in a register:
6846/// MUL I=A,B,0
6847/// ADD R,I,Imm
6848/// ==> ORR V, ZR, Imm
6849/// ==> MADD R,A,B,V
6850/// \param MF Containing MachineFunction
6851/// \param MRI Register information
6852/// \param TII Target information
6853/// \param Root is the ADD instruction
6854/// \param [out] InsInstrs is a vector of machine instructions and will
6855/// contain the generated madd instruction
6856/// \param IdxMulOpd is index of operand in Root that is the result of
6857/// the MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the madd instruction
6859/// \param VR is a virtual register that holds the value of an ADD operand
6860/// (V in the example above).
6861/// \param RC Register class of operands
6862static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
6863 const TargetInstrInfo *TII, MachineInstr &Root,
6864 SmallVectorImpl<MachineInstr *> &InsInstrs,
6865 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR,
6866 const TargetRegisterClass *RC) {
6867 assert(IdxMulOpd == 1 || IdxMulOpd == 2);
6868
6869 MachineInstr *MUL = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: IdxMulOpd).getReg());
6870 Register ResultReg = Root.getOperand(i: 0).getReg();
6871 Register SrcReg0 = MUL->getOperand(i: 1).getReg();
6872 bool Src0IsKill = MUL->getOperand(i: 1).isKill();
6873 Register SrcReg1 = MUL->getOperand(i: 2).getReg();
6874 bool Src1IsKill = MUL->getOperand(i: 2).isKill();
6875
6876 if (ResultReg.isVirtual())
6877 MRI.constrainRegClass(Reg: ResultReg, RC);
6878 if (SrcReg0.isVirtual())
6879 MRI.constrainRegClass(Reg: SrcReg0, RC);
6880 if (SrcReg1.isVirtual())
6881 MRI.constrainRegClass(Reg: SrcReg1, RC);
6882 if (Register::isVirtualRegister(Reg: VR))
6883 MRI.constrainRegClass(Reg: VR, RC);
6884
6885 MachineInstrBuilder MIB =
6886 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MaddOpc), DestReg: ResultReg)
6887 .addReg(RegNo: SrcReg0, flags: getKillRegState(B: Src0IsKill))
6888 .addReg(RegNo: SrcReg1, flags: getKillRegState(B: Src1IsKill))
6889 .addReg(RegNo: VR);
6890 // Insert the MADD
6891 InsInstrs.push_back(Elt: MIB);
6892 return MUL;
6893}
6894
6895/// Do the following transformation
6896/// A - (B + C) ==> (A - B) - C
6897/// A - (B + C) ==> (A - C) - B
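/// IdxOpd1 selects which operand of the add becomes B, e.g. (illustrative,
/// IdxOpd1 == 1):
///   ADD I, B, C
///   SUB R, A, I
///   ==> SUB V, A, B
///   ==> SUB R, V, C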
6898static void
6899genSubAdd2SubSub(MachineFunction &MF, MachineRegisterInfo &MRI,
6900 const TargetInstrInfo *TII, MachineInstr &Root,
6901 SmallVectorImpl<MachineInstr *> &InsInstrs,
6902 SmallVectorImpl<MachineInstr *> &DelInstrs,
6903 unsigned IdxOpd1,
6904 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
6905 assert(IdxOpd1 == 1 || IdxOpd1 == 2);
6906 unsigned IdxOtherOpd = IdxOpd1 == 1 ? 2 : 1;
6907 MachineInstr *AddMI = MRI.getUniqueVRegDef(Reg: Root.getOperand(i: 2).getReg());
6908
6909 Register ResultReg = Root.getOperand(i: 0).getReg();
6910 Register RegA = Root.getOperand(i: 1).getReg();
6911 bool RegAIsKill = Root.getOperand(i: 1).isKill();
6912 Register RegB = AddMI->getOperand(i: IdxOpd1).getReg();
6913 bool RegBIsKill = AddMI->getOperand(i: IdxOpd1).isKill();
6914 Register RegC = AddMI->getOperand(i: IdxOtherOpd).getReg();
6915 bool RegCIsKill = AddMI->getOperand(i: IdxOtherOpd).isKill();
6916 Register NewVR = MRI.createVirtualRegister(RegClass: MRI.getRegClass(Reg: RegA));
6917
6918 unsigned Opcode = Root.getOpcode();
6919 if (Opcode == AArch64::SUBSWrr)
6920 Opcode = AArch64::SUBWrr;
6921 else if (Opcode == AArch64::SUBSXrr)
6922 Opcode = AArch64::SUBXrr;
6923 else
6924 assert((Opcode == AArch64::SUBWrr || Opcode == AArch64::SUBXrr) &&
6925 "Unexpected instruction opcode.");
6926
6927 MachineInstrBuilder MIB1 =
6928 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: NewVR)
6929 .addReg(RegNo: RegA, flags: getKillRegState(B: RegAIsKill))
6930 .addReg(RegNo: RegB, flags: getKillRegState(B: RegBIsKill));
6931 MachineInstrBuilder MIB2 =
6932 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode), DestReg: ResultReg)
6933 .addReg(RegNo: NewVR, flags: getKillRegState(B: true))
6934 .addReg(RegNo: RegC, flags: getKillRegState(B: RegCIsKill));
6935
6936 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
6937 InsInstrs.push_back(Elt: MIB1);
6938 InsInstrs.push_back(Elt: MIB2);
6939 DelInstrs.push_back(Elt: AddMI);
6940}
6941
/// When getMachineCombinerPatterns() finds potential patterns, this function
/// generates the instructions that could replace the original code sequence.
6945void AArch64InstrInfo::genAlternativeCodeSequence(
6946 MachineInstr &Root, unsigned Pattern,
6947 SmallVectorImpl<MachineInstr *> &InsInstrs,
6948 SmallVectorImpl<MachineInstr *> &DelInstrs,
6949 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
6950 MachineBasicBlock &MBB = *Root.getParent();
6951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6952 MachineFunction &MF = *MBB.getParent();
6953 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
6954
6955 MachineInstr *MUL = nullptr;
6956 const TargetRegisterClass *RC;
6957 unsigned Opc;
6958 switch (Pattern) {
6959 default:
6960 // Reassociate instructions.
6961 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
6962 DelInstrs, InstrIdxForVirtReg);
6963 return;
6964 case AArch64MachineCombinerPattern::SUBADD_OP1:
6965 // A - (B + C)
6966 // ==> (A - B) - C
6967 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 1,
6968 InstrIdxForVirtReg);
6969 break;
6970 case AArch64MachineCombinerPattern::SUBADD_OP2:
6971 // A - (B + C)
6972 // ==> (A - C) - B
6973 genSubAdd2SubSub(MF, MRI, TII, Root, InsInstrs, DelInstrs, IdxOpd1: 2,
6974 InstrIdxForVirtReg);
6975 break;
6976 case AArch64MachineCombinerPattern::MULADDW_OP1:
6977 case AArch64MachineCombinerPattern::MULADDX_OP1:
6978 // MUL I=A,B,0
6979 // ADD R,I,C
6980 // ==> MADD R,A,B,C
6981 // --- Create(MADD);
6982 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP1) {
6983 Opc = AArch64::MADDWrrr;
6984 RC = &AArch64::GPR32RegClass;
6985 } else {
6986 Opc = AArch64::MADDXrrr;
6987 RC = &AArch64::GPR64RegClass;
6988 }
6989 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
6990 break;
6991 case AArch64MachineCombinerPattern::MULADDW_OP2:
6992 case AArch64MachineCombinerPattern::MULADDX_OP2:
6993 // MUL I=A,B,0
6994 // ADD R,C,I
6995 // ==> MADD R,A,B,C
6996 // --- Create(MADD);
6997 if (Pattern == AArch64MachineCombinerPattern::MULADDW_OP2) {
6998 Opc = AArch64::MADDWrrr;
6999 RC = &AArch64::GPR32RegClass;
7000 } else {
7001 Opc = AArch64::MADDXrrr;
7002 RC = &AArch64::GPR64RegClass;
7003 }
7004 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7005 break;
7006 case AArch64MachineCombinerPattern::MULADDWI_OP1:
7007 case AArch64MachineCombinerPattern::MULADDXI_OP1: {
7008 // MUL I=A,B,0
7009 // ADD R,I,Imm
7010 // ==> MOV V, Imm
7011 // ==> MADD R,A,B,V
7012 // --- Create(MADD);
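// For example (illustrative): for ADD R, I, #1, lsl #12 the shifted immediate
// 0x1000 fits a single MOVZ, which is emitted into V before the MADD.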
7013 const TargetRegisterClass *OrrRC;
7014 unsigned BitSize, OrrOpc, ZeroReg;
7015 if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
7016 OrrOpc = AArch64::ORRWri;
7017 OrrRC = &AArch64::GPR32spRegClass;
7018 BitSize = 32;
7019 ZeroReg = AArch64::WZR;
7020 Opc = AArch64::MADDWrrr;
7021 RC = &AArch64::GPR32RegClass;
7022 } else {
7023 OrrOpc = AArch64::ORRXri;
7024 OrrRC = &AArch64::GPR64spRegClass;
7025 BitSize = 64;
7026 ZeroReg = AArch64::XZR;
7027 Opc = AArch64::MADDXrrr;
7028 RC = &AArch64::GPR64RegClass;
7029 }
7030 Register NewVR = MRI.createVirtualRegister(RegClass: OrrRC);
7031 uint64_t Imm = Root.getOperand(i: 2).getImm();
7032
7033 if (Root.getOperand(i: 3).isImm()) {
7034 unsigned Val = Root.getOperand(i: 3).getImm();
7035 Imm = Imm << Val;
7036 }
7037 uint64_t UImm = SignExtend64(X: Imm, B: BitSize);
// The transformation is only valid if the immediate can be composed via a
// single instruction.
7039 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7040 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
7041 if (Insn.size() != 1)
7042 return;
7043 auto MovI = Insn.begin();
7044 MachineInstrBuilder MIB1;
7045 // MOV is an alias for one of three instructions: movz, movn, and orr.
7046 if (MovI->Opcode == OrrOpc)
7047 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: OrrOpc), DestReg: NewVR)
7048 .addReg(RegNo: ZeroReg)
7049 .addImm(Val: MovI->Op2);
7050 else {
7051 if (BitSize == 32)
7052 assert((MovI->Opcode == AArch64::MOVNWi ||
7053 MovI->Opcode == AArch64::MOVZWi) &&
7054 "Expected opcode");
7055 else
7056 assert((MovI->Opcode == AArch64::MOVNXi ||
7057 MovI->Opcode == AArch64::MOVZXi) &&
7058 "Expected opcode");
7059 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovI->Opcode), DestReg: NewVR)
7060 .addImm(Val: MovI->Op1)
7061 .addImm(Val: MovI->Op2);
7062 }
7063 InsInstrs.push_back(Elt: MIB1);
7064 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7065 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7066 break;
7067 }
7068 case AArch64MachineCombinerPattern::MULSUBW_OP1:
7069 case AArch64MachineCombinerPattern::MULSUBX_OP1: {
7070 // MUL I=A,B,0
// SUB R,I,C
7072 // ==> SUB V, 0, C
7073 // ==> MADD R,A,B,V // = -C + A*B
7074 // --- Create(MADD);
7075 const TargetRegisterClass *SubRC;
7076 unsigned SubOpc, ZeroReg;
7077 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP1) {
7078 SubOpc = AArch64::SUBWrr;
7079 SubRC = &AArch64::GPR32spRegClass;
7080 ZeroReg = AArch64::WZR;
7081 Opc = AArch64::MADDWrrr;
7082 RC = &AArch64::GPR32RegClass;
7083 } else {
7084 SubOpc = AArch64::SUBXrr;
7085 SubRC = &AArch64::GPR64spRegClass;
7086 ZeroReg = AArch64::XZR;
7087 Opc = AArch64::MADDXrrr;
7088 RC = &AArch64::GPR64RegClass;
7089 }
7090 Register NewVR = MRI.createVirtualRegister(RegClass: SubRC);
7091 // SUB NewVR, 0, C
7092 MachineInstrBuilder MIB1 =
7093 BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: SubOpc), DestReg: NewVR)
7094 .addReg(RegNo: ZeroReg)
7095 .add(MO: Root.getOperand(i: 2));
7096 InsInstrs.push_back(Elt: MIB1);
7097 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7098 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7099 break;
7100 }
7101 case AArch64MachineCombinerPattern::MULSUBW_OP2:
7102 case AArch64MachineCombinerPattern::MULSUBX_OP2:
7103 // MUL I=A,B,0
7104 // SUB R,C,I
7105 // ==> MSUB R,A,B,C (computes C - A*B)
7106 // --- Create(MSUB);
7107 if (Pattern == AArch64MachineCombinerPattern::MULSUBW_OP2) {
7108 Opc = AArch64::MSUBWrrr;
7109 RC = &AArch64::GPR32RegClass;
7110 } else {
7111 Opc = AArch64::MSUBXrrr;
7112 RC = &AArch64::GPR64RegClass;
7113 }
7114 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7115 break;
7116 case AArch64MachineCombinerPattern::MULSUBWI_OP1:
7117 case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
7118 // MUL I=A,B,0
// SUB R,I,Imm
7120 // ==> MOV V, -Imm
7121 // ==> MADD R,A,B,V // = -Imm + A*B
7122 // --- Create(MADD);
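// For example (illustrative): SUB R, I, #3 materializes #-3 (a single MOVN)
// into V and emits MADD R, A, B, V, i.e. R = A*B - 3.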
7123 const TargetRegisterClass *OrrRC;
7124 unsigned BitSize, OrrOpc, ZeroReg;
7125 if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
7126 OrrOpc = AArch64::ORRWri;
7127 OrrRC = &AArch64::GPR32spRegClass;
7128 BitSize = 32;
7129 ZeroReg = AArch64::WZR;
7130 Opc = AArch64::MADDWrrr;
7131 RC = &AArch64::GPR32RegClass;
7132 } else {
7133 OrrOpc = AArch64::ORRXri;
7134 OrrRC = &AArch64::GPR64spRegClass;
7135 BitSize = 64;
7136 ZeroReg = AArch64::XZR;
7137 Opc = AArch64::MADDXrrr;
7138 RC = &AArch64::GPR64RegClass;
7139 }
7140 Register NewVR = MRI.createVirtualRegister(RegClass: OrrRC);
7141 uint64_t Imm = Root.getOperand(i: 2).getImm();
7142 if (Root.getOperand(i: 3).isImm()) {
7143 unsigned Val = Root.getOperand(i: 3).getImm();
7144 Imm = Imm << Val;
7145 }
7146 uint64_t UImm = SignExtend64(X: -Imm, B: BitSize);
// The transformation is only valid if the immediate can be composed via a
// single instruction.
7148 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
7149 AArch64_IMM::expandMOVImm(Imm: UImm, BitSize, Insn);
7150 if (Insn.size() != 1)
7151 return;
7152 auto MovI = Insn.begin();
7153 MachineInstrBuilder MIB1;
7154 // MOV is an alias for one of three instructions: movz, movn, and orr.
7155 if (MovI->Opcode == OrrOpc)
7156 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: OrrOpc), DestReg: NewVR)
7157 .addReg(RegNo: ZeroReg)
7158 .addImm(Val: MovI->Op2);
7159 else {
7160 if (BitSize == 32)
7161 assert((MovI->Opcode == AArch64::MOVNWi ||
7162 MovI->Opcode == AArch64::MOVZWi) &&
7163 "Expected opcode");
7164 else
7165 assert((MovI->Opcode == AArch64::MOVNXi ||
7166 MovI->Opcode == AArch64::MOVZXi) &&
7167 "Expected opcode");
7168 MIB1 = BuildMI(MF, MIMD: MIMetadata(Root), MCID: TII->get(Opcode: MovI->Opcode), DestReg: NewVR)
7169 .addImm(Val: MovI->Op1)
7170 .addImm(Val: MovI->Op2);
7171 }
7172 InsInstrs.push_back(Elt: MIB1);
7173 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7174 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, VR: NewVR, RC);
7175 break;
7176 }
7177
7178 case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
7179 Opc = AArch64::MLAv8i8;
7180 RC = &AArch64::FPR64RegClass;
7181 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7182 break;
7183 case AArch64MachineCombinerPattern::MULADDv8i8_OP2:
7184 Opc = AArch64::MLAv8i8;
7185 RC = &AArch64::FPR64RegClass;
7186 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7187 break;
7188 case AArch64MachineCombinerPattern::MULADDv16i8_OP1:
7189 Opc = AArch64::MLAv16i8;
7190 RC = &AArch64::FPR128RegClass;
7191 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7192 break;
7193 case AArch64MachineCombinerPattern::MULADDv16i8_OP2:
7194 Opc = AArch64::MLAv16i8;
7195 RC = &AArch64::FPR128RegClass;
7196 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7197 break;
7198 case AArch64MachineCombinerPattern::MULADDv4i16_OP1:
7199 Opc = AArch64::MLAv4i16;
7200 RC = &AArch64::FPR64RegClass;
7201 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7202 break;
7203 case AArch64MachineCombinerPattern::MULADDv4i16_OP2:
7204 Opc = AArch64::MLAv4i16;
7205 RC = &AArch64::FPR64RegClass;
7206 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7207 break;
7208 case AArch64MachineCombinerPattern::MULADDv8i16_OP1:
7209 Opc = AArch64::MLAv8i16;
7210 RC = &AArch64::FPR128RegClass;
7211 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7212 break;
7213 case AArch64MachineCombinerPattern::MULADDv8i16_OP2:
7214 Opc = AArch64::MLAv8i16;
7215 RC = &AArch64::FPR128RegClass;
7216 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7217 break;
7218 case AArch64MachineCombinerPattern::MULADDv2i32_OP1:
7219 Opc = AArch64::MLAv2i32;
7220 RC = &AArch64::FPR64RegClass;
7221 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7222 break;
7223 case AArch64MachineCombinerPattern::MULADDv2i32_OP2:
7224 Opc = AArch64::MLAv2i32;
7225 RC = &AArch64::FPR64RegClass;
7226 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7227 break;
7228 case AArch64MachineCombinerPattern::MULADDv4i32_OP1:
7229 Opc = AArch64::MLAv4i32;
7230 RC = &AArch64::FPR128RegClass;
7231 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7232 break;
7233 case AArch64MachineCombinerPattern::MULADDv4i32_OP2:
7234 Opc = AArch64::MLAv4i32;
7235 RC = &AArch64::FPR128RegClass;
7236 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7237 break;
7238
7239 case AArch64MachineCombinerPattern::MULSUBv8i8_OP1:
7240 Opc = AArch64::MLAv8i8;
7241 RC = &AArch64::FPR64RegClass;
7242 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7243 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
7244 RC);
7245 break;
7246 case AArch64MachineCombinerPattern::MULSUBv8i8_OP2:
7247 Opc = AArch64::MLSv8i8;
7248 RC = &AArch64::FPR64RegClass;
7249 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7250 break;
7251 case AArch64MachineCombinerPattern::MULSUBv16i8_OP1:
7252 Opc = AArch64::MLAv16i8;
7253 RC = &AArch64::FPR128RegClass;
7254 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7255 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
7256 RC);
7257 break;
7258 case AArch64MachineCombinerPattern::MULSUBv16i8_OP2:
7259 Opc = AArch64::MLSv16i8;
7260 RC = &AArch64::FPR128RegClass;
7261 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7262 break;
7263 case AArch64MachineCombinerPattern::MULSUBv4i16_OP1:
7264 Opc = AArch64::MLAv4i16;
7265 RC = &AArch64::FPR64RegClass;
7266 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7267 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7268 RC);
7269 break;
7270 case AArch64MachineCombinerPattern::MULSUBv4i16_OP2:
7271 Opc = AArch64::MLSv4i16;
7272 RC = &AArch64::FPR64RegClass;
7273 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7274 break;
7275 case AArch64MachineCombinerPattern::MULSUBv8i16_OP1:
7276 Opc = AArch64::MLAv8i16;
7277 RC = &AArch64::FPR128RegClass;
7278 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7279 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7280 RC);
7281 break;
7282 case AArch64MachineCombinerPattern::MULSUBv8i16_OP2:
7283 Opc = AArch64::MLSv8i16;
7284 RC = &AArch64::FPR128RegClass;
7285 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7286 break;
7287 case AArch64MachineCombinerPattern::MULSUBv2i32_OP1:
7288 Opc = AArch64::MLAv2i32;
7289 RC = &AArch64::FPR64RegClass;
7290 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7291 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7292 RC);
7293 break;
7294 case AArch64MachineCombinerPattern::MULSUBv2i32_OP2:
7295 Opc = AArch64::MLSv2i32;
7296 RC = &AArch64::FPR64RegClass;
7297 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7298 break;
7299 case AArch64MachineCombinerPattern::MULSUBv4i32_OP1:
7300 Opc = AArch64::MLAv4i32;
7301 RC = &AArch64::FPR128RegClass;
7302 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
7303 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7304 RC);
7305 break;
7306 case AArch64MachineCombinerPattern::MULSUBv4i32_OP2:
7307 Opc = AArch64::MLSv4i32;
7308 RC = &AArch64::FPR128RegClass;
7309 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7310 break;
7311
7312 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP1:
7313 Opc = AArch64::MLAv4i16_indexed;
7314 RC = &AArch64::FPR64RegClass;
7315 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7316 break;
7317 case AArch64MachineCombinerPattern::MULADDv4i16_indexed_OP2:
7318 Opc = AArch64::MLAv4i16_indexed;
7319 RC = &AArch64::FPR64RegClass;
7320 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7321 break;
7322 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP1:
7323 Opc = AArch64::MLAv8i16_indexed;
7324 RC = &AArch64::FPR128RegClass;
7325 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7326 break;
7327 case AArch64MachineCombinerPattern::MULADDv8i16_indexed_OP2:
7328 Opc = AArch64::MLAv8i16_indexed;
7329 RC = &AArch64::FPR128RegClass;
7330 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7331 break;
7332 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP1:
7333 Opc = AArch64::MLAv2i32_indexed;
7334 RC = &AArch64::FPR64RegClass;
7335 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7336 break;
7337 case AArch64MachineCombinerPattern::MULADDv2i32_indexed_OP2:
7338 Opc = AArch64::MLAv2i32_indexed;
7339 RC = &AArch64::FPR64RegClass;
7340 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7341 break;
7342 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP1:
7343 Opc = AArch64::MLAv4i32_indexed;
7344 RC = &AArch64::FPR128RegClass;
7345 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7346 break;
7347 case AArch64MachineCombinerPattern::MULADDv4i32_indexed_OP2:
7348 Opc = AArch64::MLAv4i32_indexed;
7349 RC = &AArch64::FPR128RegClass;
7350 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7351 break;
7352
7353 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
7354 Opc = AArch64::MLAv4i16_indexed;
7355 RC = &AArch64::FPR64RegClass;
7356 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7357 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
7358 RC);
7359 break;
7360 case AArch64MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
7361 Opc = AArch64::MLSv4i16_indexed;
7362 RC = &AArch64::FPR64RegClass;
7363 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7364 break;
7365 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
7366 Opc = AArch64::MLAv8i16_indexed;
7367 RC = &AArch64::FPR128RegClass;
7368 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7369 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
7370 RC);
7371 break;
7372 case AArch64MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
7373 Opc = AArch64::MLSv8i16_indexed;
7374 RC = &AArch64::FPR128RegClass;
7375 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7376 break;
7377 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
7378 Opc = AArch64::MLAv2i32_indexed;
7379 RC = &AArch64::FPR64RegClass;
7380 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7381 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
7382 RC);
7383 break;
7384 case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
7385 Opc = AArch64::MLSv2i32_indexed;
7386 RC = &AArch64::FPR64RegClass;
7387 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7388 break;
7389 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
7390 Opc = AArch64::MLAv4i32_indexed;
7391 RC = &AArch64::FPR128RegClass;
7392 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
7393 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
7394 RC);
7395 break;
7396 case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
7397 Opc = AArch64::MLSv4i32_indexed;
7398 RC = &AArch64::FPR128RegClass;
7399 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7400 break;
7401
7402 // Floating Point Support
7403 case AArch64MachineCombinerPattern::FMULADDH_OP1:
7404 Opc = AArch64::FMADDHrrr;
7405 RC = &AArch64::FPR16RegClass;
7406 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7407 break;
7408 case AArch64MachineCombinerPattern::FMULADDS_OP1:
7409 Opc = AArch64::FMADDSrrr;
7410 RC = &AArch64::FPR32RegClass;
7411 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7412 break;
7413 case AArch64MachineCombinerPattern::FMULADDD_OP1:
7414 Opc = AArch64::FMADDDrrr;
7415 RC = &AArch64::FPR64RegClass;
7416 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7417 break;
7418
7419 case AArch64MachineCombinerPattern::FMULADDH_OP2:
7420 Opc = AArch64::FMADDHrrr;
7421 RC = &AArch64::FPR16RegClass;
7422 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7423 break;
7424 case AArch64MachineCombinerPattern::FMULADDS_OP2:
7425 Opc = AArch64::FMADDSrrr;
7426 RC = &AArch64::FPR32RegClass;
7427 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7428 break;
7429 case AArch64MachineCombinerPattern::FMULADDD_OP2:
7430 Opc = AArch64::FMADDDrrr;
7431 RC = &AArch64::FPR64RegClass;
7432 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7433 break;
7434
7435 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP1:
7436 Opc = AArch64::FMLAv1i32_indexed;
7437 RC = &AArch64::FPR32RegClass;
7438 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7439 kind: FMAInstKind::Indexed);
7440 break;
7441 case AArch64MachineCombinerPattern::FMLAv1i32_indexed_OP2:
7442 Opc = AArch64::FMLAv1i32_indexed;
7443 RC = &AArch64::FPR32RegClass;
7444 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7445 kind: FMAInstKind::Indexed);
7446 break;
7447
7448 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP1:
7449 Opc = AArch64::FMLAv1i64_indexed;
7450 RC = &AArch64::FPR64RegClass;
7451 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7452 kind: FMAInstKind::Indexed);
7453 break;
7454 case AArch64MachineCombinerPattern::FMLAv1i64_indexed_OP2:
7455 Opc = AArch64::FMLAv1i64_indexed;
7456 RC = &AArch64::FPR64RegClass;
7457 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7458 kind: FMAInstKind::Indexed);
7459 break;
7460
7461 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP1:
7462 RC = &AArch64::FPR64RegClass;
7463 Opc = AArch64::FMLAv4i16_indexed;
7464 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7465 kind: FMAInstKind::Indexed);
7466 break;
7467 case AArch64MachineCombinerPattern::FMLAv4f16_OP1:
7468 RC = &AArch64::FPR64RegClass;
7469 Opc = AArch64::FMLAv4f16;
7470 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7471 kind: FMAInstKind::Accumulator);
7472 break;
7473 case AArch64MachineCombinerPattern::FMLAv4i16_indexed_OP2:
7474 RC = &AArch64::FPR64RegClass;
7475 Opc = AArch64::FMLAv4i16_indexed;
7476 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7477 kind: FMAInstKind::Indexed);
7478 break;
7479 case AArch64MachineCombinerPattern::FMLAv4f16_OP2:
7480 RC = &AArch64::FPR64RegClass;
7481 Opc = AArch64::FMLAv4f16;
7482 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7483 kind: FMAInstKind::Accumulator);
7484 break;
7485
7486 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1:
7487 case AArch64MachineCombinerPattern::FMLAv2f32_OP1:
7488 RC = &AArch64::FPR64RegClass;
7489 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
7490 Opc = AArch64::FMLAv2i32_indexed;
7491 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7492 kind: FMAInstKind::Indexed);
7493 } else {
7494 Opc = AArch64::FMLAv2f32;
7495 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7496 kind: FMAInstKind::Accumulator);
7497 }
7498 break;
7499 case AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2:
7500 case AArch64MachineCombinerPattern::FMLAv2f32_OP2:
7501 RC = &AArch64::FPR64RegClass;
7502 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
7503 Opc = AArch64::FMLAv2i32_indexed;
7504 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7505 kind: FMAInstKind::Indexed);
7506 } else {
7507 Opc = AArch64::FMLAv2f32;
7508 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7509 kind: FMAInstKind::Accumulator);
7510 }
7511 break;
7512
7513 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP1:
7514 RC = &AArch64::FPR128RegClass;
7515 Opc = AArch64::FMLAv8i16_indexed;
7516 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7517 kind: FMAInstKind::Indexed);
7518 break;
7519 case AArch64MachineCombinerPattern::FMLAv8f16_OP1:
7520 RC = &AArch64::FPR128RegClass;
7521 Opc = AArch64::FMLAv8f16;
7522 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7523 kind: FMAInstKind::Accumulator);
7524 break;
7525 case AArch64MachineCombinerPattern::FMLAv8i16_indexed_OP2:
7526 RC = &AArch64::FPR128RegClass;
7527 Opc = AArch64::FMLAv8i16_indexed;
7528 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7529 kind: FMAInstKind::Indexed);
7530 break;
7531 case AArch64MachineCombinerPattern::FMLAv8f16_OP2:
7532 RC = &AArch64::FPR128RegClass;
7533 Opc = AArch64::FMLAv8f16;
7534 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7535 kind: FMAInstKind::Accumulator);
7536 break;
7537
7538 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1:
7539 case AArch64MachineCombinerPattern::FMLAv2f64_OP1:
7540 RC = &AArch64::FPR128RegClass;
7541 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
7542 Opc = AArch64::FMLAv2i64_indexed;
7543 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7544 kind: FMAInstKind::Indexed);
7545 } else {
7546 Opc = AArch64::FMLAv2f64;
7547 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7548 kind: FMAInstKind::Accumulator);
7549 }
7550 break;
7551 case AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2:
7552 case AArch64MachineCombinerPattern::FMLAv2f64_OP2:
7553 RC = &AArch64::FPR128RegClass;
7554 if (Pattern == AArch64MachineCombinerPattern::FMLAv2i64_indexed_OP2) {
7555 Opc = AArch64::FMLAv2i64_indexed;
7556 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7557 kind: FMAInstKind::Indexed);
7558 } else {
7559 Opc = AArch64::FMLAv2f64;
7560 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7561 kind: FMAInstKind::Accumulator);
7562 }
7563 break;
7564
7565 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1:
7566 case AArch64MachineCombinerPattern::FMLAv4f32_OP1:
7567 RC = &AArch64::FPR128RegClass;
7568 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP1) {
7569 Opc = AArch64::FMLAv4i32_indexed;
7570 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7571 kind: FMAInstKind::Indexed);
7572 } else {
7573 Opc = AArch64::FMLAv4f32;
7574 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7575 kind: FMAInstKind::Accumulator);
7576 }
7577 break;
7578
7579 case AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2:
7580 case AArch64MachineCombinerPattern::FMLAv4f32_OP2:
7581 RC = &AArch64::FPR128RegClass;
7582 if (Pattern == AArch64MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
7583 Opc = AArch64::FMLAv4i32_indexed;
7584 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7585 kind: FMAInstKind::Indexed);
7586 } else {
7587 Opc = AArch64::FMLAv4f32;
7588 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7589 kind: FMAInstKind::Accumulator);
7590 }
7591 break;
7592
7593 case AArch64MachineCombinerPattern::FMULSUBH_OP1:
7594 Opc = AArch64::FNMSUBHrrr;
7595 RC = &AArch64::FPR16RegClass;
7596 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7597 break;
7598 case AArch64MachineCombinerPattern::FMULSUBS_OP1:
7599 Opc = AArch64::FNMSUBSrrr;
7600 RC = &AArch64::FPR32RegClass;
7601 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7602 break;
7603 case AArch64MachineCombinerPattern::FMULSUBD_OP1:
7604 Opc = AArch64::FNMSUBDrrr;
7605 RC = &AArch64::FPR64RegClass;
7606 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7607 break;
7608
7609 case AArch64MachineCombinerPattern::FNMULSUBH_OP1:
7610 Opc = AArch64::FNMADDHrrr;
7611 RC = &AArch64::FPR16RegClass;
7612 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7613 break;
7614 case AArch64MachineCombinerPattern::FNMULSUBS_OP1:
7615 Opc = AArch64::FNMADDSrrr;
7616 RC = &AArch64::FPR32RegClass;
7617 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7618 break;
7619 case AArch64MachineCombinerPattern::FNMULSUBD_OP1:
7620 Opc = AArch64::FNMADDDrrr;
7621 RC = &AArch64::FPR64RegClass;
7622 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC);
7623 break;
7624
7625 case AArch64MachineCombinerPattern::FMULSUBH_OP2:
7626 Opc = AArch64::FMSUBHrrr;
7627 RC = &AArch64::FPR16RegClass;
7628 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7629 break;
7630 case AArch64MachineCombinerPattern::FMULSUBS_OP2:
7631 Opc = AArch64::FMSUBSrrr;
7632 RC = &AArch64::FPR32RegClass;
7633 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7634 break;
7635 case AArch64MachineCombinerPattern::FMULSUBD_OP2:
7636 Opc = AArch64::FMSUBDrrr;
7637 RC = &AArch64::FPR64RegClass;
7638 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC);
7639 break;
7640
7641 case AArch64MachineCombinerPattern::FMLSv1i32_indexed_OP2:
7642 Opc = AArch64::FMLSv1i32_indexed;
7643 RC = &AArch64::FPR32RegClass;
7644 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7645 kind: FMAInstKind::Indexed);
7646 break;
7647
7648 case AArch64MachineCombinerPattern::FMLSv1i64_indexed_OP2:
7649 Opc = AArch64::FMLSv1i64_indexed;
7650 RC = &AArch64::FPR64RegClass;
7651 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7652 kind: FMAInstKind::Indexed);
7653 break;
7654
7655 case AArch64MachineCombinerPattern::FMLSv4f16_OP1:
7656 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
7657 RC = &AArch64::FPR64RegClass;
7658 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7659 MachineInstrBuilder MIB1 =
7660 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f16), NewVR)
7661 .add(Root.getOperand(2));
7662 InsInstrs.push_back(Elt: MIB1);
7663 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7664 if (Pattern == AArch64MachineCombinerPattern::FMLSv4f16_OP1) {
7665 Opc = AArch64::FMLAv4f16;
7666 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7667 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7668 } else {
7669 Opc = AArch64::FMLAv4i16_indexed;
7670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7671 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7672 }
7673 break;
7674 }
7675 case AArch64MachineCombinerPattern::FMLSv4f16_OP2:
7676 RC = &AArch64::FPR64RegClass;
7677 Opc = AArch64::FMLSv4f16;
7678 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7679 kind: FMAInstKind::Accumulator);
7680 break;
7681 case AArch64MachineCombinerPattern::FMLSv4i16_indexed_OP2:
7682 RC = &AArch64::FPR64RegClass;
7683 Opc = AArch64::FMLSv4i16_indexed;
7684 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7685 kind: FMAInstKind::Indexed);
7686 break;
7687
7688 case AArch64MachineCombinerPattern::FMLSv2f32_OP2:
7689 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2:
7690 RC = &AArch64::FPR64RegClass;
7691 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
7692 Opc = AArch64::FMLSv2i32_indexed;
7693 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7694 kind: FMAInstKind::Indexed);
7695 } else {
7696 Opc = AArch64::FMLSv2f32;
7697 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7698 kind: FMAInstKind::Accumulator);
7699 }
7700 break;
7701
7702 case AArch64MachineCombinerPattern::FMLSv8f16_OP1:
7703 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
7704 RC = &AArch64::FPR128RegClass;
7705 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7706 MachineInstrBuilder MIB1 =
7707 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv8f16), NewVR)
7708 .add(Root.getOperand(2));
7709 InsInstrs.push_back(Elt: MIB1);
7710 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7711 if (Pattern == AArch64MachineCombinerPattern::FMLSv8f16_OP1) {
7712 Opc = AArch64::FMLAv8f16;
7713 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7714 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7715 } else {
7716 Opc = AArch64::FMLAv8i16_indexed;
7717 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7718 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7719 }
7720 break;
7721 }
7722 case AArch64MachineCombinerPattern::FMLSv8f16_OP2:
7723 RC = &AArch64::FPR128RegClass;
7724 Opc = AArch64::FMLSv8f16;
7725 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7726 kind: FMAInstKind::Accumulator);
7727 break;
7728 case AArch64MachineCombinerPattern::FMLSv8i16_indexed_OP2:
7729 RC = &AArch64::FPR128RegClass;
7730 Opc = AArch64::FMLSv8i16_indexed;
7731 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7732 kind: FMAInstKind::Indexed);
7733 break;
7734
7735 case AArch64MachineCombinerPattern::FMLSv2f64_OP2:
7736 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2:
7737 RC = &AArch64::FPR128RegClass;
7738 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
7739 Opc = AArch64::FMLSv2i64_indexed;
7740 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7741 kind: FMAInstKind::Indexed);
7742 } else {
7743 Opc = AArch64::FMLSv2f64;
7744 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7745 kind: FMAInstKind::Accumulator);
7746 }
7747 break;
7748
7749 case AArch64MachineCombinerPattern::FMLSv4f32_OP2:
7750 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2:
7751 RC = &AArch64::FPR128RegClass;
7752 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP2) {
7753 Opc = AArch64::FMLSv4i32_indexed;
7754 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7755 kind: FMAInstKind::Indexed);
7756 } else {
7757 Opc = AArch64::FMLSv4f32;
7758 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 2, MaddOpc: Opc, RC,
7759 kind: FMAInstKind::Accumulator);
7760 }
7761 break;
7762 case AArch64MachineCombinerPattern::FMLSv2f32_OP1:
7763 case AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1: {
7764 RC = &AArch64::FPR64RegClass;
7765 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7766 MachineInstrBuilder MIB1 =
7767 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f32), NewVR)
7768 .add(Root.getOperand(2));
7769 InsInstrs.push_back(Elt: MIB1);
7770 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7771 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i32_indexed_OP1) {
7772 Opc = AArch64::FMLAv2i32_indexed;
7773 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7774 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7775 } else {
7776 Opc = AArch64::FMLAv2f32;
7777 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7778 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7779 }
7780 break;
7781 }
7782 case AArch64MachineCombinerPattern::FMLSv4f32_OP1:
7783 case AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1: {
7784 RC = &AArch64::FPR128RegClass;
7785 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7786 MachineInstrBuilder MIB1 =
7787 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv4f32), NewVR)
7788 .add(Root.getOperand(2));
7789 InsInstrs.push_back(Elt: MIB1);
7790 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7791 if (Pattern == AArch64MachineCombinerPattern::FMLSv4i32_indexed_OP1) {
7792 Opc = AArch64::FMLAv4i32_indexed;
7793 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7794 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7795 } else {
7796 Opc = AArch64::FMLAv4f32;
7797 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7798 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7799 }
7800 break;
7801 }
7802 case AArch64MachineCombinerPattern::FMLSv2f64_OP1:
7803 case AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1: {
7804 RC = &AArch64::FPR128RegClass;
7805 Register NewVR = MRI.createVirtualRegister(RegClass: RC);
7806 MachineInstrBuilder MIB1 =
7807 BuildMI(MF, MIMetadata(Root), TII->get(AArch64::FNEGv2f64), NewVR)
7808 .add(Root.getOperand(2));
7809 InsInstrs.push_back(Elt: MIB1);
7810 InstrIdxForVirtReg.insert(KV: std::make_pair(x&: NewVR, y: 0));
7811 if (Pattern == AArch64MachineCombinerPattern::FMLSv2i64_indexed_OP1) {
7812 Opc = AArch64::FMLAv2i64_indexed;
7813 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7814 kind: FMAInstKind::Indexed, ReplacedAddend: &NewVR);
7815 } else {
7816 Opc = AArch64::FMLAv2f64;
7817 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd: 1, MaddOpc: Opc, RC,
7818 kind: FMAInstKind::Accumulator, ReplacedAddend: &NewVR);
7819 }
7820 break;
7821 }
7822 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1:
7823 case AArch64MachineCombinerPattern::FMULv2i32_indexed_OP2: {
7824 unsigned IdxDupOp =
7825 (Pattern == AArch64MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1
7826 : 2;
7827 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
7828 &AArch64::FPR128RegClass, MRI);
7829 break;
7830 }
7831 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1:
7832 case AArch64MachineCombinerPattern::FMULv2i64_indexed_OP2: {
7833 unsigned IdxDupOp =
7834 (Pattern == AArch64MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1
7835 : 2;
7836 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
7837 &AArch64::FPR128RegClass, MRI);
7838 break;
7839 }
7840 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1:
7841 case AArch64MachineCombinerPattern::FMULv4i16_indexed_OP2: {
7842 unsigned IdxDupOp =
7843 (Pattern == AArch64MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1
7844 : 2;
7845 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
7846 &AArch64::FPR128_loRegClass, MRI);
7847 break;
7848 }
7849 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1:
7850 case AArch64MachineCombinerPattern::FMULv4i32_indexed_OP2: {
7851 unsigned IdxDupOp =
7852 (Pattern == AArch64MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1
7853 : 2;
7854 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
7855 &AArch64::FPR128RegClass, MRI);
7856 break;
7857 }
7858 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1:
7859 case AArch64MachineCombinerPattern::FMULv8i16_indexed_OP2: {
7860 unsigned IdxDupOp =
7861 (Pattern == AArch64MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1
7862 : 2;
7863 genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
7864 &AArch64::FPR128_loRegClass, MRI);
7865 break;
7866 }
7867 case AArch64MachineCombinerPattern::FNMADD: {
7868 MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs);
7869 break;
7870 }
7871
7872 } // end switch (Pattern)
7873 // Record MUL and ADD/SUB for deletion
7874 if (MUL)
7875 DelInstrs.push_back(Elt: MUL);
7876 DelInstrs.push_back(Elt: &Root);
7877
7878 // Set the flags on the inserted instructions to be the merged flags of the
7879 // instructions that we have combined.
7880 uint32_t Flags = Root.getFlags();
7881 if (MUL)
7882 Flags = Root.mergeFlagsWith(Other: *MUL);
7883 for (auto *MI : InsInstrs)
7884 MI->setFlags(Flags);
7885}
7886
7887/// Replace a csinc-branch sequence with a simple conditional branch
7888///
7889/// Examples:
7890/// 1. \code
7891/// csinc w9, wzr, wzr, <condition code>
7892/// tbnz w9, #0, 0x44
7893/// \endcode
7894/// to
7895/// \code
7896/// b.<inverted condition code>
7897/// \endcode
7898///
7899/// 2. \code
7900/// csinc w9, wzr, wzr, <condition code>
7901/// tbz w9, #0, 0x44
7902/// \endcode
7903/// to
7904/// \code
7905/// b.<condition code>
7906/// \endcode
7907///
7908/// Replace a compare-and-branch sequence with a TBZ/TBNZ instruction when the
7909/// compare's constant operand is a power of 2.
7910///
7911/// Examples:
7912/// \code
7913/// and w8, w8, #0x400
7914/// cbnz w8, L1
7915/// \endcode
7916/// to
7917/// \code
7918/// tbnz w8, #10, L1
7919/// \endcode
7920///
7921/// \param MI Conditional Branch
7922/// \return True when the simple conditional branch is generated
7923///
7924bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const {
7925 bool IsNegativeBranch = false;
7926 bool IsTestAndBranch = false;
7927 unsigned TargetBBInMI = 0;
7928 switch (MI.getOpcode()) {
7929 default:
7930 llvm_unreachable("Unknown branch instruction?");
7931 case AArch64::Bcc:
7932 return false;
7933 case AArch64::CBZW:
7934 case AArch64::CBZX:
7935 TargetBBInMI = 1;
7936 break;
7937 case AArch64::CBNZW:
7938 case AArch64::CBNZX:
7939 TargetBBInMI = 1;
7940 IsNegativeBranch = true;
7941 break;
7942 case AArch64::TBZW:
7943 case AArch64::TBZX:
7944 TargetBBInMI = 2;
7945 IsTestAndBranch = true;
7946 break;
7947 case AArch64::TBNZW:
7948 case AArch64::TBNZX:
7949 TargetBBInMI = 2;
7950 IsNegativeBranch = true;
7951 IsTestAndBranch = true;
7952 break;
7953 }
7954 // So we increment a zero register and test for bits other
7955 // than bit 0? Conservatively bail out in case the verifier
7956 // missed this case.
7957 if (IsTestAndBranch && MI.getOperand(i: 1).getImm())
7958 return false;
7959
7960 // Find Definition.
7961 assert(MI.getParent() && "Incomplete machine instruction\n");
7962 MachineBasicBlock *MBB = MI.getParent();
7963 MachineFunction *MF = MBB->getParent();
7964 MachineRegisterInfo *MRI = &MF->getRegInfo();
7965 Register VReg = MI.getOperand(i: 0).getReg();
7966 if (!VReg.isVirtual())
7967 return false;
7968
7969 MachineInstr *DefMI = MRI->getVRegDef(Reg: VReg);
7970
7971 // Look through COPY instructions to find definition.
7972 while (DefMI->isCopy()) {
7973 Register CopyVReg = DefMI->getOperand(i: 1).getReg();
7974 if (!MRI->hasOneNonDBGUse(RegNo: CopyVReg))
7975 return false;
7976 if (!MRI->hasOneDef(RegNo: CopyVReg))
7977 return false;
7978 DefMI = MRI->getVRegDef(Reg: CopyVReg);
7979 }
7980
7981 switch (DefMI->getOpcode()) {
7982 default:
7983 return false;
7984 // Fold AND into a TBZ/TBNZ if the constant operand is a power of 2.
7985 case AArch64::ANDWri:
7986 case AArch64::ANDXri: {
7987 if (IsTestAndBranch)
7988 return false;
7989 if (DefMI->getParent() != MBB)
7990 return false;
7991 if (!MRI->hasOneNonDBGUse(RegNo: VReg))
7992 return false;
7993
7994 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri);
7995 uint64_t Mask = AArch64_AM::decodeLogicalImmediate(
7996 val: DefMI->getOperand(i: 2).getImm(), regSize: Is32Bit ? 32 : 64);
7997 if (!isPowerOf2_64(Value: Mask))
7998 return false;
7999
8000 MachineOperand &MO = DefMI->getOperand(i: 1);
8001 Register NewReg = MO.getReg();
8002 if (!NewReg.isVirtual())
8003 return false;
8004
8005 assert(!MRI->def_empty(NewReg) && "Register must be defined.");
8006
8007 MachineBasicBlock &RefToMBB = *MBB;
8008 MachineBasicBlock *TBB = MI.getOperand(i: 1).getMBB();
8009 DebugLoc DL = MI.getDebugLoc();
8010 unsigned Imm = Log2_64(Value: Mask);
8011 unsigned Opc = (Imm < 32)
8012 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW)
8013 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX);
8014 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc))
8015 .addReg(NewReg)
8016 .addImm(Imm)
8017 .addMBB(TBB);
8018 // Register lives on to the TB(N)Z now.
8019 MO.setIsKill(false);
8020
8021 // For immediates smaller than 32, we need to use the 32-bit (W)
8022 // variant in all cases, because the 64-bit (X) variant cannot
8023 // encode them.
8024 // Therefore, if the input register is 64-bit, we need to take its
8025 // 32-bit sub-register.
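 // For illustration (registers are hypothetical), the overall AND + CB(N)Z
 // fold with a 64-bit input and a bit index below 32 ends up as a W-register
 // TB(N)Z:
 // \code
 //   and  x8, x9, #0x400
 //   cbnz x8, L1
 // \endcode
 // becomes
 // \code
 //   tbnz w9, #10, L1
 // \endcode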
8026 if (!Is32Bit && Imm < 32)
8027 NewMI->getOperand(0).setSubReg(AArch64::sub_32);
8028 MI.eraseFromParent();
8029 return true;
8030 }
8031 // Look for CSINC
8032 case AArch64::CSINCWr:
8033 case AArch64::CSINCXr: {
8034 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR &&
8035 DefMI->getOperand(2).getReg() == AArch64::WZR) &&
8036 !(DefMI->getOperand(1).getReg() == AArch64::XZR &&
8037 DefMI->getOperand(2).getReg() == AArch64::XZR))
8038 return false;
8039
8040 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr,
8041 true) != -1)
8042 return false;
8043
8044 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(i: 3).getImm();
8045 // Convert only when the condition code is not modified between
8046 // the CSINC and the branch. The CC may be used by other
8047 // instructions in between.
8048 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write))
8049 return false;
8050 MachineBasicBlock &RefToMBB = *MBB;
8051 MachineBasicBlock *TBB = MI.getOperand(i: TargetBBInMI).getMBB();
8052 DebugLoc DL = MI.getDebugLoc();
8053 if (IsNegativeBranch)
8054 CC = AArch64CC::getInvertedCondCode(Code: CC);
8055 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
8056 MI.eraseFromParent();
8057 return true;
8058 }
8059 }
8060}
8061
8062std::pair<unsigned, unsigned>
8063AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
8064 const unsigned Mask = AArch64II::MO_FRAGMENT;
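 // For example, a flag value of (MO_PAGEOFF | MO_NC) decomposes into the
 // direct part MO_PAGEOFF and the bitmask part MO_NC.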
8065 return std::make_pair(x: TF & Mask, y: TF & ~Mask);
8066}
8067
8068ArrayRef<std::pair<unsigned, const char *>>
8069AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
8070 using namespace AArch64II;
8071
8072 static const std::pair<unsigned, const char *> TargetFlags[] = {
8073 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"},
8074 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"},
8075 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"},
8076 {MO_HI12, "aarch64-hi12"}};
8077 return ArrayRef(TargetFlags);
8078}
8079
8080ArrayRef<std::pair<unsigned, const char *>>
8081AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
8082 using namespace AArch64II;
8083
8084 static const std::pair<unsigned, const char *> TargetFlags[] = {
8085 {MO_COFFSTUB, "aarch64-coffstub"},
8086 {MO_GOT, "aarch64-got"},
8087 {MO_NC, "aarch64-nc"},
8088 {MO_S, "aarch64-s"},
8089 {MO_TLS, "aarch64-tls"},
8090 {MO_DLLIMPORT, "aarch64-dllimport"},
8091 {MO_PREL, "aarch64-prel"},
8092 {MO_TAGGED, "aarch64-tagged"},
8093 {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"},
8094 };
8095 return ArrayRef(TargetFlags);
8096}
8097
8098ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
8099AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
8100 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8101 {{MOSuppressPair, "aarch64-suppress-pair"},
8102 {MOStridedAccess, "aarch64-strided-access"}};
8103 return ArrayRef(TargetFlags);
8104}
8105
8106/// Constants defining how certain sequences should be outlined.
8107/// This encompasses how an outlined function should be called, and what kind of
8108/// frame should be emitted for that outlined function.
8109///
8110/// \p MachineOutlinerDefault implies that the function should be called with
8111/// a save and restore of LR to the stack.
8112///
8113/// That is,
8114///
8115/// I1 Save LR OUTLINED_FUNCTION:
8116/// I2 --> BL OUTLINED_FUNCTION I1
8117/// I3 Restore LR I2
8118/// I3
8119/// RET
8120///
8121/// * Call construction overhead: 3 (save + BL + restore)
8122/// * Frame construction overhead: 1 (ret)
8123/// * Requires stack fixups? Yes
8124///
8125/// \p MachineOutlinerTailCall implies that the function is being created from
8126/// a sequence of instructions ending in a return.
8127///
8128/// That is,
8129///
8130/// I1 OUTLINED_FUNCTION:
8131/// I2 --> B OUTLINED_FUNCTION I1
8132/// RET I2
8133/// RET
8134///
8135/// * Call construction overhead: 1 (B)
8136/// * Frame construction overhead: 0 (Return included in sequence)
8137/// * Requires stack fixups? No
8138///
8139/// \p MachineOutlinerNoLRSave implies that the function should be called using
8140/// a BL instruction, but doesn't require LR to be saved and restored. This
8141/// happens when LR is known to be dead.
8142///
8143/// That is,
8144///
8145/// I1 OUTLINED_FUNCTION:
8146/// I2 --> BL OUTLINED_FUNCTION I1
8147/// I3 I2
8148/// I3
8149/// RET
8150///
8151/// * Call construction overhead: 1 (BL)
8152/// * Frame construction overhead: 1 (RET)
8153/// * Requires stack fixups? No
8154///
8155/// \p MachineOutlinerThunk implies that the function is being created from
8156/// a sequence of instructions ending in a call. The outlined function is
8157/// called with a BL instruction, and the outlined function tail-calls the
8158/// original call destination.
8159///
8160/// That is,
8161///
8162/// I1 OUTLINED_FUNCTION:
8163/// I2 --> BL OUTLINED_FUNCTION I1
8164/// BL f I2
8165/// B f
8166/// * Call construction overhead: 1 (BL)
8167/// * Frame construction overhead: 0
8168/// * Requires stack fixups? No
8169///
8170/// \p MachineOutlinerRegSave implies that the function should be called with a
8171/// save and restore of LR to an available register. This allows us to avoid
8172/// stack fixups. Note that this outlining variant is compatible with the
8173/// NoLRSave case.
8174///
8175/// That is,
8176///
8177/// I1 Save LR OUTLINED_FUNCTION:
8178/// I2 --> BL OUTLINED_FUNCTION I1
8179/// I3 Restore LR I2
8180/// I3
8181/// RET
8182///
8183/// * Call construction overhead: 3 (save + BL + restore)
8184/// * Frame construction overhead: 1 (ret)
8185/// * Requires stack fixups? No
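///
/// For instance (illustrative only; the actual register is chosen by
/// findRegisterToSaveLRTo), a RegSave call site might look like
/// \code
///   mov x9, lr
///   bl  OUTLINED_FUNCTION
///   mov lr, x9
/// \endcode
/// assuming x9 happens to be free across the sequence.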
8186enum MachineOutlinerClass {
8187 MachineOutlinerDefault, /// Emit a save, restore, call, and return.
8188 MachineOutlinerTailCall, /// Only emit a branch.
8189 MachineOutlinerNoLRSave, /// Emit a call and return.
8190 MachineOutlinerThunk, /// Emit a call and tail-call.
8191 MachineOutlinerRegSave /// Same as default, but save to a register.
8192};
8193
8194enum MachineOutlinerMBBFlags {
8195 LRUnavailableSomewhere = 0x2,
8196 HasCalls = 0x4,
8197 UnsafeRegsDead = 0x8
8198};
8199
8200Register
8201AArch64InstrInfo::findRegisterToSaveLRTo(outliner::Candidate &C) const {
8202 MachineFunction *MF = C.getMF();
8203 const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
8204 const AArch64RegisterInfo *ARI =
8205 static_cast<const AArch64RegisterInfo *>(&TRI);
8206 // Check if there is an available register across the sequence that we can
8207 // use.
8208 for (unsigned Reg : AArch64::GPR64RegClass) {
8209 if (!ARI->isReservedReg(*MF, Reg) &&
8210 Reg != AArch64::LR && // LR is not reserved, but don't use it.
8211 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved.
8212 Reg != AArch64::X17 && // Ditto for X17.
8213 C.isAvailableAcrossAndOutOfSeq(Reg, TRI) &&
8214 C.isAvailableInsideSeq(Reg, TRI))
8215 return Reg;
8216 }
8217 return Register();
8218}
8219
8220static bool
8221outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
8222 const outliner::Candidate &b) {
8223 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8224 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8225
8226 return MFIa->shouldSignReturnAddress(SpillsLR: false) == MFIb->shouldSignReturnAddress(SpillsLR: false) &&
8227 MFIa->shouldSignReturnAddress(SpillsLR: true) == MFIb->shouldSignReturnAddress(SpillsLR: true);
8228}
8229
8230static bool
8231outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
8232 const outliner::Candidate &b) {
8233 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>();
8234 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>();
8235
8236 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey();
8237}
8238
8239static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
8240 const outliner::Candidate &b) {
8241 const AArch64Subtarget &SubtargetA =
8242 a.getMF()->getSubtarget<AArch64Subtarget>();
8243 const AArch64Subtarget &SubtargetB =
8244 b.getMF()->getSubtarget<AArch64Subtarget>();
8245 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
8246}
8247
8248std::optional<outliner::OutlinedFunction>
8249AArch64InstrInfo::getOutliningCandidateInfo(
8250 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
8251 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
8252
8253 unsigned SequenceSize = 0;
8254 for (auto &MI : FirstCand)
8255 SequenceSize += getInstSizeInBytes(MI);
8256
8257 unsigned NumBytesToCreateFrame = 0;
8258
8259 // We only allow outlining for functions having exactly matching return
8260 // address signing attributes, i.e., all share the same value for the
8261 // attribute "sign-return-address" and all share the same type of key they
8262 // are signed with.
8263 // Additionally, we require that either all functions support v8.3a features
8264 // or none do. Otherwise an outlined function could get signed using
8265 // dedicated v8.3 instructions, and a call from a function that doesn't
8266 // support v8.3 instructions would therefore be invalid.
8267 if (std::adjacent_find(
8268 first: RepeatedSequenceLocs.begin(), last: RepeatedSequenceLocs.end(),
8269 binary_pred: [](const outliner::Candidate &a, const outliner::Candidate &b) {
8270 // Return true if a and b are non-equal w.r.t. return address
8271 // signing or support of v8.3a features
8272 if (outliningCandidatesSigningScopeConsensus(a, b) &&
8273 outliningCandidatesSigningKeyConsensus(a, b) &&
8274 outliningCandidatesV8_3OpsConsensus(a, b)) {
8275 return false;
8276 }
8277 return true;
8278 }) != RepeatedSequenceLocs.end()) {
8279 return std::nullopt;
8280 }
8281
8282 // Since at this point all candidates agree on their return address signing,
8283 // picking just one is fine. If the candidate functions potentially sign their
8284 // return addresses, the outlined function should do the same. Note that in
8285 // the case of "sign-return-address"="non-leaf" this is an assumption: It is
8286 // not certainly true that the outlined function will have to sign its return
8287 // address but this decision is made later, when the decision to outline
8288 // has already been made.
8289 // The same holds for the number of additional instructions we need: On
8290 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
8291 // necessary. However, at this point we don't know if the outlined function
8292 // will have a RET instruction so we assume the worst.
8293 const TargetRegisterInfo &TRI = getRegisterInfo();
8294 // Performing a tail call may require extra checks when PAuth is enabled.
8295 // If PAuth is disabled, set it to zero for uniformity.
8296 unsigned NumBytesToCheckLRInTCEpilogue = 0;
8297 if (FirstCand.getMF()
8298 ->getInfo<AArch64FunctionInfo>()
8299 ->shouldSignReturnAddress(SpillsLR: true)) {
8300 // One PAC and one AUT instructions
8301 NumBytesToCreateFrame += 8;
8302
8303 // PAuth is enabled - set extra tail call cost, if any.
8304 auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
8305 NumBytesToCheckLRInTCEpilogue =
8306 AArch64PAuth::getCheckerSizeInBytes(Method: LRCheckMethod);
8307 // Checking the authenticated LR value may significantly impact
8308 // SequenceSize, so account for it for more precise results.
8309 if (isTailCallReturnInst(MI: RepeatedSequenceLocs[0].back()))
8310 SequenceSize += NumBytesToCheckLRInTCEpilogue;
8311
8312 // We have to check whether sp-modifying instructions would get outlined.
8313 // If so, we only allow outlining if sp is unchanged overall; matching
8314 // sub and add instructions are okay to outline, but all other sp
8315 // modifications are not.
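 // For example (hypothetical sequence), outlining
 // \code
 //   sub sp, sp, #16
 //   ...
 //   add sp, sp, #16
 // \endcode
 // is acceptable because sp is unchanged overall, whereas a lone
 // "add sp, sp, #16" in the candidate would be rejected.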
8316 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
8317 int SPValue = 0;
8318 for (auto &MI : C) {
8319 if (MI.modifiesRegister(AArch64::SP, &TRI)) {
8320 switch (MI.getOpcode()) {
8321 case AArch64::ADDXri:
8322 case AArch64::ADDWri:
8323 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8324 assert(MI.getOperand(2).isImm() &&
8325 "Expected operand to be immediate");
8326 assert(MI.getOperand(1).isReg() &&
8327 "Expected operand to be a register");
8328 // Check if the add just increments sp. If so, we search for
8329 // matching sub instructions that decrement sp. If not, the
8330 // modification is illegal
8331 if (MI.getOperand(1).getReg() == AArch64::SP)
8332 SPValue += MI.getOperand(i: 2).getImm();
8333 else
8334 return true;
8335 break;
8336 case AArch64::SUBXri:
8337 case AArch64::SUBWri:
8338 assert(MI.getNumOperands() == 4 && "Wrong number of operands");
8339 assert(MI.getOperand(2).isImm() &&
8340 "Expected operand to be immediate");
8341 assert(MI.getOperand(1).isReg() &&
8342 "Expected operand to be a register");
8343 // Check if the sub just decrements sp. If so, we search for
8344 // matching add instructions that increment sp. If not, the
8345 // modification is illegal
8346 if (MI.getOperand(1).getReg() == AArch64::SP)
8347 SPValue -= MI.getOperand(i: 2).getImm();
8348 else
8349 return true;
8350 break;
8351 default:
8352 return true;
8353 }
8354 }
8355 }
8356 if (SPValue)
8357 return true;
8358 return false;
8359 };
8360 // Remove candidates with illegal stack modifying instructions
8361 llvm::erase_if(C&: RepeatedSequenceLocs, P: hasIllegalSPModification);
8362
8363 // If the sequence doesn't have enough candidates left, then we're done.
8364 if (RepeatedSequenceLocs.size() < 2)
8365 return std::nullopt;
8366 }
8367
8368 // Properties about candidate MBBs that hold for all of them.
8369 unsigned FlagsSetInAll = 0xF;
8370
8371 // Compute liveness information for each candidate, and set FlagsSetInAll.
8372 for (outliner::Candidate &C : RepeatedSequenceLocs)
8373 FlagsSetInAll &= C.Flags;
8374
8375 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode();
8376
8377 // Helper lambda which sets call information for every candidate.
8378 auto SetCandidateCallInfo =
8379 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
8380 for (outliner::Candidate &C : RepeatedSequenceLocs)
8381 C.setCallInfo(CID: CallID, CO: NumBytesForCall);
8382 };
8383
8384 unsigned FrameID = MachineOutlinerDefault;
8385 NumBytesToCreateFrame += 4;
8386
8387 bool HasBTI = any_of(Range&: RepeatedSequenceLocs, P: [](outliner::Candidate &C) {
8388 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement();
8389 });
8390
8391 // We check to see if CFI Instructions are present, and if they are
8392 // we find the number of CFI Instructions in the candidates.
8393 unsigned CFICount = 0;
8394 for (auto &I : RepeatedSequenceLocs[0]) {
8395 if (I.isCFIInstruction())
8396 CFICount++;
8397 }
8398
8399 // We compare the number of found CFI Instructions to the number of CFI
8400 // instructions in the parent function for each candidate. We must check this
8401 // since if we outline one of the CFI instructions in a function, we have to
8402 // outline them all for correctness. If we do not, the address offsets will be
8403 // incorrect between the two sections of the program.
8404 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8405 std::vector<MCCFIInstruction> CFIInstructions =
8406 C.getMF()->getFrameInstructions();
8407
8408 if (CFICount > 0 && CFICount != CFIInstructions.size())
8409 return std::nullopt;
8410 }
8411
8412 // Returns true if an instruction is safe to fix up, false otherwise.
8413 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
8414 if (MI.isCall())
8415 return true;
8416
8417 if (!MI.modifiesRegister(AArch64::SP, &TRI) &&
8418 !MI.readsRegister(AArch64::SP, &TRI))
8419 return true;
8420
8421 // Any modification of SP will break our code to save/restore LR.
8422 // FIXME: We could handle some instructions which add a constant
8423 // offset to SP, with a bit more work.
8424 if (MI.modifiesRegister(AArch64::SP, &TRI))
8425 return false;
8426
8427 // At this point, we have a stack instruction that we might need to
8428 // fix up. We'll handle it if it's a load or store.
8429 if (MI.mayLoadOrStore()) {
8430 const MachineOperand *Base; // Filled with the base operand of MI.
8431 int64_t Offset; // Filled with the offset of MI.
8432 bool OffsetIsScalable;
8433
8434 // Does it allow us to offset the base operand and is the base the
8435 // register SP?
8436 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
8437 !Base->isReg() || Base->getReg() != AArch64::SP)
8438 return false;
8439
8440 // The fix-up code below assumes byte offsets.
8441 if (OffsetIsScalable)
8442 return false;
8443
8444 // Find the minimum/maximum offset for this instruction and check
8445 // if fixing it up would be in range.
8446 int64_t MinOffset,
8447 MaxOffset; // Unscaled offsets for the instruction.
8448 // The scale to multiply the offsets by.
8449 TypeSize Scale(0U, false), DummyWidth(0U, false);
8450 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width&: DummyWidth, MinOffset, MaxOffset);
8451
8452 Offset += 16; // Update the offset to what it would be if we outlined.
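 // For example (hypothetical instruction), an "ldr x0, [sp, #8]" in the
 // candidate becomes "ldr x0, [sp, #24]" once LR has been spilled in the
 // outlined function, so the adjusted offset must still be encodable for
 // the load/store opcode.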
8453 if (Offset < MinOffset * (int64_t)Scale.getFixedValue() ||
8454 Offset > MaxOffset * (int64_t)Scale.getFixedValue())
8455 return false;
8456
8457 // It's in range, so we can outline it.
8458 return true;
8459 }
8460
8461 // FIXME: Add handling for instructions like "add x0, sp, #8".
8462
8463 // We can't fix it up, so don't outline it.
8464 return false;
8465 };
8466
8467 // True if it's possible to fix up each stack instruction in this sequence.
8468 // Important for frames/call variants that modify the stack.
8469 bool AllStackInstrsSafe = llvm::all_of(Range&: FirstCand, P: IsSafeToFixup);
8470
8471 // If the last instruction in any candidate is a terminator, then we should
8472 // tail call all of the candidates.
8473 if (RepeatedSequenceLocs[0].back().isTerminator()) {
8474 FrameID = MachineOutlinerTailCall;
8475 NumBytesToCreateFrame = 0;
8476 unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
8477 SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
8478 }
8479
8480 else if (LastInstrOpcode == AArch64::BL ||
8481 ((LastInstrOpcode == AArch64::BLR ||
8482 LastInstrOpcode == AArch64::BLRNoIP) &&
8483 !HasBTI)) {
8484 // FIXME: Do we need to check if the code after this uses the value of LR?
8485 FrameID = MachineOutlinerThunk;
8486 NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
8487 SetCandidateCallInfo(MachineOutlinerThunk, 4);
8488 }
8489
8490 else {
8491 // We need to decide how to emit calls + frames. We can always emit the same
8492 // frame if we don't need to save to the stack. If we have to save to the
8493 // stack, then we need a different frame.
8494 unsigned NumBytesNoStackCalls = 0;
8495 std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
8496
8497 // Check if we have to save LR.
8498 for (outliner::Candidate &C : RepeatedSequenceLocs) {
8499 bool LRAvailable =
8500 (C.Flags & MachineOutlinerMBBFlags::LRUnavailableSomewhere)
8501 ? C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI)
8502 : true;
8503 // If we have a noreturn caller, then we're going to be conservative and
8504 // say that we have to save LR. If we don't have a ret at the end of the
8505 // block, then we can't reason about liveness accurately.
8506 //
8507 // FIXME: We can probably do better than always disabling this in
8508 // noreturn functions by fixing up the liveness info.
8509 bool IsNoReturn =
8510 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
8511
8512 // Is LR available? If so, we don't need a save.
8513 if (LRAvailable && !IsNoReturn) {
8514 NumBytesNoStackCalls += 4;
8515 C.setCallInfo(CID: MachineOutlinerNoLRSave, CO: 4);
8516 CandidatesWithoutStackFixups.push_back(x: C);
8517 }
8518
8519 // Is an unused register available? If so, we won't modify the stack, so
8520 // we can outline with the same frame type as those that don't save LR.
8521 else if (findRegisterToSaveLRTo(C)) {
8522 NumBytesNoStackCalls += 12;
8523 C.setCallInfo(CID: MachineOutlinerRegSave, CO: 12);
8524 CandidatesWithoutStackFixups.push_back(x: C);
8525 }
8526
8527 // Is SP used in the sequence at all? If not, we don't have to modify
8528 // the stack, so we are guaranteed to get the same frame.
8529 else if (C.isAvailableInsideSeq(AArch64::SP, TRI)) {
8530 NumBytesNoStackCalls += 12;
8531 C.setCallInfo(CID: MachineOutlinerDefault, CO: 12);
8532 CandidatesWithoutStackFixups.push_back(x: C);
8533 }
8534
8535 // If we outline this, we need to modify the stack. Pretend we don't
8536 // outline this by saving all of its bytes.
8537 else {
8538 NumBytesNoStackCalls += SequenceSize;
8539 }
8540 }
8541
8542 // If there are no places where we have to save LR, then note that we
8543 // don't have to update the stack. Otherwise, give every candidate the
8544 // default call type, as long as it's safe to do so.
8545 if (!AllStackInstrsSafe ||
8546 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) {
8547 RepeatedSequenceLocs = CandidatesWithoutStackFixups;
8548 FrameID = MachineOutlinerNoLRSave;
8549 } else {
8550 SetCandidateCallInfo(MachineOutlinerDefault, 12);
8551
8552 // Bugzilla ID: 46767
8553 // TODO: Check if fixing up the stack more than once is safe so we can
8554 // outline these.
8555 //
8556 // An outline resulting in a caller that requires stack fixups at the
8557 // callsite to a callee that also requires stack fixups can happen when
8558 // there are no available registers at the candidate callsite for a
8559 // candidate that itself also has calls.
8560 //
8561 // In other words, if function_containing_sequence in the following pseudo
8562 // assembly requires that we save LR at the point of the call, but there
8563 // are no available registers, then we save using SP, and as a
8564 // result the SP offsets require stack fixups by multiples of 16.
8565 //
8566 // function_containing_sequence:
8567 // ...
8568 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8569 // call OUTLINED_FUNCTION_N
8570 // restore LR from SP
8571 // ...
8572 //
8573 // OUTLINED_FUNCTION_N:
8574 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N
8575 // ...
8576 // bl foo
8577 // restore LR from SP
8578 // ret
8579 //
8580 // Because the code to handle more than one stack fixup does not
8581 // currently have the proper checks for legality, these cases will assert
8582 // in the AArch64 MachineOutliner. This is because the code to do this
8583 // needs more hardening, testing, and better checks that the generated code
8584 // is legal, etc., and because it is only verified to handle a single pass
8585 // of stack fixup.
8586 //
8587 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch
8588 // these cases until they are known to be handled. Bugzilla 46767 is
8589 // referenced in comments at the assert site.
8590 //
8591 // To avoid asserting (or generating non-legal code on noassert builds)
8592 // we remove all candidates which would need more than one stack fixup by
8593 // pruning the cases where the candidate has calls while also having no
8594 // available LR and no available general-purpose register to copy
8595 // LR to (i.e. one extra stack save/restore).
8596 //
8597 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8598 erase_if(C&: RepeatedSequenceLocs, P: [this, &TRI](outliner::Candidate &C) {
8599 auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); };
8600 return (llvm::any_of(C, IsCall)) &&
8601 (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) ||
8602 !findRegisterToSaveLRTo(C));
8603 });
8604 }
8605 }
8606
8607 // If we dropped all of the candidates, bail out here.
8608 if (RepeatedSequenceLocs.size() < 2) {
8609 RepeatedSequenceLocs.clear();
8610 return std::nullopt;
8611 }
8612 }
8613
8614 // Does every candidate's MBB contain a call? If so, then we might have a call
8615 // in the range.
8616 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) {
8617 // Check if the range contains a call. These require a save + restore of the
8618 // link register.
8619 bool ModStackToSaveLR = false;
8620 if (std::any_of(first: FirstCand.begin(), last: std::prev(x: FirstCand.end()),
8621 pred: [](const MachineInstr &MI) { return MI.isCall(); }))
8622 ModStackToSaveLR = true;
8623
8624 // Handle the last instruction separately. If this is a tail call, then the
8625 // last instruction is a call. We don't want to save + restore in this case.
8626 // However, it is possible that the last instruction is a call without it
8627 // being valid to tail call this sequence. We should consider this case as
8628 // well.
8629 else if (FrameID != MachineOutlinerThunk &&
8630 FrameID != MachineOutlinerTailCall && FirstCand.back().isCall())
8631 ModStackToSaveLR = true;
8632
8633 if (ModStackToSaveLR) {
8634 // We can't fix up the stack. Bail out.
8635 if (!AllStackInstrsSafe) {
8636 RepeatedSequenceLocs.clear();
8637 return std::nullopt;
8638 }
8639
8640 // Save + restore LR.
8641 NumBytesToCreateFrame += 8;
8642 }
8643 }
8644
8645 // If we have CFI instructions, we can only outline if the outlined section
8646 // can be a tail call
8647 if (FrameID != MachineOutlinerTailCall && CFICount > 0)
8648 return std::nullopt;
8649
8650 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
8651 NumBytesToCreateFrame, FrameID);
8652}
8653
8654void AArch64InstrInfo::mergeOutliningCandidateAttributes(
8655 Function &F, std::vector<outliner::Candidate> &Candidates) const {
8656 // If a bunch of candidates reach this point, they must agree on their return
8657 // address signing. It is therefore enough to just consider the signing
8658 // behaviour of one of them.
8659 const auto &CFn = Candidates.front().getMF()->getFunction();
8660
8661 // Since all candidates belong to the same module, just copy the
8662 // function-level attributes of an arbitrary function.
8663 if (CFn.hasFnAttribute(Kind: "sign-return-address"))
8664 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address"));
8665 if (CFn.hasFnAttribute(Kind: "sign-return-address-key"))
8666 F.addFnAttr(Attr: CFn.getFnAttribute(Kind: "sign-return-address-key"));
8667
8668 AArch64GenInstrInfo::mergeOutliningCandidateAttributes(F, Candidates);
8669}
8670
8671bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
8672 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
8673 const Function &F = MF.getFunction();
8674
8675 // Can F be deduplicated by the linker? If it can, don't outline from it.
8676 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
8677 return false;
8678
8679 // Don't outline from functions with section markings; the program could
8680 // expect that all the code is in the named section.
8681 // FIXME: Allow outlining from multiple functions with the same section
8682 // marking.
8683 if (F.hasSection())
8684 return false;
8685
8686 // Outlining from functions with redzones is unsafe since the outliner may
8687 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't
8688 // outline from it.
8689 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
8690 if (!AFI || AFI->hasRedZone().value_or(u: true))
8691 return false;
8692
8693 // FIXME: Teach the outliner to generate/handle Windows unwind info.
8694 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
8695 return false;
8696
8697 // It's safe to outline from MF.
8698 return true;
8699}
8700
8701SmallVector<std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8702AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB,
8703 unsigned &Flags) const {
8704 assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
8705 "Must track liveness!");
8706 SmallVector<
8707 std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
8708 Ranges;
8709 // According to the AArch64 Procedure Call Standard, the following are
8710 // undefined on entry/exit from a function call:
8711 //
8712 // * Registers x16, x17, (and thus w16, w17)
8713 // * Condition codes (and thus the NZCV register)
8714 //
8715 // If any of these registers are used inside or live across an outlined
8716 // function, then they may be modified later, either by the compiler or
8717 // some other tool (like the linker).
8718 //
8719 // To avoid outlining in these situations, partition each block into ranges
8720 // where these registers are dead. We will only outline from those ranges.
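 // For example (illustrative), if a cmp/csel pair keeps NZCV live across a
 // few instructions in the middle of a block, those instructions are excluded
 // and we only consider outlining from the ranges before and after them.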
8721 LiveRegUnits LRU(getRegisterInfo());
8722 auto AreAllUnsafeRegsDead = [&LRU]() {
8723 return LRU.available(AArch64::W16) && LRU.available(AArch64::W17) &&
8724 LRU.available(AArch64::NZCV);
8725 };
8726
8727 // We need to know if LR is live across an outlining boundary later on in
8728 // order to decide how we'll create the outlined call, frame, etc.
8729 //
8730 // It's pretty expensive to check this for *every candidate* within a block.
8731 // That's some potentially n^2 behaviour, since in the worst case, we'd need
8732 // to compute liveness from the end of the block for O(n) candidates within
8733 // the block.
8734 //
8735 // So, to improve the average case, let's keep track of liveness from the end
8736 // of the block to the beginning of *every outlinable range*. If we know that
8737 // LR is available in every range we could outline from, then we know that
8738 // we don't need to check liveness for any candidate within that range.
8739 bool LRAvailableEverywhere = true;
8740 // Compute liveness bottom-up.
8741 LRU.addLiveOuts(MBB);
8742 // Update flags that require info about the entire MBB.
8743 auto UpdateWholeMBBFlags = [&Flags](const MachineInstr &MI) {
8744 if (MI.isCall() && !MI.isTerminator())
8745 Flags |= MachineOutlinerMBBFlags::HasCalls;
8746 };
8747 // Range: [RangeBegin, RangeEnd)
8748 MachineBasicBlock::instr_iterator RangeBegin, RangeEnd;
8749 unsigned RangeLen;
8750 auto CreateNewRangeStartingAt =
8751 [&RangeBegin, &RangeEnd,
8752 &RangeLen](MachineBasicBlock::instr_iterator NewBegin) {
8753 RangeBegin = NewBegin;
8754 RangeEnd = std::next(x: RangeBegin);
8755 RangeLen = 0;
8756 };
8757 auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() {
8758 // At least one unsafe register is not dead. We do not want to outline at
8759 // this point. If it is long enough to outline from, save the range
8760 // [RangeBegin, RangeEnd).
8761 if (RangeLen > 1)
8762 Ranges.push_back(Elt: std::make_pair(x&: RangeBegin, y&: RangeEnd));
8763 };
8764 // Find the first point where all unsafe registers are dead.
8765 // FIND: <safe instr> <-- end of first potential range
8766 // SKIP: <unsafe def>
8767 // SKIP: ... everything between ...
8768 // SKIP: <unsafe use>
8769 auto FirstPossibleEndPt = MBB.instr_rbegin();
8770 for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) {
8771 LRU.stepBackward(MI: *FirstPossibleEndPt);
8772 // Update flags that impact how we outline across the entire block,
8773 // regardless of safety.
8774 UpdateWholeMBBFlags(*FirstPossibleEndPt);
8775 if (AreAllUnsafeRegsDead())
8776 break;
8777 }
8778 // If we exhausted the entire block, we have no safe ranges to outline.
8779 if (FirstPossibleEndPt == MBB.instr_rend())
8780 return Ranges;
8781 // Current range.
8782 CreateNewRangeStartingAt(FirstPossibleEndPt->getIterator());
8783 // FirstPossibleEndPt points to the first place where all unsafe registers
8784 // are dead (if there is any such point). Begin partitioning the MBB into
8785 // ranges.
8786 for (auto &MI : make_range(x: FirstPossibleEndPt, y: MBB.instr_rend())) {
8787 LRU.stepBackward(MI);
8788 UpdateWholeMBBFlags(MI);
8789 if (!AreAllUnsafeRegsDead()) {
8790 SaveRangeIfNonEmpty();
8791 CreateNewRangeStartingAt(MI.getIterator());
8792 continue;
8793 }
8794 LRAvailableEverywhere &= LRU.available(AArch64::LR);
8795 RangeBegin = MI.getIterator();
8796 ++RangeLen;
8797 }
8798 // Above loop misses the last (or only) range. If we are still safe, then
8799 // let's save the range.
8800 if (AreAllUnsafeRegsDead())
8801 SaveRangeIfNonEmpty();
8802 if (Ranges.empty())
8803 return Ranges;
8804 // We found the ranges bottom-up, but the mapping expects them top-down.
8805 // Reverse the order.
8806 std::reverse(first: Ranges.begin(), last: Ranges.end());
8807 // If there is at least one outlinable range where LR is unavailable
8808 // somewhere, remember that.
8809 if (!LRAvailableEverywhere)
8810 Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
8811 return Ranges;
8812}
8813
8814outliner::InstrType
8815AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
8816 unsigned Flags) const {
8817 MachineInstr &MI = *MIT;
8818 MachineBasicBlock *MBB = MI.getParent();
8819 MachineFunction *MF = MBB->getParent();
8820 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
8821
8822 // Don't outline anything used for return address signing. The outlined
8823 // function will get signed later if needed
8824 switch (MI.getOpcode()) {
8825 case AArch64::PACM:
8826 case AArch64::PACIASP:
8827 case AArch64::PACIBSP:
8828 case AArch64::PACIASPPC:
8829 case AArch64::PACIBSPPC:
8830 case AArch64::AUTIASP:
8831 case AArch64::AUTIBSP:
8832 case AArch64::AUTIASPPCi:
8833 case AArch64::AUTIASPPCr:
8834 case AArch64::AUTIBSPPCi:
8835 case AArch64::AUTIBSPPCr:
8836 case AArch64::RETAA:
8837 case AArch64::RETAB:
8838 case AArch64::RETAASPPCi:
8839 case AArch64::RETAASPPCr:
8840 case AArch64::RETABSPPCi:
8841 case AArch64::RETABSPPCr:
8842 case AArch64::EMITBKEY:
8843 case AArch64::PAUTH_PROLOGUE:
8844 case AArch64::PAUTH_EPILOGUE:
8845 return outliner::InstrType::Illegal;
8846 }
8847
8848 // Don't outline LOHs.
8849 if (FuncInfo->getLOHRelated().count(Ptr: &MI))
8850 return outliner::InstrType::Illegal;
8851
8852 // We can only outline these if we will tail call the outlined function, or
8853 // fix up the CFI offsets. Currently, CFI instructions are outlined only if
8854 // they appear in a tail call.
8855 //
8856 // FIXME: If the proper fixups for the offset are implemented, this should be
8857 // possible.
8858 if (MI.isCFIInstruction())
8859 return outliner::InstrType::Legal;
8860
8861 // Is this a terminator for a basic block?
8862 if (MI.isTerminator())
8863 // TargetInstrInfo::getOutliningType has already filtered out anything
8864 // that would break this, so we can allow it here.
8865 return outliner::InstrType::Legal;
8866
8867 // Make sure none of the operands are un-outlinable.
8868 for (const MachineOperand &MOP : MI.operands()) {
8869 // A check preventing CFI indices was here before, but only CFI
8870 // instructions should have those.
8871 assert(!MOP.isCFIIndex());
8872
8873 // If it uses LR or W30 explicitly, then don't touch it.
8874 if (MOP.isReg() && !MOP.isImplicit() &&
8875 (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
8876 return outliner::InstrType::Illegal;
8877 }
8878
8879 // Special cases for instructions that can always be outlined, but will fail
8880 // the later tests, e.g. ADRPs, which are PC-relative and use LR, but can
8881 // always be outlined because they don't require a *specific* value to be in LR.
8882 if (MI.getOpcode() == AArch64::ADRP)
8883 return outliner::InstrType::Legal;
8884
8885 // If MI is a call we might be able to outline it. We don't want to outline
8886 // any calls that rely on the position of items on the stack. When we outline
8887 // something containing a call, we have to emit a save and restore of LR in
8888 // the outlined function. Currently, this always happens by saving LR to the
8889 // stack. Thus, if we outline, say, half the parameters for a function call
8890 // plus the call, then we'll break the callee's expectations for the layout
8891 // of the stack.
8892 //
8893 // FIXME: Allow calls to functions which construct a stack frame, as long
8894 // as they don't access arguments on the stack.
8895 // FIXME: Figure out some way to analyze functions defined in other modules.
8896 // We should be able to compute the memory usage based on the IR calling
8897 // convention, even if we can't see the definition.
8898 if (MI.isCall()) {
8899 // Get the function associated with the call. Look at each operand and find
8900 // the one that represents the callee and get its name.
8901 const Function *Callee = nullptr;
8902 for (const MachineOperand &MOP : MI.operands()) {
8903 if (MOP.isGlobal()) {
8904 Callee = dyn_cast<Function>(Val: MOP.getGlobal());
8905 break;
8906 }
8907 }
8908
8909 // Never outline calls to mcount. There isn't any rule that would require
8910 // this, but the Linux kernel's "ftrace" feature depends on it.
8911 if (Callee && Callee->getName() == "\01_mcount")
8912 return outliner::InstrType::Illegal;
8913
8914 // If we don't know anything about the callee, assume it depends on the
8915 // stack layout of the caller. In that case, it's only legal to outline
8916 // as a tail-call. Explicitly list the call instructions we know about so we
8917 // don't get unexpected results with call pseudo-instructions.
8918 auto UnknownCallOutlineType = outliner::InstrType::Illegal;
8919 if (MI.getOpcode() == AArch64::BLR ||
8920 MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
8921 UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
8922
8923 if (!Callee)
8924 return UnknownCallOutlineType;
8925
8926 // We have a function we have information about. Check whether it's
8927 // something we can safely outline.
8928 MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(F: *Callee);
8929
8930 // We don't know what's going on with the callee at all. Don't touch it.
8931 if (!CalleeMF)
8932 return UnknownCallOutlineType;
8933
8934 // Check if we know anything about the callee saves on the function. If we
8935 // don't, then don't touch it, since that implies that we haven't
8936 // computed anything about its stack frame yet.
8937 MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
8938 if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
8939 MFI.getNumObjects() > 0)
8940 return UnknownCallOutlineType;
8941
8942 // At this point, we can say that CalleeMF ought to not pass anything on the
8943 // stack. Therefore, we can outline it.
8944 return outliner::InstrType::Legal;
8945 }
8946
8947 // Don't touch the link register or W30.
8948 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) ||
8949 MI.modifiesRegister(AArch64::W30, &getRegisterInfo()))
8950 return outliner::InstrType::Illegal;
8951
8952 // Don't outline BTI instructions, because that will prevent the outlining
8953 // site from being indirectly callable.
8954 if (hasBTISemantics(MI))
8955 return outliner::InstrType::Illegal;
8956
8957 return outliner::InstrType::Legal;
8958}
8959
8960void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
8961 for (MachineInstr &MI : MBB) {
8962 const MachineOperand *Base;
8963 TypeSize Width(0, false);
8964 int64_t Offset;
8965 bool OffsetIsScalable;
8966
8967 // Is this a load or store with an immediate offset with SP as the base?
8968 if (!MI.mayLoadOrStore() ||
8969 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
8970 &RI) ||
8971 (Base->isReg() && Base->getReg() != AArch64::SP))
8972 continue;
8973
8974 // It is, so we have to fix it up.
8975 TypeSize Scale(0U, false);
8976 int64_t Dummy1, Dummy2;
8977
8978 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(LdSt&: MI);
8979 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
8980 getMemOpInfo(Opcode: MI.getOpcode(), Scale, Width, MinOffset&: Dummy1, MaxOffset&: Dummy2);
8981 assert(Scale != 0 && "Unexpected opcode!");
8982 assert(!OffsetIsScalable && "Expected offset to be a byte offset");
8983
8984 // We've pushed the return address to the stack, so add 16 to the offset.
8985 // This is safe, since we already checked if it would overflow when we
8986 // checked if this instruction was legal to outline.
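 // For example (illustrative), an LDRXui "ldr x0, [sp, #8]" has Scale 8, so
 // its scaled immediate becomes (8 + 16) / 8 = 3, i.e. "ldr x0, [sp, #24]".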
8987 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedValue();
8988 StackOffsetOperand.setImm(NewImm);
8989 }
8990}
8991
8992static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
8993 const AArch64InstrInfo *TII,
8994 bool ShouldSignReturnAddr) {
8995 if (!ShouldSignReturnAddr)
8996 return;
8997
8998 BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(AArch64::PAUTH_PROLOGUE))
8999 .setMIFlag(MachineInstr::FrameSetup);
9000 BuildMI(MBB, MBB.getFirstInstrTerminator(), DebugLoc(),
9001 TII->get(AArch64::PAUTH_EPILOGUE))
9002 .setMIFlag(MachineInstr::FrameDestroy);
9003}
9004
9005void AArch64InstrInfo::buildOutlinedFrame(
9006 MachineBasicBlock &MBB, MachineFunction &MF,
9007 const outliner::OutlinedFunction &OF) const {
9008
9009 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
9010
9011 if (OF.FrameConstructionID == MachineOutlinerTailCall)
9012 FI->setOutliningStyle("Tail Call");
9013 else if (OF.FrameConstructionID == MachineOutlinerThunk) {
9014 // For thunk outlining, rewrite the last instruction from a call to a
9015 // tail-call.
9016 MachineInstr *Call = &*--MBB.instr_end();
9017 unsigned TailOpcode;
9018 if (Call->getOpcode() == AArch64::BL) {
9019 TailOpcode = AArch64::TCRETURNdi;
9020 } else {
9021 assert(Call->getOpcode() == AArch64::BLR ||
9022 Call->getOpcode() == AArch64::BLRNoIP);
9023 TailOpcode = AArch64::TCRETURNriALL;
9024 }
9025 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
9026 .add(Call->getOperand(i: 0))
9027 .addImm(0);
9028 MBB.insert(I: MBB.end(), MI: TC);
9029 Call->eraseFromParent();
9030
9031 FI->setOutliningStyle("Thunk");
9032 }
9033
9034 bool IsLeafFunction = true;
9035
9036 // Is there a call in the outlined range?
9037 auto IsNonTailCall = [](const MachineInstr &MI) {
9038 return MI.isCall() && !MI.isReturn();
9039 };
9040
9041 if (llvm::any_of(Range: MBB.instrs(), P: IsNonTailCall)) {
9042 // Fix up the instructions in the range, since we're going to modify the
9043 // stack.
9044
9045 // Bugzilla ID: 46767
9046 // TODO: Check if fixing up twice is safe so we can outline these.
9047 assert(OF.FrameConstructionID != MachineOutlinerDefault &&
9048 "Can only fix up stack references once");
9049 fixupPostOutline(MBB);
9050
9051 IsLeafFunction = false;
9052
9053 // LR has to be a live in so that we can save it.
9054 if (!MBB.isLiveIn(AArch64::LR))
9055 MBB.addLiveIn(AArch64::LR);
9056
9057 MachineBasicBlock::iterator It = MBB.begin();
9058 MachineBasicBlock::iterator Et = MBB.end();
9059
9060 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9061 OF.FrameConstructionID == MachineOutlinerThunk)
9062 Et = std::prev(x: MBB.end());
9063
9064 // Insert a save before the outlined region
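 // With these operands, STRXpre corresponds to "str x30, [sp, #-16]!", a
 // pre-indexed spill of LR that also moves SP down by 16 bytes.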
9065 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9066 .addReg(AArch64::SP, RegState::Define)
9067 .addReg(AArch64::LR)
9068 .addReg(AArch64::SP)
9069 .addImm(-16);
9070 It = MBB.insert(I: It, MI: STRXpre);
9071
9072 if (MF.getInfo<AArch64FunctionInfo>()->needsDwarfUnwindInfo(MF)) {
9073 const TargetSubtargetInfo &STI = MF.getSubtarget();
9074 const MCRegisterInfo *MRI = STI.getRegisterInfo();
9075 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true);
9076
9077 // Add a CFI saying the stack was moved 16 B down.
9078 int64_t StackPosEntry =
9079 MF.addFrameInst(Inst: MCCFIInstruction::cfiDefCfaOffset(L: nullptr, Offset: 16));
9080 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9081 .addCFIIndex(StackPosEntry)
9082 .setMIFlags(MachineInstr::FrameSetup);
9083
9084 // Add a CFI saying that the LR that we want to find is now 16 B higher
9085 // than before.
9086 int64_t LRPosEntry = MF.addFrameInst(
9087 Inst: MCCFIInstruction::createOffset(L: nullptr, Register: DwarfReg, Offset: -16));
9088 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
9089 .addCFIIndex(LRPosEntry)
9090 .setMIFlags(MachineInstr::FrameSetup);
9091 }
9092
9093 // Insert a restore before the terminator for the function.
9094 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9095 .addReg(AArch64::SP, RegState::Define)
9096 .addReg(AArch64::LR, RegState::Define)
9097 .addReg(AArch64::SP)
9098 .addImm(16);
9099 Et = MBB.insert(I: Et, MI: LDRXpost);
9100 }
9101
9102 bool ShouldSignReturnAddr = FI->shouldSignReturnAddress(SpillsLR: !IsLeafFunction);
9103
9104 // If this is a tail call outlined function, then there's already a return.
9105 if (OF.FrameConstructionID == MachineOutlinerTailCall ||
9106 OF.FrameConstructionID == MachineOutlinerThunk) {
9107 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
9108 return;
9109 }
9110
9111 // It's not a tail call, so we have to insert the return ourselves.
9112
9113 // LR has to be a live in so that we can return to it.
9114 if (!MBB.isLiveIn(AArch64::LR))
9115 MBB.addLiveIn(AArch64::LR);
9116
9117 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
9118 .addReg(AArch64::LR);
9119 MBB.insert(I: MBB.end(), MI: ret);
9120
9121 signOutlinedFunction(MF, MBB, TII: this, ShouldSignReturnAddr);
9122
9123 FI->setOutliningStyle("Function");
9124
9125 // Did we have to modify the stack by saving the link register?
9126 if (OF.FrameConstructionID != MachineOutlinerDefault)
9127 return;
9128
9129 // We modified the stack.
9130 // Walk over the basic block and fix up all the stack accesses.
9131 fixupPostOutline(MBB);
9132}
9133
9134MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
9135 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
9136 MachineFunction &MF, outliner::Candidate &C) const {
9137
9138 // Are we tail calling?
9139 if (C.CallConstructionID == MachineOutlinerTailCall) {
9140 // If yes, then we can just branch to the label.
9141 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi))
9142 .addGlobalAddress(M.getNamedValue(MF.getName()))
9143 .addImm(0));
9144 return It;
9145 }
9146
9147 // Are we saving the link register?
9148 if (C.CallConstructionID == MachineOutlinerNoLRSave ||
9149 C.CallConstructionID == MachineOutlinerThunk) {
9150 // No, so just insert the call.
9151 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9152 .addGlobalAddress(M.getNamedValue(MF.getName())));
9153 return It;
9154 }
9155
9156 // We want to return the spot where we inserted the call.
9157 MachineBasicBlock::iterator CallPt;
9158
9159 // Instructions for saving and restoring LR around the call instruction we're
9160 // going to insert.
9161 MachineInstr *Save;
9162 MachineInstr *Restore;
9163 // Can we save to a register?
9164 if (C.CallConstructionID == MachineOutlinerRegSave) {
9165 // FIXME: This logic should be sunk into a target-specific interface so that
9166 // we don't have to recompute the register.
9167 Register Reg = findRegisterToSaveLRTo(C);
9168 assert(Reg && "No callee-saved register available?");
9169
9170 // LR has to be a live in so that we can save it.
9171 if (!MBB.isLiveIn(AArch64::LR))
9172 MBB.addLiveIn(AArch64::LR);
9173
9174 // Save and restore LR from Reg.
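 // ORRXrs with an XZR operand is the canonical "mov" alias, so this emits
 // roughly "mov <Reg>, lr" before the call and "mov lr, <Reg>" after it.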
9175 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg)
9176 .addReg(AArch64::XZR)
9177 .addReg(AArch64::LR)
9178 .addImm(0);
9179 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR)
9180 .addReg(AArch64::XZR)
9181 .addReg(Reg)
9182 .addImm(0);
9183 } else {
9184 // We have the default case. Save and restore from SP.
9185 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
9186 .addReg(AArch64::SP, RegState::Define)
9187 .addReg(AArch64::LR)
9188 .addReg(AArch64::SP)
9189 .addImm(-16);
9190 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
9191 .addReg(AArch64::SP, RegState::Define)
9192 .addReg(AArch64::LR, RegState::Define)
9193 .addReg(AArch64::SP)
9194 .addImm(16);
9195 }
9196
9197 It = MBB.insert(I: It, MI: Save);
9198 It++;
9199
9200 // Insert the call.
9201 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL))
9202 .addGlobalAddress(M.getNamedValue(MF.getName())));
9203 CallPt = It;
9204 It++;
9205
9206 It = MBB.insert(I: It, MI: Restore);
9207 return CallPt;
9208}
9209
9210bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
9211 MachineFunction &MF) const {
9212 return MF.getFunction().hasMinSize();
9213}
9214
9215void AArch64InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
9216 MachineBasicBlock::iterator Iter,
9217 DebugLoc &DL,
9218 bool AllowSideEffects) const {
9219 const MachineFunction &MF = *MBB.getParent();
9220 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
9221 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
9222
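 // Depending on the register class and available features, this emits
 // roughly: a MOVZ of #0 for a general-purpose register, an SVE DUP of #0
 // when SVE is available, or a NEON MOVI of #0 otherwise.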
9223 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
9224 BuildMI(MBB, Iter, DL, get(AArch64::MOVZXi), Reg).addImm(0).addImm(0);
9225 } else if (STI.hasSVE()) {
9226 BuildMI(MBB, Iter, DL, get(AArch64::DUP_ZI_D), Reg)
9227 .addImm(0)
9228 .addImm(0);
9229 } else {
9230 BuildMI(MBB, Iter, DL, get(AArch64::MOVIv2d_ns), Reg)
9231 .addImm(0);
9232 }
9233}
9234
9235std::optional<DestSourcePair>
9236AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
9237
9238 // AArch64::ORRWrs and AArch64::ORRXrs with a WZR/XZR source register
9239 // and a zero immediate operand are used as an alias for the mov instruction.
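 // For example, "orr w0, wzr, w1, lsl #0" is the canonical encoding of
 // "mov w0, w1" and is reported here as a copy from w1 to w0
 // (registers are purely illustrative).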
9240 if (MI.getOpcode() == AArch64::ORRWrs &&
9241 MI.getOperand(1).getReg() == AArch64::WZR &&
9242 MI.getOperand(3).getImm() == 0x0 &&
9243 // Check that the w->w move is not a zero-extending w->x mov.
9244 (!MI.getOperand(0).getReg().isVirtual() ||
9245 MI.getOperand(0).getSubReg() == 0) &&
9246 (!MI.getOperand(0).getReg().isPhysical() ||
9247 MI.findRegisterDefOperandIdx(MI.getOperand(0).getReg() - AArch64::W0 +
9248 AArch64::X0,
9249 /*TRI=*/nullptr) == -1))
9250 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
9251
9252 if (MI.getOpcode() == AArch64::ORRXrs &&
9253 MI.getOperand(1).getReg() == AArch64::XZR &&
9254 MI.getOperand(3).getImm() == 0x0)
9255 return DestSourcePair{MI.getOperand(i: 0), MI.getOperand(i: 2)};
9256
9257 return std::nullopt;
9258}
9259
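/// Unlike isCopyInstrImpl, treat an ORRWrs from WZR with a zero shift as a
/// copy even when the w->w move also acts as a zero-extending w->x mov.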
std::optional<DestSourcePair>
AArch64InstrInfo::isCopyLikeInstrImpl(const MachineInstr &MI) const {
  if (MI.getOpcode() == AArch64::ORRWrs &&
      MI.getOperand(1).getReg() == AArch64::WZR &&
      MI.getOperand(3).getImm() == 0x0)
    return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
  return std::nullopt;
}

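/// If \p MI is an ADD/SUB-immediate instruction that defines \p Reg, return
/// the base register together with the signed, shift-adjusted immediate.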
std::optional<RegImmPair>
AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, Register Reg) const {
  int Sign = 1;
  int64_t Offset = 0;

  // TODO: Handle cases where Reg is a super- or sub-register of the
  // destination register.
  const MachineOperand &Op0 = MI.getOperand(0);
  if (!Op0.isReg() || Reg != Op0.getReg())
    return std::nullopt;

  switch (MI.getOpcode()) {
  default:
    return std::nullopt;
  case AArch64::SUBWri:
  case AArch64::SUBXri:
  case AArch64::SUBSWri:
  case AArch64::SUBSXri:
    Sign *= -1;
    [[fallthrough]];
  case AArch64::ADDSWri:
  case AArch64::ADDSXri:
  case AArch64::ADDWri:
  case AArch64::ADDXri: {
    // TODO: Third operand can be global address (usually some string).
    if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
        !MI.getOperand(2).isImm())
      return std::nullopt;
    int Shift = MI.getOperand(3).getImm();
    assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
    Offset = Sign * (MI.getOperand(2).getImm() << Shift);
  }
  }
  return RegImmPair{MI.getOperand(1).getReg(), Offset};
}

/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
/// the destination register then, if possible, describe the value in terms of
/// the source register.
static std::optional<ParamLoadedValue>
describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
                       const TargetInstrInfo *TII,
                       const TargetRegisterInfo *TRI) {
  auto DestSrc = TII->isCopyLikeInstr(MI);
  if (!DestSrc)
    return std::nullopt;

  Register DestReg = DestSrc->Destination->getReg();
  Register SrcReg = DestSrc->Source->getReg();

  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});

  // If the described register is the destination, just return the source.
  if (DestReg == DescribedReg)
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // ORRWrs zero-extends to 64-bits, so we need to consider such cases.
  if (MI.getOpcode() == AArch64::ORRWrs &&
      TRI->isSuperRegister(DestReg, DescribedReg))
    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);

  // We may need to describe the lower part of a ORRXrs move.
  if (MI.getOpcode() == AArch64::ORRXrs &&
      TRI->isSubRegister(DestReg, DescribedReg)) {
    Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
  }

  assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
         "Unhandled ORR[XW]rs copy case");

  return std::nullopt;
}

bool AArch64InstrInfo::isFunctionSafeToSplit(const MachineFunction &MF) const {
  // Functions cannot be split to different sections on AArch64 if they have
  // a red zone. This is because relaxing a cross-section branch may require
  // incrementing the stack pointer to spill a register, which would overwrite
  // the red zone.
  if (MF.getInfo<AArch64FunctionInfo>()->hasRedZone().value_or(true))
    return false;

  return TargetInstrInfo::isFunctionSafeToSplit(MF);
}

bool AArch64InstrInfo::isMBBSafeToSplitToCold(
    const MachineBasicBlock &MBB) const {
  // Asm Goto blocks can contain conditional branches to goto labels, which can
  // get moved out of range of the branch instruction.
  auto isAsmGoto = [](const MachineInstr &MI) {
    return MI.getOpcode() == AArch64::INLINEASM_BR;
  };
  if (llvm::any_of(MBB, isAsmGoto) || MBB.isInlineAsmBrIndirectTarget())
    return false;

  // Because jump tables are label-relative instead of table-relative, they all
  // must be in the same section or relocation fixup handling will fail.

  // Check if MBB is a jump table target
  const MachineJumpTableInfo *MJTI = MBB.getParent()->getJumpTableInfo();
  auto containsMBB = [&MBB](const MachineJumpTableEntry &JTE) {
    return llvm::is_contained(JTE.MBBs, &MBB);
  };
  if (MJTI != nullptr && llvm::any_of(MJTI->getJumpTables(), containsMBB))
    return false;

  // Check if MBB contains a jump table lookup
  for (const MachineInstr &MI : MBB) {
    switch (MI.getOpcode()) {
    case TargetOpcode::G_BRJT:
    case AArch64::JumpTableDest32:
    case AArch64::JumpTableDest16:
    case AArch64::JumpTableDest8:
      return false;
    default:
      continue;
    }
  }

  // MBB isn't a special case, so it's safe to be split to the cold section.
  return true;
}

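/// Describe, for debug info, the value loaded into \p Reg by \p MI: MOVZ
/// becomes an immediate (shifted into place), ORR register moves are handled
/// by describeORRLoadedValue, and everything else falls back to the generic
/// implementation.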
std::optional<ParamLoadedValue>
AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
                                      Register Reg) const {
  const MachineFunction *MF = MI.getMF();
  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
  switch (MI.getOpcode()) {
  case AArch64::MOVZWi:
  case AArch64::MOVZXi: {
    // MOVZWi may be used for producing zero-extended 32-bit immediates in
    // 64-bit parameters, so we need to consider super-registers.
    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
      return std::nullopt;

    if (!MI.getOperand(1).isImm())
      return std::nullopt;
    int64_t Immediate = MI.getOperand(1).getImm();
    int Shift = MI.getOperand(2).getImm();
    return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
                            nullptr);
  }
  case AArch64::ORRWrs:
  case AArch64::ORRXrs:
    return describeORRLoadedValue(MI, Reg, this, TRI);
  }

  return TargetInstrInfo::describeLoadedValue(MI, Reg);
}

bool AArch64InstrInfo::isExtendLikelyToBeFolded(
    MachineInstr &ExtMI, MachineRegisterInfo &MRI) const {
  assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ZEXT ||
         ExtMI.getOpcode() == TargetOpcode::G_ANYEXT);

  // Anyexts are nops.
  if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT)
    return true;

  Register DefReg = ExtMI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(DefReg))
    return false;

  // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an
  // addressing mode.
  auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg);
  return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD;
}

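// The following queries decode target-specific TSFlags that TableGen attaches
// to each AArch64 instruction description.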
uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::ElementSizeMask;
}

bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike;
}

bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const {
  return get(Opc).TSFlags & AArch64::InstrFlagIsWhile;
}

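/// Allow a larger tail-duplication limit when optimizing aggressively.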
unsigned int
AArch64InstrInfo::getTailDuplicateSize(CodeGenOptLevel OptLevel) const {
  return OptLevel >= CodeGenOptLevel::Aggressive ? 6 : 2;
}

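/// With \p Scale == 0 this checks the reg+imm forms (9-bit signed unscaled or
/// 12-bit unsigned scaled by the access size); otherwise it checks the
/// reg+reg forms, where the index register may be scaled by the access size.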
bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                                             unsigned Scale) const {
  if (Offset && Scale)
    return false;

  // Check Reg + Imm
  if (!Scale) {
    // 9-bit signed offset
    if (isInt<9>(Offset))
      return true;

    // 12-bit unsigned offset
    unsigned Shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> Shift) << Shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  return Scale == 1 || (Scale > 0 && Scale == NumBytes);
}

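/// Return the opcode to use for an indirect call: the SLS-hardened BLRNoIP
/// pseudo when BLR hardening is enabled, plain BLR otherwise.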
unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
    return AArch64::BLRNoIP;
  else
    return AArch64::BLR;
}

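/// Emit a stack-probing loop that lowers SP down to \p TargetReg: the test
/// block decrements SP by the probe size and compares it against \p TargetReg,
/// the body block stores XZR to the new top of stack, and the exit block moves
/// SP to \p TargetReg and probes the final page. Returns an iterator at the
/// start of the exit block.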
MachineBasicBlock::iterator
AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
                                   Register TargetReg, bool FrameSetup) const {
  assert(TargetReg != AArch64::SP &&
         "New top of stack cannot already be in SP");

  MachineBasicBlock &MBB = *MBBI->getParent();
  MachineFunction &MF = *MBB.getParent();
  const AArch64InstrInfo *TII =
      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
  DebugLoc DL = MBB.findDebugLoc(MBBI);

  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
  MachineBasicBlock *LoopTestMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopTestMBB);
  MachineBasicBlock *LoopBodyMBB =
      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, LoopBodyMBB);
  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
  MF.insert(MBBInsertPoint, ExitMBB);
  MachineInstr::MIFlag Flags =
      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;

  // LoopTest:
  // SUB SP, SP, #ProbeSize
  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);

  // CMP SP, TargetReg
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
          AArch64::XZR)
      .addReg(AArch64::SP)
      .addReg(TargetReg)
      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
      .setMIFlags(Flags);

  // B.<Cond> LoopExit
  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
      .addImm(AArch64CC::LE)
      .addMBB(ExitMBB)
      .setMIFlags(Flags);

  // STR XZR, [SP]
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
      .addReg(AArch64::XZR)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  // B loop
  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
      .addMBB(LoopTestMBB)
      .setMIFlags(Flags);

  // LoopExit:
  // MOV SP, TargetReg
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
      .addReg(TargetReg)
      .addImm(0)
      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
      .setMIFlags(Flags);

  // LDR XZR, [SP]
  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui))
      .addReg(AArch64::XZR, RegState::Define)
      .addReg(AArch64::SP)
      .addImm(0)
      .setMIFlags(Flags);

  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);

  LoopTestMBB->addSuccessor(ExitMBB);
  LoopTestMBB->addSuccessor(LoopBodyMBB);
  LoopBodyMBB->addSuccessor(LoopTestMBB);
  MBB.addSuccessor(LoopTestMBB);

  // Update liveins.
  if (MF.getRegInfo().reservedRegsFrozen())
    fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

  return ExitMBB->begin();
}

namespace {
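/// Loop-control information handed back to the MachinePipeliner: it remembers
/// the instruction that feeds the loop's conditional branch and the
/// normalized branch condition used when guarding the epilogue.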
class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
  MachineInstr *PredBranch;
  SmallVector<MachineOperand, 4> Cond;

public:
  AArch64PipelinerLoopInfo(MachineInstr *PredBranch,
                           const SmallVectorImpl<MachineOperand> &Cond)
      : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {}

  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
    // Make the instructions for loop control be placed in stage 0.
    // The predecessors of PredBranch are considered by the caller.
    return MI == PredBranch;
  }

  std::optional<bool> createTripCountGreaterCondition(
      int TC, MachineBasicBlock &MBB,
      SmallVectorImpl<MachineOperand> &CondParam) override {
    // A branch instruction will be inserted as "if (Cond) goto epilogue".
    // Cond is normalized for such use.
    // The predecessors of the branch are assumed to have already been inserted.
    CondParam = Cond;
    return {};
  }

  void setPreheader(MachineBasicBlock *NewPreheader) override {}

  void adjustTripCount(int TripCountAdjust) override {}

  void disposed() override {}
};
} // namespace

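/// Return true for the compare-and-branch (CBZ/CBNZ) and test-and-branch
/// (TBZ/TBNZ) opcodes.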
static bool isCompareAndBranch(unsigned Opcode) {
  switch (Opcode) {
  case AArch64::CBZW:
  case AArch64::CBZX:
  case AArch64::CBNZW:
  case AArch64::CBNZX:
  case AArch64::TBZW:
  case AArch64::TBZX:
  case AArch64::TBNZW:
  case AArch64::TBNZX:
    return true;
  }
  return false;
}

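/// Analyze a single-block loop for software pipelining: locate the conditional
/// back-branch and the in-loop instruction that produces its condition (the
/// last NZCV def for Bcc, or the defining instruction of the tested register
/// for CBZ/TBZ-style branches); bail out if no such instruction is found.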
std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (analyzeBranch(*LoopBB, TBB, FBB, Cond))
    return nullptr;

  // Infinite loops are not supported
  if (TBB == LoopBB && FBB == LoopBB)
    return nullptr;

  // Must be conditional branch
  if (FBB == nullptr)
    return nullptr;

  assert((TBB == LoopBB || FBB == LoopBB) &&
         "The Loop must be a single-basic-block loop");

  // Normalization for createTripCountGreaterCondition()
  if (TBB == LoopBB)
    reverseBranchCondition(Cond);

  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
  const TargetRegisterInfo &TRI = getRegisterInfo();

  // Find the immediate predecessor of the conditional branch
  MachineInstr *PredBranch = nullptr;
  if (CondBranch->getOpcode() == AArch64::Bcc) {
    for (MachineInstr &MI : reverse(*LoopBB)) {
      if (MI.modifiesRegister(AArch64::NZCV, &TRI)) {
        PredBranch = &MI;
        break;
      }
    }
    if (!PredBranch)
      return nullptr;
  } else if (isCompareAndBranch(CondBranch->getOpcode())) {
    const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
    Register Reg = CondBranch->getOperand(0).getReg();
    if (!Reg.isVirtual())
      return nullptr;
    PredBranch = MRI.getVRegDef(Reg);

    // MachinePipeliner does not expect that the immediate predecessor is a Phi
    if (PredBranch->isPHI())
      return nullptr;

    if (PredBranch->getParent() != LoopBB)
      return nullptr;
  } else {
    return nullptr;
  }

  return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond);
}

#define GET_INSTRINFO_HELPERS
#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"

