//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  /// Save state that is shared between select calls, call select on \p I and
  /// then restore the saved state. This can be used to recursively call select
  /// within a select call.
  bool selectAndRestoreState(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;
  /// Helper to narrow vector that was widened by emitScalarToVector.
  /// Copy lowest part of 128-bit or 64-bit vector to 64-bit or 32-bit
  /// vector, correspondingly.
  MachineInstr *emitNarrowVector(Register DstReg, Register SrcReg,
                                 MachineIRBuilder &MIRBuilder,
                                 MachineRegisterInfo &MRI) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if
  /// std::nullopt is provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(std::optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  MachineInstr *tryAdvSIMDModImm8(Register Dst, unsigned DstSize, APInt Bits,
                                  MachineIRBuilder &MIRBuilder);

  MachineInstr *tryAdvSIMDModImm16(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);

  MachineInstr *tryAdvSIMDModImm32(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImm64(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);
  MachineInstr *tryAdvSIMDModImm321s(Register Dst, unsigned DstSize, APInt Bits,
                                     MachineIRBuilder &MIRBuilder, bool Inv);
  MachineInstr *tryAdvSIMDModImmFP(Register Dst, unsigned DstSize, APInt Bits,
                                   MachineIRBuilder &MIRBuilder);

  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
  /// SUBREG_TO_REG.
  bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Helper function to select vector load intrinsics like
  /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
  /// \p Opc is the opcode that the selected instruction should use.
  /// \p NumVecs is the number of vector destinations for the instruction.
  /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
  bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
                                 MachineInstr &I);
  bool selectVectorLoadLaneIntrinsic(unsigned Opc, unsigned NumVecs,
                                     MachineInstr &I);
  void selectVectorStoreIntrinsic(MachineInstr &I, unsigned NumVecs,
                                  unsigned Opc);
  bool selectVectorStoreLaneIntrinsic(MachineInstr &I, unsigned NumVecs,
                                      unsigned Opc);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectIndexedExtLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedLoad(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIndexedStore(GIndexedStore &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(std::optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *
  emitFPCompare(Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
                std::optional<CmpInst::Predicate> = std::nullopt) const;

  MachineInstr *
  emitInstr(unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
            std::initializer_list<llvm::SrcOp> SrcOps,
            MachineIRBuilder &MIRBuilder,
            const ComplexRendererFns &RenderFns = std::nullopt) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
  /// in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  /// const std::array<std::array<unsigned, 2>, 5> Table {
  ///  {{AArch64::ADDXri, AArch64::ADDWri},
  ///   {AArch64::ADDXrs, AArch64::ADDWrs},
  ///   {AArch64::ADDXrr, AArch64::ADDWrr},
  ///   {AArch64::SUBXri, AArch64::SUBWri},
  ///   {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSBCS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(std::optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
                          AArch64CC::CondCode Pred,
                          MachineIRBuilder &MIRBuilder) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit an instruction that sets NZCV to the carry-in expected by \p I.
  /// Might elide the instruction if the previous instruction already sets NZCV
  /// correctly.
  MachineInstr *emitCarryIn(MachineInstr &I, Register CarryReg);

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  bool selectOverflowOp(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
  /// In some cases this is even possible with OR operations in the expression.
  MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
                                MachineIRBuilder &MIB) const;
  MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
                                          CmpInst::Predicate CC,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          MachineIRBuilder &MIB) const;
  MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
                                   bool Negate, Register CCOp,
                                   AArch64CC::CondCode Predicate,
                                   MachineIRBuilder &MIB) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
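  /// When \p IsNegative is true a TBNZ is emitted; otherwise a TBZ is emitted.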
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and
  /// whether or not a shift + extend should be folded into an addressing mode.
  /// Returns std::nullopt when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
                                           bool AllowROR = false) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root, true);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  ComplexRendererFns selectExtractHigh(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderUbsanTrap(MachineInstrBuilder &MIB, const MachineInstr &MI,
                       int OpIdx) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
                                    const MachineInstr &MI,
                                    int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(GSelect &Sel);
  bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
//
/// Given a register bank, and a type, return the smallest register class that
/// can represent that combination.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    switch (Ty.getSizeInBits()) {
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
                      bool GetAllRegSet = false) {
  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of REG_SEQUENCE instruction that
/// was created, or the 0th element of \p Regs if \p Regs contains a single
/// element.
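///
/// For illustration only, a sketch of a typical call through one of the
/// wrappers below (the source register names here are hypothetical):
/// \code
///   // Combine two Q registers into a QQ tuple, e.g. for a multi-vector
///   // store like ST2.
///   Register Tuple = createQTuple({Src0, Src1}, MIB);
/// \endcode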
static Register createTuple(ArrayRef<Register> Regs,
                            const unsigned RegClassIDs[],
                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
  unsigned NumRegs = Regs.size();
  if (NumRegs == 1)
    return Regs[0];
  assert(NumRegs >= 2 && NumRegs <= 4 &&
         "Only support between two and 4 registers in a tuple!");
  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
  auto RegSequence =
      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
    RegSequence.addUse(Regs[I]);
    RegSequence.addImm(SubRegs[I]);
  }
  return RegSequence.getReg(0);
}

/// Create a tuple of D-registers using the registers in \p Regs.
static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
                                     AArch64::dsub2, AArch64::dsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

/// Create a tuple of Q-registers using the registers in \p Regs.
static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
                                     AArch64::qsub2, AArch64::qsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

static std::optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MI = *Root.getParent();
  auto &MBB = *MI.getParent();
  auto &MF = *MBB.getParent();
  auto &MRI = MF.getRegInfo();
  uint64_t Immed;
  if (Root.isImm())
    Immed = Root.getImm();
  else if (Root.isCImm())
    Immed = Root.getCImm()->getZExtValue();
  else if (Root.isReg()) {
    auto ValAndVReg =
        getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
    if (!ValAndVReg)
      return std::nullopt;
    Immed = ValAndVReg->Value.getSExtValue();
  } else
    return std::nullopt;
  return Immed;
}

/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - not all operands are on the same register bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the aarch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
                             const AArch64RegisterBankInfo &RBI,
                             const MachineRegisterInfo &MRI,
                             const AArch64RegisterInfo &TRI) {
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  if (!Ty.isValid()) {
    LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
    return true;
  }

  const RegisterBank *PrevOpBank = nullptr;
  for (auto &MO : I.operands()) {
    // FIXME: Support non-register operands.
    if (!MO.isReg()) {
      LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
      return true;
    }

    // FIXME: Can generic operations have physical registers operands? If
    // so, this will need to be taught about that, and we'll need to get the
    // bank out of the minimal class for the register.
    // Either way, this needs to be documented (and possibly verified).
    if (!MO.getReg().isVirtual()) {
      LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
      return true;
    }

    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
    if (!OpBank) {
      LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
      return true;
    }

    if (PrevOpBank && OpBank != PrevOpBank) {
      LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
      return true;
    }
    PrevOpBank = OpBank;
  }
  return false;
}

/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
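      // Note: there is no scalar FPR logical instruction, so a 64-bit OR on
      // the FPR bank is selected to the 64-bit vector ORR, which operates on
      // the same D register.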
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize. This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
    case 16:
      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
    case 32:
      return isStore ? AArch64::STRWui : AArch64::LDRWui;
    case 64:
      return isStore ? AArch64::STRXui : AArch64::LDRXui;
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBui : AArch64::LDRBui;
    case 16:
      return isStore ? AArch64::STRHui : AArch64::LDRHui;
    case 32:
      return isStore ? AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    case 128:
      return isStore ? AArch64::STRQui : AArch64::LDRQui;
    }
    break;
  }
  return GenericOpc;
}

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!I.getOperand(0).getReg().isPhysical())
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
    SrcSize = DstSize = 32;

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

// FIXME: We need some sort of API in RBI/TRI to allow generic code to
// constrain operands of simple instructions given a TargetRegisterClass
// and LLT
static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
                             const RegisterBankInfo &RBI) {
  for (MachineOperand &MO : I.operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg)
      continue;
    if (Reg.isPhysical())
      continue;
    LLT Ty = MRI.getType(Reg);
    const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
    const TargetRegisterClass *RC =
        RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
    if (!RC) {
      const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
      RC = getRegClassForTypeOnBank(Ty, RB);
      if (!RC) {
        LLVM_DEBUG(
            dbgs() << "Warning: DBG_VALUE operand has unexpected size/bank\n");
        break;
      }
    }
    RBI.constrainGenericRegister(Reg, *RC, MRI);
  }

  return true;
}

static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (DstReg.isPhysical())
      return true;
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its use or its defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT that we want to just reduce down into a copy.
  // The sizes will be mismatched with the source < 32b but that's ok.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return true;
}

static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  default:
    return GenericOpc;
  };
  return GenericOpc;
}

MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
                                    AArch64CC::CondCode &CondCode,
                                    AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    break;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    break;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    break;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    break;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    break;
  case CmpInst::FCMP_ONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    break;
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    break;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    break;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    break;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    break;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// Convert an IR fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
                                     AArch64CC::CondCode &CondCode,
                                     AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
    assert(CondCode2 == AArch64CC::AL);
    break;
  case CmpInst::FCMP_ONE:
    // (a one b)
    // == ((a olt b) || (a ogt b))
    // == ((a ord b) && (a une b))
    CondCode = AArch64CC::VC;
    CondCode2 = AArch64CC::NE;
    break;
  case CmpInst::FCMP_UEQ:
    // (a ueq b)
    // == ((a uno b) || (a oeq b))
    // == ((a ule b) && (a uge b))
    CondCode = AArch64CC::PL;
    CondCode2 = AArch64CC::LE;
    break;
  }
}

/// Return a register which can be used as a bit to test in a TB(N)Z.
static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
                              MachineRegisterInfo &MRI) {
  assert(Reg.isValid() && "Expected valid register!");
  bool HasZext = false;
  while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
    unsigned Opc = MI->getOpcode();

    if (!MI->getOperand(0).isReg() ||
        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
      break;

    // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
    //
    // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
    // on the truncated x is the same as the bit number on x.
    if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
        Opc == TargetOpcode::G_TRUNC) {
      if (Opc == TargetOpcode::G_ZEXT)
        HasZext = true;

      Register NextReg = MI->getOperand(1).getReg();
      // Did we find something worth folding?
      if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
        break;

      // NextReg is worth folding. Keep looking.
      Reg = NextReg;
      continue;
    }

    // Attempt to find a suitable operation with a constant on one side.
    std::optional<uint64_t> C;
    Register TestReg;
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
    case TargetOpcode::G_XOR: {
      TestReg = MI->getOperand(1).getReg();
      Register ConstantReg = MI->getOperand(2).getReg();
      auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      if (!VRegAndVal) {
        // AND commutes, check the other side for a constant.
        // FIXME: Can we canonicalize the constant so that it's always on the
        // same side at some point earlier?
        std::swap(ConstantReg, TestReg);
        VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
      }
      if (VRegAndVal) {
        if (HasZext)
          C = VRegAndVal->Value.getZExtValue();
        else
          C = VRegAndVal->Value.getSExtValue();
      }
      break;
    }
    case TargetOpcode::G_ASHR:
    case TargetOpcode::G_LSHR:
    case TargetOpcode::G_SHL: {
      TestReg = MI->getOperand(1).getReg();
      auto VRegAndVal =
          getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
      if (VRegAndVal)
        C = VRegAndVal->Value.getSExtValue();
      break;
    }
    }

    // Didn't find a constant or viable register. Bail out of the loop.
    if (!C || !TestReg.isValid())
      break;

    // We found a suitable instruction with a constant. Check to see if we can
    // walk through the instruction.
    Register NextReg;
    unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
    switch (Opc) {
    default:
      break;
    case TargetOpcode::G_AND:
      // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
      if ((*C >> Bit) & 1)
        NextReg = TestReg;
      break;
    case TargetOpcode::G_SHL:
      // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
      // the type of the register.
      if (*C <= Bit && (Bit - *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit - *C;
      }
      break;
    case TargetOpcode::G_ASHR:
      // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
      // in x
      NextReg = TestReg;
      Bit = Bit + *C;
      if (Bit >= TestRegSize)
        Bit = TestRegSize - 1;
      break;
    case TargetOpcode::G_LSHR:
      // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
      if ((Bit + *C) < TestRegSize) {
        NextReg = TestReg;
        Bit = Bit + *C;
      }
      break;
    case TargetOpcode::G_XOR:
      // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
      // appropriate.
      //
      // e.g. If x' = xor x, c, and the b-th bit is set in c then
      //
      // tbz x', b -> tbnz x, b
      //
      // Because x' only has the b-th bit set if x does not.
      if ((*C >> Bit) & 1)
        Invert = !Invert;
      NextReg = TestReg;
      break;
    }

    // Check if we found anything worth folding.
    if (!NextReg.isValid())
      return Reg;
    Reg = NextReg;
  }

  return Reg;
}

1572MachineInstr *AArch64InstructionSelector::emitTestBit(
1573 Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1574 MachineIRBuilder &MIB) const {
1575 assert(TestReg.isValid());
1576 assert(ProduceNonFlagSettingCondBr &&
1577 "Cannot emit TB(N)Z with speculation tracking!");
1578 MachineRegisterInfo &MRI = *MIB.getMRI();
1579
1580 // Attempt to optimize the test bit by walking over instructions.
1581 TestReg = getTestBitReg(Reg: TestReg, Bit, Invert&: IsNegative, MRI);
1582 LLT Ty = MRI.getType(Reg: TestReg);
1583 unsigned Size = Ty.getSizeInBits();
1584 assert(!Ty.isVector() && "Expected a scalar!");
1585 assert(Bit < 64 && "Bit is too large!");
1586
1587  // When the bit to test fits in 32 bits we use the W form of TB(N)Z;
1588  // otherwise we use the X form. Narrow or widen the test register to match.
1589 bool UseWReg = Bit < 32;
1590 unsigned NecessarySize = UseWReg ? 32 : 64;
1591 if (Size != NecessarySize)
1592 TestReg = moveScalarRegClass(
1593 TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1594 MIB);
1595
1596 static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1597 {AArch64::TBZW, AArch64::TBNZW}};
1598 unsigned Opc = OpcTable[UseWReg][IsNegative];
1599 auto TestBitMI =
1600 MIB.buildInstr(Opcode: Opc).addReg(RegNo: TestReg).addImm(Val: Bit).addMBB(MBB: DstMBB);
1601 constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1602 return &*TestBitMI;
1603}
1604
1605bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1606 MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1607 MachineIRBuilder &MIB) const {
1608 assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1609 // Given something like this:
1610 //
1611 // %x = ...Something...
1612 // %one = G_CONSTANT i64 1
1613 // %zero = G_CONSTANT i64 0
1614 // %and = G_AND %x, %one
1615 // %cmp = G_ICMP intpred(ne), %and, %zero
1616 // %cmp_trunc = G_TRUNC %cmp
1617 // G_BRCOND %cmp_trunc, %bb.3
1618 //
1619 // We want to try and fold the AND into the G_BRCOND and produce either a
1620 // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1621 //
1622 // In this case, we'd get
1623 //
1624 // TBNZ %x %bb.3
1625 //
1626
1627 // Check if the AND has a constant on its RHS which we can use as a mask.
1628 // If it's a power of 2, then it's the same as checking a specific bit.
1629  // (e.g., ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1630 auto MaybeBit = getIConstantVRegValWithLookThrough(
1631 VReg: AndInst.getOperand(i: 2).getReg(), MRI: *MIB.getMRI());
1632 if (!MaybeBit)
1633 return false;
1634
1635 int32_t Bit = MaybeBit->Value.exactLogBase2();
1636 if (Bit < 0)
1637 return false;
1638
1639 Register TestReg = AndInst.getOperand(i: 1).getReg();
1640
1641 // Emit a TB(N)Z.
1642 emitTestBit(TestReg, Bit, IsNegative: Invert, DstMBB, MIB);
1643 return true;
1644}
1645
1646MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1647 bool IsNegative,
1648 MachineBasicBlock *DestMBB,
1649 MachineIRBuilder &MIB) const {
1650 assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1651 MachineRegisterInfo &MRI = *MIB.getMRI();
1652 assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1653 AArch64::GPRRegBankID &&
1654 "Expected GPRs only?");
1655 auto Ty = MRI.getType(Reg: CompareReg);
1656 unsigned Width = Ty.getSizeInBits();
1657 assert(!Ty.isVector() && "Expected scalar only?");
1658 assert(Width <= 64 && "Expected width to be at most 64?");
1659 static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1660 {AArch64::CBNZW, AArch64::CBNZX}};
1661 unsigned Opc = OpcTable[IsNegative][Width == 64];
1662 auto BranchMI = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {CompareReg}).addMBB(MBB: DestMBB);
1663 constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1664 return &*BranchMI;
1665}
1666
1667bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1668 MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1669 assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1670 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1671 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1672 // totally clean. Some of them require two branches to implement.
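  // For example, an "one" (ordered not-equal) predicate has no single AArch64
  // condition code, so changeFCMPPredToAArch64CC returns a second code (CC2 !=
  // AL below) and we emit two Bcc instructions to the same destination.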
1673 auto Pred = (CmpInst::Predicate)FCmp.getOperand(i: 1).getPredicate();
1674 emitFPCompare(LHS: FCmp.getOperand(i: 2).getReg(), RHS: FCmp.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
1675 Pred);
1676 AArch64CC::CondCode CC1, CC2;
1677 changeFCMPPredToAArch64CC(P: static_cast<CmpInst::Predicate>(Pred), CondCode&: CC1, CondCode2&: CC2);
1678 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1679 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1680 if (CC2 != AArch64CC::AL)
1681 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1682 I.eraseFromParent();
1683 return true;
1684}
1685
1686bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1687 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1688 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1689 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1690 // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1691 //
1692 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1693 // instructions will not be produced, as they are conditional branch
1694 // instructions that do not set flags.
1695 if (!ProduceNonFlagSettingCondBr)
1696 return false;
1697
1698 MachineRegisterInfo &MRI = *MIB.getMRI();
1699 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1700 auto Pred =
1701 static_cast<CmpInst::Predicate>(ICmp.getOperand(i: 1).getPredicate());
1702 Register LHS = ICmp.getOperand(i: 2).getReg();
1703 Register RHS = ICmp.getOperand(i: 3).getReg();
1704
1705 // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1706 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1707 MachineInstr *AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1708
1709 // When we can emit a TB(N)Z, prefer that.
1710 //
1711 // Handle non-commutative condition codes first.
1712 // Note that we don't want to do this when we have a G_AND because it can
1713 // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1714 if (VRegAndVal && !AndInst) {
1715 int64_t C = VRegAndVal->Value.getSExtValue();
1716
1717    // When we have a signed greater-than comparison against -1, we can just
1718    // test if the msb is zero.
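    // (x > -1 (signed) holds iff x >= 0, i.e. iff the sign bit of x is clear,
    // so a TBZ on the most significant bit suffices.)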
1719 if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1720 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1721 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1722 I.eraseFromParent();
1723 return true;
1724 }
1725
1726    // When we have a signed less-than comparison against zero, we can just
1727    // test if the msb is not zero.
1728 if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1729 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1730 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ true, DstMBB: DestMBB, MIB);
1731 I.eraseFromParent();
1732 return true;
1733 }
1734
1735 // Inversely, if we have a signed greater-than-or-equal comparison to zero,
1736 // we can test if the msb is zero.
1737 if (C == 0 && Pred == CmpInst::ICMP_SGE) {
1738 uint64_t Bit = MRI.getType(Reg: LHS).getSizeInBits() - 1;
1739 emitTestBit(TestReg: LHS, Bit, /*IsNegative = */ false, DstMBB: DestMBB, MIB);
1740 I.eraseFromParent();
1741 return true;
1742 }
1743 }
1744
1745 // Attempt to handle commutative condition codes. Right now, that's only
1746 // eq/ne.
1747 if (ICmpInst::isEquality(P: Pred)) {
1748 if (!VRegAndVal) {
1749 std::swap(a&: RHS, b&: LHS);
1750 VRegAndVal = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
1751 AndInst = getOpcodeDef(Opcode: TargetOpcode::G_AND, Reg: LHS, MRI);
1752 }
1753
1754 if (VRegAndVal && VRegAndVal->Value == 0) {
1755 // If there's a G_AND feeding into this branch, try to fold it away by
1756 // emitting a TB(N)Z instead.
1757 //
1758 // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1759 // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1760 // would be redundant.
1761 if (AndInst &&
1762 tryOptAndIntoCompareBranch(
1763 AndInst&: *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DstMBB: DestMBB, MIB)) {
1764 I.eraseFromParent();
1765 return true;
1766 }
1767
1768 // Otherwise, try to emit a CB(N)Z instead.
1769 auto LHSTy = MRI.getType(Reg: LHS);
1770 if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1771 emitCBZ(CompareReg: LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1772 I.eraseFromParent();
1773 return true;
1774 }
1775 }
1776 }
1777
1778 return false;
1779}
1780
1781bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1782 MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1783 assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1784 assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1785 if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1786 return true;
1787
1788 // Couldn't optimize. Emit a compare + a Bcc.
1789 MachineBasicBlock *DestMBB = I.getOperand(i: 1).getMBB();
1790 auto PredOp = ICmp.getOperand(i: 1);
1791 emitIntegerCompare(LHS&: ICmp.getOperand(i: 2), RHS&: ICmp.getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
1792 const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1793 P: static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1794 MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1795 I.eraseFromParent();
1796 return true;
1797}
1798
1799bool AArch64InstructionSelector::selectCompareBranch(
1800 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1801 Register CondReg = I.getOperand(i: 0).getReg();
1802 MachineInstr *CCMI = MRI.getVRegDef(Reg: CondReg);
1803 // Try to select the G_BRCOND using whatever is feeding the condition if
1804 // possible.
1805 unsigned CCMIOpc = CCMI->getOpcode();
1806 if (CCMIOpc == TargetOpcode::G_FCMP)
1807 return selectCompareBranchFedByFCmp(I, FCmp&: *CCMI, MIB);
1808 if (CCMIOpc == TargetOpcode::G_ICMP)
1809 return selectCompareBranchFedByICmp(I, ICmp&: *CCMI, MIB);
1810
1811 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1812 // instructions will not be produced, as they are conditional branch
1813 // instructions that do not set flags.
1814 if (ProduceNonFlagSettingCondBr) {
1815 emitTestBit(TestReg: CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1816 DstMBB: I.getOperand(i: 1).getMBB(), MIB);
1817 I.eraseFromParent();
1818 return true;
1819 }
1820
1821 // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1822 auto TstMI =
1823 MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1824 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1825 auto Bcc = MIB.buildInstr(AArch64::Bcc)
1826 .addImm(AArch64CC::NE)
1827 .addMBB(I.getOperand(1).getMBB());
1828 I.eraseFromParent();
1829 return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1830}
1831
1832/// Returns the element immediate value of a vector shift operand if found.
1833/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1834static std::optional<int64_t> getVectorShiftImm(Register Reg,
1835 MachineRegisterInfo &MRI) {
1836 assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1837 MachineInstr *OpMI = MRI.getVRegDef(Reg);
1838 return getAArch64VectorSplatScalar(MI: *OpMI, MRI);
1839}
1840
1841/// Matches and returns the shift immediate value for a SHL instruction given
1842/// a shift operand.
1843static std::optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg,
1844 MachineRegisterInfo &MRI) {
1845 std::optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1846 if (!ShiftImm)
1847 return std::nullopt;
1848 // Check the immediate is in range for a SHL.
1849 int64_t Imm = *ShiftImm;
1850 if (Imm < 0)
1851 return std::nullopt;
1852 switch (SrcTy.getElementType().getSizeInBits()) {
1853 default:
1854 LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1855 return std::nullopt;
1856 case 8:
1857 if (Imm > 7)
1858 return std::nullopt;
1859 break;
1860 case 16:
1861 if (Imm > 15)
1862 return std::nullopt;
1863 break;
1864 case 32:
1865 if (Imm > 31)
1866 return std::nullopt;
1867 break;
1868 case 64:
1869 if (Imm > 63)
1870 return std::nullopt;
1871 break;
1872 }
1873 return Imm;
1874}
1875
1876bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1877 MachineRegisterInfo &MRI) {
1878 assert(I.getOpcode() == TargetOpcode::G_SHL);
1879 Register DstReg = I.getOperand(i: 0).getReg();
1880 const LLT Ty = MRI.getType(Reg: DstReg);
1881 Register Src1Reg = I.getOperand(i: 1).getReg();
1882 Register Src2Reg = I.getOperand(i: 2).getReg();
1883
1884 if (!Ty.isVector())
1885 return false;
1886
1887 // Check if we have a vector of constants on RHS that we can select as the
1888 // immediate form.
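  // For example (illustrative), shifting a <4 x s32> vector left by a
  // G_BUILD_VECTOR splat of 3 selects to SHLv4i32_shift with immediate 3,
  // while a non-constant shift amount selects to USHLv4i32 with the register
  // operand.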
1889 std::optional<int64_t> ImmVal = getVectorSHLImm(SrcTy: Ty, Reg: Src2Reg, MRI);
1890
1891 unsigned Opc = 0;
1892 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1893 Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1894 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1895 Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1896 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1897 Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1898 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1899 Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1900 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1901 Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1902 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1903 Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1904 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1905 Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1906 } else {
1907 LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1908 return false;
1909 }
1910
1911 auto Shl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg});
1912 if (ImmVal)
1913 Shl.addImm(Val: *ImmVal);
1914 else
1915 Shl.addUse(RegNo: Src2Reg);
1916 constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1917 I.eraseFromParent();
1918 return true;
1919}
1920
1921bool AArch64InstructionSelector::selectVectorAshrLshr(
1922 MachineInstr &I, MachineRegisterInfo &MRI) {
1923 assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1924 I.getOpcode() == TargetOpcode::G_LSHR);
1925 Register DstReg = I.getOperand(i: 0).getReg();
1926 const LLT Ty = MRI.getType(Reg: DstReg);
1927 Register Src1Reg = I.getOperand(i: 1).getReg();
1928 Register Src2Reg = I.getOperand(i: 2).getReg();
1929
1930 if (!Ty.isVector())
1931 return false;
1932
1933 bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1934
1935 // We expect the immediate case to be lowered in the PostLegalCombiner to
1936 // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1937
1938  // There is no shift-right-by-register instruction; instead, the
1939  // shift-left-by-register instruction takes a signed shift amount, where
1940  // negative values specify a right shift.
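  // For example (illustrative), a <2 x s64> G_ASHR by %amt is selected as
  //   %neg = NEGv2i64 %amt
  //   %dst = SSHLv2i64 %src, %neg
  // while a G_LSHR uses USHLv2i64 instead.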
1941
1942 unsigned Opc = 0;
1943 unsigned NegOpc = 0;
1944 const TargetRegisterClass *RC =
1945 getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1946 if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64)) {
1947 Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1948 NegOpc = AArch64::NEGv2i64;
1949 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
1950 Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1951 NegOpc = AArch64::NEGv4i32;
1952 } else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 32)) {
1953 Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1954 NegOpc = AArch64::NEGv2i32;
1955 } else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16)) {
1956 Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1957 NegOpc = AArch64::NEGv4i16;
1958 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16)) {
1959 Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1960 NegOpc = AArch64::NEGv8i16;
1961 } else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8)) {
1962 Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1963 NegOpc = AArch64::NEGv16i8;
1964 } else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8)) {
1965 Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1966 NegOpc = AArch64::NEGv8i8;
1967 } else {
1968 LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1969 return false;
1970 }
1971
1972 auto Neg = MIB.buildInstr(Opc: NegOpc, DstOps: {RC}, SrcOps: {Src2Reg});
1973 constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1974 auto SShl = MIB.buildInstr(Opc, DstOps: {DstReg}, SrcOps: {Src1Reg, Neg});
1975 constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1976 I.eraseFromParent();
1977 return true;
1978}
1979
1980bool AArch64InstructionSelector::selectVaStartAAPCS(
1981 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1982 return false;
1983}
1984
1985bool AArch64InstructionSelector::selectVaStartDarwin(
1986 MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1987 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1988 Register ListReg = I.getOperand(i: 0).getReg();
1989
1990 Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1991
1992 int FrameIdx = FuncInfo->getVarArgsStackIndex();
1993 if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
1994 CC: MF.getFunction().getCallingConv())) {
1995 FrameIdx = FuncInfo->getVarArgsGPRSize() > 0
1996 ? FuncInfo->getVarArgsGPRIndex()
1997 : FuncInfo->getVarArgsStackIndex();
1998 }
1999
2000 auto MIB =
2001 BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
2002 .addDef(ArgsAddrReg)
2003 .addFrameIndex(FrameIdx)
2004 .addImm(0)
2005 .addImm(0);
2006
2007 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2008
2009 MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
2010 .addUse(ArgsAddrReg)
2011 .addUse(ListReg)
2012 .addImm(0)
2013 .addMemOperand(*I.memoperands_begin());
2014
2015 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2016 I.eraseFromParent();
2017 return true;
2018}
2019
2020void AArch64InstructionSelector::materializeLargeCMVal(
2021 MachineInstr &I, const Value *V, unsigned OpFlags) {
2022 MachineBasicBlock &MBB = *I.getParent();
2023 MachineFunction &MF = *MBB.getParent();
2024 MachineRegisterInfo &MRI = MF.getRegInfo();
2025
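  // The emitted sequence is roughly: MOVZ materializes bits [15:0] of the
  // address (MO_G0), then successive MOVKs patch in bits [31:16], [47:32] and
  // [63:48] (MO_G1..MO_G3, with MO_NC on all but the last fragment).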
2026 auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
2027 MovZ->addOperand(MF, I.getOperand(i: 1));
2028 MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
2029 AArch64II::MO_NC);
2030 MovZ->addOperand(MF, MachineOperand::CreateImm(Val: 0));
2031 constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
2032
2033 auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
2034 Register ForceDstReg) {
2035 Register DstReg = ForceDstReg
2036 ? ForceDstReg
2037 : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
2038 auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
2039 if (auto *GV = dyn_cast<GlobalValue>(Val: V)) {
2040 MovI->addOperand(MF, MachineOperand::CreateGA(
2041 GV, Offset: MovZ->getOperand(1).getOffset(), TargetFlags: Flags));
2042 } else {
2043 MovI->addOperand(
2044 MF, MachineOperand::CreateBA(BA: cast<BlockAddress>(Val: V),
2045 Offset: MovZ->getOperand(1).getOffset(), TargetFlags: Flags));
2046 }
2047 MovI->addOperand(MF, MachineOperand::CreateImm(Val: Offset));
2048 constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
2049 return DstReg;
2050 };
2051 Register DstReg = BuildMovK(MovZ.getReg(0),
2052 AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
2053 DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
2054 BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(i: 0).getReg());
2055}
2056
2057bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
2058 MachineBasicBlock &MBB = *I.getParent();
2059 MachineFunction &MF = *MBB.getParent();
2060 MachineRegisterInfo &MRI = MF.getRegInfo();
2061
2062 switch (I.getOpcode()) {
2063 case TargetOpcode::G_STORE: {
2064 bool Changed = contractCrossBankCopyIntoStore(I, MRI);
2065 MachineOperand &SrcOp = I.getOperand(i: 0);
2066 if (MRI.getType(Reg: SrcOp.getReg()).isPointer()) {
2067 // Allow matching with imported patterns for stores of pointers. Unlike
2068 // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
2069 // and constrain.
2070 auto Copy = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: SrcOp);
2071 Register NewSrc = Copy.getReg(Idx: 0);
2072 SrcOp.setReg(NewSrc);
2073 RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
2074 Changed = true;
2075 }
2076 return Changed;
2077 }
2078 case TargetOpcode::G_PTR_ADD:
2079 return convertPtrAddToAdd(I, MRI);
2080 case TargetOpcode::G_LOAD: {
2081 // For scalar loads of pointers, we try to convert the dest type from p0
2082 // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
2083 // conversion, this should be ok because all users should have been
2084 // selected already, so the type doesn't matter for them.
2085 Register DstReg = I.getOperand(i: 0).getReg();
2086 const LLT DstTy = MRI.getType(Reg: DstReg);
2087 if (!DstTy.isPointer())
2088 return false;
2089 MRI.setType(VReg: DstReg, Ty: LLT::scalar(SizeInBits: 64));
2090 return true;
2091 }
2092 case AArch64::G_DUP: {
2093 // Convert the type from p0 to s64 to help selection.
2094 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2095 if (!DstTy.isPointerVector())
2096 return false;
2097 auto NewSrc = MIB.buildCopy(Res: LLT::scalar(SizeInBits: 64), Op: I.getOperand(i: 1).getReg());
2098 MRI.setType(VReg: I.getOperand(i: 0).getReg(),
2099 Ty: DstTy.changeElementType(NewEltTy: LLT::scalar(SizeInBits: 64)));
2100 MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2101 I.getOperand(i: 1).setReg(NewSrc.getReg(Idx: 0));
2102 return true;
2103 }
2104 case TargetOpcode::G_UITOFP:
2105 case TargetOpcode::G_SITOFP: {
2106    // If both source and destination regbanks are FPR, then convert the opcode
2107    // to G_SITOF/G_UITOF so that the importer can select it to an fpr variant.
2108 // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2109 // copy.
2110 Register SrcReg = I.getOperand(i: 1).getReg();
2111 LLT SrcTy = MRI.getType(Reg: SrcReg);
2112 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2113 if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2114 return false;
2115
2116 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2117 if (I.getOpcode() == TargetOpcode::G_SITOFP)
2118 I.setDesc(TII.get(AArch64::G_SITOF));
2119 else
2120 I.setDesc(TII.get(AArch64::G_UITOF));
2121 return true;
2122 }
2123 return false;
2124 }
2125 default:
2126 return false;
2127 }
2128}
2129
2130/// This lowering tries to look for G_PTR_ADD instructions and then converts
2131/// them to a standard G_ADD with a COPY on the source.
2132///
2133/// The motivation behind this is to expose the add semantics to the imported
2134/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2135/// because the selector works bottom up, uses before defs. By the time we
2136/// end up trying to select a G_PTR_ADD, we should have already attempted to
2137/// fold this into addressing modes and were therefore unsuccessful.
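/// Illustrative example (hypothetical virtual registers):
///   %dst:gpr(p0) = G_PTR_ADD %base(p0), %off(s64)
/// becomes
///   %intbase:gpr(s64) = G_PTRTOINT %base(p0)
///   %dst:gpr(s64) = G_ADD %intbase, %off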
2138bool AArch64InstructionSelector::convertPtrAddToAdd(
2139 MachineInstr &I, MachineRegisterInfo &MRI) {
2140 assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2141 Register DstReg = I.getOperand(i: 0).getReg();
2142 Register AddOp1Reg = I.getOperand(i: 1).getReg();
2143 const LLT PtrTy = MRI.getType(Reg: DstReg);
2144 if (PtrTy.getAddressSpace() != 0)
2145 return false;
2146
2147 const LLT CastPtrTy =
2148 PtrTy.isVector() ? LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) : LLT::scalar(SizeInBits: 64);
2149 auto PtrToInt = MIB.buildPtrToInt(Dst: CastPtrTy, Src: AddOp1Reg);
2150 // Set regbanks on the registers.
2151 if (PtrTy.isVector())
2152 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2153 else
2154 MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2155
2156 // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2157 // %dst(intty) = G_ADD %intbase, off
2158 I.setDesc(TII.get(TargetOpcode::G_ADD));
2159 MRI.setType(VReg: DstReg, Ty: CastPtrTy);
2160 I.getOperand(i: 1).setReg(PtrToInt.getReg(Idx: 0));
2161 if (!select(I&: *PtrToInt)) {
2162 LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2163 return false;
2164 }
2165
2166 // Also take the opportunity here to try to do some optimization.
2167 // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2168 Register NegatedReg;
2169 if (!mi_match(R: I.getOperand(i: 2).getReg(), MRI, P: m_Neg(Src: m_Reg(R&: NegatedReg))))
2170 return true;
2171 I.getOperand(i: 2).setReg(NegatedReg);
2172 I.setDesc(TII.get(TargetOpcode::G_SUB));
2173 return true;
2174}
2175
2176bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2177 MachineRegisterInfo &MRI) {
2178 // We try to match the immediate variant of LSL, which is actually an alias
2179 // for a special case of UBFM. Otherwise, we fall back to the imported
2180 // selector which will match the register variant.
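  // For example (illustrative), a 64-bit left shift by 3 renders as
  // UBFMXri %dst, %src, 61, 60, which is the "lsl xd, xn, #3" alias of
  // "ubfm xd, xn, #61, #60".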
2181 assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2182 const auto &MO = I.getOperand(i: 2);
2183 auto VRegAndVal = getIConstantVRegVal(VReg: MO.getReg(), MRI);
2184 if (!VRegAndVal)
2185 return false;
2186
2187 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2188 if (DstTy.isVector())
2189 return false;
2190 bool Is64Bit = DstTy.getSizeInBits() == 64;
2191 auto Imm1Fn = Is64Bit ? selectShiftA_64(Root: MO) : selectShiftA_32(Root: MO);
2192 auto Imm2Fn = Is64Bit ? selectShiftB_64(Root: MO) : selectShiftB_32(Root: MO);
2193
2194 if (!Imm1Fn || !Imm2Fn)
2195 return false;
2196
2197 auto NewI =
2198 MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2199 {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2200
2201 for (auto &RenderFn : *Imm1Fn)
2202 RenderFn(NewI);
2203 for (auto &RenderFn : *Imm2Fn)
2204 RenderFn(NewI);
2205
2206 I.eraseFromParent();
2207 return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2208}
2209
2210bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2211 MachineInstr &I, MachineRegisterInfo &MRI) {
2212 assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2213 // If we're storing a scalar, it doesn't matter what register bank that
2214 // scalar is on. All that matters is the size.
2215 //
2216 // So, if we see something like this (with a 32-bit scalar as an example):
2217 //
2218 // %x:gpr(s32) = ... something ...
2219 // %y:fpr(s32) = COPY %x:gpr(s32)
2220 // G_STORE %y:fpr(s32)
2221 //
2222 // We can fix this up into something like this:
2223 //
2224 // G_STORE %x:gpr(s32)
2225 //
2226 // And then continue the selection process normally.
2227 Register DefDstReg = getSrcRegIgnoringCopies(Reg: I.getOperand(i: 0).getReg(), MRI);
2228 if (!DefDstReg.isValid())
2229 return false;
2230 LLT DefDstTy = MRI.getType(Reg: DefDstReg);
2231 Register StoreSrcReg = I.getOperand(i: 0).getReg();
2232 LLT StoreSrcTy = MRI.getType(Reg: StoreSrcReg);
2233
2234 // If we get something strange like a physical register, then we shouldn't
2235 // go any further.
2236 if (!DefDstTy.isValid())
2237 return false;
2238
2239 // Are the source and dst types the same size?
2240 if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2241 return false;
2242
2243 if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2244 RBI.getRegBank(DefDstReg, MRI, TRI))
2245 return false;
2246
2247 // We have a cross-bank copy, which is entering a store. Let's fold it.
2248 I.getOperand(i: 0).setReg(DefDstReg);
2249 return true;
2250}
2251
2252bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2253 assert(I.getParent() && "Instruction should be in a basic block!");
2254 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2255
2256 MachineBasicBlock &MBB = *I.getParent();
2257 MachineFunction &MF = *MBB.getParent();
2258 MachineRegisterInfo &MRI = MF.getRegInfo();
2259
2260 switch (I.getOpcode()) {
2261 case AArch64::G_DUP: {
2262 // Before selecting a DUP instruction, check if it is better selected as a
2263 // MOV or load from a constant pool.
2264 Register Src = I.getOperand(i: 1).getReg();
2265 auto ValAndVReg = getAnyConstantVRegValWithLookThrough(VReg: Src, MRI);
2266 if (!ValAndVReg)
2267 return false;
2268 LLVMContext &Ctx = MF.getFunction().getContext();
2269 Register Dst = I.getOperand(i: 0).getReg();
2270 auto *CV = ConstantDataVector::getSplat(
2271 NumElts: MRI.getType(Reg: Dst).getNumElements(),
2272 Elt: ConstantInt::get(Ty: Type::getIntNTy(C&: Ctx, N: MRI.getType(Reg: Src).getSizeInBits()),
2273 V: ValAndVReg->Value));
2274 if (!emitConstantVector(Dst, CV, MIRBuilder&: MIB, MRI))
2275 return false;
2276 I.eraseFromParent();
2277 return true;
2278 }
2279 case TargetOpcode::G_SEXT:
2280 // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2281 // over a normal extend.
2282 if (selectUSMovFromExtend(I, MRI))
2283 return true;
2284 return false;
2285 case TargetOpcode::G_BR:
2286 return false;
2287 case TargetOpcode::G_SHL:
2288 return earlySelectSHL(I, MRI);
2289 case TargetOpcode::G_CONSTANT: {
2290 bool IsZero = false;
2291 if (I.getOperand(i: 1).isCImm())
2292 IsZero = I.getOperand(i: 1).getCImm()->isZero();
2293 else if (I.getOperand(i: 1).isImm())
2294 IsZero = I.getOperand(i: 1).getImm() == 0;
2295
2296 if (!IsZero)
2297 return false;
2298
2299 Register DefReg = I.getOperand(i: 0).getReg();
2300 LLT Ty = MRI.getType(Reg: DefReg);
2301 if (Ty.getSizeInBits() == 64) {
2302 I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2303 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2304 } else if (Ty.getSizeInBits() == 32) {
2305 I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2306 RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2307 } else
2308 return false;
2309
2310 I.setDesc(TII.get(TargetOpcode::COPY));
2311 return true;
2312 }
2313
2314 case TargetOpcode::G_ADD: {
2315 // Check if this is being fed by a G_ICMP on either side.
2316 //
2317 // (cmp pred, x, y) + z
2318 //
2319 // In the above case, when the cmp is true, we increment z by 1. So, we can
2320 // fold the add into the cset for the cmp by using cinc.
2321 //
2322 // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
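    // For example (illustrative):
    //   %c = G_ICMP intpred(eq), %x, %y
    //   %add = G_ADD %z, %c
    // becomes a compare followed by CSINC %add, %z, %z, ne: with the inverted
    // condition, CSINC yields %z + 1 exactly when the original predicate
    // holds, and %z otherwise.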
2323 Register AddDst = I.getOperand(i: 0).getReg();
2324 Register AddLHS = I.getOperand(i: 1).getReg();
2325 Register AddRHS = I.getOperand(i: 2).getReg();
2326 // Only handle scalars.
2327 LLT Ty = MRI.getType(Reg: AddLHS);
2328 if (Ty.isVector())
2329 return false;
2330 // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2331 // bits.
2332 unsigned Size = Ty.getSizeInBits();
2333 if (Size != 32 && Size != 64)
2334 return false;
2335 auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2336 if (!MRI.hasOneNonDBGUse(RegNo: Reg))
2337 return nullptr;
2338 // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2339 // compare.
2340 if (Size == 32)
2341 return getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg, MRI);
2342 // We model scalar compares using 32-bit destinations right now.
2343 // If it's a 64-bit compare, it'll have 64-bit sources.
2344 Register ZExt;
2345 if (!mi_match(R: Reg, MRI,
2346 P: m_OneNonDBGUse(SP: m_GZExt(Src: m_OneNonDBGUse(SP: m_Reg(R&: ZExt))))))
2347 return nullptr;
2348 auto *Cmp = getOpcodeDef(Opcode: TargetOpcode::G_ICMP, Reg: ZExt, MRI);
2349 if (!Cmp ||
2350 MRI.getType(Reg: Cmp->getOperand(i: 2).getReg()).getSizeInBits() != 64)
2351 return nullptr;
2352 return Cmp;
2353 };
2354 // Try to match
2355 // z + (cmp pred, x, y)
2356 MachineInstr *Cmp = MatchCmp(AddRHS);
2357 if (!Cmp) {
2358 // (cmp pred, x, y) + z
2359 std::swap(a&: AddLHS, b&: AddRHS);
2360 Cmp = MatchCmp(AddRHS);
2361 if (!Cmp)
2362 return false;
2363 }
2364 auto &PredOp = Cmp->getOperand(i: 1);
2365 auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2366 const AArch64CC::CondCode InvCC =
2367 changeICMPPredToAArch64CC(P: CmpInst::getInversePredicate(pred: Pred));
2368 MIB.setInstrAndDebugLoc(I);
2369 emitIntegerCompare(/*LHS=*/Cmp->getOperand(i: 2),
2370 /*RHS=*/Cmp->getOperand(i: 3), Predicate&: PredOp, MIRBuilder&: MIB);
2371 emitCSINC(/*Dst=*/AddDst, /*Src =*/Src1: AddLHS, /*Src2=*/AddLHS, Pred: InvCC, MIRBuilder&: MIB);
2372 I.eraseFromParent();
2373 return true;
2374 }
2375 case TargetOpcode::G_OR: {
2376 // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2377 // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2378 // shifting and masking that we can replace with a BFI (encoded as a BFM).
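    // For example (illustrative), with a 32-bit type and ShiftImm = 16:
    //   %dst = G_OR (G_SHL %hi, 16), (G_AND %lo, 0xffff)
    // is selected as BFMWri %lo, %hi, 16, 15 (the BFI alias with lsb 16 and
    // width 16), inserting the low 16 bits of %hi above the masked %lo.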
2379 Register Dst = I.getOperand(i: 0).getReg();
2380 LLT Ty = MRI.getType(Reg: Dst);
2381
2382 if (!Ty.isScalar())
2383 return false;
2384
2385 unsigned Size = Ty.getSizeInBits();
2386 if (Size != 32 && Size != 64)
2387 return false;
2388
2389 Register ShiftSrc;
2390 int64_t ShiftImm;
2391 Register MaskSrc;
2392 int64_t MaskImm;
2393 if (!mi_match(
2394 R: Dst, MRI,
2395 P: m_GOr(L: m_OneNonDBGUse(SP: m_GShl(L: m_Reg(R&: ShiftSrc), R: m_ICst(Cst&: ShiftImm))),
2396 R: m_OneNonDBGUse(SP: m_GAnd(L: m_Reg(R&: MaskSrc), R: m_ICst(Cst&: MaskImm))))))
2397 return false;
2398
2399 if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2400 return false;
2401
2402 int64_t Immr = Size - ShiftImm;
2403 int64_t Imms = Size - ShiftImm - 1;
2404 unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2405 emitInstr(Opcode: Opc, DstOps: {Dst}, SrcOps: {MaskSrc, ShiftSrc, Immr, Imms}, MIRBuilder&: MIB);
2406 I.eraseFromParent();
2407 return true;
2408 }
2409 case TargetOpcode::G_FENCE: {
2410 if (I.getOperand(i: 1).getImm() == 0)
2411 BuildMI(MBB, I, MIMetadata(I), TII.get(TargetOpcode::MEMBARRIER));
2412 else
2413 BuildMI(MBB, I, MIMetadata(I), TII.get(AArch64::DMB))
2414 .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2415 I.eraseFromParent();
2416 return true;
2417 }
2418 default:
2419 return false;
2420 }
2421}
2422
2423bool AArch64InstructionSelector::select(MachineInstr &I) {
2424 assert(I.getParent() && "Instruction should be in a basic block!");
2425 assert(I.getParent()->getParent() && "Instruction should be in a function!");
2426
2427 MachineBasicBlock &MBB = *I.getParent();
2428 MachineFunction &MF = *MBB.getParent();
2429 MachineRegisterInfo &MRI = MF.getRegInfo();
2430
2431 const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2432 if (Subtarget->requiresStrictAlign()) {
2433 // We don't support this feature yet.
2434 LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2435 return false;
2436 }
2437
2438 MIB.setInstrAndDebugLoc(I);
2439
2440 unsigned Opcode = I.getOpcode();
2441 // G_PHI requires same handling as PHI
2442 if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2443 // Certain non-generic instructions also need some special handling.
2444
2445 if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
2446 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2447
2448 if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2449 const Register DefReg = I.getOperand(i: 0).getReg();
2450 const LLT DefTy = MRI.getType(Reg: DefReg);
2451
2452 const RegClassOrRegBank &RegClassOrBank =
2453 MRI.getRegClassOrRegBank(Reg: DefReg);
2454
2455 const TargetRegisterClass *DefRC
2456 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2457 if (!DefRC) {
2458 if (!DefTy.isValid()) {
2459 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2460 return false;
2461 }
2462 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2463 DefRC = getRegClassForTypeOnBank(Ty: DefTy, RB);
2464 if (!DefRC) {
2465 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2466 return false;
2467 }
2468 }
2469
2470 I.setDesc(TII.get(TargetOpcode::PHI));
2471
2472 return RBI.constrainGenericRegister(Reg: DefReg, RC: *DefRC, MRI);
2473 }
2474
2475 if (I.isCopy())
2476 return selectCopy(I, TII, MRI, TRI, RBI);
2477
2478 if (I.isDebugInstr())
2479 return selectDebugInstr(I, MRI, RBI);
2480
2481 return true;
2482 }
2483
2484
2485 if (I.getNumOperands() != I.getNumExplicitOperands()) {
2486 LLVM_DEBUG(
2487 dbgs() << "Generic instruction has unexpected implicit operands\n");
2488 return false;
2489 }
2490
2491 // Try to do some lowering before we start instruction selecting. These
2492 // lowerings are purely transformations on the input G_MIR and so selection
2493 // must continue after any modification of the instruction.
2494 if (preISelLower(I)) {
2495 Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2496 }
2497
2498  // There may be patterns that the importer can't handle optimally but still
2499  // selects into a suboptimal sequence, so our custom C++ selection code never
2500  // gets a chance to improve on them. Therefore, we make an early selection
2501  // attempt here to give priority to certain selection routines over the
2502  // imported ones.
2503 if (earlySelect(I))
2504 return true;
2505
2506 if (selectImpl(I, CoverageInfo&: *CoverageInfo))
2507 return true;
2508
2509 LLT Ty =
2510 I.getOperand(i: 0).isReg() ? MRI.getType(Reg: I.getOperand(i: 0).getReg()) : LLT{};
2511
2512 switch (Opcode) {
2513 case TargetOpcode::G_SBFX:
2514 case TargetOpcode::G_UBFX: {
2515 static const unsigned OpcTable[2][2] = {
2516 {AArch64::UBFMWri, AArch64::UBFMXri},
2517 {AArch64::SBFMWri, AArch64::SBFMXri}};
2518 bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2519 unsigned Size = Ty.getSizeInBits();
2520 unsigned Opc = OpcTable[IsSigned][Size == 64];
2521 auto Cst1 =
2522 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 2).getReg(), MRI);
2523 assert(Cst1 && "Should have gotten a constant for src 1?");
2524 auto Cst2 =
2525 getIConstantVRegValWithLookThrough(VReg: I.getOperand(i: 3).getReg(), MRI);
2526 assert(Cst2 && "Should have gotten a constant for src 2?");
2527 auto LSB = Cst1->Value.getZExtValue();
2528 auto Width = Cst2->Value.getZExtValue();
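    // UBFM/SBFM take first/last bit positions rather than (lsb, width). For
    // example (illustrative), a 32-bit G_UBFX with LSB 8 and Width 4 becomes
    // UBFMWri %src, 8, 11, i.e. the "ubfx #8, #4" alias.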
2529 auto BitfieldInst =
2530 MIB.buildInstr(Opc, DstOps: {I.getOperand(i: 0)}, SrcOps: {I.getOperand(i: 1)})
2531 .addImm(Val: LSB)
2532 .addImm(Val: LSB + Width - 1);
2533 I.eraseFromParent();
2534 return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2535 }
2536 case TargetOpcode::G_BRCOND:
2537 return selectCompareBranch(I, MF, MRI);
2538
2539 case TargetOpcode::G_BRINDIRECT: {
2540 I.setDesc(TII.get(AArch64::BR));
2541 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2542 }
2543
2544 case TargetOpcode::G_BRJT:
2545 return selectBrJT(I, MRI);
2546
2547 case AArch64::G_ADD_LOW: {
2548    // This op may have been separated from its ADRP companion by the localizer
2549 // or some other code motion pass. Given that many CPUs will try to
2550 // macro fuse these operations anyway, select this into a MOVaddr pseudo
2551 // which will later be expanded into an ADRP+ADD pair after scheduling.
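    // After expansion this becomes roughly (assembly sketch):
    //   adrp xN, sym
    //   add  xN, xN, :lo12:sym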
2552 MachineInstr *BaseMI = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
2553 if (BaseMI->getOpcode() != AArch64::ADRP) {
2554 I.setDesc(TII.get(AArch64::ADDXri));
2555 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2556 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2557 }
2558 assert(TM.getCodeModel() == CodeModel::Small &&
2559 "Expected small code model");
2560 auto Op1 = BaseMI->getOperand(i: 1);
2561 auto Op2 = I.getOperand(i: 2);
2562 auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2563 .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2564 Op1.getTargetFlags())
2565 .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2566 Op2.getTargetFlags());
2567 I.eraseFromParent();
2568 return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2569 }
2570
2571 case TargetOpcode::G_FCONSTANT:
2572 case TargetOpcode::G_CONSTANT: {
2573 const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2574
2575 const LLT s8 = LLT::scalar(SizeInBits: 8);
2576 const LLT s16 = LLT::scalar(SizeInBits: 16);
2577 const LLT s32 = LLT::scalar(SizeInBits: 32);
2578 const LLT s64 = LLT::scalar(SizeInBits: 64);
2579 const LLT s128 = LLT::scalar(SizeInBits: 128);
2580 const LLT p0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
2581
2582 const Register DefReg = I.getOperand(i: 0).getReg();
2583 const LLT DefTy = MRI.getType(Reg: DefReg);
2584 const unsigned DefSize = DefTy.getSizeInBits();
2585 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2586
2587 // FIXME: Redundant check, but even less readable when factored out.
2588 if (isFP) {
2589 if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2590 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2591 << " constant, expected: " << s16 << " or " << s32
2592 << " or " << s64 << " or " << s128 << '\n');
2593 return false;
2594 }
2595
2596 if (RB.getID() != AArch64::FPRRegBankID) {
2597 LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2598 << " constant on bank: " << RB
2599 << ", expected: FPR\n");
2600 return false;
2601 }
2602
2603 // The case when we have 0.0 is covered by tablegen. Reject it here so we
2604 // can be sure tablegen works correctly and isn't rescued by this code.
2605 // 0.0 is not covered by tablegen for FP128. So we will handle this
2606 // scenario in the code here.
2607 if (DefSize != 128 && I.getOperand(i: 1).getFPImm()->isExactlyValue(V: 0.0))
2608 return false;
2609 } else {
2610 // s32 and s64 are covered by tablegen.
2611 if (Ty != p0 && Ty != s8 && Ty != s16) {
2612 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2613 << " constant, expected: " << s32 << ", " << s64
2614 << ", or " << p0 << '\n');
2615 return false;
2616 }
2617
2618 if (RB.getID() != AArch64::GPRRegBankID) {
2619 LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2620 << " constant on bank: " << RB
2621 << ", expected: GPR\n");
2622 return false;
2623 }
2624 }
2625
2626 if (isFP) {
2627 const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(Ty: DefTy, RB);
2628      // For 16b and 128b values, and illegal 32b/64b fpimms, use a constant pool load.
2629 switch (DefSize) {
2630 default:
2631 llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2632 case 32:
2633 case 64: {
2634 bool OptForSize = shouldOptForSize(MF: &MF);
2635 const auto &TLI = MF.getSubtarget().getTargetLowering();
2636 // If TLI says that this fpimm is illegal, then we'll expand to a
2637 // constant pool load.
2638 if (TLI->isFPImmLegal(I.getOperand(i: 1).getFPImm()->getValueAPF(),
2639 EVT::getFloatingPointVT(BitWidth: DefSize), ForCodeSize: OptForSize))
2640 break;
2641 [[fallthrough]];
2642 }
2643 case 16:
2644 case 128: {
2645 auto *FPImm = I.getOperand(i: 1).getFPImm();
2646 auto *LoadMI = emitLoadFromConstantPool(CPVal: FPImm, MIRBuilder&: MIB);
2647 if (!LoadMI) {
2648 LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2649 return false;
2650 }
2651 MIB.buildCopy(Res: {DefReg}, Op: {LoadMI->getOperand(i: 0).getReg()});
2652 I.eraseFromParent();
2653 return RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI);
2654 }
2655 }
2656
2657 assert((DefSize == 32 || DefSize == 64) && "Unexpected const def size");
2658      // Materialize into a GPR with a normal mov, then copy to the FPR (possibly an FMOV).
2659 const Register DefGPRReg = MRI.createVirtualRegister(
2660 DefSize == 32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
2661 MachineOperand &RegOp = I.getOperand(i: 0);
2662 RegOp.setReg(DefGPRReg);
2663 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2664 MIB.buildCopy(Res: {DefReg}, Op: {DefGPRReg});
2665
2666 if (!RBI.constrainGenericRegister(Reg: DefReg, RC: FPRRC, MRI)) {
2667 LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2668 return false;
2669 }
2670
2671 MachineOperand &ImmOp = I.getOperand(i: 1);
2672 // FIXME: Is going through int64_t always correct?
2673 ImmOp.ChangeToImmediate(
2674 ImmVal: ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2675 } else if (I.getOperand(i: 1).isCImm()) {
2676 uint64_t Val = I.getOperand(i: 1).getCImm()->getZExtValue();
2677 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2678 } else if (I.getOperand(i: 1).isImm()) {
2679 uint64_t Val = I.getOperand(i: 1).getImm();
2680 I.getOperand(i: 1).ChangeToImmediate(ImmVal: Val);
2681 }
2682
2683 const unsigned MovOpc =
2684 DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2685 I.setDesc(TII.get(MovOpc));
2686 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2687 return true;
2688 }
2689 case TargetOpcode::G_EXTRACT: {
2690 Register DstReg = I.getOperand(i: 0).getReg();
2691 Register SrcReg = I.getOperand(i: 1).getReg();
2692 LLT SrcTy = MRI.getType(Reg: SrcReg);
2693 LLT DstTy = MRI.getType(Reg: DstReg);
2694 (void)DstTy;
2695 unsigned SrcSize = SrcTy.getSizeInBits();
2696
2697 if (SrcTy.getSizeInBits() > 64) {
2698 // This should be an extract of an s128, which is like a vector extract.
2699 if (SrcTy.getSizeInBits() != 128)
2700 return false;
2701 // Only support extracting 64 bits from an s128 at the moment.
2702 if (DstTy.getSizeInBits() != 64)
2703 return false;
2704
2705 unsigned Offset = I.getOperand(i: 2).getImm();
2706 if (Offset % 64 != 0)
2707 return false;
2708
2709 // Check we have the right regbank always.
2710 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2711 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2712 assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2713
2714 if (SrcRB.getID() == AArch64::GPRRegBankID) {
2715 auto NewI =
2716 MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2717 .addUse(SrcReg, 0,
2718 Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2719 constrainOperandRegClass(MF, TRI, MRI, TII, RBI, *NewI,
2720 AArch64::GPR64RegClass, NewI->getOperand(0));
2721 I.eraseFromParent();
2722 return true;
2723 }
2724
2725 // Emit the same code as a vector extract.
2726 // Offset must be a multiple of 64.
2727 unsigned LaneIdx = Offset / 64;
2728 MachineInstr *Extract = emitExtractVectorElt(
2729 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: 64), VecReg: SrcReg, LaneIdx, MIRBuilder&: MIB);
2730 if (!Extract)
2731 return false;
2732 I.eraseFromParent();
2733 return true;
2734 }
2735
2736 I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2737 MachineInstrBuilder(MF, I).addImm(Val: I.getOperand(i: 2).getImm() +
2738 Ty.getSizeInBits() - 1);
2739
2740 if (SrcSize < 64) {
2741 assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2742 "unexpected G_EXTRACT types");
2743 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2744 }
2745
2746 DstReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2747 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: I.getIterator()));
2748 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2749 .addReg(DstReg, 0, AArch64::sub_32);
2750 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2751 AArch64::GPR32RegClass, MRI);
2752 I.getOperand(i: 0).setReg(DstReg);
2753
2754 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2755 }
2756
2757 case TargetOpcode::G_INSERT: {
2758 LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 2).getReg());
2759 LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
2760 unsigned DstSize = DstTy.getSizeInBits();
2761 // Larger inserts are vectors, same-size ones should be something else by
2762 // now (split up or turned into COPYs).
2763 if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2764 return false;
2765
2766 I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2767 unsigned LSB = I.getOperand(i: 3).getImm();
2768 unsigned Width = MRI.getType(Reg: I.getOperand(i: 2).getReg()).getSizeInBits();
2769 I.getOperand(i: 3).setImm((DstSize - LSB) % DstSize);
2770 MachineInstrBuilder(MF, I).addImm(Val: Width - 1);
2771
2772 if (DstSize < 64) {
2773 assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2774 "unexpected G_INSERT types");
2775 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2776 }
2777
2778 Register SrcReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
2779 BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2780 TII.get(AArch64::SUBREG_TO_REG))
2781 .addDef(SrcReg)
2782 .addImm(0)
2783 .addUse(I.getOperand(2).getReg())
2784 .addImm(AArch64::sub_32);
2785 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2786 AArch64::GPR32RegClass, MRI);
2787 I.getOperand(i: 2).setReg(SrcReg);
2788
2789 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2790 }
2791 case TargetOpcode::G_FRAME_INDEX: {
2792 // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2793 if (Ty != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2794 LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2795 << ", expected: " << LLT::pointer(0, 64) << '\n');
2796 return false;
2797 }
2798 I.setDesc(TII.get(AArch64::ADDXri));
2799
2800 // MOs for a #0 shifted immediate.
2801 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2802 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2803
2804 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2805 }
2806
2807 case TargetOpcode::G_GLOBAL_VALUE: {
2808 const GlobalValue *GV = nullptr;
2809 unsigned OpFlags;
2810 if (I.getOperand(i: 1).isSymbol()) {
2811 OpFlags = I.getOperand(i: 1).getTargetFlags();
2812 // Currently only used by "RtLibUseGOT".
2813 assert(OpFlags == AArch64II::MO_GOT);
2814 } else {
2815 GV = I.getOperand(i: 1).getGlobal();
2816 if (GV->isThreadLocal())
2817 return selectTLSGlobalValue(I, MRI);
2818 OpFlags = STI.ClassifyGlobalReference(GV, TM);
2819 }
2820
2821 if (OpFlags & AArch64II::MO_GOT) {
2822 I.setDesc(TII.get(AArch64::LOADgot));
2823 I.getOperand(i: 1).setTargetFlags(OpFlags);
2824 } else if (TM.getCodeModel() == CodeModel::Large &&
2825 !TM.isPositionIndependent()) {
2826 // Materialize the global using movz/movk instructions.
2827 materializeLargeCMVal(I, V: GV, OpFlags);
2828 I.eraseFromParent();
2829 return true;
2830 } else if (TM.getCodeModel() == CodeModel::Tiny) {
2831 I.setDesc(TII.get(AArch64::ADR));
2832 I.getOperand(i: 1).setTargetFlags(OpFlags);
2833 } else {
2834 I.setDesc(TII.get(AArch64::MOVaddr));
2835 I.getOperand(i: 1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2836 MachineInstrBuilder MIB(MF, I);
2837 MIB.addGlobalAddress(GV, Offset: I.getOperand(i: 1).getOffset(),
2838 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2839 }
2840 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2841 }
2842
2843 case TargetOpcode::G_ZEXTLOAD:
2844 case TargetOpcode::G_LOAD:
2845 case TargetOpcode::G_STORE: {
2846 GLoadStore &LdSt = cast<GLoadStore>(Val&: I);
2847 bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2848 LLT PtrTy = MRI.getType(Reg: LdSt.getPointerReg());
2849
2850 if (PtrTy != LLT::pointer(AddressSpace: 0, SizeInBits: 64)) {
2851 LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2852 << ", expected: " << LLT::pointer(0, 64) << '\n');
2853 return false;
2854 }
2855
2856 uint64_t MemSizeInBytes = LdSt.getMemSize().getValue();
2857 unsigned MemSizeInBits = LdSt.getMemSizeInBits().getValue();
2858 AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2859
2860 // Need special instructions for atomics that affect ordering.
2861 if (Order != AtomicOrdering::NotAtomic &&
2862 Order != AtomicOrdering::Unordered &&
2863 Order != AtomicOrdering::Monotonic) {
2864 assert(!isa<GZExtLoad>(LdSt));
2865 if (MemSizeInBytes > 64)
2866 return false;
2867
2868 if (isa<GLoad>(Val: LdSt)) {
2869 static constexpr unsigned LDAPROpcodes[] = {
2870 AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2871 static constexpr unsigned LDAROpcodes[] = {
2872 AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2873 ArrayRef<unsigned> Opcodes =
2874 STI.hasRCPC() && Order != AtomicOrdering::SequentiallyConsistent
2875 ? LDAPROpcodes
2876 : LDAROpcodes;
2877 I.setDesc(TII.get(Opcodes[Log2_32(Value: MemSizeInBytes)]));
2878 } else {
2879 static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2880 AArch64::STLRW, AArch64::STLRX};
2881 Register ValReg = LdSt.getReg(Idx: 0);
2882 if (MRI.getType(Reg: ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2883 // Emit a subreg copy of 32 bits.
2884 Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2885 MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2886 .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2887 I.getOperand(i: 0).setReg(NewVal);
2888 }
2889 I.setDesc(TII.get(Opcodes[Log2_32(Value: MemSizeInBytes)]));
2890 }
2891 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2892 return true;
2893 }
2894
2895#ifndef NDEBUG
2896 const Register PtrReg = LdSt.getPointerReg();
2897 const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2898 // Check that the pointer register is valid.
2899 assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2900 "Load/Store pointer operand isn't a GPR");
2901 assert(MRI.getType(PtrReg).isPointer() &&
2902 "Load/Store pointer operand isn't a pointer");
2903#endif
2904
2905 const Register ValReg = LdSt.getReg(Idx: 0);
2906 const LLT ValTy = MRI.getType(Reg: ValReg);
2907 const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2908
2909 // The code below doesn't support truncating stores, so we need to split it
2910 // again.
2911 if (isa<GStore>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2912 unsigned SubReg;
2913 LLT MemTy = LdSt.getMMO().getMemoryType();
2914 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
2915 if (!getSubRegForClass(RC, TRI, SubReg))
2916 return false;
2917
2918 // Generate a subreg copy.
2919 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {MemTy}, SrcOps: {})
2920 .addReg(RegNo: ValReg, flags: 0, SubReg)
2921 .getReg(Idx: 0);
2922 RBI.constrainGenericRegister(Reg: Copy, RC: *RC, MRI);
2923 LdSt.getOperand(i: 0).setReg(Copy);
2924 } else if (isa<GLoad>(Val: LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2925 // If this is an any-extending load from the FPR bank, split it into a regular
2926 // load + extend.
2927 if (RB.getID() == AArch64::FPRRegBankID) {
2928 unsigned SubReg;
2929 LLT MemTy = LdSt.getMMO().getMemoryType();
2930 auto *RC = getRegClassForTypeOnBank(Ty: MemTy, RB);
2931 if (!getSubRegForClass(RC, TRI, SubReg))
2932 return false;
2933 Register OldDst = LdSt.getReg(Idx: 0);
2934 Register NewDst =
2935 MRI.createGenericVirtualRegister(Ty: LdSt.getMMO().getMemoryType());
2936 LdSt.getOperand(i: 0).setReg(NewDst);
2937 MRI.setRegBank(Reg: NewDst, RegBank: RB);
2938 // Generate a SUBREG_TO_REG to extend it.
2939 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LdSt.getIterator()));
2940 MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2941 .addImm(0)
2942 .addUse(NewDst)
2943 .addImm(SubReg);
2944 auto SubRegRC = getRegClassForTypeOnBank(Ty: MRI.getType(Reg: OldDst), RB);
2945 RBI.constrainGenericRegister(Reg: OldDst, RC: *SubRegRC, MRI);
2946 MIB.setInstr(LdSt);
2947 }
2948 }
2949
2950 // Helper lambda for partially selecting I. Either returns the original
2951 // instruction with an updated opcode, or a new instruction.
2952 auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2953 bool IsStore = isa<GStore>(Val: I);
2954 const unsigned NewOpc =
2955 selectLoadStoreUIOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize: MemSizeInBits);
2956 if (NewOpc == I.getOpcode())
2957 return nullptr;
2958 // Check if we can fold anything into the addressing mode.
2959 auto AddrModeFns =
2960 selectAddrModeIndexed(Root&: I.getOperand(i: 1), Size: MemSizeInBytes);
2961 if (!AddrModeFns) {
2962 // Can't fold anything. Use the original instruction.
2963 I.setDesc(TII.get(NewOpc));
2964 I.addOperand(Op: MachineOperand::CreateImm(Val: 0));
2965 return &I;
2966 }
2967
2968 // Folded something. Create a new instruction and return it.
2969 auto NewInst = MIB.buildInstr(Opc: NewOpc, DstOps: {}, SrcOps: {}, Flags: I.getFlags());
2970 Register CurValReg = I.getOperand(i: 0).getReg();
2971 IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2972 NewInst.cloneMemRefs(I);
2973 for (auto &Fn : *AddrModeFns)
2974 Fn(NewInst);
2975 I.eraseFromParent();
2976 return &*NewInst;
2977 };
2978
2979 MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2980 if (!LoadStore)
2981 return false;
2982
2983 // If we're storing a 0, use WZR/XZR.
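    // Using the zero register avoids materializing a constant 0 just to store
    // it.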
2984 if (Opcode == TargetOpcode::G_STORE) {
2985 auto CVal = getIConstantVRegValWithLookThrough(
2986 VReg: LoadStore->getOperand(i: 0).getReg(), MRI);
2987 if (CVal && CVal->Value == 0) {
2988 switch (LoadStore->getOpcode()) {
2989 case AArch64::STRWui:
2990 case AArch64::STRHHui:
2991 case AArch64::STRBBui:
2992 LoadStore->getOperand(0).setReg(AArch64::WZR);
2993 break;
2994 case AArch64::STRXui:
2995 LoadStore->getOperand(0).setReg(AArch64::XZR);
2996 break;
2997 }
2998 }
2999 }
3000
3001 if (IsZExtLoad || (Opcode == TargetOpcode::G_LOAD &&
3002 ValTy == LLT::scalar(SizeInBits: 64) && MemSizeInBits == 32)) {
3003 // The any/zextload from a smaller type to i32 should be handled by the
3004 // importer.
3005 if (MRI.getType(Reg: LoadStore->getOperand(i: 0).getReg()).getSizeInBits() != 64)
3006 return false;
3007 // If we have an extending load then change the load's type to be a
3008 // narrower reg and zero_extend with SUBREG_TO_REG.
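    // Writes to a W register implicitly zero the upper 32 bits of the X
    // register, so a SUBREG_TO_REG is enough to model the zero-extension to
    // 64 bits.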
3009 Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3010 Register DstReg = LoadStore->getOperand(i: 0).getReg();
3011 LoadStore->getOperand(i: 0).setReg(LdReg);
3012
3013 MIB.setInsertPt(MBB&: MIB.getMBB(), II: std::next(x: LoadStore->getIterator()));
3014 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
3015 .addImm(0)
3016 .addUse(LdReg)
3017 .addImm(AArch64::sub_32);
3018 constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3019 return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
3020 MRI);
3021 }
3022 return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
3023 }
3024
3025 case TargetOpcode::G_INDEXED_ZEXTLOAD:
3026 case TargetOpcode::G_INDEXED_SEXTLOAD:
3027 return selectIndexedExtLoad(I, MRI);
3028 case TargetOpcode::G_INDEXED_LOAD:
3029 return selectIndexedLoad(I, MRI);
3030 case TargetOpcode::G_INDEXED_STORE:
3031 return selectIndexedStore(I&: cast<GIndexedStore>(Val&: I), MRI);
3032
3033 case TargetOpcode::G_LSHR:
3034 case TargetOpcode::G_ASHR:
3035 if (MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3036 return selectVectorAshrLshr(I, MRI);
3037 [[fallthrough]];
3038 case TargetOpcode::G_SHL:
3039 if (Opcode == TargetOpcode::G_SHL &&
3040 MRI.getType(Reg: I.getOperand(i: 0).getReg()).isVector())
3041 return selectVectorSHL(I, MRI);
3042
3043    // These shifts were legalized to have 64-bit shift amounts because we
3044    // want to take advantage of the selection patterns that assume the
3045    // immediates are s64s. However, selectBinaryOp assumes both operands
3046    // have the same bit size.
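    // When the shifted value is only 32 bits wide, truncate the 64-bit shift
    // amount back to 32 bits below via a sub_32 subregister copy.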
3047 {
3048 Register SrcReg = I.getOperand(i: 1).getReg();
3049 Register ShiftReg = I.getOperand(i: 2).getReg();
3050 const LLT ShiftTy = MRI.getType(Reg: ShiftReg);
3051 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3052 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
3053 ShiftTy.getSizeInBits() == 64) {
3054 assert(!ShiftTy.isVector() && "unexpected vector shift ty");
3055 // Insert a subregister copy to implement a 64->32 trunc
3056 auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
3057 .addReg(ShiftReg, 0, AArch64::sub_32);
3058 MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
3059 I.getOperand(i: 2).setReg(Trunc.getReg(0));
3060 }
3061 }
3062 [[fallthrough]];
3063 case TargetOpcode::G_OR: {
3064 // Reject the various things we don't support yet.
3065 if (unsupportedBinOp(I, RBI, MRI, TRI))
3066 return false;
3067
3068 const unsigned OpSize = Ty.getSizeInBits();
3069
3070 const Register DefReg = I.getOperand(i: 0).getReg();
3071 const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3072
3073 const unsigned NewOpc = selectBinaryOp(GenericOpc: I.getOpcode(), RegBankID: RB.getID(), OpSize);
3074 if (NewOpc == I.getOpcode())
3075 return false;
3076
3077 I.setDesc(TII.get(NewOpc));
3078 // FIXME: Should the type be always reset in setDesc?
3079
3080 // Now that we selected an opcode, we need to constrain the register
3081 // operands to use appropriate classes.
3082 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3083 }
3084
3085 case TargetOpcode::G_PTR_ADD: {
3086 emitADD(DefReg: I.getOperand(i: 0).getReg(), LHS&: I.getOperand(i: 1), RHS&: I.getOperand(i: 2), MIRBuilder&: MIB);
3087 I.eraseFromParent();
3088 return true;
3089 }
3090
3091 case TargetOpcode::G_SADDE:
3092 case TargetOpcode::G_UADDE:
3093 case TargetOpcode::G_SSUBE:
3094 case TargetOpcode::G_USUBE:
3095 case TargetOpcode::G_SADDO:
3096 case TargetOpcode::G_UADDO:
3097 case TargetOpcode::G_SSUBO:
3098 case TargetOpcode::G_USUBO:
3099 return selectOverflowOp(I, MRI);
3100
3101 case TargetOpcode::G_PTRMASK: {
3102 Register MaskReg = I.getOperand(i: 2).getReg();
3103 std::optional<int64_t> MaskVal = getIConstantVRegSExtVal(VReg: MaskReg, MRI);
3104 // TODO: Implement arbitrary cases
3105 if (!MaskVal || !isShiftedMask_64(Value: *MaskVal))
3106 return false;
3107
3108 uint64_t Mask = *MaskVal;
3109 I.setDesc(TII.get(AArch64::ANDXri));
3110 I.getOperand(i: 2).ChangeToImmediate(
3111 ImmVal: AArch64_AM::encodeLogicalImmediate(imm: Mask, regSize: 64));
3112
3113 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3114 }
3115 case TargetOpcode::G_PTRTOINT:
3116 case TargetOpcode::G_TRUNC: {
3117 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3118 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3119
3120 const Register DstReg = I.getOperand(i: 0).getReg();
3121 const Register SrcReg = I.getOperand(i: 1).getReg();
3122
3123 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3124 const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3125
3126 if (DstRB.getID() != SrcRB.getID()) {
3127 LLVM_DEBUG(
3128 dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3129 return false;
3130 }
3131
3132 if (DstRB.getID() == AArch64::GPRRegBankID) {
3133 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3134 if (!DstRC)
3135 return false;
3136
3137 const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(Ty: SrcTy, RB: SrcRB);
3138 if (!SrcRC)
3139 return false;
3140
3141 if (!RBI.constrainGenericRegister(Reg: SrcReg, RC: *SrcRC, MRI) ||
3142 !RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI)) {
3143 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3144 return false;
3145 }
3146
3147 if (DstRC == SrcRC) {
3148 // Nothing to be done
3149 } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(SizeInBits: 32) &&
3150 SrcTy == LLT::scalar(SizeInBits: 64)) {
3151 llvm_unreachable("TableGen can import this case");
3152 return false;
3153 } else if (DstRC == &AArch64::GPR32RegClass &&
3154 SrcRC == &AArch64::GPR64RegClass) {
3155 I.getOperand(1).setSubReg(AArch64::sub_32);
3156 } else {
3157 LLVM_DEBUG(
3158 dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3159 return false;
3160 }
3161
3162 I.setDesc(TII.get(TargetOpcode::COPY));
3163 return true;
3164 } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3165 if (DstTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16) &&
3166 SrcTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 32)) {
3167 I.setDesc(TII.get(AArch64::XTNv4i16));
3168 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3169 return true;
3170 }
3171
3172 if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3173 MachineInstr *Extract = emitExtractVectorElt(
3174 DstReg, DstRB, ScalarTy: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), VecReg: SrcReg, LaneIdx: 0, MIRBuilder&: MIB);
3175 if (!Extract)
3176 return false;
3177 I.eraseFromParent();
3178 return true;
3179 }
3180
3181 // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3182 if (Opcode == TargetOpcode::G_PTRTOINT) {
3183 assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3184 I.setDesc(TII.get(TargetOpcode::COPY));
3185 return selectCopy(I, TII, MRI, TRI, RBI);
3186 }
3187 }
3188
3189 return false;
3190 }
3191
3192 case TargetOpcode::G_ANYEXT: {
3193 if (selectUSMovFromExtend(I, MRI))
3194 return true;
3195
3196 const Register DstReg = I.getOperand(i: 0).getReg();
3197 const Register SrcReg = I.getOperand(i: 1).getReg();
3198
3199 const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3200 if (RBDst.getID() != AArch64::GPRRegBankID) {
3201 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3202 << ", expected: GPR\n");
3203 return false;
3204 }
3205
3206 const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3207 if (RBSrc.getID() != AArch64::GPRRegBankID) {
3208 LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3209 << ", expected: GPR\n");
3210 return false;
3211 }
3212
3213 const unsigned DstSize = MRI.getType(Reg: DstReg).getSizeInBits();
3214
3215 if (DstSize == 0) {
3216 LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3217 return false;
3218 }
3219
3220 if (DstSize != 64 && DstSize > 32) {
3221 LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3222 << ", expected: 32 or 64\n");
3223 return false;
3224 }
3225 // At this point G_ANYEXT is just like a plain COPY, but we need
3226 // to explicitly form the 64-bit value if any.
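    // For a 64-bit destination, first wrap the narrower source in a
    // SUBREG_TO_REG so the COPY below sees a full 64-bit register.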
3227 if (DstSize > 32) {
3228 Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3229 BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3230 .addDef(ExtSrc)
3231 .addImm(0)
3232 .addUse(SrcReg)
3233 .addImm(AArch64::sub_32);
3234 I.getOperand(i: 1).setReg(ExtSrc);
3235 }
3236 return selectCopy(I, TII, MRI, TRI, RBI);
3237 }
3238
3239 case TargetOpcode::G_ZEXT:
3240 case TargetOpcode::G_SEXT_INREG:
3241 case TargetOpcode::G_SEXT: {
3242 if (selectUSMovFromExtend(I, MRI))
3243 return true;
3244
3245 unsigned Opcode = I.getOpcode();
3246 const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3247 const Register DefReg = I.getOperand(i: 0).getReg();
3248 Register SrcReg = I.getOperand(i: 1).getReg();
3249 const LLT DstTy = MRI.getType(Reg: DefReg);
3250 const LLT SrcTy = MRI.getType(Reg: SrcReg);
3251 unsigned DstSize = DstTy.getSizeInBits();
3252 unsigned SrcSize = SrcTy.getSizeInBits();
3253
3254 // SEXT_INREG has the same src reg size as dst, the size of the value to be
3255 // extended is encoded in the imm.
3256 if (Opcode == TargetOpcode::G_SEXT_INREG)
3257 SrcSize = I.getOperand(i: 2).getImm();
3258
3259 if (DstTy.isVector())
3260 return false; // Should be handled by imported patterns.
3261
3262 assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3263 AArch64::GPRRegBankID &&
3264 "Unexpected ext regbank");
3265
3266 MachineInstr *ExtI;
3267
3268    // First, check whether we're extending the result of a load whose dest type
3269    // is smaller than 32 bits; if so, this zext is redundant. GPR32 is the
3270    // smallest GPR register on AArch64, and all smaller loads automatically
3271    // zero-extend the upper bits. E.g.
3272 // %v(s8) = G_LOAD %p, :: (load 1)
3273 // %v2(s32) = G_ZEXT %v(s8)
3274 if (!IsSigned) {
3275 auto *LoadMI = getOpcodeDef(Opcode: TargetOpcode::G_LOAD, Reg: SrcReg, MRI);
3276 bool IsGPR =
3277 RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3278 if (LoadMI && IsGPR) {
3279 const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3280 unsigned BytesLoaded = MemOp->getSize().getValue();
3281 if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3282 return selectCopy(I, TII, MRI, TRI, RBI);
3283 }
3284
3285 // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3286 // + SUBREG_TO_REG.
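      // ORRWrs with WZR is the canonical 32-bit register move; writing a W
      // register implicitly zeroes the upper 32 bits, so SUBREG_TO_REG then
      // reassembles the 64-bit result.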
3287 if (IsGPR && SrcSize == 32 && DstSize == 64) {
3288 Register SubregToRegSrc =
3289 MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3290 const Register ZReg = AArch64::WZR;
3291 MIB.buildInstr(AArch64::ORRWrs, {SubregToRegSrc}, {ZReg, SrcReg})
3292 .addImm(0);
3293
3294 MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3295 .addImm(0)
3296 .addUse(SubregToRegSrc)
3297 .addImm(AArch64::sub_32);
3298
3299 if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3300 MRI)) {
3301 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3302 return false;
3303 }
3304
3305 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3306 MRI)) {
3307 LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3308 return false;
3309 }
3310
3311 I.eraseFromParent();
3312 return true;
3313 }
3314 }
3315
3316 if (DstSize == 64) {
3317 if (Opcode != TargetOpcode::G_SEXT_INREG) {
3318 // FIXME: Can we avoid manually doing this?
3319 if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3320 MRI)) {
3321 LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3322 << " operand\n");
3323 return false;
3324 }
3325 SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3326 {&AArch64::GPR64RegClass}, {})
3327 .addImm(0)
3328 .addUse(SrcReg)
3329 .addImm(AArch64::sub_32)
3330 .getReg(0);
3331 }
3332
3333 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3334 {DefReg}, {SrcReg})
3335 .addImm(0)
3336 .addImm(SrcSize - 1);
3337 } else if (DstSize <= 32) {
3338 ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3339 {DefReg}, {SrcReg})
3340 .addImm(0)
3341 .addImm(SrcSize - 1);
3342 } else {
3343 return false;
3344 }
3345
3346 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3347 I.eraseFromParent();
3348 return true;
3349 }
3350
3351 case TargetOpcode::G_SITOFP:
3352 case TargetOpcode::G_UITOFP:
3353 case TargetOpcode::G_FPTOSI:
3354 case TargetOpcode::G_FPTOUI: {
3355 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg()),
3356 SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3357 const unsigned NewOpc = selectFPConvOpc(GenericOpc: Opcode, DstTy, SrcTy);
3358 if (NewOpc == Opcode)
3359 return false;
3360
3361 I.setDesc(TII.get(NewOpc));
3362 constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3363 I.setFlags(MachineInstr::NoFPExcept);
3364
3365 return true;
3366 }
3367
3368 case TargetOpcode::G_FREEZE:
3369 return selectCopy(I, TII, MRI, TRI, RBI);
3370
3371 case TargetOpcode::G_INTTOPTR:
3372 // The importer is currently unable to import pointer types since they
3373 // didn't exist in SelectionDAG.
3374 return selectCopy(I, TII, MRI, TRI, RBI);
3375
3376 case TargetOpcode::G_BITCAST:
3377 // Imported SelectionDAG rules can handle every bitcast except those that
3378 // bitcast from a type to the same type. Ideally, these shouldn't occur
3379 // but we might not run an optimizer that deletes them. The other exception
3380 // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3381 // of them.
3382 return selectCopy(I, TII, MRI, TRI, RBI);
3383
3384 case TargetOpcode::G_SELECT: {
3385 auto &Sel = cast<GSelect>(Val&: I);
3386 const Register CondReg = Sel.getCondReg();
3387 const Register TReg = Sel.getTrueReg();
3388 const Register FReg = Sel.getFalseReg();
3389
3390 if (tryOptSelect(Sel))
3391 return true;
3392
3393    // Make sure to use an unused vreg rather than WZR as the destination, so
3394    // that later peephole optimizations can still optimize this compare.
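    // ANDSWri against bit 0 sets NZCV from the boolean condition; emitSelect
    // below then picks the true value on NE (i.e. when the low bit is set).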
3395 Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3396 auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3397 .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3398 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3399 if (!emitSelect(Dst: Sel.getReg(Idx: 0), True: TReg, False: FReg, CC: AArch64CC::NE, MIB))
3400 return false;
3401 Sel.eraseFromParent();
3402 return true;
3403 }
3404 case TargetOpcode::G_ICMP: {
3405 if (Ty.isVector())
3406 return selectVectorICmp(I, MRI);
3407
3408 if (Ty != LLT::scalar(SizeInBits: 32)) {
3409 LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3410 << ", expected: " << LLT::scalar(32) << '\n');
3411 return false;
3412 }
3413
3414 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3415 const AArch64CC::CondCode InvCC =
3416 changeICMPPredToAArch64CC(P: CmpInst::getInversePredicate(pred: Pred));
3417 emitIntegerCompare(LHS&: I.getOperand(i: 2), RHS&: I.getOperand(i: 3), Predicate&: I.getOperand(i: 1), MIRBuilder&: MIB);
3418 emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3419 /*Src2=*/AArch64::WZR, InvCC, MIB);
3420 I.eraseFromParent();
3421 return true;
3422 }
3423
3424 case TargetOpcode::G_FCMP: {
3425 CmpInst::Predicate Pred =
3426 static_cast<CmpInst::Predicate>(I.getOperand(i: 1).getPredicate());
3427 if (!emitFPCompare(LHS: I.getOperand(i: 2).getReg(), RHS: I.getOperand(i: 3).getReg(), MIRBuilder&: MIB,
3428 Pred) ||
3429 !emitCSetForFCmp(Dst: I.getOperand(i: 0).getReg(), Pred, MIRBuilder&: MIB))
3430 return false;
3431 I.eraseFromParent();
3432 return true;
3433 }
3434 case TargetOpcode::G_VASTART:
3435 return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3436 : selectVaStartAAPCS(I, MF, MRI);
3437 case TargetOpcode::G_INTRINSIC:
3438 return selectIntrinsic(I, MRI);
3439 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3440 return selectIntrinsicWithSideEffects(I, MRI);
3441 case TargetOpcode::G_IMPLICIT_DEF: {
3442 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3443 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3444 const Register DstReg = I.getOperand(i: 0).getReg();
3445 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3446 const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(Ty: DstTy, RB: DstRB);
3447 RBI.constrainGenericRegister(Reg: DstReg, RC: *DstRC, MRI);
3448 return true;
3449 }
3450 case TargetOpcode::G_BLOCK_ADDR: {
3451 if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) {
3452 materializeLargeCMVal(I, V: I.getOperand(i: 1).getBlockAddress(), OpFlags: 0);
3453 I.eraseFromParent();
3454 return true;
3455 } else {
3456 I.setDesc(TII.get(AArch64::MOVaddrBA));
3457 auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3458 I.getOperand(0).getReg())
3459 .addBlockAddress(I.getOperand(1).getBlockAddress(),
3460 /* Offset */ 0, AArch64II::MO_PAGE)
3461 .addBlockAddress(
3462 I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3463 AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3464 I.eraseFromParent();
3465 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3466 }
3467 }
3468 case AArch64::G_DUP: {
3469    // When the scalar of G_DUP is an s8/s16 GPR, it can't be selected by the
3470    // imported patterns, so do it manually here. Avoiding the s16 GPR entirely
3471    // is difficult: if we added an anyextend at register bank selection to fix
3472    // this, we might end up pessimizing the FPR case. Manual selection is the
3473    // most robust solution for now.
3474 if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3475 AArch64::GPRRegBankID)
3476 return false; // We expect the fpr regbank case to be imported.
3477 LLT VecTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3478 if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 8))
3479 I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3480 else if (VecTy == LLT::fixed_vector(NumElements: 16, ScalarSizeInBits: 8))
3481 I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3482 else if (VecTy == LLT::fixed_vector(NumElements: 4, ScalarSizeInBits: 16))
3483 I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3484 else if (VecTy == LLT::fixed_vector(NumElements: 8, ScalarSizeInBits: 16))
3485 I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3486 else
3487 return false;
3488 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3489 }
3490 case TargetOpcode::G_BUILD_VECTOR:
3491 return selectBuildVector(I, MRI);
3492 case TargetOpcode::G_MERGE_VALUES:
3493 return selectMergeValues(I, MRI);
3494 case TargetOpcode::G_UNMERGE_VALUES:
3495 return selectUnmergeValues(I, MRI);
3496 case TargetOpcode::G_SHUFFLE_VECTOR:
3497 return selectShuffleVector(I, MRI);
3498 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3499 return selectExtractElt(I, MRI);
3500 case TargetOpcode::G_CONCAT_VECTORS:
3501 return selectConcatVectors(I, MRI);
3502 case TargetOpcode::G_JUMP_TABLE:
3503 return selectJumpTable(I, MRI);
3504 case TargetOpcode::G_MEMCPY:
3505 case TargetOpcode::G_MEMCPY_INLINE:
3506 case TargetOpcode::G_MEMMOVE:
3507 case TargetOpcode::G_MEMSET:
3508 assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3509 return selectMOPS(I, MRI);
3510 }
3511
3512 return false;
3513}
3514
3515bool AArch64InstructionSelector::selectAndRestoreState(MachineInstr &I) {
3516 MachineIRBuilderState OldMIBState = MIB.getState();
3517 bool Success = select(I);
3518 MIB.setState(OldMIBState);
3519 return Success;
3520}
3521
3522bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3523 MachineRegisterInfo &MRI) {
3524 unsigned Mopcode;
3525 switch (GI.getOpcode()) {
3526 case TargetOpcode::G_MEMCPY:
3527 case TargetOpcode::G_MEMCPY_INLINE:
3528 Mopcode = AArch64::MOPSMemoryCopyPseudo;
3529 break;
3530 case TargetOpcode::G_MEMMOVE:
3531 Mopcode = AArch64::MOPSMemoryMovePseudo;
3532 break;
3533 case TargetOpcode::G_MEMSET:
3534 // For tagged memset see llvm.aarch64.mops.memset.tag
3535 Mopcode = AArch64::MOPSMemorySetPseudo;
3536 break;
3537 }
3538
3539 auto &DstPtr = GI.getOperand(i: 0);
3540 auto &SrcOrVal = GI.getOperand(i: 1);
3541 auto &Size = GI.getOperand(i: 2);
3542
3543 // Create copies of the registers that can be clobbered.
3544 const Register DstPtrCopy = MRI.cloneVirtualRegister(VReg: DstPtr.getReg());
3545 const Register SrcValCopy = MRI.cloneVirtualRegister(VReg: SrcOrVal.getReg());
3546 const Register SizeCopy = MRI.cloneVirtualRegister(VReg: Size.getReg());
3547
3548 const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3549 const auto &SrcValRegClass =
3550 IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
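  // GPR64 includes XZR while GPR64common does not; presumably this is so a
  // zero memset value can use XZR directly, whereas the pointer operands
  // cannot.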
3551
3552 // Constrain to specific registers
3553 RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3554 RBI.constrainGenericRegister(Reg: SrcValCopy, RC: SrcValRegClass, MRI);
3555 RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3556
3557 MIB.buildCopy(Res: DstPtrCopy, Op: DstPtr);
3558 MIB.buildCopy(Res: SrcValCopy, Op: SrcOrVal);
3559 MIB.buildCopy(Res: SizeCopy, Op: Size);
3560
3561 // New instruction uses the copied registers because it must update them.
3562 // The defs are not used since they don't exist in G_MEM*. They are still
3563 // tied.
3564 // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3565 Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3566 Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3567 if (IsSet) {
3568 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSize},
3569 SrcOps: {DstPtrCopy, SizeCopy, SrcValCopy});
3570 } else {
3571 Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3572 MIB.buildInstr(Opc: Mopcode, DstOps: {DefDstPtr, DefSrcPtr, DefSize},
3573 SrcOps: {DstPtrCopy, SrcValCopy, SizeCopy});
3574 }
3575
3576 GI.eraseFromParent();
3577 return true;
3578}
3579
3580bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3581 MachineRegisterInfo &MRI) {
3582 assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3583 Register JTAddr = I.getOperand(i: 0).getReg();
3584 unsigned JTI = I.getOperand(i: 1).getIndex();
3585 Register Index = I.getOperand(i: 2).getReg();
3586
3587 Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3588 Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3589
3590 MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(Idx: JTI, Size: 4, PCRelSym: nullptr);
3591 auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3592 {TargetReg, ScratchReg}, {JTAddr, Index})
3593 .addJumpTableIndex(JTI);
3594 // Save the jump table info.
3595 MIB.buildInstr(Opc: TargetOpcode::JUMP_TABLE_DEBUG_INFO, DstOps: {},
3596 SrcOps: {static_cast<int64_t>(JTI)});
3597 // Build the indirect branch.
3598 MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3599 I.eraseFromParent();
3600 return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3601}
3602
3603bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3604 MachineRegisterInfo &MRI) {
3605 assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3606 assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3607
3608 Register DstReg = I.getOperand(i: 0).getReg();
3609 unsigned JTI = I.getOperand(i: 1).getIndex();
3610 // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
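  // The two jump-table-index operands carry the ADRP page (MO_PAGE) and the
  // ADD low-12 offset (MO_PAGEOFF | MO_NC) halves of the address.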
3611 auto MovMI =
3612 MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3613 .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3614 .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3615 I.eraseFromParent();
3616 return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3617}
3618
3619bool AArch64InstructionSelector::selectTLSGlobalValue(
3620 MachineInstr &I, MachineRegisterInfo &MRI) {
3621 if (!STI.isTargetMachO())
3622 return false;
3623 MachineFunction &MF = *I.getParent()->getParent();
3624 MF.getFrameInfo().setAdjustsStack(true);
3625
3626 const auto &GlobalOp = I.getOperand(i: 1);
3627 assert(GlobalOp.getOffset() == 0 &&
3628 "Shouldn't have an offset on TLS globals!");
3629 const GlobalValue &GV = *GlobalOp.getGlobal();
3630
3631 auto LoadGOT =
3632 MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3633 .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3634
3635 auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3636 {LoadGOT.getReg(0)})
3637 .addImm(0);
3638
3639 MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3640 // TLS calls preserve all registers except those that absolutely must be
3641 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3642 // silly).
3643 MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3644 .addUse(AArch64::X0, RegState::Implicit)
3645 .addDef(AArch64::X0, RegState::Implicit)
3646 .addRegMask(TRI.getTLSCallPreservedMask());
3647
3648 MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3649 RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3650 MRI);
3651 I.eraseFromParent();
3652 return true;
3653}
3654
3655bool AArch64InstructionSelector::selectVectorICmp(
3656 MachineInstr &I, MachineRegisterInfo &MRI) {
3657 Register DstReg = I.getOperand(i: 0).getReg();
3658 LLT DstTy = MRI.getType(Reg: DstReg);
3659 Register SrcReg = I.getOperand(i: 2).getReg();
3660 Register Src2Reg = I.getOperand(i: 3).getReg();
3661 LLT SrcTy = MRI.getType(Reg: SrcReg);
3662
3663 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3664 unsigned NumElts = DstTy.getNumElements();
3665
3666 // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3667 // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3668 // Third index is cc opcode:
3669 // 0 == eq
3670 // 1 == ugt
3671 // 2 == uge
3672 // 3 == ult
3673 // 4 == ule
3674 // 5 == sgt
3675 // 6 == sge
3676 // 7 == slt
3677 // 8 == sle
3678 // ne is done by negating 'eq' result.
3679
3680  // The table below assumes that for some comparisons the operands will be
3681 // commuted.
3682 // ult op == commute + ugt op
3683 // ule op == commute + uge op
3684 // slt op == commute + sgt op
3685 // sle op == commute + sge op
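  // For example, a v4s32 'slt' compare maps to CMGTv4i32 with its operands
  // swapped (illustrative; see OpcTable below).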
3686 unsigned PredIdx = 0;
3687 bool SwapOperands = false;
3688 CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(i: 1).getPredicate();
3689 switch (Pred) {
3690 case CmpInst::ICMP_NE:
3691 case CmpInst::ICMP_EQ:
3692 PredIdx = 0;
3693 break;
3694 case CmpInst::ICMP_UGT:
3695 PredIdx = 1;
3696 break;
3697 case CmpInst::ICMP_UGE:
3698 PredIdx = 2;
3699 break;
3700 case CmpInst::ICMP_ULT:
3701 PredIdx = 3;
3702 SwapOperands = true;
3703 break;
3704 case CmpInst::ICMP_ULE:
3705 PredIdx = 4;
3706 SwapOperands = true;
3707 break;
3708 case CmpInst::ICMP_SGT:
3709 PredIdx = 5;
3710 break;
3711 case CmpInst::ICMP_SGE:
3712 PredIdx = 6;
3713 break;
3714 case CmpInst::ICMP_SLT:
3715 PredIdx = 7;
3716 SwapOperands = true;
3717 break;
3718 case CmpInst::ICMP_SLE:
3719 PredIdx = 8;
3720 SwapOperands = true;
3721 break;
3722 default:
3723 llvm_unreachable("Unhandled icmp predicate");
3724 return false;
3725 }
3726
3727 // This table obviously should be tablegen'd when we have our GISel native
3728 // tablegen selector.
3729
3730 static const unsigned OpcTable[4][4][9] = {
3731 {
3732 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3733 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3734 0 /* invalid */},
3735 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3736 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3737 0 /* invalid */},
3738 {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3739 AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3740 AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3741 {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3742 AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3743 AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3744 },
3745 {
3746 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3747 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3748 0 /* invalid */},
3749 {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3750 AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3751 AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3752 {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3753 AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3754 AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3755 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3756 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3757 0 /* invalid */}
3758 },
3759 {
3760 {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3761 AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3762 AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3763 {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3764 AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3765 AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3766 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3767 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3768 0 /* invalid */},
3769 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3770 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3771 0 /* invalid */}
3772 },
3773 {
3774 {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3775 AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3776 AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3777 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3778 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3779 0 /* invalid */},
3780 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3781 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3782 0 /* invalid */},
3783 {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3784 0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3785 0 /* invalid */}
3786 },
3787 };
3788 unsigned EltIdx = Log2_32(Value: SrcEltSize / 8);
3789 unsigned NumEltsIdx = Log2_32(Value: NumElts / 2);
3790 unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3791 if (!Opc) {
3792 LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3793 return false;
3794 }
3795
3796 const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3797 const TargetRegisterClass *SrcRC =
3798 getRegClassForTypeOnBank(Ty: SrcTy, RB: VecRB, GetAllRegSet: true);
3799 if (!SrcRC) {
3800 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3801 return false;
3802 }
3803
3804 unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3805 if (SrcTy.getSizeInBits() == 128)
3806 NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3807
3808 if (SwapOperands)
3809 std::swap(a&: SrcReg, b&: Src2Reg);
3810
3811 auto Cmp = MIB.buildInstr(Opc, DstOps: {SrcRC}, SrcOps: {SrcReg, Src2Reg});
3812 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3813
3814 // Invert if we had a 'ne' cc.
3815 if (NotOpc) {
3816 Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3817 constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3818 } else {
3819 MIB.buildCopy(Res: DstReg, Op: Cmp.getReg(0));
3820 }
3821 RBI.constrainGenericRegister(Reg: DstReg, RC: *SrcRC, MRI);
3822 I.eraseFromParent();
3823 return true;
3824}
3825
3826MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3827 unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3828 MachineIRBuilder &MIRBuilder) const {
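  // Build an IMPLICIT_DEF of DstRC and INSERT_SUBREG the scalar into it, so
  // the scalar ends up as lane 0 of a full vector register.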
3829 auto Undef = MIRBuilder.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstRC}, SrcOps: {});
3830
3831 auto BuildFn = [&](unsigned SubregIndex) {
3832 auto Ins =
3833 MIRBuilder
3834 .buildInstr(Opc: TargetOpcode::INSERT_SUBREG, DstOps: {DstRC}, SrcOps: {Undef, Scalar})
3835 .addImm(Val: SubregIndex);
3836 constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3837 constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3838 return &*Ins;
3839 };
3840
3841 switch (EltSize) {
3842 case 8:
3843 return BuildFn(AArch64::bsub);
3844 case 16:
3845 return BuildFn(AArch64::hsub);
3846 case 32:
3847 return BuildFn(AArch64::ssub);
3848 case 64:
3849 return BuildFn(AArch64::dsub);
3850 default:
3851 return nullptr;
3852 }
3853}
3854
3855MachineInstr *
3856AArch64InstructionSelector::emitNarrowVector(Register DstReg, Register SrcReg,
3857 MachineIRBuilder &MIB,
3858 MachineRegisterInfo &MRI) const {
3859 LLT DstTy = MRI.getType(Reg: DstReg);
3860 const TargetRegisterClass *RC =
3861 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(SrcReg, MRI, TRI));
3862 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
3863 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
3864 return nullptr;
3865 }
3866 unsigned SubReg = 0;
3867 if (!getSubRegForClass(RC, TRI, SubReg))
3868 return nullptr;
3869 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
3870 LLVM_DEBUG(dbgs() << "Unsupported destination size! ("
3871                      << DstTy.getSizeInBits() << ")\n");
3872 return nullptr;
3873 }
3874 auto Copy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {})
3875 .addReg(RegNo: SrcReg, flags: 0, SubReg);
3876 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
3877 return Copy;
3878}
3879
3880bool AArch64InstructionSelector::selectMergeValues(
3881 MachineInstr &I, MachineRegisterInfo &MRI) {
3882 assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3883 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
3884 const LLT SrcTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
3885 assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3886 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(i: 1).getReg(), MRI, TRI);
3887
3888 if (I.getNumOperands() != 3)
3889 return false;
3890
3891 // Merging 2 s64s into an s128.
3892 if (DstTy == LLT::scalar(SizeInBits: 128)) {
3893 if (SrcTy.getSizeInBits() != 64)
3894 return false;
3895 Register DstReg = I.getOperand(i: 0).getReg();
3896 Register Src1Reg = I.getOperand(i: 1).getReg();
3897 Register Src2Reg = I.getOperand(i: 2).getReg();
3898 auto Tmp = MIB.buildInstr(Opc: TargetOpcode::IMPLICIT_DEF, DstOps: {DstTy}, SrcOps: {});
3899 MachineInstr *InsMI = emitLaneInsert(DstReg: std::nullopt, SrcReg: Tmp.getReg(Idx: 0), EltReg: Src1Reg,
3900 /* LaneIdx */ 0, RB, MIRBuilder&: MIB);
3901 if (!InsMI)
3902 return false;
3903 MachineInstr *Ins2MI = emitLaneInsert(DstReg, SrcReg: InsMI->getOperand(i: 0).getReg(),
3904 EltReg: Src2Reg, /* LaneIdx */ 1, RB, MIRBuilder&: MIB);
3905 if (!Ins2MI)
3906 return false;
3907 constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3908 constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3909 I.eraseFromParent();
3910 return true;
3911 }
3912
3913 if (RB.getID() != AArch64::GPRRegBankID)
3914 return false;
3915
3916 if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3917 return false;
3918
3919 auto *DstRC = &AArch64::GPR64RegClass;
3920 Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3921 MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3922 TII.get(TargetOpcode::SUBREG_TO_REG))
3923 .addDef(SubToRegDef)
3924 .addImm(0)
3925 .addUse(I.getOperand(1).getReg())
3926 .addImm(AArch64::sub_32);
3927 Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3928 // Need to anyext the second scalar before we can use bfm
3929 MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3930 TII.get(TargetOpcode::SUBREG_TO_REG))
3931 .addDef(SubToRegDef2)
3932 .addImm(0)
3933 .addUse(I.getOperand(2).getReg())
3934 .addImm(AArch64::sub_32);
3935 MachineInstr &BFM =
3936 *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3937 .addDef(I.getOperand(0).getReg())
3938 .addUse(SubToRegDef)
3939 .addUse(SubToRegDef2)
3940 .addImm(32)
3941 .addImm(31);
3942 constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3943 constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3944 constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3945 I.eraseFromParent();
3946 return true;
3947}
3948
3949static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3950 const unsigned EltSize) {
3951 // Choose a lane copy opcode and subregister based off of the size of the
3952 // vector's elements.
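  // The DUPi* instructions copy a single lane of a 128-bit vector register
  // into a scalar FPR of the matching width.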
3953 switch (EltSize) {
3954 case 8:
3955 CopyOpc = AArch64::DUPi8;
3956 ExtractSubReg = AArch64::bsub;
3957 break;
3958 case 16:
3959 CopyOpc = AArch64::DUPi16;
3960 ExtractSubReg = AArch64::hsub;
3961 break;
3962 case 32:
3963 CopyOpc = AArch64::DUPi32;
3964 ExtractSubReg = AArch64::ssub;
3965 break;
3966 case 64:
3967 CopyOpc = AArch64::DUPi64;
3968 ExtractSubReg = AArch64::dsub;
3969 break;
3970 default:
3971 // Unknown size, bail out.
3972 LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3973 return false;
3974 }
3975 return true;
3976}
3977
3978MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3979 std::optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3980 Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3981 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3982 unsigned CopyOpc = 0;
3983 unsigned ExtractSubReg = 0;
3984 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: ScalarTy.getSizeInBits())) {
3985 LLVM_DEBUG(
3986 dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3987 return nullptr;
3988 }
3989
3990 const TargetRegisterClass *DstRC =
3991 getRegClassForTypeOnBank(Ty: ScalarTy, RB: DstRB, GetAllRegSet: true);
3992 if (!DstRC) {
3993 LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3994 return nullptr;
3995 }
3996
3997 const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3998 const LLT &VecTy = MRI.getType(Reg: VecReg);
3999 const TargetRegisterClass *VecRC =
4000 getRegClassForTypeOnBank(Ty: VecTy, RB: VecRB, GetAllRegSet: true);
4001 if (!VecRC) {
4002 LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4003 return nullptr;
4004 }
4005
4006 // The register that we're going to copy into.
4007 Register InsertReg = VecReg;
4008 if (!DstReg)
4009 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
4010 // If the lane index is 0, we just use a subregister COPY.
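  // Lane 0 aliases the bsub/hsub/ssub/dsub subregister of the vector register,
  // so a plain subregister COPY suffices.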
4011 if (LaneIdx == 0) {
4012 auto Copy = MIRBuilder.buildInstr(Opc: TargetOpcode::COPY, DstOps: {*DstReg}, SrcOps: {})
4013 .addReg(RegNo: VecReg, flags: 0, SubReg: ExtractSubReg);
4014 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4015 return &*Copy;
4016 }
4017
4018 // Lane copies require 128-bit wide registers. If we're dealing with an
4019 // unpacked vector, then we need to move up to that width. Insert an implicit
4020 // def and a subregister insert to get us there.
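  // E.g. (illustrative) extracting lane 1 of a v2s32: the 64-bit source is
  // first inserted into an undef FPR128 (via dsub), then DUPi32 reads lane 1.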
4021 if (VecTy.getSizeInBits() != 128) {
4022 MachineInstr *ScalarToVector = emitScalarToVector(
4023 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4024 if (!ScalarToVector)
4025 return nullptr;
4026 InsertReg = ScalarToVector->getOperand(i: 0).getReg();
4027 }
4028
4029 MachineInstr *LaneCopyMI =
4030 MIRBuilder.buildInstr(Opc: CopyOpc, DstOps: {*DstReg}, SrcOps: {InsertReg}).addImm(Val: LaneIdx);
4031 constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4032
4033 // Make sure that we actually constrain the initial copy.
4034 RBI.constrainGenericRegister(Reg: *DstReg, RC: *DstRC, MRI);
4035 return LaneCopyMI;
4036}
4037
4038bool AArch64InstructionSelector::selectExtractElt(
4039 MachineInstr &I, MachineRegisterInfo &MRI) {
4040 assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4041 "unexpected opcode!");
4042 Register DstReg = I.getOperand(i: 0).getReg();
4043 const LLT NarrowTy = MRI.getType(Reg: DstReg);
4044 const Register SrcReg = I.getOperand(i: 1).getReg();
4045 const LLT WideTy = MRI.getType(Reg: SrcReg);
4046 (void)WideTy;
4047 assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4048 "source register size too small!");
4049 assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4050
4051 // Need the lane index to determine the correct copy opcode.
4052 MachineOperand &LaneIdxOp = I.getOperand(i: 2);
4053 assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4054
4055 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4056 LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4057 return false;
4058 }
4059
4060 // Find the index to extract from.
4061 auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: LaneIdxOp.getReg(), MRI);
4062 if (!VRegAndVal)
4063 return false;
4064 unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4065
4067 const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4068 MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg,
4069 LaneIdx, MIRBuilder&: MIB);
4070 if (!Extract)
4071 return false;
4072
4073 I.eraseFromParent();
4074 return true;
4075}
4076
4077bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4078 MachineInstr &I, MachineRegisterInfo &MRI) {
4079 unsigned NumElts = I.getNumOperands() - 1;
4080 Register SrcReg = I.getOperand(i: NumElts).getReg();
4081 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4082 const LLT SrcTy = MRI.getType(Reg: SrcReg);
4083
4084 assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4085 if (SrcTy.getSizeInBits() > 128) {
4086 LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4087 return false;
4088 }
4089
4090 // We implement a split vector operation by treating the sub-vectors as
4091 // scalars and extracting them.
4092 const RegisterBank &DstRB =
4093 *RBI.getRegBank(I.getOperand(i: 0).getReg(), MRI, TRI);
4094 for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4095 Register Dst = I.getOperand(i: OpIdx).getReg();
4096 MachineInstr *Extract =
4097 emitExtractVectorElt(DstReg: Dst, DstRB, ScalarTy: NarrowTy, VecReg: SrcReg, LaneIdx: OpIdx, MIRBuilder&: MIB);
4098 if (!Extract)
4099 return false;
4100 }
4101 I.eraseFromParent();
4102 return true;
4103}
4104
4105bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4106 MachineRegisterInfo &MRI) {
4107 assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4108 "unexpected opcode");
4109
4110 // TODO: Handle unmerging into GPRs and from scalars to scalars.
4111 if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4112 AArch64::FPRRegBankID ||
4113 RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4114 AArch64::FPRRegBankID) {
4115 LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4116 "currently unsupported.\n");
4117 return false;
4118 }
4119
4120 // The last operand is the vector source register, and every other operand is
4121 // a register to unpack into.
4122 unsigned NumElts = I.getNumOperands() - 1;
4123 Register SrcReg = I.getOperand(i: NumElts).getReg();
4124 const LLT NarrowTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
4125 const LLT WideTy = MRI.getType(Reg: SrcReg);
4126 (void)WideTy;
4127 assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4128 "can only unmerge from vector or s128 types!");
4129 assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4130 "source register size too small!");
4131
4132 if (!NarrowTy.isScalar())
4133 return selectSplitVectorUnmerge(I, MRI);
4134
4135 // Choose a lane copy opcode and subregister based off of the size of the
4136 // vector's elements.
4137 unsigned CopyOpc = 0;
4138 unsigned ExtractSubReg = 0;
4139 if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, EltSize: NarrowTy.getSizeInBits()))
4140 return false;
4141
4142 // Set up for the lane copies.
4143 MachineBasicBlock &MBB = *I.getParent();
4144
4145 // Stores the registers we'll be copying from.
4146 SmallVector<Register, 4> InsertRegs;
4147
4148 // We'll use the first register twice, so we only need NumElts-1 registers.
4149 unsigned NumInsertRegs = NumElts - 1;
4150
4151 // If our elements fit into exactly 128 bits, then we can copy from the source
4152 // directly. Otherwise, we need to do a bit of setup with some subregister
4153 // inserts.
4154 if (NarrowTy.getSizeInBits() * NumElts == 128) {
4155 InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4156 } else {
4157 // No. We have to perform subregister inserts. For each insert, create an
4158 // implicit def and a subregister insert, and save the register we create.
4159 const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4160 LLT::fixed_vector(NumElements: NumElts, ScalarSizeInBits: WideTy.getScalarSizeInBits()),
4161 *RBI.getRegBank(SrcReg, MRI, TRI));
4162 unsigned SubReg = 0;
4163 bool Found = getSubRegForClass(RC, TRI, SubReg);
4164 (void)Found;
4165    assert(Found && "expected to find last operand's subreg idx");
4166 for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4167 Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4168 MachineInstr &ImpDefMI =
4169 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4170 ImpDefReg);
4171
4172 // Now, create the subregister insert from SrcReg.
4173 Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4174 MachineInstr &InsMI =
4175 *BuildMI(MBB, I, I.getDebugLoc(),
4176 TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4177 .addUse(ImpDefReg)
4178 .addUse(SrcReg)
4179 .addImm(SubReg);
4180
4181 constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4182 constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4183
4184 // Save the register so that we can copy from it after.
4185 InsertRegs.push_back(Elt: InsertReg);
4186 }
4187 }
4188
4189 // Now that we've created any necessary subregister inserts, we can
4190 // create the copies.
4191 //
4192 // Perform the first copy separately as a subregister copy.
4193 Register CopyTo = I.getOperand(i: 0).getReg();
4194 auto FirstCopy = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {CopyTo}, SrcOps: {})
4195 .addReg(RegNo: InsertRegs[0], flags: 0, SubReg: ExtractSubReg);
4196 constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4197
4198 // Now, perform the remaining copies as vector lane copies.
4199 unsigned LaneIdx = 1;
4200 for (Register InsReg : InsertRegs) {
4201 Register CopyTo = I.getOperand(i: LaneIdx).getReg();
4202 MachineInstr &CopyInst =
4203 *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4204 .addUse(InsReg)
4205 .addImm(LaneIdx);
4206 constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4207 ++LaneIdx;
4208 }
4209
4210 // Separately constrain the first copy's destination. Because of the
4211 // limitation in constrainOperandRegClass, we can't guarantee that this will
4212 // actually be constrained. So, do it ourselves using the second operand.
4213 const TargetRegisterClass *RC =
4214 MRI.getRegClassOrNull(Reg: I.getOperand(i: 1).getReg());
4215 if (!RC) {
4216 LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4217 return false;
4218 }
4219
4220 RBI.constrainGenericRegister(Reg: CopyTo, RC: *RC, MRI);
4221 I.eraseFromParent();
4222 return true;
4223}
4224
4225bool AArch64InstructionSelector::selectConcatVectors(
4226 MachineInstr &I, MachineRegisterInfo &MRI) {
4227 assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4228 "Unexpected opcode");
4229 Register Dst = I.getOperand(i: 0).getReg();
4230 Register Op1 = I.getOperand(i: 1).getReg();
4231 Register Op2 = I.getOperand(i: 2).getReg();
4232 MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIRBuilder&: MIB);
4233 if (!ConcatMI)
4234 return false;
4235 I.eraseFromParent();
4236 return true;
4237}
4238
4239unsigned
4240AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4241 MachineFunction &MF) const {
4242 Type *CPTy = CPVal->getType();
4243 Align Alignment = MF.getDataLayout().getPrefTypeAlign(Ty: CPTy);
4244
4245 MachineConstantPool *MCP = MF.getConstantPool();
4246 return MCP->getConstantPoolIndex(C: CPVal, Alignment);
4247}
4248
4249MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4250 const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4251 const TargetRegisterClass *RC;
4252 unsigned Opc;
4253 bool IsTiny = TM.getCodeModel() == CodeModel::Tiny;
4254 unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(Ty: CPVal->getType());
4255 switch (Size) {
4256 case 16:
4257 RC = &AArch64::FPR128RegClass;
4258 Opc = IsTiny ? AArch64::LDRQl : AArch64::LDRQui;
4259 break;
4260 case 8:
4261 RC = &AArch64::FPR64RegClass;
4262 Opc = IsTiny ? AArch64::LDRDl : AArch64::LDRDui;
4263 break;
4264 case 4:
4265 RC = &AArch64::FPR32RegClass;
4266 Opc = IsTiny ? AArch64::LDRSl : AArch64::LDRSui;
4267 break;
4268 case 2:
4269 RC = &AArch64::FPR16RegClass;
4270 Opc = AArch64::LDRHui;
4271 break;
4272 default:
4273 LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4274 << *CPVal->getType());
4275 return nullptr;
4276 }
4277
4278 MachineInstr *LoadMI = nullptr;
4279 auto &MF = MIRBuilder.getMF();
4280 unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4281 if (IsTiny && (Size == 16 || Size == 8 || Size == 4)) {
4282 // Use load(literal) for tiny code model.
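    // LDR (literal) has a +/-1MiB range, which is sufficient under the tiny
    // code model; other cases use the ADRP + page-offset sequence below.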
4283 LoadMI = &*MIRBuilder.buildInstr(Opc, DstOps: {RC}, SrcOps: {}).addConstantPoolIndex(Idx: CPIdx);
4284 } else {
4285 auto Adrp =
4286 MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4287 .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4288
4289 LoadMI = &*MIRBuilder.buildInstr(Opc, {RC}, {Adrp})
4290 .addConstantPoolIndex(
4291 CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4292
4293 constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4294 }
4295
4296 MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4297 LoadMI->addMemOperand(MF, MO: MF.getMachineMemOperand(PtrInfo,
4298 F: MachineMemOperand::MOLoad,
4299 Size, BaseAlignment: Align(Size)));
4300 constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4301 return LoadMI;
4302}
4303
4304/// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4305/// size and RB.
4306static std::pair<unsigned, unsigned>
4307getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4308 unsigned Opc, SubregIdx;
4309 if (RB.getID() == AArch64::GPRRegBankID) {
4310 if (EltSize == 8) {
4311 Opc = AArch64::INSvi8gpr;
4312 SubregIdx = AArch64::bsub;
4313 } else if (EltSize == 16) {
4314 Opc = AArch64::INSvi16gpr;
4315 SubregIdx = AArch64::ssub;
4316 } else if (EltSize == 32) {
4317 Opc = AArch64::INSvi32gpr;
4318 SubregIdx = AArch64::ssub;
4319 } else if (EltSize == 64) {
4320 Opc = AArch64::INSvi64gpr;
4321 SubregIdx = AArch64::dsub;
4322 } else {
4323 llvm_unreachable("invalid elt size!");
4324 }
4325 } else {
4326 if (EltSize == 8) {
4327 Opc = AArch64::INSvi8lane;
4328 SubregIdx = AArch64::bsub;
4329 } else if (EltSize == 16) {
4330 Opc = AArch64::INSvi16lane;
4331 SubregIdx = AArch64::hsub;
4332 } else if (EltSize == 32) {
4333 Opc = AArch64::INSvi32lane;
4334 SubregIdx = AArch64::ssub;
4335 } else if (EltSize == 64) {
4336 Opc = AArch64::INSvi64lane;
4337 SubregIdx = AArch64::dsub;
4338 } else {
4339 llvm_unreachable("invalid elt size!");
4340 }
4341 }
4342 return std::make_pair(x&: Opc, y&: SubregIdx);
4343}
4344
4345MachineInstr *AArch64InstructionSelector::emitInstr(
4346 unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4347 std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4348 const ComplexRendererFns &RenderFns) const {
4349 assert(Opcode && "Expected an opcode?");
4350 assert(!isPreISelGenericOpcode(Opcode) &&
4351 "Function should only be used to produce selected instructions!");
4352 auto MI = MIRBuilder.buildInstr(Opc: Opcode, DstOps, SrcOps);
4353 if (RenderFns)
4354 for (auto &Fn : *RenderFns)
4355 Fn(MI);
4356 constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4357 return &*MI;
4358}
4359
4360MachineInstr *AArch64InstructionSelector::emitAddSub(
4361 const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4362 Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4363 MachineIRBuilder &MIRBuilder) const {
4364 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4365 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4366 auto Ty = MRI.getType(Reg: LHS.getReg());
4367 assert(!Ty.isVector() && "Expected a scalar or pointer?");
4368 unsigned Size = Ty.getSizeInBits();
4369 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4370 bool Is32Bit = Size == 32;
4371
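  // AddrModeAndSizeToOpcode rows: [0] = ri, [1] = rs, [2] = rr, [3] = ri with
  // a negated immediate, [4] = rx; column 0 is the 64-bit opcode and column 1
  // the 32-bit one.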
4372 // INSTRri form with positive arithmetic immediate.
4373 if (auto Fns = selectArithImmed(Root&: RHS))
4374 return emitInstr(Opcode: AddrModeAndSizeToOpcode[0][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4375 MIRBuilder, RenderFns: Fns);
4376
4377 // INSTRri form with negative arithmetic immediate.
4378 if (auto Fns = selectNegArithImmed(Root&: RHS))
4379 return emitInstr(Opcode: AddrModeAndSizeToOpcode[3][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4380 MIRBuilder, RenderFns: Fns);
4381
4382 // INSTRrx form.
4383 if (auto Fns = selectArithExtendedRegister(Root&: RHS))
4384 return emitInstr(Opcode: AddrModeAndSizeToOpcode[4][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4385 MIRBuilder, RenderFns: Fns);
4386
4387 // INSTRrs form.
4388 if (auto Fns = selectShiftedRegister(Root&: RHS))
4389 return emitInstr(Opcode: AddrModeAndSizeToOpcode[1][Is32Bit], DstOps: {Dst}, SrcOps: {LHS},
4390 MIRBuilder, RenderFns: Fns);
4391 return emitInstr(Opcode: AddrModeAndSizeToOpcode[2][Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS},
4392 MIRBuilder);
4393}
4394
4395MachineInstr *
4396AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4397 MachineOperand &RHS,
4398 MachineIRBuilder &MIRBuilder) const {
4399 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4400 {{AArch64::ADDXri, AArch64::ADDWri},
4401 {AArch64::ADDXrs, AArch64::ADDWrs},
4402 {AArch64::ADDXrr, AArch64::ADDWrr},
4403 {AArch64::SUBXri, AArch64::SUBWri},
4404 {AArch64::ADDXrx, AArch64::ADDWrx}}};
4405 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst: DefReg, LHS, RHS, MIRBuilder);
4406}
4407
4408MachineInstr *
4409AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4410 MachineOperand &RHS,
4411 MachineIRBuilder &MIRBuilder) const {
4412 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4413 {{AArch64::ADDSXri, AArch64::ADDSWri},
4414 {AArch64::ADDSXrs, AArch64::ADDSWrs},
4415 {AArch64::ADDSXrr, AArch64::ADDSWrr},
4416 {AArch64::SUBSXri, AArch64::SUBSWri},
4417 {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4418 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4419}
4420
4421MachineInstr *
4422AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4423 MachineOperand &RHS,
4424 MachineIRBuilder &MIRBuilder) const {
4425 const std::array<std::array<unsigned, 2>, 5> OpcTable{
4426 {{AArch64::SUBSXri, AArch64::SUBSWri},
4427 {AArch64::SUBSXrs, AArch64::SUBSWrs},
4428 {AArch64::SUBSXrr, AArch64::SUBSWrr},
4429 {AArch64::ADDSXri, AArch64::ADDSWri},
4430 {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4431 return emitAddSub(AddrModeAndSizeToOpcode: OpcTable, Dst, LHS, RHS, MIRBuilder);
4432}
4433
4434MachineInstr *
4435AArch64InstructionSelector::emitADCS(Register Dst, MachineOperand &LHS,
4436 MachineOperand &RHS,
4437 MachineIRBuilder &MIRBuilder) const {
4438 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4439 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4440 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4441 static const unsigned OpcTable[2] = {AArch64::ADCSXr, AArch64::ADCSWr};
4442 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4443}
4444
4445MachineInstr *
4446AArch64InstructionSelector::emitSBCS(Register Dst, MachineOperand &LHS,
4447 MachineOperand &RHS,
4448 MachineIRBuilder &MIRBuilder) const {
4449 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4450 MachineRegisterInfo *MRI = MIRBuilder.getMRI();
4451 bool Is32Bit = (MRI->getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4452 static const unsigned OpcTable[2] = {AArch64::SBCSXr, AArch64::SBCSWr};
4453 return emitInstr(Opcode: OpcTable[Is32Bit], DstOps: {Dst}, SrcOps: {LHS, RHS}, MIRBuilder);
4454}
4455
4456MachineInstr *
4457AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4458 MachineIRBuilder &MIRBuilder) const {
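// CMN is just ADDS with the result discarded, so emit an ADDS into a fresh
// scratch register of the right width; only the NZCV result is used.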
4459 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4460 bool Is32Bit = (MRI.getType(Reg: LHS.getReg()).getSizeInBits() == 32);
4461 auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4462 return emitADDS(Dst: MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4463}
4464
4465MachineInstr *
4466AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4467 MachineIRBuilder &MIRBuilder) const {
4468 assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4469 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4470 LLT Ty = MRI.getType(Reg: LHS.getReg());
4471 unsigned RegSize = Ty.getSizeInBits();
4472 bool Is32Bit = (RegSize == 32);
4473 const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4474 {AArch64::ANDSXrs, AArch64::ANDSWrs},
4475 {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4476 // ANDS needs a logical immediate for its immediate form. Check if we can
4477 // fold one in.
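// For example (illustrative), an RHS defined by G_CONSTANT 0xFF on a 32-bit
// value encodes as a logical immediate and selects ANDSWri.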
4478 if (auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI)) {
4479 int64_t Imm = ValAndVReg->Value.getSExtValue();
4480
4481 if (AArch64_AM::isLogicalImmediate(imm: Imm, regSize: RegSize)) {
4482 auto TstMI = MIRBuilder.buildInstr(Opc: OpcTable[0][Is32Bit], DstOps: {Ty}, SrcOps: {LHS});
4483 TstMI.addImm(Val: AArch64_AM::encodeLogicalImmediate(imm: Imm, regSize: RegSize));
4484 constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4485 return &*TstMI;
4486 }
4487 }
4488
4489 if (auto Fns = selectLogicalShiftedRegister(Root&: RHS))
4490 return emitInstr(Opcode: OpcTable[1][Is32Bit], DstOps: {Ty}, SrcOps: {LHS}, MIRBuilder, RenderFns: Fns);
4491 return emitInstr(Opcode: OpcTable[2][Is32Bit], DstOps: {Ty}, SrcOps: {LHS, RHS}, MIRBuilder);
4492}
4493
4494MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4495 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4496 MachineIRBuilder &MIRBuilder) const {
4497 assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4498 assert(Predicate.isPredicate() && "Expected predicate?");
4499 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4500 LLT CmpTy = MRI.getType(Reg: LHS.getReg());
4501 assert(!CmpTy.isVector() && "Expected scalar or pointer");
4502 unsigned Size = CmpTy.getSizeInBits();
4503 (void)Size;
4504 assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4505 // Fold the compare into a cmn or tst if possible.
4506 if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4507 return FoldCmp;
4508 auto Dst = MRI.cloneVirtualRegister(VReg: LHS.getReg());
4509 return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4510}
4511
4512MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4513 Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4514 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4515#ifndef NDEBUG
4516 LLT Ty = MRI.getType(Reg: Dst);
4517 assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4518 "Expected a 32-bit scalar register?");
4519#endif
4520 const Register ZReg = AArch64::WZR;
4521 AArch64CC::CondCode CC1, CC2;
4522 changeFCMPPredToAArch64CC(P: Pred, CondCode&: CC1, CondCode2&: CC2);
4523 auto InvCC1 = AArch64CC::getInvertedCondCode(Code: CC1);
4524 if (CC2 == AArch64CC::AL)
4525 return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1,
4526 MIRBuilder);
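// Otherwise the predicate needs two condition codes (e.g. FCMP_UEQ is
// satisfied by EQ || VS), so materialize each with a CSINC and OR the
// results together.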
4527 const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4528 Register Def1Reg = MRI.createVirtualRegister(RegClass: RC);
4529 Register Def2Reg = MRI.createVirtualRegister(RegClass: RC);
4530 auto InvCC2 = AArch64CC::getInvertedCondCode(Code: CC2);
4531 emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC1, MIRBuilder);
4532 emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, Pred: InvCC2, MIRBuilder);
4533 auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4534 constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4535 return &*OrMI;
4536}
4537
4538MachineInstr *AArch64InstructionSelector::emitFPCompare(
4539 Register LHS, Register RHS, MachineIRBuilder &MIRBuilder,
4540 std::optional<CmpInst::Predicate> Pred) const {
4541 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4542 LLT Ty = MRI.getType(Reg: LHS);
4543 if (Ty.isVector())
4544 return nullptr;
4545 unsigned OpSize = Ty.getSizeInBits();
4546 assert(OpSize == 16 || OpSize == 32 || OpSize == 64);
4547
4548 // If this is a compare against +0.0, then we don't have
4549 // to explicitly materialize a constant.
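// e.g. a 32-bit compare against +0.0 can select FCMPSri, which compares
// directly against #0.0 instead of a materialized register.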
4550 const ConstantFP *FPImm = getConstantFPVRegVal(VReg: RHS, MRI);
4551 bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4552
4553 auto IsEqualityPred = [](CmpInst::Predicate P) {
4554 return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4555 P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4556 };
4557 if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4558 // Try commuting the operands.
4559 const ConstantFP *LHSImm = getConstantFPVRegVal(VReg: LHS, MRI);
4560 if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4561 ShouldUseImm = true;
4562 std::swap(a&: LHS, b&: RHS);
4563 }
4564 }
4565 unsigned CmpOpcTbl[2][3] = {
4566 {AArch64::FCMPHrr, AArch64::FCMPSrr, AArch64::FCMPDrr},
4567 {AArch64::FCMPHri, AArch64::FCMPSri, AArch64::FCMPDri}};
4568 unsigned CmpOpc =
4569 CmpOpcTbl[ShouldUseImm][OpSize == 16 ? 0 : (OpSize == 32 ? 1 : 2)];
4570
4571 // Partially build the compare. Decide if we need to add a use for the
4572 // third operand based on whether we're comparing against 0.0.
4573 auto CmpMI = MIRBuilder.buildInstr(Opcode: CmpOpc).addUse(RegNo: LHS);
4574 CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4575 if (!ShouldUseImm)
4576 CmpMI.addUse(RegNo: RHS);
4577 constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4578 return &*CmpMI;
4579}
4580
4581MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4582 std::optional<Register> Dst, Register Op1, Register Op2,
4583 MachineIRBuilder &MIRBuilder) const {
4584 // We implement a vector concat by:
4585 // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4586 // 2. Insert the upper vector into the destination's upper element
4587 // TODO: some of this code is common with G_BUILD_VECTOR handling.
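// e.g. concatenating two <2 x s32> operands widens each into a 128-bit
// register and inserts element 0 of the widened second operand into lane 1
// of the widened first, producing the <4 x s32> result.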
4588 MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4589
4590 const LLT Op1Ty = MRI.getType(Reg: Op1);
4591 const LLT Op2Ty = MRI.getType(Reg: Op2);
4592
4593 if (Op1Ty != Op2Ty) {
4594 LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4595 return nullptr;
4596 }
4597 assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4598
4599 if (Op1Ty.getSizeInBits() >= 128) {
4600 LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4601 return nullptr;
4602 }
4603
4604 // At the moment we only support 64-bit vector concats.
4605 if (Op1Ty.getSizeInBits() != 64) {
4606 LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4607 return nullptr;
4608 }
4609
4610 const LLT ScalarTy = LLT::scalar(SizeInBits: Op1Ty.getSizeInBits());
4611 const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4612 const TargetRegisterClass *DstRC =
4613 getRegClassForTypeOnBank(Ty: Op1Ty.multiplyElements(Factor: 2), RB: FPRBank);
4614
4615 MachineInstr *WidenedOp1 =
4616 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op1, MIRBuilder);
4617 MachineInstr *WidenedOp2 =
4618 emitScalarToVector(EltSize: ScalarTy.getSizeInBits(), DstRC, Scalar: Op2, MIRBuilder);
4619 if (!WidenedOp1 || !WidenedOp2) {
4620 LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4621 return nullptr;
4622 }
4623
4624 // Now do the insert of the upper element.
4625 unsigned InsertOpc, InsSubRegIdx;
4626 std::tie(args&: InsertOpc, args&: InsSubRegIdx) =
4627 getInsertVecEltOpInfo(RB: FPRBank, EltSize: ScalarTy.getSizeInBits());
4628
4629 if (!Dst)
4630 Dst = MRI.createVirtualRegister(RegClass: DstRC);
4631 auto InsElt =
4632 MIRBuilder
4633 .buildInstr(Opc: InsertOpc, DstOps: {*Dst}, SrcOps: {WidenedOp1->getOperand(i: 0).getReg()})
4634 .addImm(Val: 1) /* Lane index */
4635 .addUse(RegNo: WidenedOp2->getOperand(i: 0).getReg())
4636 .addImm(Val: 0);
4637 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4638 return &*InsElt;
4639}
4640
4641MachineInstr *
4642AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4643 Register Src2, AArch64CC::CondCode Pred,
4644 MachineIRBuilder &MIRBuilder) const {
4645 auto &MRI = *MIRBuilder.getMRI();
4646 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg: Dst);
4647 // If we used a register class, then this won't necessarily have an LLT.
4648 // Compute the size based on whether we have a class or a bank.
4649 unsigned Size;
4650 if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4651 Size = TRI.getRegSizeInBits(*RC);
4652 else
4653 Size = MRI.getType(Reg: Dst).getSizeInBits();
4654 // Some opcodes use s1.
4655 assert(Size <= 64 && "Expected 64 bits or less only!");
4656 static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4657 unsigned Opc = OpcTable[Size == 64];
4658 auto CSINC = MIRBuilder.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Src1, Src2}).addImm(Val: Pred);
4659 constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4660 return &*CSINC;
4661}
4662
4663MachineInstr *AArch64InstructionSelector::emitCarryIn(MachineInstr &I,
4664 Register CarryReg) {
4665 MachineRegisterInfo *MRI = MIB.getMRI();
4666 unsigned Opcode = I.getOpcode();
4667
4668 // If the instruction is a SUB, we need to negate the carry,
4669 // because borrowing is indicated by carry-flag == 0.
4670 bool NeedsNegatedCarry =
4671 (Opcode == TargetOpcode::G_USUBE || Opcode == TargetOpcode::G_SSUBE);
4672
4673 // If the previous instruction will already produce the correct carry, do not
4674 // emit a carry generating instruction. E.g. for G_UADDE/G_USUBE sequences
4675 // generated during legalization of wide add/sub. This optimization depends on
4676 // these sequences not being interrupted by other instructions.
4677 // We have to select the previous instruction before the carry-using
4678 // instruction is deleted by the calling function, otherwise the previous
4679 // instruction might become dead and would get deleted.
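// e.g. for a 128-bit add legalized to G_UADDO + G_UADDE, the ADDS selected
// for the G_UADDO already leaves the expected carry in NZCV, so no extra
// carry-setting instruction is needed here.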
4680 MachineInstr *SrcMI = MRI->getVRegDef(Reg: CarryReg);
4681 if (SrcMI == I.getPrevNode()) {
4682 if (auto *CarrySrcMI = dyn_cast<GAddSubCarryOut>(Val: SrcMI)) {
4683 bool ProducesNegatedCarry = CarrySrcMI->isSub();
4684 if (NeedsNegatedCarry == ProducesNegatedCarry &&
4685 CarrySrcMI->isUnsigned() &&
4686 CarrySrcMI->getCarryOutReg() == CarryReg &&
4687 selectAndRestoreState(I&: *SrcMI))
4688 return nullptr;
4689 }
4690 }
4691
4692 Register DeadReg = MRI->createVirtualRegister(&AArch64::GPR32RegClass);
4693
4694 if (NeedsNegatedCarry) {
4695 // (0 - Carry) sets !C in NZCV when Carry == 1
4696 Register ZReg = AArch64::WZR;
4697 return emitInstr(AArch64::SUBSWrr, {DeadReg}, {ZReg, CarryReg}, MIB);
4698 }
4699
4700 // (Carry - 1) sets !C in NZCV when Carry == 0
4701 auto Fns = select12BitValueWithLeftShift(Immed: 1);
4702 return emitInstr(AArch64::SUBSWri, {DeadReg}, {CarryReg}, MIB, Fns);
4703}
4704
4705bool AArch64InstructionSelector::selectOverflowOp(MachineInstr &I,
4706 MachineRegisterInfo &MRI) {
4707 auto &CarryMI = cast<GAddSubCarryOut>(Val&: I);
4708
4709 if (auto *CarryInMI = dyn_cast<GAddSubCarryInOut>(Val: &I)) {
4710 // Set NZCV carry according to carry-in VReg
4711 emitCarryIn(I, CarryReg: CarryInMI->getCarryInReg());
4712 }
4713
4714 // Emit the operation and get the correct condition code.
4715 auto OpAndCC = emitOverflowOp(Opcode: I.getOpcode(), Dst: CarryMI.getDstReg(),
4716 LHS&: CarryMI.getLHS(), RHS&: CarryMI.getRHS(), MIRBuilder&: MIB);
4717
4718 Register CarryOutReg = CarryMI.getCarryOutReg();
4719
4720 // Don't convert carry-out to VReg if it is never used
4721 if (!MRI.use_nodbg_empty(RegNo: CarryOutReg)) {
4722 // Now, put the overflow result in the register given by the first operand
4723 // to the overflow op. CSINC increments the result when the predicate is
4724 // false, so to get the increment when it's true, we need to use the
4725 // inverse. In this case, we want to increment when carry is set.
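// e.g. for G_UADDO (condition HS) this emits CSINC %carry, wzr, wzr, lo,
// which is equivalent to CSET %carry, hs.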
4726 Register ZReg = AArch64::WZR;
4727 emitCSINC(/*Dst=*/CarryOutReg, /*Src1=*/ZReg, /*Src2=*/ZReg,
4728 Pred: getInvertedCondCode(Code: OpAndCC.second), MIRBuilder&: MIB);
4729 }
4730
4731 I.eraseFromParent();
4732 return true;
4733}
4734
4735std::pair<MachineInstr *, AArch64CC::CondCode>
4736AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4737 MachineOperand &LHS,
4738 MachineOperand &RHS,
4739 MachineIRBuilder &MIRBuilder) const {
4740 switch (Opcode) {
4741 default:
4742 llvm_unreachable("Unexpected opcode!");
4743 case TargetOpcode::G_SADDO:
4744 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4745 case TargetOpcode::G_UADDO:
4746 return std::make_pair(x: emitADDS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4747 case TargetOpcode::G_SSUBO:
4748 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4749 case TargetOpcode::G_USUBO:
4750 return std::make_pair(x: emitSUBS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4751 case TargetOpcode::G_SADDE:
4752 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4753 case TargetOpcode::G_UADDE:
4754 return std::make_pair(x: emitADCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::HS);
4755 case TargetOpcode::G_SSUBE:
4756 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::VS);
4757 case TargetOpcode::G_USUBE:
4758 return std::make_pair(x: emitSBCS(Dst, LHS, RHS, MIRBuilder), y: AArch64CC::LO);
4759 }
4760}
4761
4762/// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4763/// expressed as a conjunction.
4764/// \param CanNegate Set to true if we can negate the whole sub-tree just by
4765/// changing the conditions on the CMP tests.
4766/// (this means we can call emitConjunctionRec() with
4767/// Negate==true on this sub-tree)
4768/// \param MustBeFirst Set to true if this subtree needs to be negated and we
4769/// cannot do the negation naturally. We are required to
4770/// emit the subtree first in this case.
4771 /// \param WillNegate Is true if we are called when the result of this
4772/// subexpression must be negated. This happens when the
4773/// outer expression is an OR. We can use this fact to know
4774/// that we have a double negation (or (or ...) ...) that
4775/// can be implemented for free.
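/// For example, a chain like (icmp eq a, b) && (icmp slt c, d) can be lowered
/// as a SUBS followed by a CCMP, with the final answer read out of NZCV, so
/// neither i1 value needs to be materialized in a register.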
4776static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4777 bool WillNegate, MachineRegisterInfo &MRI,
4778 unsigned Depth = 0) {
4779 if (!MRI.hasOneNonDBGUse(RegNo: Val))
4780 return false;
4781 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4782 unsigned Opcode = ValDef->getOpcode();
4783 if (isa<GAnyCmp>(Val: ValDef)) {
4784 CanNegate = true;
4785 MustBeFirst = false;
4786 return true;
4787 }
4788 // Protect against exponential runtime and stack overflow.
4789 if (Depth > 6)
4790 return false;
4791 if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4792 bool IsOR = Opcode == TargetOpcode::G_OR;
4793 Register O0 = ValDef->getOperand(i: 1).getReg();
4794 Register O1 = ValDef->getOperand(i: 2).getReg();
4795 bool CanNegateL;
4796 bool MustBeFirstL;
4797 if (!canEmitConjunction(Val: O0, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI, Depth: Depth + 1))
4798 return false;
4799 bool CanNegateR;
4800 bool MustBeFirstR;
4801 if (!canEmitConjunction(Val: O1, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI, Depth: Depth + 1))
4802 return false;
4803
4804 if (MustBeFirstL && MustBeFirstR)
4805 return false;
4806
4807 if (IsOR) {
4808 // For an OR expression we need to be able to naturally negate at least
4809 // one side or we cannot do the transformation at all.
4810 if (!CanNegateL && !CanNegateR)
4811 return false;
4812 // If the result of the OR will be negated and we can naturally negate
4813 // the leaves, then this sub-tree as a whole negates naturally.
4814 CanNegate = WillNegate && CanNegateL && CanNegateR;
4815 // If we cannot naturally negate the whole sub-tree, then this must be
4816 // emitted first.
4817 MustBeFirst = !CanNegate;
4818 } else {
4819 assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4820 // We cannot naturally negate an AND operation.
4821 CanNegate = false;
4822 MustBeFirst = MustBeFirstL || MustBeFirstR;
4823 }
4824 return true;
4825 }
4826 return false;
4827}
4828
4829MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4830 Register LHS, Register RHS, CmpInst::Predicate CC,
4831 AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4832 MachineIRBuilder &MIB) const {
4833 // TODO: emit CMN as an optimization.
4834 auto &MRI = *MIB.getMRI();
4835 LLT OpTy = MRI.getType(Reg: LHS);
4836 unsigned CCmpOpc;
4837 std::optional<ValueAndVReg> C;
4838 if (CmpInst::isIntPredicate(P: CC)) {
4839 assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4840 C = getIConstantVRegValWithLookThrough(VReg: RHS, MRI);
4841 if (C && C->Value.ult(32))
4842 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWi : AArch64::CCMPXi;
4843 else
4844 CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4845 } else {
4846 assert(OpTy.getSizeInBits() == 16 || OpTy.getSizeInBits() == 32 ||
4847 OpTy.getSizeInBits() == 64);
4848 switch (OpTy.getSizeInBits()) {
4849 case 16:
4850 assert(STI.hasFullFP16() && "Expected Full FP16 for fp16 comparisons");
4851 CCmpOpc = AArch64::FCCMPHrr;
4852 break;
4853 case 32:
4854 CCmpOpc = AArch64::FCCMPSrr;
4855 break;
4856 case 64:
4857 CCmpOpc = AArch64::FCCMPDrr;
4858 break;
4859 default:
4860 return nullptr;
4861 }
4862 }
4863 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4864 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(Code: InvOutCC);
4865 auto CCmp =
4866 MIB.buildInstr(Opc: CCmpOpc, DstOps: {}, SrcOps: {LHS});
4867 if (CCmpOpc == AArch64::CCMPWi || CCmpOpc == AArch64::CCMPXi)
4868 CCmp.addImm(Val: C->Value.getZExtValue());
4869 else
4870 CCmp.addReg(RegNo: RHS);
4871 CCmp.addImm(Val: NZCV).addImm(Val: Predicate);
4872 constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4873 return &*CCmp;
4874}
4875
4876MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4877 Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4878 AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4879 // We're at a tree leaf, produce a conditional comparison operation.
4880 auto &MRI = *MIB.getMRI();
4881 MachineInstr *ValDef = MRI.getVRegDef(Reg: Val);
4882 unsigned Opcode = ValDef->getOpcode();
4883 if (auto *Cmp = dyn_cast<GAnyCmp>(Val: ValDef)) {
4884 Register LHS = Cmp->getLHSReg();
4885 Register RHS = Cmp->getRHSReg();
4886 CmpInst::Predicate CC = Cmp->getCond();
4887 if (Negate)
4888 CC = CmpInst::getInversePredicate(pred: CC);
4889 if (isa<GICmp>(Val: Cmp)) {
4890 OutCC = changeICMPPredToAArch64CC(P: CC);
4891 } else {
4892 // Handle special FP cases.
4893 AArch64CC::CondCode ExtraCC;
4894 changeFPCCToANDAArch64CC(CC, CondCode&: OutCC, CondCode2&: ExtraCC);
4895 // Some floating point conditions can't be tested with a single condition
4896 // code. Construct an additional comparison in this case.
4897 if (ExtraCC != AArch64CC::AL) {
4898 MachineInstr *ExtraCmp;
4899 if (!CCOp)
4900 ExtraCmp = emitFPCompare(LHS, RHS, MIRBuilder&: MIB, Pred: CC);
4901 else
4902 ExtraCmp =
4903 emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC: ExtraCC, MIB);
4904 CCOp = ExtraCmp->getOperand(i: 0).getReg();
4905 Predicate = ExtraCC;
4906 }
4907 }
4908
4909 // Produce a normal comparison if we are first in the chain
4910 if (!CCOp) {
4911 auto Dst = MRI.cloneVirtualRegister(VReg: LHS);
4912 if (isa<GICmp>(Val: Cmp))
4913 return emitSUBS(Dst, LHS&: Cmp->getOperand(i: 2), RHS&: Cmp->getOperand(i: 3), MIRBuilder&: MIB);
4914 return emitFPCompare(LHS: Cmp->getOperand(i: 2).getReg(),
4915 RHS: Cmp->getOperand(i: 3).getReg(), MIRBuilder&: MIB);
4916 }
4917 // Otherwise produce a ccmp.
4918 return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4919 }
4920 assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4921
4922 bool IsOR = Opcode == TargetOpcode::G_OR;
4923
4924 Register LHS = ValDef->getOperand(i: 1).getReg();
4925 bool CanNegateL;
4926 bool MustBeFirstL;
4927 bool ValidL = canEmitConjunction(Val: LHS, CanNegate&: CanNegateL, MustBeFirst&: MustBeFirstL, WillNegate: IsOR, MRI);
4928 assert(ValidL && "Valid conjunction/disjunction tree");
4929 (void)ValidL;
4930
4931 Register RHS = ValDef->getOperand(i: 2).getReg();
4932 bool CanNegateR;
4933 bool MustBeFirstR;
4934 bool ValidR = canEmitConjunction(Val: RHS, CanNegate&: CanNegateR, MustBeFirst&: MustBeFirstR, WillNegate: IsOR, MRI);
4935 assert(ValidR && "Valid conjunction/disjunction tree");
4936 (void)ValidR;
4937
4938 // Swap sub-tree that must come first to the right side.
4939 if (MustBeFirstL) {
4940 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4941 std::swap(a&: LHS, b&: RHS);
4942 std::swap(a&: CanNegateL, b&: CanNegateR);
4943 std::swap(a&: MustBeFirstL, b&: MustBeFirstR);
4944 }
4945
4946 bool NegateR;
4947 bool NegateAfterR;
4948 bool NegateL;
4949 bool NegateAfterAll;
4950 if (Opcode == TargetOpcode::G_OR) {
4951 // Swap the sub-tree that we can negate naturally to the left.
4952 if (!CanNegateL) {
4953 assert(CanNegateR && "at least one side must be negatable");
4954 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4955 assert(!Negate);
4956 std::swap(a&: LHS, b&: RHS);
4957 NegateR = false;
4958 NegateAfterR = true;
4959 } else {
4960 // Negate the left sub-tree if possible, otherwise negate the result.
4961 NegateR = CanNegateR;
4962 NegateAfterR = !CanNegateR;
4963 }
4964 NegateL = true;
4965 NegateAfterAll = !Negate;
4966 } else {
4967 assert(Opcode == TargetOpcode::G_AND &&
4968 "Valid conjunction/disjunction tree");
4969 assert(!Negate && "Valid conjunction/disjunction tree");
4970
4971 NegateL = false;
4972 NegateR = false;
4973 NegateAfterR = false;
4974 NegateAfterAll = false;
4975 }
4976
4977 // Emit sub-trees.
4978 AArch64CC::CondCode RHSCC;
4979 MachineInstr *CmpR =
4980 emitConjunctionRec(Val: RHS, OutCC&: RHSCC, Negate: NegateR, CCOp, Predicate, MIB);
4981 if (NegateAfterR)
4982 RHSCC = AArch64CC::getInvertedCondCode(Code: RHSCC);
4983 MachineInstr *CmpL = emitConjunctionRec(
4984 Val: LHS, OutCC, Negate: NegateL, CCOp: CmpR->getOperand(i: 0).getReg(), Predicate: RHSCC, MIB);
4985 if (NegateAfterAll)
4986 OutCC = AArch64CC::getInvertedCondCode(Code: OutCC);
4987 return CmpL;
4988}
4989
4990MachineInstr *AArch64InstructionSelector::emitConjunction(
4991 Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4992 bool DummyCanNegate;
4993 bool DummyMustBeFirst;
4994 if (!canEmitConjunction(Val, CanNegate&: DummyCanNegate, MustBeFirst&: DummyMustBeFirst, WillNegate: false,
4995 MRI&: *MIB.getMRI()))
4996 return nullptr;
4997 return emitConjunctionRec(Val, OutCC, Negate: false, CCOp: Register(), Predicate: AArch64CC::AL, MIB);
4998}
4999
5000bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
5001 MachineInstr &CondMI) {
5002 AArch64CC::CondCode AArch64CC;
5003 MachineInstr *ConjMI = emitConjunction(Val: SelI.getCondReg(), OutCC&: AArch64CC, MIB);
5004 if (!ConjMI)
5005 return false;
5006
5007 emitSelect(Dst: SelI.getReg(Idx: 0), True: SelI.getTrueReg(), False: SelI.getFalseReg(), CC: AArch64CC, MIB);
5008 SelI.eraseFromParent();
5009 return true;
5010}
5011
5012bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
5013 MachineRegisterInfo &MRI = *MIB.getMRI();
5014 // We want to recognize this pattern:
5015 //
5016 // $z = G_FCMP pred, $x, $y
5017 // ...
5018 // $w = G_SELECT $z, $a, $b
5019 //
5020 // Where the value of $z is *only* ever used by the G_SELECT (possibly with
5021 // some copies/truncs in between).
5022 //
5023 // If we see this, then we can emit something like this:
5024 //
5025 // fcmp $x, $y
5026 // fcsel $w, $a, $b, pred
5027 //
5028 // Rather than emitting both of the rather long sequences in the standard
5029 // G_FCMP/G_SELECT select methods.
5030
5031 // First, check if the condition is defined by a compare.
5032 MachineInstr *CondDef = MRI.getVRegDef(Reg: I.getOperand(i: 1).getReg());
5033
5034 // We can only fold if all of the defs have one use.
5035 Register CondDefReg = CondDef->getOperand(i: 0).getReg();
5036 if (!MRI.hasOneNonDBGUse(RegNo: CondDefReg)) {
5037 // Unless it's another select.
5038 for (const MachineInstr &UI : MRI.use_nodbg_instructions(Reg: CondDefReg)) {
5039 if (CondDef == &UI)
5040 continue;
5041 if (UI.getOpcode() != TargetOpcode::G_SELECT)
5042 return false;
5043 }
5044 }
5045
5046 // Is the condition defined by a compare?
5047 unsigned CondOpc = CondDef->getOpcode();
5048 if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5049 if (tryOptSelectConjunction(SelI&: I, CondMI&: *CondDef))
5050 return true;
5051 return false;
5052 }
5053
5054 AArch64CC::CondCode CondCode;
5055 if (CondOpc == TargetOpcode::G_ICMP) {
5056 auto Pred =
5057 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5058 CondCode = changeICMPPredToAArch64CC(P: Pred);
5059 emitIntegerCompare(LHS&: CondDef->getOperand(i: 2), RHS&: CondDef->getOperand(i: 3),
5060 Predicate&: CondDef->getOperand(i: 1), MIRBuilder&: MIB);
5061 } else {
5062 // Get the condition code for the select.
5063 auto Pred =
5064 static_cast<CmpInst::Predicate>(CondDef->getOperand(i: 1).getPredicate());
5065 AArch64CC::CondCode CondCode2;
5066 changeFCMPPredToAArch64CC(P: Pred, CondCode, CondCode2);
5067
5068 // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5069 // instructions to emit the comparison.
5070 // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5071 // unnecessary.
5072 if (CondCode2 != AArch64CC::AL)
5073 return false;
5074
5075 if (!emitFPCompare(LHS: CondDef->getOperand(i: 2).getReg(),
5076 RHS: CondDef->getOperand(i: 3).getReg(), MIRBuilder&: MIB)) {
5077 LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5078 return false;
5079 }
5080 }
5081
5082 // Emit the select.
5083 emitSelect(Dst: I.getOperand(i: 0).getReg(), True: I.getOperand(i: 2).getReg(),
5084 False: I.getOperand(i: 3).getReg(), CC: CondCode, MIB);
5085 I.eraseFromParent();
5086 return true;
5087}
5088
5089MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5090 MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5091 MachineIRBuilder &MIRBuilder) const {
5092 assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5093 "Unexpected MachineOperand");
5094 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5095 // We want to find this sort of thing:
5096 // x = G_SUB 0, y
5097 // G_ICMP z, x
5098 //
5099 // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5100 // e.g:
5101 //
5102 // cmn z, y
5103
5104 // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5105 MachineInstr *LHSDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
5106 MachineInstr *RHSDef = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
5107 auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5108 // Given this:
5109 //
5110 // x = G_SUB 0, y
5111 // G_ICMP x, z
5112 //
5113 // Produce this:
5114 //
5115 // cmn y, z
5116 if (isCMN(MaybeSub: LHSDef, Pred: P, MRI))
5117 return emitCMN(LHS&: LHSDef->getOperand(i: 2), RHS, MIRBuilder);
5118
5119 // Same idea here, but with the RHS of the compare instead:
5120 //
5121 // Given this:
5122 //
5123 // x = G_SUB 0, y
5124 // G_ICMP z, x
5125 //
5126 // Produce this:
5127 //
5128 // cmn z, y
5129 if (isCMN(MaybeSub: RHSDef, Pred: P, MRI))
5130 return emitCMN(LHS, RHS&: RHSDef->getOperand(i: 2), MIRBuilder);
5131
5132 // Given this:
5133 //
5134 // z = G_AND x, y
5135 // G_ICMP z, 0
5136 //
5137 // Produce this if the compare is signed:
5138 //
5139 // tst x, y
5140 if (!CmpInst::isUnsigned(predicate: P) && LHSDef &&
5141 LHSDef->getOpcode() == TargetOpcode::G_AND) {
5142 // Make sure that the RHS is 0.
5143 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: RHS.getReg(), MRI);
5144 if (!ValAndVReg || ValAndVReg->Value != 0)
5145 return nullptr;
5146
5147 return emitTST(LHS&: LHSDef->getOperand(i: 1),
5148 RHS&: LHSDef->getOperand(i: 2), MIRBuilder);
5149 }
5150
5151 return nullptr;
5152}
5153
5154bool AArch64InstructionSelector::selectShuffleVector(
5155 MachineInstr &I, MachineRegisterInfo &MRI) {
5156 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5157 Register Src1Reg = I.getOperand(i: 1).getReg();
5158 const LLT Src1Ty = MRI.getType(Reg: Src1Reg);
5159 Register Src2Reg = I.getOperand(i: 2).getReg();
5160 const LLT Src2Ty = MRI.getType(Reg: Src2Reg);
5161 ArrayRef<int> Mask = I.getOperand(i: 3).getShuffleMask();
5162
5163 MachineBasicBlock &MBB = *I.getParent();
5164 MachineFunction &MF = *MBB.getParent();
5165 LLVMContext &Ctx = MF.getFunction().getContext();
5166
5167 // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5168 // it's originated from a <1 x T> type. Those should have been lowered into
5169 // G_BUILD_VECTOR earlier.
5170 if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5171 LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5172 return false;
5173 }
5174
5175 unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5176
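// Build one index byte per destination byte; e.g. for an <8 x s8> shuffle,
// mask element 8 becomes byte offset 8 into the concatenated source that
// TBL reads from.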
5177 SmallVector<Constant *, 64> CstIdxs;
5178 for (int Val : Mask) {
5179 // For now, we'll just assume any undef indexes are 0. This should be
5180 // optimized in the future, e.g. to select DUP etc.
5181 Val = Val < 0 ? 0 : Val;
5182 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5183 unsigned Offset = Byte + Val * BytesPerElt;
5184 CstIdxs.emplace_back(Args: ConstantInt::get(Ty: Type::getInt8Ty(C&: Ctx), V: Offset));
5185 }
5186 }
5187
5188 // Use a constant pool to load the index vector for TBL.
5189 Constant *CPVal = ConstantVector::get(V: CstIdxs);
5190 MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder&: MIB);
5191 if (!IndexLoad) {
5192 LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5193 return false;
5194 }
5195
5196 if (DstTy.getSizeInBits() != 128) {
5197 assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5198 // This case can be done with TBL1.
5199 MachineInstr *Concat =
5200 emitVectorConcat(Dst: std::nullopt, Op1: Src1Reg, Op2: Src2Reg, MIRBuilder&: MIB);
5201 if (!Concat) {
5202 LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5203 return false;
5204 }
5205
5206 // The constant pool load will be 64 bits, so we need to convert to an FPR128 reg.
5207 IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5208 IndexLoad->getOperand(0).getReg(), MIB);
5209
5210 auto TBL1 = MIB.buildInstr(
5211 AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5212 {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5213 constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5214
5215 auto Copy =
5216 MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5217 .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5218 RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5219 I.eraseFromParent();
5220 return true;
5221 }
5222
5223 // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5224 // Q registers for regalloc.
5225 SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5226 auto RegSeq = createQTuple(Regs, MIB);
5227 auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5228 {RegSeq, IndexLoad->getOperand(0)});
5229 constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5230 I.eraseFromParent();
5231 return true;
5232}
5233
5234MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5235 std::optional<Register> DstReg, Register SrcReg, Register EltReg,
5236 unsigned LaneIdx, const RegisterBank &RB,
5237 MachineIRBuilder &MIRBuilder) const {
5238 MachineInstr *InsElt = nullptr;
5239 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5240 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5241
5242 // Create a register to define with the insert if one wasn't passed in.
5243 if (!DstReg)
5244 DstReg = MRI.createVirtualRegister(RegClass: DstRC);
5245
5246 unsigned EltSize = MRI.getType(Reg: EltReg).getSizeInBits();
5247 unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5248
5249 if (RB.getID() == AArch64::FPRRegBankID) {
5250 auto InsSub = emitScalarToVector(EltSize, DstRC, Scalar: EltReg, MIRBuilder);
5251 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5252 .addImm(Val: LaneIdx)
5253 .addUse(RegNo: InsSub->getOperand(i: 0).getReg())
5254 .addImm(Val: 0);
5255 } else {
5256 InsElt = MIRBuilder.buildInstr(Opc, DstOps: {*DstReg}, SrcOps: {SrcReg})
5257 .addImm(Val: LaneIdx)
5258 .addUse(RegNo: EltReg);
5259 }
5260
5261 constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5262 return InsElt;
5263}
5264
5265bool AArch64InstructionSelector::selectUSMovFromExtend(
5266 MachineInstr &MI, MachineRegisterInfo &MRI) {
5267 if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5268 MI.getOpcode() != TargetOpcode::G_ZEXT &&
5269 MI.getOpcode() != TargetOpcode::G_ANYEXT)
5270 return false;
5271 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5272 const Register DefReg = MI.getOperand(i: 0).getReg();
5273 const LLT DstTy = MRI.getType(Reg: DefReg);
5274 unsigned DstSize = DstTy.getSizeInBits();
5275
5276 if (DstSize != 32 && DstSize != 64)
5277 return false;
5278
5279 MachineInstr *Extract = getOpcodeDef(Opcode: TargetOpcode::G_EXTRACT_VECTOR_ELT,
5280 Reg: MI.getOperand(i: 1).getReg(), MRI);
5281 int64_t Lane;
5282 if (!Extract || !mi_match(R: Extract->getOperand(i: 2).getReg(), MRI, P: m_ICst(Cst&: Lane)))
5283 return false;
5284 Register Src0 = Extract->getOperand(i: 1).getReg();
5285
5286 const LLT &VecTy = MRI.getType(Reg: Src0);
5287
5288 if (VecTy.getSizeInBits() != 128) {
5289 const MachineInstr *ScalarToVector = emitScalarToVector(
5290 VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5291 assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5292 Src0 = ScalarToVector->getOperand(i: 0).getReg();
5293 }
5294
5295 unsigned Opcode;
5296 if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5297 Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5298 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5299 Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5300 else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5301 Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5302 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5303 Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5304 else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5305 Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5306 else
5307 llvm_unreachable("Unexpected type combo for S/UMov!");
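// e.g. sign-extending lane 3 of an <8 x s16> vector to s32 picks
// SMOVvi16to32 with a lane index of 3.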
5308
5309 // We may need to generate one of these, depending on the type and sign of the
5310 // input:
5311 // DstReg = SMOV Src0, Lane;
5312 // NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5313 MachineInstr *ExtI = nullptr;
5314 if (DstSize == 64 && !IsSigned) {
5315 Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5316 MIB.buildInstr(Opc: Opcode, DstOps: {NewReg}, SrcOps: {Src0}).addImm(Val: Lane);
5317 ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5318 .addImm(0)
5319 .addUse(NewReg)
5320 .addImm(AArch64::sub_32);
5321 RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5322 } else
5323 ExtI = MIB.buildInstr(Opc: Opcode, DstOps: {DefReg}, SrcOps: {Src0}).addImm(Val: Lane);
5324
5325 constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5326 MI.eraseFromParent();
5327 return true;
5328}
5329
5330MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm8(
5331 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5332 unsigned int Op;
5333 if (DstSize == 128) {
5334 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5335 return nullptr;
5336 Op = AArch64::MOVIv16b_ns;
5337 } else {
5338 Op = AArch64::MOVIv8b_ns;
5339 }
5340
5341 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5342
5343 if (AArch64_AM::isAdvSIMDModImmType9(Imm: Val)) {
5344 Val = AArch64_AM::encodeAdvSIMDModImmType9(Imm: Val);
5345 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5346 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5347 return &*Mov;
5348 }
5349 return nullptr;
5350}
5351
5352MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm16(
5353 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5354 bool Inv) {
5355
5356 unsigned int Op;
5357 if (DstSize == 128) {
5358 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5359 return nullptr;
5360 Op = Inv ? AArch64::MVNIv8i16 : AArch64::MOVIv8i16;
5361 } else {
5362 Op = Inv ? AArch64::MVNIv4i16 : AArch64::MOVIv4i16;
5363 }
5364
5365 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5366 uint64_t Shift;
5367
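// e.g. a per-lane splat of 0x5600 matches the shifted form (type 6) and is
// encoded as immediate 0x56 with an LSL of 8.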
5368 if (AArch64_AM::isAdvSIMDModImmType5(Imm: Val)) {
5369 Val = AArch64_AM::encodeAdvSIMDModImmType5(Imm: Val);
5370 Shift = 0;
5371 } else if (AArch64_AM::isAdvSIMDModImmType6(Imm: Val)) {
5372 Val = AArch64_AM::encodeAdvSIMDModImmType6(Imm: Val);
5373 Shift = 8;
5374 } else
5375 return nullptr;
5376
5377 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5378 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5379 return &*Mov;
5380}
5381
5382MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm32(
5383 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5384 bool Inv) {
5385
5386 unsigned int Op;
5387 if (DstSize == 128) {
5388 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5389 return nullptr;
5390 Op = Inv ? AArch64::MVNIv4i32 : AArch64::MOVIv4i32;
5391 } else {
5392 Op = Inv ? AArch64::MVNIv2i32 : AArch64::MOVIv2i32;
5393 }
5394
5395 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5396 uint64_t Shift;
5397
5398 if ((AArch64_AM::isAdvSIMDModImmType1(Imm: Val))) {
5399 Val = AArch64_AM::encodeAdvSIMDModImmType1(Imm: Val);
5400 Shift = 0;
5401 } else if ((AArch64_AM::isAdvSIMDModImmType2(Imm: Val))) {
5402 Val = AArch64_AM::encodeAdvSIMDModImmType2(Imm: Val);
5403 Shift = 8;
5404 } else if ((AArch64_AM::isAdvSIMDModImmType3(Imm: Val))) {
5405 Val = AArch64_AM::encodeAdvSIMDModImmType3(Imm: Val);
5406 Shift = 16;
5407 } else if ((AArch64_AM::isAdvSIMDModImmType4(Imm: Val))) {
5408 Val = AArch64_AM::encodeAdvSIMDModImmType4(Imm: Val);
5409 Shift = 24;
5410 } else
5411 return nullptr;
5412
5413 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5414 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5415 return &*Mov;
5416}
5417
5418MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm64(
5419 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5420
5421 unsigned int Op;
5422 if (DstSize == 128) {
5423 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5424 return nullptr;
5425 Op = AArch64::MOVIv2d_ns;
5426 } else {
5427 Op = AArch64::MOVID;
5428 }
5429
5430 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5431 if (AArch64_AM::isAdvSIMDModImmType10(Imm: Val)) {
5432 Val = AArch64_AM::encodeAdvSIMDModImmType10(Imm: Val);
5433 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5434 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5435 return &*Mov;
5436 }
5437 return nullptr;
5438}
5439
5440MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImm321s(
5441 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder,
5442 bool Inv) {
5443
5444 unsigned int Op;
5445 if (DstSize == 128) {
5446 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5447 return nullptr;
5448 Op = Inv ? AArch64::MVNIv4s_msl : AArch64::MOVIv4s_msl;
5449 } else {
5450 Op = Inv ? AArch64::MVNIv2s_msl : AArch64::MOVIv2s_msl;
5451 }
5452
5453 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5454 uint64_t Shift;
5455
5456 if (AArch64_AM::isAdvSIMDModImmType7(Imm: Val)) {
5457 Val = AArch64_AM::encodeAdvSIMDModImmType7(Imm: Val);
5458 Shift = 264;
5459 } else if (AArch64_AM::isAdvSIMDModImmType8(Imm: Val)) {
5460 Val = AArch64_AM::encodeAdvSIMDModImmType8(Imm: Val);
5461 Shift = 272;
5462 } else
5463 return nullptr;
5464
5465 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val).addImm(Val: Shift);
5466 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5467 return &*Mov;
5468}
5469
5470MachineInstr *AArch64InstructionSelector::tryAdvSIMDModImmFP(
5471 Register Dst, unsigned DstSize, APInt Bits, MachineIRBuilder &Builder) {
5472
5473 unsigned int Op;
5474 bool IsWide = false;
5475 if (DstSize == 128) {
5476 if (Bits.getHiBits(numBits: 64) != Bits.getLoBits(numBits: 64))
5477 return nullptr;
5478 Op = AArch64::FMOVv4f32_ns;
5479 IsWide = true;
5480 } else {
5481 Op = AArch64::FMOVv2f32_ns;
5482 }
5483
5484 uint64_t Val = Bits.zextOrTrunc(width: 64).getZExtValue();
5485
5486 if (AArch64_AM::isAdvSIMDModImmType11(Imm: Val)) {
5487 Val = AArch64_AM::encodeAdvSIMDModImmType11(Imm: Val);
5488 } else if (IsWide && AArch64_AM::isAdvSIMDModImmType12(Imm: Val)) {
5489 Val = AArch64_AM::encodeAdvSIMDModImmType12(Imm: Val);
5490 Op = AArch64::FMOVv2f64_ns;
5491 } else
5492 return nullptr;
5493
5494 auto Mov = Builder.buildInstr(Opc: Op, DstOps: {Dst}, SrcOps: {}).addImm(Val);
5495 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5496 return &*Mov;
5497}
5498
5499bool AArch64InstructionSelector::selectIndexedExtLoad(
5500 MachineInstr &MI, MachineRegisterInfo &MRI) {
5501 auto &ExtLd = cast<GIndexedAnyExtLoad>(Val&: MI);
5502 Register Dst = ExtLd.getDstReg();
5503 Register WriteBack = ExtLd.getWritebackReg();
5504 Register Base = ExtLd.getBaseReg();
5505 Register Offset = ExtLd.getOffsetReg();
5506 LLT Ty = MRI.getType(Reg: Dst);
5507 assert(Ty.getSizeInBits() <= 64); // Only for scalar GPRs.
5508 unsigned MemSizeBits = ExtLd.getMMO().getMemoryType().getSizeInBits();
5509 bool IsPre = ExtLd.isPre();
5510 bool IsSExt = isa<GIndexedSExtLoad>(Val: ExtLd);
5511 bool InsertIntoXReg = false;
5512 bool IsDst64 = Ty.getSizeInBits() == 64;
5513
5514 unsigned Opc = 0;
5515 LLT NewLdDstTy;
5516 LLT s32 = LLT::scalar(SizeInBits: 32);
5517 LLT s64 = LLT::scalar(SizeInBits: 64);
5518
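// e.g. a pre-indexed sign-extending 8-bit load into a 32-bit destination
// selects LDRSBWpre; the zero/any-extending form selects LDRBBpre and, for
// a 64-bit destination, is followed by a SUBREG_TO_REG into an X register.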
5519 if (MemSizeBits == 8) {
5520 if (IsSExt) {
5521 if (IsDst64)
5522 Opc = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
5523 else
5524 Opc = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
5525 NewLdDstTy = IsDst64 ? s64 : s32;
5526 } else {
5527 Opc = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
5528 InsertIntoXReg = IsDst64;
5529 NewLdDstTy = s32;
5530 }
5531 } else if (MemSizeBits == 16) {
5532 if (IsSExt) {
5533 if (IsDst64)
5534 Opc = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
5535 else
5536 Opc = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
5537 NewLdDstTy = IsDst64 ? s64 : s32;
5538 } else {
5539 Opc = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
5540 InsertIntoXReg = IsDst64;
5541 NewLdDstTy = s32;
5542 }
5543 } else if (MemSizeBits == 32) {
5544 if (IsSExt) {
5545 Opc = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
5546 NewLdDstTy = s64;
5547 } else {
5548 Opc = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
5549 InsertIntoXReg = IsDst64;
5550 NewLdDstTy = s32;
5551 }
5552 } else {
5553 llvm_unreachable("Unexpected size for indexed load");
5554 }
5555
5556 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5557 return false; // We should be on gpr.
5558
5559 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5560 if (!Cst)
5561 return false; // Shouldn't happen, but just in case.
5562
5563 auto LdMI = MIB.buildInstr(Opc, DstOps: {WriteBack, NewLdDstTy}, SrcOps: {Base})
5564 .addImm(Val: Cst->getSExtValue());
5565 LdMI.cloneMemRefs(OtherMI: ExtLd);
5566 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5567 // Make sure to select the load with the MemTy as the dest type, and then
5568 // insert into X reg if needed.
5569 if (InsertIntoXReg) {
5570 // Generate a SUBREG_TO_REG.
5571 auto SubToReg = MIB.buildInstr(TargetOpcode::SUBREG_TO_REG, {Dst}, {})
5572 .addImm(0)
5573 .addUse(LdMI.getReg(1))
5574 .addImm(AArch64::sub_32);
5575 RBI.constrainGenericRegister(SubToReg.getReg(0), AArch64::GPR64RegClass,
5576 MRI);
5577 } else {
5578 auto Copy = MIB.buildCopy(Res: Dst, Op: LdMI.getReg(Idx: 1));
5579 selectCopy(*Copy, TII, MRI, TRI, RBI);
5580 }
5581 MI.eraseFromParent();
5582
5583 return true;
5584}
5585
5586bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI,
5587 MachineRegisterInfo &MRI) {
5588 auto &Ld = cast<GIndexedLoad>(Val&: MI);
5589 Register Dst = Ld.getDstReg();
5590 Register WriteBack = Ld.getWritebackReg();
5591 Register Base = Ld.getBaseReg();
5592 Register Offset = Ld.getOffsetReg();
5593 assert(MRI.getType(Dst).getSizeInBits() <= 128 &&
5594 "Unexpected type for indexed load");
5595 unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes();
5596
5597 if (MemSize < MRI.getType(Reg: Dst).getSizeInBytes())
5598 return selectIndexedExtLoad(MI, MRI);
5599
5600 unsigned Opc = 0;
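// The opcode tables below are indexed by Log2 of the memory size in bytes;
// e.g. a 4-byte pre-indexed GPR load selects LDRWpre.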
5601 if (Ld.isPre()) {
5602 static constexpr unsigned GPROpcodes[] = {
5603 AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre,
5604 AArch64::LDRXpre};
5605 static constexpr unsigned FPROpcodes[] = {
5606 AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre,
5607 AArch64::LDRQpre};
5608 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5609 Opc = FPROpcodes[Log2_32(Value: MemSize)];
5610 else
5611 Opc = GPROpcodes[Log2_32(Value: MemSize)];
5612 } else {
5613 static constexpr unsigned GPROpcodes[] = {
5614 AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost,
5615 AArch64::LDRXpost};
5616 static constexpr unsigned FPROpcodes[] = {
5617 AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost,
5618 AArch64::LDRDpost, AArch64::LDRQpost};
5619 if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5620 Opc = FPROpcodes[Log2_32(Value: MemSize)];
5621 else
5622 Opc = GPROpcodes[Log2_32(Value: MemSize)];
5623 }
5624 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5625 if (!Cst)
5626 return false; // Shouldn't happen, but just in case.
5627 auto LdMI =
5628 MIB.buildInstr(Opc, DstOps: {WriteBack, Dst}, SrcOps: {Base}).addImm(Val: Cst->getSExtValue());
5629 LdMI.cloneMemRefs(OtherMI: Ld);
5630 constrainSelectedInstRegOperands(*LdMI, TII, TRI, RBI);
5631 MI.eraseFromParent();
5632 return true;
5633}
5634
5635bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I,
5636 MachineRegisterInfo &MRI) {
5637 Register Dst = I.getWritebackReg();
5638 Register Val = I.getValueReg();
5639 Register Base = I.getBaseReg();
5640 Register Offset = I.getOffsetReg();
5641 LLT ValTy = MRI.getType(Reg: Val);
5642 assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store");
5643
5644 unsigned Opc = 0;
5645 if (I.isPre()) {
5646 static constexpr unsigned GPROpcodes[] = {
5647 AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre,
5648 AArch64::STRXpre};
5649 static constexpr unsigned FPROpcodes[] = {
5650 AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre,
5651 AArch64::STRQpre};
5652
5653 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5654 Opc = FPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5655 else
5656 Opc = GPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5657 } else {
5658 static constexpr unsigned GPROpcodes[] = {
5659 AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost,
5660 AArch64::STRXpost};
5661 static constexpr unsigned FPROpcodes[] = {
5662 AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost,
5663 AArch64::STRDpost, AArch64::STRQpost};
5664
5665 if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID)
5666 Opc = FPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5667 else
5668 Opc = GPROpcodes[Log2_32(Value: ValTy.getSizeInBytes())];
5669 }
5670
5671 auto Cst = getIConstantVRegVal(VReg: Offset, MRI);
5672 if (!Cst)
5673 return false; // Shouldn't happen, but just in case.
5674 auto Str =
5675 MIB.buildInstr(Opc, DstOps: {Dst}, SrcOps: {Val, Base}).addImm(Val: Cst->getSExtValue());
5676 Str.cloneMemRefs(OtherMI: I);
5677 constrainSelectedInstRegOperands(*Str, TII, TRI, RBI);
5678 I.eraseFromParent();
5679 return true;
5680}
5681
5682MachineInstr *
5683AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5684 MachineIRBuilder &MIRBuilder,
5685 MachineRegisterInfo &MRI) {
5686 LLT DstTy = MRI.getType(Reg: Dst);
5687 unsigned DstSize = DstTy.getSizeInBits();
5688 if (CV->isNullValue()) {
5689 if (DstSize == 128) {
5690 auto Mov =
5691 MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5692 constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5693 return &*Mov;
5694 }
5695
5696 if (DstSize == 64) {
5697 auto Mov =
5698 MIRBuilder
5699 .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5700 .addImm(0);
5701 auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5702 .addReg(Mov.getReg(0), 0, AArch64::dsub);
5703 RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5704 return &*Copy;
5705 }
5706 }
5707
5708 if (CV->getSplatValue()) {
5709 APInt DefBits = APInt::getSplat(NewLen: DstSize, V: CV->getUniqueInteger());
5710 auto TryMOVIWithBits = [&](APInt DefBits) -> MachineInstr * {
5711 MachineInstr *NewOp;
5712 bool Inv = false;
5713 if ((NewOp = tryAdvSIMDModImm64(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5714 (NewOp =
5715 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5716 (NewOp =
5717 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5718 (NewOp =
5719 tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5720 (NewOp = tryAdvSIMDModImm8(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)) ||
5721 (NewOp = tryAdvSIMDModImmFP(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder)))
5722 return NewOp;
5723
5724 DefBits = ~DefBits;
5725 Inv = true;
5726 if ((NewOp =
5727 tryAdvSIMDModImm32(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5728 (NewOp =
5729 tryAdvSIMDModImm321s(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)) ||
5730 (NewOp = tryAdvSIMDModImm16(Dst, DstSize, Bits: DefBits, Builder&: MIRBuilder, Inv)))
5731 return NewOp;
5732 return nullptr;
5733 };
5734
5735 if (auto *NewOp = TryMOVIWithBits(DefBits))
5736 return NewOp;
5737
5738 // See if an fneg of the constant can be materialized with a MOVI, etc.
5739 auto TryWithFNeg = [&](APInt DefBits, int NumBits,
5740 unsigned NegOpc) -> MachineInstr * {
5741 // FNegate each sub-element of the constant
5742 APInt Neg = APInt::getHighBitsSet(numBits: NumBits, hiBitsSet: 1).zext(width: DstSize);
5743 APInt NegBits(DstSize, 0);
5744 unsigned NumElts = DstSize / NumBits;
5745 for (unsigned i = 0; i < NumElts; i++)
5746 NegBits |= Neg << (NumBits * i);
5747 NegBits = DefBits ^ NegBits;
5748
5749 // Try to create the new constants with MOVI, and if so generate a fneg
5750 // for it.
5751 if (auto *NewOp = TryMOVIWithBits(NegBits)) {
5752 Register NewDst = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
5753 NewOp->getOperand(i: 0).setReg(NewDst);
5754 return MIRBuilder.buildInstr(Opc: NegOpc, DstOps: {Dst}, SrcOps: {NewDst});
5755 }
5756 return nullptr;
5757 };
5758 MachineInstr *R;
5759 if ((R = TryWithFNeg(DefBits, 32, AArch64::FNEGv4f32)) ||
5760 (R = TryWithFNeg(DefBits, 64, AArch64::FNEGv2f64)) ||
5761 (STI.hasFullFP16() &&
5762 (R = TryWithFNeg(DefBits, 16, AArch64::FNEGv8f16))))
5763 return R;
5764 }
5765
5766 auto *CPLoad = emitLoadFromConstantPool(CPVal: CV, MIRBuilder);
5767 if (!CPLoad) {
5768 LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5769 return nullptr;
5770 }
5771
5772 auto Copy = MIRBuilder.buildCopy(Res: Dst, Op: CPLoad->getOperand(i: 0));
5773 RBI.constrainGenericRegister(
5774 Reg: Dst, RC: *MRI.getRegClass(Reg: CPLoad->getOperand(i: 0).getReg()), MRI);
5775 return &*Copy;
5776}
5777
5778bool AArch64InstructionSelector::tryOptConstantBuildVec(
5779 MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5780 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5781 unsigned DstSize = DstTy.getSizeInBits();
5782 assert(DstSize <= 128 && "Unexpected build_vec type!");
5783 if (DstSize < 32)
5784 return false;
5785 // Check if we're building a constant vector, in which case we want to
5786 // generate a constant pool load instead of a vector insert sequence.
5787 SmallVector<Constant *, 16> Csts;
5788 for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5789 // Try to find G_CONSTANT or G_FCONSTANT
5790 auto *OpMI =
5791 getOpcodeDef(Opcode: TargetOpcode::G_CONSTANT, Reg: I.getOperand(i: Idx).getReg(), MRI);
5792 if (OpMI)
5793 Csts.emplace_back(
5794 Args: const_cast<ConstantInt *>(OpMI->getOperand(i: 1).getCImm()));
5795 else if ((OpMI = getOpcodeDef(Opcode: TargetOpcode::G_FCONSTANT,
5796 Reg: I.getOperand(i: Idx).getReg(), MRI)))
5797 Csts.emplace_back(
5798 Args: const_cast<ConstantFP *>(OpMI->getOperand(i: 1).getFPImm()));
5799 else
5800 return false;
5801 }
5802 Constant *CV = ConstantVector::get(V: Csts);
5803 if (!emitConstantVector(Dst: I.getOperand(i: 0).getReg(), CV, MIRBuilder&: MIB, MRI))
5804 return false;
5805 I.eraseFromParent();
5806 return true;
5807}
5808
5809bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5810 MachineInstr &I, MachineRegisterInfo &MRI) {
5811 // Given:
5812 // %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5813 //
5814 // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
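// e.g. a <2 x s64> build vector on the FPR bank whose only defined element is
// %elt:fpr(s64) becomes %vec = SUBREG_TO_REG 0, %elt, %subreg.dsub.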
5815 Register Dst = I.getOperand(i: 0).getReg();
5816 Register EltReg = I.getOperand(i: 1).getReg();
5817 LLT EltTy = MRI.getType(Reg: EltReg);
5818 // If the destination isn't on the same bank as its elements, then this
5819 // can't be a SUBREG_TO_REG.
5820 const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5821 const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5822 if (EltRB != DstRB)
5823 return false;
5824 if (any_of(Range: drop_begin(RangeOrContainer: I.operands(), N: 2), P: [&MRI](const MachineOperand &Op) {
5825 return !getOpcodeDef(Opcode: TargetOpcode::G_IMPLICIT_DEF, Reg: Op.getReg(), MRI);
5826 }))
5827 return false;
5828 unsigned SubReg;
5829 const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(Ty: EltTy, RB: EltRB);
5830 if (!EltRC)
5831 return false;
5832 const TargetRegisterClass *DstRC =
5833 getRegClassForTypeOnBank(Ty: MRI.getType(Reg: Dst), RB: DstRB);
5834 if (!DstRC)
5835 return false;
5836 if (!getSubRegForClass(EltRC, TRI, SubReg))
5837 return false;
5838 auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5839 .addImm(0)
5840 .addUse(EltReg)
5841 .addImm(SubReg);
5842 I.eraseFromParent();
5843 constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5844 return RBI.constrainGenericRegister(Reg: Dst, RC: *DstRC, MRI);
5845}
5846
5847bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5848 MachineRegisterInfo &MRI) {
5849 assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5850 // Until we port more of the optimized selections, for now just use a vector
5851 // insert sequence.
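 //
 // Illustrative shape of the fallback sequence for a <4 x s32> build_vector:
 // element 0 is moved into a 128-bit vector register, then each remaining
 // non-undef element is inserted with a lane insert (INSvi32gpr/INSvi32lane,
 // depending on the elements' register bank).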
5852 const LLT DstTy = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5853 const LLT EltTy = MRI.getType(Reg: I.getOperand(i: 1).getReg());
5854 unsigned EltSize = EltTy.getSizeInBits();
5855
5856 if (tryOptConstantBuildVec(I, DstTy, MRI))
5857 return true;
5858 if (tryOptBuildVecToSubregToReg(I, MRI))
5859 return true;
5860
5861 if (EltSize != 8 && EltSize != 16 && EltSize != 32 && EltSize != 64)
5862 return false; // Don't support all element types yet.
5863 const RegisterBank &RB = *RBI.getRegBank(I.getOperand(i: 1).getReg(), MRI, TRI);
5864
5865 const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5866 MachineInstr *ScalarToVec =
5867 emitScalarToVector(EltSize: DstTy.getElementType().getSizeInBits(), DstRC,
5868 Scalar: I.getOperand(i: 1).getReg(), MIRBuilder&: MIB);
5869 if (!ScalarToVec)
5870 return false;
5871
5872 Register DstVec = ScalarToVec->getOperand(i: 0).getReg();
5873 unsigned DstSize = DstTy.getSizeInBits();
5874
5875 // Keep track of the last MI we inserted. Later on, we might be able to save
5876 // a copy using it.
5877 MachineInstr *PrevMI = ScalarToVec;
5878 for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5879 // Note that if we don't do a subregister copy, we can end up making an
5880 // extra register.
5881 Register OpReg = I.getOperand(i).getReg();
5882 // Do not emit inserts for undefs
5883 if (!getOpcodeDef<GImplicitDef>(Reg: OpReg, MRI)) {
5884 PrevMI = &*emitLaneInsert(DstReg: std::nullopt, SrcReg: DstVec, EltReg: OpReg, LaneIdx: i - 1, RB, MIRBuilder&: MIB);
5885 DstVec = PrevMI->getOperand(i: 0).getReg();
5886 }
5887 }
5888
5889 // If DstTy's size in bits is less than 128, then emit a subregister copy
5890 // from DstVec to the last register we've defined.
5891 if (DstSize < 128) {
5892 // Force this to be FPR using the destination vector.
5893 const TargetRegisterClass *RC =
5894 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5895 if (!RC)
5896 return false;
5897 if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5898 LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5899 return false;
5900 }
5901
5902 unsigned SubReg = 0;
5903 if (!getSubRegForClass(RC, TRI, SubReg))
5904 return false;
5905 if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5906 LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5907 << ")\n");
5908 return false;
5909 }
5910
5911 Register Reg = MRI.createVirtualRegister(RegClass: RC);
5912 Register DstReg = I.getOperand(i: 0).getReg();
5913
5914 MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {DstReg}, SrcOps: {}).addReg(RegNo: DstVec, flags: 0, SubReg);
5915 MachineOperand &RegOp = I.getOperand(i: 1);
5916 RegOp.setReg(Reg);
5917 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5918 } else {
5919 // We either have a vector with all elements (except the first one) undef or
5920 // at least one non-undef non-first element. In the first case, we need to
5921 // constrain the output register ourselves as we may have generated an
5922 // INSERT_SUBREG operation which is a generic operation for which the
5923 // output regclass cannot be automatically chosen.
5924 //
5925 // In the second case, there is no need to do this as it may generate an
5926 // instruction like INSvi32gpr where the regclass can be automatically
5927 // chosen.
5928 //
5929 // Also, we save a copy by re-using the destination register on the final
5930 // insert.
5931 PrevMI->getOperand(i: 0).setReg(I.getOperand(i: 0).getReg());
5932 constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5933
5934 Register DstReg = PrevMI->getOperand(i: 0).getReg();
5935 if (PrevMI == ScalarToVec && DstReg.isVirtual()) {
5936 const TargetRegisterClass *RC =
5937 getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5938 RBI.constrainGenericRegister(Reg: DstReg, RC: *RC, MRI);
5939 }
5940 }
5941
5942 I.eraseFromParent();
5943 return true;
5944}
5945
5946bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5947 unsigned NumVecs,
5948 MachineInstr &I) {
5949 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5950 assert(Opc && "Expected an opcode?");
5951 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
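 // The selected LD1/LD2/LD3/LD4 instruction defines a single register tuple;
 // each destination vector is then copied out of that tuple through the
 // consecutive dsub0+Idx / qsub0+Idx subregister indices below.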
5952 auto &MRI = *MIB.getMRI();
5953 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5954 unsigned Size = Ty.getSizeInBits();
5955 assert((Size == 64 || Size == 128) &&
5956 "Destination must be 64 bits or 128 bits?");
5957 unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5958 auto Ptr = I.getOperand(i: I.getNumOperands() - 1).getReg();
5959 assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5960 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {Ptr});
5961 Load.cloneMemRefs(OtherMI: I);
5962 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5963 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
5964 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5965 auto Vec = MIB.buildInstr(Opc: TargetOpcode::COPY, DstOps: {I.getOperand(i: Idx)}, SrcOps: {})
5966 .addReg(RegNo: SelectedLoadDst, flags: 0, SubReg: SubReg + Idx);
5967 // Emit the subreg copies and immediately select them.
5968 // FIXME: We should refactor our copy code into an emitCopy helper and
5969 // clean up uses of this pattern elsewhere in the selector.
5970 selectCopy(*Vec, TII, MRI, TRI, RBI);
5971 }
5972 return true;
5973}
5974
5975bool AArch64InstructionSelector::selectVectorLoadLaneIntrinsic(
5976 unsigned Opc, unsigned NumVecs, MachineInstr &I) {
5977 assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5978 assert(Opc && "Expected an opcode?");
5979 assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5980 auto &MRI = *MIB.getMRI();
5981 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
5982 bool Narrow = Ty.getSizeInBits() == 64;
5983
5984 auto FirstSrcRegIt = I.operands_begin() + NumVecs + 1;
5985 SmallVector<Register, 4> Regs(NumVecs);
5986 std::transform(first: FirstSrcRegIt, last: FirstSrcRegIt + NumVecs, result: Regs.begin(),
5987 unary_op: [](auto MO) { return MO.getReg(); });
5988
5989 if (Narrow) {
5990 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
5991 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
5992 ->getOperand(0)
5993 .getReg();
5994 });
5995 Ty = Ty.multiplyElements(Factor: 2);
5996 }
5997
5998 Register Tuple = createQTuple(Regs, MIB);
5999 auto LaneNo = getIConstantVRegVal(VReg: (FirstSrcRegIt + NumVecs)->getReg(), MRI);
6000 if (!LaneNo)
6001 return false;
6002
6003 Register Ptr = (FirstSrcRegIt + NumVecs + 1)->getReg();
6004 auto Load = MIB.buildInstr(Opc, DstOps: {Ty}, SrcOps: {})
6005 .addReg(RegNo: Tuple)
6006 .addImm(Val: LaneNo->getZExtValue())
6007 .addReg(RegNo: Ptr);
6008 Load.cloneMemRefs(OtherMI: I);
6009 constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
6010 Register SelectedLoadDst = Load->getOperand(i: 0).getReg();
6011 unsigned SubReg = AArch64::qsub0;
6012 for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
6013 auto Vec = MIB.buildInstr(TargetOpcode::COPY,
6014 {Narrow ? DstOp(&AArch64::FPR128RegClass)
6015 : DstOp(I.getOperand(Idx).getReg())},
6016 {})
6017 .addReg(SelectedLoadDst, 0, SubReg + Idx);
6018 Register WideReg = Vec.getReg(0);
6019 // Emit the subreg copies and immediately select them.
6020 selectCopy(*Vec, TII, MRI, TRI, RBI);
6021 if (Narrow &&
6022 !emitNarrowVector(DstReg: I.getOperand(i: Idx).getReg(), SrcReg: WideReg, MIB, MRI))
6023 return false;
6024 }
6025 return true;
6026}
6027
6028void AArch64InstructionSelector::selectVectorStoreIntrinsic(MachineInstr &I,
6029 unsigned NumVecs,
6030 unsigned Opc) {
6031 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6032 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6033 Register Ptr = I.getOperand(i: 1 + NumVecs).getReg();
6034
6035 SmallVector<Register, 2> Regs(NumVecs);
6036 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6037 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6038
6039 Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
6040 : createDTuple(Regs, MIB);
6041 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {Tuple, Ptr});
6042 Store.cloneMemRefs(OtherMI: I);
6043 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6044}
6045
6046bool AArch64InstructionSelector::selectVectorStoreLaneIntrinsic(
6047 MachineInstr &I, unsigned NumVecs, unsigned Opc) {
6048 MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
6049 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6050 bool Narrow = Ty.getSizeInBits() == 64;
6051
6052 SmallVector<Register, 2> Regs(NumVecs);
6053 std::transform(first: I.operands_begin() + 1, last: I.operands_begin() + 1 + NumVecs,
6054 result: Regs.begin(), unary_op: [](auto MO) { return MO.getReg(); });
6055
6056 if (Narrow)
6057 transform(Range&: Regs, d_first: Regs.begin(), F: [this](Register Reg) {
6058 return emitScalarToVector(64, &AArch64::FPR128RegClass, Reg, MIB)
6059 ->getOperand(0)
6060 .getReg();
6061 });
6062
6063 Register Tuple = createQTuple(Regs, MIB);
6064
6065 auto LaneNo = getIConstantVRegVal(VReg: I.getOperand(i: 1 + NumVecs).getReg(), MRI);
6066 if (!LaneNo)
6067 return false;
6068 Register Ptr = I.getOperand(i: 1 + NumVecs + 1).getReg();
6069 auto Store = MIB.buildInstr(Opc, DstOps: {}, SrcOps: {})
6070 .addReg(RegNo: Tuple)
6071 .addImm(Val: LaneNo->getZExtValue())
6072 .addReg(RegNo: Ptr);
6073 Store.cloneMemRefs(OtherMI: I);
6074 constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
6075 return true;
6076}
6077
6078bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
6079 MachineInstr &I, MachineRegisterInfo &MRI) {
6080 // Find the intrinsic ID.
6081 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6082
6083 const LLT S8 = LLT::scalar(SizeInBits: 8);
6084 const LLT S16 = LLT::scalar(SizeInBits: 16);
6085 const LLT S32 = LLT::scalar(SizeInBits: 32);
6086 const LLT S64 = LLT::scalar(SizeInBits: 64);
6087 const LLT P0 = LLT::pointer(AddressSpace: 0, SizeInBits: 64);
6088 // Select the instruction.
6089 switch (IntrinID) {
6090 default:
6091 return false;
6092 case Intrinsic::aarch64_ldxp:
6093 case Intrinsic::aarch64_ldaxp: {
6094 auto NewI = MIB.buildInstr(
6095 IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
6096 {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
6097 {I.getOperand(3)});
6098 NewI.cloneMemRefs(I);
6099 constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
6100 break;
6101 }
6102 case Intrinsic::aarch64_neon_ld1x2: {
6103 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6104 unsigned Opc = 0;
6105 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6106 Opc = AArch64::LD1Twov8b;
6107 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6108 Opc = AArch64::LD1Twov16b;
6109 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6110 Opc = AArch64::LD1Twov4h;
6111 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6112 Opc = AArch64::LD1Twov8h;
6113 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6114 Opc = AArch64::LD1Twov2s;
6115 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6116 Opc = AArch64::LD1Twov4s;
6117 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6118 Opc = AArch64::LD1Twov2d;
6119 else if (Ty == S64 || Ty == P0)
6120 Opc = AArch64::LD1Twov1d;
6121 else
6122 llvm_unreachable("Unexpected type for ld1x2!");
6123 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6124 break;
6125 }
6126 case Intrinsic::aarch64_neon_ld1x3: {
6127 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6128 unsigned Opc = 0;
6129 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6130 Opc = AArch64::LD1Threev8b;
6131 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6132 Opc = AArch64::LD1Threev16b;
6133 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6134 Opc = AArch64::LD1Threev4h;
6135 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6136 Opc = AArch64::LD1Threev8h;
6137 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6138 Opc = AArch64::LD1Threev2s;
6139 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6140 Opc = AArch64::LD1Threev4s;
6141 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6142 Opc = AArch64::LD1Threev2d;
6143 else if (Ty == S64 || Ty == P0)
6144 Opc = AArch64::LD1Threev1d;
6145 else
6146 llvm_unreachable("Unexpected type for ld1x3!");
6147 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6148 break;
6149 }
6150 case Intrinsic::aarch64_neon_ld1x4: {
6151 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6152 unsigned Opc = 0;
6153 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6154 Opc = AArch64::LD1Fourv8b;
6155 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6156 Opc = AArch64::LD1Fourv16b;
6157 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6158 Opc = AArch64::LD1Fourv4h;
6159 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6160 Opc = AArch64::LD1Fourv8h;
6161 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6162 Opc = AArch64::LD1Fourv2s;
6163 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6164 Opc = AArch64::LD1Fourv4s;
6165 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6166 Opc = AArch64::LD1Fourv2d;
6167 else if (Ty == S64 || Ty == P0)
6168 Opc = AArch64::LD1Fourv1d;
6169 else
6170 llvm_unreachable("Unexpected type for ld1x4!");
6171 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6172 break;
6173 }
6174 case Intrinsic::aarch64_neon_ld2: {
6175 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6176 unsigned Opc = 0;
6177 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6178 Opc = AArch64::LD2Twov8b;
6179 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6180 Opc = AArch64::LD2Twov16b;
6181 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6182 Opc = AArch64::LD2Twov4h;
6183 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6184 Opc = AArch64::LD2Twov8h;
6185 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6186 Opc = AArch64::LD2Twov2s;
6187 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6188 Opc = AArch64::LD2Twov4s;
6189 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6190 Opc = AArch64::LD2Twov2d;
6191 else if (Ty == S64 || Ty == P0)
6192 Opc = AArch64::LD1Twov1d;
6193 else
6194 llvm_unreachable("Unexpected type for ld2!");
6195 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6196 break;
6197 }
6198 case Intrinsic::aarch64_neon_ld2lane: {
6199 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6200 unsigned Opc;
6201 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6202 Opc = AArch64::LD2i8;
6203 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6204 Opc = AArch64::LD2i16;
6205 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6206 Opc = AArch64::LD2i32;
6207 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6208 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6209 Opc = AArch64::LD2i64;
6210 else
6211 llvm_unreachable("Unexpected type for ld2lane!");
6212 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 2, I))
6213 return false;
6214 break;
6215 }
6216 case Intrinsic::aarch64_neon_ld2r: {
6217 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6218 unsigned Opc = 0;
6219 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6220 Opc = AArch64::LD2Rv8b;
6221 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6222 Opc = AArch64::LD2Rv16b;
6223 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6224 Opc = AArch64::LD2Rv4h;
6225 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6226 Opc = AArch64::LD2Rv8h;
6227 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6228 Opc = AArch64::LD2Rv2s;
6229 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6230 Opc = AArch64::LD2Rv4s;
6231 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6232 Opc = AArch64::LD2Rv2d;
6233 else if (Ty == S64 || Ty == P0)
6234 Opc = AArch64::LD2Rv1d;
6235 else
6236 llvm_unreachable("Unexpected type for ld2r!");
6237 selectVectorLoadIntrinsic(Opc, NumVecs: 2, I);
6238 break;
6239 }
6240 case Intrinsic::aarch64_neon_ld3: {
6241 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6242 unsigned Opc = 0;
6243 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6244 Opc = AArch64::LD3Threev8b;
6245 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6246 Opc = AArch64::LD3Threev16b;
6247 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6248 Opc = AArch64::LD3Threev4h;
6249 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6250 Opc = AArch64::LD3Threev8h;
6251 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6252 Opc = AArch64::LD3Threev2s;
6253 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6254 Opc = AArch64::LD3Threev4s;
6255 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6256 Opc = AArch64::LD3Threev2d;
6257 else if (Ty == S64 || Ty == P0)
6258 Opc = AArch64::LD1Threev1d;
6259 else
6260 llvm_unreachable("Unexpected type for ld3!");
6261 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6262 break;
6263 }
6264 case Intrinsic::aarch64_neon_ld3lane: {
6265 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6266 unsigned Opc;
6267 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6268 Opc = AArch64::LD3i8;
6269 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6270 Opc = AArch64::LD3i16;
6271 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6272 Opc = AArch64::LD3i32;
6273 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6274 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6275 Opc = AArch64::LD3i64;
6276 else
6277 llvm_unreachable("Unexpected type for ld3lane!");
6278 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 3, I))
6279 return false;
6280 break;
6281 }
6282 case Intrinsic::aarch64_neon_ld3r: {
6283 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6284 unsigned Opc = 0;
6285 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6286 Opc = AArch64::LD3Rv8b;
6287 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6288 Opc = AArch64::LD3Rv16b;
6289 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6290 Opc = AArch64::LD3Rv4h;
6291 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6292 Opc = AArch64::LD3Rv8h;
6293 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6294 Opc = AArch64::LD3Rv2s;
6295 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6296 Opc = AArch64::LD3Rv4s;
6297 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6298 Opc = AArch64::LD3Rv2d;
6299 else if (Ty == S64 || Ty == P0)
6300 Opc = AArch64::LD3Rv1d;
6301 else
6302 llvm_unreachable("Unexpected type for ld3r!");
6303 selectVectorLoadIntrinsic(Opc, NumVecs: 3, I);
6304 break;
6305 }
6306 case Intrinsic::aarch64_neon_ld4: {
6307 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6308 unsigned Opc = 0;
6309 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6310 Opc = AArch64::LD4Fourv8b;
6311 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6312 Opc = AArch64::LD4Fourv16b;
6313 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6314 Opc = AArch64::LD4Fourv4h;
6315 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6316 Opc = AArch64::LD4Fourv8h;
6317 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6318 Opc = AArch64::LD4Fourv2s;
6319 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6320 Opc = AArch64::LD4Fourv4s;
6321 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6322 Opc = AArch64::LD4Fourv2d;
6323 else if (Ty == S64 || Ty == P0)
6324 Opc = AArch64::LD1Fourv1d;
6325 else
6326 llvm_unreachable("Unexpected type for ld4!");
6327 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6328 break;
6329 }
6330 case Intrinsic::aarch64_neon_ld4lane: {
6331 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6332 unsigned Opc;
6333 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6334 Opc = AArch64::LD4i8;
6335 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6336 Opc = AArch64::LD4i16;
6337 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6338 Opc = AArch64::LD4i32;
6339 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6340 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6341 Opc = AArch64::LD4i64;
6342 else
6343 llvm_unreachable("Unexpected type for ld4lane!");
6344 if (!selectVectorLoadLaneIntrinsic(Opc, NumVecs: 4, I))
6345 return false;
6346 break;
6347 }
6348 case Intrinsic::aarch64_neon_ld4r: {
6349 LLT Ty = MRI.getType(Reg: I.getOperand(i: 0).getReg());
6350 unsigned Opc = 0;
6351 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6352 Opc = AArch64::LD4Rv8b;
6353 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6354 Opc = AArch64::LD4Rv16b;
6355 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6356 Opc = AArch64::LD4Rv4h;
6357 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6358 Opc = AArch64::LD4Rv8h;
6359 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6360 Opc = AArch64::LD4Rv2s;
6361 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6362 Opc = AArch64::LD4Rv4s;
6363 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6364 Opc = AArch64::LD4Rv2d;
6365 else if (Ty == S64 || Ty == P0)
6366 Opc = AArch64::LD4Rv1d;
6367 else
6368 llvm_unreachable("Unexpected type for ld4r!");
6369 selectVectorLoadIntrinsic(Opc, NumVecs: 4, I);
6370 break;
6371 }
6372 case Intrinsic::aarch64_neon_st1x2: {
6373 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6374 unsigned Opc;
6375 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6376 Opc = AArch64::ST1Twov8b;
6377 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6378 Opc = AArch64::ST1Twov16b;
6379 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6380 Opc = AArch64::ST1Twov4h;
6381 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6382 Opc = AArch64::ST1Twov8h;
6383 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6384 Opc = AArch64::ST1Twov2s;
6385 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6386 Opc = AArch64::ST1Twov4s;
6387 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6388 Opc = AArch64::ST1Twov2d;
6389 else if (Ty == S64 || Ty == P0)
6390 Opc = AArch64::ST1Twov1d;
6391 else
6392 llvm_unreachable("Unexpected type for st1x2!");
6393 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6394 break;
6395 }
6396 case Intrinsic::aarch64_neon_st1x3: {
6397 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6398 unsigned Opc;
6399 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6400 Opc = AArch64::ST1Threev8b;
6401 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6402 Opc = AArch64::ST1Threev16b;
6403 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6404 Opc = AArch64::ST1Threev4h;
6405 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6406 Opc = AArch64::ST1Threev8h;
6407 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6408 Opc = AArch64::ST1Threev2s;
6409 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6410 Opc = AArch64::ST1Threev4s;
6411 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6412 Opc = AArch64::ST1Threev2d;
6413 else if (Ty == S64 || Ty == P0)
6414 Opc = AArch64::ST1Threev1d;
6415 else
6416 llvm_unreachable("Unexpected type for st1x3!");
6417 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6418 break;
6419 }
6420 case Intrinsic::aarch64_neon_st1x4: {
6421 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6422 unsigned Opc;
6423 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6424 Opc = AArch64::ST1Fourv8b;
6425 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6426 Opc = AArch64::ST1Fourv16b;
6427 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6428 Opc = AArch64::ST1Fourv4h;
6429 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6430 Opc = AArch64::ST1Fourv8h;
6431 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6432 Opc = AArch64::ST1Fourv2s;
6433 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6434 Opc = AArch64::ST1Fourv4s;
6435 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6436 Opc = AArch64::ST1Fourv2d;
6437 else if (Ty == S64 || Ty == P0)
6438 Opc = AArch64::ST1Fourv1d;
6439 else
6440 llvm_unreachable("Unexpected type for st1x4!");
6441 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6442 break;
6443 }
6444 case Intrinsic::aarch64_neon_st2: {
6445 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6446 unsigned Opc;
6447 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6448 Opc = AArch64::ST2Twov8b;
6449 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6450 Opc = AArch64::ST2Twov16b;
6451 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6452 Opc = AArch64::ST2Twov4h;
6453 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6454 Opc = AArch64::ST2Twov8h;
6455 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6456 Opc = AArch64::ST2Twov2s;
6457 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6458 Opc = AArch64::ST2Twov4s;
6459 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6460 Opc = AArch64::ST2Twov2d;
6461 else if (Ty == S64 || Ty == P0)
6462 Opc = AArch64::ST1Twov1d;
6463 else
6464 llvm_unreachable("Unexpected type for st2!");
6465 selectVectorStoreIntrinsic(I, NumVecs: 2, Opc);
6466 break;
6467 }
6468 case Intrinsic::aarch64_neon_st3: {
6469 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6470 unsigned Opc;
6471 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6472 Opc = AArch64::ST3Threev8b;
6473 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6474 Opc = AArch64::ST3Threev16b;
6475 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6476 Opc = AArch64::ST3Threev4h;
6477 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6478 Opc = AArch64::ST3Threev8h;
6479 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6480 Opc = AArch64::ST3Threev2s;
6481 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6482 Opc = AArch64::ST3Threev4s;
6483 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6484 Opc = AArch64::ST3Threev2d;
6485 else if (Ty == S64 || Ty == P0)
6486 Opc = AArch64::ST1Threev1d;
6487 else
6488 llvm_unreachable("Unexpected type for st3!");
6489 selectVectorStoreIntrinsic(I, NumVecs: 3, Opc);
6490 break;
6491 }
6492 case Intrinsic::aarch64_neon_st4: {
6493 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6494 unsigned Opc;
6495 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8))
6496 Opc = AArch64::ST4Fourv8b;
6497 else if (Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6498 Opc = AArch64::ST4Fourv16b;
6499 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16))
6500 Opc = AArch64::ST4Fourv4h;
6501 else if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6502 Opc = AArch64::ST4Fourv8h;
6503 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32))
6504 Opc = AArch64::ST4Fourv2s;
6505 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6506 Opc = AArch64::ST4Fourv4s;
6507 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) || Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0))
6508 Opc = AArch64::ST4Fourv2d;
6509 else if (Ty == S64 || Ty == P0)
6510 Opc = AArch64::ST1Fourv1d;
6511 else
6512 llvm_unreachable("Unexpected type for st4!");
6513 selectVectorStoreIntrinsic(I, NumVecs: 4, Opc);
6514 break;
6515 }
6516 case Intrinsic::aarch64_neon_st2lane: {
6517 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6518 unsigned Opc;
6519 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6520 Opc = AArch64::ST2i8;
6521 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6522 Opc = AArch64::ST2i16;
6523 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6524 Opc = AArch64::ST2i32;
6525 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6526 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6527 Opc = AArch64::ST2i64;
6528 else
6529 llvm_unreachable("Unexpected type for st2lane!");
6530 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 2, Opc))
6531 return false;
6532 break;
6533 }
6534 case Intrinsic::aarch64_neon_st3lane: {
6535 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6536 unsigned Opc;
6537 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6538 Opc = AArch64::ST3i8;
6539 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6540 Opc = AArch64::ST3i16;
6541 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6542 Opc = AArch64::ST3i32;
6543 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6544 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6545 Opc = AArch64::ST3i64;
6546 else
6547 llvm_unreachable("Unexpected type for st3lane!");
6548 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 3, Opc))
6549 return false;
6550 break;
6551 }
6552 case Intrinsic::aarch64_neon_st4lane: {
6553 LLT Ty = MRI.getType(Reg: I.getOperand(i: 1).getReg());
6554 unsigned Opc;
6555 if (Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S8) || Ty == LLT::fixed_vector(NumElements: 16, ScalarTy: S8))
6556 Opc = AArch64::ST4i8;
6557 else if (Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S16) || Ty == LLT::fixed_vector(NumElements: 8, ScalarTy: S16))
6558 Opc = AArch64::ST4i16;
6559 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S32) || Ty == LLT::fixed_vector(NumElements: 4, ScalarTy: S32))
6560 Opc = AArch64::ST4i32;
6561 else if (Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: S64) ||
6562 Ty == LLT::fixed_vector(NumElements: 2, ScalarTy: P0) || Ty == S64 || Ty == P0)
6563 Opc = AArch64::ST4i64;
6564 else
6565 llvm_unreachable("Unexpected type for st4lane!");
6566 if (!selectVectorStoreLaneIntrinsic(I, NumVecs: 4, Opc))
6567 return false;
6568 break;
6569 }
6570 case Intrinsic::aarch64_mops_memset_tag: {
6571 // Transform
6572 // %dst:gpr(p0) = \
6573 // G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
6574 // \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
6575 // where %dst is updated, into
6576 // (%Rd:GPR64common, %Rn:GPR64) = \
6577 // MOPSMemorySetTaggingPseudo \
6578 // %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
6579 // where Rd and Rn are tied.
6580 // It is expected that %val has been extended to s64 in legalization.
6581 // Note that the order of the size/value operands is swapped.
6582
6583 Register DstDef = I.getOperand(i: 0).getReg();
6584 // I.getOperand(1) is the intrinsic function
6585 Register DstUse = I.getOperand(i: 2).getReg();
6586 Register ValUse = I.getOperand(i: 3).getReg();
6587 Register SizeUse = I.getOperand(i: 4).getReg();
6588
6589 // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
6590 // Therefore an additional virtual register is required for the updated size
6591 // operand. This value is not accessible via the semantics of the intrinsic.
6592 Register SizeDef = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 64));
6593
6594 auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
6595 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
6596 Memset.cloneMemRefs(I);
6597 constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
6598 break;
6599 }
6600 }
6601
6602 I.eraseFromParent();
6603 return true;
6604}
6605
6606bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
6607 MachineRegisterInfo &MRI) {
6608 unsigned IntrinID = cast<GIntrinsic>(Val&: I).getIntrinsicID();
6609
6610 switch (IntrinID) {
6611 default:
6612 break;
6613 case Intrinsic::aarch64_crypto_sha1h: {
6614 Register DstReg = I.getOperand(i: 0).getReg();
6615 Register SrcReg = I.getOperand(i: 2).getReg();
6616
6617 // FIXME: Should this be an assert?
6618 if (MRI.getType(Reg: DstReg).getSizeInBits() != 32 ||
6619 MRI.getType(Reg: SrcReg).getSizeInBits() != 32)
6620 return false;
6621
6622 // The operation has to happen on FPRs. Set up some new FPR registers for
6623 // the source and destination if they are on GPRs.
6624 if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
6625 SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6626 MIB.buildCopy(Res: {SrcReg}, Op: {I.getOperand(i: 2)});
6627
6628 // Make sure the copy ends up getting constrained properly.
6629 RBI.constrainGenericRegister(I.getOperand(2).getReg(),
6630 AArch64::GPR32RegClass, MRI);
6631 }
6632
6633 if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
6634 DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
6635
6636 // Actually insert the instruction.
6637 auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
6638 constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
6639
6640 // Did we create a new register for the destination?
6641 if (DstReg != I.getOperand(i: 0).getReg()) {
6642 // Yep. Copy the result of the instruction back into the original
6643 // destination.
6644 MIB.buildCopy(Res: {I.getOperand(i: 0)}, Op: {DstReg});
6645 RBI.constrainGenericRegister(I.getOperand(0).getReg(),
6646 AArch64::GPR32RegClass, MRI);
6647 }
6648
6649 I.eraseFromParent();
6650 return true;
6651 }
6652 case Intrinsic::frameaddress:
6653 case Intrinsic::returnaddress: {
6654 MachineFunction &MF = *I.getParent()->getParent();
6655 MachineFrameInfo &MFI = MF.getFrameInfo();
6656
6657 unsigned Depth = I.getOperand(i: 2).getImm();
6658 Register DstReg = I.getOperand(i: 0).getReg();
6659 RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
6660
6661 if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
6662 if (!MFReturnAddr) {
6663 // Insert the copy from LR/X30 into the entry block, before it can be
6664 // clobbered by anything.
6665 MFI.setReturnAddressIsTaken(true);
6666 MFReturnAddr = getFunctionLiveInPhysReg(
6667 MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
6668 }
6669
6670 if (STI.hasPAuth()) {
6671 MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
6672 } else {
6673 MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
6674 MIB.buildInstr(AArch64::XPACLRI);
6675 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6676 }
6677
6678 I.eraseFromParent();
6679 return true;
6680 }
6681
6682 MFI.setFrameAddressIsTaken(true);
6683 Register FrameAddr(AArch64::FP);
6684 while (Depth--) {
6685 Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
6686 auto Ldr =
6687 MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
6688 constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
6689 FrameAddr = NextFrame;
6690 }
6691
6692 if (IntrinID == Intrinsic::frameaddress)
6693 MIB.buildCopy(Res: {DstReg}, Op: {FrameAddr});
6694 else {
6695 MFI.setReturnAddressIsTaken(true);
6696
6697 if (STI.hasPAuth()) {
6698 Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
6699 MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
6700 MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
6701 } else {
6702 MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
6703 .addImm(1);
6704 MIB.buildInstr(AArch64::XPACLRI);
6705 MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
6706 }
6707 }
6708
6709 I.eraseFromParent();
6710 return true;
6711 }
6712 case Intrinsic::swift_async_context_addr:
6713 auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
6714 {Register(AArch64::FP)})
6715 .addImm(8)
6716 .addImm(0);
6717 constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
6718
6719 MF->getFrameInfo().setFrameAddressIsTaken(true);
6720 MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
6721 I.eraseFromParent();
6722 return true;
6723 }
6724 return false;
6725}
6726
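/// The following shift renderers encode immediate shift amounts for the
/// imported tablegen patterns (typically when an immediate shift is selected
/// as a UBFM/SBFM-style bitfield move). As a worked example of the arithmetic
/// only: for a 32-bit shift amount of 3, selectShiftA_32 renders
/// (32 - 3) & 0x1f = 29 and selectShiftB_32 renders 31 - 3 = 28.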
6727InstructionSelector::ComplexRendererFns
6728AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
6729 auto MaybeImmed = getImmedFromMO(Root);
6730 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6731 return std::nullopt;
6732 uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
6733 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6734}
6735
6736InstructionSelector::ComplexRendererFns
6737AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
6738 auto MaybeImmed = getImmedFromMO(Root);
6739 if (MaybeImmed == std::nullopt || *MaybeImmed > 31)
6740 return std::nullopt;
6741 uint64_t Enc = 31 - *MaybeImmed;
6742 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6743}
6744
6745InstructionSelector::ComplexRendererFns
6746AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
6747 auto MaybeImmed = getImmedFromMO(Root);
6748 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6749 return std::nullopt;
6750 uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
6751 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6752}
6753
6754InstructionSelector::ComplexRendererFns
6755AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
6756 auto MaybeImmed = getImmedFromMO(Root);
6757 if (MaybeImmed == std::nullopt || *MaybeImmed > 63)
6758 return std::nullopt;
6759 uint64_t Enc = 63 - *MaybeImmed;
6760 return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Enc); }}};
6761}
6762
6763/// Helper to select an immediate value that can be represented as a 12-bit
6764/// value shifted left by either 0 or 12. If it is possible to do so, return
6765/// the immediate and shift value. If not, return std::nullopt.
6766///
6767/// Used by selectArithImmed and selectNegArithImmed.
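///
/// Worked examples (arithmetic only):
///   0x123    -> Immed = 0x123, ShiftAmt = 0
///   0x123000 -> Immed = 0x123, ShiftAmt = 12
///   0x123456 -> not representable, returns std::nullopt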
6768InstructionSelector::ComplexRendererFns
6769AArch64InstructionSelector::select12BitValueWithLeftShift(
6770 uint64_t Immed) const {
6771 unsigned ShiftAmt;
6772 if (Immed >> 12 == 0) {
6773 ShiftAmt = 0;
6774 } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
6775 ShiftAmt = 12;
6776 Immed = Immed >> 12;
6777 } else
6778 return std::nullopt;
6779
6780 unsigned ShVal = AArch64_AM::getShifterImm(ST: AArch64_AM::LSL, Imm: ShiftAmt);
6781 return {{
6782 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: Immed); },
6783 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShVal); },
6784 }};
6785}
6786
6787/// SelectArithImmed - Select an immediate value that can be represented as
6788/// a 12-bit value shifted left by either 0 or 12. If so, return true with
6789/// Val set to the 12-bit value and Shift set to the shifter operand.
6790InstructionSelector::ComplexRendererFns
6791AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
6792 // This function is called from the addsub_shifted_imm ComplexPattern,
6793 // which lists [imm] as the list of opcodes it's interested in; however,
6794 // we still need to check whether the operand is actually an immediate
6795 // here because the ComplexPattern opcode list is only used in
6796 // root-level opcode matching.
6797 auto MaybeImmed = getImmedFromMO(Root);
6798 if (MaybeImmed == std::nullopt)
6799 return std::nullopt;
6800 return select12BitValueWithLeftShift(Immed: *MaybeImmed);
6801}
6802
6803/// SelectNegArithImmed - As above, but negates the value before trying to
6804/// select it.
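///
/// For instance (illustrative), this lets a comparison against -5 be selected
/// as "cmn wN, #5" by the imported patterns instead of materializing -5.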
6805InstructionSelector::ComplexRendererFns
6806AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
6807 // We need a register here, because we need to know if we have a 64 or 32
6808 // bit immediate.
6809 if (!Root.isReg())
6810 return std::nullopt;
6811 auto MaybeImmed = getImmedFromMO(Root);
6812 if (MaybeImmed == std::nullopt)
6813 return std::nullopt;
6814 uint64_t Immed = *MaybeImmed;
6815
6816 // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
6817 // have the opposite effect on the C flag, so this pattern mustn't match under
6818 // those circumstances.
6819 if (Immed == 0)
6820 return std::nullopt;
6821
6822 // Check whether we're dealing with a 32-bit or a 64-bit type on the root.
6824 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6825 if (MRI.getType(Reg: Root.getReg()).getSizeInBits() == 32)
6826 Immed = ~((uint32_t)Immed) + 1;
6827 else
6828 Immed = ~Immed + 1ULL;
6829
6830 if (Immed & 0xFFFFFFFFFF000000ULL)
6831 return std::nullopt;
6832
6833 Immed &= 0xFFFFFFULL;
6834 return select12BitValueWithLeftShift(Immed);
6835}
6836
6837/// Return true if it is worth folding MI into an extended register. That is,
6838/// if it's safe to pull it into the addressing mode of a load or store as a
6839/// shift.
6840bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6841 MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6842 // Always fold if there is one use, or if we're optimizing for size.
6843 Register DefReg = MI.getOperand(i: 0).getReg();
6844 if (MRI.hasOneNonDBGUse(RegNo: DefReg) ||
6845 MI.getParent()->getParent()->getFunction().hasOptSize())
6846 return true;
6847
6848 // FIXME: Consider checking HasAddrLSLSlow14 and HasALULSLFast as
6849 // appropriate.
6850
6851 // We have a fastpath, so folding a shift in and potentially computing it
6852 // many times may be beneficial. Check if this is only used in memory ops.
6853 // If it is, then we should fold.
6854 return all_of(Range: MRI.use_nodbg_instructions(Reg: DefReg),
6855 P: [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6856}
6857
6858static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6859 switch (Type) {
6860 case AArch64_AM::SXTB:
6861 case AArch64_AM::SXTH:
6862 case AArch64_AM::SXTW:
6863 return true;
6864 default:
6865 return false;
6866 }
6867}
6868
6869InstructionSelector::ComplexRendererFns
6870AArch64InstructionSelector::selectExtendedSHL(
6871 MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6872 unsigned SizeInBytes, bool WantsExt) const {
6873 assert(Base.isReg() && "Expected base to be a register operand");
6874 assert(Offset.isReg() && "Expected offset to be a register operand");
6875
6876 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6877 MachineInstr *OffsetInst = MRI.getVRegDef(Reg: Offset.getReg());
6878
6879 unsigned OffsetOpc = OffsetInst->getOpcode();
6880 bool LookedThroughZExt = false;
6881 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6882 // Try to look through a ZEXT.
6883 if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6884 return std::nullopt;
6885
6886 OffsetInst = MRI.getVRegDef(Reg: OffsetInst->getOperand(i: 1).getReg());
6887 OffsetOpc = OffsetInst->getOpcode();
6888 LookedThroughZExt = true;
6889
6890 if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6891 return std::nullopt;
6892 }
6893 // Make sure that the memory op is a valid size.
6894 int64_t LegalShiftVal = Log2_32(Value: SizeInBytes);
6895 if (LegalShiftVal == 0)
6896 return std::nullopt;
6897 if (!isWorthFoldingIntoExtendedReg(MI&: *OffsetInst, MRI))
6898 return std::nullopt;
6899
6900 // Now, try to find the specific G_CONSTANT. Start by assuming that the
6901 // register we will offset is the LHS, and the register containing the
6902 // constant is the RHS.
6903 Register OffsetReg = OffsetInst->getOperand(i: 1).getReg();
6904 Register ConstantReg = OffsetInst->getOperand(i: 2).getReg();
6905 auto ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
6906 if (!ValAndVReg) {
6907 // We didn't get a constant on the RHS. If the opcode is a shift, then
6908 // we're done.
6909 if (OffsetOpc == TargetOpcode::G_SHL)
6910 return std::nullopt;
6911
6912 // If we have a G_MUL, we can use either register. Try looking at the RHS.
6913 std::swap(a&: OffsetReg, b&: ConstantReg);
6914 ValAndVReg = getIConstantVRegValWithLookThrough(VReg: ConstantReg, MRI);
6915 if (!ValAndVReg)
6916 return std::nullopt;
6917 }
6918
6919 // The value must fit into 3 bits, and must be positive. Make sure that is
6920 // true.
6921 int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6922
6923 // Since we're going to pull this into a shift, the constant value must be
6924 // a power of 2. If we got a multiply, then we need to check this.
6925 if (OffsetOpc == TargetOpcode::G_MUL) {
6926 if (!llvm::has_single_bit<uint32_t>(Value: ImmVal))
6927 return std::nullopt;
6928
6929 // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6930 ImmVal = Log2_32(Value: ImmVal);
6931 }
6932
6933 if ((ImmVal & 0x7) != ImmVal)
6934 return std::nullopt;
6935
6936 // We are only allowed to shift by LegalShiftVal. This shift value is built
6937 // into the instruction, so we can't just use whatever we want.
6938 if (ImmVal != LegalShiftVal)
6939 return std::nullopt;
6940
6941 unsigned SignExtend = 0;
6942 if (WantsExt) {
6943 // Check if the offset is defined by an extend, unless we looked through a
6944 // G_ZEXT earlier.
6945 if (!LookedThroughZExt) {
6946 MachineInstr *ExtInst = getDefIgnoringCopies(Reg: OffsetReg, MRI);
6947 auto Ext = getExtendTypeForInst(MI&: *ExtInst, MRI, IsLoadStore: true);
6948 if (Ext == AArch64_AM::InvalidShiftExtend)
6949 return std::nullopt;
6950
6951 SignExtend = isSignExtendShiftType(Type: Ext) ? 1 : 0;
6952 // We only support SXTW for signed extension here.
6953 if (SignExtend && Ext != AArch64_AM::SXTW)
6954 return std::nullopt;
6955 OffsetReg = ExtInst->getOperand(i: 1).getReg();
6956 }
6957
6958 // Need a 32-bit wide register here.
6959 MachineIRBuilder MIB(*MRI.getVRegDef(Reg: Root.getReg()));
6960 OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
6961 }
6962
6963 // We can use the LHS of the GEP as the base, and the LHS of the shift as an
6964 // offset. Signify that we are shifting by setting the shift flag to 1.
6965 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: Base.getReg()); },
6966 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: OffsetReg); },
6967 [=](MachineInstrBuilder &MIB) {
6968 // Need to add both immediates here to make sure that they are both
6969 // added to the instruction.
6970 MIB.addImm(Val: SignExtend);
6971 MIB.addImm(Val: 1);
6972 }}};
6973}
6974
6975/// This is used for computing addresses like this:
6976///
6977/// ldr x1, [x2, x3, lsl #3]
6978///
6979/// Where x2 is the base register, and x3 is an offset register. The shift-left
6980/// is a constant value specific to this load instruction. That is, we'll never
6981/// see anything other than a 3 here, which corresponds to the log2 of the
6982/// size of the element being loaded (8 bytes in this example).
6983InstructionSelector::ComplexRendererFns
6984AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6985 MachineOperand &Root, unsigned SizeInBytes) const {
6986 if (!Root.isReg())
6987 return std::nullopt;
6988 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6989
6990 // We want to find something like this:
6991 //
6992 // val = G_CONSTANT LegalShiftVal
6993 // shift = G_SHL off_reg val
6994 // ptr = G_PTR_ADD base_reg shift
6995 // x = G_LOAD ptr
6996 //
6997 // And fold it into this addressing mode:
6998 //
6999 // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
7000
7001 // Check if we can find the G_PTR_ADD.
7002 MachineInstr *PtrAdd =
7003 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7004 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI&: *PtrAdd, MRI))
7005 return std::nullopt;
7006
7007 // Now, try to match an opcode which will match our specific offset.
7008 // We want a G_SHL or a G_MUL.
7009 MachineInstr *OffsetInst =
7010 getDefIgnoringCopies(Reg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7011 return selectExtendedSHL(Root, Base&: PtrAdd->getOperand(i: 1),
7012 Offset&: OffsetInst->getOperand(i: 0), SizeInBytes,
7013 /*WantsExt=*/false);
7014}
7015
7016/// This is used for computing addresses like this:
7017///
7018/// ldr x1, [x2, x3]
7019///
7020/// Where x2 is the base register, and x3 is an offset register.
7021///
7022/// When possible (or profitable) to fold a G_PTR_ADD into the address
7023/// calculation, this will do so. Otherwise, it will return std::nullopt.
7024InstructionSelector::ComplexRendererFns
7025AArch64InstructionSelector::selectAddrModeRegisterOffset(
7026 MachineOperand &Root) const {
7027 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7028
7029 // We need a GEP.
7030 MachineInstr *Gep = MRI.getVRegDef(Reg: Root.getReg());
7031 if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
7032 return std::nullopt;
7033
7034 // If this is used more than once, let's not bother folding.
7035 // TODO: Check if they are memory ops. If they are, then we can still fold
7036 // without having to recompute anything.
7037 if (!MRI.hasOneNonDBGUse(RegNo: Gep->getOperand(i: 0).getReg()))
7038 return std::nullopt;
7039
7040 // Base is the GEP's LHS, offset is its RHS.
7041 return {{[=](MachineInstrBuilder &MIB) {
7042 MIB.addUse(RegNo: Gep->getOperand(i: 1).getReg());
7043 },
7044 [=](MachineInstrBuilder &MIB) {
7045 MIB.addUse(RegNo: Gep->getOperand(i: 2).getReg());
7046 },
7047 [=](MachineInstrBuilder &MIB) {
7048 // Need to add both immediates here to make sure that they are both
7049 // added to the instruction.
7050 MIB.addImm(Val: 0);
7051 MIB.addImm(Val: 0);
7052 }}};
7053}
7054
7055/// This is intended to be equivalent to selectAddrModeXRO in
7056/// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
7057InstructionSelector::ComplexRendererFns
7058AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
7059 unsigned SizeInBytes) const {
7060 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7061 if (!Root.isReg())
7062 return std::nullopt;
7063 MachineInstr *PtrAdd =
7064 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7065 if (!PtrAdd)
7066 return std::nullopt;
7067
7068 // Check for an immediate which cannot be encoded in the [base + imm]
7069 // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
7070 // end up with code like:
7071 //
7072 // mov x0, wide
7073 // add x1, base, x0
7074 // ldr x2, [x1, x0]
7075 //
7076 // In this situation, we can use the [base, xreg] addressing mode to save an
7077 // add/sub:
7078 //
7079 // mov x0, wide
7080 // ldr x2, [base, x0]
7081 auto ValAndVReg =
7082 getIConstantVRegValWithLookThrough(VReg: PtrAdd->getOperand(i: 2).getReg(), MRI);
7083 if (ValAndVReg) {
7084 unsigned Scale = Log2_32(Value: SizeInBytes);
7085 int64_t ImmOff = ValAndVReg->Value.getSExtValue();
7086
7087 // Skip immediates that can be selected in the load/store addressing
7088 // mode.
7089 if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
7090 ImmOff < (0x1000 << Scale))
7091 return std::nullopt;
7092
7093 // Helper lambda to decide whether or not it is preferable to emit an add.
7094 auto isPreferredADD = [](int64_t ImmOff) {
7095 // Constants in [0x0, 0xfff] can be encoded in an add.
7096 if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
7097 return true;
7098
7099 // Can it be encoded in an add lsl #12?
7100 if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
7101 return false;
7102
7103 // It can be encoded in an add lsl #12, but we may not want to. If it is
7104 // possible to select this as a single movz, then prefer that. A single
7105 // movz is faster than an add with a shift.
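 // For example (illustrative): 0x120000 is a single
 // "movz ..., #0x12, lsl #16", so this returns false and the [base, xreg]
 // form is kept; 0x123000 cannot be materialized with a single movz (it has
 // nonzero bits in both 16-bit halves), so this returns true and the shifted
 // add is preferred.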
7106 return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
7107 (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
7108 };
7109
7110 // If the immediate can be encoded in a single add/sub, then bail out.
7111 if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
7112 return std::nullopt;
7113 }
7114
7115 // Try to fold shifts into the addressing mode.
7116 auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
7117 if (AddrModeFns)
7118 return AddrModeFns;
7119
7120 // If that doesn't work, see if it's possible to fold in registers from
7121 // a GEP.
7122 return selectAddrModeRegisterOffset(Root);
7123}
7124
7125/// This is used for computing addresses like this:
7126///
7127/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
7128///
7129/// Where we have a 64-bit base register, a 32-bit offset register, and an
7130/// extend (which may or may not be signed).
7131InstructionSelector::ComplexRendererFns
7132AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
7133 unsigned SizeInBytes) const {
7134 MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
7135
7136 MachineInstr *PtrAdd =
7137 getOpcodeDef(Opcode: TargetOpcode::G_PTR_ADD, Reg: Root.getReg(), MRI);
7138 if (!PtrAdd || !isWorthFoldingIntoExtendedReg(MI&: *PtrAdd, MRI))
7139 return std::nullopt;
7140
7141 MachineOperand &LHS = PtrAdd->getOperand(i: 1);
7142 MachineOperand &RHS = PtrAdd->getOperand(i: 2);
7143 MachineInstr *OffsetInst = getDefIgnoringCopies(Reg: RHS.getReg(), MRI);
7144
7145 // The first case is the same as selectAddrModeXRO, except we need an extend.
7146 // In this case, we try to find a shift and extend, and fold them into the
7147 // addressing mode.
7148 //
7149 // E.g.
7150 //
7151 // off_reg = G_Z/S/ANYEXT ext_reg
7152 // val = G_CONSTANT LegalShiftVal
7153 // shift = G_SHL off_reg val
7154 // ptr = G_PTR_ADD base_reg shift
7155 // x = G_LOAD ptr
7156 //
7157 // In this case we can get a load like this:
7158 //
7159 // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
7160 auto ExtendedShl = selectExtendedSHL(Root, Base&: LHS, Offset&: OffsetInst->getOperand(i: 0),
7161 SizeInBytes, /*WantsExt=*/true);
7162 if (ExtendedShl)
7163 return ExtendedShl;
7164
7165 // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
7166 //
7167 // e.g.
7168 // ldr something, [base_reg, ext_reg, sxtw]
7169 if (!isWorthFoldingIntoExtendedReg(MI&: *OffsetInst, MRI))
7170 return std::nullopt;
7171
7172 // Check if this is an extend. We'll get an extend type if it is.
7173 AArch64_AM::ShiftExtendType Ext =
7174 getExtendTypeForInst(MI&: *OffsetInst, MRI, /*IsLoadStore=*/true);
7175 if (Ext == AArch64_AM::InvalidShiftExtend)
7176 return std::nullopt;
7177
7178 // Need a 32-bit wide register.
7179 MachineIRBuilder MIB(*PtrAdd);
7180 Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
7181 AArch64::GPR32RegClass, MIB);
7182 unsigned SignExtend = Ext == AArch64_AM::SXTW;
7183
7184 // Base is LHS, offset is ExtReg.
7185 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: LHS.getReg()); },
7186 [=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7187 [=](MachineInstrBuilder &MIB) {
7188 MIB.addImm(Val: SignExtend);
7189 MIB.addImm(Val: 0);
7190 }}};
7191}
7192
7193/// Select a "register plus unscaled signed 9-bit immediate" address. This
7194/// should only match when there is an offset that is not valid for a scaled
7195/// immediate addressing mode. The "Size" argument is the size in bytes of the
7196/// memory reference, which is needed here to know what is valid for a scaled
7197/// immediate.
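/// For example (illustrative), with Size == 4 an offset of +3 cannot use the
/// scaled form, which requires a multiple of 4, but it does fall within the
/// signed 9-bit range [-256, 255] accepted below.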
7198InstructionSelector::ComplexRendererFns
7199AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
7200 unsigned Size) const {
7201 MachineRegisterInfo &MRI =
7202 Root.getParent()->getParent()->getParent()->getRegInfo();
7203
7204 if (!Root.isReg())
7205 return std::nullopt;
7206
7207 if (!isBaseWithConstantOffset(Root, MRI))
7208 return std::nullopt;
7209
7210 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7211
7212 MachineOperand &OffImm = RootDef->getOperand(i: 2);
7213 if (!OffImm.isReg())
7214 return std::nullopt;
7215 MachineInstr *RHS = MRI.getVRegDef(Reg: OffImm.getReg());
7216 if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
7217 return std::nullopt;
7218 int64_t RHSC;
7219 MachineOperand &RHSOp1 = RHS->getOperand(i: 1);
7220 if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
7221 return std::nullopt;
7222 RHSC = RHSOp1.getCImm()->getSExtValue();
7223
7224 if (RHSC >= -256 && RHSC < 256) {
7225 MachineOperand &Base = RootDef->getOperand(i: 1);
7226 return {{
7227 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Base); },
7228 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC); },
7229 }};
7230 }
7231 return std::nullopt;
7232}
7233
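/// Try to fold the low 12 bits of a small-code-model global address
/// (materialized as ADRP + G_ADD_LOW) directly into a load/store immediate
/// operand. Illustratively (x8, w0 and 'var' are placeholder names):
///
///   adrp x8, var
///   ldr  w0, [x8, :lo12:var]
///
/// This is only attempted when the constant addend is a multiple of the
/// access size and the global is sufficiently aligned; see the checks below.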
7234InstructionSelector::ComplexRendererFns
7235AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
7236 unsigned Size,
7237 MachineRegisterInfo &MRI) const {
7238 if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
7239 return std::nullopt;
7240 MachineInstr &Adrp = *MRI.getVRegDef(Reg: RootDef.getOperand(i: 1).getReg());
7241 if (Adrp.getOpcode() != AArch64::ADRP)
7242 return std::nullopt;
7243
7244 // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
7245 auto Offset = Adrp.getOperand(i: 1).getOffset();
7246 if (Offset % Size != 0)
7247 return std::nullopt;
7248
7249 auto GV = Adrp.getOperand(i: 1).getGlobal();
7250 if (GV->isThreadLocal())
7251 return std::nullopt;
7252
7253 auto &MF = *RootDef.getParent()->getParent();
7254 if (GV->getPointerAlignment(DL: MF.getDataLayout()) < Size)
7255 return std::nullopt;
7256
7257 unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM: MF.getTarget());
7258 MachineIRBuilder MIRBuilder(RootDef);
7259 Register AdrpReg = Adrp.getOperand(i: 0).getReg();
7260 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: AdrpReg); },
7261 [=](MachineInstrBuilder &MIB) {
7262 MIB.addGlobalAddress(GV, Offset,
7263 TargetFlags: OpFlags | AArch64II::MO_PAGEOFF |
7264 AArch64II::MO_NC);
7265 }}};
7266}
7267
7268/// Select a "register plus scaled unsigned 12-bit immediate" address. The
7269/// "Size" argument is the size in bytes of the memory reference, which
7270/// determines the scale.
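/// For example (illustrative), with Size == 8 a constant offset of 16 is
/// rendered as the scaled immediate 2 (16 >> Log2_32(8)), provided the offset
/// is a non-negative multiple of 8 within the unsigned 12-bit scaled range.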
7271InstructionSelector::ComplexRendererFns
7272AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
7273 unsigned Size) const {
7274 MachineFunction &MF = *Root.getParent()->getParent()->getParent();
7275 MachineRegisterInfo &MRI = MF.getRegInfo();
7276
7277 if (!Root.isReg())
7278 return std::nullopt;
7279
7280 MachineInstr *RootDef = MRI.getVRegDef(Reg: Root.getReg());
7281 if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
7282 return {{
7283 [=](MachineInstrBuilder &MIB) { MIB.add(MO: RootDef->getOperand(i: 1)); },
7284 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7285 }};
7286 }
7287
7288 CodeModel::Model CM = MF.getTarget().getCodeModel();
7289 // Check if we can fold in the ADD of a small code model ADRP + ADD address.
7290 if (CM == CodeModel::Small) {
7291 auto OpFns = tryFoldAddLowIntoImm(RootDef&: *RootDef, Size, MRI);
7292 if (OpFns)
7293 return OpFns;
7294 }
7295
7296 if (isBaseWithConstantOffset(Root, MRI)) {
7297 MachineOperand &LHS = RootDef->getOperand(i: 1);
7298 MachineOperand &RHS = RootDef->getOperand(i: 2);
7299 MachineInstr *LHSDef = MRI.getVRegDef(Reg: LHS.getReg());
7300 MachineInstr *RHSDef = MRI.getVRegDef(Reg: RHS.getReg());
7301
7302 int64_t RHSC = (int64_t)RHSDef->getOperand(i: 1).getCImm()->getZExtValue();
7303 unsigned Scale = Log2_32(Value: Size);
7304 if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
7305 if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
7306 return {{
7307 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHSDef->getOperand(i: 1)); },
7308 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7309 }};
7310
7311 return {{
7312 [=](MachineInstrBuilder &MIB) { MIB.add(MO: LHS); },
7313 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: RHSC >> Scale); },
7314 }};
7315 }
7316 }
7317
7318 // Before falling back to our general case, check if the unscaled
7319 // instructions can handle this. If so, that's preferable.
7320 if (selectAddrModeUnscaled(Root, Size))
7321 return std::nullopt;
7322
7323 return {{
7324 [=](MachineInstrBuilder &MIB) { MIB.add(MO: Root); },
7325 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: 0); },
7326 }};
7327}
7328
7329/// Given a shift instruction, return the correct shift type for that
7330/// instruction.
7331static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
7332 switch (MI.getOpcode()) {
7333 default:
7334 return AArch64_AM::InvalidShiftExtend;
7335 case TargetOpcode::G_SHL:
7336 return AArch64_AM::LSL;
7337 case TargetOpcode::G_LSHR:
7338 return AArch64_AM::LSR;
7339 case TargetOpcode::G_ASHR:
7340 return AArch64_AM::ASR;
7341 case TargetOpcode::G_ROTR:
7342 return AArch64_AM::ROR;
7343 }
7344}
7345
7346/// Select a "shifted register" operand. If the value is not shifted, set the
7347/// shift operand to a default value of "lsl 0".
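/// For example (illustrative), a root defined by "%v = G_SHL %x, 3" can be
/// rendered as the operand pair (%x, lsl #3); the shift amount is masked to
/// the bit width of %x below.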
7348InstructionSelector::ComplexRendererFns
7349AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
7350 bool AllowROR) const {
7351 if (!Root.isReg())
7352 return std::nullopt;
7353 MachineRegisterInfo &MRI =
7354 Root.getParent()->getParent()->getParent()->getRegInfo();
7355
7356 // Check if the operand is defined by an instruction which corresponds to
7357 // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
7358 MachineInstr *ShiftInst = MRI.getVRegDef(Reg: Root.getReg());
7359 AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(MI&: *ShiftInst);
7360 if (ShType == AArch64_AM::InvalidShiftExtend)
7361 return std::nullopt;
7362 if (ShType == AArch64_AM::ROR && !AllowROR)
7363 return std::nullopt;
7364 if (!isWorthFoldingIntoExtendedReg(MI&: *ShiftInst, MRI))
7365 return std::nullopt;
7366
7367 // Need an immediate on the RHS.
7368 MachineOperand &ShiftRHS = ShiftInst->getOperand(i: 2);
7369 auto Immed = getImmedFromMO(Root: ShiftRHS);
7370 if (!Immed)
7371 return std::nullopt;
7372
7373 // We have something that we can fold. Fold in the shift's LHS and RHS into
7374 // the instruction.
7375 MachineOperand &ShiftLHS = ShiftInst->getOperand(i: 1);
7376 Register ShiftReg = ShiftLHS.getReg();
7377
7378 unsigned NumBits = MRI.getType(Reg: ShiftReg).getSizeInBits();
7379 unsigned Val = *Immed & (NumBits - 1);
7380 unsigned ShiftVal = AArch64_AM::getShifterImm(ST: ShType, Imm: Val);
7381
7382 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ShiftReg); },
7383 [=](MachineInstrBuilder &MIB) { MIB.addImm(Val: ShiftVal); }}};
7384}
7385
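// Map an instruction to the AArch64 extend kind it implements, if any. For
// example (illustrative), a G_SEXT from s32 maps to SXTW, and a G_AND with a
// 0xFF mask acts like UXTB outside of load/store addressing. Returns
// InvalidShiftExtend when no extend can be inferred.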
7386AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
7387 MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
7388 unsigned Opc = MI.getOpcode();
7389
7390 // Handle explicit extend instructions first.
7391 if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
7392 unsigned Size;
7393 if (Opc == TargetOpcode::G_SEXT)
7394 Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7395 else
7396 Size = MI.getOperand(i: 2).getImm();
7397 assert(Size != 64 && "Extend from 64 bits?");
7398 switch (Size) {
7399 case 8:
7400 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
7401 case 16:
7402 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
7403 case 32:
7404 return AArch64_AM::SXTW;
7405 default:
7406 return AArch64_AM::InvalidShiftExtend;
7407 }
7408 }
7409
7410 if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
7411 unsigned Size = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
7412 assert(Size != 64 && "Extend from 64 bits?");
7413 switch (Size) {
7414 case 8:
7415 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
7416 case 16:
7417 return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
7418 case 32:
7419 return AArch64_AM::UXTW;
7420 default:
7421 return AArch64_AM::InvalidShiftExtend;
7422 }
7423 }
7424
7425 // Don't have an explicit extend. Try to handle a G_AND with a constant mask
7426 // on the RHS.
7427 if (Opc != TargetOpcode::G_AND)
7428 return AArch64_AM::InvalidShiftExtend;
7429
7430 std::optional<uint64_t> MaybeAndMask = getImmedFromMO(Root: MI.getOperand(i: 2));
7431 if (!MaybeAndMask)
7432 return AArch64_AM::InvalidShiftExtend;
7433 uint64_t AndMask = *MaybeAndMask;
7434 switch (AndMask) {
7435 default:
7436 return AArch64_AM::InvalidShiftExtend;
7437 case 0xFF:
7438 return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
7439 case 0xFFFF:
7440 return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
7441 case 0xFFFFFFFF:
7442 return AArch64_AM::UXTW;
7443 }
7444}
7445
7446Register AArch64InstructionSelector::moveScalarRegClass(
7447 Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
7448 MachineRegisterInfo &MRI = *MIB.getMRI();
7449 auto Ty = MRI.getType(Reg);
7450 assert(!Ty.isVector() && "Expected scalars only!");
7451 if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
7452 return Reg;
7453
7454 // Create a copy and immediately select it.
7455 // FIXME: We should have an emitCopy function?
7456 auto Copy = MIB.buildCopy(Res: {&RC}, Op: {Reg});
7457 selectCopy(*Copy, TII, MRI, TRI, RBI);
7458 return Copy.getReg(Idx: 0);
7459}
7460
7461/// Select an "extended register" operand. This operand folds in an extend
7462/// followed by an optional left shift.
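/// For example (illustrative), a root defined by
/// "%v = G_SHL (G_SEXT %w:s32), 2" can be rendered as the operand pair
/// (%w, sxtw #2); left-shift amounts greater than 4 are rejected below.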
7463InstructionSelector::ComplexRendererFns
7464AArch64InstructionSelector::selectArithExtendedRegister(
7465 MachineOperand &Root) const {
7466 if (!Root.isReg())
7467 return std::nullopt;
7468 MachineRegisterInfo &MRI =
7469 Root.getParent()->getParent()->getParent()->getRegInfo();
7470
7471 uint64_t ShiftVal = 0;
7472 Register ExtReg;
7473 AArch64_AM::ShiftExtendType Ext;
7474 MachineInstr *RootDef = getDefIgnoringCopies(Reg: Root.getReg(), MRI);
7475 if (!RootDef)
7476 return std::nullopt;
7477
7478 if (!isWorthFoldingIntoExtendedReg(MI&: *RootDef, MRI))
7479 return std::nullopt;
7480
7481 // Check if we can fold a shift and an extend.
7482 if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
7483 // Look for a constant on the RHS of the shift.
7484 MachineOperand &RHS = RootDef->getOperand(i: 2);
7485 std::optional<uint64_t> MaybeShiftVal = getImmedFromMO(Root: RHS);
7486 if (!MaybeShiftVal)
7487 return std::nullopt;
7488 ShiftVal = *MaybeShiftVal;
7489 if (ShiftVal > 4)
7490 return std::nullopt;
7491 // Look for a valid extend instruction on the LHS of the shift.
7492 MachineOperand &LHS = RootDef->getOperand(i: 1);
7493 MachineInstr *ExtDef = getDefIgnoringCopies(Reg: LHS.getReg(), MRI);
7494 if (!ExtDef)
7495 return std::nullopt;
7496 Ext = getExtendTypeForInst(MI&: *ExtDef, MRI);
7497 if (Ext == AArch64_AM::InvalidShiftExtend)
7498 return std::nullopt;
7499 ExtReg = ExtDef->getOperand(i: 1).getReg();
7500 } else {
7501 // Didn't get a shift. Try just folding an extend.
7502 Ext = getExtendTypeForInst(MI&: *RootDef, MRI);
7503 if (Ext == AArch64_AM::InvalidShiftExtend)
7504 return std::nullopt;
7505 ExtReg = RootDef->getOperand(i: 1).getReg();
7506
7507 // If we have a 32 bit instruction which zeroes out the high half of a
7508 // register, we get an implicit zero extend for free. Check if we have one.
7509 // FIXME: We actually emit the extend right now even though we don't have
7510 // to.
7511 if (Ext == AArch64_AM::UXTW && MRI.getType(Reg: ExtReg).getSizeInBits() == 32) {
7512 MachineInstr *ExtInst = MRI.getVRegDef(Reg: ExtReg);
7513 if (isDef32(MI: *ExtInst))
7514 return std::nullopt;
7515 }
7516 }
7517
7518 // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
7519 // copy.
7520 MachineIRBuilder MIB(*RootDef);
7521 ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
7522
7523 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); },
7524 [=](MachineInstrBuilder &MIB) {
7525 MIB.addImm(Val: getArithExtendImm(ET: Ext, Imm: ShiftVal));
7526 }}};
7527}
7528
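/// Match the "high half" of a vector value for patterns that consume it
/// directly: either the second result of a G_UNMERGE_VALUES, or a
/// G_EXTRACT_VECTOR_ELT of lane 1 from a <2 x s64> source, looking through
/// little-endian bitcasts. In both cases the full-width source vector
/// register is rendered.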
7529InstructionSelector::ComplexRendererFns
7530AArch64InstructionSelector::selectExtractHigh(MachineOperand &Root) const {
7531 if (!Root.isReg())
7532 return std::nullopt;
7533 MachineRegisterInfo &MRI =
7534 Root.getParent()->getParent()->getParent()->getRegInfo();
7535
7536 auto Extract = getDefSrcRegIgnoringCopies(Reg: Root.getReg(), MRI);
7537 while (Extract && Extract->MI->getOpcode() == TargetOpcode::G_BITCAST &&
7538 STI.isLittleEndian())
7539 Extract =
7540 getDefSrcRegIgnoringCopies(Reg: Extract->MI->getOperand(i: 1).getReg(), MRI);
7541 if (!Extract)
7542 return std::nullopt;
7543
7544 if (Extract->MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) {
7545 if (Extract->Reg == Extract->MI->getOperand(i: 1).getReg()) {
7546 Register ExtReg = Extract->MI->getOperand(i: 2).getReg();
7547 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7548 }
7549 }
7550 if (Extract->MI->getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) {
7551 LLT SrcTy = MRI.getType(Reg: Extract->MI->getOperand(i: 1).getReg());
7552 auto LaneIdx = getIConstantVRegValWithLookThrough(
7553 VReg: Extract->MI->getOperand(i: 2).getReg(), MRI);
7554 if (LaneIdx && SrcTy == LLT::fixed_vector(NumElements: 2, ScalarSizeInBits: 64) &&
7555 LaneIdx->Value.getSExtValue() == 1) {
7556 Register ExtReg = Extract->MI->getOperand(i: 1).getReg();
7557 return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(RegNo: ExtReg); }}};
7558 }
7559 }
7560
7561 return std::nullopt;
7562}
7563
7564void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
7565 const MachineInstr &MI,
7566 int OpIdx) const {
7567 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7568 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7569 "Expected G_CONSTANT");
7570 std::optional<int64_t> CstVal =
7571 getIConstantVRegSExtVal(VReg: MI.getOperand(i: 0).getReg(), MRI);
7572 assert(CstVal && "Expected constant value");
7573 MIB.addImm(Val: *CstVal);
7574}
7575
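// The next two renderers emit a G_CONSTANT as an AArch64 logical ("bitmask")
// immediate, as used by AND/ORR/EOR. encodeLogicalImmediate packs the value
// into the N:immr:imms encoding; the constant is expected to have already
// been checked for encodability by the matcher.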
7576void AArch64InstructionSelector::renderLogicalImm32(
7577 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7578 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7579 "Expected G_CONSTANT");
7580 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7581 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 32);
7582 MIB.addImm(Val: Enc);
7583}
7584
7585void AArch64InstructionSelector::renderLogicalImm64(
7586 MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
7587 assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
7588 "Expected G_CONSTANT");
7589 uint64_t CstVal = I.getOperand(i: 1).getCImm()->getZExtValue();
7590 uint64_t Enc = AArch64_AM::encodeLogicalImmediate(imm: CstVal, regSize: 64);
7591 MIB.addImm(Val: Enc);
7592}
7593
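// Render the immediate of the BRK emitted for G_UBSANTRAP: the low byte holds
// the UBSan check kind taken from the instruction, and 'U' (0x55) is placed in
// the high byte, giving the 0x55xx immediates conventionally used for UBSan
// traps.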
7594void AArch64InstructionSelector::renderUbsanTrap(MachineInstrBuilder &MIB,
7595 const MachineInstr &MI,
7596 int OpIdx) const {
7597 assert(MI.getOpcode() == TargetOpcode::G_UBSANTRAP && OpIdx == 0 &&
7598 "Expected G_UBSANTRAP");
7599 MIB.addImm(Val: MI.getOperand(i: 0).getImm() | ('U' << 8));
7600}
7601
7602void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
7603 const MachineInstr &MI,
7604 int OpIdx) const {
7605 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7606 "Expected G_FCONSTANT");
7607 MIB.addImm(
7608 Val: AArch64_AM::getFP16Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7609}
7610
7611void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
7612 const MachineInstr &MI,
7613 int OpIdx) const {
7614 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7615 "Expected G_FCONSTANT");
7616 MIB.addImm(
7617 Val: AArch64_AM::getFP32Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7618}
7619
7620void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
7621 const MachineInstr &MI,
7622 int OpIdx) const {
7623 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7624 "Expected G_FCONSTANT");
7625 MIB.addImm(
7626 Val: AArch64_AM::getFP64Imm(FPImm: MI.getOperand(i: 1).getFPImm()->getValueAPF()));
7627}
7628
7629void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
7630 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
7631 assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
7632 "Expected G_FCONSTANT");
7633 MIB.addImm(Val: AArch64_AM::encodeAdvSIMDModImmType4(Imm: MI.getOperand(i: 1)
7634 .getFPImm()
7635 ->getValueAPF()
7636 .bitcastToAPInt()
7637 .getZExtValue()));
7638}
7639
7640bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
7641 const MachineInstr &MI, unsigned NumBytes) const {
7642 if (!MI.mayLoadOrStore())
7643 return false;
7644 assert(MI.hasOneMemOperand() &&
7645 "Expected load/store to have only one mem op!");
7646 return (*MI.memoperands_begin())->getSize() == NumBytes;
7647}
7648
7649bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
7650 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
7651 if (MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits() != 32)
7652 return false;
7653
7654 // Only return true if we know the operation will zero-out the high half of
7655 // the 64-bit register. Truncates can be subregister copies, which don't
7656 // zero out the high bits. Copies and other copy-like instructions can be
7657 // fed by truncates, or could be lowered as subregister copies.
7658 switch (MI.getOpcode()) {
7659 default:
7660 return true;
7661 case TargetOpcode::COPY:
7662 case TargetOpcode::G_BITCAST:
7663 case TargetOpcode::G_TRUNC:
7664 case TargetOpcode::G_PHI:
7665 return false;
7666 }
7667}
7668
7669
7670// Perform fixups on the given PHI instruction's operands to force them all
7671// to be the same as the destination regbank.
7672static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
7673 const AArch64RegisterBankInfo &RBI) {
7674 assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
7675 Register DstReg = MI.getOperand(i: 0).getReg();
7676 const RegisterBank *DstRB = MRI.getRegBankOrNull(Reg: DstReg);
7677 assert(DstRB && "Expected PHI dst to have regbank assigned");
7678 MachineIRBuilder MIB(MI);
7679
7680 // Go through each operand and ensure it has the same regbank.
7681 for (MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
7682 if (!MO.isReg())
7683 continue;
7684 Register OpReg = MO.getReg();
7685 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: OpReg);
7686 if (RB != DstRB) {
7687 // Insert a cross-bank copy.
7688 auto *OpDef = MRI.getVRegDef(Reg: OpReg);
7689 const LLT &Ty = MRI.getType(Reg: OpReg);
7690 MachineBasicBlock &OpDefBB = *OpDef->getParent();
7691
7692 // Any instruction we insert must appear after all PHIs in the block
7693 // for the block to be valid MIR.
7694 MachineBasicBlock::iterator InsertPt = std::next(x: OpDef->getIterator());
7695 if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
7696 InsertPt = OpDefBB.getFirstNonPHI();
7697 MIB.setInsertPt(MBB&: *OpDef->getParent(), II: InsertPt);
7698 auto Copy = MIB.buildCopy(Res: Ty, Op: OpReg);
7699 MRI.setRegBank(Reg: Copy.getReg(Idx: 0), RegBank: *DstRB);
7700 MO.setReg(Copy.getReg(Idx: 0));
7701 }
7702 }
7703}
7704
7705void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
7706 // We're looking for PHIs; build a list first so we don't invalidate iterators.
7707 MachineRegisterInfo &MRI = MF.getRegInfo();
7708 SmallVector<MachineInstr *, 32> Phis;
7709 for (auto &BB : MF) {
7710 for (auto &MI : BB) {
7711 if (MI.getOpcode() == TargetOpcode::G_PHI)
7712 Phis.emplace_back(Args: &MI);
7713 }
7714 }
7715
7716 for (auto *MI : Phis) {
7717 // We need to do some work here if the operand types are < 16 bit and they
7718 // are split across fpr/gpr banks. Since all types <32b on gpr
7719 // end up being assigned gpr32 regclasses, we can end up with PHIs here
7720 // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
7721 // be selecting heterogeneous regbanks for operands if possible, but we
7722 // still need to be able to deal with it here.
7723 //
7724 // To fix this, if we have a gpr-bank operand < 32b in size and at least
7725 // one other operand is on the fpr bank, then we add cross-bank copies
7726 // to homogenize the operand banks. For simplicity the bank that we choose
7727 // to settle on is whatever bank the def operand has. For example:
7728 //
7729 // %endbb:
7730 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
7731 // =>
7732 // %bb2:
7733 // ...
7734 // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
7735 // ...
7736 // %endbb:
7737 // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
7738 bool HasGPROp = false, HasFPROp = false;
7739 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI->operands())) {
7740 if (!MO.isReg())
7741 continue;
7742 const LLT &Ty = MRI.getType(Reg: MO.getReg());
7743 if (!Ty.isValid() || !Ty.isScalar())
7744 break;
7745 if (Ty.getSizeInBits() >= 32)
7746 break;
7747 const RegisterBank *RB = MRI.getRegBankOrNull(Reg: MO.getReg());
7748 // If for some reason we don't have a regbank yet, don't try anything.
7749 if (!RB)
7750 break;
7751
7752 if (RB->getID() == AArch64::GPRRegBankID)
7753 HasGPROp = true;
7754 else
7755 HasFPROp = true;
7756 }
7757 // We have heterogeneous regbanks, so we need to fix them up.
7758 if (HasGPROp && HasFPROp)
7759 fixupPHIOpBanks(MI&: *MI, MRI, RBI);
7760 }
7761}
7762
7763namespace llvm {
7764InstructionSelector *
7765createAArch64InstructionSelector(const AArch64TargetMachine &TM,
7766 AArch64Subtarget &Subtarget,
7767 AArch64RegisterBankInfo &RBI) {
7768 return new AArch64InstructionSelector(TM, Subtarget, RBI);
7769}
7770}
7771