1//===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file implements the LegalizerHelper class to legalize
10/// individual instructions and the LegalizeMachineIR wrapper pass for the
11/// primary legalization.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16#include "llvm/CodeGen/GlobalISel/CallLowering.h"
17#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
19#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24#include "llvm/CodeGen/GlobalISel/Utils.h"
25#include "llvm/CodeGen/MachineConstantPool.h"
26#include "llvm/CodeGen/MachineFrameInfo.h"
27#include "llvm/CodeGen/MachineRegisterInfo.h"
28#include "llvm/CodeGen/RuntimeLibcalls.h"
29#include "llvm/CodeGen/TargetFrameLowering.h"
30#include "llvm/CodeGen/TargetInstrInfo.h"
31#include "llvm/CodeGen/TargetLowering.h"
32#include "llvm/CodeGen/TargetOpcodes.h"
33#include "llvm/CodeGen/TargetSubtargetInfo.h"
34#include "llvm/IR/Instructions.h"
35#include "llvm/Support/Debug.h"
36#include "llvm/Support/MathExtras.h"
37#include "llvm/Support/raw_ostream.h"
38#include "llvm/Target/TargetMachine.h"
39#include <numeric>
40#include <optional>
41
42#define DEBUG_TYPE "legalizer"
43
44using namespace llvm;
45using namespace LegalizeActions;
46using namespace MIPatternMatch;
47
48/// Try to break down \p OrigTy into \p NarrowTy sized pieces.
49///
50/// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
51/// with any leftover piece as type \p LeftoverTy
52///
53/// Returns -1 in the first element of the pair if the breakdown is not
54/// satisfiable.
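///
/// For example (illustrative): breaking an s88 \p OrigTy into s32 \p NarrowTy
/// pieces yields {2, 1} with an s24 \p LeftoverTy, while breaking s64 into s32
/// yields {2, 0} and leaves \p LeftoverTy invalid.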
55static std::pair<int, int>
56getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
57 assert(!LeftoverTy.isValid() && "this is an out argument");
58
59 unsigned Size = OrigTy.getSizeInBits();
60 unsigned NarrowSize = NarrowTy.getSizeInBits();
61 unsigned NumParts = Size / NarrowSize;
62 unsigned LeftoverSize = Size - NumParts * NarrowSize;
63 assert(Size > NarrowSize);
64
65 if (LeftoverSize == 0)
66 return {NumParts, 0};
67
68 if (NarrowTy.isVector()) {
69 unsigned EltSize = OrigTy.getScalarSizeInBits();
70 if (LeftoverSize % EltSize != 0)
71 return {-1, -1};
    LeftoverTy = LLT::scalarOrVector(
        ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
  } else {
    LeftoverTy = LLT::scalar(LeftoverSize);
  }

  int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
  return std::make_pair(NumParts, NumLeftover);
80}
81
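/// Return the IR floating-point type corresponding to a scalar \p Ty of the
/// given bit width (e.g. s32 -> float, s80 -> x86_fp80), or nullptr if \p Ty
/// is not a scalar or has no matching float type.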
82static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
83
84 if (!Ty.isScalar())
85 return nullptr;
86
87 switch (Ty.getSizeInBits()) {
88 case 16:
89 return Type::getHalfTy(C&: Ctx);
90 case 32:
91 return Type::getFloatTy(C&: Ctx);
92 case 64:
93 return Type::getDoubleTy(C&: Ctx);
94 case 80:
95 return Type::getX86_FP80Ty(C&: Ctx);
96 case 128:
97 return Type::getFP128Ty(C&: Ctx);
98 default:
99 return nullptr;
100 }
101}
102
103LegalizerHelper::LegalizerHelper(MachineFunction &MF,
104 GISelChangeObserver &Observer,
105 MachineIRBuilder &Builder)
106 : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
107 LI(*MF.getSubtarget().getLegalizerInfo()),
108 TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}
109
110LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
111 GISelChangeObserver &Observer,
112 MachineIRBuilder &B, GISelKnownBits *KB)
113 : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
114 TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}
115
116LegalizerHelper::LegalizeResult
117LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
118 LostDebugLocObserver &LocObserver) {
119 LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
120
121 MIRBuilder.setInstrAndDebugLoc(MI);
122
123 if (isa<GIntrinsic>(Val: MI))
124 return LI.legalizeIntrinsic(Helper&: *this, MI) ? Legalized : UnableToLegalize;
125 auto Step = LI.getAction(MI, MRI);
126 switch (Step.Action) {
127 case Legal:
128 LLVM_DEBUG(dbgs() << ".. Already legal\n");
129 return AlreadyLegal;
130 case Libcall:
131 LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
132 return libcall(MI, LocObserver);
133 case NarrowScalar:
134 LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
135 return narrowScalar(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
136 case WidenScalar:
137 LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
138 return widenScalar(MI, TypeIdx: Step.TypeIdx, WideTy: Step.NewType);
139 case Bitcast:
140 LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
141 return bitcast(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
142 case Lower:
143 LLVM_DEBUG(dbgs() << ".. Lower\n");
144 return lower(MI, TypeIdx: Step.TypeIdx, Ty: Step.NewType);
145 case FewerElements:
146 LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
147 return fewerElementsVector(MI, TypeIdx: Step.TypeIdx, NarrowTy: Step.NewType);
148 case MoreElements:
149 LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
150 return moreElementsVector(MI, TypeIdx: Step.TypeIdx, MoreTy: Step.NewType);
151 case Custom:
152 LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
153 return LI.legalizeCustom(Helper&: *this, MI, LocObserver) ? Legalized
154 : UnableToLegalize;
155 default:
156 LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
157 return UnableToLegalize;
158 }
159}
160
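/// Recombine \p PartRegs (of type \p PartTy) and any \p LeftoverRegs (of type
/// \p LeftoverTy) into \p DstReg of type \p ResultTy. As an illustrative
/// example, an s88 destination assembled from two s32 parts and one s24
/// leftover is re-split into pieces of their common (GCD) type before being
/// merged back into the destination.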
161void LegalizerHelper::insertParts(Register DstReg,
162 LLT ResultTy, LLT PartTy,
163 ArrayRef<Register> PartRegs,
164 LLT LeftoverTy,
165 ArrayRef<Register> LeftoverRegs) {
166 if (!LeftoverTy.isValid()) {
167 assert(LeftoverRegs.empty());
168
169 if (!ResultTy.isVector()) {
170 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: PartRegs);
171 return;
172 }
173
174 if (PartTy.isVector())
175 MIRBuilder.buildConcatVectors(Res: DstReg, Ops: PartRegs);
176 else
177 MIRBuilder.buildBuildVector(Res: DstReg, Ops: PartRegs);
178 return;
179 }
180
  // Merge sub-vectors with different numbers of elements into DstReg.
182 if (ResultTy.isVector()) {
183 assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
184 SmallVector<Register, 8> AllRegs;
185 for (auto Reg : concat<const Register>(Ranges&: PartRegs, Ranges&: LeftoverRegs))
186 AllRegs.push_back(Elt: Reg);
187 return mergeMixedSubvectors(DstReg, PartRegs: AllRegs);
188 }
189
190 SmallVector<Register> GCDRegs;
191 LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: ResultTy, TargetTy: LeftoverTy), TargetTy: PartTy);
192 for (auto PartReg : concat<const Register>(Ranges&: PartRegs, Ranges&: LeftoverRegs))
193 extractGCDType(Parts&: GCDRegs, GCDTy, SrcReg: PartReg);
194 LLT ResultLCMTy = buildLCMMergePieces(DstTy: ResultTy, NarrowTy: LeftoverTy, GCDTy, VRegs&: GCDRegs);
195 buildWidenedRemergeToDst(DstReg, LCMTy: ResultLCMTy, RemergeRegs: GCDRegs);
196}
197
198void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
199 Register Reg) {
200 LLT Ty = MRI.getType(Reg);
201 SmallVector<Register, 8> RegElts;
202 extractParts(Reg, Ty: Ty.getScalarType(), NumParts: Ty.getNumElements(), VRegs&: RegElts,
203 MIRBuilder, MRI);
204 Elts.append(RHS: RegElts);
205}
206
207/// Merge \p PartRegs with different types into \p DstReg.
208void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
209 ArrayRef<Register> PartRegs) {
210 SmallVector<Register, 8> AllElts;
211 for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
212 appendVectorElts(Elts&: AllElts, Reg: PartRegs[i]);
213
214 Register Leftover = PartRegs[PartRegs.size() - 1];
215 if (MRI.getType(Reg: Leftover).isScalar())
216 AllElts.push_back(Elt: Leftover);
217 else
218 appendVectorElts(Elts&: AllElts, Reg: Leftover);
219
220 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: AllElts);
221}
222
223/// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
224static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
225 const MachineInstr &MI) {
226 assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
227
228 const int StartIdx = Regs.size();
229 const int NumResults = MI.getNumOperands() - 1;
230 Regs.resize(N: Regs.size() + NumResults);
231 for (int I = 0; I != NumResults; ++I)
232 Regs[StartIdx + I] = MI.getOperand(i: I).getReg();
233}
234
235void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
236 LLT GCDTy, Register SrcReg) {
237 LLT SrcTy = MRI.getType(Reg: SrcReg);
238 if (SrcTy == GCDTy) {
239 // If the source already evenly divides the result type, we don't need to do
240 // anything.
241 Parts.push_back(Elt: SrcReg);
242 } else {
243 // Need to split into common type sized pieces.
244 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
245 getUnmergeResults(Regs&: Parts, MI: *Unmerge);
246 }
247}
248
249LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
250 LLT NarrowTy, Register SrcReg) {
251 LLT SrcTy = MRI.getType(Reg: SrcReg);
252 LLT GCDTy = getGCDType(OrigTy: getGCDType(OrigTy: SrcTy, TargetTy: NarrowTy), TargetTy: DstTy);
253 extractGCDType(Parts, GCDTy, SrcReg);
254 return GCDTy;
255}
256
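/// Pad and regroup the \p GCDTy pieces in \p VRegs so that they cover the
/// least common multiple of \p DstTy and \p NarrowTy, using \p PadStrategy
/// (G_ZEXT, G_SEXT or G_ANYEXT) to synthesize the extra bits. On return,
/// \p VRegs holds the \p NarrowTy-sized merge results and the LCM type is
/// returned.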
257LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
258 SmallVectorImpl<Register> &VRegs,
259 unsigned PadStrategy) {
260 LLT LCMTy = getLCMType(OrigTy: DstTy, TargetTy: NarrowTy);
261
262 int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
263 int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
264 int NumOrigSrc = VRegs.size();
265
266 Register PadReg;
267
268 // Get a value we can use to pad the source value if the sources won't evenly
269 // cover the result type.
270 if (NumOrigSrc < NumParts * NumSubParts) {
271 if (PadStrategy == TargetOpcode::G_ZEXT)
272 PadReg = MIRBuilder.buildConstant(Res: GCDTy, Val: 0).getReg(Idx: 0);
273 else if (PadStrategy == TargetOpcode::G_ANYEXT)
274 PadReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
275 else {
276 assert(PadStrategy == TargetOpcode::G_SEXT);
277
278 // Shift the sign bit of the low register through the high register.
279 auto ShiftAmt =
280 MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 64), Val: GCDTy.getSizeInBits() - 1);
281 PadReg = MIRBuilder.buildAShr(Dst: GCDTy, Src0: VRegs.back(), Src1: ShiftAmt).getReg(Idx: 0);
282 }
283 }
284
285 // Registers for the final merge to be produced.
286 SmallVector<Register, 4> Remerge(NumParts);
287
288 // Registers needed for intermediate merges, which will be merged into a
289 // source for Remerge.
290 SmallVector<Register, 4> SubMerge(NumSubParts);
291
292 // Once we've fully read off the end of the original source bits, we can reuse
293 // the same high bits for remaining padding elements.
294 Register AllPadReg;
295
296 // Build merges to the LCM type to cover the original result type.
297 for (int I = 0; I != NumParts; ++I) {
298 bool AllMergePartsArePadding = true;
299
300 // Build the requested merges to the requested type.
301 for (int J = 0; J != NumSubParts; ++J) {
302 int Idx = I * NumSubParts + J;
303 if (Idx >= NumOrigSrc) {
304 SubMerge[J] = PadReg;
305 continue;
306 }
307
308 SubMerge[J] = VRegs[Idx];
309
310 // There are meaningful bits here we can't reuse later.
311 AllMergePartsArePadding = false;
312 }
313
314 // If we've filled up a complete piece with padding bits, we can directly
315 // emit the natural sized constant if applicable, rather than a merge of
316 // smaller constants.
317 if (AllMergePartsArePadding && !AllPadReg) {
318 if (PadStrategy == TargetOpcode::G_ANYEXT)
319 AllPadReg = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
320 else if (PadStrategy == TargetOpcode::G_ZEXT)
321 AllPadReg = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0).getReg(Idx: 0);
322
323 // If this is a sign extension, we can't materialize a trivial constant
324 // with the right type and have to produce a merge.
325 }
326
327 if (AllPadReg) {
328 // Avoid creating additional instructions if we're just adding additional
329 // copies of padding bits.
330 Remerge[I] = AllPadReg;
331 continue;
332 }
333
334 if (NumSubParts == 1)
335 Remerge[I] = SubMerge[0];
336 else
337 Remerge[I] = MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: SubMerge).getReg(Idx: 0);
338
339 // In the sign extend padding case, re-use the first all-signbit merge.
340 if (AllMergePartsArePadding && !AllPadReg)
341 AllPadReg = Remerge[I];
342 }
343
344 VRegs = std::move(Remerge);
345 return LCMTy;
346}
347
348void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
349 ArrayRef<Register> RemergeRegs) {
350 LLT DstTy = MRI.getType(Reg: DstReg);
351
352 // Create the merge to the widened source, and extract the relevant bits into
353 // the result.
354
355 if (DstTy == LCMTy) {
356 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: RemergeRegs);
357 return;
358 }
359
360 auto Remerge = MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs);
361 if (DstTy.isScalar() && LCMTy.isScalar()) {
362 MIRBuilder.buildTrunc(Res: DstReg, Op: Remerge);
363 return;
364 }
365
366 if (LCMTy.isVector()) {
367 unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
368 SmallVector<Register, 8> UnmergeDefs(NumDefs);
369 UnmergeDefs[0] = DstReg;
370 for (unsigned I = 1; I != NumDefs; ++I)
371 UnmergeDefs[I] = MRI.createGenericVirtualRegister(Ty: DstTy);
372
373 MIRBuilder.buildUnmerge(Res: UnmergeDefs,
374 Op: MIRBuilder.buildMergeLikeInstr(Res: LCMTy, Ops: RemergeRegs));
375 return;
376 }
377
378 llvm_unreachable("unhandled case");
379}
380
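// Map a generic opcode and scalar size to the corresponding runtime library
// call, e.g. G_FREM at 64 bits selects RTLIB::REM_F64 (typically fmod) and
// G_SDIV at 32 bits selects RTLIB::SDIV_I32 (typically __divsi3).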
381static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
382#define RTLIBCASE_INT(LibcallPrefix) \
383 do { \
384 switch (Size) { \
385 case 32: \
386 return RTLIB::LibcallPrefix##32; \
387 case 64: \
388 return RTLIB::LibcallPrefix##64; \
389 case 128: \
390 return RTLIB::LibcallPrefix##128; \
391 default: \
392 llvm_unreachable("unexpected size"); \
393 } \
394 } while (0)
395
396#define RTLIBCASE(LibcallPrefix) \
397 do { \
398 switch (Size) { \
399 case 32: \
400 return RTLIB::LibcallPrefix##32; \
401 case 64: \
402 return RTLIB::LibcallPrefix##64; \
403 case 80: \
404 return RTLIB::LibcallPrefix##80; \
405 case 128: \
406 return RTLIB::LibcallPrefix##128; \
407 default: \
408 llvm_unreachable("unexpected size"); \
409 } \
410 } while (0)
411
412 switch (Opcode) {
413 case TargetOpcode::G_MUL:
414 RTLIBCASE_INT(MUL_I);
415 case TargetOpcode::G_SDIV:
416 RTLIBCASE_INT(SDIV_I);
417 case TargetOpcode::G_UDIV:
418 RTLIBCASE_INT(UDIV_I);
419 case TargetOpcode::G_SREM:
420 RTLIBCASE_INT(SREM_I);
421 case TargetOpcode::G_UREM:
422 RTLIBCASE_INT(UREM_I);
423 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
424 RTLIBCASE_INT(CTLZ_I);
425 case TargetOpcode::G_FADD:
426 RTLIBCASE(ADD_F);
427 case TargetOpcode::G_FSUB:
428 RTLIBCASE(SUB_F);
429 case TargetOpcode::G_FMUL:
430 RTLIBCASE(MUL_F);
431 case TargetOpcode::G_FDIV:
432 RTLIBCASE(DIV_F);
433 case TargetOpcode::G_FEXP:
434 RTLIBCASE(EXP_F);
435 case TargetOpcode::G_FEXP2:
436 RTLIBCASE(EXP2_F);
437 case TargetOpcode::G_FEXP10:
438 RTLIBCASE(EXP10_F);
439 case TargetOpcode::G_FREM:
440 RTLIBCASE(REM_F);
441 case TargetOpcode::G_FPOW:
442 RTLIBCASE(POW_F);
443 case TargetOpcode::G_FPOWI:
444 RTLIBCASE(POWI_F);
445 case TargetOpcode::G_FMA:
446 RTLIBCASE(FMA_F);
447 case TargetOpcode::G_FSIN:
448 RTLIBCASE(SIN_F);
449 case TargetOpcode::G_FCOS:
450 RTLIBCASE(COS_F);
451 case TargetOpcode::G_FLOG10:
452 RTLIBCASE(LOG10_F);
453 case TargetOpcode::G_FLOG:
454 RTLIBCASE(LOG_F);
455 case TargetOpcode::G_FLOG2:
456 RTLIBCASE(LOG2_F);
457 case TargetOpcode::G_FLDEXP:
458 RTLIBCASE(LDEXP_F);
459 case TargetOpcode::G_FCEIL:
460 RTLIBCASE(CEIL_F);
461 case TargetOpcode::G_FFLOOR:
462 RTLIBCASE(FLOOR_F);
463 case TargetOpcode::G_FMINNUM:
464 RTLIBCASE(FMIN_F);
465 case TargetOpcode::G_FMAXNUM:
466 RTLIBCASE(FMAX_F);
467 case TargetOpcode::G_FSQRT:
468 RTLIBCASE(SQRT_F);
469 case TargetOpcode::G_FRINT:
470 RTLIBCASE(RINT_F);
471 case TargetOpcode::G_FNEARBYINT:
472 RTLIBCASE(NEARBYINT_F);
473 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
474 RTLIBCASE(ROUNDEVEN_F);
475 }
476 llvm_unreachable("Unknown libcall function");
477}
478
479/// True if an instruction is in tail position in its caller. Intended for
480/// legalizing libcalls as tail calls when possible.
481static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
482 MachineInstr &MI,
483 const TargetInstrInfo &TII,
484 MachineRegisterInfo &MRI) {
485 MachineBasicBlock &MBB = *MI.getParent();
486 const Function &F = MBB.getParent()->getFunction();
487
488 // Conservatively require the attributes of the call to match those of
489 // the return. Ignore NoAlias and NonNull because they don't affect the
490 // call sequence.
491 AttributeList CallerAttrs = F.getAttributes();
492 if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
493 .removeAttribute(Attribute::NoAlias)
494 .removeAttribute(Attribute::NonNull)
495 .hasAttributes())
496 return false;
497
498 // It's not safe to eliminate the sign / zero extension of the return value.
499 if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
500 CallerAttrs.hasRetAttr(Attribute::SExt))
501 return false;
502
503 // Only tail call if the following instruction is a standard return or if we
504 // have a `thisreturn` callee, and a sequence like:
505 //
506 // G_MEMCPY %0, %1, %2
507 // $x0 = COPY %0
508 // RET_ReallyLR implicit $x0
509 auto Next = next_nodbg(It: MI.getIterator(), End: MBB.instr_end());
510 if (Next != MBB.instr_end() && Next->isCopy()) {
511 if (MI.getOpcode() == TargetOpcode::G_BZERO)
512 return false;
513
    // For MEMCPY/MEMMOVE/MEMSET these will be the first use (the dst), as the
    // memcpy/etc routines return the same parameter. For others it will be the
    // returned value.
517 Register VReg = MI.getOperand(i: 0).getReg();
518 if (!VReg.isVirtual() || VReg != Next->getOperand(i: 1).getReg())
519 return false;
520
521 Register PReg = Next->getOperand(i: 0).getReg();
522 if (!PReg.isPhysical())
523 return false;
524
525 auto Ret = next_nodbg(It: Next, End: MBB.instr_end());
526 if (Ret == MBB.instr_end() || !Ret->isReturn())
527 return false;
528
529 if (Ret->getNumImplicitOperands() != 1)
530 return false;
531
532 if (!Ret->getOperand(i: 0).isReg() || PReg != Ret->getOperand(i: 0).getReg())
533 return false;
534
535 // Skip over the COPY that we just validated.
536 Next = Ret;
537 }
538
539 if (Next == MBB.instr_end() || TII.isTailCall(Inst: *Next) || !Next->isReturn())
540 return false;
541
542 return true;
543}
544
545LegalizerHelper::LegalizeResult
546llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
547 const CallLowering::ArgInfo &Result,
548 ArrayRef<CallLowering::ArgInfo> Args,
549 const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
550 MachineInstr *MI) {
551 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
552
553 CallLowering::CallLoweringInfo Info;
554 Info.CallConv = CC;
555 Info.Callee = MachineOperand::CreateES(SymName: Name);
556 Info.OrigRet = Result;
557 if (MI)
558 Info.IsTailCall =
559 (Result.Ty->isVoidTy() ||
560 Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
561 isLibCallInTailPosition(Result, MI&: *MI, TII: MIRBuilder.getTII(),
562 MRI&: *MIRBuilder.getMRI());
563
564 std::copy(first: Args.begin(), last: Args.end(), result: std::back_inserter(x&: Info.OrigArgs));
565 if (!CLI.lowerCall(MIRBuilder, Info))
566 return LegalizerHelper::UnableToLegalize;
567
568 if (MI && Info.LoweredTailCall) {
569 assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
570
571 // Check debug locations before removing the return.
572 LocObserver.checkpoint(CheckDebugLocs: true);
573
574 // We must have a return following the call (or debug insts) to get past
575 // isLibCallInTailPosition.
576 do {
577 MachineInstr *Next = MI->getNextNode();
578 assert(Next &&
579 (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
580 "Expected instr following MI to be return or debug inst?");
581 // We lowered a tail call, so the call is now the return from the block.
582 // Delete the old return.
583 Next->eraseFromParent();
584 } while (MI->getNextNode());
585
586 // We expect to lose the debug location from the return.
587 LocObserver.checkpoint(CheckDebugLocs: false);
588 }
589 return LegalizerHelper::Legalized;
590}
591
592LegalizerHelper::LegalizeResult
593llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
594 const CallLowering::ArgInfo &Result,
595 ArrayRef<CallLowering::ArgInfo> Args,
596 LostDebugLocObserver &LocObserver, MachineInstr *MI) {
597 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
598 const char *Name = TLI.getLibcallName(Call: Libcall);
599 const CallingConv::ID CC = TLI.getLibcallCallingConv(Call: Libcall);
600 return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
601}
602
603// Useful for libcalls where all operands have the same type.
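// For example, a 64-bit G_SREM becomes a call such as __moddi3 (the exact
// symbol is target dependent) with two s64 arguments and an s64 result.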
604static LegalizerHelper::LegalizeResult
605simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
606 Type *OpType, LostDebugLocObserver &LocObserver) {
607 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
608
609 // FIXME: What does the original arg index mean here?
610 SmallVector<CallLowering::ArgInfo, 3> Args;
611 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands()))
612 Args.push_back(Elt: {MO.getReg(), OpType, 0});
613 return createLibcall(MIRBuilder, Libcall,
614 Result: {MI.getOperand(i: 0).getReg(), OpType, 0}, Args,
615 LocObserver, MI: &MI);
616}
617
618LegalizerHelper::LegalizeResult
619llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
620 MachineInstr &MI, LostDebugLocObserver &LocObserver) {
621 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
622
623 SmallVector<CallLowering::ArgInfo, 3> Args;
624 // Add all the args, except for the last which is an imm denoting 'tail'.
625 for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
626 Register Reg = MI.getOperand(i).getReg();
627
    // Need to derive an IR type for call lowering.
629 LLT OpLLT = MRI.getType(Reg);
630 Type *OpTy = nullptr;
631 if (OpLLT.isPointer())
632 OpTy = PointerType::get(C&: Ctx, AddressSpace: OpLLT.getAddressSpace());
633 else
634 OpTy = IntegerType::get(C&: Ctx, NumBits: OpLLT.getSizeInBits());
635 Args.push_back(Elt: {Reg, OpTy, 0});
636 }
637
638 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
639 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
640 RTLIB::Libcall RTLibcall;
641 unsigned Opc = MI.getOpcode();
642 switch (Opc) {
643 case TargetOpcode::G_BZERO:
644 RTLibcall = RTLIB::BZERO;
645 break;
646 case TargetOpcode::G_MEMCPY:
647 RTLibcall = RTLIB::MEMCPY;
648 Args[0].Flags[0].setReturned();
649 break;
650 case TargetOpcode::G_MEMMOVE:
651 RTLibcall = RTLIB::MEMMOVE;
652 Args[0].Flags[0].setReturned();
653 break;
654 case TargetOpcode::G_MEMSET:
655 RTLibcall = RTLIB::MEMSET;
656 Args[0].Flags[0].setReturned();
657 break;
658 default:
659 llvm_unreachable("unsupported opcode");
660 }
661 const char *Name = TLI.getLibcallName(Call: RTLibcall);
662
663 // Unsupported libcall on the target.
664 if (!Name) {
665 LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
666 << MIRBuilder.getTII().getName(Opc) << "\n");
667 return LegalizerHelper::UnableToLegalize;
668 }
669
670 CallLowering::CallLoweringInfo Info;
671 Info.CallConv = TLI.getLibcallCallingConv(Call: RTLibcall);
672 Info.Callee = MachineOperand::CreateES(SymName: Name);
673 Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0);
674 Info.IsTailCall =
675 MI.getOperand(i: MI.getNumOperands() - 1).getImm() &&
676 isLibCallInTailPosition(Result: Info.OrigRet, MI, TII: MIRBuilder.getTII(), MRI);
677
678 std::copy(first: Args.begin(), last: Args.end(), result: std::back_inserter(x&: Info.OrigArgs));
679 if (!CLI.lowerCall(MIRBuilder, Info))
680 return LegalizerHelper::UnableToLegalize;
681
682 if (Info.LoweredTailCall) {
683 assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
684
685 // Check debug locations before removing the return.
686 LocObserver.checkpoint(CheckDebugLocs: true);
687
688 // We must have a return following the call (or debug insts) to get past
689 // isLibCallInTailPosition.
690 do {
691 MachineInstr *Next = MI.getNextNode();
692 assert(Next &&
693 (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
694 "Expected instr following MI to be return or debug inst?");
695 // We lowered a tail call, so the call is now the return from the block.
696 // Delete the old return.
697 Next->eraseFromParent();
698 } while (MI.getNextNode());
699
700 // We expect to lose the debug location from the return.
701 LocObserver.checkpoint(CheckDebugLocs: false);
702 }
703
704 return LegalizerHelper::Legalized;
705}
706
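// Select the outline-atomics helper for an atomic instruction based on its
// size and memory ordering; e.g. a 4-byte G_ATOMICRMW_ADD with acquire
// ordering maps to OUTLINE_ATOMIC_LDADD4_ACQ (__aarch64_ldadd4_acq on
// AArch64).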
707static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
708 unsigned Opc = MI.getOpcode();
709 auto &AtomicMI = cast<GMemOperation>(Val&: MI);
710 auto &MMO = AtomicMI.getMMO();
711 auto Ordering = MMO.getMergedOrdering();
712 LLT MemType = MMO.getMemoryType();
713 uint64_t MemSize = MemType.getSizeInBytes();
714 if (MemType.isVector())
715 return RTLIB::UNKNOWN_LIBCALL;
716
717#define LCALLS(A, B) \
718 { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
719#define LCALL5(A) \
720 LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
721 switch (Opc) {
722 case TargetOpcode::G_ATOMIC_CMPXCHG:
723 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
724 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
725 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
726 }
727 case TargetOpcode::G_ATOMICRMW_XCHG: {
728 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
729 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
730 }
731 case TargetOpcode::G_ATOMICRMW_ADD:
732 case TargetOpcode::G_ATOMICRMW_SUB: {
733 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
734 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
735 }
736 case TargetOpcode::G_ATOMICRMW_AND: {
737 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
738 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
739 }
740 case TargetOpcode::G_ATOMICRMW_OR: {
741 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
742 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
743 }
744 case TargetOpcode::G_ATOMICRMW_XOR: {
745 const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
746 return getOutlineAtomicHelper(LC, Order: Ordering, MemSize);
747 }
748 default:
749 return RTLIB::UNKNOWN_LIBCALL;
750 }
751#undef LCALLS
752#undef LCALL5
753}
754
755static LegalizerHelper::LegalizeResult
756createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
757 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
758
759 Type *RetTy;
760 SmallVector<Register> RetRegs;
761 SmallVector<CallLowering::ArgInfo, 3> Args;
762 unsigned Opc = MI.getOpcode();
763 switch (Opc) {
764 case TargetOpcode::G_ATOMIC_CMPXCHG:
765 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
766 Register Success;
767 LLT SuccessLLT;
768 auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
769 MI.getFirst4RegLLTs();
770 RetRegs.push_back(Elt: Ret);
771 RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
772 if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
      std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
               NewLLT) = MI.getFirst5RegLLTs();
775 RetRegs.push_back(Elt: Success);
776 RetTy = StructType::get(
777 Context&: Ctx, Elements: {RetTy, IntegerType::get(C&: Ctx, NumBits: SuccessLLT.getSizeInBits())});
778 }
779 Args.push_back(Elt: {Cmp, IntegerType::get(C&: Ctx, NumBits: CmpLLT.getSizeInBits()), 0});
780 Args.push_back(Elt: {New, IntegerType::get(C&: Ctx, NumBits: NewLLT.getSizeInBits()), 0});
781 Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
782 break;
783 }
784 case TargetOpcode::G_ATOMICRMW_XCHG:
785 case TargetOpcode::G_ATOMICRMW_ADD:
786 case TargetOpcode::G_ATOMICRMW_SUB:
787 case TargetOpcode::G_ATOMICRMW_AND:
788 case TargetOpcode::G_ATOMICRMW_OR:
789 case TargetOpcode::G_ATOMICRMW_XOR: {
790 auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
791 RetRegs.push_back(Elt: Ret);
792 RetTy = IntegerType::get(C&: Ctx, NumBits: RetLLT.getSizeInBits());
793 if (Opc == TargetOpcode::G_ATOMICRMW_AND)
794 Val =
795 MIRBuilder.buildXor(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: -1), Src1: Val)
796 .getReg(Idx: 0);
797 else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
798 Val =
799 MIRBuilder.buildSub(Dst: ValLLT, Src0: MIRBuilder.buildConstant(Res: ValLLT, Val: 0), Src1: Val)
800 .getReg(Idx: 0);
801 Args.push_back(Elt: {Val, IntegerType::get(C&: Ctx, NumBits: ValLLT.getSizeInBits()), 0});
802 Args.push_back(Elt: {Mem, PointerType::get(C&: Ctx, AddressSpace: MemLLT.getAddressSpace()), 0});
803 break;
804 }
805 default:
806 llvm_unreachable("unsupported opcode");
807 }
808
809 auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
810 auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
811 RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
812 const char *Name = TLI.getLibcallName(Call: RTLibcall);
813
814 // Unsupported libcall on the target.
815 if (!Name) {
816 LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
817 << MIRBuilder.getTII().getName(Opc) << "\n");
818 return LegalizerHelper::UnableToLegalize;
819 }
820
821 CallLowering::CallLoweringInfo Info;
822 Info.CallConv = TLI.getLibcallCallingConv(Call: RTLibcall);
823 Info.Callee = MachineOperand::CreateES(SymName: Name);
824 Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);
825
826 std::copy(first: Args.begin(), last: Args.end(), result: std::back_inserter(x&: Info.OrigArgs));
827 if (!CLI.lowerCall(MIRBuilder, Info))
828 return LegalizerHelper::UnableToLegalize;
829
830 return LegalizerHelper::Legalized;
831}
832
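// Select the conversion libcall for the given opcode and IR types, e.g.
// G_FPTRUNC from double to float yields RTLIB::FPROUND_F64_F32 (typically
// __truncdfsf2) and G_FPTOSI from float to i32 yields RTLIB::FPTOSINT_F32_I32.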
833static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
834 Type *FromType) {
835 auto ToMVT = MVT::getVT(Ty: ToType);
836 auto FromMVT = MVT::getVT(Ty: FromType);
837
838 switch (Opcode) {
839 case TargetOpcode::G_FPEXT:
840 return RTLIB::getFPEXT(OpVT: FromMVT, RetVT: ToMVT);
841 case TargetOpcode::G_FPTRUNC:
842 return RTLIB::getFPROUND(OpVT: FromMVT, RetVT: ToMVT);
843 case TargetOpcode::G_FPTOSI:
844 return RTLIB::getFPTOSINT(OpVT: FromMVT, RetVT: ToMVT);
845 case TargetOpcode::G_FPTOUI:
846 return RTLIB::getFPTOUINT(OpVT: FromMVT, RetVT: ToMVT);
847 case TargetOpcode::G_SITOFP:
848 return RTLIB::getSINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
849 case TargetOpcode::G_UITOFP:
850 return RTLIB::getUINTTOFP(OpVT: FromMVT, RetVT: ToMVT);
851 }
852 llvm_unreachable("Unsupported libcall function");
853}
854
855static LegalizerHelper::LegalizeResult
856conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
857 Type *FromType, LostDebugLocObserver &LocObserver) {
858 RTLIB::Libcall Libcall = getConvRTLibDesc(Opcode: MI.getOpcode(), ToType, FromType);
859 return createLibcall(
860 MIRBuilder, Libcall, Result: {MI.getOperand(i: 0).getReg(), ToType, 0},
861 Args: {{MI.getOperand(i: 1).getReg(), FromType, 0}}, LocObserver, MI: &MI);
862}
863
864static RTLIB::Libcall
865getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
866 RTLIB::Libcall RTLibcall;
867 switch (MI.getOpcode()) {
868 case TargetOpcode::G_GET_FPENV:
869 RTLibcall = RTLIB::FEGETENV;
870 break;
871 case TargetOpcode::G_SET_FPENV:
872 case TargetOpcode::G_RESET_FPENV:
873 RTLibcall = RTLIB::FESETENV;
874 break;
875 case TargetOpcode::G_GET_FPMODE:
876 RTLibcall = RTLIB::FEGETMODE;
877 break;
878 case TargetOpcode::G_SET_FPMODE:
879 case TargetOpcode::G_RESET_FPMODE:
880 RTLibcall = RTLIB::FESETMODE;
881 break;
882 default:
883 llvm_unreachable("Unexpected opcode");
884 }
885 return RTLibcall;
886}
887
// Some library functions that read FP state (fegetmode, fegetenv) write the
// state into a region in memory. IR intrinsics that do the same operations
// (get_fpmode, get_fpenv) return the state as an integer value. To implement
// these intrinsics via the library functions, we need to use a temporary
// variable, for example:
893//
894// %0:_(s32) = G_GET_FPMODE
895//
896// is transformed to:
897//
898// %1:_(p0) = G_FRAME_INDEX %stack.0
899// BL &fegetmode
900// %0:_(s32) = G_LOAD % 1
901//
902LegalizerHelper::LegalizeResult
903LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
904 MachineInstr &MI,
905 LostDebugLocObserver &LocObserver) {
906 const DataLayout &DL = MIRBuilder.getDataLayout();
907 auto &MF = MIRBuilder.getMF();
908 auto &MRI = *MIRBuilder.getMRI();
909 auto &Ctx = MF.getFunction().getContext();
910
  // Create a temporary where the library function will put the read state.
912 Register Dst = MI.getOperand(i: 0).getReg();
913 LLT StateTy = MRI.getType(Reg: Dst);
914 TypeSize StateSize = StateTy.getSizeInBytes();
915 Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
916 MachinePointerInfo TempPtrInfo;
917 auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);
918
  // Create a call to the library function, with the temporary as an argument.
920 unsigned TempAddrSpace = DL.getAllocaAddrSpace();
921 Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
922 RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
923 auto Res =
924 createLibcall(MIRBuilder, Libcall: RTLibcall,
925 Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
926 Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}),
927 LocObserver, MI: nullptr);
928 if (Res != LegalizerHelper::Legalized)
929 return Res;
930
931 // Create a load from the temporary.
932 MachineMemOperand *MMO = MF.getMachineMemOperand(
933 PtrInfo: TempPtrInfo, f: MachineMemOperand::MOLoad, MemTy: StateTy, base_alignment: TempAlign);
934 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: Dst, Addr: Temp, MMO&: *MMO);
935
936 return LegalizerHelper::Legalized;
937}
938
// Similar to `createGetStateLibcall`, the function calls a library function
// using transient space on the stack. In this case the library function reads
// the content of the memory region.
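//
// For example (register numbers are illustrative):
//
// G_SET_FPMODE %0:_(s32)
//
// is transformed to:
//
// %1:_(p0) = G_FRAME_INDEX %stack.0
// G_STORE %0:_(s32), %1:_(p0)
// BL &fesetmode
//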
942LegalizerHelper::LegalizeResult
943LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
944 MachineInstr &MI,
945 LostDebugLocObserver &LocObserver) {
946 const DataLayout &DL = MIRBuilder.getDataLayout();
947 auto &MF = MIRBuilder.getMF();
948 auto &MRI = *MIRBuilder.getMRI();
949 auto &Ctx = MF.getFunction().getContext();
950
  // Create a temporary where the library function will get the new state.
952 Register Src = MI.getOperand(i: 0).getReg();
953 LLT StateTy = MRI.getType(Reg: Src);
954 TypeSize StateSize = StateTy.getSizeInBytes();
955 Align TempAlign = getStackTemporaryAlignment(Type: StateTy);
956 MachinePointerInfo TempPtrInfo;
957 auto Temp = createStackTemporary(Bytes: StateSize, Alignment: TempAlign, PtrInfo&: TempPtrInfo);
958
959 // Put the new state into the temporary.
960 MachineMemOperand *MMO = MF.getMachineMemOperand(
961 PtrInfo: TempPtrInfo, f: MachineMemOperand::MOStore, MemTy: StateTy, base_alignment: TempAlign);
962 MIRBuilder.buildStore(Val: Src, Addr: Temp, MMO&: *MMO);
963
  // Create a call to the library function, with the temporary as an argument.
965 unsigned TempAddrSpace = DL.getAllocaAddrSpace();
966 Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: TempAddrSpace);
967 RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
968 return createLibcall(MIRBuilder, Libcall: RTLibcall,
969 Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
970 Args: CallLowering::ArgInfo({Temp.getReg(Idx: 0), StatePtrTy, 0}),
971 LocObserver, MI: nullptr);
972}
973
// The function is used to legalize operations that set the default
// environment state. In the C library a call like `fesetmode(FE_DFL_MODE)` is
// used for that. On most targets supported by glibc, FE_DFL_MODE is defined as
// `((const femode_t *) -1)`. That assumption is used here. If it does not hold
// for some target, the target must provide custom lowering.
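// In effect, G_RESET_FPMODE is lowered to a call equivalent to
// `fesetmode((const femode_t *) -1)`: a -1 constant is converted to a pointer
// with G_INTTOPTR and passed to the state-setting library function.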
979LegalizerHelper::LegalizeResult
980LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
981 MachineInstr &MI,
982 LostDebugLocObserver &LocObserver) {
983 const DataLayout &DL = MIRBuilder.getDataLayout();
984 auto &MF = MIRBuilder.getMF();
985 auto &Ctx = MF.getFunction().getContext();
986
987 // Create an argument for the library function.
988 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
989 Type *StatePtrTy = PointerType::get(C&: Ctx, AddressSpace: AddrSpace);
990 unsigned PtrSize = DL.getPointerSizeInBits(AS: AddrSpace);
991 LLT MemTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: PtrSize);
992 auto DefValue = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrSize), Val: -1LL);
993 DstOp Dest(MRI.createGenericVirtualRegister(Ty: MemTy));
994 MIRBuilder.buildIntToPtr(Dst: Dest, Src: DefValue);
995
996 RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
997 return createLibcall(MIRBuilder, Libcall: RTLibcall,
998 Result: CallLowering::ArgInfo({0}, Type::getVoidTy(C&: Ctx), 0),
999 Args: CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
1000 LocObserver, MI: &MI);
1001}
1002
1003LegalizerHelper::LegalizeResult
1004LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
1005 auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
1006
1007 switch (MI.getOpcode()) {
1008 default:
1009 return UnableToLegalize;
1010 case TargetOpcode::G_MUL:
1011 case TargetOpcode::G_SDIV:
1012 case TargetOpcode::G_UDIV:
1013 case TargetOpcode::G_SREM:
1014 case TargetOpcode::G_UREM:
1015 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
1016 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1017 unsigned Size = LLTy.getSizeInBits();
1018 Type *HLTy = IntegerType::get(C&: Ctx, NumBits: Size);
1019 auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1020 if (Status != Legalized)
1021 return Status;
1022 break;
1023 }
1024 case TargetOpcode::G_FADD:
1025 case TargetOpcode::G_FSUB:
1026 case TargetOpcode::G_FMUL:
1027 case TargetOpcode::G_FDIV:
1028 case TargetOpcode::G_FMA:
1029 case TargetOpcode::G_FPOW:
1030 case TargetOpcode::G_FREM:
1031 case TargetOpcode::G_FCOS:
1032 case TargetOpcode::G_FSIN:
1033 case TargetOpcode::G_FLOG10:
1034 case TargetOpcode::G_FLOG:
1035 case TargetOpcode::G_FLOG2:
1036 case TargetOpcode::G_FLDEXP:
1037 case TargetOpcode::G_FEXP:
1038 case TargetOpcode::G_FEXP2:
1039 case TargetOpcode::G_FEXP10:
1040 case TargetOpcode::G_FCEIL:
1041 case TargetOpcode::G_FFLOOR:
1042 case TargetOpcode::G_FMINNUM:
1043 case TargetOpcode::G_FMAXNUM:
1044 case TargetOpcode::G_FSQRT:
1045 case TargetOpcode::G_FRINT:
1046 case TargetOpcode::G_FNEARBYINT:
1047 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
1048 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1049 unsigned Size = LLTy.getSizeInBits();
1050 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1051 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1052 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1053 return UnableToLegalize;
1054 }
1055 auto Status = simpleLibcall(MI, MIRBuilder, Size, OpType: HLTy, LocObserver);
1056 if (Status != Legalized)
1057 return Status;
1058 break;
1059 }
1060 case TargetOpcode::G_FPOWI: {
1061 LLT LLTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1062 unsigned Size = LLTy.getSizeInBits();
1063 Type *HLTy = getFloatTypeForLLT(Ctx, Ty: LLTy);
1064 Type *ITy = IntegerType::get(
1065 C&: Ctx, NumBits: MRI.getType(Reg: MI.getOperand(i: 2).getReg()).getSizeInBits());
1066 if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1067 LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1068 return UnableToLegalize;
1069 }
1070 auto Libcall = getRTLibDesc(Opcode: MI.getOpcode(), Size);
1071 std::initializer_list<CallLowering::ArgInfo> Args = {
1072 {MI.getOperand(i: 1).getReg(), HLTy, 0},
1073 {MI.getOperand(i: 2).getReg(), ITy, 1}};
1074 LegalizeResult Status =
1075 createLibcall(MIRBuilder, Libcall, Result: {MI.getOperand(i: 0).getReg(), HLTy, 0},
1076 Args, LocObserver, MI: &MI);
1077 if (Status != Legalized)
1078 return Status;
1079 break;
1080 }
1081 case TargetOpcode::G_FPEXT:
1082 case TargetOpcode::G_FPTRUNC: {
1083 Type *FromTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 1).getReg()));
1084 Type *ToTy = getFloatTypeForLLT(Ctx, Ty: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
1085 if (!FromTy || !ToTy)
1086 return UnableToLegalize;
1087 LegalizeResult Status =
1088 conversionLibcall(MI, MIRBuilder, ToType: ToTy, FromType: FromTy, LocObserver);
1089 if (Status != Legalized)
1090 return Status;
1091 break;
1092 }
1093 case TargetOpcode::G_FPTOSI:
1094 case TargetOpcode::G_FPTOUI: {
1095 // FIXME: Support other types
1096 unsigned FromSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1097 unsigned ToSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1098 if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
1099 return UnableToLegalize;
1100 LegalizeResult Status = conversionLibcall(
1101 MI, MIRBuilder,
1102 ToType: ToSize == 32 ? Type::getInt32Ty(C&: Ctx) : Type::getInt64Ty(C&: Ctx),
1103 FromType: FromSize == 64 ? Type::getDoubleTy(C&: Ctx) : Type::getFloatTy(C&: Ctx),
1104 LocObserver);
1105 if (Status != Legalized)
1106 return Status;
1107 break;
1108 }
1109 case TargetOpcode::G_SITOFP:
1110 case TargetOpcode::G_UITOFP: {
1111 // FIXME: Support other types
1112 unsigned FromSize = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1113 unsigned ToSize = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1114 if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
1115 return UnableToLegalize;
1116 LegalizeResult Status = conversionLibcall(
1117 MI, MIRBuilder,
1118 ToType: ToSize == 64 ? Type::getDoubleTy(C&: Ctx) : Type::getFloatTy(C&: Ctx),
1119 FromType: FromSize == 32 ? Type::getInt32Ty(C&: Ctx) : Type::getInt64Ty(C&: Ctx),
1120 LocObserver);
1121 if (Status != Legalized)
1122 return Status;
1123 break;
1124 }
1125 case TargetOpcode::G_ATOMICRMW_XCHG:
1126 case TargetOpcode::G_ATOMICRMW_ADD:
1127 case TargetOpcode::G_ATOMICRMW_SUB:
1128 case TargetOpcode::G_ATOMICRMW_AND:
1129 case TargetOpcode::G_ATOMICRMW_OR:
1130 case TargetOpcode::G_ATOMICRMW_XOR:
1131 case TargetOpcode::G_ATOMIC_CMPXCHG:
1132 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
1133 auto Status = createAtomicLibcall(MIRBuilder, MI);
1134 if (Status != Legalized)
1135 return Status;
1136 break;
1137 }
1138 case TargetOpcode::G_BZERO:
1139 case TargetOpcode::G_MEMCPY:
1140 case TargetOpcode::G_MEMMOVE:
1141 case TargetOpcode::G_MEMSET: {
1142 LegalizeResult Result =
1143 createMemLibcall(MIRBuilder, MRI&: *MIRBuilder.getMRI(), MI, LocObserver);
1144 if (Result != Legalized)
1145 return Result;
1146 MI.eraseFromParent();
1147 return Result;
1148 }
1149 case TargetOpcode::G_GET_FPENV:
1150 case TargetOpcode::G_GET_FPMODE: {
1151 LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
1152 if (Result != Legalized)
1153 return Result;
1154 break;
1155 }
1156 case TargetOpcode::G_SET_FPENV:
1157 case TargetOpcode::G_SET_FPMODE: {
1158 LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
1159 if (Result != Legalized)
1160 return Result;
1161 break;
1162 }
1163 case TargetOpcode::G_RESET_FPENV:
1164 case TargetOpcode::G_RESET_FPMODE: {
1165 LegalizeResult Result =
1166 createResetStateLibcall(MIRBuilder, MI, LocObserver);
1167 if (Result != Legalized)
1168 return Result;
1169 break;
1170 }
1171 }
1172
1173 MI.eraseFromParent();
1174 return Legalized;
1175}
1176
1177LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1178 unsigned TypeIdx,
1179 LLT NarrowTy) {
1180 uint64_t SizeOp0 = MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getSizeInBits();
1181 uint64_t NarrowSize = NarrowTy.getSizeInBits();
1182
1183 switch (MI.getOpcode()) {
1184 default:
1185 return UnableToLegalize;
1186 case TargetOpcode::G_IMPLICIT_DEF: {
1187 Register DstReg = MI.getOperand(i: 0).getReg();
1188 LLT DstTy = MRI.getType(Reg: DstReg);
1189
1190 // If SizeOp0 is not an exact multiple of NarrowSize, emit
1191 // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1192 // FIXME: Although this would also be legal for the general case, it causes
1193 // a lot of regressions in the emitted code (superfluous COPYs, artifact
1194 // combines not being hit). This seems to be a problem related to the
1195 // artifact combiner.
1196 if (SizeOp0 % NarrowSize != 0) {
1197 LLT ImplicitTy = NarrowTy;
1198 if (DstTy.isVector())
1199 ImplicitTy = LLT::vector(EC: DstTy.getElementCount(), ScalarTy: ImplicitTy);
1200
1201 Register ImplicitReg = MIRBuilder.buildUndef(Res: ImplicitTy).getReg(Idx: 0);
1202 MIRBuilder.buildAnyExt(Res: DstReg, Op: ImplicitReg);
1203
1204 MI.eraseFromParent();
1205 return Legalized;
1206 }
1207
1208 int NumParts = SizeOp0 / NarrowSize;
1209
1210 SmallVector<Register, 2> DstRegs;
1211 for (int i = 0; i < NumParts; ++i)
1212 DstRegs.push_back(Elt: MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0));
1213
1214 if (DstTy.isVector())
1215 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
1216 else
1217 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1218 MI.eraseFromParent();
1219 return Legalized;
1220 }
1221 case TargetOpcode::G_CONSTANT: {
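    // Split the constant into NarrowTy-sized pieces plus an optional leftover
    // piece; e.g. (illustrative) an s88 G_CONSTANT narrowed to s32 becomes two
    // s32 constants and one s24 leftover constant, reassembled by insertParts.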
1222 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1223 const APInt &Val = MI.getOperand(i: 1).getCImm()->getValue();
1224 unsigned TotalSize = Ty.getSizeInBits();
1225 unsigned NarrowSize = NarrowTy.getSizeInBits();
1226 int NumParts = TotalSize / NarrowSize;
1227
1228 SmallVector<Register, 4> PartRegs;
1229 for (int I = 0; I != NumParts; ++I) {
1230 unsigned Offset = I * NarrowSize;
1231 auto K = MIRBuilder.buildConstant(Res: NarrowTy,
1232 Val: Val.lshr(shiftAmt: Offset).trunc(width: NarrowSize));
1233 PartRegs.push_back(Elt: K.getReg(Idx: 0));
1234 }
1235
1236 LLT LeftoverTy;
1237 unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1238 SmallVector<Register, 1> LeftoverRegs;
1239 if (LeftoverBits != 0) {
1240 LeftoverTy = LLT::scalar(SizeInBits: LeftoverBits);
1241 auto K = MIRBuilder.buildConstant(
1242 Res: LeftoverTy,
1243 Val: Val.lshr(shiftAmt: NumParts * NarrowSize).trunc(width: LeftoverBits));
1244 LeftoverRegs.push_back(Elt: K.getReg(Idx: 0));
1245 }
1246
1247 insertParts(DstReg: MI.getOperand(i: 0).getReg(),
1248 ResultTy: Ty, PartTy: NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1249
1250 MI.eraseFromParent();
1251 return Legalized;
1252 }
1253 case TargetOpcode::G_SEXT:
1254 case TargetOpcode::G_ZEXT:
1255 case TargetOpcode::G_ANYEXT:
1256 return narrowScalarExt(MI, TypeIdx, Ty: NarrowTy);
1257 case TargetOpcode::G_TRUNC: {
1258 if (TypeIdx != 1)
1259 return UnableToLegalize;
1260
1261 uint64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
1262 if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1263 LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1264 return UnableToLegalize;
1265 }
1266
1267 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
1268 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: Unmerge.getReg(Idx: 0));
1269 MI.eraseFromParent();
1270 return Legalized;
1271 }
1272
1273 case TargetOpcode::G_FREEZE: {
1274 if (TypeIdx != 0)
1275 return UnableToLegalize;
1276
1277 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
1278 // Should widen scalar first
1279 if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1280 return UnableToLegalize;
1281
1282 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1).getReg());
1283 SmallVector<Register, 8> Parts;
1284 for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1285 Parts.push_back(
1286 Elt: MIRBuilder.buildFreeze(Dst: NarrowTy, Src: Unmerge.getReg(Idx: i)).getReg(Idx: 0));
1287 }
1288
1289 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0).getReg(), Ops: Parts);
1290 MI.eraseFromParent();
1291 return Legalized;
1292 }
1293 case TargetOpcode::G_ADD:
1294 case TargetOpcode::G_SUB:
1295 case TargetOpcode::G_SADDO:
1296 case TargetOpcode::G_SSUBO:
1297 case TargetOpcode::G_SADDE:
1298 case TargetOpcode::G_SSUBE:
1299 case TargetOpcode::G_UADDO:
1300 case TargetOpcode::G_USUBO:
1301 case TargetOpcode::G_UADDE:
1302 case TargetOpcode::G_USUBE:
1303 return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1304 case TargetOpcode::G_MUL:
1305 case TargetOpcode::G_UMULH:
1306 return narrowScalarMul(MI, Ty: NarrowTy);
1307 case TargetOpcode::G_EXTRACT:
1308 return narrowScalarExtract(MI, TypeIdx, Ty: NarrowTy);
1309 case TargetOpcode::G_INSERT:
1310 return narrowScalarInsert(MI, TypeIdx, Ty: NarrowTy);
1311 case TargetOpcode::G_LOAD: {
1312 auto &LoadMI = cast<GLoad>(Val&: MI);
1313 Register DstReg = LoadMI.getDstReg();
1314 LLT DstTy = MRI.getType(Reg: DstReg);
1315 if (DstTy.isVector())
1316 return UnableToLegalize;
1317
1318 if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1319 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1320 MIRBuilder.buildLoad(Res: TmpReg, Addr: LoadMI.getPointerReg(), MMO&: LoadMI.getMMO());
1321 MIRBuilder.buildAnyExt(Res: DstReg, Op: TmpReg);
1322 LoadMI.eraseFromParent();
1323 return Legalized;
1324 }
1325
1326 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx, NarrowTy);
1327 }
1328 case TargetOpcode::G_ZEXTLOAD:
1329 case TargetOpcode::G_SEXTLOAD: {
1330 auto &LoadMI = cast<GExtLoad>(Val&: MI);
1331 Register DstReg = LoadMI.getDstReg();
1332 Register PtrReg = LoadMI.getPointerReg();
1333
1334 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1335 auto &MMO = LoadMI.getMMO();
1336 unsigned MemSize = MMO.getSizeInBits();
1337
1338 if (MemSize == NarrowSize) {
1339 MIRBuilder.buildLoad(Res: TmpReg, Addr: PtrReg, MMO);
1340 } else if (MemSize < NarrowSize) {
1341 MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: TmpReg, Addr: PtrReg, MMO);
1342 } else if (MemSize > NarrowSize) {
1343 // FIXME: Need to split the load.
1344 return UnableToLegalize;
1345 }
1346
1347 if (isa<GZExtLoad>(Val: LoadMI))
1348 MIRBuilder.buildZExt(Res: DstReg, Op: TmpReg);
1349 else
1350 MIRBuilder.buildSExt(Res: DstReg, Op: TmpReg);
1351
1352 LoadMI.eraseFromParent();
1353 return Legalized;
1354 }
1355 case TargetOpcode::G_STORE: {
1356 auto &StoreMI = cast<GStore>(Val&: MI);
1357
1358 Register SrcReg = StoreMI.getValueReg();
1359 LLT SrcTy = MRI.getType(Reg: SrcReg);
1360 if (SrcTy.isVector())
1361 return UnableToLegalize;
1362
1363 int NumParts = SizeOp0 / NarrowSize;
1364 unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1365 unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1366 if (SrcTy.isVector() && LeftoverBits != 0)
1367 return UnableToLegalize;
1368
1369 if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1370 Register TmpReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1371 MIRBuilder.buildTrunc(Res: TmpReg, Op: SrcReg);
1372 MIRBuilder.buildStore(Val: TmpReg, Addr: StoreMI.getPointerReg(), MMO&: StoreMI.getMMO());
1373 StoreMI.eraseFromParent();
1374 return Legalized;
1375 }
1376
1377 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy);
1378 }
1379 case TargetOpcode::G_SELECT:
1380 return narrowScalarSelect(MI, TypeIdx, Ty: NarrowTy);
1381 case TargetOpcode::G_AND:
1382 case TargetOpcode::G_OR:
1383 case TargetOpcode::G_XOR: {
1384 // Legalize bitwise operation:
1385 // A = BinOp<Ty> B, C
1386 // into:
1387 // B1, ..., BN = G_UNMERGE_VALUES B
1388 // C1, ..., CN = G_UNMERGE_VALUES C
    // A1 = BinOp<Ty/N> B1, C1
1390 // ...
1391 // AN = BinOp<Ty/N> BN, CN
1392 // A = G_MERGE_VALUES A1, ..., AN
1393 return narrowScalarBasic(MI, TypeIdx, Ty: NarrowTy);
1394 }
1395 case TargetOpcode::G_SHL:
1396 case TargetOpcode::G_LSHR:
1397 case TargetOpcode::G_ASHR:
1398 return narrowScalarShift(MI, TypeIdx, Ty: NarrowTy);
1399 case TargetOpcode::G_CTLZ:
1400 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1401 case TargetOpcode::G_CTTZ:
1402 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1403 case TargetOpcode::G_CTPOP:
1404 if (TypeIdx == 1)
1405 switch (MI.getOpcode()) {
1406 case TargetOpcode::G_CTLZ:
1407 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1408 return narrowScalarCTLZ(MI, TypeIdx, Ty: NarrowTy);
1409 case TargetOpcode::G_CTTZ:
1410 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1411 return narrowScalarCTTZ(MI, TypeIdx, Ty: NarrowTy);
1412 case TargetOpcode::G_CTPOP:
1413 return narrowScalarCTPOP(MI, TypeIdx, Ty: NarrowTy);
1414 default:
1415 return UnableToLegalize;
1416 }
1417
1418 Observer.changingInstr(MI);
1419 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1420 Observer.changedInstr(MI);
1421 return Legalized;
1422 case TargetOpcode::G_INTTOPTR:
1423 if (TypeIdx != 1)
1424 return UnableToLegalize;
1425
1426 Observer.changingInstr(MI);
1427 narrowScalarSrc(MI, NarrowTy, OpIdx: 1);
1428 Observer.changedInstr(MI);
1429 return Legalized;
1430 case TargetOpcode::G_PTRTOINT:
1431 if (TypeIdx != 0)
1432 return UnableToLegalize;
1433
1434 Observer.changingInstr(MI);
1435 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1436 Observer.changedInstr(MI);
1437 return Legalized;
1438 case TargetOpcode::G_PHI: {
1439 // FIXME: add support for when SizeOp0 isn't an exact multiple of
1440 // NarrowSize.
1441 if (SizeOp0 % NarrowSize != 0)
1442 return UnableToLegalize;
1443
1444 unsigned NumParts = SizeOp0 / NarrowSize;
1445 SmallVector<Register, 2> DstRegs(NumParts);
1446 SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1447 Observer.changingInstr(MI);
1448 for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1449 MachineBasicBlock &OpMBB = *MI.getOperand(i: i + 1).getMBB();
1450 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
1451 extractParts(Reg: MI.getOperand(i).getReg(), Ty: NarrowTy, NumParts,
1452 VRegs&: SrcRegs[i / 2], MIRBuilder, MRI);
1453 }
1454 MachineBasicBlock &MBB = *MI.getParent();
1455 MIRBuilder.setInsertPt(MBB, II: MI);
1456 for (unsigned i = 0; i < NumParts; ++i) {
1457 DstRegs[i] = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1458 MachineInstrBuilder MIB =
1459 MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI).addDef(RegNo: DstRegs[i]);
1460 for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1461 MIB.addUse(RegNo: SrcRegs[j / 2][i]).add(MO: MI.getOperand(i: j + 1));
1462 }
1463 MIRBuilder.setInsertPt(MBB, II: MBB.getFirstNonPHI());
1464 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1465 Observer.changedInstr(MI);
1466 MI.eraseFromParent();
1467 return Legalized;
1468 }
1469 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1470 case TargetOpcode::G_INSERT_VECTOR_ELT: {
1471 if (TypeIdx != 2)
1472 return UnableToLegalize;
1473
1474 int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1475 Observer.changingInstr(MI);
1476 narrowScalarSrc(MI, NarrowTy, OpIdx);
1477 Observer.changedInstr(MI);
1478 return Legalized;
1479 }
1480 case TargetOpcode::G_ICMP: {
1481 Register LHS = MI.getOperand(i: 2).getReg();
1482 LLT SrcTy = MRI.getType(Reg: LHS);
1483 uint64_t SrcSize = SrcTy.getSizeInBits();
1484 CmpInst::Predicate Pred =
1485 static_cast<CmpInst::Predicate>(MI.getOperand(i: 1).getPredicate());
1486
1487 // TODO: Handle the non-equality case for weird sizes.
1488 if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(P: Pred))
1489 return UnableToLegalize;
1490
1491 LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1492 SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1493 if (!extractParts(Reg: LHS, RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy, VRegs&: LHSPartRegs,
1494 LeftoverVRegs&: LHSLeftoverRegs, MIRBuilder, MRI))
1495 return UnableToLegalize;
1496
1497 LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1498 SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1499 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: SrcTy, MainTy: NarrowTy, LeftoverTy&: Unused,
1500 VRegs&: RHSPartRegs, LeftoverVRegs&: RHSLeftoverRegs, MIRBuilder, MRI))
1501 return UnableToLegalize;
1502
1503 // We now have the LHS and RHS of the compare split into narrow-type
1504 // registers, plus potentially some leftover type.
1505 Register Dst = MI.getOperand(i: 0).getReg();
1506 LLT ResTy = MRI.getType(Reg: Dst);
1507 if (ICmpInst::isEquality(P: Pred)) {
1508 // For each part on the LHS and RHS, keep track of the result of XOR-ing
1509 // them together. For each equal part, the result should be all 0s. For
1510 // each non-equal part, we'll get at least one 1.
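// Sketch of the expansion for an equality compare of s128 split into two
// s64 parts (register names are illustrative):
//   %x0:_(s64) = G_XOR %lhs0, %rhs0
//   %x1:_(s64) = G_XOR %lhs1, %rhs1
//   %or:_(s64) = G_OR %x0, %x1
//   %dst:_(s1) = G_ICMP eq, %or, 0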
1511 auto Zero = MIRBuilder.buildConstant(Res: NarrowTy, Val: 0);
1512 SmallVector<Register, 4> Xors;
1513 for (auto LHSAndRHS : zip(t&: LHSPartRegs, u&: RHSPartRegs)) {
1514 auto LHS = std::get<0>(t&: LHSAndRHS);
1515 auto RHS = std::get<1>(t&: LHSAndRHS);
1516 auto Xor = MIRBuilder.buildXor(Dst: NarrowTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1517 Xors.push_back(Elt: Xor);
1518 }
1519
1520 // Build a G_XOR for each leftover register. Each G_XOR must be widened
1521 // to the desired narrow type so that we can OR them together later.
1522 SmallVector<Register, 4> WidenedXors;
1523 for (auto LHSAndRHS : zip(t&: LHSLeftoverRegs, u&: RHSLeftoverRegs)) {
1524 auto LHS = std::get<0>(t&: LHSAndRHS);
1525 auto RHS = std::get<1>(t&: LHSAndRHS);
1526 auto Xor = MIRBuilder.buildXor(Dst: LeftoverTy, Src0: LHS, Src1: RHS).getReg(Idx: 0);
1527 LLT GCDTy = extractGCDType(Parts&: WidenedXors, DstTy: NarrowTy, NarrowTy: LeftoverTy, SrcReg: Xor);
1528 buildLCMMergePieces(DstTy: LeftoverTy, NarrowTy, GCDTy, VRegs&: WidenedXors,
1529 /* PadStrategy = */ TargetOpcode::G_ZEXT);
1530 Xors.insert(I: Xors.end(), From: WidenedXors.begin(), To: WidenedXors.end());
1531 }
1532
1533 // Now, for each part we broke up, we know if they are equal/not equal
1534 // based off the G_XOR. We can OR these all together and compare against
1535 // 0 to get the result.
1536 assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1537 auto Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Xors[0], Src1: Xors[1]);
1538 for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1539 Or = MIRBuilder.buildOr(Dst: NarrowTy, Src0: Or, Src1: Xors[I]);
1540 MIRBuilder.buildICmp(Pred, Res: Dst, Op0: Or, Op1: Zero);
1541 } else {
1542 // TODO: Handle non-power-of-two types.
1543 assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1544 assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1545 Register LHSL = LHSPartRegs[0];
1546 Register LHSH = LHSPartRegs[1];
1547 Register RHSL = RHSPartRegs[0];
1548 Register RHSH = RHSPartRegs[1];
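// Compare the high parts; if they are equal, the result is decided by an
// unsigned compare of the low parts. Sketch for a signed less-than
// (register names are illustrative):
//   %dst = (%lhs_hi == %rhs_hi) ? ult(%lhs_lo, %rhs_lo)
//                               : slt(%lhs_hi, %rhs_hi)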
1549 MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, Res: ResTy, Op0: LHSH, Op1: RHSH);
1550 MachineInstrBuilder CmpHEQ =
1551 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: ResTy, Op0: LHSH, Op1: RHSH);
1552 MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1553 Pred: ICmpInst::getUnsignedPredicate(pred: Pred), Res: ResTy, Op0: LHSL, Op1: RHSL);
1554 MIRBuilder.buildSelect(Res: Dst, Tst: CmpHEQ, Op0: CmpLU, Op1: CmpH);
1555 }
1556 MI.eraseFromParent();
1557 return Legalized;
1558 }
1559 case TargetOpcode::G_FCMP:
1560 if (TypeIdx != 0)
1561 return UnableToLegalize;
1562
1563 Observer.changingInstr(MI);
1564 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_ZEXT);
1565 Observer.changedInstr(MI);
1566 return Legalized;
1567
1568 case TargetOpcode::G_SEXT_INREG: {
1569 if (TypeIdx != 0)
1570 return UnableToLegalize;
1571
1572 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
1573
1574 // So long as the new type has more bits than the bits we're extending,
1575 // we don't need to break it apart.
1576 if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1577 Observer.changingInstr(MI);
1578 // We don't lose any non-extension bits by truncating the src and
1579 // sign-extending the dst.
1580 MachineOperand &MO1 = MI.getOperand(i: 1);
1581 auto TruncMIB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO1);
1582 MO1.setReg(TruncMIB.getReg(Idx: 0));
1583
1584 MachineOperand &MO2 = MI.getOperand(i: 0);
1585 Register DstExt = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1586 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1587 MIRBuilder.buildSExt(Res: MO2, Op: DstExt);
1588 MO2.setReg(DstExt);
1589 Observer.changedInstr(MI);
1590 return Legalized;
1591 }
1592
1593 // Break it apart. Components below the extension point are unmodified. The
1594 // component containing the extension point becomes a narrower SEXT_INREG.
1595 // Components above it are ashr'd from the component containing the
1596 // extension point.
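// Sketch for narrowing an s64 G_SEXT_INREG from bit 8 into s32 parts
// (register names are illustrative):
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src:_(s64)
//   %lo_ext:_(s32) = G_SEXT_INREG %lo, 8
//   %hi_ext:_(s32) = G_ASHR %lo_ext, 31
//   %dst:_(s64) = G_MERGE_VALUES %lo_ext, %hi_ext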
1597 if (SizeOp0 % NarrowSize != 0)
1598 return UnableToLegalize;
1599 int NumParts = SizeOp0 / NarrowSize;
1600
1601 // List the registers where the destination will be scattered.
1602 SmallVector<Register, 2> DstRegs;
1603 // List the registers where the source will be split.
1604 SmallVector<Register, 2> SrcRegs;
1605
1606 // Create all the temporary registers.
1607 for (int i = 0; i < NumParts; ++i) {
1608 Register SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1609
1610 SrcRegs.push_back(Elt: SrcReg);
1611 }
1612
1613 // Explode the big arguments into smaller chunks.
1614 MIRBuilder.buildUnmerge(Res: SrcRegs, Op: MI.getOperand(i: 1));
1615
1616 Register AshrCstReg =
1617 MIRBuilder.buildConstant(Res: NarrowTy, Val: NarrowTy.getScalarSizeInBits() - 1)
1618 .getReg(Idx: 0);
1619 Register FullExtensionReg;
1620 Register PartialExtensionReg;
1621
1622 // Do the operation on each small part.
1623 for (int i = 0; i < NumParts; ++i) {
1624 if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1625 DstRegs.push_back(Elt: SrcRegs[i]);
1626 PartialExtensionReg = DstRegs.back();
1627 } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1628 assert(PartialExtensionReg &&
1629 "Expected to visit partial extension before full");
1630 if (FullExtensionReg) {
1631 DstRegs.push_back(Elt: FullExtensionReg);
1632 continue;
1633 }
1634 DstRegs.push_back(
1635 Elt: MIRBuilder.buildAShr(Dst: NarrowTy, Src0: PartialExtensionReg, Src1: AshrCstReg)
1636 .getReg(Idx: 0));
1637 FullExtensionReg = DstRegs.back();
1638 } else {
1639 DstRegs.push_back(
1640 Elt: MIRBuilder
1641 .buildInstr(
1642 Opc: TargetOpcode::G_SEXT_INREG, DstOps: {NarrowTy},
1643 SrcOps: {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1644 .getReg(Idx: 0));
1645 PartialExtensionReg = DstRegs.back();
1646 }
1647 }
1648
1649 // Gather the destination registers into the final destination.
1650 Register DstReg = MI.getOperand(i: 0).getReg();
1651 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
1652 MI.eraseFromParent();
1653 return Legalized;
1654 }
1655 case TargetOpcode::G_BSWAP:
1656 case TargetOpcode::G_BITREVERSE: {
1657 if (SizeOp0 % NarrowSize != 0)
1658 return UnableToLegalize;
1659
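// The parts are processed in reverse order, so e.g. an s64 G_BSWAP narrowed
// to s32 becomes (sketch; register names are illustrative):
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src:_(s64)
//   %a:_(s32) = G_BSWAP %hi
//   %b:_(s32) = G_BSWAP %lo
//   %dst:_(s64) = G_MERGE_VALUES %a, %b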
1660 Observer.changingInstr(MI);
1661 SmallVector<Register, 2> SrcRegs, DstRegs;
1662 unsigned NumParts = SizeOp0 / NarrowSize;
1663 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
1664 MIRBuilder, MRI);
1665
1666 for (unsigned i = 0; i < NumParts; ++i) {
1667 auto DstPart = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
1668 SrcOps: {SrcRegs[NumParts - 1 - i]});
1669 DstRegs.push_back(Elt: DstPart.getReg(Idx: 0));
1670 }
1671
1672 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: DstRegs);
1673
1674 Observer.changedInstr(MI);
1675 MI.eraseFromParent();
1676 return Legalized;
1677 }
1678 case TargetOpcode::G_PTR_ADD:
1679 case TargetOpcode::G_PTRMASK: {
1680 if (TypeIdx != 1)
1681 return UnableToLegalize;
1682 Observer.changingInstr(MI);
1683 narrowScalarSrc(MI, NarrowTy, OpIdx: 2);
1684 Observer.changedInstr(MI);
1685 return Legalized;
1686 }
1687 case TargetOpcode::G_FPTOUI:
1688 case TargetOpcode::G_FPTOSI:
1689 return narrowScalarFPTOI(MI, TypeIdx, Ty: NarrowTy);
1690 case TargetOpcode::G_FPEXT:
1691 if (TypeIdx != 0)
1692 return UnableToLegalize;
1693 Observer.changingInstr(MI);
1694 narrowScalarDst(MI, NarrowTy, OpIdx: 0, ExtOpcode: TargetOpcode::G_FPEXT);
1695 Observer.changedInstr(MI);
1696 return Legalized;
1697 case TargetOpcode::G_FLDEXP:
1698 case TargetOpcode::G_STRICT_FLDEXP:
1699 return narrowScalarFLDEXP(MI, TypeIdx, Ty: NarrowTy);
1700 }
1701}
1702
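// Coerce \p Val to a scalar of the same total size: pointers are converted
// with G_PTRTOINT (unless their address space is non-integral, in which case
// an invalid Register is returned) and vectors are bitcast, going through a
// pointer-to-integer conversion first for vectors of pointers.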
1703Register LegalizerHelper::coerceToScalar(Register Val) {
1704 LLT Ty = MRI.getType(Reg: Val);
1705 if (Ty.isScalar())
1706 return Val;
1707
1708 const DataLayout &DL = MIRBuilder.getDataLayout();
1709 LLT NewTy = LLT::scalar(SizeInBits: Ty.getSizeInBits());
1710 if (Ty.isPointer()) {
1711 if (DL.isNonIntegralAddressSpace(AddrSpace: Ty.getAddressSpace()))
1712 return Register();
1713 return MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Val).getReg(Idx: 0);
1714 }
1715
1716 Register NewVal = Val;
1717
1718 assert(Ty.isVector());
1719 if (Ty.isPointerVector())
1720 NewVal = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
1721 return MIRBuilder.buildBitcast(Dst: NewTy, Src: NewVal).getReg(Idx: 0);
1722}
1723
1724void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1725 unsigned OpIdx, unsigned ExtOpcode) {
1726 MachineOperand &MO = MI.getOperand(i: OpIdx);
1727 auto ExtB = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MO});
1728 MO.setReg(ExtB.getReg(Idx: 0));
1729}
1730
1731void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1732 unsigned OpIdx) {
1733 MachineOperand &MO = MI.getOperand(i: OpIdx);
1734 auto ExtB = MIRBuilder.buildTrunc(Res: NarrowTy, Op: MO);
1735 MO.setReg(ExtB.getReg(Idx: 0));
1736}
1737
1738void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1739 unsigned OpIdx, unsigned TruncOpcode) {
1740 MachineOperand &MO = MI.getOperand(i: OpIdx);
1741 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
1742 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1743 MIRBuilder.buildInstr(Opc: TruncOpcode, DstOps: {MO}, SrcOps: {DstExt});
1744 MO.setReg(DstExt);
1745}
1746
1747void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1748 unsigned OpIdx, unsigned ExtOpcode) {
1749 MachineOperand &MO = MI.getOperand(i: OpIdx);
1750 Register DstTrunc = MRI.createGenericVirtualRegister(Ty: NarrowTy);
1751 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1752 MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {MO}, SrcOps: {DstTrunc});
1753 MO.setReg(DstTrunc);
1754}
1755
1756void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1757 unsigned OpIdx) {
1758 MachineOperand &MO = MI.getOperand(i: OpIdx);
1759 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1760 Register Dst = MO.getReg();
1761 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
1762 MO.setReg(DstExt);
1763 MIRBuilder.buildDeleteTrailingVectorElements(Res: Dst, Op0: DstExt);
1764}
1765
1766void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1767 unsigned OpIdx) {
1768 MachineOperand &MO = MI.getOperand(i: OpIdx);
1769 SmallVector<Register, 8> Regs;
1770 MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(Res: MoreTy, Op0: MO).getReg(Idx: 0));
1771}
1772
1773void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1774 MachineOperand &Op = MI.getOperand(i: OpIdx);
1775 Op.setReg(MIRBuilder.buildBitcast(Dst: CastTy, Src: Op).getReg(Idx: 0));
1776}
1777
1778void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1779 MachineOperand &MO = MI.getOperand(i: OpIdx);
1780 Register CastDst = MRI.createGenericVirtualRegister(Ty: CastTy);
1781 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
1782 MIRBuilder.buildBitcast(Dst: MO, Src: CastDst);
1783 MO.setReg(CastDst);
1784}
1785
1786LegalizerHelper::LegalizeResult
1787LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1788 LLT WideTy) {
1789 if (TypeIdx != 1)
1790 return UnableToLegalize;
1791
1792 auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
1793 if (DstTy.isVector())
1794 return UnableToLegalize;
1795
1796 LLT SrcTy = MRI.getType(Reg: Src1Reg);
1797 const int DstSize = DstTy.getSizeInBits();
1798 const int SrcSize = SrcTy.getSizeInBits();
1799 const int WideSize = WideTy.getSizeInBits();
1800 const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1801
1802 unsigned NumOps = MI.getNumOperands();
1803 unsigned NumSrc = MI.getNumOperands() - 1;
1804 unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1805
1806 if (WideSize >= DstSize) {
1807 // Directly pack the bits in the target type.
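// Sketch for %dst:_(s32) = G_MERGE_VALUES %a:_(s16), %b:_(s16) widened to
// s64 (register names are illustrative):
//   %r0:_(s64)  = G_ZEXT %a
//   %b64:_(s64) = G_ZEXT %b
//   %sh:_(s64)  = G_SHL %b64, 16
//   %r1:_(s64)  = G_OR %r0, %sh
//   %dst:_(s32) = G_TRUNC %r1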
1808 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src1Reg).getReg(Idx: 0);
1809
1810 for (unsigned I = 2; I != NumOps; ++I) {
1811 const unsigned Offset = (I - 1) * PartSize;
1812
1813 Register SrcReg = MI.getOperand(i: I).getReg();
1814 assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1815
1816 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
1817
1818 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1819 MRI.createGenericVirtualRegister(Ty: WideTy);
1820
1821 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
1822 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
1823 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
1824 ResultReg = NextResult;
1825 }
1826
1827 if (WideSize > DstSize)
1828 MIRBuilder.buildTrunc(Res: DstReg, Op: ResultReg);
1829 else if (DstTy.isPointer())
1830 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
1831
1832 MI.eraseFromParent();
1833 return Legalized;
1834 }
1835
1836 // Unmerge the original values to the GCD type, and recombine to the next
1837 // multiple greater than the original type.
1838 //
1839 // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1840 // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1841 // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1842 // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1843 // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1844 // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1845 // %12:_(s12) = G_MERGE_VALUES %10, %11
1846 //
1847 // Padding with undef if necessary:
1848 //
1849 // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1850 // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1851 // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1852 // %7:_(s2) = G_IMPLICIT_DEF
1853 // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1854 // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1855 // %10:_(s12) = G_MERGE_VALUES %8, %9
1856
1857 const int GCD = std::gcd(m: SrcSize, n: WideSize);
1858 LLT GCDTy = LLT::scalar(SizeInBits: GCD);
1859
1860 SmallVector<Register, 8> Parts;
1861 SmallVector<Register, 8> NewMergeRegs;
1862 SmallVector<Register, 8> Unmerges;
1863 LLT WideDstTy = LLT::scalar(SizeInBits: NumMerge * WideSize);
1864
1865 // Decompose the original operands if they don't evenly divide.
1866 for (const MachineOperand &MO : llvm::drop_begin(RangeOrContainer: MI.operands())) {
1867 Register SrcReg = MO.getReg();
1868 if (GCD == SrcSize) {
1869 Unmerges.push_back(Elt: SrcReg);
1870 } else {
1871 auto Unmerge = MIRBuilder.buildUnmerge(Res: GCDTy, Op: SrcReg);
1872 for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1873 Unmerges.push_back(Elt: Unmerge.getReg(Idx: J));
1874 }
1875 }
1876
1877 // Pad with undef to the next size that is a multiple of the requested size.
1878 if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1879 Register UndefReg = MIRBuilder.buildUndef(Res: GCDTy).getReg(Idx: 0);
1880 for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1881 Unmerges.push_back(Elt: UndefReg);
1882 }
1883
1884 const int PartsPerGCD = WideSize / GCD;
1885
1886 // Build merges of each piece.
1887 ArrayRef<Register> Slicer(Unmerges);
1888 for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(N: PartsPerGCD)) {
1889 auto Merge =
1890 MIRBuilder.buildMergeLikeInstr(Res: WideTy, Ops: Slicer.take_front(N: PartsPerGCD));
1891 NewMergeRegs.push_back(Elt: Merge.getReg(Idx: 0));
1892 }
1893
1894 // A truncate may be necessary if the requested type doesn't evenly divide the
1895 // original result type.
1896 if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1897 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NewMergeRegs);
1898 } else {
1899 auto FinalMerge = MIRBuilder.buildMergeLikeInstr(Res: WideDstTy, Ops: NewMergeRegs);
1900 MIRBuilder.buildTrunc(Res: DstReg, Op: FinalMerge.getReg(Idx: 0));
1901 }
1902
1903 MI.eraseFromParent();
1904 return Legalized;
1905}
1906
1907LegalizerHelper::LegalizeResult
1908LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1909 LLT WideTy) {
1910 if (TypeIdx != 0)
1911 return UnableToLegalize;
1912
1913 int NumDst = MI.getNumOperands() - 1;
1914 Register SrcReg = MI.getOperand(i: NumDst).getReg();
1915 LLT SrcTy = MRI.getType(Reg: SrcReg);
1916 if (SrcTy.isVector())
1917 return UnableToLegalize;
1918
1919 Register Dst0Reg = MI.getOperand(i: 0).getReg();
1920 LLT DstTy = MRI.getType(Reg: Dst0Reg);
1921 if (!DstTy.isScalar())
1922 return UnableToLegalize;
1923
1924 if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1925 if (SrcTy.isPointer()) {
1926 const DataLayout &DL = MIRBuilder.getDataLayout();
1927 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace())) {
1928 LLVM_DEBUG(
1929 dbgs() << "Not casting non-integral address space integer\n");
1930 return UnableToLegalize;
1931 }
1932
1933 SrcTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
1934 SrcReg = MIRBuilder.buildPtrToInt(Dst: SrcTy, Src: SrcReg).getReg(Idx: 0);
1935 }
1936
1937 // Widen SrcTy to WideTy. This does not affect the result, but since the
1938 // user requested this size, it is probably better handled than SrcTy and
1939 // should reduce the total number of legalization artifacts.
1940 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1941 SrcTy = WideTy;
1942 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
1943 }
1944
1945 // There's no unmerge type to target. Directly extract the bits from the
1946 // source type.
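// Sketch for unmerging s32 into two s16 pieces with the source widened to
// s64 (register names are illustrative):
//   %dst0:_(s16) = G_TRUNC %src64
//   %shr:_(s64)  = G_LSHR %src64, 16
//   %dst1:_(s16) = G_TRUNC %shr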
1947 unsigned DstSize = DstTy.getSizeInBits();
1948
1949 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
1950 for (int I = 1; I != NumDst; ++I) {
1951 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: DstSize * I);
1952 auto Shr = MIRBuilder.buildLShr(Dst: SrcTy, Src0: SrcReg, Src1: ShiftAmt);
1953 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shr);
1954 }
1955
1956 MI.eraseFromParent();
1957 return Legalized;
1958 }
1959
1960 // Extend the source to a wider type.
1961 LLT LCMTy = getLCMType(OrigTy: SrcTy, TargetTy: WideTy);
1962
1963 Register WideSrc = SrcReg;
1964 if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1965 // TODO: If this is an integral address space, cast to integer and anyext.
1966 if (SrcTy.isPointer()) {
1967 LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1968 return UnableToLegalize;
1969 }
1970
1971 WideSrc = MIRBuilder.buildAnyExt(Res: LCMTy, Op: WideSrc).getReg(Idx: 0);
1972 }
1973
1974 auto Unmerge = MIRBuilder.buildUnmerge(Res: WideTy, Op: WideSrc);
1975
1976 // Create a sequence of unmerges and merges to the original results. Since we
1977 // may have widened the source, we will need to pad the results with dead defs
1978 // to cover the source register.
1979 // e.g. widen s48 to s64:
1980 // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1981 //
1982 // =>
1983 // %4:_(s192) = G_ANYEXT %0:_(s96)
1984 // %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1985 // ; unpack to GCD type, with extra dead defs
1986 // %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1987 // %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1988 // dead %16:_(s16), dead %17, dead %18, dead %19 = G_UNMERGE_VALUES %7:_(s64)
1989 // %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10 ; Remerge to destination
1990 // %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1991 const LLT GCDTy = getGCDType(OrigTy: WideTy, TargetTy: DstTy);
1992 const int NumUnmerge = Unmerge->getNumOperands() - 1;
1993 const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1994
1995 // Directly unmerge to the destination without going through a GCD type
1996 // if possible
1997 if (PartsPerRemerge == 1) {
1998 const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1999
2000 for (int I = 0; I != NumUnmerge; ++I) {
2001 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
2002
2003 for (int J = 0; J != PartsPerUnmerge; ++J) {
2004 int Idx = I * PartsPerUnmerge + J;
2005 if (Idx < NumDst)
2006 MIB.addDef(RegNo: MI.getOperand(i: Idx).getReg());
2007 else {
2008 // Create dead def for excess components.
2009 MIB.addDef(RegNo: MRI.createGenericVirtualRegister(Ty: DstTy));
2010 }
2011 }
2012
2013 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
2014 }
2015 } else {
2016 SmallVector<Register, 16> Parts;
2017 for (int J = 0; J != NumUnmerge; ++J)
2018 extractGCDType(Parts, GCDTy, SrcReg: Unmerge.getReg(Idx: J));
2019
2020 SmallVector<Register, 8> RemergeParts;
2021 for (int I = 0; I != NumDst; ++I) {
2022 for (int J = 0; J < PartsPerRemerge; ++J) {
2023 const int Idx = I * PartsPerRemerge + J;
2024 RemergeParts.emplace_back(Args&: Parts[Idx]);
2025 }
2026
2027 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: I).getReg(), Ops: RemergeParts);
2028 RemergeParts.clear();
2029 }
2030 }
2031
2032 MI.eraseFromParent();
2033 return Legalized;
2034}
2035
2036LegalizerHelper::LegalizeResult
2037LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2038 LLT WideTy) {
2039 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2040 unsigned Offset = MI.getOperand(i: 2).getImm();
2041
2042 if (TypeIdx == 0) {
2043 if (SrcTy.isVector() || DstTy.isVector())
2044 return UnableToLegalize;
2045
2046 SrcOp Src(SrcReg);
2047 if (SrcTy.isPointer()) {
2048 // Extracts from pointers can be handled only if they are really just
2049 // simple integers.
2050 const DataLayout &DL = MIRBuilder.getDataLayout();
2051 if (DL.isNonIntegralAddressSpace(AddrSpace: SrcTy.getAddressSpace()))
2052 return UnableToLegalize;
2053
2054 LLT SrcAsIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
2055 Src = MIRBuilder.buildPtrToInt(Dst: SrcAsIntTy, Src);
2056 SrcTy = SrcAsIntTy;
2057 }
2058
2059 if (DstTy.isPointer())
2060 return UnableToLegalize;
2061
2062 if (Offset == 0) {
2063 // Avoid a shift in the degenerate case.
2064 MIRBuilder.buildTrunc(Res: DstReg,
2065 Op: MIRBuilder.buildAnyExtOrTrunc(Res: WideTy, Op: Src));
2066 MI.eraseFromParent();
2067 return Legalized;
2068 }
2069
2070 // Do a shift in the source type.
2071 LLT ShiftTy = SrcTy;
2072 if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2073 Src = MIRBuilder.buildAnyExt(Res: WideTy, Op: Src);
2074 ShiftTy = WideTy;
2075 }
2076
2077 auto LShr = MIRBuilder.buildLShr(
2078 Dst: ShiftTy, Src0: Src, Src1: MIRBuilder.buildConstant(Res: ShiftTy, Val: Offset));
2079 MIRBuilder.buildTrunc(Res: DstReg, Op: LShr);
2080 MI.eraseFromParent();
2081 return Legalized;
2082 }
2083
2084 if (SrcTy.isScalar()) {
2085 Observer.changingInstr(MI);
2086 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2087 Observer.changedInstr(MI);
2088 return Legalized;
2089 }
2090
2091 if (!SrcTy.isVector())
2092 return UnableToLegalize;
2093
2094 if (DstTy != SrcTy.getElementType())
2095 return UnableToLegalize;
2096
2097 if (Offset % SrcTy.getScalarSizeInBits() != 0)
2098 return UnableToLegalize;
2099
2100 Observer.changingInstr(MI);
2101 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2102
2103 MI.getOperand(i: 2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2104 Offset);
2105 widenScalarDst(MI, WideTy: WideTy.getScalarType(), OpIdx: 0);
2106 Observer.changedInstr(MI);
2107 return Legalized;
2108}
2109
2110LegalizerHelper::LegalizeResult
2111LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2112 LLT WideTy) {
2113 if (TypeIdx != 0 || WideTy.isVector())
2114 return UnableToLegalize;
2115 Observer.changingInstr(MI);
2116 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2117 widenScalarDst(MI, WideTy);
2118 Observer.changedInstr(MI);
2119 return Legalized;
2120}
2121
2122LegalizerHelper::LegalizeResult
2123LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2124 LLT WideTy) {
2125 unsigned Opcode;
2126 unsigned ExtOpcode;
2127 std::optional<Register> CarryIn;
2128 switch (MI.getOpcode()) {
2129 default:
2130 llvm_unreachable("Unexpected opcode!");
2131 case TargetOpcode::G_SADDO:
2132 Opcode = TargetOpcode::G_ADD;
2133 ExtOpcode = TargetOpcode::G_SEXT;
2134 break;
2135 case TargetOpcode::G_SSUBO:
2136 Opcode = TargetOpcode::G_SUB;
2137 ExtOpcode = TargetOpcode::G_SEXT;
2138 break;
2139 case TargetOpcode::G_UADDO:
2140 Opcode = TargetOpcode::G_ADD;
2141 ExtOpcode = TargetOpcode::G_ZEXT;
2142 break;
2143 case TargetOpcode::G_USUBO:
2144 Opcode = TargetOpcode::G_SUB;
2145 ExtOpcode = TargetOpcode::G_ZEXT;
2146 break;
2147 case TargetOpcode::G_SADDE:
2148 Opcode = TargetOpcode::G_UADDE;
2149 ExtOpcode = TargetOpcode::G_SEXT;
2150 CarryIn = MI.getOperand(i: 4).getReg();
2151 break;
2152 case TargetOpcode::G_SSUBE:
2153 Opcode = TargetOpcode::G_USUBE;
2154 ExtOpcode = TargetOpcode::G_SEXT;
2155 CarryIn = MI.getOperand(i: 4).getReg();
2156 break;
2157 case TargetOpcode::G_UADDE:
2158 Opcode = TargetOpcode::G_UADDE;
2159 ExtOpcode = TargetOpcode::G_ZEXT;
2160 CarryIn = MI.getOperand(i: 4).getReg();
2161 break;
2162 case TargetOpcode::G_USUBE:
2163 Opcode = TargetOpcode::G_USUBE;
2164 ExtOpcode = TargetOpcode::G_ZEXT;
2165 CarryIn = MI.getOperand(i: 4).getReg();
2166 break;
2167 }
2168
2169 if (TypeIdx == 1) {
2170 unsigned BoolExtOp = MIRBuilder.getBoolExtOp(IsVec: WideTy.isVector(), IsFP: false);
2171
2172 Observer.changingInstr(MI);
2173 if (CarryIn)
2174 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: BoolExtOp);
2175 widenScalarDst(MI, WideTy, OpIdx: 1);
2176
2177 Observer.changedInstr(MI);
2178 return Legalized;
2179 }
2180
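// Widen both operands, do the arithmetic at the wider width, then recover the
// overflow flag by checking whether the wide result survives a round-trip
// through the original width. Sketch for G_UADDO on s8 widened to s32
// (register names are illustrative):
//   %a32:_(s32) = G_ZEXT %a:_(s8)
//   %b32:_(s32) = G_ZEXT %b:_(s8)
//   %sum:_(s32) = G_ADD %a32, %b32
//   %t:_(s8)    = G_TRUNC %sum
//   %rt:_(s32)  = G_ZEXT %t
//   %ovf:_(s1)  = G_ICMP ne, %sum, %rt
//   %res:_(s8)  = G_TRUNC %sum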
2181 auto LHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
2182 auto RHSExt = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 3)});
2183 // Do the arithmetic in the larger type.
2184 Register NewOp;
2185 if (CarryIn) {
2186 LLT CarryOutTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg());
2187 NewOp = MIRBuilder
2188 .buildInstr(Opc: Opcode, DstOps: {WideTy, CarryOutTy},
2189 SrcOps: {LHSExt, RHSExt, *CarryIn})
2190 .getReg(Idx: 0);
2191 } else {
2192 NewOp = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {WideTy}, SrcOps: {LHSExt, RHSExt}).getReg(Idx: 0);
2193 }
2194 LLT OrigTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2195 auto TruncOp = MIRBuilder.buildTrunc(Res: OrigTy, Op: NewOp);
2196 auto ExtOp = MIRBuilder.buildInstr(Opc: ExtOpcode, DstOps: {WideTy}, SrcOps: {TruncOp});
2197 // There is no overflow if the ExtOp is the same as NewOp.
2198 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: MI.getOperand(i: 1), Op0: NewOp, Op1: ExtOp);
2199 // Now trunc the NewOp to the original result.
2200 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0), Op: NewOp);
2201 MI.eraseFromParent();
2202 return Legalized;
2203}
2204
2205LegalizerHelper::LegalizeResult
2206LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2207 LLT WideTy) {
2208 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2209 MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2210 MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2211 bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2212 MI.getOpcode() == TargetOpcode::G_USHLSAT;
2213 // We can convert this to:
2214 // 1. Any extend iN to iM
2215 // 2. SHL by M-N
2216 // 3. [US][ADD|SUB|SHL]SAT
2217 // 4. L/ASHR by M-N
2218 //
2219 // It may be more efficient to lower this to a min and a max operation in
2220 // the higher precision arithmetic if the promoted operation isn't legal,
2221 // but this decision is up to the target's lowering request.
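// Sketch for G_UADDSAT on s8 widened to s32 (register names illustrative):
//   %a32:_(s32) = G_ANYEXT %a
//   %b32:_(s32) = G_ANYEXT %b
//   %as:_(s32)  = G_SHL %a32, 24
//   %bs:_(s32)  = G_SHL %b32, 24
//   %sat:_(s32) = G_UADDSAT %as, %bs
//   %shr:_(s32) = G_LSHR %sat, 24
//   %res:_(s8)  = G_TRUNC %shr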
2222 Register DstReg = MI.getOperand(i: 0).getReg();
2223
2224 unsigned NewBits = WideTy.getScalarSizeInBits();
2225 unsigned SHLAmount = NewBits - MRI.getType(Reg: DstReg).getScalarSizeInBits();
2226
2227 // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2228 // must not left shift the RHS to preserve the shift amount.
2229 auto LHS = MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 1));
2230 auto RHS = IsShift ? MIRBuilder.buildZExt(Res: WideTy, Op: MI.getOperand(i: 2))
2231 : MIRBuilder.buildAnyExt(Res: WideTy, Op: MI.getOperand(i: 2));
2232 auto ShiftK = MIRBuilder.buildConstant(Res: WideTy, Val: SHLAmount);
2233 auto ShiftL = MIRBuilder.buildShl(Dst: WideTy, Src0: LHS, Src1: ShiftK);
2234 auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(Dst: WideTy, Src0: RHS, Src1: ShiftK);
2235
2236 auto WideInst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {WideTy},
2237 SrcOps: {ShiftL, ShiftR}, Flags: MI.getFlags());
2238
2239 // Use a shift that will preserve the number of sign bits when the trunc is
2240 // folded away.
2241 auto Result = IsSigned ? MIRBuilder.buildAShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK)
2242 : MIRBuilder.buildLShr(Dst: WideTy, Src0: WideInst, Src1: ShiftK);
2243
2244 MIRBuilder.buildTrunc(Res: DstReg, Op: Result);
2245 MI.eraseFromParent();
2246 return Legalized;
2247}
2248
2249LegalizerHelper::LegalizeResult
2250LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2251 LLT WideTy) {
2252 if (TypeIdx == 1) {
2253 Observer.changingInstr(MI);
2254 widenScalarDst(MI, WideTy, OpIdx: 1);
2255 Observer.changedInstr(MI);
2256 return Legalized;
2257 }
2258
2259 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2260 auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2261 LLT SrcTy = MRI.getType(Reg: LHS);
2262 LLT OverflowTy = MRI.getType(Reg: OriginalOverflow);
2263 unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2264
2265 // To determine if the result overflowed in the larger type, we extend the
2266 // input to the larger type, do the multiply (checking if it overflows),
2267 // then also check the high bits of the result to see if overflow happened
2268 // there.
2269 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2270 auto LeftOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {LHS});
2271 auto RightOperand = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {RHS});
2272
2273 // Multiplication cannot overflow if WideTy is at least twice the original
2274 // width, so we don't need to check the overflow result of the wide Mulo.
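// e.g. for G_UMULO on s16 widened to s32 the product fits, so this becomes
// (sketch; register names are illustrative):
//   %a32:_(s32) = G_ZEXT %a
//   %b32:_(s32) = G_ZEXT %b
//   %mul:_(s32) = G_MUL %a32, %b32
//   %lo:_(s32)  = zero-extend-in-register of %mul from bit 16
//   %ovf:_(s1)  = G_ICMP ne, %mul, %lo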
2275 bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2276
2277 unsigned MulOpc =
2278 WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2279
2280 MachineInstrBuilder Mulo;
2281 if (WideMulCanOverflow)
2282 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy, OverflowTy},
2283 SrcOps: {LeftOperand, RightOperand});
2284 else
2285 Mulo = MIRBuilder.buildInstr(Opc: MulOpc, DstOps: {WideTy}, SrcOps: {LeftOperand, RightOperand});
2286
2287 auto Mul = Mulo->getOperand(i: 0);
2288 MIRBuilder.buildTrunc(Res: Result, Op: Mul);
2289
2290 MachineInstrBuilder ExtResult;
2291 // Overflow occurred if it occurred in the larger type, or if the high part
2292 // of the result does not zero/sign-extend the low part. Check this second
2293 // possibility first.
2294 if (IsSigned) {
2295 // For signed, overflow occurred when the high part does not sign-extend
2296 // the low part.
2297 ExtResult = MIRBuilder.buildSExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2298 } else {
2299 // Unsigned overflow occurred when the high part does not zero-extend the
2300 // low part.
2301 ExtResult = MIRBuilder.buildZExtInReg(Res: WideTy, Op: Mul, ImmOp: SrcBitWidth);
2302 }
2303
2304 if (WideMulCanOverflow) {
2305 auto Overflow =
2306 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OverflowTy, Op0: Mul, Op1: ExtResult);
2307 // Finally check if the multiplication in the larger type itself overflowed.
2308 MIRBuilder.buildOr(Dst: OriginalOverflow, Src0: Mulo->getOperand(i: 1), Src1: Overflow);
2309 } else {
2310 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: OriginalOverflow, Op0: Mul, Op1: ExtResult);
2311 }
2312 MI.eraseFromParent();
2313 return Legalized;
2314}
2315
2316LegalizerHelper::LegalizeResult
2317LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2318 switch (MI.getOpcode()) {
2319 default:
2320 return UnableToLegalize;
2321 case TargetOpcode::G_ATOMICRMW_XCHG:
2322 case TargetOpcode::G_ATOMICRMW_ADD:
2323 case TargetOpcode::G_ATOMICRMW_SUB:
2324 case TargetOpcode::G_ATOMICRMW_AND:
2325 case TargetOpcode::G_ATOMICRMW_OR:
2326 case TargetOpcode::G_ATOMICRMW_XOR:
2327 case TargetOpcode::G_ATOMICRMW_MIN:
2328 case TargetOpcode::G_ATOMICRMW_MAX:
2329 case TargetOpcode::G_ATOMICRMW_UMIN:
2330 case TargetOpcode::G_ATOMICRMW_UMAX:
2331 assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2332 Observer.changingInstr(MI);
2333 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2334 widenScalarDst(MI, WideTy, OpIdx: 0);
2335 Observer.changedInstr(MI);
2336 return Legalized;
2337 case TargetOpcode::G_ATOMIC_CMPXCHG:
2338 assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2339 Observer.changingInstr(MI);
2340 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2341 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2342 widenScalarDst(MI, WideTy, OpIdx: 0);
2343 Observer.changedInstr(MI);
2344 return Legalized;
2345 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2346 if (TypeIdx == 0) {
2347 Observer.changingInstr(MI);
2348 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2349 widenScalarSrc(MI, WideTy, OpIdx: 4, ExtOpcode: TargetOpcode::G_ANYEXT);
2350 widenScalarDst(MI, WideTy, OpIdx: 0);
2351 Observer.changedInstr(MI);
2352 return Legalized;
2353 }
2354 assert(TypeIdx == 1 &&
2355 "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2356 Observer.changingInstr(MI);
2357 widenScalarDst(MI, WideTy, OpIdx: 1);
2358 Observer.changedInstr(MI);
2359 return Legalized;
2360 case TargetOpcode::G_EXTRACT:
2361 return widenScalarExtract(MI, TypeIdx, WideTy);
2362 case TargetOpcode::G_INSERT:
2363 return widenScalarInsert(MI, TypeIdx, WideTy);
2364 case TargetOpcode::G_MERGE_VALUES:
2365 return widenScalarMergeValues(MI, TypeIdx, WideTy);
2366 case TargetOpcode::G_UNMERGE_VALUES:
2367 return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2368 case TargetOpcode::G_SADDO:
2369 case TargetOpcode::G_SSUBO:
2370 case TargetOpcode::G_UADDO:
2371 case TargetOpcode::G_USUBO:
2372 case TargetOpcode::G_SADDE:
2373 case TargetOpcode::G_SSUBE:
2374 case TargetOpcode::G_UADDE:
2375 case TargetOpcode::G_USUBE:
2376 return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2377 case TargetOpcode::G_UMULO:
2378 case TargetOpcode::G_SMULO:
2379 return widenScalarMulo(MI, TypeIdx, WideTy);
2380 case TargetOpcode::G_SADDSAT:
2381 case TargetOpcode::G_SSUBSAT:
2382 case TargetOpcode::G_SSHLSAT:
2383 case TargetOpcode::G_UADDSAT:
2384 case TargetOpcode::G_USUBSAT:
2385 case TargetOpcode::G_USHLSAT:
2386 return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2387 case TargetOpcode::G_CTTZ:
2388 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2389 case TargetOpcode::G_CTLZ:
2390 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2391 case TargetOpcode::G_CTPOP: {
2392 if (TypeIdx == 0) {
2393 Observer.changingInstr(MI);
2394 widenScalarDst(MI, WideTy, OpIdx: 0);
2395 Observer.changedInstr(MI);
2396 return Legalized;
2397 }
2398
2399 Register SrcReg = MI.getOperand(i: 1).getReg();
2400
2401 // First extend the input.
2402 unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2403 MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2404 ? TargetOpcode::G_ANYEXT
2405 : TargetOpcode::G_ZEXT;
2406 auto MIBSrc = MIRBuilder.buildInstr(Opc: ExtOpc, DstOps: {WideTy}, SrcOps: {SrcReg});
2407 LLT CurTy = MRI.getType(Reg: SrcReg);
2408 unsigned NewOpc = MI.getOpcode();
2409 if (NewOpc == TargetOpcode::G_CTTZ) {
2410 // The count is the same in the larger type except if the original
2411 // value was zero. This can be handled by setting the bit just off
2412 // the top of the original type.
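// e.g. for s8 widened to s32, OR in bit 8 so a zero input still yields a
// count of 8 (sketch):
//   %in:_(s32)  = G_OR %src32, 0x100
//   %cnt:_(s32) = G_CTTZ_ZERO_UNDEF %in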
2413 auto TopBit =
2414 APInt::getOneBitSet(numBits: WideTy.getSizeInBits(), BitNo: CurTy.getSizeInBits());
2415 MIBSrc = MIRBuilder.buildOr(
2416 Dst: WideTy, Src0: MIBSrc, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: TopBit));
2417 // Now that we know the operand is non-zero, use the more relaxed opcode.
2418 NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2419 }
2420
2421 // Perform the operation at the larger size.
2422 auto MIBNewOp = MIRBuilder.buildInstr(Opc: NewOpc, DstOps: {WideTy}, SrcOps: {MIBSrc});
2423 // This is already the correct result for CTPOP and CTTZs
2424 if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2425 MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2426 // The correct result is NewOp - (difference between the wide and original widths).
2427 unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2428 MIBNewOp = MIRBuilder.buildSub(
2429 Dst: WideTy, Src0: MIBNewOp, Src1: MIRBuilder.buildConstant(Res: WideTy, Val: SizeDiff));
2430 }
2431
2432 MIRBuilder.buildZExtOrTrunc(Res: MI.getOperand(i: 0), Op: MIBNewOp);
2433 MI.eraseFromParent();
2434 return Legalized;
2435 }
2436 case TargetOpcode::G_BSWAP: {
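// The byte swap of the widened value lands in the high bits, so shift it
// back down before truncating. Sketch for s16 G_BSWAP done in s32
// (register names are illustrative):
//   %ext:_(s32) = G_ANYEXT %src
//   %w:_(s32)   = G_BSWAP %ext
//   %shr:_(s32) = G_LSHR %w, 16
//   %dst:_(s16) = G_TRUNC %shr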
2437 Observer.changingInstr(MI);
2438 Register DstReg = MI.getOperand(i: 0).getReg();
2439
2440 Register ShrReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2441 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2442 Register ShiftAmtReg = MRI.createGenericVirtualRegister(Ty: WideTy);
2443 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2444
2445 MI.getOperand(i: 0).setReg(DstExt);
2446
2447 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2448
2449 LLT Ty = MRI.getType(Reg: DstReg);
2450 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2451 MIRBuilder.buildConstant(Res: ShiftAmtReg, Val: DiffBits);
2452 MIRBuilder.buildLShr(Dst: ShrReg, Src0: DstExt, Src1: ShiftAmtReg);
2453
2454 MIRBuilder.buildTrunc(Res: DstReg, Op: ShrReg);
2455 Observer.changedInstr(MI);
2456 return Legalized;
2457 }
2458 case TargetOpcode::G_BITREVERSE: {
2459 Observer.changingInstr(MI);
2460
2461 Register DstReg = MI.getOperand(i: 0).getReg();
2462 LLT Ty = MRI.getType(Reg: DstReg);
2463 unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2464
2465 Register DstExt = MRI.createGenericVirtualRegister(Ty: WideTy);
2466 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2467 MI.getOperand(i: 0).setReg(DstExt);
2468 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
2469
2470 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: DiffBits);
2471 auto Shift = MIRBuilder.buildLShr(Dst: WideTy, Src0: DstExt, Src1: ShiftAmt);
2472 MIRBuilder.buildTrunc(Res: DstReg, Op: Shift);
2473 Observer.changedInstr(MI);
2474 return Legalized;
2475 }
2476 case TargetOpcode::G_FREEZE:
2477 Observer.changingInstr(MI);
2478 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2479 widenScalarDst(MI, WideTy);
2480 Observer.changedInstr(MI);
2481 return Legalized;
2482
2483 case TargetOpcode::G_ABS:
2484 Observer.changingInstr(MI);
2485 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2486 widenScalarDst(MI, WideTy);
2487 Observer.changedInstr(MI);
2488 return Legalized;
2489
2490 case TargetOpcode::G_ADD:
2491 case TargetOpcode::G_AND:
2492 case TargetOpcode::G_MUL:
2493 case TargetOpcode::G_OR:
2494 case TargetOpcode::G_XOR:
2495 case TargetOpcode::G_SUB:
2496 // Perform operation at larger width (any extension is fine here, high bits
2497 // don't affect the result) and then truncate the result back to the
2498 // original type.
2499 Observer.changingInstr(MI);
2500 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2501 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2502 widenScalarDst(MI, WideTy);
2503 Observer.changedInstr(MI);
2504 return Legalized;
2505
2506 case TargetOpcode::G_SBFX:
2507 case TargetOpcode::G_UBFX:
2508 Observer.changingInstr(MI);
2509
2510 if (TypeIdx == 0) {
2511 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2512 widenScalarDst(MI, WideTy);
2513 } else {
2514 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2515 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2516 }
2517
2518 Observer.changedInstr(MI);
2519 return Legalized;
2520
2521 case TargetOpcode::G_SHL:
2522 Observer.changingInstr(MI);
2523
2524 if (TypeIdx == 0) {
2525 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2526 widenScalarDst(MI, WideTy);
2527 } else {
2528 assert(TypeIdx == 1);
2529 // The "number of bits to shift" operand must preserve its value as an
2530 // unsigned integer:
2531 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2532 }
2533
2534 Observer.changedInstr(MI);
2535 return Legalized;
2536
2537 case TargetOpcode::G_ROTR:
2538 case TargetOpcode::G_ROTL:
2539 if (TypeIdx != 1)
2540 return UnableToLegalize;
2541
2542 Observer.changingInstr(MI);
2543 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2544 Observer.changedInstr(MI);
2545 return Legalized;
2546
2547 case TargetOpcode::G_SDIV:
2548 case TargetOpcode::G_SREM:
2549 case TargetOpcode::G_SMIN:
2550 case TargetOpcode::G_SMAX:
2551 Observer.changingInstr(MI);
2552 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2553 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2554 widenScalarDst(MI, WideTy);
2555 Observer.changedInstr(MI);
2556 return Legalized;
2557
2558 case TargetOpcode::G_SDIVREM:
2559 Observer.changingInstr(MI);
2560 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2561 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
2562 widenScalarDst(MI, WideTy);
2563 widenScalarDst(MI, WideTy, OpIdx: 1);
2564 Observer.changedInstr(MI);
2565 return Legalized;
2566
2567 case TargetOpcode::G_ASHR:
2568 case TargetOpcode::G_LSHR:
2569 Observer.changingInstr(MI);
2570
2571 if (TypeIdx == 0) {
2572 unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2573 TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2574
2575 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: CvtOp);
2576 widenScalarDst(MI, WideTy);
2577 } else {
2578 assert(TypeIdx == 1);
2579 // The "number of bits to shift" operand must preserve its value as an
2580 // unsigned integer:
2581 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2582 }
2583
2584 Observer.changedInstr(MI);
2585 return Legalized;
2586 case TargetOpcode::G_UDIV:
2587 case TargetOpcode::G_UREM:
2588 case TargetOpcode::G_UMIN:
2589 case TargetOpcode::G_UMAX:
2590 Observer.changingInstr(MI);
2591 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
2592 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2593 widenScalarDst(MI, WideTy);
2594 Observer.changedInstr(MI);
2595 return Legalized;
2596
2597 case TargetOpcode::G_UDIVREM:
2598 Observer.changingInstr(MI);
2599 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2600 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ZEXT);
2601 widenScalarDst(MI, WideTy);
2602 widenScalarDst(MI, WideTy, OpIdx: 1);
2603 Observer.changedInstr(MI);
2604 return Legalized;
2605
2606 case TargetOpcode::G_SELECT:
2607 Observer.changingInstr(MI);
2608 if (TypeIdx == 0) {
2609 // Perform operation at larger width (any extension is fine here, high
2610 // bits don't affect the result) and then truncate the result back to the
2611 // original type.
2612 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2613 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_ANYEXT);
2614 widenScalarDst(MI, WideTy);
2615 } else {
2616 bool IsVec = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector();
2617 // Explicit extension is required here since high bits affect the result.
2618 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec, IsFP: false));
2619 }
2620 Observer.changedInstr(MI);
2621 return Legalized;
2622
2623 case TargetOpcode::G_FPTOSI:
2624 case TargetOpcode::G_FPTOUI:
2625 case TargetOpcode::G_IS_FPCLASS:
2626 Observer.changingInstr(MI);
2627
2628 if (TypeIdx == 0)
2629 widenScalarDst(MI, WideTy);
2630 else
2631 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
2632
2633 Observer.changedInstr(MI);
2634 return Legalized;
2635 case TargetOpcode::G_SITOFP:
2636 Observer.changingInstr(MI);
2637
2638 if (TypeIdx == 0)
2639 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2640 else
2641 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_SEXT);
2642
2643 Observer.changedInstr(MI);
2644 return Legalized;
2645 case TargetOpcode::G_UITOFP:
2646 Observer.changingInstr(MI);
2647
2648 if (TypeIdx == 0)
2649 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2650 else
2651 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
2652
2653 Observer.changedInstr(MI);
2654 return Legalized;
2655 case TargetOpcode::G_LOAD:
2656 case TargetOpcode::G_SEXTLOAD:
2657 case TargetOpcode::G_ZEXTLOAD:
2658 Observer.changingInstr(MI);
2659 widenScalarDst(MI, WideTy);
2660 Observer.changedInstr(MI);
2661 return Legalized;
2662
2663 case TargetOpcode::G_STORE: {
2664 if (TypeIdx != 0)
2665 return UnableToLegalize;
2666
2667 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
2668 if (!Ty.isScalar())
2669 return UnableToLegalize;
2670
2671 Observer.changingInstr(MI);
2672
2673 unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2674 TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2675 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: ExtType);
2676
2677 Observer.changedInstr(MI);
2678 return Legalized;
2679 }
2680 case TargetOpcode::G_CONSTANT: {
2681 MachineOperand &SrcMO = MI.getOperand(i: 1);
2682 LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2683 unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2684 SmallTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()));
2685 assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2686 ExtOpc == TargetOpcode::G_ANYEXT) &&
2687 "Illegal Extend");
2688 const APInt &SrcVal = SrcMO.getCImm()->getValue();
2689 const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2690 ? SrcVal.sext(width: WideTy.getSizeInBits())
2691 : SrcVal.zext(width: WideTy.getSizeInBits());
2692 Observer.changingInstr(MI);
2693 SrcMO.setCImm(ConstantInt::get(Context&: Ctx, V: Val));
2694
2695 widenScalarDst(MI, WideTy);
2696 Observer.changedInstr(MI);
2697 return Legalized;
2698 }
2699 case TargetOpcode::G_FCONSTANT: {
2700 // To avoid changing the bits of the constant due to extension to a larger
2701 // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
2702 MachineOperand &SrcMO = MI.getOperand(i: 1);
2703 APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
2704 MIRBuilder.setInstrAndDebugLoc(MI);
2705 auto IntCst = MIRBuilder.buildConstant(Res: MI.getOperand(i: 0).getReg(), Val);
2706 widenScalarDst(MI&: *IntCst, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
2707 MI.eraseFromParent();
2708 return Legalized;
2709 }
2710 case TargetOpcode::G_IMPLICIT_DEF: {
2711 Observer.changingInstr(MI);
2712 widenScalarDst(MI, WideTy);
2713 Observer.changedInstr(MI);
2714 return Legalized;
2715 }
2716 case TargetOpcode::G_BRCOND:
2717 Observer.changingInstr(MI);
2718 widenScalarSrc(MI, WideTy, OpIdx: 0, ExtOpcode: MIRBuilder.getBoolExtOp(IsVec: false, IsFP: false));
2719 Observer.changedInstr(MI);
2720 return Legalized;
2721
2722 case TargetOpcode::G_FCMP:
2723 Observer.changingInstr(MI);
2724 if (TypeIdx == 0)
2725 widenScalarDst(MI, WideTy);
2726 else {
2727 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
2728 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_FPEXT);
2729 }
2730 Observer.changedInstr(MI);
2731 return Legalized;
2732
2733 case TargetOpcode::G_ICMP:
2734 Observer.changingInstr(MI);
2735 if (TypeIdx == 0)
2736 widenScalarDst(MI, WideTy);
2737 else {
2738 unsigned ExtOpcode = CmpInst::isSigned(predicate: static_cast<CmpInst::Predicate>(
2739 MI.getOperand(i: 1).getPredicate()))
2740 ? TargetOpcode::G_SEXT
2741 : TargetOpcode::G_ZEXT;
2742 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode);
2743 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode);
2744 }
2745 Observer.changedInstr(MI);
2746 return Legalized;
2747
2748 case TargetOpcode::G_PTR_ADD:
2749 assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2750 Observer.changingInstr(MI);
2751 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2752 Observer.changedInstr(MI);
2753 return Legalized;
2754
2755 case TargetOpcode::G_PHI: {
2756 assert(TypeIdx == 0 && "Expecting only Idx 0");
2757
2758 Observer.changingInstr(MI);
2759 for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2760 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
2761 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
2762 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
2763 }
2764
2765 MachineBasicBlock &MBB = *MI.getParent();
2766 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
2767 widenScalarDst(MI, WideTy);
2768 Observer.changedInstr(MI);
2769 return Legalized;
2770 }
2771 case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2772 if (TypeIdx == 0) {
2773 Register VecReg = MI.getOperand(i: 1).getReg();
2774 LLT VecTy = MRI.getType(Reg: VecReg);
2775 Observer.changingInstr(MI);
2776
2777 widenScalarSrc(
2778 MI, WideTy: LLT::vector(EC: VecTy.getElementCount(), ScalarSizeInBits: WideTy.getSizeInBits()), OpIdx: 1,
2779 ExtOpcode: TargetOpcode::G_ANYEXT);
2780
2781 widenScalarDst(MI, WideTy, OpIdx: 0);
2782 Observer.changedInstr(MI);
2783 return Legalized;
2784 }
2785
2786 if (TypeIdx != 2)
2787 return UnableToLegalize;
2788 Observer.changingInstr(MI);
2789 // TODO: Probably should be zext
2790 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2791 Observer.changedInstr(MI);
2792 return Legalized;
2793 }
2794 case TargetOpcode::G_INSERT_VECTOR_ELT: {
2795 if (TypeIdx == 0) {
2796 Observer.changingInstr(MI);
2797 const LLT WideEltTy = WideTy.getElementType();
2798
2799 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2800 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2801 widenScalarDst(MI, WideTy, OpIdx: 0);
2802 Observer.changedInstr(MI);
2803 return Legalized;
2804 }
2805
2806 if (TypeIdx == 1) {
2807 Observer.changingInstr(MI);
2808
2809 Register VecReg = MI.getOperand(i: 1).getReg();
2810 LLT VecTy = MRI.getType(Reg: VecReg);
2811 LLT WideVecTy = LLT::vector(EC: VecTy.getElementCount(), ScalarTy: WideTy);
2812
2813 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2814 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ANYEXT);
2815 widenScalarDst(MI, WideTy: WideVecTy, OpIdx: 0);
2816 Observer.changedInstr(MI);
2817 return Legalized;
2818 }
2819
2820 if (TypeIdx == 2) {
2821 Observer.changingInstr(MI);
2822 // TODO: Probably should be zext
2823 widenScalarSrc(MI, WideTy, OpIdx: 3, ExtOpcode: TargetOpcode::G_SEXT);
2824 Observer.changedInstr(MI);
2825 return Legalized;
2826 }
2827
2828 return UnableToLegalize;
2829 }
2830 case TargetOpcode::G_FADD:
2831 case TargetOpcode::G_FMUL:
2832 case TargetOpcode::G_FSUB:
2833 case TargetOpcode::G_FMA:
2834 case TargetOpcode::G_FMAD:
2835 case TargetOpcode::G_FNEG:
2836 case TargetOpcode::G_FABS:
2837 case TargetOpcode::G_FCANONICALIZE:
2838 case TargetOpcode::G_FMINNUM:
2839 case TargetOpcode::G_FMAXNUM:
2840 case TargetOpcode::G_FMINNUM_IEEE:
2841 case TargetOpcode::G_FMAXNUM_IEEE:
2842 case TargetOpcode::G_FMINIMUM:
2843 case TargetOpcode::G_FMAXIMUM:
2844 case TargetOpcode::G_FDIV:
2845 case TargetOpcode::G_FREM:
2846 case TargetOpcode::G_FCEIL:
2847 case TargetOpcode::G_FFLOOR:
2848 case TargetOpcode::G_FCOS:
2849 case TargetOpcode::G_FSIN:
2850 case TargetOpcode::G_FLOG10:
2851 case TargetOpcode::G_FLOG:
2852 case TargetOpcode::G_FLOG2:
2853 case TargetOpcode::G_FRINT:
2854 case TargetOpcode::G_FNEARBYINT:
2855 case TargetOpcode::G_FSQRT:
2856 case TargetOpcode::G_FEXP:
2857 case TargetOpcode::G_FEXP2:
2858 case TargetOpcode::G_FEXP10:
2859 case TargetOpcode::G_FPOW:
2860 case TargetOpcode::G_INTRINSIC_TRUNC:
2861 case TargetOpcode::G_INTRINSIC_ROUND:
2862 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2863 assert(TypeIdx == 0);
2864 Observer.changingInstr(MI);
2865
2866 for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2867 widenScalarSrc(MI, WideTy, OpIdx: I, ExtOpcode: TargetOpcode::G_FPEXT);
2868
2869 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2870 Observer.changedInstr(MI);
2871 return Legalized;
2872 case TargetOpcode::G_FPOWI:
2873 case TargetOpcode::G_FLDEXP:
2874 case TargetOpcode::G_STRICT_FLDEXP: {
2875 if (TypeIdx == 0) {
2876 if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
2877 return UnableToLegalize;
2878
2879 Observer.changingInstr(MI);
2880 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
2881 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2882 Observer.changedInstr(MI);
2883 return Legalized;
2884 }
2885
2886 if (TypeIdx == 1) {
2887 // For some reason SelectionDAG tries to promote to a libcall without
2888 // actually changing the integer type for promotion.
2889 Observer.changingInstr(MI);
2890 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_SEXT);
2891 Observer.changedInstr(MI);
2892 return Legalized;
2893 }
2894
2895 return UnableToLegalize;
2896 }
2897 case TargetOpcode::G_FFREXP: {
2898 Observer.changingInstr(MI);
2899
2900 if (TypeIdx == 0) {
2901 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_FPEXT);
2902 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2903 } else {
2904 widenScalarDst(MI, WideTy, OpIdx: 1);
2905 }
2906
2907 Observer.changedInstr(MI);
2908 return Legalized;
2909 }
2910 case TargetOpcode::G_INTTOPTR:
2911 if (TypeIdx != 1)
2912 return UnableToLegalize;
2913
2914 Observer.changingInstr(MI);
2915 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ZEXT);
2916 Observer.changedInstr(MI);
2917 return Legalized;
2918 case TargetOpcode::G_PTRTOINT:
2919 if (TypeIdx != 0)
2920 return UnableToLegalize;
2921
2922 Observer.changingInstr(MI);
2923 widenScalarDst(MI, WideTy, OpIdx: 0);
2924 Observer.changedInstr(MI);
2925 return Legalized;
2926 case TargetOpcode::G_BUILD_VECTOR: {
2927 Observer.changingInstr(MI);
2928
2929 const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2930 for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2931 widenScalarSrc(MI, WideTy: WideEltTy, OpIdx: I, ExtOpcode: TargetOpcode::G_ANYEXT);
2932
2933 // Avoid changing the result vector type if the source element type was
2934 // requested.
2935 if (TypeIdx == 1) {
2936 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::G_BUILD_VECTOR_TRUNC));
2937 } else {
2938 widenScalarDst(MI, WideTy, OpIdx: 0);
2939 }
2940
2941 Observer.changedInstr(MI);
2942 return Legalized;
2943 }
2944 case TargetOpcode::G_SEXT_INREG:
2945 if (TypeIdx != 0)
2946 return UnableToLegalize;
2947
2948 Observer.changingInstr(MI);
2949 widenScalarSrc(MI, WideTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_ANYEXT);
2950 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_TRUNC);
2951 Observer.changedInstr(MI);
2952 return Legalized;
2953 case TargetOpcode::G_PTRMASK: {
2954 if (TypeIdx != 1)
2955 return UnableToLegalize;
2956 Observer.changingInstr(MI);
2957 widenScalarSrc(MI, WideTy, OpIdx: 2, ExtOpcode: TargetOpcode::G_ZEXT);
2958 Observer.changedInstr(MI);
2959 return Legalized;
2960 }
2961 case TargetOpcode::G_VECREDUCE_FADD:
2962 case TargetOpcode::G_VECREDUCE_FMUL:
2963 case TargetOpcode::G_VECREDUCE_FMIN:
2964 case TargetOpcode::G_VECREDUCE_FMAX:
2965 case TargetOpcode::G_VECREDUCE_FMINIMUM:
2966 case TargetOpcode::G_VECREDUCE_FMAXIMUM:
2967 if (TypeIdx != 0)
2968 return UnableToLegalize;
2969 Observer.changingInstr(MI);
2970 Register VecReg = MI.getOperand(i: 1).getReg();
2971 LLT VecTy = MRI.getType(Reg: VecReg);
2972 LLT WideVecTy = VecTy.isVector()
2973 ? LLT::vector(EC: VecTy.getElementCount(), ScalarTy: WideTy)
2974 : WideTy;
2975 widenScalarSrc(MI, WideTy: WideVecTy, OpIdx: 1, ExtOpcode: TargetOpcode::G_FPEXT);
2976 widenScalarDst(MI, WideTy, OpIdx: 0, TruncOpcode: TargetOpcode::G_FPTRUNC);
2977 Observer.changedInstr(MI);
2978 return Legalized;
2979 }
2980}
2981
2982static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2983 MachineIRBuilder &B, Register Src, LLT Ty) {
2984 auto Unmerge = B.buildUnmerge(Res: Ty, Op: Src);
2985 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2986 Pieces.push_back(Elt: Unmerge.getReg(Idx: I));
2987}
2988
2989LegalizerHelper::LegalizeResult
2990LegalizerHelper::lowerFConstant(MachineInstr &MI) {
2991 Register Dst = MI.getOperand(i: 0).getReg();
2992
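  // Materialize the FP immediate through the constant pool: create (or reuse)
  // a pool entry for it and load the value from that entry.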
2993 MachineFunction &MF = MIRBuilder.getMF();
2994 const DataLayout &DL = MIRBuilder.getDataLayout();
2995
2996 unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
2997 LLT AddrPtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
2998 Align Alignment = Align(DL.getABITypeAlign(
2999 Ty: getFloatTypeForLLT(Ctx&: MF.getFunction().getContext(), Ty: MRI.getType(Reg: Dst))));
3000
3001 auto Addr = MIRBuilder.buildConstantPool(
3002 Res: AddrPtrTy, Idx: MF.getConstantPool()->getConstantPoolIndex(
3003 C: MI.getOperand(i: 1).getFPImm(), Alignment));
3004
3005 MachineMemOperand *MMO = MF.getMachineMemOperand(
3006 PtrInfo: MachinePointerInfo::getConstantPool(MF), f: MachineMemOperand::MOLoad,
3007 MemTy: MRI.getType(Reg: Dst), base_alignment: Alignment);
3008
3009 MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_LOAD, Res: Dst, Addr, MMO&: *MMO);
3010 MI.eraseFromParent();
3011
3012 return Legalized;
3013}
3014
3015LegalizerHelper::LegalizeResult
3016LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3017 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3018 if (SrcTy.isVector()) {
3019 LLT SrcEltTy = SrcTy.getElementType();
3020 SmallVector<Register, 8> SrcRegs;
3021
3022 if (DstTy.isVector()) {
3023 int NumDstElt = DstTy.getNumElements();
3024 int NumSrcElt = SrcTy.getNumElements();
3025
3026 LLT DstEltTy = DstTy.getElementType();
3027 LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3028 LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3029
3030 // If there's an element size mismatch, insert intermediate casts to match
3031 // the result element type.
3032 if (NumSrcElt < NumDstElt) { // Source element type is larger.
3033 // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3034 //
3035 // =>
3036 //
3037 // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3038 // %4:_(<2 x s8>) = G_BITCAST %2
3039 // %5:_(<2 x s8>) = G_BITCAST %3
3040 // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
3041 DstCastTy = LLT::fixed_vector(NumElements: NumDstElt / NumSrcElt, ScalarTy: DstEltTy);
3042 SrcPartTy = SrcEltTy;
3043 } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3044 //
3045 // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3046 //
3047 // =>
3048 //
3049 // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3050 // %4:_(s16) = G_BITCAST %2
3051 // %5:_(s16) = G_BITCAST %3
3052 // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
3053 SrcPartTy = LLT::fixed_vector(NumElements: NumSrcElt / NumDstElt, ScalarTy: SrcEltTy);
3054 DstCastTy = DstEltTy;
3055 }
3056
3057 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcPartTy);
3058 for (Register &SrcReg : SrcRegs)
3059 SrcReg = MIRBuilder.buildBitcast(Dst: DstCastTy, Src: SrcReg).getReg(Idx: 0);
3060 } else
3061 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: SrcEltTy);
3062
3063 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3064 MI.eraseFromParent();
3065 return Legalized;
3066 }
3067
3068 if (DstTy.isVector()) {
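    // Scalar source, vector destination: unmerge the source into
    // destination-element sized pieces and build the result vector from them.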
3069 SmallVector<Register, 8> SrcRegs;
3070 getUnmergePieces(Pieces&: SrcRegs, B&: MIRBuilder, Src, Ty: DstTy.getElementType());
3071 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: SrcRegs);
3072 MI.eraseFromParent();
3073 return Legalized;
3074 }
3075
3076 return UnableToLegalize;
3077}
3078
3079/// Figure out the bit offset into a register when coercing a vector index for
3080 /// the wide element type. This is only for the case when promoting a vector
3081 /// to one with larger elements.
3083///
3084/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3085/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
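/// e.g. for s32 elements viewed as s64 elements (ratio 2), index 5 lands in
/// wide element 2 and %offset_bits = (5 & 1) << Log2(32) = 32.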
3086static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3087 Register Idx,
3088 unsigned NewEltSize,
3089 unsigned OldEltSize) {
3090 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3091 LLT IdxTy = B.getMRI()->getType(Reg: Idx);
3092
3093 // Now figure out the amount we need to shift to get the target bits.
3094 auto OffsetMask = B.buildConstant(
3095 Res: IdxTy, Val: ~(APInt::getAllOnes(numBits: IdxTy.getSizeInBits()) << Log2EltRatio));
3096 auto OffsetIdx = B.buildAnd(Dst: IdxTy, Src0: Idx, Src1: OffsetMask);
3097 return B.buildShl(Dst: IdxTy, Src0: OffsetIdx,
3098 Src1: B.buildConstant(Res: IdxTy, Val: Log2_32(Value: OldEltSize))).getReg(Idx: 0);
3099}
3100
3101/// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3102/// is casting to a vector with a smaller element size, perform multiple element
3103/// extracts and merge the results. If this is coercing to a vector with larger
3104/// elements, index the bitcasted vector and extract the target element with bit
3105/// operations. This is intended to force the indexing in the native register
3106/// size for architectures that can dynamically index the register file.
3107LegalizerHelper::LegalizeResult
3108LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3109 LLT CastTy) {
3110 if (TypeIdx != 1)
3111 return UnableToLegalize;
3112
3113 auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3114
3115 LLT SrcEltTy = SrcVecTy.getElementType();
3116 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3117 unsigned OldNumElts = SrcVecTy.getNumElements();
3118
3119 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3120 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3121
3122 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3123 const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3124 if (NewNumElts > OldNumElts) {
3125 // Decreasing the vector element size
3126 //
3127 // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3128 // =>
3129 // v4i32:castx = bitcast x:v2i64
3130 //
3131 // i64 = bitcast
3132 // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3133 // (i32 (extract_vector_elt castx, (2 * y + 1))))
3134 //
3135 if (NewNumElts % OldNumElts != 0)
3136 return UnableToLegalize;
3137
3138 // Type of the intermediate result vector.
3139 const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3140 LLT MidTy =
3141 LLT::scalarOrVector(EC: ElementCount::getFixed(MinVal: NewEltsPerOldElt), ScalarTy: NewEltTy);
3142
3143 auto NewEltsPerOldEltK = MIRBuilder.buildConstant(Res: IdxTy, Val: NewEltsPerOldElt);
3144
3145 SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3146 auto NewBaseIdx = MIRBuilder.buildMul(Dst: IdxTy, Src0: Idx, Src1: NewEltsPerOldEltK);
3147
3148 for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3149 auto IdxOffset = MIRBuilder.buildConstant(Res: IdxTy, Val: I);
3150 auto TmpIdx = MIRBuilder.buildAdd(Dst: IdxTy, Src0: NewBaseIdx, Src1: IdxOffset);
3151 auto Elt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec, Idx: TmpIdx);
3152 NewOps[I] = Elt.getReg(Idx: 0);
3153 }
3154
3155 auto NewVec = MIRBuilder.buildBuildVector(Res: MidTy, Ops: NewOps);
3156 MIRBuilder.buildBitcast(Dst, Src: NewVec);
3157 MI.eraseFromParent();
3158 return Legalized;
3159 }
3160
3161 if (NewNumElts < OldNumElts) {
3162 if (NewEltSize % OldEltSize != 0)
3163 return UnableToLegalize;
3164
3165 // This only depends on powers of 2 because we use bit tricks to figure out
3166 // the bit offset we need to shift to get the target element. A general
3167 // expansion could emit division/multiply.
3168 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3169 return UnableToLegalize;
3170
3171 // Increasing the vector element size.
3172 // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3173 //
3174 // =>
3175 //
3176 // %cast = G_BITCAST %vec
3177 // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3178 // %wide_elt = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3179 // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3180 // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3181 // %elt_bits = G_LSHR %wide_elt, %offset_bits
3182 // %elt = G_TRUNC %elt_bits
3183
3184 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3185 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3186
3187 // Divide to get the index in the wider element type.
3188 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3189
3190 Register WideElt = CastVec;
3191 if (CastTy.isVector()) {
3192 WideElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3193 Idx: ScaledIdx).getReg(Idx: 0);
3194 }
3195
3196 // Compute the bit offset into the register of the target element.
3197 Register OffsetBits = getBitcastWiderVectorElementOffset(
3198 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3199
3200 // Shift the wide element to get the target element.
3201 auto ExtractedBits = MIRBuilder.buildLShr(Dst: NewEltTy, Src0: WideElt, Src1: OffsetBits);
3202 MIRBuilder.buildTrunc(Res: Dst, Op: ExtractedBits);
3203 MI.eraseFromParent();
3204 return Legalized;
3205 }
3206
3207 return UnableToLegalize;
3208}
3209
3210 /// Emit code to insert \p InsertReg into \p TargetReg at bit offset \p
3211 /// OffsetBits, while preserving the other bits in \p TargetReg.
3212///
3213 /// (InsertReg << Offset) | (TargetReg & ~(~(-1 << InsertReg.size()) << Offset))
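/// e.g. inserting an s8 value at bit offset 8 of an s32 target yields
/// (Target & ~0x0000FF00) | (ZExt(Insert) << 8).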
3214static Register buildBitFieldInsert(MachineIRBuilder &B,
3215 Register TargetReg, Register InsertReg,
3216 Register OffsetBits) {
3217 LLT TargetTy = B.getMRI()->getType(Reg: TargetReg);
3218 LLT InsertTy = B.getMRI()->getType(Reg: InsertReg);
3219 auto ZextVal = B.buildZExt(Res: TargetTy, Op: InsertReg);
3220 auto ShiftedInsertVal = B.buildShl(Dst: TargetTy, Src0: ZextVal, Src1: OffsetBits);
3221
3222 // Produce a bitmask of the value to insert
3223 auto EltMask = B.buildConstant(
3224 Res: TargetTy, Val: APInt::getLowBitsSet(numBits: TargetTy.getSizeInBits(),
3225 loBitsSet: InsertTy.getSizeInBits()));
3226 // Shift it into position
3227 auto ShiftedMask = B.buildShl(Dst: TargetTy, Src0: EltMask, Src1: OffsetBits);
3228 auto InvShiftedMask = B.buildNot(Dst: TargetTy, Src0: ShiftedMask);
3229
3230 // Clear out the bits in the wide element
3231 auto MaskedOldElt = B.buildAnd(Dst: TargetTy, Src0: TargetReg, Src1: InvShiftedMask);
3232
3233 // The value to insert has all zeros already, so stick it into the masked
3234 // wide element.
3235 return B.buildOr(Dst: TargetTy, Src0: MaskedOldElt, Src1: ShiftedInsertVal).getReg(Idx: 0);
3236}
3237
3238/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3239/// is increasing the element size, perform the indexing in the target element
3240/// type, and use bit operations to insert at the element position. This is
3241/// intended for architectures that can dynamically index the register file and
3242/// want to force indexing in the native register size.
3243LegalizerHelper::LegalizeResult
3244LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3245 LLT CastTy) {
3246 if (TypeIdx != 0)
3247 return UnableToLegalize;
3248
3249 auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3250 MI.getFirst4RegLLTs();
3251 LLT VecTy = DstTy;
3252
3253 LLT VecEltTy = VecTy.getElementType();
3254 LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3255 const unsigned NewEltSize = NewEltTy.getSizeInBits();
3256 const unsigned OldEltSize = VecEltTy.getSizeInBits();
3257
3258 unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3259 unsigned OldNumElts = VecTy.getNumElements();
3260
3261 Register CastVec = MIRBuilder.buildBitcast(Dst: CastTy, Src: SrcVec).getReg(Idx: 0);
3262 if (NewNumElts < OldNumElts) {
3263 if (NewEltSize % OldEltSize != 0)
3264 return UnableToLegalize;
3265
3266 // This only depends on powers of 2 because we use bit tricks to figure out
3267 // the bit offset we need to shift to get the target element. A general
3268 // expansion could emit division/multiply.
3269 if (!isPowerOf2_32(Value: NewEltSize / OldEltSize))
3270 return UnableToLegalize;
3271
3272 const unsigned Log2EltRatio = Log2_32(Value: NewEltSize / OldEltSize);
3273 auto Log2Ratio = MIRBuilder.buildConstant(Res: IdxTy, Val: Log2EltRatio);
3274
3275 // Divide to get the index in the wider element type.
3276 auto ScaledIdx = MIRBuilder.buildLShr(Dst: IdxTy, Src0: Idx, Src1: Log2Ratio);
3277
3278 Register ExtractedElt = CastVec;
3279 if (CastTy.isVector()) {
3280 ExtractedElt = MIRBuilder.buildExtractVectorElement(Res: NewEltTy, Val: CastVec,
3281 Idx: ScaledIdx).getReg(Idx: 0);
3282 }
3283
3284 // Compute the bit offset into the register of the target element.
3285 Register OffsetBits = getBitcastWiderVectorElementOffset(
3286 B&: MIRBuilder, Idx, NewEltSize, OldEltSize);
3287
3288 Register InsertedElt = buildBitFieldInsert(B&: MIRBuilder, TargetReg: ExtractedElt,
3289 InsertReg: Val, OffsetBits);
3290 if (CastTy.isVector()) {
3291 InsertedElt = MIRBuilder.buildInsertVectorElement(
3292 Res: CastTy, Val: CastVec, Elt: InsertedElt, Idx: ScaledIdx).getReg(Idx: 0);
3293 }
3294
3295 MIRBuilder.buildBitcast(Dst, Src: InsertedElt);
3296 MI.eraseFromParent();
3297 return Legalized;
3298 }
3299
3300 return UnableToLegalize;
3301}
3302
3303LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
3304 // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
3305 Register DstReg = LoadMI.getDstReg();
3306 Register PtrReg = LoadMI.getPointerReg();
3307 LLT DstTy = MRI.getType(Reg: DstReg);
3308 MachineMemOperand &MMO = LoadMI.getMMO();
3309 LLT MemTy = MMO.getMemoryType();
3310 MachineFunction &MF = MIRBuilder.getMF();
3311
3312 unsigned MemSizeInBits = MemTy.getSizeInBits();
3313 unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
3314
3315 if (MemSizeInBits != MemStoreSizeInBits) {
3316 if (MemTy.isVector())
3317 return UnableToLegalize;
3318
3319 // Promote to a byte-sized load if not loading an integral number of
3320 // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
3321 LLT WideMemTy = LLT::scalar(SizeInBits: MemStoreSizeInBits);
3322 MachineMemOperand *NewMMO =
3323 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideMemTy);
3324
3325 Register LoadReg = DstReg;
3326 LLT LoadTy = DstTy;
3327
3328 // If this wasn't already an extending load, we need to widen the result
3329 // register to avoid creating a load with a narrower result than the source.
3330 if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
3331 LoadTy = WideMemTy;
3332 LoadReg = MRI.createGenericVirtualRegister(Ty: WideMemTy);
3333 }
3334
3335 if (isa<GSExtLoad>(Val: LoadMI)) {
3336 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
3337 MIRBuilder.buildSExtInReg(Res: LoadReg, Op: NewLoad, ImmOp: MemSizeInBits);
3338 } else if (isa<GZExtLoad>(Val: LoadMI) || WideMemTy == LoadTy) {
3339 auto NewLoad = MIRBuilder.buildLoad(Res: LoadTy, Addr: PtrReg, MMO&: *NewMMO);
3340 // The extra bits are guaranteed to be zero, since we stored them that
3341 // way. A zext load from Wide thus automatically gives zext from MemVT.
3342 MIRBuilder.buildAssertZExt(Res: LoadReg, Op: NewLoad, Size: MemSizeInBits);
3343 } else {
3344 MIRBuilder.buildLoad(Res: LoadReg, Addr: PtrReg, MMO&: *NewMMO);
3345 }
3346
3347 if (DstTy != LoadTy)
3348 MIRBuilder.buildTrunc(Res: DstReg, Op: LoadReg);
3349
3350 LoadMI.eraseFromParent();
3351 return Legalized;
3352 }
3353
3354 // Big endian lowering not implemented.
3355 if (MIRBuilder.getDataLayout().isBigEndian())
3356 return UnableToLegalize;
3357
3358 // This load needs splitting into power of 2 sized loads.
3359 //
3360 // Our strategy here is to generate anyextending loads for the smaller
3361 // types up to next power-2 result type, and then combine the two larger
3362 // result values together, before truncating back down to the non-pow-2
3363 // type.
3364 // E.g. v1 = i24 load =>
3365 // v2 = i32 zextload (2 byte)
3366 // v3 = i32 load (1 byte)
3367 // v4 = i32 shl v3, 16
3368 // v5 = i32 or v4, v2
3369 // v1 = i24 trunc v5
3370 // By doing this we generate the correct truncate which should get
3371 // combined away as an artifact with a matching extend.
3372
3373 uint64_t LargeSplitSize, SmallSplitSize;
3374
3375 if (!isPowerOf2_32(Value: MemSizeInBits)) {
3376 // This load needs splitting into power of 2 sized loads.
3377 LargeSplitSize = llvm::bit_floor(Value: MemSizeInBits);
3378 SmallSplitSize = MemSizeInBits - LargeSplitSize;
3379 } else {
3380 // This is already a power of 2, but we still need to split this in half.
3381 //
3382 // Assume we're being asked to decompose an unaligned load.
3383 // TODO: If this requires multiple splits, handle them all at once.
3384 auto &Ctx = MF.getFunction().getContext();
3385 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
3386 return UnableToLegalize;
3387
3388 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3389 }
3390
3391 if (MemTy.isVector()) {
3392 // TODO: Handle vector extloads
3393 if (MemTy != DstTy)
3394 return UnableToLegalize;
3395
3396 // TODO: We can do better than scalarizing the vector and at least split it
3397 // in half.
3398 return reduceLoadStoreWidth(MI&: LoadMI, TypeIdx: 0, NarrowTy: DstTy.getElementType());
3399 }
3400
3401 MachineMemOperand *LargeMMO =
3402 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
3403 MachineMemOperand *SmallMMO =
3404 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
3405
3406 LLT PtrTy = MRI.getType(Reg: PtrReg);
3407 unsigned AnyExtSize = PowerOf2Ceil(A: DstTy.getSizeInBits());
3408 LLT AnyExtTy = LLT::scalar(SizeInBits: AnyExtSize);
3409 auto LargeLoad = MIRBuilder.buildLoadInstr(Opcode: TargetOpcode::G_ZEXTLOAD, Res: AnyExtTy,
3410 Addr: PtrReg, MMO&: *LargeMMO);
3411
3412 auto OffsetCst = MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()),
3413 Val: LargeSplitSize / 8);
3414 Register PtrAddReg = MRI.createGenericVirtualRegister(Ty: PtrTy);
3415 auto SmallPtr = MIRBuilder.buildPtrAdd(Res: PtrAddReg, Op0: PtrReg, Op1: OffsetCst);
3416 auto SmallLoad = MIRBuilder.buildLoadInstr(Opcode: LoadMI.getOpcode(), Res: AnyExtTy,
3417 Addr: SmallPtr, MMO&: *SmallMMO);
3418
3419 auto ShiftAmt = MIRBuilder.buildConstant(Res: AnyExtTy, Val: LargeSplitSize);
3420 auto Shift = MIRBuilder.buildShl(Dst: AnyExtTy, Src0: SmallLoad, Src1: ShiftAmt);
3421
3422 if (AnyExtTy == DstTy)
3423 MIRBuilder.buildOr(Dst: DstReg, Src0: Shift, Src1: LargeLoad);
3424 else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3425 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
3426 MIRBuilder.buildTrunc(Res: DstReg, Op: {Or});
3427 } else {
3428 assert(DstTy.isPointer() && "expected pointer");
3429 auto Or = MIRBuilder.buildOr(Dst: AnyExtTy, Src0: Shift, Src1: LargeLoad);
3430
3431 // FIXME: We currently consider this to be illegal for non-integral address
3432 // spaces, but we still need a way to reinterpret the bits.
3433 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
3434 }
3435
3436 LoadMI.eraseFromParent();
3437 return Legalized;
3438}
3439
3440LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3441 // Lower a non-power of 2 store into multiple pow-2 stores.
3442 // E.g. split an i24 store into an i16 store + i8 store.
3443 // We do this by first extending the stored value to the next largest power
3444 // of 2 type, and then using truncating stores to store the components.
3445 // As with G_LOAD, this generates an extend that can be artifact-combined
3446 // away instead of leaving behind extracts.
3447 Register SrcReg = StoreMI.getValueReg();
3448 Register PtrReg = StoreMI.getPointerReg();
3449 LLT SrcTy = MRI.getType(Reg: SrcReg);
3450 MachineFunction &MF = MIRBuilder.getMF();
3451 MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3452 LLT MemTy = MMO.getMemoryType();
3453
3454 unsigned StoreWidth = MemTy.getSizeInBits();
3455 unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3456
3457 if (StoreWidth != StoreSizeInBits) {
3458 if (SrcTy.isVector())
3459 return UnableToLegalize;
3460
3461 // Promote to a byte-sized store with upper bits zero if not
3462 // storing an integral number of bytes. For example, promote
3463 // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3464 LLT WideTy = LLT::scalar(SizeInBits: StoreSizeInBits);
3465
3466 if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3467 // Avoid creating a store with a narrower source than result.
3468 SrcReg = MIRBuilder.buildAnyExt(Res: WideTy, Op: SrcReg).getReg(Idx: 0);
3469 SrcTy = WideTy;
3470 }
3471
3472 auto ZextInReg = MIRBuilder.buildZExtInReg(Res: SrcTy, Op: SrcReg, ImmOp: StoreWidth);
3473
3474 MachineMemOperand *NewMMO =
3475 MF.getMachineMemOperand(MMO: &MMO, PtrInfo: MMO.getPointerInfo(), Ty: WideTy);
3476 MIRBuilder.buildStore(Val: ZextInReg, Addr: PtrReg, MMO&: *NewMMO);
3477 StoreMI.eraseFromParent();
3478 return Legalized;
3479 }
3480
3481 if (MemTy.isVector()) {
3482 // TODO: Handle vector trunc stores
3483 if (MemTy != SrcTy)
3484 return UnableToLegalize;
3485
3486 // TODO: We can do better than scalarizing the vector and at least split it
3487 // in half.
3488 return reduceLoadStoreWidth(MI&: StoreMI, TypeIdx: 0, NarrowTy: SrcTy.getElementType());
3489 }
3490
3491 unsigned MemSizeInBits = MemTy.getSizeInBits();
3492 uint64_t LargeSplitSize, SmallSplitSize;
3493
3494 if (!isPowerOf2_32(Value: MemSizeInBits)) {
3495 LargeSplitSize = llvm::bit_floor<uint64_t>(Value: MemTy.getSizeInBits());
3496 SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3497 } else {
3498 auto &Ctx = MF.getFunction().getContext();
3499 if (TLI.allowsMemoryAccess(Context&: Ctx, DL: MIRBuilder.getDataLayout(), Ty: MemTy, MMO))
3500 return UnableToLegalize; // Don't know what we're being asked to do.
3501
3502 SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3503 }
3504
3505 // Extend to the next pow-2. If this store was itself the result of lowering,
3506 // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3507 // that's wider than the stored size.
3508 unsigned AnyExtSize = PowerOf2Ceil(A: MemTy.getSizeInBits());
3509 const LLT NewSrcTy = LLT::scalar(SizeInBits: AnyExtSize);
3510
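  // Pointer values can't be shifted directly; reinterpret them as same-sized
  // integers before splitting.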
3511 if (SrcTy.isPointer()) {
3512 const LLT IntPtrTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
3513 SrcReg = MIRBuilder.buildPtrToInt(Dst: IntPtrTy, Src: SrcReg).getReg(Idx: 0);
3514 }
3515
3516 auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(Res: NewSrcTy, Op: SrcReg);
3517
3518 // Obtain the smaller value by shifting away the larger value.
3519 auto ShiftAmt = MIRBuilder.buildConstant(Res: NewSrcTy, Val: LargeSplitSize);
3520 auto SmallVal = MIRBuilder.buildLShr(Dst: NewSrcTy, Src0: ExtVal, Src1: ShiftAmt);
3521
3522 // Generate the PtrAdd and truncating stores.
3523 LLT PtrTy = MRI.getType(Reg: PtrReg);
3524 auto OffsetCst = MIRBuilder.buildConstant(
3525 Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: LargeSplitSize / 8);
3526 auto SmallPtr =
3527 MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: PtrReg, Op1: OffsetCst);
3528
3529 MachineMemOperand *LargeMMO =
3530 MF.getMachineMemOperand(MMO: &MMO, Offset: 0, Size: LargeSplitSize / 8);
3531 MachineMemOperand *SmallMMO =
3532 MF.getMachineMemOperand(MMO: &MMO, Offset: LargeSplitSize / 8, Size: SmallSplitSize / 8);
3533 MIRBuilder.buildStore(Val: ExtVal, Addr: PtrReg, MMO&: *LargeMMO);
3534 MIRBuilder.buildStore(Val: SmallVal, Addr: SmallPtr, MMO&: *SmallMMO);
3535 StoreMI.eraseFromParent();
3536 return Legalized;
3537}
3538
3539LegalizerHelper::LegalizeResult
3540LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3541 switch (MI.getOpcode()) {
3542 case TargetOpcode::G_LOAD: {
3543 if (TypeIdx != 0)
3544 return UnableToLegalize;
3545 MachineMemOperand &MMO = **MI.memoperands_begin();
3546
3547 // Not sure how to interpret a bitcast of an extending load.
3548 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3549 return UnableToLegalize;
3550
3551 Observer.changingInstr(MI);
3552 bitcastDst(MI, CastTy, OpIdx: 0);
3553 MMO.setType(CastTy);
3554 Observer.changedInstr(MI);
3555 return Legalized;
3556 }
3557 case TargetOpcode::G_STORE: {
3558 if (TypeIdx != 0)
3559 return UnableToLegalize;
3560
3561 MachineMemOperand &MMO = **MI.memoperands_begin();
3562
3563 // Not sure how to interpret a bitcast of a truncating store.
3564 if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3565 return UnableToLegalize;
3566
3567 Observer.changingInstr(MI);
3568 bitcastSrc(MI, CastTy, OpIdx: 0);
3569 MMO.setType(CastTy);
3570 Observer.changedInstr(MI);
3571 return Legalized;
3572 }
3573 case TargetOpcode::G_SELECT: {
3574 if (TypeIdx != 0)
3575 return UnableToLegalize;
3576
3577 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector()) {
3578 LLVM_DEBUG(
3579 dbgs() << "bitcast action not implemented for vector select\n");
3580 return UnableToLegalize;
3581 }
3582
3583 Observer.changingInstr(MI);
3584 bitcastSrc(MI, CastTy, OpIdx: 2);
3585 bitcastSrc(MI, CastTy, OpIdx: 3);
3586 bitcastDst(MI, CastTy, OpIdx: 0);
3587 Observer.changedInstr(MI);
3588 return Legalized;
3589 }
3590 case TargetOpcode::G_AND:
3591 case TargetOpcode::G_OR:
3592 case TargetOpcode::G_XOR: {
3593 Observer.changingInstr(MI);
3594 bitcastSrc(MI, CastTy, OpIdx: 1);
3595 bitcastSrc(MI, CastTy, OpIdx: 2);
3596 bitcastDst(MI, CastTy, OpIdx: 0);
3597 Observer.changedInstr(MI);
3598 return Legalized;
3599 }
3600 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3601 return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3602 case TargetOpcode::G_INSERT_VECTOR_ELT:
3603 return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3604 default:
3605 return UnableToLegalize;
3606 }
3607}
3608
3609// Legalize an instruction by changing the opcode in place.
3610void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3611 Observer.changingInstr(MI);
3612 MI.setDesc(MIRBuilder.getTII().get(Opcode: NewOpcode));
3613 Observer.changedInstr(MI);
3614}
3615
3616LegalizerHelper::LegalizeResult
3617LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3618 using namespace TargetOpcode;
3619
3620 switch(MI.getOpcode()) {
3621 default:
3622 return UnableToLegalize;
3623 case TargetOpcode::G_FCONSTANT:
3624 return lowerFConstant(MI);
3625 case TargetOpcode::G_BITCAST:
3626 return lowerBitcast(MI);
3627 case TargetOpcode::G_SREM:
3628 case TargetOpcode::G_UREM: {
3629 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
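    // Compute x % y as x - (x / y) * y, reusing the corresponding division.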
3630 auto Quot =
3631 MIRBuilder.buildInstr(Opc: MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, DstOps: {Ty},
3632 SrcOps: {MI.getOperand(i: 1), MI.getOperand(i: 2)});
3633
3634 auto Prod = MIRBuilder.buildMul(Dst: Ty, Src0: Quot, Src1: MI.getOperand(i: 2));
3635 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MI.getOperand(i: 1), Src1: Prod);
3636 MI.eraseFromParent();
3637 return Legalized;
3638 }
3639 case TargetOpcode::G_SADDO:
3640 case TargetOpcode::G_SSUBO:
3641 return lowerSADDO_SSUBO(MI);
3642 case TargetOpcode::G_UMULH:
3643 case TargetOpcode::G_SMULH:
3644 return lowerSMULH_UMULH(MI);
3645 case TargetOpcode::G_SMULO:
3646 case TargetOpcode::G_UMULO: {
3647 // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3648 // result.
3649 auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
3650 LLT Ty = MRI.getType(Reg: Res);
3651
3652 unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3653 ? TargetOpcode::G_SMULH
3654 : TargetOpcode::G_UMULH;
3655
3656 Observer.changingInstr(MI);
3657 const auto &TII = MIRBuilder.getTII();
3658 MI.setDesc(TII.get(Opcode: TargetOpcode::G_MUL));
3659 MI.removeOperand(OpNo: 1);
3660 Observer.changedInstr(MI);
3661
3662 auto HiPart = MIRBuilder.buildInstr(Opc: Opcode, DstOps: {Ty}, SrcOps: {LHS, RHS});
3663 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
3664
3665 // Move insert point forward so we can use the Res register if needed.
3666 MIRBuilder.setInsertPt(MBB&: MIRBuilder.getMBB(), II: ++MIRBuilder.getInsertPt());
3667
3668 // For *signed* multiply, overflow is detected by checking:
3669 // (hi != (lo >> bitwidth-1))
3670 if (Opcode == TargetOpcode::G_SMULH) {
3671 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: Ty.getSizeInBits() - 1);
3672 auto Shifted = MIRBuilder.buildAShr(Dst: Ty, Src0: Res, Src1: ShiftAmt);
3673 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Shifted);
3674 } else {
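      // For *unsigned* multiply, overflow occurred iff the high half is
      // non-zero.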
3675 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: Overflow, Op0: HiPart, Op1: Zero);
3676 }
3677 return Legalized;
3678 }
3679 case TargetOpcode::G_FNEG: {
3680 auto [Res, SubByReg] = MI.getFirst2Regs();
3681 LLT Ty = MRI.getType(Reg: Res);
3682
3683 // TODO: Handle vector types once we are able to
3684 // represent them.
3685 if (Ty.isVector())
3686 return UnableToLegalize;
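    // Negate by flipping only the sign bit: fneg(x) == x ^ SignMask.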
3687 auto SignMask =
3688 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignMask(BitWidth: Ty.getSizeInBits()));
3689 MIRBuilder.buildXor(Dst: Res, Src0: SubByReg, Src1: SignMask);
3690 MI.eraseFromParent();
3691 return Legalized;
3692 }
3693 case TargetOpcode::G_FSUB:
3694 case TargetOpcode::G_STRICT_FSUB: {
3695 auto [Res, LHS, RHS] = MI.getFirst3Regs();
3696 LLT Ty = MRI.getType(Reg: Res);
3697
3698 // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3699 auto Neg = MIRBuilder.buildFNeg(Dst: Ty, Src0: RHS);
3700
3701 if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
3702 MIRBuilder.buildStrictFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
3703 else
3704 MIRBuilder.buildFAdd(Dst: Res, Src0: LHS, Src1: Neg, Flags: MI.getFlags());
3705
3706 MI.eraseFromParent();
3707 return Legalized;
3708 }
3709 case TargetOpcode::G_FMAD:
3710 return lowerFMad(MI);
3711 case TargetOpcode::G_FFLOOR:
3712 return lowerFFloor(MI);
3713 case TargetOpcode::G_INTRINSIC_ROUND:
3714 return lowerIntrinsicRound(MI);
3715 case TargetOpcode::G_FRINT: {
3716 // Since round even is the assumed rounding mode for unconstrained FP
3717 // operations, rint and roundeven are the same operation.
3718 changeOpcode(MI, NewOpcode: TargetOpcode::G_INTRINSIC_ROUNDEVEN);
3719 return Legalized;
3720 }
3721 case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
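    // Lower to a plain G_ATOMIC_CMPXCHG plus a compare of the loaded value
    // against the expected value to compute the success flag.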
3722 auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
3723 MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3724 MMO&: **MI.memoperands_begin());
3725 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: SuccessRes, Op0: OldValRes, Op1: CmpVal);
3726 MI.eraseFromParent();
3727 return Legalized;
3728 }
3729 case TargetOpcode::G_LOAD:
3730 case TargetOpcode::G_SEXTLOAD:
3731 case TargetOpcode::G_ZEXTLOAD:
3732 return lowerLoad(LoadMI&: cast<GAnyLoad>(Val&: MI));
3733 case TargetOpcode::G_STORE:
3734 return lowerStore(StoreMI&: cast<GStore>(Val&: MI));
3735 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3736 case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3737 case TargetOpcode::G_CTLZ:
3738 case TargetOpcode::G_CTTZ:
3739 case TargetOpcode::G_CTPOP:
3740 return lowerBitCount(MI);
3741 case G_UADDO: {
3742 auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
3743
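    // The unsigned add overflowed iff the result is (unsigned) less than one
    // of the operands.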
3744 MIRBuilder.buildAdd(Dst: Res, Src0: LHS, Src1: RHS);
3745 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CarryOut, Op0: Res, Op1: RHS);
3746
3747 MI.eraseFromParent();
3748 return Legalized;
3749 }
3750 case G_UADDE: {
3751 auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
3752 const LLT CondTy = MRI.getType(Reg: CarryOut);
3753 const LLT Ty = MRI.getType(Reg: Res);
3754
3755 // Initial add of the two operands.
3756 auto TmpRes = MIRBuilder.buildAdd(Dst: Ty, Src0: LHS, Src1: RHS);
3757
3758 // Initial check for carry.
3759 auto Carry = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: CondTy, Op0: TmpRes, Op1: LHS);
3760
3761 // Add the sum and the carry.
3762 auto ZExtCarryIn = MIRBuilder.buildZExt(Res: Ty, Op: CarryIn);
3763 MIRBuilder.buildAdd(Dst: Res, Src0: TmpRes, Src1: ZExtCarryIn);
3764
3765 // Second check for carry. We can only carry if the initial sum is all 1s
3766 // and the carry is set, resulting in a new sum of 0.
3767 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
3768 auto ResEqZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: Res, Op1: Zero);
3769 auto Carry2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: ResEqZero, Src1: CarryIn);
3770 MIRBuilder.buildOr(Dst: CarryOut, Src0: Carry, Src1: Carry2);
3771
3772 MI.eraseFromParent();
3773 return Legalized;
3774 }
3775 case G_USUBO: {
3776 auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
3777
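    // The unsigned subtract borrowed iff LHS is (unsigned) less than RHS.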
3778 MIRBuilder.buildSub(Dst: Res, Src0: LHS, Src1: RHS);
3779 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_ULT, Res: BorrowOut, Op0: LHS, Op1: RHS);
3780
3781 MI.eraseFromParent();
3782 return Legalized;
3783 }
3784 case G_USUBE: {
3785 auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
3786 const LLT CondTy = MRI.getType(Reg: BorrowOut);
3787 const LLT Ty = MRI.getType(Reg: Res);
3788
3789 // Initial subtract of the two operands.
3790 auto TmpRes = MIRBuilder.buildSub(Dst: Ty, Src0: LHS, Src1: RHS);
3791
3792 // Initial check for borrow.
3793 auto Borrow = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: CondTy, Op0: TmpRes, Op1: LHS);
3794
3795 // Subtract the borrow from the first subtract.
3796 auto ZExtBorrowIn = MIRBuilder.buildZExt(Res: Ty, Op: BorrowIn);
3797 MIRBuilder.buildSub(Dst: Res, Src0: TmpRes, Src1: ZExtBorrowIn);
3798
3799 // Second check for borrow. We can only borrow if the initial difference is
3800 // 0 and the borrow is set, resulting in a new difference of all 1s.
3801 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
3802 auto TmpResEqZero =
3803 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: CondTy, Op0: TmpRes, Op1: Zero);
3804 auto Borrow2 = MIRBuilder.buildAnd(Dst: CondTy, Src0: TmpResEqZero, Src1: BorrowIn);
3805 MIRBuilder.buildOr(Dst: BorrowOut, Src0: Borrow, Src1: Borrow2);
3806
3807 MI.eraseFromParent();
3808 return Legalized;
3809 }
3810 case G_UITOFP:
3811 return lowerUITOFP(MI);
3812 case G_SITOFP:
3813 return lowerSITOFP(MI);
3814 case G_FPTOUI:
3815 return lowerFPTOUI(MI);
3816 case G_FPTOSI:
3817 return lowerFPTOSI(MI);
3818 case G_FPTRUNC:
3819 return lowerFPTRUNC(MI);
3820 case G_FPOWI:
3821 return lowerFPOWI(MI);
3822 case G_SMIN:
3823 case G_SMAX:
3824 case G_UMIN:
3825 case G_UMAX:
3826 return lowerMinMax(MI);
3827 case G_FCOPYSIGN:
3828 return lowerFCopySign(MI);
3829 case G_FMINNUM:
3830 case G_FMAXNUM:
3831 return lowerFMinNumMaxNum(MI);
3832 case G_MERGE_VALUES:
3833 return lowerMergeValues(MI);
3834 case G_UNMERGE_VALUES:
3835 return lowerUnmergeValues(MI);
3836 case TargetOpcode::G_SEXT_INREG: {
3837 assert(MI.getOperand(2).isImm() && "Expected immediate");
3838 int64_t SizeInBits = MI.getOperand(i: 2).getImm();
3839
3840 auto [DstReg, SrcReg] = MI.getFirst2Regs();
3841 LLT DstTy = MRI.getType(Reg: DstReg);
3842 Register TmpRes = MRI.createGenericVirtualRegister(Ty: DstTy);
3843
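    // sext_inreg(x, N) == ashr(shl(x, W - N), W - N): shift the field up to
    // the MSB, then arithmetic-shift it back down to replicate the sign bit.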
3844 auto MIBSz = MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - SizeInBits);
3845 MIRBuilder.buildShl(Dst: TmpRes, Src0: SrcReg, Src1: MIBSz->getOperand(i: 0));
3846 MIRBuilder.buildAShr(Dst: DstReg, Src0: TmpRes, Src1: MIBSz->getOperand(i: 0));
3847 MI.eraseFromParent();
3848 return Legalized;
3849 }
3850 case G_EXTRACT_VECTOR_ELT:
3851 case G_INSERT_VECTOR_ELT:
3852 return lowerExtractInsertVectorElt(MI);
3853 case G_SHUFFLE_VECTOR:
3854 return lowerShuffleVector(MI);
3855 case G_DYN_STACKALLOC:
3856 return lowerDynStackAlloc(MI);
3857 case G_STACKSAVE:
3858 return lowerStackSave(MI);
3859 case G_STACKRESTORE:
3860 return lowerStackRestore(MI);
3861 case G_EXTRACT:
3862 return lowerExtract(MI);
3863 case G_INSERT:
3864 return lowerInsert(MI);
3865 case G_BSWAP:
3866 return lowerBswap(MI);
3867 case G_BITREVERSE:
3868 return lowerBitreverse(MI);
3869 case G_READ_REGISTER:
3870 case G_WRITE_REGISTER:
3871 return lowerReadWriteRegister(MI);
3872 case G_UADDSAT:
3873 case G_USUBSAT: {
3874 // Try to make a reasonable guess about which lowering strategy to use. The
3875 // target can override this with custom lowering and calling the
3876 // implementation functions.
3877 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3878 if (LI.isLegalOrCustom(Query: {G_UMIN, Ty}))
3879 return lowerAddSubSatToMinMax(MI);
3880 return lowerAddSubSatToAddoSubo(MI);
3881 }
3882 case G_SADDSAT:
3883 case G_SSUBSAT: {
3884 LLT Ty = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
3885
3886 // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3887 // since it's a shorter expansion. However, we would need to figure out the
3888 // preferred boolean type for the carry out for the query.
3889 if (LI.isLegalOrCustom(Query: {G_SMIN, Ty}) && LI.isLegalOrCustom(Query: {G_SMAX, Ty}))
3890 return lowerAddSubSatToMinMax(MI);
3891 return lowerAddSubSatToAddoSubo(MI);
3892 }
3893 case G_SSHLSAT:
3894 case G_USHLSAT:
3895 return lowerShlSat(MI);
3896 case G_ABS:
3897 return lowerAbsToAddXor(MI);
3898 case G_SELECT:
3899 return lowerSelect(MI);
3900 case G_IS_FPCLASS:
3901 return lowerISFPCLASS(MI);
3902 case G_SDIVREM:
3903 case G_UDIVREM:
3904 return lowerDIVREM(MI);
3905 case G_FSHL:
3906 case G_FSHR:
3907 return lowerFunnelShift(MI);
3908 case G_ROTL:
3909 case G_ROTR:
3910 return lowerRotate(MI);
3911 case G_MEMSET:
3912 case G_MEMCPY:
3913 case G_MEMMOVE:
3914 return lowerMemCpyFamily(MI);
3915 case G_MEMCPY_INLINE:
3916 return lowerMemcpyInline(MI);
3917 case G_ZEXT:
3918 case G_SEXT:
3919 case G_ANYEXT:
3920 return lowerEXT(MI);
3921 case G_TRUNC:
3922 return lowerTRUNC(MI);
3923 GISEL_VECREDUCE_CASES_NONSEQ
3924 return lowerVectorReduction(MI);
3925 case G_VAARG:
3926 return lowerVAArg(MI);
3927 }
3928}
3929
3930Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3931 Align MinAlign) const {
3932 // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3933 // datalayout for the preferred alignment. Also there should be a target hook
3934 // for this to allow targets to reduce the alignment and ignore the
3935 // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3936 // the type.
3937 return std::max(a: Align(PowerOf2Ceil(A: Ty.getSizeInBytes())), b: MinAlign);
3938}
3939
3940MachineInstrBuilder
3941LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3942 MachinePointerInfo &PtrInfo) {
3943 MachineFunction &MF = MIRBuilder.getMF();
3944 const DataLayout &DL = MIRBuilder.getDataLayout();
3945 int FrameIdx = MF.getFrameInfo().CreateStackObject(Size: Bytes, Alignment, isSpillSlot: false);
3946
3947 unsigned AddrSpace = DL.getAllocaAddrSpace();
3948 LLT FramePtrTy = LLT::pointer(AddressSpace: AddrSpace, SizeInBits: DL.getPointerSizeInBits(AS: AddrSpace));
3949
3950 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI: FrameIdx);
3951 return MIRBuilder.buildFrameIndex(Res: FramePtrTy, Idx: FrameIdx);
3952}
3953
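/// Clamp a dynamic vector index to be in range. Constant indices are returned
/// unchanged; otherwise the index is masked when the element count is a power
/// of two, or clamped with G_UMIN.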
3954static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3955 LLT VecTy) {
3956 int64_t IdxVal;
3957 if (mi_match(R: IdxReg, MRI: *B.getMRI(), P: m_ICst(Cst&: IdxVal)))
3958 return IdxReg;
3959
3960 LLT IdxTy = B.getMRI()->getType(Reg: IdxReg);
3961 unsigned NElts = VecTy.getNumElements();
3962 if (isPowerOf2_32(Value: NElts)) {
3963 APInt Imm = APInt::getLowBitsSet(numBits: IdxTy.getSizeInBits(), loBitsSet: Log2_32(Value: NElts));
3964 return B.buildAnd(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: Imm)).getReg(Idx: 0);
3965 }
3966
3967 return B.buildUMin(Dst: IdxTy, Src0: IdxReg, Src1: B.buildConstant(Res: IdxTy, Val: NElts - 1))
3968 .getReg(Idx: 0);
3969}
3970
3971Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3972 Register Index) {
3973 LLT EltTy = VecTy.getElementType();
3974
3975 // Calculate the element offset and add it to the pointer.
3976 unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3977 assert(EltSize * 8 == EltTy.getSizeInBits() &&
3978 "Converting bits to bytes lost precision");
3979
3980 Index = clampDynamicVectorIndex(B&: MIRBuilder, IdxReg: Index, VecTy);
3981
3982 LLT IdxTy = MRI.getType(Reg: Index);
3983 auto Mul = MIRBuilder.buildMul(Dst: IdxTy, Src0: Index,
3984 Src1: MIRBuilder.buildConstant(Res: IdxTy, Val: EltSize));
3985
3986 LLT PtrTy = MRI.getType(Reg: VecPtr);
3987 return MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VecPtr, Op1: Mul).getReg(Idx: 0);
3988}
3989
3990#ifndef NDEBUG
3991 /// Check that all vector operands have the same number of elements. Other
3992 /// operands should be listed in \p NonVecOpIndices.
3993static bool hasSameNumEltsOnAllVectorOperands(
3994 GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3995 std::initializer_list<unsigned> NonVecOpIndices) {
3996 if (MI.getNumMemOperands() != 0)
3997 return false;
3998
3999 LLT VecTy = MRI.getType(Reg: MI.getReg(Idx: 0));
4000 if (!VecTy.isVector())
4001 return false;
4002 unsigned NumElts = VecTy.getNumElements();
4003
4004 for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
4005 MachineOperand &Op = MI.getOperand(i: OpIdx);
4006 if (!Op.isReg()) {
4007 if (!is_contained(Set: NonVecOpIndices, Element: OpIdx))
4008 return false;
4009 continue;
4010 }
4011
4012 LLT Ty = MRI.getType(Reg: Op.getReg());
4013 if (!Ty.isVector()) {
4014 if (!is_contained(Set: NonVecOpIndices, Element: OpIdx))
4015 return false;
4016 continue;
4017 }
4018
4019 if (Ty.getNumElements() != NumElts)
4020 return false;
4021 }
4022
4023 return true;
4024}
4025#endif
4026
4027 /// Fill \p DstOps with DstOps that, combined, cover the same number of
4028 /// elements as \p Ty. These DstOps are either scalars (when \p NumElts == 1) or
4029 /// vectors with \p NumElts elements. When Ty.getNumElements() is not a multiple
4030 /// of \p NumElts, the last DstOp (leftover) has fewer than \p NumElts elements.
4031static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4032 unsigned NumElts) {
4033 LLT LeftoverTy;
4034 assert(Ty.isVector() && "Expected vector type");
4035 LLT EltTy = Ty.getElementType();
4036 LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElements: NumElts, ScalarTy: EltTy);
4037 int NumParts, NumLeftover;
4038 std::tie(args&: NumParts, args&: NumLeftover) =
4039 getNarrowTypeBreakDown(OrigTy: Ty, NarrowTy, LeftoverTy);
4040
4041 assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4042 for (int i = 0; i < NumParts; ++i) {
4043 DstOps.push_back(Elt: NarrowTy);
4044 }
4045
4046 if (LeftoverTy.isValid()) {
4047 assert(NumLeftover == 1 && "expected exactly one leftover");
4048 DstOps.push_back(Elt: LeftoverTy);
4049 }
4050}
4051
4052/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4053/// made from \p Op depending on operand type.
4054static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4055 MachineOperand &Op) {
4056 for (unsigned i = 0; i < N; ++i) {
4057 if (Op.isReg())
4058 Ops.push_back(Elt: Op.getReg());
4059 else if (Op.isImm())
4060 Ops.push_back(Elt: Op.getImm());
4061 else if (Op.isPredicate())
4062 Ops.push_back(Elt: static_cast<CmpInst::Predicate>(Op.getPredicate()));
4063 else
4064 llvm_unreachable("Unsupported type");
4065 }
4066}
4067
4068// Handle splitting vector operations which need to have the same number of
4069// elements in each type index, but each type index may have a different element
4070// type.
4071//
4072// e.g. <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4073// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4074// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4075//
4076// Also handles some irregular breakdown cases, e.g.
4077// e.g. <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4078// <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4079// s64 = G_SHL s64, s32
4080LegalizerHelper::LegalizeResult
4081LegalizerHelper::fewerElementsVectorMultiEltType(
4082 GenericMachineInstr &MI, unsigned NumElts,
4083 std::initializer_list<unsigned> NonVecOpIndices) {
4084 assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
4085 "Non-compatible opcode or not specified non-vector operands");
4086 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
4087
4088 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4089 unsigned NumDefs = MI.getNumDefs();
4090
4091 // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
4092 // Build instructions with DstOps to use instruction found by CSE directly.
4093 // CSE copies found instruction into given vreg when building with vreg dest.
4094 SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
4095 // Output registers will be taken from created instructions.
4096 SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
4097 for (unsigned i = 0; i < NumDefs; ++i) {
4098 makeDstOps(DstOps&: OutputOpsPieces[i], Ty: MRI.getType(Reg: MI.getReg(Idx: i)), NumElts);
4099 }
4100
4101 // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4102 // Operands listed in NonVecOpIndices will be used as is without splitting;
4103 // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4104 // scalar condition (op 1), immediate in sext_inreg (op 2).
4105 SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
4106 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4107 ++UseIdx, ++UseNo) {
4108 if (is_contained(Set: NonVecOpIndices, Element: UseIdx)) {
4109 broadcastSrcOp(Ops&: InputOpsPieces[UseNo], N: OutputOpsPieces[0].size(),
4110 Op&: MI.getOperand(i: UseIdx));
4111 } else {
4112 SmallVector<Register, 8> SplitPieces;
4113 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: SplitPieces, MIRBuilder,
4114 MRI);
4115 for (auto Reg : SplitPieces)
4116 InputOpsPieces[UseNo].push_back(Elt: Reg);
4117 }
4118 }
4119
4120 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4121
4122 // Take i-th piece of each input operand split and build sub-vector/scalar
4123 // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
4124 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4125 SmallVector<DstOp, 2> Defs;
4126 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4127 Defs.push_back(Elt: OutputOpsPieces[DstNo][i]);
4128
4129 SmallVector<SrcOp, 3> Uses;
4130 for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
4131 Uses.push_back(Elt: InputOpsPieces[InputNo][i]);
4132
4133 auto I = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: Defs, SrcOps: Uses, Flags: MI.getFlags());
4134 for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4135 OutputRegs[DstNo].push_back(Elt: I.getReg(Idx: DstNo));
4136 }
4137
4138 // Merge small outputs into MI's output for each def operand.
4139 if (NumLeftovers) {
4140 for (unsigned i = 0; i < NumDefs; ++i)
4141 mergeMixedSubvectors(DstReg: MI.getReg(Idx: i), PartRegs: OutputRegs[i]);
4142 } else {
4143 for (unsigned i = 0; i < NumDefs; ++i)
4144 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: i), Ops: OutputRegs[i]);
4145 }
4146
4147 MI.eraseFromParent();
4148 return Legalized;
4149}
4150
4151LegalizerHelper::LegalizeResult
4152LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
4153 unsigned NumElts) {
4154 unsigned OrigNumElts = MRI.getType(Reg: MI.getReg(Idx: 0)).getNumElements();
4155
4156 unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4157 unsigned NumDefs = MI.getNumDefs();
4158
4159 SmallVector<DstOp, 8> OutputOpsPieces;
4160 SmallVector<Register, 8> OutputRegs;
4161 makeDstOps(DstOps&: OutputOpsPieces, Ty: MRI.getType(Reg: MI.getReg(Idx: 0)), NumElts);
4162
4163 // Instructions that perform the register split will be inserted in the basic
4164 // block where the register is defined (the basic block is the next operand).
4165 SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
4166 for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4167 UseIdx += 2, ++UseNo) {
4168 MachineBasicBlock &OpMBB = *MI.getOperand(i: UseIdx + 1).getMBB();
4169 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminatorForward());
4170 extractVectorParts(Reg: MI.getReg(Idx: UseIdx), NumElts, VRegs&: InputOpsPieces[UseNo],
4171 MIRBuilder, MRI);
4172 }
4173
4174 // Build PHIs with fewer elements.
4175 unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4176 MIRBuilder.setInsertPt(MBB&: *MI.getParent(), II: MI);
4177 for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4178 auto Phi = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_PHI);
4179 Phi.addDef(
4180 RegNo: MRI.createGenericVirtualRegister(Ty: OutputOpsPieces[i].getLLTTy(MRI)));
4181 OutputRegs.push_back(Elt: Phi.getReg(Idx: 0));
4182
4183 for (unsigned j = 0; j < NumInputs / 2; ++j) {
4184 Phi.addUse(RegNo: InputOpsPieces[j][i]);
4185 Phi.add(MO: MI.getOperand(i: 1 + j * 2 + 1));
4186 }
4187 }
4188
4189 // Merge small outputs into MI's def.
4190 if (NumLeftovers) {
4191 mergeMixedSubvectors(DstReg: MI.getReg(Idx: 0), PartRegs: OutputRegs);
4192 } else {
4193 MIRBuilder.buildMergeLikeInstr(Res: MI.getReg(Idx: 0), Ops: OutputRegs);
4194 }
4195
4196 MI.eraseFromParent();
4197 return Legalized;
4198}
4199
4200LegalizerHelper::LegalizeResult
4201LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
4202 unsigned TypeIdx,
4203 LLT NarrowTy) {
4204 const int NumDst = MI.getNumOperands() - 1;
4205 const Register SrcReg = MI.getOperand(i: NumDst).getReg();
4206 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
4207 LLT SrcTy = MRI.getType(Reg: SrcReg);
4208
4209 if (TypeIdx != 1 || NarrowTy == DstTy)
4210 return UnableToLegalize;
4211
4212 // Requires compatible types. Otherwise SrcReg should have been defined by a
4213 // merge-like instruction that would get artifact-combined. Most likely the
4214 // instruction that defines SrcReg has to perform a more/fewer-elements
4215 // legalization compatible with NarrowTy.
4216 assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4217 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4218
4219 if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4220 (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
4221 return UnableToLegalize;
4222
4223 // This is most likely DstTy (smaller than the register size) packed in SrcTy
4224 // (larger than the register size), and since the unmerge was not combined it
4225 // will be lowered to bit-sequence extracts from a register. Unpack SrcTy into
4226 // NarrowTy (register size) pieces first, then unpack each NarrowTy piece to DstTy.
4227
4228 // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
4229 //
4230 // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
4231 // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
4232 // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
4233 auto Unmerge = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: SrcReg);
4234 const int NumUnmerge = Unmerge->getNumOperands() - 1;
4235 const int PartsPerUnmerge = NumDst / NumUnmerge;
4236
4237 for (int I = 0; I != NumUnmerge; ++I) {
4238 auto MIB = MIRBuilder.buildInstr(Opcode: TargetOpcode::G_UNMERGE_VALUES);
4239
4240 for (int J = 0; J != PartsPerUnmerge; ++J)
4241 MIB.addDef(RegNo: MI.getOperand(i: I * PartsPerUnmerge + J).getReg());
4242 MIB.addUse(RegNo: Unmerge.getReg(Idx: I));
4243 }
4244
4245 MI.eraseFromParent();
4246 return Legalized;
4247}
4248
4249LegalizerHelper::LegalizeResult
4250LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4251 LLT NarrowTy) {
4252 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
4253 // Requires compatible types. Otherwise the user of DstReg did not perform an
4254 // unmerge that should have been artifact-combined. Most likely the instruction
4255 // that uses DstReg has to do more/fewer-elements legalization compatible with NarrowTy.
4256 assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4257 assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4258 if (NarrowTy == SrcTy)
4259 return UnableToLegalize;
4260
4261 // This attempts to lower part of an LCMTy merge/unmerge sequence. It is
4262 // intended for old MIR tests. Since the more/fewer-elements changes, it should
4263 // no longer be possible to generate MIR like this when starting from LLVM IR,
4264 // because the LCMTy approach was replaced with merge/unmerge to vector elements.
4265 if (TypeIdx == 1) {
4266 assert(SrcTy.isVector() && "Expected vector types");
4267 assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4268 if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4269 (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
4270 return UnableToLegalize;
4271 // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
4272 //
4273 // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
4274 // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
4275 // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
4276 // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
4277 // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
4278 // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
4279
4280 SmallVector<Register, 8> Elts;
4281 LLT EltTy = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getScalarType();
4282 for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
4283 auto Unmerge = MIRBuilder.buildUnmerge(Res: EltTy, Op: MI.getOperand(i).getReg());
4284 for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
4285 Elts.push_back(Elt: Unmerge.getReg(Idx: j));
4286 }
4287
4288 SmallVector<Register, 8> NarrowTyElts;
4289 unsigned NumNarrowTyElts = NarrowTy.getNumElements();
4290 unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
4291 for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
4292 ++i, Offset += NumNarrowTyElts) {
4293 ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
4294 NarrowTyElts.push_back(
4295 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Pieces).getReg(Idx: 0));
4296 }
4297
4298 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
4299 MI.eraseFromParent();
4300 return Legalized;
4301 }
4302
4303 assert(TypeIdx == 0 && "Bad type index");
4304 if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
4305 (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
4306 return UnableToLegalize;
4307
4308 // This is most likely SrcTy (smaller than the register size) packed in DstTy
4309 // (larger than the register size), and since the merge was not combined it
4310 // will be lowered to bit-sequence packing into a register. Merge SrcTy into
4311 // NarrowTy (register size) pieces first, then merge each NarrowTy piece into DstTy.
4312
4313 // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
4314 //
4315 // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
4316 // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
4317 // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
4318 SmallVector<Register, 8> NarrowTyElts;
4319 unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
4320 unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
4321 unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
4322 for (unsigned i = 0; i < NumParts; ++i) {
4323 SmallVector<Register, 8> Sources;
4324 for (unsigned j = 0; j < NumElts; ++j)
4325 Sources.push_back(Elt: MI.getOperand(i: 1 + i * NumElts + j).getReg());
4326 NarrowTyElts.push_back(
4327 Elt: MIRBuilder.buildMergeLikeInstr(Res: NarrowTy, Ops: Sources).getReg(Idx: 0));
4328 }
4329
4330 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: NarrowTyElts);
4331 MI.eraseFromParent();
4332 return Legalized;
4333}
4334
4335LegalizerHelper::LegalizeResult
4336LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4337 unsigned TypeIdx,
4338 LLT NarrowVecTy) {
4339 auto [DstReg, SrcVec] = MI.getFirst2Regs();
4340 Register InsertVal;
4341 bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4342
4343 assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4344 if (IsInsert)
4345 InsertVal = MI.getOperand(i: 2).getReg();
4346
4347 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
4348
4349 // TODO: Handle total scalarization case.
4350 if (!NarrowVecTy.isVector())
4351 return UnableToLegalize;
4352
4353 LLT VecTy = MRI.getType(Reg: SrcVec);
4354
4355 // If the index is a constant, we can really break this down as you would
4356 // expect, and index into the target size pieces.
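  // For example, extracting element 5 from an <8 x s16> source with
  // NarrowVecTy <4 x s16>: the source is split into two <4 x s16> pieces and
  // the extract reads element 5 - 4 = 1 of the second piece.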
4357 int64_t IdxVal;
4358 auto MaybeCst = getIConstantVRegValWithLookThrough(VReg: Idx, MRI);
4359 if (MaybeCst) {
4360 IdxVal = MaybeCst->Value.getSExtValue();
4361 // Avoid out of bounds indexing the pieces.
4362 if (IdxVal >= VecTy.getNumElements()) {
4363 MIRBuilder.buildUndef(Res: DstReg);
4364 MI.eraseFromParent();
4365 return Legalized;
4366 }
4367
4368 SmallVector<Register, 8> VecParts;
4369 LLT GCDTy = extractGCDType(Parts&: VecParts, DstTy: VecTy, NarrowTy: NarrowVecTy, SrcReg: SrcVec);
4370
4371 // Build a sequence of NarrowTy pieces in VecParts for this operand.
4372 LLT LCMTy = buildLCMMergePieces(DstTy: VecTy, NarrowTy: NarrowVecTy, GCDTy, VRegs&: VecParts,
4373 PadStrategy: TargetOpcode::G_ANYEXT);
4374
4375 unsigned NewNumElts = NarrowVecTy.getNumElements();
4376
4377 LLT IdxTy = MRI.getType(Reg: Idx);
4378 int64_t PartIdx = IdxVal / NewNumElts;
4379 auto NewIdx =
4380 MIRBuilder.buildConstant(Res: IdxTy, Val: IdxVal - NewNumElts * PartIdx);
4381
4382 if (IsInsert) {
4383 LLT PartTy = MRI.getType(Reg: VecParts[PartIdx]);
4384
4385 // Use the adjusted index to insert into one of the subvectors.
4386 auto InsertPart = MIRBuilder.buildInsertVectorElement(
4387 Res: PartTy, Val: VecParts[PartIdx], Elt: InsertVal, Idx: NewIdx);
4388 VecParts[PartIdx] = InsertPart.getReg(Idx: 0);
4389
4390 // Recombine the inserted subvector with the others to reform the result
4391 // vector.
4392 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: VecParts);
4393 } else {
4394 MIRBuilder.buildExtractVectorElement(Res: DstReg, Val: VecParts[PartIdx], Idx: NewIdx);
4395 }
4396
4397 MI.eraseFromParent();
4398 return Legalized;
4399 }
4400
4401 // With a variable index, we can't perform the operation in a smaller type, so
4402 // we're forced to expand this.
4403 //
4404 // TODO: We could emit a chain of compare/select to figure out which piece to
4405 // index.
4406 return lowerExtractInsertVectorElt(MI);
4407}
4408
4409LegalizerHelper::LegalizeResult
4410LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4411 LLT NarrowTy) {
4412 // FIXME: Don't know how to handle secondary types yet.
4413 if (TypeIdx != 0)
4414 return UnableToLegalize;
4415
4416 // This implementation doesn't work for atomics. Give up instead of doing
4417 // something invalid.
4418 if (LdStMI.isAtomic())
4419 return UnableToLegalize;
4420
4421 bool IsLoad = isa<GLoad>(Val: LdStMI);
4422 Register ValReg = LdStMI.getReg(Idx: 0);
4423 Register AddrReg = LdStMI.getPointerReg();
4424 LLT ValTy = MRI.getType(Reg: ValReg);
4425
4426 // FIXME: Do we need a distinct NarrowMemory legalize action?
4427 if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4428 LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4429 return UnableToLegalize;
4430 }
4431
4432 int NumParts = -1;
4433 int NumLeftover = -1;
4434 LLT LeftoverTy;
4435 SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4436 if (IsLoad) {
4437 std::tie(args&: NumParts, args&: NumLeftover) = getNarrowTypeBreakDown(OrigTy: ValTy, NarrowTy, LeftoverTy);
4438 } else {
4439 if (extractParts(Reg: ValReg, RegTy: ValTy, MainTy: NarrowTy, LeftoverTy, VRegs&: NarrowRegs,
4440 LeftoverVRegs&: NarrowLeftoverRegs, MIRBuilder, MRI)) {
4441 NumParts = NarrowRegs.size();
4442 NumLeftover = NarrowLeftoverRegs.size();
4443 }
4444 }
4445
4446 if (NumParts == -1)
4447 return UnableToLegalize;
4448
4449 LLT PtrTy = MRI.getType(Reg: AddrReg);
4450 const LLT OffsetTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
4451
4452 unsigned TotalSize = ValTy.getSizeInBits();
4453
4454 // Split the load/store into PartTy-sized pieces starting at Offset. If this
4455 // is a load, return the new registers in ValRegs. For a store, each element
4456 // of ValRegs should already be of type PartTy. Returns the next offset that
4457 // needs to be handled.
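  // For example, an s96 load narrowed to s32 becomes three s32 loads at byte
  // offsets 0, 4 and 8 (in little-endian order), whose results are re-merged
  // into the original s96 value.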
4458 bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4459 auto MMO = LdStMI.getMMO();
4460 auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4461 unsigned NumParts, unsigned Offset) -> unsigned {
4462 MachineFunction &MF = MIRBuilder.getMF();
4463 unsigned PartSize = PartTy.getSizeInBits();
4464 for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4465 ++Idx) {
4466 unsigned ByteOffset = Offset / 8;
4467 Register NewAddrReg;
4468
4469 MIRBuilder.materializePtrAdd(Res&: NewAddrReg, Op0: AddrReg, ValueTy: OffsetTy, Value: ByteOffset);
4470
4471 MachineMemOperand *NewMMO =
4472 MF.getMachineMemOperand(MMO: &MMO, Offset: ByteOffset, Ty: PartTy);
4473
4474 if (IsLoad) {
4475 Register Dst = MRI.createGenericVirtualRegister(Ty: PartTy);
4476 ValRegs.push_back(Elt: Dst);
4477 MIRBuilder.buildLoad(Res: Dst, Addr: NewAddrReg, MMO&: *NewMMO);
4478 } else {
4479 MIRBuilder.buildStore(Val: ValRegs[Idx], Addr: NewAddrReg, MMO&: *NewMMO);
4480 }
4481 Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4482 }
4483
4484 return Offset;
4485 };
4486
4487 unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4488 unsigned HandledOffset =
4489 splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4490
4491 // Handle the rest of the register if this isn't an even type breakdown.
4492 if (LeftoverTy.isValid())
4493 splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4494
4495 if (IsLoad) {
4496 insertParts(DstReg: ValReg, ResultTy: ValTy, PartTy: NarrowTy, PartRegs: NarrowRegs,
4497 LeftoverTy, LeftoverRegs: NarrowLeftoverRegs);
4498 }
4499
4500 LdStMI.eraseFromParent();
4501 return Legalized;
4502}
4503
4504LegalizerHelper::LegalizeResult
4505LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4506 LLT NarrowTy) {
4507 using namespace TargetOpcode;
4508 GenericMachineInstr &GMI = cast<GenericMachineInstr>(Val&: MI);
4509 unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4510
4511 switch (MI.getOpcode()) {
4512 case G_IMPLICIT_DEF:
4513 case G_TRUNC:
4514 case G_AND:
4515 case G_OR:
4516 case G_XOR:
4517 case G_ADD:
4518 case G_SUB:
4519 case G_MUL:
4520 case G_PTR_ADD:
4521 case G_SMULH:
4522 case G_UMULH:
4523 case G_FADD:
4524 case G_FMUL:
4525 case G_FSUB:
4526 case G_FNEG:
4527 case G_FABS:
4528 case G_FCANONICALIZE:
4529 case G_FDIV:
4530 case G_FREM:
4531 case G_FMA:
4532 case G_FMAD:
4533 case G_FPOW:
4534 case G_FEXP:
4535 case G_FEXP2:
4536 case G_FEXP10:
4537 case G_FLOG:
4538 case G_FLOG2:
4539 case G_FLOG10:
4540 case G_FLDEXP:
4541 case G_FNEARBYINT:
4542 case G_FCEIL:
4543 case G_FFLOOR:
4544 case G_FRINT:
4545 case G_INTRINSIC_ROUND:
4546 case G_INTRINSIC_ROUNDEVEN:
4547 case G_INTRINSIC_TRUNC:
4548 case G_FCOS:
4549 case G_FSIN:
4550 case G_FSQRT:
4551 case G_BSWAP:
4552 case G_BITREVERSE:
4553 case G_SDIV:
4554 case G_UDIV:
4555 case G_SREM:
4556 case G_UREM:
4557 case G_SDIVREM:
4558 case G_UDIVREM:
4559 case G_SMIN:
4560 case G_SMAX:
4561 case G_UMIN:
4562 case G_UMAX:
4563 case G_ABS:
4564 case G_FMINNUM:
4565 case G_FMAXNUM:
4566 case G_FMINNUM_IEEE:
4567 case G_FMAXNUM_IEEE:
4568 case G_FMINIMUM:
4569 case G_FMAXIMUM:
4570 case G_FSHL:
4571 case G_FSHR:
4572 case G_ROTL:
4573 case G_ROTR:
4574 case G_FREEZE:
4575 case G_SADDSAT:
4576 case G_SSUBSAT:
4577 case G_UADDSAT:
4578 case G_USUBSAT:
4579 case G_UMULO:
4580 case G_SMULO:
4581 case G_SHL:
4582 case G_LSHR:
4583 case G_ASHR:
4584 case G_SSHLSAT:
4585 case G_USHLSAT:
4586 case G_CTLZ:
4587 case G_CTLZ_ZERO_UNDEF:
4588 case G_CTTZ:
4589 case G_CTTZ_ZERO_UNDEF:
4590 case G_CTPOP:
4591 case G_FCOPYSIGN:
4592 case G_ZEXT:
4593 case G_SEXT:
4594 case G_ANYEXT:
4595 case G_FPEXT:
4596 case G_FPTRUNC:
4597 case G_SITOFP:
4598 case G_UITOFP:
4599 case G_FPTOSI:
4600 case G_FPTOUI:
4601 case G_INTTOPTR:
4602 case G_PTRTOINT:
4603 case G_ADDRSPACE_CAST:
4604 case G_UADDO:
4605 case G_USUBO:
4606 case G_UADDE:
4607 case G_USUBE:
4608 case G_SADDO:
4609 case G_SSUBO:
4610 case G_SADDE:
4611 case G_SSUBE:
4612 case G_STRICT_FADD:
4613 case G_STRICT_FSUB:
4614 case G_STRICT_FMUL:
4615 case G_STRICT_FMA:
4616 case G_STRICT_FLDEXP:
4617 case G_FFREXP:
4618 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
4619 case G_ICMP:
4620 case G_FCMP:
4621 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*cmp predicate*/});
4622 case G_IS_FPCLASS:
4623 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2, 3 /*mask,fpsem*/});
4624 case G_SELECT:
4625 if (MRI.getType(Reg: MI.getOperand(i: 1).getReg()).isVector())
4626 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts);
4627 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {1 /*scalar cond*/});
4628 case G_PHI:
4629 return fewerElementsVectorPhi(MI&: GMI, NumElts);
4630 case G_UNMERGE_VALUES:
4631 return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4632 case G_BUILD_VECTOR:
4633 assert(TypeIdx == 0 && "not a vector type index");
4634 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4635 case G_CONCAT_VECTORS:
4636 if (TypeIdx != 1) // TODO: This probably does work as expected already.
4637 return UnableToLegalize;
4638 return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4639 case G_EXTRACT_VECTOR_ELT:
4640 case G_INSERT_VECTOR_ELT:
4641 return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowVecTy: NarrowTy);
4642 case G_LOAD:
4643 case G_STORE:
4644 return reduceLoadStoreWidth(LdStMI&: cast<GLoadStore>(Val&: MI), TypeIdx, NarrowTy);
4645 case G_SEXT_INREG:
4646 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*imm*/});
4647 GISEL_VECREDUCE_CASES_NONSEQ
4648 return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4649 case TargetOpcode::G_VECREDUCE_SEQ_FADD:
4650 case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
4651 return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
4652 case G_SHUFFLE_VECTOR:
4653 return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4654 case G_FPOWI:
4655 return fewerElementsVectorMultiEltType(MI&: GMI, NumElts, NonVecOpIndices: {2 /*pow*/});
4656 default:
4657 return UnableToLegalize;
4658 }
4659}
4660
4661LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4662 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4663 assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4664 if (TypeIdx != 0)
4665 return UnableToLegalize;
4666
4667 auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
4668 MI.getFirst3RegLLTs();
4669 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
4670 // The shuffle should be canonicalized by now.
4671 if (DstTy != Src1Ty)
4672 return UnableToLegalize;
4673 if (DstTy != Src2Ty)
4674 return UnableToLegalize;
4675
4676 if (!isPowerOf2_32(Value: DstTy.getNumElements()))
4677 return UnableToLegalize;
4678
4679 // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4680 // Further legalization attempts will be needed to split further.
4681 NarrowTy =
4682 DstTy.changeElementCount(EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
4683 unsigned NewElts = NarrowTy.getNumElements();
4684
4685 SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4686 extractParts(Reg: Src1Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc1Regs, MIRBuilder, MRI);
4687 extractParts(Reg: Src2Reg, Ty: NarrowTy, NumParts: 2, VRegs&: SplitSrc2Regs, MIRBuilder, MRI);
4688 Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4689 SplitSrc2Regs[1]};
4690
4691 Register Hi, Lo;
4692
4693 // If Lo or Hi uses elements from at most two of the four input vectors, then
4694 // express it as a vector shuffle of those two inputs. Otherwise extract the
4695 // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
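  // For example, shuffling two <8 x s32> sources with mask
  // <0, 4, 1, 5, 2, 6, 3, 7> splits into four <4 x s32> inputs; the Lo half
  // only reads Inputs[0] and Inputs[1], so it becomes
  // G_SHUFFLE_VECTOR Inputs[0], Inputs[1], <0, 4, 1, 5>.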
4696 SmallVector<int, 16> Ops;
4697 for (unsigned High = 0; High < 2; ++High) {
4698 Register &Output = High ? Hi : Lo;
4699
4700 // Build a shuffle mask for the output, discovering on the fly which
4701 // input vectors to use as shuffle operands (recorded in InputUsed).
4702 // If building a suitable shuffle vector proves too hard, then bail
4703 // out with useBuildVector set.
4704 unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4705 unsigned FirstMaskIdx = High * NewElts;
4706 bool UseBuildVector = false;
4707 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4708 // The mask element. This indexes into the input.
4709 int Idx = Mask[FirstMaskIdx + MaskOffset];
4710
4711 // The input vector this mask element indexes into.
4712 unsigned Input = (unsigned)Idx / NewElts;
4713
4714 if (Input >= std::size(Inputs)) {
4715 // The mask element does not index into any input vector.
4716 Ops.push_back(Elt: -1);
4717 continue;
4718 }
4719
4720 // Turn the index into an offset from the start of the input vector.
4721 Idx -= Input * NewElts;
4722
4723 // Find or create a shuffle vector operand to hold this input.
4724 unsigned OpNo;
4725 for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
4726 if (InputUsed[OpNo] == Input) {
4727 // This input vector is already an operand.
4728 break;
4729 } else if (InputUsed[OpNo] == -1U) {
4730 // Create a new operand for this input vector.
4731 InputUsed[OpNo] = Input;
4732 break;
4733 }
4734 }
4735
4736 if (OpNo >= std::size(InputUsed)) {
4737 // More than two input vectors used! Give up on trying to create a
4738 // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
4739 UseBuildVector = true;
4740 break;
4741 }
4742
4743 // Add the mask index for the new shuffle vector.
4744 Ops.push_back(Elt: Idx + OpNo * NewElts);
4745 }
4746
4747 if (UseBuildVector) {
4748 LLT EltTy = NarrowTy.getElementType();
4749 SmallVector<Register, 16> SVOps;
4750
4751 // Extract the input elements by hand.
4752 for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4753 // The mask element. This indexes into the input.
4754 int Idx = Mask[FirstMaskIdx + MaskOffset];
4755
4756 // The input vector this mask element indexes into.
4757 unsigned Input = (unsigned)Idx / NewElts;
4758
4759 if (Input >= std::size(Inputs)) {
4760 // The mask element is "undef" or indexes off the end of the input.
4761 SVOps.push_back(Elt: MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0));
4762 continue;
4763 }
4764
4765 // Turn the index into an offset from the start of the input vector.
4766 Idx -= Input * NewElts;
4767
4768 // Extract the vector element by hand.
4769 SVOps.push_back(Elt: MIRBuilder
4770 .buildExtractVectorElement(
4771 Res: EltTy, Val: Inputs[Input],
4772 Idx: MIRBuilder.buildConstant(Res: LLT::scalar(SizeInBits: 32), Val: Idx))
4773 .getReg(Idx: 0));
4774 }
4775
4776 // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4777 Output = MIRBuilder.buildBuildVector(Res: NarrowTy, Ops: SVOps).getReg(Idx: 0);
4778 } else if (InputUsed[0] == -1U) {
4779 // No input vectors were used! The result is undefined.
4780 Output = MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0);
4781 } else {
4782 Register Op0 = Inputs[InputUsed[0]];
4783 // If only one input was used, use an undefined vector for the other.
4784 Register Op1 = InputUsed[1] == -1U
4785 ? MIRBuilder.buildUndef(Res: NarrowTy).getReg(Idx: 0)
4786 : Inputs[InputUsed[1]];
4787 // At least one input vector was used. Create a new shuffle vector.
4788 Output = MIRBuilder.buildShuffleVector(Res: NarrowTy, Src1: Op0, Src2: Op1, Mask: Ops).getReg(Idx: 0);
4789 }
4790
4791 Ops.clear();
4792 }
4793
4794 MIRBuilder.buildConcatVectors(Res: DstReg, Ops: {Lo, Hi});
4795 MI.eraseFromParent();
4796 return Legalized;
4797}
4798
4799LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4800 MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4801 auto &RdxMI = cast<GVecReduce>(Val&: MI);
4802
4803 if (TypeIdx != 1)
4804 return UnableToLegalize;
4805
4806 // The semantics of the normal non-sequential reductions allow us to freely
4807 // re-associate the operation.
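  // For example, a G_VECREDUCE_ADD of <6 x s32> with NarrowTy <2 x s32> is
  // split into three <2 x s32> partial reductions, each producing an s32,
  // which are then combined with scalar G_ADDs.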
4808 auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
4809
4810 if (NarrowTy.isVector() &&
4811 (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4812 return UnableToLegalize;
4813
4814 unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
4815 SmallVector<Register> SplitSrcs;
4816 // If NarrowTy is a scalar then we're being asked to scalarize.
4817 const unsigned NumParts =
4818 NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4819 : SrcTy.getNumElements();
4820
4821 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
4822 if (NarrowTy.isScalar()) {
4823 if (DstTy != NarrowTy)
4824 return UnableToLegalize; // FIXME: handle implicit extensions.
4825
4826 if (isPowerOf2_32(Value: NumParts)) {
4827 // Generate a tree of scalar operations to reduce the critical path.
4828 SmallVector<Register> PartialResults;
4829 unsigned NumPartsLeft = NumParts;
4830 while (NumPartsLeft > 1) {
4831 for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4832 PartialResults.emplace_back(
4833 Args: MIRBuilder
4834 .buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy},
4835 SrcOps: {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4836 .getReg(Idx: 0));
4837 }
4838 SplitSrcs = PartialResults;
4839 PartialResults.clear();
4840 NumPartsLeft = SplitSrcs.size();
4841 }
4842 assert(SplitSrcs.size() == 1);
4843 MIRBuilder.buildCopy(Res: DstReg, Op: SplitSrcs[0]);
4844 MI.eraseFromParent();
4845 return Legalized;
4846 }
4847 // If we can't generate a tree, then just do sequential operations.
4848 Register Acc = SplitSrcs[0];
4849 for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4850 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[Idx]})
4851 .getReg(Idx: 0);
4852 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
4853 MI.eraseFromParent();
4854 return Legalized;
4855 }
4856 SmallVector<Register> PartialReductions;
4857 for (unsigned Part = 0; Part < NumParts; ++Part) {
4858 PartialReductions.push_back(
4859 Elt: MIRBuilder.buildInstr(Opc: RdxMI.getOpcode(), DstOps: {DstTy}, SrcOps: {SplitSrcs[Part]})
4860 .getReg(Idx: 0));
4861 }
4862
4863 // If the types involved are powers of 2, we can generate intermediate vector
4864 // ops, before generating a final reduction operation.
4865 if (isPowerOf2_32(Value: SrcTy.getNumElements()) &&
4866 isPowerOf2_32(Value: NarrowTy.getNumElements())) {
4867 return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4868 }
4869
4870 Register Acc = PartialReductions[0];
4871 for (unsigned Part = 1; Part < NumParts; ++Part) {
4872 if (Part == NumParts - 1) {
4873 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {DstReg},
4874 SrcOps: {Acc, PartialReductions[Part]});
4875 } else {
4876 Acc = MIRBuilder
4877 .buildInstr(Opc: ScalarOpc, DstOps: {DstTy}, SrcOps: {Acc, PartialReductions[Part]})
4878 .getReg(Idx: 0);
4879 }
4880 }
4881 MI.eraseFromParent();
4882 return Legalized;
4883}
4884
4885LegalizerHelper::LegalizeResult
4886LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
4887 unsigned int TypeIdx,
4888 LLT NarrowTy) {
4889 auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
4890 MI.getFirst3RegLLTs();
4891 if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
4892 DstTy != NarrowTy)
4893 return UnableToLegalize;
4894
4895 assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
4896 MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
4897 "Unexpected vecreduce opcode");
4898 unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
4899 ? TargetOpcode::G_FADD
4900 : TargetOpcode::G_FMUL;
4901
4902 SmallVector<Register> SplitSrcs;
4903 unsigned NumParts = SrcTy.getNumElements();
4904 extractParts(Reg: SrcReg, Ty: NarrowTy, NumParts, VRegs&: SplitSrcs, MIRBuilder, MRI);
4905 Register Acc = ScalarReg;
4906 for (unsigned i = 0; i < NumParts; i++)
4907 Acc = MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {Acc, SplitSrcs[i]})
4908 .getReg(Idx: 0);
4909
4910 MIRBuilder.buildCopy(Res: DstReg, Op: Acc);
4911 MI.eraseFromParent();
4912 return Legalized;
4913}
4914
4915LegalizerHelper::LegalizeResult
4916LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4917 LLT SrcTy, LLT NarrowTy,
4918 unsigned ScalarOpc) {
4919 SmallVector<Register> SplitSrcs;
4920 // Split the sources into NarrowTy size pieces.
4921 extractParts(Reg: SrcReg, Ty: NarrowTy,
4922 NumParts: SrcTy.getNumElements() / NarrowTy.getNumElements(), VRegs&: SplitSrcs,
4923 MIRBuilder, MRI);
4924 // We're going to do a tree reduction using vector operations until we have
4925 // one NarrowTy size value left.
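  // For example, reducing <8 x s32> with NarrowTy <2 x s32>: the four
  // <2 x s32> pieces are combined pairwise with vector ops down to a single
  // <2 x s32>, and the original reduction is then rewritten to consume it.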
4926 while (SplitSrcs.size() > 1) {
4927 SmallVector<Register> PartialRdxs;
4928 for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4929 Register LHS = SplitSrcs[Idx];
4930 Register RHS = SplitSrcs[Idx + 1];
4931 // Create the intermediate vector op.
4932 Register Res =
4933 MIRBuilder.buildInstr(Opc: ScalarOpc, DstOps: {NarrowTy}, SrcOps: {LHS, RHS}).getReg(Idx: 0);
4934 PartialRdxs.push_back(Elt: Res);
4935 }
4936 SplitSrcs = std::move(PartialRdxs);
4937 }
4938 // Finally generate the requested NarrowTy based reduction.
4939 Observer.changingInstr(MI);
4940 MI.getOperand(i: 1).setReg(SplitSrcs[0]);
4941 Observer.changedInstr(MI);
4942 return Legalized;
4943}
4944
4945LegalizerHelper::LegalizeResult
4946LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4947 const LLT HalfTy, const LLT AmtTy) {
4948
4949 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
4950 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
4951 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
4952
4953 if (Amt.isZero()) {
4954 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {InL, InH});
4955 MI.eraseFromParent();
4956 return Legalized;
4957 }
4958
4959 LLT NVT = HalfTy;
4960 unsigned NVTBits = HalfTy.getSizeInBits();
4961 unsigned VTBits = 2 * NVTBits;
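  // For example, narrowing an s64 shift into s32 halves with a constant
  // amount: a G_SHL by Amt < 32 gives Lo = InL << Amt and
  // Hi = (InH << Amt) | (InL >> (32 - Amt)); by exactly 32 it gives Lo = 0,
  // Hi = InL; by 32 < Amt <= 64 it gives Lo = 0, Hi = InL << (Amt - 32).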
4962
4963 SrcOp Lo(Register(0)), Hi(Register(0));
4964 if (MI.getOpcode() == TargetOpcode::G_SHL) {
4965 if (Amt.ugt(RHS: VTBits)) {
4966 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4967 } else if (Amt.ugt(RHS: NVTBits)) {
4968 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4969 Hi = MIRBuilder.buildShl(Dst: NVT, Src0: InL,
4970 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
4971 } else if (Amt == NVTBits) {
4972 Lo = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4973 Hi = InL;
4974 } else {
4975 Lo = MIRBuilder.buildShl(Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
4976 auto OrLHS =
4977 MIRBuilder.buildShl(Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt));
4978 auto OrRHS = MIRBuilder.buildLShr(
4979 Dst: NVT, Src0: InL, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
4980 Hi = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
4981 }
4982 } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4983 if (Amt.ugt(RHS: VTBits)) {
4984 Lo = Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4985 } else if (Amt.ugt(RHS: NVTBits)) {
4986 Lo = MIRBuilder.buildLShr(Dst: NVT, Src0: InH,
4987 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
4988 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4989 } else if (Amt == NVTBits) {
4990 Lo = InH;
4991 Hi = MIRBuilder.buildConstant(Res: NVT, Val: 0);
4992 } else {
4993 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
4994
4995 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
4996 auto OrRHS = MIRBuilder.buildShl(
4997 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
4998
4999 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
5000 Hi = MIRBuilder.buildLShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
5001 }
5002 } else {
5003 if (Amt.ugt(RHS: VTBits)) {
5004 Hi = Lo = MIRBuilder.buildAShr(
5005 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5006 } else if (Amt.ugt(RHS: NVTBits)) {
5007 Lo = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5008 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: Amt - NVTBits));
5009 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5010 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5011 } else if (Amt == NVTBits) {
5012 Lo = InH;
5013 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH,
5014 Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: NVTBits - 1));
5015 } else {
5016 auto ShiftAmtConst = MIRBuilder.buildConstant(Res: AmtTy, Val: Amt);
5017
5018 auto OrLHS = MIRBuilder.buildLShr(Dst: NVT, Src0: InL, Src1: ShiftAmtConst);
5019 auto OrRHS = MIRBuilder.buildShl(
5020 Dst: NVT, Src0: InH, Src1: MIRBuilder.buildConstant(Res: AmtTy, Val: -Amt + NVTBits));
5021
5022 Lo = MIRBuilder.buildOr(Dst: NVT, Src0: OrLHS, Src1: OrRHS);
5023 Hi = MIRBuilder.buildAShr(Dst: NVT, Src0: InH, Src1: ShiftAmtConst);
5024 }
5025 }
5026
5027 MIRBuilder.buildMergeLikeInstr(Res: MI.getOperand(i: 0), Ops: {Lo, Hi});
5028 MI.eraseFromParent();
5029
5030 return Legalized;
5031}
5032
5033// TODO: Optimize if constant shift amount.
5034LegalizerHelper::LegalizeResult
5035LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
5036 LLT RequestedTy) {
5037 if (TypeIdx == 1) {
5038 Observer.changingInstr(MI);
5039 narrowScalarSrc(MI, NarrowTy: RequestedTy, OpIdx: 2);
5040 Observer.changedInstr(MI);
5041 return Legalized;
5042 }
5043
5044 Register DstReg = MI.getOperand(i: 0).getReg();
5045 LLT DstTy = MRI.getType(Reg: DstReg);
5046 if (DstTy.isVector())
5047 return UnableToLegalize;
5048
5049 Register Amt = MI.getOperand(i: 2).getReg();
5050 LLT ShiftAmtTy = MRI.getType(Reg: Amt);
5051 const unsigned DstEltSize = DstTy.getScalarSizeInBits();
5052 if (DstEltSize % 2 != 0)
5053 return UnableToLegalize;
5054
5055 // Ignore the input type. We can only go to exactly half the size of the
5056 // input. If that isn't small enough, the resulting pieces will be further
5057 // legalized.
5058 const unsigned NewBitSize = DstEltSize / 2;
5059 const LLT HalfTy = LLT::scalar(SizeInBits: NewBitSize);
5060 const LLT CondTy = LLT::scalar(SizeInBits: 1);
5061
5062 if (auto VRegAndVal = getIConstantVRegValWithLookThrough(VReg: Amt, MRI)) {
5063 return narrowScalarShiftByConstant(MI, Amt: VRegAndVal->Value, HalfTy,
5064 AmtTy: ShiftAmtTy);
5065 }
5066
5067 // TODO: Expand with known bits.
5068
5069 // Handle the fully general expansion by an unknown amount.
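  // For example, an s64 G_SHL split into s32 halves with an unknown amount:
  //   short (Amt < 32):  Lo = InL << Amt
  //                      Hi = (InH << Amt) | (InL >> (32 - Amt))
  //   long  (Amt >= 32): Lo = 0,  Hi = InL << (Amt - 32)
  // with selects choosing between the two, plus an extra IsZero select on the
  // high half so that Amt == 0 does not read a shift-by-32 result.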
5070 auto NewBits = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize);
5071
5072 Register InL = MRI.createGenericVirtualRegister(Ty: HalfTy);
5073 Register InH = MRI.createGenericVirtualRegister(Ty: HalfTy);
5074 MIRBuilder.buildUnmerge(Res: {InL, InH}, Op: MI.getOperand(i: 1));
5075
5076 auto AmtExcess = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: Amt, Src1: NewBits);
5077 auto AmtLack = MIRBuilder.buildSub(Dst: ShiftAmtTy, Src0: NewBits, Src1: Amt);
5078
5079 auto Zero = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: 0);
5080 auto IsShort = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_ULT, Res: CondTy, Op0: Amt, Op1: NewBits);
5081 auto IsZero = MIRBuilder.buildICmp(Pred: ICmpInst::ICMP_EQ, Res: CondTy, Op0: Amt, Op1: Zero);
5082
5083 Register ResultRegs[2];
5084 switch (MI.getOpcode()) {
5085 case TargetOpcode::G_SHL: {
5086 // Short: ShAmt < NewBitSize
5087 auto LoS = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: Amt);
5088
5089 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: AmtLack);
5090 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: Amt);
5091 auto HiS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
5092
5093 // Long: ShAmt >= NewBitSize
5094 auto LoL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Lo part is zero.
5095 auto HiL = MIRBuilder.buildShl(Dst: HalfTy, Src0: InL, Src1: AmtExcess); // Hi from Lo part.
5096
5097 auto Lo = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL);
5098 auto Hi = MIRBuilder.buildSelect(
5099 Res: HalfTy, Tst: IsZero, Op0: InH, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL));
5100
5101 ResultRegs[0] = Lo.getReg(Idx: 0);
5102 ResultRegs[1] = Hi.getReg(Idx: 0);
5103 break;
5104 }
5105 case TargetOpcode::G_LSHR:
5106 case TargetOpcode::G_ASHR: {
5107 // Short: ShAmt < NewBitSize
5108 auto HiS = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy}, SrcOps: {InH, Amt});
5109
5110 auto LoOr = MIRBuilder.buildLShr(Dst: HalfTy, Src0: InL, Src1: Amt);
5111 auto HiOr = MIRBuilder.buildShl(Dst: HalfTy, Src0: InH, Src1: AmtLack);
5112 auto LoS = MIRBuilder.buildOr(Dst: HalfTy, Src0: LoOr, Src1: HiOr);
5113
5114 // Long: ShAmt >= NewBitSize
5115 MachineInstrBuilder HiL;
5116 if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5117 HiL = MIRBuilder.buildConstant(Res: HalfTy, Val: 0); // Hi part is zero.
5118 } else {
5119 auto ShiftAmt = MIRBuilder.buildConstant(Res: ShiftAmtTy, Val: NewBitSize - 1);
5120 HiL = MIRBuilder.buildAShr(Dst: HalfTy, Src0: InH, Src1: ShiftAmt); // Sign of Hi part.
5121 }
5122 auto LoL = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {HalfTy},
5123 SrcOps: {InH, AmtExcess}); // Lo from Hi part.
5124
5125 auto Lo = MIRBuilder.buildSelect(
5126 Res: HalfTy, Tst: IsZero, Op0: InL, Op1: MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: LoS, Op1: LoL));
5127
5128 auto Hi = MIRBuilder.buildSelect(Res: HalfTy, Tst: IsShort, Op0: HiS, Op1: HiL);
5129
5130 ResultRegs[0] = Lo.getReg(Idx: 0);
5131 ResultRegs[1] = Hi.getReg(Idx: 0);
5132 break;
5133 }
5134 default:
5135 llvm_unreachable("not a shift");
5136 }
5137
5138 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: ResultRegs);
5139 MI.eraseFromParent();
5140 return Legalized;
5141}
5142
5143LegalizerHelper::LegalizeResult
5144LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5145 LLT MoreTy) {
5146 assert(TypeIdx == 0 && "Expecting only Idx 0");
5147
5148 Observer.changingInstr(MI);
5149 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5150 MachineBasicBlock &OpMBB = *MI.getOperand(i: I + 1).getMBB();
5151 MIRBuilder.setInsertPt(MBB&: OpMBB, II: OpMBB.getFirstTerminator());
5152 moreElementsVectorSrc(MI, MoreTy, OpIdx: I);
5153 }
5154
5155 MachineBasicBlock &MBB = *MI.getParent();
5156 MIRBuilder.setInsertPt(MBB, II: --MBB.getFirstNonPHI());
5157 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5158 Observer.changedInstr(MI);
5159 return Legalized;
5160}
5161
5162LegalizerHelper::LegalizeResult
5163LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5164 LLT MoreTy) {
5165 unsigned Opc = MI.getOpcode();
5166 switch (Opc) {
5167 case TargetOpcode::G_IMPLICIT_DEF:
5168 case TargetOpcode::G_LOAD: {
5169 if (TypeIdx != 0)
5170 return UnableToLegalize;
5171 Observer.changingInstr(MI);
5172 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5173 Observer.changedInstr(MI);
5174 return Legalized;
5175 }
5176 case TargetOpcode::G_STORE:
5177 if (TypeIdx != 0)
5178 return UnableToLegalize;
5179 Observer.changingInstr(MI);
5180 moreElementsVectorSrc(MI, MoreTy, OpIdx: 0);
5181 Observer.changedInstr(MI);
5182 return Legalized;
5183 case TargetOpcode::G_AND:
5184 case TargetOpcode::G_OR:
5185 case TargetOpcode::G_XOR:
5186 case TargetOpcode::G_ADD:
5187 case TargetOpcode::G_SUB:
5188 case TargetOpcode::G_MUL:
5189 case TargetOpcode::G_FADD:
5190 case TargetOpcode::G_FSUB:
5191 case TargetOpcode::G_FMUL:
5192 case TargetOpcode::G_FDIV:
5193 case TargetOpcode::G_UADDSAT:
5194 case TargetOpcode::G_USUBSAT:
5195 case TargetOpcode::G_SADDSAT:
5196 case TargetOpcode::G_SSUBSAT:
5197 case TargetOpcode::G_SMIN:
5198 case TargetOpcode::G_SMAX:
5199 case TargetOpcode::G_UMIN:
5200 case TargetOpcode::G_UMAX:
5201 case TargetOpcode::G_FMINNUM:
5202 case TargetOpcode::G_FMAXNUM:
5203 case TargetOpcode::G_FMINNUM_IEEE:
5204 case TargetOpcode::G_FMAXNUM_IEEE:
5205 case TargetOpcode::G_FMINIMUM:
5206 case TargetOpcode::G_FMAXIMUM:
5207 case TargetOpcode::G_STRICT_FADD:
5208 case TargetOpcode::G_STRICT_FSUB:
5209 case TargetOpcode::G_STRICT_FMUL:
5210 case TargetOpcode::G_SHL:
5211 case TargetOpcode::G_ASHR:
5212 case TargetOpcode::G_LSHR: {
5213 Observer.changingInstr(MI);
5214 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
5215 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
5216 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5217 Observer.changedInstr(MI);
5218 return Legalized;
5219 }
5220 case TargetOpcode::G_FMA:
5221 case TargetOpcode::G_STRICT_FMA:
5222 case TargetOpcode::G_FSHR:
5223 case TargetOpcode::G_FSHL: {
5224 Observer.changingInstr(MI);
5225 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
5226 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
5227 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
5228 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5229 Observer.changedInstr(MI);
5230 return Legalized;
5231 }
5232 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
5233 case TargetOpcode::G_EXTRACT:
5234 if (TypeIdx != 1)
5235 return UnableToLegalize;
5236 Observer.changingInstr(MI);
5237 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
5238 Observer.changedInstr(MI);
5239 return Legalized;
5240 case TargetOpcode::G_INSERT:
5241 case TargetOpcode::G_INSERT_VECTOR_ELT:
5242 case TargetOpcode::G_FREEZE:
5243 case TargetOpcode::G_FNEG:
5244 case TargetOpcode::G_FABS:
5245 case TargetOpcode::G_FSQRT:
5246 case TargetOpcode::G_FCEIL:
5247 case TargetOpcode::G_FFLOOR:
5248 case TargetOpcode::G_FNEARBYINT:
5249 case TargetOpcode::G_FRINT:
5250 case TargetOpcode::G_INTRINSIC_ROUND:
5251 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
5252 case TargetOpcode::G_INTRINSIC_TRUNC:
5253 case TargetOpcode::G_BSWAP:
5254 case TargetOpcode::G_FCANONICALIZE:
5255 case TargetOpcode::G_SEXT_INREG:
5256 case TargetOpcode::G_ABS:
5257 if (TypeIdx != 0)
5258 return UnableToLegalize;
5259 Observer.changingInstr(MI);
5260 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
5261 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5262 Observer.changedInstr(MI);
5263 return Legalized;
5264 case TargetOpcode::G_SELECT: {
5265 auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
5266 if (TypeIdx == 1) {
5267 if (!CondTy.isScalar() ||
5268 DstTy.getElementCount() != MoreTy.getElementCount())
5269 return UnableToLegalize;
5270
5271 // This is turning a scalar select of vectors into a vector
5272 // select. Broadcast the select condition.
5273 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: MoreTy, Src: CondReg);
5274 Observer.changingInstr(MI);
5275 MI.getOperand(i: 1).setReg(ShufSplat.getReg(Idx: 0));
5276 Observer.changedInstr(MI);
5277 return Legalized;
5278 }
5279
5280 if (CondTy.isVector())
5281 return UnableToLegalize;
5282
5283 Observer.changingInstr(MI);
5284 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
5285 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
5286 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5287 Observer.changedInstr(MI);
5288 return Legalized;
5289 }
5290 case TargetOpcode::G_UNMERGE_VALUES:
5291 return UnableToLegalize;
5292 case TargetOpcode::G_PHI:
5293 return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5294 case TargetOpcode::G_SHUFFLE_VECTOR:
5295 return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5296 case TargetOpcode::G_BUILD_VECTOR: {
5297 SmallVector<SrcOp, 8> Elts;
5298 for (auto Op : MI.uses()) {
5299 Elts.push_back(Elt: Op.getReg());
5300 }
5301
5302 for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
5303 Elts.push_back(Elt: MIRBuilder.buildUndef(Res: MoreTy.getScalarType()));
5304 }
5305
5306 MIRBuilder.buildDeleteTrailingVectorElements(
5307 Res: MI.getOperand(i: 0).getReg(), Op0: MIRBuilder.buildInstr(Opc, DstOps: {MoreTy}, SrcOps: Elts));
5308 MI.eraseFromParent();
5309 return Legalized;
5310 }
5311 case TargetOpcode::G_TRUNC:
5312 case TargetOpcode::G_FPTRUNC:
5313 case TargetOpcode::G_FPEXT:
5314 case TargetOpcode::G_FPTOSI:
5315 case TargetOpcode::G_FPTOUI:
5316 case TargetOpcode::G_SITOFP:
5317 case TargetOpcode::G_UITOFP: {
5318 if (TypeIdx != 0)
5319 return UnableToLegalize;
5320 Observer.changingInstr(MI);
5321 LLT SrcTy = LLT::fixed_vector(
5322 NumElements: MoreTy.getNumElements(),
5323 ScalarTy: MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getElementType());
5324 moreElementsVectorSrc(MI, MoreTy: SrcTy, OpIdx: 1);
5325 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5326 Observer.changedInstr(MI);
5327 return Legalized;
5328 }
5329 case TargetOpcode::G_ICMP:
5330 case TargetOpcode::G_FCMP: {
5331 if (TypeIdx != 1)
5332 return UnableToLegalize;
5333
5334 Observer.changingInstr(MI);
5335 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
5336 moreElementsVectorSrc(MI, MoreTy, OpIdx: 3);
5337 LLT CondTy = LLT::fixed_vector(
5338 NumElements: MoreTy.getNumElements(),
5339 ScalarTy: MRI.getType(Reg: MI.getOperand(i: 0).getReg()).getElementType());
5340 moreElementsVectorDst(MI, WideTy: CondTy, OpIdx: 0);
5341 Observer.changedInstr(MI);
5342 return Legalized;
5343 }
5344 default:
5345 return UnableToLegalize;
5346 }
5347}
5348
5349LegalizerHelper::LegalizeResult
5350LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
5351 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5352 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
5353 unsigned MaskNumElts = Mask.size();
5354 unsigned SrcNumElts = SrcTy.getNumElements();
5355 LLT DestEltTy = DstTy.getElementType();
5356
5357 if (MaskNumElts == SrcNumElts)
5358 return Legalized;
5359
5360 if (MaskNumElts < SrcNumElts) {
5361 // Extend mask to match new destination vector size with
5362 // undef values.
5363 SmallVector<int, 16> NewMask(Mask);
5364 for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
5365 NewMask.push_back(Elt: -1);
5366
5367 moreElementsVectorDst(MI, WideTy: SrcTy, OpIdx: 0);
5368 MIRBuilder.setInstrAndDebugLoc(MI);
5369 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
5370 Src1: MI.getOperand(i: 1).getReg(),
5371 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
5372 MI.eraseFromParent();
5373
5374 return Legalized;
5375 }
5376
5377 unsigned PaddedMaskNumElts = alignTo(Value: MaskNumElts, Align: SrcNumElts);
5378 unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
5379 LLT PaddedTy = LLT::fixed_vector(NumElements: PaddedMaskNumElts, ScalarTy: DestEltTy);
5380
5381 // Create new source vectors by concatenating the initial
5382 // source vectors with undefined vectors of the same size.
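  // For example, with a 6-element mask and <4 x s32> sources,
  // PaddedMaskNumElts is alignTo(6, 4) = 8 and NumConcat is 2, so each source
  // is concatenated with one undef <4 x s32> to form an <8 x s32> operand.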
5383 auto Undef = MIRBuilder.buildUndef(Res: SrcTy);
5384 SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(Idx: 0));
5385 SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(Idx: 0));
5386 MOps1[0] = MI.getOperand(i: 1).getReg();
5387 MOps2[0] = MI.getOperand(i: 2).getReg();
5388
5389 auto Src1 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps1);
5390 auto Src2 = MIRBuilder.buildConcatVectors(Res: PaddedTy, Ops: MOps2);
5391
5392 // Readjust mask for new input vector length.
5393 SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
5394 for (unsigned I = 0; I != MaskNumElts; ++I) {
5395 int Idx = Mask[I];
5396 if (Idx >= static_cast<int>(SrcNumElts))
5397 Idx += PaddedMaskNumElts - SrcNumElts;
5398 MappedOps[I] = Idx;
5399 }
5400
5401 // If we got more elements than required, extract subvector.
5402 if (MaskNumElts != PaddedMaskNumElts) {
5403 auto Shuffle =
5404 MIRBuilder.buildShuffleVector(Res: PaddedTy, Src1, Src2, Mask: MappedOps);
5405
5406 SmallVector<Register, 16> Elts(MaskNumElts);
5407 for (unsigned I = 0; I < MaskNumElts; ++I) {
5408 Elts[I] =
5409 MIRBuilder.buildExtractVectorElementConstant(Res: DestEltTy, Val: Shuffle, Idx: I)
5410 .getReg(Idx: 0);
5411 }
5412 MIRBuilder.buildBuildVector(Res: DstReg, Ops: Elts);
5413 } else {
5414 MIRBuilder.buildShuffleVector(Res: DstReg, Src1, Src2, Mask: MappedOps);
5415 }
5416
5417 MI.eraseFromParent();
5418 return LegalizerHelper::LegalizeResult::Legalized;
5419}
5420
5421LegalizerHelper::LegalizeResult
5422LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5423 unsigned int TypeIdx, LLT MoreTy) {
5424 auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
5425 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
5426 unsigned NumElts = DstTy.getNumElements();
5427 unsigned WidenNumElts = MoreTy.getNumElements();
5428
5429 if (DstTy.isVector() && Src1Ty.isVector() &&
5430 DstTy.getNumElements() != Src1Ty.getNumElements()) {
5431 return equalizeVectorShuffleLengths(MI);
5432 }
5433
5434 if (TypeIdx != 0)
5435 return UnableToLegalize;
5436
5437 // Expect a canonicalized shuffle.
5438 if (DstTy != Src1Ty || DstTy != Src2Ty)
5439 return UnableToLegalize;
5440
5441 moreElementsVectorSrc(MI, MoreTy, OpIdx: 1);
5442 moreElementsVectorSrc(MI, MoreTy, OpIdx: 2);
5443
5444 // Adjust mask based on new input vector length.
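  // For example, widening a <2 x s32> shuffle to <4 x s32>: a mask of <1, 3>
  // becomes <1, 5> (indices into the second source move up by the widened
  // length minus the old length) and is then padded with -1 to <1, 5, -1, -1>.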
5445 SmallVector<int, 16> NewMask;
5446 for (unsigned I = 0; I != NumElts; ++I) {
5447 int Idx = Mask[I];
5448 if (Idx < static_cast<int>(NumElts))
5449 NewMask.push_back(Elt: Idx);
5450 else
5451 NewMask.push_back(Elt: Idx - NumElts + WidenNumElts);
5452 }
5453 for (unsigned I = NumElts; I != WidenNumElts; ++I)
5454 NewMask.push_back(Elt: -1);
5455 moreElementsVectorDst(MI, WideTy: MoreTy, OpIdx: 0);
5456 MIRBuilder.setInstrAndDebugLoc(MI);
5457 MIRBuilder.buildShuffleVector(Res: MI.getOperand(i: 0).getReg(),
5458 Src1: MI.getOperand(i: 1).getReg(),
5459 Src2: MI.getOperand(i: 2).getReg(), Mask: NewMask);
5460 MI.eraseFromParent();
5461 return Legalized;
5462}
5463
5464void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5465 ArrayRef<Register> Src1Regs,
5466 ArrayRef<Register> Src2Regs,
5467 LLT NarrowTy) {
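  // Schoolbook multiplication on NarrowTy-sized pieces. For example, with two
  // parts per operand, (a1:a0) * (b1:b0) gives
  //   DstRegs[0] = mul(a0, b0)
  //   DstRegs[1] = mul(a1, b0) + mul(a0, b1) + umulh(a0, b0)
  // and carries from the intermediate additions propagate into higher result
  // parts when more than two parts are requested.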
5468 MachineIRBuilder &B = MIRBuilder;
5469 unsigned SrcParts = Src1Regs.size();
5470 unsigned DstParts = DstRegs.size();
5471
5472 unsigned DstIdx = 0; // Low bits of the result.
5473 Register FactorSum =
5474 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx], Src1: Src2Regs[DstIdx]).getReg(Idx: 0);
5475 DstRegs[DstIdx] = FactorSum;
5476
5477 unsigned CarrySumPrevDstIdx;
5478 SmallVector<Register, 4> Factors;
5479
5480 for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5481 // Collect low parts of muls for DstIdx.
5482 for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5483 i <= std::min(a: DstIdx, b: SrcParts - 1); ++i) {
5484 MachineInstrBuilder Mul =
5485 B.buildMul(Dst: NarrowTy, Src0: Src1Regs[DstIdx - i], Src1: Src2Regs[i]);
5486 Factors.push_back(Elt: Mul.getReg(Idx: 0));
5487 }
5488 // Collect high parts of muls from previous DstIdx.
5489 for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5490 i <= std::min(a: DstIdx - 1, b: SrcParts - 1); ++i) {
5491 MachineInstrBuilder Umulh =
5492 B.buildUMulH(Dst: NarrowTy, Src0: Src1Regs[DstIdx - 1 - i], Src1: Src2Regs[i]);
5493 Factors.push_back(Elt: Umulh.getReg(Idx: 0));
5494 }
5495 // Add CarrySum from additions calculated for previous DstIdx.
5496 if (DstIdx != 1) {
5497 Factors.push_back(Elt: CarrySumPrevDstIdx);
5498 }
5499
5500 Register CarrySum;
5501 // Add all factors and accumulate all carries into CarrySum.
5502 if (DstIdx != DstParts - 1) {
5503 MachineInstrBuilder Uaddo =
5504 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: Factors[0], Op1: Factors[1]);
5505 FactorSum = Uaddo.getReg(Idx: 0);
5506 CarrySum = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1)).getReg(Idx: 0);
5507 for (unsigned i = 2; i < Factors.size(); ++i) {
5508 MachineInstrBuilder Uaddo =
5509 B.buildUAddo(Res: NarrowTy, CarryOut: LLT::scalar(SizeInBits: 1), Op0: FactorSum, Op1: Factors[i]);
5510 FactorSum = Uaddo.getReg(Idx: 0);
5511 MachineInstrBuilder Carry = B.buildZExt(Res: NarrowTy, Op: Uaddo.getReg(Idx: 1));
5512 CarrySum = B.buildAdd(Dst: NarrowTy, Src0: CarrySum, Src1: Carry).getReg(Idx: 0);
5513 }
5514 } else {
5515 // Since the value for the next index is not calculated, neither is CarrySum.
5516 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: Factors[0], Src1: Factors[1]).getReg(Idx: 0);
5517 for (unsigned i = 2; i < Factors.size(); ++i)
5518 FactorSum = B.buildAdd(Dst: NarrowTy, Src0: FactorSum, Src1: Factors[i]).getReg(Idx: 0);
5519 }
5520
5521 CarrySumPrevDstIdx = CarrySum;
5522 DstRegs[DstIdx] = FactorSum;
5523 Factors.clear();
5524 }
5525}
5526
5527LegalizerHelper::LegalizeResult
5528LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5529 LLT NarrowTy) {
5530 if (TypeIdx != 0)
5531 return UnableToLegalize;
5532
5533 Register DstReg = MI.getOperand(i: 0).getReg();
5534 LLT DstType = MRI.getType(Reg: DstReg);
5535 // FIXME: add support for vector types
5536 if (DstType.isVector())
5537 return UnableToLegalize;
5538
5539 unsigned Opcode = MI.getOpcode();
5540 unsigned OpO, OpE, OpF;
5541 switch (Opcode) {
5542 case TargetOpcode::G_SADDO:
5543 case TargetOpcode::G_SADDE:
5544 case TargetOpcode::G_UADDO:
5545 case TargetOpcode::G_UADDE:
5546 case TargetOpcode::G_ADD:
5547 OpO = TargetOpcode::G_UADDO;
5548 OpE = TargetOpcode::G_UADDE;
5549 OpF = TargetOpcode::G_UADDE;
5550 if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5551 OpF = TargetOpcode::G_SADDE;
5552 break;
5553 case TargetOpcode::G_SSUBO:
5554 case TargetOpcode::G_SSUBE:
5555 case TargetOpcode::G_USUBO:
5556 case TargetOpcode::G_USUBE:
5557 case TargetOpcode::G_SUB:
5558 OpO = TargetOpcode::G_USUBO;
5559 OpE = TargetOpcode::G_USUBE;
5560 OpF = TargetOpcode::G_USUBE;
5561 if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5562 OpF = TargetOpcode::G_SSUBE;
5563 break;
5564 default:
5565 llvm_unreachable("Unexpected add/sub opcode!");
5566 }
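  // For example, narrowing an s64 G_ADD into s32 pieces emits a G_UADDO on
  // the low halves and a G_UADDE on the high halves that consumes the low
  // carry; for G_SADDO/G_SADDE the last piece uses G_SADDE so the signed
  // overflow flag is produced correctly.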
5567
5568 // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5569 unsigned NumDefs = MI.getNumExplicitDefs();
5570 Register Src1 = MI.getOperand(i: NumDefs).getReg();
5571 Register Src2 = MI.getOperand(i: NumDefs + 1).getReg();
5572 Register CarryDst, CarryIn;
5573 if (NumDefs == 2)
5574 CarryDst = MI.getOperand(i: 1).getReg();
5575 if (MI.getNumOperands() == NumDefs + 3)
5576 CarryIn = MI.getOperand(i: NumDefs + 2).getReg();
5577
5578 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5579 LLT LeftoverTy, DummyTy;
5580 SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5581 extractParts(Reg: Src1, RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: Src1Regs, LeftoverVRegs&: Src1Left,
5582 MIRBuilder, MRI);
5583 extractParts(Reg: Src2, RegTy, MainTy: NarrowTy, LeftoverTy&: DummyTy, VRegs&: Src2Regs, LeftoverVRegs&: Src2Left, MIRBuilder,
5584 MRI);
5585
5586 int NarrowParts = Src1Regs.size();
5587 for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5588 Src1Regs.push_back(Elt: Src1Left[I]);
5589 Src2Regs.push_back(Elt: Src2Left[I]);
5590 }
5591 DstRegs.reserve(N: Src1Regs.size());
5592
5593 for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5594 Register DstReg =
5595 MRI.createGenericVirtualRegister(Ty: MRI.getType(Reg: Src1Regs[i]));
5596 Register CarryOut = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: 1));
5597 // Forward the final carry-out to the destination register
5598 if (i == e - 1 && CarryDst)
5599 CarryOut = CarryDst;
5600
5601 if (!CarryIn) {
5602 MIRBuilder.buildInstr(Opc: OpO, DstOps: {DstReg, CarryOut},
5603 SrcOps: {Src1Regs[i], Src2Regs[i]});
5604 } else if (i == e - 1) {
5605 MIRBuilder.buildInstr(Opc: OpF, DstOps: {DstReg, CarryOut},
5606 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
5607 } else {
5608 MIRBuilder.buildInstr(Opc: OpE, DstOps: {DstReg, CarryOut},
5609 SrcOps: {Src1Regs[i], Src2Regs[i], CarryIn});
5610 }
5611
5612 DstRegs.push_back(Elt: DstReg);
5613 CarryIn = CarryOut;
5614 }
5615 insertParts(DstReg: MI.getOperand(i: 0).getReg(), ResultTy: RegTy, PartTy: NarrowTy,
5616 PartRegs: ArrayRef(DstRegs).take_front(N: NarrowParts), LeftoverTy,
5617 LeftoverRegs: ArrayRef(DstRegs).drop_front(N: NarrowParts));
5618
5619 MI.eraseFromParent();
5620 return Legalized;
5621}
5622
5623LegalizerHelper::LegalizeResult
5624LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5625 auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
5626
5627 LLT Ty = MRI.getType(Reg: DstReg);
5628 if (Ty.isVector())
5629 return UnableToLegalize;
5630
5631 unsigned Size = Ty.getSizeInBits();
5632 unsigned NarrowSize = NarrowTy.getSizeInBits();
5633 if (Size % NarrowSize != 0)
5634 return UnableToLegalize;
5635
5636 unsigned NumParts = Size / NarrowSize;
5637 bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5638 unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5639
5640 SmallVector<Register, 2> Src1Parts, Src2Parts;
5641 SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5642 extractParts(Reg: Src1, Ty: NarrowTy, NumParts, VRegs&: Src1Parts, MIRBuilder, MRI);
5643 extractParts(Reg: Src2, Ty: NarrowTy, NumParts, VRegs&: Src2Parts, MIRBuilder, MRI);
5644 multiplyRegisters(DstRegs&: DstTmpRegs, Src1Regs: Src1Parts, Src2Regs: Src2Parts, NarrowTy);
5645
5646 // Take only high half of registers if this is high mul.
5647 ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5648 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
5649 MI.eraseFromParent();
5650 return Legalized;
5651}
5652
5653LegalizerHelper::LegalizeResult
5654LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5655 LLT NarrowTy) {
5656 if (TypeIdx != 0)
5657 return UnableToLegalize;
5658
5659 bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5660
5661 Register Src = MI.getOperand(i: 1).getReg();
5662 LLT SrcTy = MRI.getType(Reg: Src);
5663
5664 // If all finite floats fit into the narrowed integer type, we can just swap
5665 // out the result type. This is practically only useful for conversions from
5666 // half to at least 16 bits, so just handle that one case.
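  // For example, a G_FPTOSI from s16 to s64 narrowed to s32: every finite
  // half value fits in a signed 32-bit integer, so the conversion can produce
  // s32 and the result is then sign-extended to s64.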
5667 if (SrcTy.getScalarType() != LLT::scalar(SizeInBits: 16) ||
5668 NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5669 return UnableToLegalize;
5670
5671 Observer.changingInstr(MI);
5672 narrowScalarDst(MI, NarrowTy, OpIdx: 0,
5673 ExtOpcode: IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5674 Observer.changedInstr(MI);
5675 return Legalized;
5676}
5677
5678LegalizerHelper::LegalizeResult
5679LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5680 LLT NarrowTy) {
5681 if (TypeIdx != 1)
5682 return UnableToLegalize;
5683
5684 uint64_t NarrowSize = NarrowTy.getSizeInBits();
5685
5686 int64_t SizeOp1 = MRI.getType(Reg: MI.getOperand(i: 1).getReg()).getSizeInBits();
5687 // FIXME: add support for when SizeOp1 isn't an exact multiple of
5688 // NarrowSize.
5689 if (SizeOp1 % NarrowSize != 0)
5690 return UnableToLegalize;
5691 int NumParts = SizeOp1 / NarrowSize;
5692
5693 SmallVector<Register, 2> SrcRegs, DstRegs;
5694 SmallVector<uint64_t, 2> Indexes;
5695 extractParts(Reg: MI.getOperand(i: 1).getReg(), Ty: NarrowTy, NumParts, VRegs&: SrcRegs,
5696 MIRBuilder, MRI);
5697
5698 Register OpReg = MI.getOperand(i: 0).getReg();
5699 uint64_t OpStart = MI.getOperand(i: 2).getImm();
5700 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
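  // For example, a G_EXTRACT of s16 at bit offset 24 from an s64 split into
  // s32 parts takes the top 8 bits of part 0 and the bottom 8 bits of part 1
  // and merges the two s8 pieces into the s16 result.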
5701 for (int i = 0; i < NumParts; ++i) {
5702 unsigned SrcStart = i * NarrowSize;
5703
5704 if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5705 // No part of the extract uses this subregister, ignore it.
5706 continue;
5707 } else if (SrcStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
5708 // The entire subregister is extracted, forward the value.
5709 DstRegs.push_back(Elt: SrcRegs[i]);
5710 continue;
5711 }
5712
5713 // OpSegStart is where this destination segment would start in OpReg if it
5714 // extended infinitely in both directions.
5715 int64_t ExtractOffset;
5716 uint64_t SegSize;
5717 if (OpStart < SrcStart) {
5718 ExtractOffset = 0;
5719 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - SrcStart);
5720 } else {
5721 ExtractOffset = OpStart - SrcStart;
5722 SegSize = std::min(a: SrcStart + NarrowSize - OpStart, b: OpSize);
5723 }
5724
5725 Register SegReg = SrcRegs[i];
5726 if (ExtractOffset != 0 || SegSize != NarrowSize) {
5727 // A genuine extract is needed.
5728 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
5729 MIRBuilder.buildExtract(Res: SegReg, Src: SrcRegs[i], Index: ExtractOffset);
5730 }
5731
5732 DstRegs.push_back(Elt: SegReg);
5733 }
5734
5735 Register DstReg = MI.getOperand(i: 0).getReg();
5736 if (MRI.getType(Reg: DstReg).isVector())
5737 MIRBuilder.buildBuildVector(Res: DstReg, Ops: DstRegs);
5738 else if (DstRegs.size() > 1)
5739 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
5740 else
5741 MIRBuilder.buildCopy(Res: DstReg, Op: DstRegs[0]);
5742 MI.eraseFromParent();
5743 return Legalized;
5744}
5745
5746LegalizerHelper::LegalizeResult
5747LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5748 LLT NarrowTy) {
5749 // FIXME: Don't know how to handle secondary types yet.
5750 if (TypeIdx != 0)
5751 return UnableToLegalize;
5752
5753 SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5754 SmallVector<uint64_t, 2> Indexes;
5755 LLT RegTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
5756 LLT LeftoverTy;
5757 extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy, MainTy: NarrowTy, LeftoverTy, VRegs&: SrcRegs,
5758 LeftoverVRegs&: LeftoverRegs, MIRBuilder, MRI);
5759
5760 for (Register Reg : LeftoverRegs)
5761 SrcRegs.push_back(Elt: Reg);
5762
5763 uint64_t NarrowSize = NarrowTy.getSizeInBits();
5764 Register OpReg = MI.getOperand(i: 2).getReg();
5765 uint64_t OpStart = MI.getOperand(i: 3).getImm();
5766 uint64_t OpSize = MRI.getType(Reg: OpReg).getSizeInBits();
5767 for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5768 unsigned DstStart = I * NarrowSize;
5769
5770 if (DstStart == OpStart && NarrowTy == MRI.getType(Reg: OpReg)) {
5771 // The entire subregister is defined by this insert, forward the new
5772 // value.
5773 DstRegs.push_back(Elt: OpReg);
5774 continue;
5775 }
5776
5777 Register SrcReg = SrcRegs[I];
5778 if (MRI.getType(Reg: SrcRegs[I]) == LeftoverTy) {
5779 // The leftover reg is smaller than NarrowTy, so we need to extend it.
5780 SrcReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
5781 MIRBuilder.buildAnyExt(Res: SrcReg, Op: SrcRegs[I]);
5782 }
5783
5784 if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5785 // No part of the insert affects this subregister, forward the original.
5786 DstRegs.push_back(Elt: SrcReg);
5787 continue;
5788 }
5789
5790 // OpSegStart is where this destination segment would start in OpReg if it
5791 // extended infinitely in both directions.
5792 int64_t ExtractOffset, InsertOffset;
5793 uint64_t SegSize;
5794 if (OpStart < DstStart) {
5795 InsertOffset = 0;
5796 ExtractOffset = DstStart - OpStart;
5797 SegSize = std::min(a: NarrowSize, b: OpStart + OpSize - DstStart);
5798 } else {
5799 InsertOffset = OpStart - DstStart;
5800 ExtractOffset = 0;
5801 SegSize =
5802 std::min(a: NarrowSize - InsertOffset, b: OpStart + OpSize - DstStart);
5803 }
5804
5805 Register SegReg = OpReg;
5806 if (ExtractOffset != 0 || SegSize != OpSize) {
5807 // A genuine extract is needed.
5808 SegReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: SegSize));
5809 MIRBuilder.buildExtract(Res: SegReg, Src: OpReg, Index: ExtractOffset);
5810 }
5811
5812 Register DstReg = MRI.createGenericVirtualRegister(Ty: NarrowTy);
5813 MIRBuilder.buildInsert(Res: DstReg, Src: SrcReg, Op: SegReg, Index: InsertOffset);
5814 DstRegs.push_back(Elt: DstReg);
5815 }
5816
5817 uint64_t WideSize = DstRegs.size() * NarrowSize;
5818 Register DstReg = MI.getOperand(i: 0).getReg();
5819 if (WideSize > RegTy.getSizeInBits()) {
5820 Register MergeReg = MRI.createGenericVirtualRegister(Ty: LLT::scalar(SizeInBits: WideSize));
5821 MIRBuilder.buildMergeLikeInstr(Res: MergeReg, Ops: DstRegs);
5822 MIRBuilder.buildTrunc(Res: DstReg, Op: MergeReg);
5823 } else
5824 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: DstRegs);
5825
5826 MI.eraseFromParent();
5827 return Legalized;
5828}
5829
5830LegalizerHelper::LegalizeResult
5831LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5832 LLT NarrowTy) {
5833 Register DstReg = MI.getOperand(i: 0).getReg();
5834 LLT DstTy = MRI.getType(Reg: DstReg);
5835
5836 assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5837
5838 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5839 SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5840 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5841 LLT LeftoverTy;
5842 if (!extractParts(Reg: MI.getOperand(i: 1).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
5843 VRegs&: Src0Regs, LeftoverVRegs&: Src0LeftoverRegs, MIRBuilder, MRI))
5844 return UnableToLegalize;
5845
5846 LLT Unused;
5847 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
5848 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
5849 llvm_unreachable("inconsistent extractParts result");
5850
5851 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5852 auto Inst = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {NarrowTy},
5853 SrcOps: {Src0Regs[I], Src1Regs[I]});
5854 DstRegs.push_back(Elt: Inst.getReg(Idx: 0));
5855 }
5856
5857 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5858 auto Inst = MIRBuilder.buildInstr(
5859 Opc: MI.getOpcode(),
5860 DstOps: {LeftoverTy}, SrcOps: {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5861 DstLeftoverRegs.push_back(Elt: Inst.getReg(Idx: 0));
5862 }
5863
5864 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
5865 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
5866
5867 MI.eraseFromParent();
5868 return Legalized;
5869}
5870
5871LegalizerHelper::LegalizeResult
5872LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5873 LLT NarrowTy) {
5874 if (TypeIdx != 0)
5875 return UnableToLegalize;
5876
5877 auto [DstReg, SrcReg] = MI.getFirst2Regs();
5878
5879 LLT DstTy = MRI.getType(Reg: DstReg);
5880 if (DstTy.isVector())
5881 return UnableToLegalize;
5882
5883 SmallVector<Register, 8> Parts;
5884 LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5885 LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, VRegs&: Parts, PadStrategy: MI.getOpcode());
5886 buildWidenedRemergeToDst(DstReg, LCMTy, RemergeRegs: Parts);
5887
5888 MI.eraseFromParent();
5889 return Legalized;
5890}
5891
5892LegalizerHelper::LegalizeResult
5893LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5894 LLT NarrowTy) {
5895 if (TypeIdx != 0)
5896 return UnableToLegalize;
5897
5898 Register CondReg = MI.getOperand(i: 1).getReg();
5899 LLT CondTy = MRI.getType(Reg: CondReg);
5900 if (CondTy.isVector()) // TODO: Handle vselect
5901 return UnableToLegalize;
5902
5903 Register DstReg = MI.getOperand(i: 0).getReg();
5904 LLT DstTy = MRI.getType(Reg: DstReg);
5905
5906 SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5907 SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5908 SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5909 LLT LeftoverTy;
5910 if (!extractParts(Reg: MI.getOperand(i: 2).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy,
5911 VRegs&: Src1Regs, LeftoverVRegs&: Src1LeftoverRegs, MIRBuilder, MRI))
5912 return UnableToLegalize;
5913
5914 LLT Unused;
5915 if (!extractParts(Reg: MI.getOperand(i: 3).getReg(), RegTy: DstTy, MainTy: NarrowTy, LeftoverTy&: Unused,
5916 VRegs&: Src2Regs, LeftoverVRegs&: Src2LeftoverRegs, MIRBuilder, MRI))
5917 llvm_unreachable("inconsistent extractParts result");
5918
5919 for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5920 auto Select = MIRBuilder.buildSelect(Res: NarrowTy,
5921 Tst: CondReg, Op0: Src1Regs[I], Op1: Src2Regs[I]);
5922 DstRegs.push_back(Elt: Select.getReg(Idx: 0));
5923 }
5924
5925 for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5926 auto Select = MIRBuilder.buildSelect(
5927 Res: LeftoverTy, Tst: CondReg, Op0: Src1LeftoverRegs[I], Op1: Src2LeftoverRegs[I]);
5928 DstLeftoverRegs.push_back(Elt: Select.getReg(Idx: 0));
5929 }
5930
5931 insertParts(DstReg, ResultTy: DstTy, PartTy: NarrowTy, PartRegs: DstRegs,
5932 LeftoverTy, LeftoverRegs: DstLeftoverRegs);
5933
5934 MI.eraseFromParent();
5935 return Legalized;
5936}
5937
5938LegalizerHelper::LegalizeResult
5939LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5940 LLT NarrowTy) {
5941 if (TypeIdx != 1)
5942 return UnableToLegalize;
5943
5944 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5945 unsigned NarrowSize = NarrowTy.getSizeInBits();
5946
5947 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5948 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5949
5950 MachineIRBuilder &B = MIRBuilder;
5951 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
5952 // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
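    // As a hedged illustration (not part of the emitted MIR), for a 64-bit
    // source narrowed with a 32-bit NarrowTy this behaves like the C sketch
    // below, where ctlz32 stands in for the narrow G_CTLZ:
    //   unsigned ctlz64(uint32_t Hi, uint32_t Lo) {
    //     return Hi == 0 ? 32 + ctlz32(Lo) : ctlz32(Hi);
    //   }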
5953 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
5954 auto HiIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
5955 Op0: UnmergeSrc.getReg(Idx: 1), Op1: C_0);
5956 auto LoCTLZ = IsUndef ?
5957 B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0)) :
5958 B.buildCTLZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
5959 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
5960 auto HiIsZeroCTLZ = B.buildAdd(Dst: DstTy, Src0: LoCTLZ, Src1: C_NarrowSize);
5961 auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
5962 B.buildSelect(Res: DstReg, Tst: HiIsZero, Op0: HiIsZeroCTLZ, Op1: HiCTLZ);
5963
5964 MI.eraseFromParent();
5965 return Legalized;
5966 }
5967
5968 return UnableToLegalize;
5969}
5970
5971LegalizerHelper::LegalizeResult
5972LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5973 LLT NarrowTy) {
5974 if (TypeIdx != 1)
5975 return UnableToLegalize;
5976
5977 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5978 unsigned NarrowSize = NarrowTy.getSizeInBits();
5979
5980 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5981 const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5982
5983 MachineIRBuilder &B = MIRBuilder;
5984 auto UnmergeSrc = B.buildUnmerge(Res: NarrowTy, Op: SrcReg);
5985 // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5986 auto C_0 = B.buildConstant(Res: NarrowTy, Val: 0);
5987 auto LoIsZero = B.buildICmp(Pred: CmpInst::ICMP_EQ, Res: LLT::scalar(SizeInBits: 1),
5988 Op0: UnmergeSrc.getReg(Idx: 0), Op1: C_0);
5989 auto HiCTTZ = IsUndef ?
5990 B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1)) :
5991 B.buildCTTZ(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
5992 auto C_NarrowSize = B.buildConstant(Res: DstTy, Val: NarrowSize);
5993 auto LoIsZeroCTTZ = B.buildAdd(Dst: DstTy, Src0: HiCTTZ, Src1: C_NarrowSize);
5994 auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
5995 B.buildSelect(Res: DstReg, Tst: LoIsZero, Op0: LoIsZeroCTTZ, Op1: LoCTTZ);
5996
5997 MI.eraseFromParent();
5998 return Legalized;
5999 }
6000
6001 return UnableToLegalize;
6002}
6003
6004LegalizerHelper::LegalizeResult
6005LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
6006 LLT NarrowTy) {
6007 if (TypeIdx != 1)
6008 return UnableToLegalize;
6009
6010 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6011 unsigned NarrowSize = NarrowTy.getSizeInBits();
6012
6013 if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6014 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: NarrowTy, Op: MI.getOperand(i: 1));
6015
6016 auto LoCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 0));
6017 auto HiCTPOP = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: UnmergeSrc.getReg(Idx: 1));
6018 MIRBuilder.buildAdd(Dst: DstReg, Src0: HiCTPOP, Src1: LoCTPOP);
6019
6020 MI.eraseFromParent();
6021 return Legalized;
6022 }
6023
6024 return UnableToLegalize;
6025}
6026
6027LegalizerHelper::LegalizeResult
6028LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
6029 LLT NarrowTy) {
6030 if (TypeIdx != 1)
6031 return UnableToLegalize;
6032
6033 MachineIRBuilder &B = MIRBuilder;
6034 Register ExpReg = MI.getOperand(i: 2).getReg();
6035 LLT ExpTy = MRI.getType(Reg: ExpReg);
6036
6037 unsigned ClampSize = NarrowTy.getScalarSizeInBits();
6038
6039 // Clamp the exponent to the range of the target type.
6040 auto MinExp = B.buildConstant(Res: ExpTy, Val: minIntN(N: ClampSize));
6041 auto ClampMin = B.buildSMax(Dst: ExpTy, Src0: ExpReg, Src1: MinExp);
6042 auto MaxExp = B.buildConstant(Res: ExpTy, Val: maxIntN(N: ClampSize));
6043 auto Clamp = B.buildSMin(Dst: ExpTy, Src0: ClampMin, Src1: MaxExp);
6044
6045 auto Trunc = B.buildTrunc(Res: NarrowTy, Op: Clamp);
6046 Observer.changingInstr(MI);
6047 MI.getOperand(i: 2).setReg(Trunc.getReg(Idx: 0));
6048 Observer.changedInstr(MI);
6049 return Legalized;
6050}
6051
6052LegalizerHelper::LegalizeResult
6053LegalizerHelper::lowerBitCount(MachineInstr &MI) {
6054 unsigned Opc = MI.getOpcode();
6055 const auto &TII = MIRBuilder.getTII();
6056 auto isSupported = [this](const LegalityQuery &Q) {
6057 auto QAction = LI.getAction(Query: Q).Action;
6058 return QAction == Legal || QAction == Libcall || QAction == Custom;
6059 };
6060 switch (Opc) {
6061 default:
6062 return UnableToLegalize;
6063 case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
6064 // This trivially expands to CTLZ.
6065 Observer.changingInstr(MI);
6066 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTLZ));
6067 Observer.changedInstr(MI);
6068 return Legalized;
6069 }
6070 case TargetOpcode::G_CTLZ: {
6071 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6072 unsigned Len = SrcTy.getSizeInBits();
6073
6074 if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
6075 // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
6076 auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
6077 auto ZeroSrc = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
6078 auto ICmp = MIRBuilder.buildICmp(
6079 Pred: CmpInst::ICMP_EQ, Res: SrcTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: ZeroSrc);
6080 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
6081 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CtlzZU);
6082 MI.eraseFromParent();
6083 return Legalized;
6084 }
6085 // for now, we do this:
6086 // NewLen = NextPowerOf2(Len);
6087 // x = x | (x >> 1);
6088 // x = x | (x >> 2);
6089 // ...
6090 // x = x | (x >>16);
6091 // x = x | (x >>32); // for 64-bit input
    // Up to NewLen/2
6093 // return Len - popcount(x);
6094 //
6095 // Ref: "Hacker's Delight" by Henry Warren
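    // As a hedged illustration for Len == 32, with popcount32 standing in for
    // the G_CTPOP emitted below:
    //   unsigned ctlz32(uint32_t x) {
    //     x |= x >> 1;  x |= x >> 2;  x |= x >> 4;
    //     x |= x >> 8;  x |= x >> 16;
    //     return 32 - popcount32(x);
    //   }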
6096 Register Op = SrcReg;
6097 unsigned NewLen = PowerOf2Ceil(A: Len);
6098 for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
6099 auto MIBShiftAmt = MIRBuilder.buildConstant(Res: SrcTy, Val: 1ULL << i);
6100 auto MIBOp = MIRBuilder.buildOr(
6101 Dst: SrcTy, Src0: Op, Src1: MIRBuilder.buildLShr(Dst: SrcTy, Src0: Op, Src1: MIBShiftAmt));
6102 Op = MIBOp.getReg(Idx: 0);
6103 }
6104 auto MIBPop = MIRBuilder.buildCTPOP(Dst: DstTy, Src0: Op);
6105 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIRBuilder.buildConstant(Res: DstTy, Val: Len),
6106 Src1: MIBPop);
6107 MI.eraseFromParent();
6108 return Legalized;
6109 }
6110 case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
6111 // This trivially expands to CTTZ.
6112 Observer.changingInstr(MI);
6113 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTTZ));
6114 Observer.changedInstr(MI);
6115 return Legalized;
6116 }
6117 case TargetOpcode::G_CTTZ: {
6118 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6119
6120 unsigned Len = SrcTy.getSizeInBits();
6121 if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
    // If CTTZ_ZERO_UNDEF is supported, emit that and a select for zero.
6124 auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(Dst: DstTy, Src0: SrcReg);
6125 auto Zero = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
6126 auto ICmp = MIRBuilder.buildICmp(
6127 Pred: CmpInst::ICMP_EQ, Res: DstTy.changeElementSize(NewEltSize: 1), Op0: SrcReg, Op1: Zero);
6128 auto LenConst = MIRBuilder.buildConstant(Res: DstTy, Val: Len);
6129 MIRBuilder.buildSelect(Res: DstReg, Tst: ICmp, Op0: LenConst, Op1: CttzZU);
6130 MI.eraseFromParent();
6131 return Legalized;
6132 }
    // for now, we use: { return popcount(~x & (x - 1)); }
    // unless the target has ctlz but not ctpop, in which case we use:
    // { return Len - ctlz(~x & (x - 1)); }
6136 // Ref: "Hacker's Delight" by Henry Warren
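    // A hedged C sketch of the ctpop-based form for a 32-bit input, where
    // popcount32 stands in for the G_CTPOP built below:
    //   unsigned cttz32(uint32_t x) {
    //     return popcount32(~x & (x - 1)); // ~x & (x-1) keeps trailing zeros
    //   }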
6137 auto MIBCstNeg1 = MIRBuilder.buildConstant(Res: SrcTy, Val: -1);
6138 auto MIBNot = MIRBuilder.buildXor(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1);
6139 auto MIBTmp = MIRBuilder.buildAnd(
6140 Dst: SrcTy, Src0: MIBNot, Src1: MIRBuilder.buildAdd(Dst: SrcTy, Src0: SrcReg, Src1: MIBCstNeg1));
6141 if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
6142 isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
6143 auto MIBCstLen = MIRBuilder.buildConstant(Res: SrcTy, Val: Len);
6144 MIRBuilder.buildSub(Dst: MI.getOperand(i: 0), Src0: MIBCstLen,
6145 Src1: MIRBuilder.buildCTLZ(Dst: SrcTy, Src0: MIBTmp));
6146 MI.eraseFromParent();
6147 return Legalized;
6148 }
6149 Observer.changingInstr(MI);
6150 MI.setDesc(TII.get(Opcode: TargetOpcode::G_CTPOP));
6151 MI.getOperand(i: 1).setReg(MIBTmp.getReg(Idx: 0));
6152 Observer.changedInstr(MI);
6153 return Legalized;
6154 }
6155 case TargetOpcode::G_CTPOP: {
6156 Register SrcReg = MI.getOperand(i: 1).getReg();
6157 LLT Ty = MRI.getType(Reg: SrcReg);
6158 unsigned Size = Ty.getSizeInBits();
6159 MachineIRBuilder &B = MIRBuilder;
6160
    // Count set bits in blocks of 2 bits. The default approach would be
    // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 },
    // but we use the following formula instead:
    // B2Count = val - { (val >> 1) & 0x55555555 }
    // since it gives the same result in blocks of 2 with one instruction fewer.
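    // Taken together, the steps below amount to the familiar SWAR popcount,
    // shown here for a 32-bit value as a hedged illustration only:
    //   unsigned popcount32(uint32_t v) {
    //     v = v - ((v >> 1) & 0x55555555);
    //     v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
    //     v = (v + (v >> 4)) & 0x0F0F0F0F;
    //     return (v * 0x01010101) >> 24;  // Size - 8 == 24 for 32 bits
    //   }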
6166 auto C_1 = B.buildConstant(Res: Ty, Val: 1);
6167 auto B2Set1LoTo1Hi = B.buildLShr(Dst: Ty, Src0: SrcReg, Src1: C_1);
6168 APInt B2Mask1HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x55));
6169 auto C_B2Mask1HiTo0 = B.buildConstant(Res: Ty, Val: B2Mask1HiTo0);
6170 auto B2Count1Hi = B.buildAnd(Dst: Ty, Src0: B2Set1LoTo1Hi, Src1: C_B2Mask1HiTo0);
6171 auto B2Count = B.buildSub(Dst: Ty, Src0: SrcReg, Src1: B2Count1Hi);
6172
    // To get the count in blocks of 4, add the values from adjacent blocks
    // of 2.
6174 // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
6175 auto C_2 = B.buildConstant(Res: Ty, Val: 2);
6176 auto B4Set2LoTo2Hi = B.buildLShr(Dst: Ty, Src0: B2Count, Src1: C_2);
6177 APInt B4Mask2HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x33));
6178 auto C_B4Mask2HiTo0 = B.buildConstant(Res: Ty, Val: B4Mask2HiTo0);
6179 auto B4HiB2Count = B.buildAnd(Dst: Ty, Src0: B4Set2LoTo2Hi, Src1: C_B4Mask2HiTo0);
6180 auto B4LoB2Count = B.buildAnd(Dst: Ty, Src0: B2Count, Src1: C_B4Mask2HiTo0);
6181 auto B4Count = B.buildAdd(Dst: Ty, Src0: B4HiB2Count, Src1: B4LoB2Count);
6182
    // For the count in blocks of 8 bits we don't have to mask the high 4 bits
    // before the addition, since each count sits in the range {0,...,8} and 4
    // bits are enough to hold such values. After the addition the high 4 bits
    // still hold the count of set bits in the high 4-bit block; clear them to
    // get the 8-bit result.
6187 // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
6188 auto C_4 = B.buildConstant(Res: Ty, Val: 4);
6189 auto B8HiB4Count = B.buildLShr(Dst: Ty, Src0: B4Count, Src1: C_4);
6190 auto B8CountDirty4Hi = B.buildAdd(Dst: Ty, Src0: B8HiB4Count, Src1: B4Count);
6191 APInt B8Mask4HiTo0 = APInt::getSplat(NewLen: Size, V: APInt(8, 0x0F));
6192 auto C_B8Mask4HiTo0 = B.buildConstant(Res: Ty, Val: B8Mask4HiTo0);
6193 auto B8Count = B.buildAnd(Dst: Ty, Src0: B8CountDirty4Hi, Src1: C_B8Mask4HiTo0);
6194
    assert(Size <= 128 &&
           "Scalar size is too large for CTPOP lower algorithm");
    // 8 bits can hold the CTPOP result of a 128-bit int or smaller. Multiplying
    // by this mask sets the 8 MSBs of ResTmp to the sum of all 8-bit B8Count
    // blocks.
6198 auto MulMask = B.buildConstant(Res: Ty, Val: APInt::getSplat(NewLen: Size, V: APInt(8, 0x01)));
6199 auto ResTmp = B.buildMul(Dst: Ty, Src0: B8Count, Src1: MulMask);
6200
6201 // Shift count result from 8 high bits to low bits.
6202 auto C_SizeM8 = B.buildConstant(Res: Ty, Val: Size - 8);
6203 B.buildLShr(Dst: MI.getOperand(i: 0).getReg(), Src0: ResTmp, Src1: C_SizeM8);
6204
6205 MI.eraseFromParent();
6206 return Legalized;
6207 }
6208 }
6209}
6210
6211// Check that (every element of) Reg is undef or not an exact multiple of BW.
6212static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
6213 Register Reg, unsigned BW) {
6214 return matchUnaryPredicate(
6215 MRI, Reg,
6216 Match: [=](const Constant *C) {
6217 // Null constant here means an undef.
6218 const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(Val: C);
6219 return !CI || CI->getValue().urem(RHS: BW) != 0;
6220 },
6221 /*AllowUndefs*/ true);
6222}
6223
6224LegalizerHelper::LegalizeResult
6225LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
6226 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6227 LLT Ty = MRI.getType(Reg: Dst);
6228 LLT ShTy = MRI.getType(Reg: Z);
6229
6230 unsigned BW = Ty.getScalarSizeInBits();
6231
6232 if (!isPowerOf2_32(Value: BW))
6233 return UnableToLegalize;
6234
6235 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6236 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6237
6238 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
6239 // fshl X, Y, Z -> fshr X, Y, -Z
6240 // fshr X, Y, Z -> fshl X, Y, -Z
6241 auto Zero = MIRBuilder.buildConstant(Res: ShTy, Val: 0);
    Z = MIRBuilder.buildSub(Dst: ShTy, Src0: Zero, Src1: Z).getReg(Idx: 0);
6243 } else {
6244 // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
6245 // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
6246 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
6247 if (IsFSHL) {
6248 Y = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
6249 X = MIRBuilder.buildLShr(Dst: Ty, Src0: X, Src1: One).getReg(Idx: 0);
6250 } else {
6251 X = MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Ty}, SrcOps: {X, Y, One}).getReg(Idx: 0);
6252 Y = MIRBuilder.buildShl(Dst: Ty, Src0: Y, Src1: One).getReg(Idx: 0);
6253 }
6254
6255 Z = MIRBuilder.buildNot(Dst: ShTy, Src0: Z).getReg(Idx: 0);
6256 }
6257
6258 MIRBuilder.buildInstr(Opc: RevOpcode, DstOps: {Dst}, SrcOps: {X, Y, Z});
6259 MI.eraseFromParent();
6260 return Legalized;
6261}
6262
6263LegalizerHelper::LegalizeResult
6264LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
6265 auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6266 LLT Ty = MRI.getType(Reg: Dst);
6267 LLT ShTy = MRI.getType(Reg: Z);
6268
6269 const unsigned BW = Ty.getScalarSizeInBits();
6270 const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6271
6272 Register ShX, ShY;
6273 Register ShAmt, InvShAmt;
6274
6275 // FIXME: Emit optimized urem by constant instead of letting it expand later.
6276 if (isNonZeroModBitWidthOrUndef(MRI, Reg: Z, BW)) {
6277 // fshl: X << C | Y >> (BW - C)
6278 // fshr: X << (BW - C) | Y >> C
6279 // where C = Z % BW is not zero
6280 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
6281 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
6282 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: BitWidthC, Src1: ShAmt).getReg(Idx: 0);
6283 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: IsFSHL ? ShAmt : InvShAmt).getReg(Idx: 0);
6284 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: IsFSHL ? InvShAmt : ShAmt).getReg(Idx: 0);
6285 } else {
6286 // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
6287 // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
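    // As a hedged C sketch of this path for 32-bit G_FSHL (BW == 32 is a power
    // of two, so Z % BW becomes Z & 31):
    //   uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
    //     unsigned C = Z & 31;
    //     return (X << C) | ((Y >> 1) >> (31 - C));
    //   }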
6288 auto Mask = MIRBuilder.buildConstant(Res: ShTy, Val: BW - 1);
6289 if (isPowerOf2_32(Value: BW)) {
6290 // Z % BW -> Z & (BW - 1)
6291 ShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: Z, Src1: Mask).getReg(Idx: 0);
6292 // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
6293 auto NotZ = MIRBuilder.buildNot(Dst: ShTy, Src0: Z);
6294 InvShAmt = MIRBuilder.buildAnd(Dst: ShTy, Src0: NotZ, Src1: Mask).getReg(Idx: 0);
6295 } else {
6296 auto BitWidthC = MIRBuilder.buildConstant(Res: ShTy, Val: BW);
6297 ShAmt = MIRBuilder.buildURem(Dst: ShTy, Src0: Z, Src1: BitWidthC).getReg(Idx: 0);
6298 InvShAmt = MIRBuilder.buildSub(Dst: ShTy, Src0: Mask, Src1: ShAmt).getReg(Idx: 0);
6299 }
6300
6301 auto One = MIRBuilder.buildConstant(Res: ShTy, Val: 1);
6302 if (IsFSHL) {
6303 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: ShAmt).getReg(Idx: 0);
6304 auto ShY1 = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: One);
6305 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: ShY1, Src1: InvShAmt).getReg(Idx: 0);
6306 } else {
6307 auto ShX1 = MIRBuilder.buildShl(Dst: Ty, Src0: X, Src1: One);
6308 ShX = MIRBuilder.buildShl(Dst: Ty, Src0: ShX1, Src1: InvShAmt).getReg(Idx: 0);
6309 ShY = MIRBuilder.buildLShr(Dst: Ty, Src0: Y, Src1: ShAmt).getReg(Idx: 0);
6310 }
6311 }
6312
6313 MIRBuilder.buildOr(Dst, Src0: ShX, Src1: ShY);
6314 MI.eraseFromParent();
6315 return Legalized;
6316}
6317
6318LegalizerHelper::LegalizeResult
6319LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6320 // These operations approximately do the following (while avoiding undefined
6321 // shifts by BW):
6322 // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6323 // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6324 Register Dst = MI.getOperand(i: 0).getReg();
6325 LLT Ty = MRI.getType(Reg: Dst);
6326 LLT ShTy = MRI.getType(Reg: MI.getOperand(i: 3).getReg());
6327
6328 bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6329 unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6330
6331 // TODO: Use smarter heuristic that accounts for vector legalization.
6332 if (LI.getAction(Query: {RevOpcode, {Ty, ShTy}}).Action == Lower)
6333 return lowerFunnelShiftAsShifts(MI);
6334
6335 // This only works for powers of 2, fallback to shifts if it fails.
6336 LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6337 if (Result == UnableToLegalize)
6338 return lowerFunnelShiftAsShifts(MI);
6339 return Result;
6340}
6341
6342LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
6343 auto [Dst, Src] = MI.getFirst2Regs();
6344 LLT DstTy = MRI.getType(Reg: Dst);
6345 LLT SrcTy = MRI.getType(Reg: Src);
6346
6347 uint32_t DstTySize = DstTy.getSizeInBits();
6348 uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
6349 uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
6350
6351 if (!isPowerOf2_32(Value: DstTySize) || !isPowerOf2_32(Value: DstTyScalarSize) ||
6352 !isPowerOf2_32(Value: SrcTyScalarSize))
6353 return UnableToLegalize;
6354
  // The step between the extends is too large; split it by creating an
  // intermediate extend instruction.
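  // As a hedged illustration of one splitting step, a possible sequence for
  // %res(<4 x s64>) = G_ZEXT %in(<4 x s8>) would be:
  //   %mid(<4 x s16>) = G_ZEXT %in(<4 x s8>)
  //   %lo(<2 x s16>), %hi(<2 x s16>) = G_UNMERGE_VALUES %mid
  //   %lo64(<2 x s64>) = G_ZEXT %lo
  //   %hi64(<2 x s64>) = G_ZEXT %hi
  //   %res(<4 x s64>) = G_CONCAT_VECTORS %lo64, %hi64
  // with the remaining wide extends legalized further on later iterations.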
6357 if (SrcTyScalarSize * 2 < DstTyScalarSize) {
6358 LLT MidTy = SrcTy.changeElementSize(NewEltSize: SrcTyScalarSize * 2);
    // If the destination type is illegal, split the extend into multiple
    // instructions:
    // ext x -> merge(ext(unmerge(ext x to MidTy)))
6361 auto NewExt = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {MidTy}, SrcOps: {Src});
6362 // Unmerge the vector
6363 LLT EltTy = MidTy.changeElementCount(
6364 EC: MidTy.getElementCount().divideCoefficientBy(RHS: 2));
6365 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: NewExt);
6366
6367 // ZExt the vectors
6368 LLT ZExtResTy = DstTy.changeElementCount(
6369 EC: DstTy.getElementCount().divideCoefficientBy(RHS: 2));
6370 auto ZExtRes1 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
6371 SrcOps: {UnmergeSrc.getReg(Idx: 0)});
6372 auto ZExtRes2 = MIRBuilder.buildInstr(Opc: MI.getOpcode(), DstOps: {ZExtResTy},
6373 SrcOps: {UnmergeSrc.getReg(Idx: 1)});
6374
6375 // Merge the ending vectors
6376 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: {ZExtRes1, ZExtRes2});
6377
6378 MI.eraseFromParent();
6379 return Legalized;
6380 }
6381 return UnableToLegalize;
6382}
6383
6384LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
6386 MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  // Similar to how operand splitting is done in SelectionDAG, we can handle
  // %res(<8 x s8>) = G_TRUNC %in(<8 x s32>) by generating:
  //   %inlo(<4 x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
6390 // %lo16(<4 x s16>) = G_TRUNC %inlo
6391 // %hi16(<4 x s16>) = G_TRUNC %inhi
6392 // %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
6393 // %res(<8 x s8>) = G_TRUNC %in16
6394
6395 assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
6396
6397 Register DstReg = MI.getOperand(i: 0).getReg();
6398 Register SrcReg = MI.getOperand(i: 1).getReg();
6399 LLT DstTy = MRI.getType(Reg: DstReg);
6400 LLT SrcTy = MRI.getType(Reg: SrcReg);
6401
6402 if (DstTy.isVector() && isPowerOf2_32(Value: DstTy.getNumElements()) &&
6403 isPowerOf2_32(Value: DstTy.getScalarSizeInBits()) &&
6404 isPowerOf2_32(Value: SrcTy.getNumElements()) &&
6405 isPowerOf2_32(Value: SrcTy.getScalarSizeInBits())) {
6406 // Split input type.
6407 LLT SplitSrcTy = SrcTy.changeElementCount(
6408 EC: SrcTy.getElementCount().divideCoefficientBy(RHS: 2));
6409
6410 // First, split the source into two smaller vectors.
6411 SmallVector<Register, 2> SplitSrcs;
6412 extractParts(Reg: SrcReg, Ty: SplitSrcTy, NumParts: 2, VRegs&: SplitSrcs, MIRBuilder, MRI);
6413
6414 // Truncate the splits into intermediate narrower elements.
6415 LLT InterTy;
6416 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6417 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits() * 2);
6418 else
6419 InterTy = SplitSrcTy.changeElementSize(NewEltSize: DstTy.getScalarSizeInBits());
6420 for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
6421 SplitSrcs[I] = MIRBuilder.buildTrunc(Res: InterTy, Op: SplitSrcs[I]).getReg(Idx: 0);
6422 }
6423
6424 // Combine the new truncates into one vector
6425 auto Merge = MIRBuilder.buildMergeLikeInstr(
6426 Res: DstTy.changeElementSize(NewEltSize: InterTy.getScalarSizeInBits()), Ops: SplitSrcs);
6427
6428 // Truncate the new vector to the final result type
6429 if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6430 MIRBuilder.buildTrunc(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
6431 else
6432 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0).getReg(), Op: Merge.getReg(Idx: 0));
6433
6434 MI.eraseFromParent();
6435
6436 return Legalized;
6437 }
6438 return UnableToLegalize;
6439}
6440
6441LegalizerHelper::LegalizeResult
6442LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6443 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6444 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
6445 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6446 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6447 auto Neg = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
6448 MIRBuilder.buildInstr(Opc: RevRot, DstOps: {Dst}, SrcOps: {Src, Neg});
6449 MI.eraseFromParent();
6450 return Legalized;
6451}
6452
6453LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6454 auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6455
6456 unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6457 bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6458
6459 MIRBuilder.setInstrAndDebugLoc(MI);
6460
6461 // If a rotate in the other direction is supported, use it.
6462 unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6463 if (LI.isLegalOrCustom(Query: {RevRot, {DstTy, SrcTy}}) &&
6464 isPowerOf2_32(Value: EltSizeInBits))
6465 return lowerRotateWithReverseRotate(MI);
6466
6467 // If a funnel shift is supported, use it.
6468 unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6469 unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6470 bool IsFShLegal = false;
6471 if ((IsFShLegal = LI.isLegalOrCustom(Query: {FShOpc, {DstTy, AmtTy}})) ||
6472 LI.isLegalOrCustom(Query: {RevFsh, {DstTy, AmtTy}})) {
6473 auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6474 Register R3) {
6475 MIRBuilder.buildInstr(Opc, DstOps: {R1}, SrcOps: {R2, R2, R3});
6476 MI.eraseFromParent();
6477 return Legalized;
6478 };
6479 // If a funnel shift in the other direction is supported, use it.
6480 if (IsFShLegal) {
6481 return buildFunnelShift(FShOpc, Dst, Src, Amt);
6482 } else if (isPowerOf2_32(Value: EltSizeInBits)) {
      Amt = MIRBuilder.buildNeg(Dst: AmtTy, Src0: Amt).getReg(Idx: 0);
6484 return buildFunnelShift(RevFsh, Dst, Src, Amt);
6485 }
6486 }
6487
6488 auto Zero = MIRBuilder.buildConstant(Res: AmtTy, Val: 0);
6489 unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6490 unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6491 auto BitWidthMinusOneC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits - 1);
6492 Register ShVal;
6493 Register RevShiftVal;
6494 if (isPowerOf2_32(Value: EltSizeInBits)) {
6495 // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6496 // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
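    // e.g. for a 32-bit G_ROTL this is the familiar branch-free form (a hedged
    // illustration, not the emitted MIR itself):
    //   uint32_t rotl32(uint32_t x, uint32_t c) {
    //     return (x << (c & 31)) | (x >> (-c & 31));
    //   }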
6497 auto NegAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: Zero, Src1: Amt);
6498 auto ShAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: Amt, Src1: BitWidthMinusOneC);
6499 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
6500 auto RevAmt = MIRBuilder.buildAnd(Dst: AmtTy, Src0: NegAmt, Src1: BitWidthMinusOneC);
6501 RevShiftVal =
6502 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, RevAmt}).getReg(Idx: 0);
6503 } else {
6504 // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6505 // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6506 auto BitWidthC = MIRBuilder.buildConstant(Res: AmtTy, Val: EltSizeInBits);
6507 auto ShAmt = MIRBuilder.buildURem(Dst: AmtTy, Src0: Amt, Src1: BitWidthC);
6508 ShVal = MIRBuilder.buildInstr(Opc: ShOpc, DstOps: {DstTy}, SrcOps: {Src, ShAmt}).getReg(Idx: 0);
6509 auto RevAmt = MIRBuilder.buildSub(Dst: AmtTy, Src0: BitWidthMinusOneC, Src1: ShAmt);
6510 auto One = MIRBuilder.buildConstant(Res: AmtTy, Val: 1);
6511 auto Inner = MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Src, One});
6512 RevShiftVal =
6513 MIRBuilder.buildInstr(Opc: RevShiftOpc, DstOps: {DstTy}, SrcOps: {Inner, RevAmt}).getReg(Idx: 0);
6514 }
6515 MIRBuilder.buildOr(Dst, Src0: ShVal, Src1: RevShiftVal);
6516 MI.eraseFromParent();
6517 return Legalized;
6518}
6519
6520// Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6521// representation.
6522LegalizerHelper::LegalizeResult
6523LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6524 auto [Dst, Src] = MI.getFirst2Regs();
6525 const LLT S64 = LLT::scalar(SizeInBits: 64);
6526 const LLT S32 = LLT::scalar(SizeInBits: 32);
6527 const LLT S1 = LLT::scalar(SizeInBits: 1);
6528
6529 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6530
6531 // unsigned cul2f(ulong u) {
6532 // uint lz = clz(u);
6533 // uint e = (u != 0) ? 127U + 63U - lz : 0;
6534 // u = (u << lz) & 0x7fffffffffffffffUL;
6535 // ulong t = u & 0xffffffffffUL;
6536 // uint v = (e << 23) | (uint)(u >> 40);
6537 // uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6538 // return as_float(v + r);
6539 // }
6540
6541 auto Zero32 = MIRBuilder.buildConstant(Res: S32, Val: 0);
6542 auto Zero64 = MIRBuilder.buildConstant(Res: S64, Val: 0);
6543
6544 auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(Dst: S32, Src0: Src);
6545
6546 auto K = MIRBuilder.buildConstant(Res: S32, Val: 127U + 63U);
6547 auto Sub = MIRBuilder.buildSub(Dst: S32, Src0: K, Src1: LZ);
6548
6549 auto NotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: Src, Op1: Zero64);
6550 auto E = MIRBuilder.buildSelect(Res: S32, Tst: NotZero, Op0: Sub, Op1: Zero32);
6551
6552 auto Mask0 = MIRBuilder.buildConstant(Res: S64, Val: (-1ULL) >> 1);
6553 auto ShlLZ = MIRBuilder.buildShl(Dst: S64, Src0: Src, Src1: LZ);
6554
6555 auto U = MIRBuilder.buildAnd(Dst: S64, Src0: ShlLZ, Src1: Mask0);
6556
6557 auto Mask1 = MIRBuilder.buildConstant(Res: S64, Val: 0xffffffffffULL);
6558 auto T = MIRBuilder.buildAnd(Dst: S64, Src0: U, Src1: Mask1);
6559
6560 auto UShl = MIRBuilder.buildLShr(Dst: S64, Src0: U, Src1: MIRBuilder.buildConstant(Res: S64, Val: 40));
6561 auto ShlE = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 23));
6562 auto V = MIRBuilder.buildOr(Dst: S32, Src0: ShlE, Src1: MIRBuilder.buildTrunc(Res: S32, Op: UShl));
6563
6564 auto C = MIRBuilder.buildConstant(Res: S64, Val: 0x8000000000ULL);
6565 auto RCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_UGT, Res: S1, Op0: T, Op1: C);
6566 auto TCmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: T, Op1: C);
6567 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
6568
6569 auto VTrunc1 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: One);
6570 auto Select0 = MIRBuilder.buildSelect(Res: S32, Tst: TCmp, Op0: VTrunc1, Op1: Zero32);
6571 auto R = MIRBuilder.buildSelect(Res: S32, Tst: RCmp, Op0: One, Op1: Select0);
6572 MIRBuilder.buildAdd(Dst, Src0: V, Src1: R);
6573
6574 MI.eraseFromParent();
6575 return Legalized;
6576}
6577
6578LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6579 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6580
6581 if (SrcTy == LLT::scalar(SizeInBits: 1)) {
6582 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: 1.0);
6583 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
6584 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
6585 MI.eraseFromParent();
6586 return Legalized;
6587 }
6588
6589 if (SrcTy != LLT::scalar(SizeInBits: 64))
6590 return UnableToLegalize;
6591
6592 if (DstTy == LLT::scalar(SizeInBits: 32)) {
6593 // TODO: SelectionDAG has several alternative expansions to port which may
    // be more reasonable depending on the available instructions. If a target
6595 // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6596 // intermediate type, this is probably worse.
6597 return lowerU64ToF32BitOps(MI);
6598 }
6599
6600 return UnableToLegalize;
6601}
6602
6603LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6604 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6605
6606 const LLT S64 = LLT::scalar(SizeInBits: 64);
6607 const LLT S32 = LLT::scalar(SizeInBits: 32);
6608 const LLT S1 = LLT::scalar(SizeInBits: 1);
6609
6610 if (SrcTy == S1) {
6611 auto True = MIRBuilder.buildFConstant(Res: DstTy, Val: -1.0);
6612 auto False = MIRBuilder.buildFConstant(Res: DstTy, Val: 0.0);
6613 MIRBuilder.buildSelect(Res: Dst, Tst: Src, Op0: True, Op1: False);
6614 MI.eraseFromParent();
6615 return Legalized;
6616 }
6617
6618 if (SrcTy != S64)
6619 return UnableToLegalize;
6620
6621 if (DstTy == S32) {
6622 // signed cl2f(long l) {
6623 // long s = l >> 63;
6624 // float r = cul2f((l + s) ^ s);
6625 // return s ? -r : r;
6626 // }
6627 Register L = Src;
6628 auto SignBit = MIRBuilder.buildConstant(Res: S64, Val: 63);
6629 auto S = MIRBuilder.buildAShr(Dst: S64, Src0: L, Src1: SignBit);
6630
6631 auto LPlusS = MIRBuilder.buildAdd(Dst: S64, Src0: L, Src1: S);
6632 auto Xor = MIRBuilder.buildXor(Dst: S64, Src0: LPlusS, Src1: S);
6633 auto R = MIRBuilder.buildUITOFP(Dst: S32, Src0: Xor);
6634
6635 auto RNeg = MIRBuilder.buildFNeg(Dst: S32, Src0: R);
6636 auto SignNotZero = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: S,
6637 Op1: MIRBuilder.buildConstant(Res: S64, Val: 0));
6638 MIRBuilder.buildSelect(Res: Dst, Tst: SignNotZero, Op0: RNeg, Op1: R);
6639 MI.eraseFromParent();
6640 return Legalized;
6641 }
6642
6643 return UnableToLegalize;
6644}
6645
6646LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6647 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6648 const LLT S64 = LLT::scalar(SizeInBits: 64);
6649 const LLT S32 = LLT::scalar(SizeInBits: 32);
6650
6651 if (SrcTy != S64 && SrcTy != S32)
6652 return UnableToLegalize;
6653 if (DstTy != S32 && DstTy != S64)
6654 return UnableToLegalize;
6655
  // FPTOSI gives the same result as FPTOUI for positive signed integers.
  // FPTOUI additionally needs to handle fp values that convert to unsigned
  // integers greater than or equal to 2^(DstBits - 1): 2^31 for an i32 result,
  // 2^63 for an i64 result. For brevity, call this threshold 2^Exp.
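  // A hedged C sketch of the selection below for f32 -> u32 (Exp == 31); the
  // casts stand in for G_FPTOSI and the float constant for the Threshold:
  //   uint32_t fptoui32(float V) {
  //     const float Thr = 0x1p31f;             // 2^31
  //     if (!(V >= Thr))                        // matches the FCMP_ULT select
  //       return (uint32_t)(int32_t)V;
  //     return (uint32_t)(int32_t)(V - Thr) ^ 0x80000000u;
  //   }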
6659
6660 APInt TwoPExpInt = APInt::getSignMask(BitWidth: DstTy.getSizeInBits());
6661 APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6662 : APFloat::IEEEdouble(),
6663 APInt::getZero(numBits: SrcTy.getSizeInBits()));
6664 TwoPExpFP.convertFromAPInt(Input: TwoPExpInt, IsSigned: false, RM: APFloat::rmNearestTiesToEven);
6665
6666 MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: Src);
6667
6668 MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(Res: SrcTy, Val: TwoPExpFP);
  // For fp values greater than or equal to Threshold (2^Exp), we use FPTOSI on
  // (Value - 2^Exp) and add 2^Exp back by setting the highest bit of the
  // result to 1.
6671 MachineInstrBuilder FSub = MIRBuilder.buildFSub(Dst: SrcTy, Src0: Src, Src1: Threshold);
6672 MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(Dst: DstTy, Src0: FSub);
6673 MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(Res: DstTy, Val: TwoPExpInt);
6674 MachineInstrBuilder Res = MIRBuilder.buildXor(Dst: DstTy, Src0: ResLowBits, Src1: ResHighBit);
6675
6676 const LLT S1 = LLT::scalar(SizeInBits: 1);
6677
6678 MachineInstrBuilder FCMP =
6679 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ULT, Res: S1, Op0: Src, Op1: Threshold);
6680 MIRBuilder.buildSelect(Res: Dst, Tst: FCMP, Op0: FPTOSI, Op1: Res);
6681
6682 MI.eraseFromParent();
6683 return Legalized;
6684}
6685
6686LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6687 auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6688 const LLT S64 = LLT::scalar(SizeInBits: 64);
6689 const LLT S32 = LLT::scalar(SizeInBits: 32);
6690
6691 // FIXME: Only f32 to i64 conversions are supported.
6692 if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6693 return UnableToLegalize;
6694
6695 // Expand f32 -> i64 conversion
6696 // This algorithm comes from compiler-rt's implementation of fixsfdi:
6697 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
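  // Roughly, in hedged C form (sign/exponent/mantissa extraction on the raw
  // f32 bits, ignoring inexact and out-of-range handling):
  //   int64_t fptosi_f32_to_i64(uint32_t Bits) {
  //     int64_t Sign = (int32_t)Bits >> 31;                    // 0 or -1
  //     int32_t Exp  = (int32_t)((Bits >> 23) & 0xFF) - 127;
  //     int64_t Mant = (Bits & 0x007FFFFF) | 0x00800000;
  //     if (Exp < 0)
  //       return 0;
  //     int64_t R = Exp > 23 ? Mant << (Exp - 23) : Mant >> (23 - Exp);
  //     return (R ^ Sign) - Sign;                              // apply sign
  //   }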
6698
6699 unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6700
6701 auto ExponentMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x7F800000);
6702 auto ExponentLoBit = MIRBuilder.buildConstant(Res: SrcTy, Val: 23);
6703
6704 auto AndExpMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: ExponentMask);
6705 auto ExponentBits = MIRBuilder.buildLShr(Dst: SrcTy, Src0: AndExpMask, Src1: ExponentLoBit);
6706
6707 auto SignMask = MIRBuilder.buildConstant(Res: SrcTy,
6708 Val: APInt::getSignMask(BitWidth: SrcEltBits));
6709 auto AndSignMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: SignMask);
6710 auto SignLowBit = MIRBuilder.buildConstant(Res: SrcTy, Val: SrcEltBits - 1);
6711 auto Sign = MIRBuilder.buildAShr(Dst: SrcTy, Src0: AndSignMask, Src1: SignLowBit);
6712 Sign = MIRBuilder.buildSExt(Res: DstTy, Op: Sign);
6713
6714 auto MantissaMask = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x007FFFFF);
6715 auto AndMantissaMask = MIRBuilder.buildAnd(Dst: SrcTy, Src0: Src, Src1: MantissaMask);
6716 auto K = MIRBuilder.buildConstant(Res: SrcTy, Val: 0x00800000);
6717
6718 auto R = MIRBuilder.buildOr(Dst: SrcTy, Src0: AndMantissaMask, Src1: K);
6719 R = MIRBuilder.buildZExt(Res: DstTy, Op: R);
6720
6721 auto Bias = MIRBuilder.buildConstant(Res: SrcTy, Val: 127);
6722 auto Exponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentBits, Src1: Bias);
6723 auto SubExponent = MIRBuilder.buildSub(Dst: SrcTy, Src0: Exponent, Src1: ExponentLoBit);
6724 auto ExponentSub = MIRBuilder.buildSub(Dst: SrcTy, Src0: ExponentLoBit, Src1: Exponent);
6725
6726 auto Shl = MIRBuilder.buildShl(Dst: DstTy, Src0: R, Src1: SubExponent);
6727 auto Srl = MIRBuilder.buildLShr(Dst: DstTy, Src0: R, Src1: ExponentSub);
6728
6729 const LLT S1 = LLT::scalar(SizeInBits: 1);
6730 auto CmpGt = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT,
6731 Res: S1, Op0: Exponent, Op1: ExponentLoBit);
6732
6733 R = MIRBuilder.buildSelect(Res: DstTy, Tst: CmpGt, Op0: Shl, Op1: Srl);
6734
6735 auto XorSign = MIRBuilder.buildXor(Dst: DstTy, Src0: R, Src1: Sign);
6736 auto Ret = MIRBuilder.buildSub(Dst: DstTy, Src0: XorSign, Src1: Sign);
6737
6738 auto ZeroSrcTy = MIRBuilder.buildConstant(Res: SrcTy, Val: 0);
6739
6740 auto ExponentLt0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT,
6741 Res: S1, Op0: Exponent, Op1: ZeroSrcTy);
6742
6743 auto ZeroDstTy = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
6744 MIRBuilder.buildSelect(Res: Dst, Tst: ExponentLt0, Op0: ZeroDstTy, Op1: Ret);
6745
6746 MI.eraseFromParent();
6747 return Legalized;
6748}
6749
6750// f64 -> f16 conversion using round-to-nearest-even rounding mode.
6751LegalizerHelper::LegalizeResult
6752LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6753 const LLT S1 = LLT::scalar(SizeInBits: 1);
6754 const LLT S32 = LLT::scalar(SizeInBits: 32);
6755
6756 auto [Dst, Src] = MI.getFirst2Regs();
6757 assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
6758 MRI.getType(Src).getScalarType() == LLT::scalar(64));
6759
6760 if (MRI.getType(Reg: Src).isVector()) // TODO: Handle vectors directly.
6761 return UnableToLegalize;
6762
6763 if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
6764 unsigned Flags = MI.getFlags();
6765 auto Src32 = MIRBuilder.buildFPTrunc(Res: S32, Op: Src, Flags);
6766 MIRBuilder.buildFPTrunc(Res: Dst, Op: Src32, Flags);
6767 MI.eraseFromParent();
6768 return Legalized;
6769 }
6770
6771 const unsigned ExpMask = 0x7ff;
6772 const unsigned ExpBiasf64 = 1023;
6773 const unsigned ExpBiasf16 = 15;
6774
6775 auto Unmerge = MIRBuilder.buildUnmerge(Res: S32, Op: Src);
6776 Register U = Unmerge.getReg(Idx: 0);
6777 Register UH = Unmerge.getReg(Idx: 1);
6778
6779 auto E = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 20));
6780 E = MIRBuilder.buildAnd(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: ExpMask));
6781
6782 // Subtract the fp64 exponent bias (1023) to get the real exponent and
6783 // add the f16 bias (15) to get the biased exponent for the f16 format.
6784 E = MIRBuilder.buildAdd(
6785 Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: -ExpBiasf64 + ExpBiasf16));
6786
6787 auto M = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 8));
6788 M = MIRBuilder.buildAnd(Dst: S32, Src0: M, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0xffe));
6789
6790 auto MaskedSig = MIRBuilder.buildAnd(Dst: S32, Src0: UH,
6791 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1ff));
6792 MaskedSig = MIRBuilder.buildOr(Dst: S32, Src0: MaskedSig, Src1: U);
6793
6794 auto Zero = MIRBuilder.buildConstant(Res: S32, Val: 0);
6795 auto SigCmpNE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: MaskedSig, Op1: Zero);
6796 auto Lo40Set = MIRBuilder.buildZExt(Res: S32, Op: SigCmpNE0);
6797 M = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: Lo40Set);
6798
6799 // (M != 0 ? 0x0200 : 0) | 0x7c00;
6800 auto Bits0x200 = MIRBuilder.buildConstant(Res: S32, Val: 0x0200);
6801 auto CmpM_NE0 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1, Op0: M, Op1: Zero);
6802 auto SelectCC = MIRBuilder.buildSelect(Res: S32, Tst: CmpM_NE0, Op0: Bits0x200, Op1: Zero);
6803
6804 auto Bits0x7c00 = MIRBuilder.buildConstant(Res: S32, Val: 0x7c00);
6805 auto I = MIRBuilder.buildOr(Dst: S32, Src0: SelectCC, Src1: Bits0x7c00);
6806
6807 // N = M | (E << 12);
6808 auto EShl12 = MIRBuilder.buildShl(Dst: S32, Src0: E, Src1: MIRBuilder.buildConstant(Res: S32, Val: 12));
6809 auto N = MIRBuilder.buildOr(Dst: S32, Src0: M, Src1: EShl12);
6810
6811 // B = clamp(1-E, 0, 13);
6812 auto One = MIRBuilder.buildConstant(Res: S32, Val: 1);
6813 auto OneSubExp = MIRBuilder.buildSub(Dst: S32, Src0: One, Src1: E);
6814 auto B = MIRBuilder.buildSMax(Dst: S32, Src0: OneSubExp, Src1: Zero);
6815 B = MIRBuilder.buildSMin(Dst: S32, Src0: B, Src1: MIRBuilder.buildConstant(Res: S32, Val: 13));
6816
6817 auto SigSetHigh = MIRBuilder.buildOr(Dst: S32, Src0: M,
6818 Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x1000));
6819
6820 auto D = MIRBuilder.buildLShr(Dst: S32, Src0: SigSetHigh, Src1: B);
6821 auto D0 = MIRBuilder.buildShl(Dst: S32, Src0: D, Src1: B);
6822
6823 auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: S1,
6824 Op0: D0, Op1: SigSetHigh);
6825 auto D1 = MIRBuilder.buildZExt(Res: S32, Op: D0_NE_SigSetHigh);
6826 D = MIRBuilder.buildOr(Dst: S32, Src0: D, Src1: D1);
6827
6828 auto CmpELtOne = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: S1, Op0: E, Op1: One);
6829 auto V = MIRBuilder.buildSelect(Res: S32, Tst: CmpELtOne, Op0: D, Op1: N);
6830
6831 auto VLow3 = MIRBuilder.buildAnd(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 7));
6832 V = MIRBuilder.buildLShr(Dst: S32, Src0: V, Src1: MIRBuilder.buildConstant(Res: S32, Val: 2));
6833
6834 auto VLow3Eq3 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1, Op0: VLow3,
6835 Op1: MIRBuilder.buildConstant(Res: S32, Val: 3));
6836 auto V0 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Eq3);
6837
6838 auto VLow3Gt5 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1, Op0: VLow3,
6839 Op1: MIRBuilder.buildConstant(Res: S32, Val: 5));
6840 auto V1 = MIRBuilder.buildZExt(Res: S32, Op: VLow3Gt5);
6841
6842 V1 = MIRBuilder.buildOr(Dst: S32, Src0: V0, Src1: V1);
6843 V = MIRBuilder.buildAdd(Dst: S32, Src0: V, Src1: V1);
6844
6845 auto CmpEGt30 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SGT, Res: S1,
6846 Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 30));
6847 V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEGt30,
6848 Op0: MIRBuilder.buildConstant(Res: S32, Val: 0x7c00), Op1: V);
6849
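  // E == 1039 corresponds to a raw f64 exponent of 0x7ff (after rebiasing by
  // -1023 + 15), i.e. an infinity or NaN input; in that case forward I, the
  // inf/NaN half-precision payload computed above.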
  auto CmpEEq1039 = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_EQ, Res: S1,
                                         Op0: E, Op1: MIRBuilder.buildConstant(Res: S32, Val: 1039));
  V = MIRBuilder.buildSelect(Res: S32, Tst: CmpEEq1039, Op0: I, Op1: V);
6853
6854 // Extract the sign bit.
6855 auto Sign = MIRBuilder.buildLShr(Dst: S32, Src0: UH, Src1: MIRBuilder.buildConstant(Res: S32, Val: 16));
6856 Sign = MIRBuilder.buildAnd(Dst: S32, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: S32, Val: 0x8000));
6857
6858 // Insert the sign bit
6859 V = MIRBuilder.buildOr(Dst: S32, Src0: Sign, Src1: V);
6860
6861 MIRBuilder.buildTrunc(Res: Dst, Op: V);
6862 MI.eraseFromParent();
6863 return Legalized;
6864}
6865
6866LegalizerHelper::LegalizeResult
6867LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6868 auto [DstTy, SrcTy] = MI.getFirst2LLTs();
6869 const LLT S64 = LLT::scalar(SizeInBits: 64);
6870 const LLT S16 = LLT::scalar(SizeInBits: 16);
6871
6872 if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6873 return lowerFPTRUNC_F64_TO_F16(MI);
6874
6875 return UnableToLegalize;
6876}
6877
6878// TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6879// multiplication tree.
6880LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6881 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6882 LLT Ty = MRI.getType(Reg: Dst);
6883
6884 auto CvtSrc1 = MIRBuilder.buildSITOFP(Dst: Ty, Src0: Src1);
6885 MIRBuilder.buildFPow(Dst, Src0, Src1: CvtSrc1, Flags: MI.getFlags());
6886 MI.eraseFromParent();
6887 return Legalized;
6888}
6889
6890static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6891 switch (Opc) {
6892 case TargetOpcode::G_SMIN:
6893 return CmpInst::ICMP_SLT;
6894 case TargetOpcode::G_SMAX:
6895 return CmpInst::ICMP_SGT;
6896 case TargetOpcode::G_UMIN:
6897 return CmpInst::ICMP_ULT;
6898 case TargetOpcode::G_UMAX:
6899 return CmpInst::ICMP_UGT;
6900 default:
6901 llvm_unreachable("not in integer min/max");
6902 }
6903}
6904
6905LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6906 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6907
6908 const CmpInst::Predicate Pred = minMaxToCompare(Opc: MI.getOpcode());
6909 LLT CmpType = MRI.getType(Reg: Dst).changeElementSize(NewEltSize: 1);
6910
6911 auto Cmp = MIRBuilder.buildICmp(Pred, Res: CmpType, Op0: Src0, Op1: Src1);
6912 MIRBuilder.buildSelect(Res: Dst, Tst: Cmp, Op0: Src0, Op1: Src1);
6913
6914 MI.eraseFromParent();
6915 return Legalized;
6916}
6917
6918LegalizerHelper::LegalizeResult
6919LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6920 auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
6921 const int Src0Size = Src0Ty.getScalarSizeInBits();
6922 const int Src1Size = Src1Ty.getScalarSizeInBits();
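  // For the common case where both operands are f32, the bit manipulation
  // below reduces to this hedged C sketch on the raw bits:
  //   uint32_t copysign32(uint32_t Mag, uint32_t Sgn) {
  //     return (Mag & 0x7FFFFFFF) | (Sgn & 0x80000000);
  //   }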
6923
6924 auto SignBitMask = MIRBuilder.buildConstant(
6925 Res: Src0Ty, Val: APInt::getSignMask(BitWidth: Src0Size));
6926
6927 auto NotSignBitMask = MIRBuilder.buildConstant(
6928 Res: Src0Ty, Val: APInt::getLowBitsSet(numBits: Src0Size, loBitsSet: Src0Size - 1));
6929
6930 Register And0 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0, Src1: NotSignBitMask).getReg(Idx: 0);
6931 Register And1;
6932 if (Src0Ty == Src1Ty) {
6933 And1 = MIRBuilder.buildAnd(Dst: Src1Ty, Src0: Src1, Src1: SignBitMask).getReg(Idx: 0);
6934 } else if (Src0Size > Src1Size) {
6935 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src0Ty, Val: Src0Size - Src1Size);
6936 auto Zext = MIRBuilder.buildZExt(Res: Src0Ty, Op: Src1);
6937 auto Shift = MIRBuilder.buildShl(Dst: Src0Ty, Src0: Zext, Src1: ShiftAmt);
6938 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Shift, Src1: SignBitMask).getReg(Idx: 0);
6939 } else {
6940 auto ShiftAmt = MIRBuilder.buildConstant(Res: Src1Ty, Val: Src1Size - Src0Size);
6941 auto Shift = MIRBuilder.buildLShr(Dst: Src1Ty, Src0: Src1, Src1: ShiftAmt);
6942 auto Trunc = MIRBuilder.buildTrunc(Res: Src0Ty, Op: Shift);
6943 And1 = MIRBuilder.buildAnd(Dst: Src0Ty, Src0: Trunc, Src1: SignBitMask).getReg(Idx: 0);
6944 }
6945
6946 // Be careful about setting nsz/nnan/ninf on every instruction, since the
6947 // constants are a nan and -0.0, but the final result should preserve
6948 // everything.
6949 unsigned Flags = MI.getFlags();
6950 MIRBuilder.buildOr(Dst, Src0: And0, Src1: And1, Flags);
6951
6952 MI.eraseFromParent();
6953 return Legalized;
6954}
6955
6956LegalizerHelper::LegalizeResult
6957LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6958 unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6959 TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6960
6961 auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6962 LLT Ty = MRI.getType(Reg: Dst);
6963
6964 if (!MI.getFlag(Flag: MachineInstr::FmNoNans)) {
6965 // Insert canonicalizes if it's possible we need to quiet to get correct
6966 // sNaN behavior.
6967
    // Note this must be done here, and not as an optimization combine in the
    // absence of a dedicated quiet-sNaN instruction, as we're using an
    // omni-purpose G_FCANONICALIZE.
6971 if (!isKnownNeverSNaN(Val: Src0, MRI))
6972 Src0 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0, Flags: MI.getFlags()).getReg(Idx: 0);
6973
6974 if (!isKnownNeverSNaN(Val: Src1, MRI))
6975 Src1 = MIRBuilder.buildFCanonicalize(Dst: Ty, Src0: Src1, Flags: MI.getFlags()).getReg(Idx: 0);
6976 }
6977
6978 // If there are no nans, it's safe to simply replace this with the non-IEEE
6979 // version.
6980 MIRBuilder.buildInstr(Opc: NewOp, DstOps: {Dst}, SrcOps: {Src0, Src1}, Flags: MI.getFlags());
6981 MI.eraseFromParent();
6982 return Legalized;
6983}
6984
6985LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6986 // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6987 Register DstReg = MI.getOperand(i: 0).getReg();
6988 LLT Ty = MRI.getType(Reg: DstReg);
6989 unsigned Flags = MI.getFlags();
6990
6991 auto Mul = MIRBuilder.buildFMul(Dst: Ty, Src0: MI.getOperand(i: 1), Src1: MI.getOperand(i: 2),
6992 Flags);
6993 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Mul, Src1: MI.getOperand(i: 3), Flags);
6994 MI.eraseFromParent();
6995 return Legalized;
6996}
6997
6998LegalizerHelper::LegalizeResult
6999LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
7000 auto [DstReg, X] = MI.getFirst2Regs();
7001 const unsigned Flags = MI.getFlags();
7002 const LLT Ty = MRI.getType(Reg: DstReg);
7003 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
7004
7005 // round(x) =>
7006 // t = trunc(x);
7007 // d = fabs(x - t);
7008 // o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
7009 // return t + o;
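  // e.g. x = -2.5: t = -2.0, d = 0.5, o = copysign(1.0, -2.5) = -1.0, so the
  // result is -3.0 (ties round away from zero).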
7010
7011 auto T = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: X, Flags);
7012
7013 auto Diff = MIRBuilder.buildFSub(Dst: Ty, Src0: X, Src1: T, Flags);
7014 auto AbsDiff = MIRBuilder.buildFAbs(Dst: Ty, Src0: Diff, Flags);
7015
7016 auto Half = MIRBuilder.buildFConstant(Res: Ty, Val: 0.5);
7017 auto Cmp =
7018 MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OGE, Res: CondTy, Op0: AbsDiff, Op1: Half, Flags);
7019
7020 // Could emit G_UITOFP instead
7021 auto One = MIRBuilder.buildFConstant(Res: Ty, Val: 1.0);
7022 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
7023 auto BoolFP = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: One, Op1: Zero);
7024 auto SignedOffset = MIRBuilder.buildFCopysign(Dst: Ty, Src0: BoolFP, Src1: X);
7025
7026 MIRBuilder.buildFAdd(Dst: DstReg, Src0: T, Src1: SignedOffset, Flags);
7027
7028 MI.eraseFromParent();
7029 return Legalized;
7030}
7031
7032LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
7033 auto [DstReg, SrcReg] = MI.getFirst2Regs();
7034 unsigned Flags = MI.getFlags();
7035 LLT Ty = MRI.getType(Reg: DstReg);
7036 const LLT CondTy = Ty.changeElementSize(NewEltSize: 1);
7037
7038 // result = trunc(src);
7039 // if (src < 0.0 && src != result)
7040 // result += -1.0.
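  // e.g. src = -2.5: trunc = -2.0, src < 0.0 and src != trunc, so the result
  // is -2.0 + -1.0 = -3.0 == floor(-2.5); for src = 2.5 the adjustment is 0.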
7041
7042 auto Trunc = MIRBuilder.buildIntrinsicTrunc(Dst: Ty, Src0: SrcReg, Flags);
7043 auto Zero = MIRBuilder.buildFConstant(Res: Ty, Val: 0.0);
7044
7045 auto Lt0 = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_OLT, Res: CondTy,
7046 Op0: SrcReg, Op1: Zero, Flags);
7047 auto NeTrunc = MIRBuilder.buildFCmp(Pred: CmpInst::FCMP_ONE, Res: CondTy,
7048 Op0: SrcReg, Op1: Trunc, Flags);
7049 auto And = MIRBuilder.buildAnd(Dst: CondTy, Src0: Lt0, Src1: NeTrunc);
7050 auto AddVal = MIRBuilder.buildSITOFP(Dst: Ty, Src0: And);
7051
7052 MIRBuilder.buildFAdd(Dst: DstReg, Src0: Trunc, Src1: AddVal, Flags);
7053 MI.eraseFromParent();
7054 return Legalized;
7055}
7056
7057LegalizerHelper::LegalizeResult
7058LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
7059 const unsigned NumOps = MI.getNumOperands();
7060 auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
7061 unsigned PartSize = Src0Ty.getSizeInBits();
7062
7063 LLT WideTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
7064 Register ResultReg = MIRBuilder.buildZExt(Res: WideTy, Op: Src0Reg).getReg(Idx: 0);
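  // The loop below ORs each part into the wide result at its bit offset. As a
  // hedged illustration, merging four s8 parts into an s32 behaves like:
  //   uint32_t merge4(uint8_t P0, uint8_t P1, uint8_t P2, uint8_t P3) {
  //     return (uint32_t)P0 | ((uint32_t)P1 << 8) |
  //            ((uint32_t)P2 << 16) | ((uint32_t)P3 << 24);
  //   }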
7065
7066 for (unsigned I = 2; I != NumOps; ++I) {
7067 const unsigned Offset = (I - 1) * PartSize;
7068
7069 Register SrcReg = MI.getOperand(i: I).getReg();
7070 auto ZextInput = MIRBuilder.buildZExt(Res: WideTy, Op: SrcReg);
7071
7072 Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
7073 MRI.createGenericVirtualRegister(Ty: WideTy);
7074
7075 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: Offset);
7076 auto Shl = MIRBuilder.buildShl(Dst: WideTy, Src0: ZextInput, Src1: ShiftAmt);
7077 MIRBuilder.buildOr(Dst: NextResult, Src0: ResultReg, Src1: Shl);
7078 ResultReg = NextResult;
7079 }
7080
7081 if (DstTy.isPointer()) {
7082 if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
7083 AddrSpace: DstTy.getAddressSpace())) {
7084 LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
7085 return UnableToLegalize;
7086 }
7087
7088 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: ResultReg);
7089 }
7090
7091 MI.eraseFromParent();
7092 return Legalized;
7093}
7094
7095LegalizerHelper::LegalizeResult
7096LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
7097 const unsigned NumDst = MI.getNumOperands() - 1;
7098 Register SrcReg = MI.getOperand(i: NumDst).getReg();
7099 Register Dst0Reg = MI.getOperand(i: 0).getReg();
7100 LLT DstTy = MRI.getType(Reg: Dst0Reg);
7101 if (DstTy.isPointer())
7102 return UnableToLegalize; // TODO
7103
7104 SrcReg = coerceToScalar(Val: SrcReg);
7105 if (!SrcReg)
7106 return UnableToLegalize;
7107
7108 // Expand scalarizing unmerge as bitcast to integer and shift.
7109 LLT IntTy = MRI.getType(Reg: SrcReg);
7110
7111 MIRBuilder.buildTrunc(Res: Dst0Reg, Op: SrcReg);
7112
7113 const unsigned DstSize = DstTy.getSizeInBits();
7114 unsigned Offset = DstSize;
7115 for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
7116 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntTy, Val: Offset);
7117 auto Shift = MIRBuilder.buildLShr(Dst: IntTy, Src0: SrcReg, Src1: ShiftAmt);
7118 MIRBuilder.buildTrunc(Res: MI.getOperand(i: I), Op: Shift);
7119 }
7120
7121 MI.eraseFromParent();
7122 return Legalized;
7123}
7124
7125/// Lower a vector extract or insert by writing the vector to a stack temporary
7126/// and reloading the element or vector.
7127///
7128/// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
7129/// =>
7130/// %stack_temp = G_FRAME_INDEX
7131/// G_STORE %vec, %stack_temp
7132/// %idx = clamp(%idx, %vec.getNumElements())
7133/// %element_ptr = G_PTR_ADD %stack_temp, %idx
7134/// %dst = G_LOAD %element_ptr
7135LegalizerHelper::LegalizeResult
7136LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
7137 Register DstReg = MI.getOperand(i: 0).getReg();
7138 Register SrcVec = MI.getOperand(i: 1).getReg();
7139 Register InsertVal;
7140 if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
7141 InsertVal = MI.getOperand(i: 2).getReg();
7142
7143 Register Idx = MI.getOperand(i: MI.getNumOperands() - 1).getReg();
7144
7145 LLT VecTy = MRI.getType(Reg: SrcVec);
7146 LLT EltTy = VecTy.getElementType();
7147 unsigned NumElts = VecTy.getNumElements();
7148
7149 int64_t IdxVal;
7150 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal)) && IdxVal < NumElts) {
7151 SmallVector<Register, 8> SrcRegs;
7152 extractParts(Reg: SrcVec, Ty: EltTy, NumParts: NumElts, VRegs&: SrcRegs, MIRBuilder, MRI);
7153
7154 if (InsertVal) {
7155 SrcRegs[IdxVal] = MI.getOperand(i: 2).getReg();
7156 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SrcRegs);
7157 } else {
7158 MIRBuilder.buildCopy(Res: DstReg, Op: SrcRegs[IdxVal]);
7159 }
7160
7161 MI.eraseFromParent();
7162 return Legalized;
7163 }
7164
7165 if (!EltTy.isByteSized()) { // Not implemented.
7166 LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
7167 return UnableToLegalize;
7168 }
7169
7170 unsigned EltBytes = EltTy.getSizeInBytes();
7171 Align VecAlign = getStackTemporaryAlignment(Ty: VecTy);
7172 Align EltAlign;
7173
7174 MachinePointerInfo PtrInfo;
7175 auto StackTemp = createStackTemporary(
7176 Bytes: TypeSize::getFixed(ExactSize: VecTy.getSizeInBytes()), Alignment: VecAlign, PtrInfo);
7177 MIRBuilder.buildStore(Val: SrcVec, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
7178
7179 // Get the pointer to the element, and be sure not to hit undefined behavior
7180 // if the index is out of bounds.
7181 Register EltPtr = getVectorElementPointer(VecPtr: StackTemp.getReg(Idx: 0), VecTy, Index: Idx);
7182
7183 if (mi_match(R: Idx, MRI, P: m_ICst(Cst&: IdxVal))) {
7184 int64_t Offset = IdxVal * EltBytes;
7185 PtrInfo = PtrInfo.getWithOffset(O: Offset);
7186 EltAlign = commonAlignment(A: VecAlign, Offset);
7187 } else {
7188 // We lose information with a variable offset.
7189 EltAlign = getStackTemporaryAlignment(Ty: EltTy);
7190 PtrInfo = MachinePointerInfo(MRI.getType(Reg: EltPtr).getAddressSpace());
7191 }
7192
7193 if (InsertVal) {
7194 // Write the inserted element
7195 MIRBuilder.buildStore(Val: InsertVal, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
7196
7197 // Reload the whole vector.
7198 MIRBuilder.buildLoad(Res: DstReg, Addr: StackTemp, PtrInfo, Alignment: VecAlign);
7199 } else {
7200 MIRBuilder.buildLoad(Res: DstReg, Addr: EltPtr, PtrInfo, Alignment: EltAlign);
7201 }
7202
7203 MI.eraseFromParent();
7204 return Legalized;
7205}
7206
7207LegalizerHelper::LegalizeResult
7208LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
7209 auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
7210 MI.getFirst3RegLLTs();
7211 LLT IdxTy = LLT::scalar(SizeInBits: 32);
7212
7213 ArrayRef<int> Mask = MI.getOperand(i: 3).getShuffleMask();
7214 Register Undef;
7215 SmallVector<Register, 32> BuildVec;
7216 LLT EltTy = DstTy.getScalarType();
7217
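  // Build the result one element at a time. For example, shuffling two
  // <2 x s32> sources with mask <3, 0> extracts element 1 of %src1 and
  // element 0 of %src0, then feeds both to a G_BUILD_VECTOR.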
7218 for (int Idx : Mask) {
7219 if (Idx < 0) {
7220 if (!Undef.isValid())
7221 Undef = MIRBuilder.buildUndef(Res: EltTy).getReg(Idx: 0);
7222 BuildVec.push_back(Elt: Undef);
7223 continue;
7224 }
7225
7226 if (Src0Ty.isScalar()) {
7227 BuildVec.push_back(Elt: Idx == 0 ? Src0Reg : Src1Reg);
7228 } else {
7229 int NumElts = Src0Ty.getNumElements();
7230 Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
7231 int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
7232 auto IdxK = MIRBuilder.buildConstant(Res: IdxTy, Val: ExtractIdx);
7233 auto Extract = MIRBuilder.buildExtractVectorElement(Res: EltTy, Val: SrcVec, Idx: IdxK);
7234 BuildVec.push_back(Elt: Extract.getReg(Idx: 0));
7235 }
7236 }
7237
7238 if (DstTy.isScalar())
7239 MIRBuilder.buildCopy(Res: DstReg, Op: BuildVec[0]);
7240 else
7241 MIRBuilder.buildBuildVector(Res: DstReg, Ops: BuildVec);
7242 MI.eraseFromParent();
7243 return Legalized;
7244}
7245
7246Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
7247 Register AllocSize,
7248 Align Alignment,
7249 LLT PtrTy) {
7250 LLT IntPtrTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
7251
7252 auto SPTmp = MIRBuilder.buildCopy(Res: PtrTy, Op: SPReg);
7253 SPTmp = MIRBuilder.buildCast(Dst: IntPtrTy, Src: SPTmp);
7254
7255 // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
7256 // have to generate an extra instruction to negate the alloc and then use
7257 // G_PTR_ADD to add the negative offset.
7258 auto Alloc = MIRBuilder.buildSub(Dst: IntPtrTy, Src0: SPTmp, Src1: AllocSize);
7259 if (Alignment > Align(1)) {
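    // Round the new SP down by masking off the low bits. E.g. a 16-byte
    // alignment gives an AlignMask of ...fffffff0, so the G_AND below snaps
    // the pointer to the next lower 16-byte boundary.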
7260 APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
7261 AlignMask.negate();
7262 auto AlignCst = MIRBuilder.buildConstant(Res: IntPtrTy, Val: AlignMask);
7263 Alloc = MIRBuilder.buildAnd(Dst: IntPtrTy, Src0: Alloc, Src1: AlignCst);
7264 }
7265
7266 return MIRBuilder.buildCast(Dst: PtrTy, Src: Alloc).getReg(Idx: 0);
7267}
7268
7269LegalizerHelper::LegalizeResult
7270LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
7271 const auto &MF = *MI.getMF();
7272 const auto &TFI = *MF.getSubtarget().getFrameLowering();
7273 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
7274 return UnableToLegalize;
7275
7276 Register Dst = MI.getOperand(i: 0).getReg();
7277 Register AllocSize = MI.getOperand(i: 1).getReg();
7278 Align Alignment = assumeAligned(Value: MI.getOperand(i: 2).getImm());
7279
7280 LLT PtrTy = MRI.getType(Reg: Dst);
7281 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
7282 Register SPTmp =
7283 getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
7284
7285 MIRBuilder.buildCopy(Res: SPReg, Op: SPTmp);
7286 MIRBuilder.buildCopy(Res: Dst, Op: SPTmp);
7287
7288 MI.eraseFromParent();
7289 return Legalized;
7290}
7291
7292LegalizerHelper::LegalizeResult
7293LegalizerHelper::lowerStackSave(MachineInstr &MI) {
7294 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7295 if (!StackPtr)
7296 return UnableToLegalize;
7297
7298 MIRBuilder.buildCopy(Res: MI.getOperand(i: 0), Op: StackPtr);
7299 MI.eraseFromParent();
7300 return Legalized;
7301}
7302
7303LegalizerHelper::LegalizeResult
7304LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
7305 Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7306 if (!StackPtr)
7307 return UnableToLegalize;
7308
7309 MIRBuilder.buildCopy(Res: StackPtr, Op: MI.getOperand(i: 0));
7310 MI.eraseFromParent();
7311 return Legalized;
7312}
7313
7314LegalizerHelper::LegalizeResult
7315LegalizerHelper::lowerExtract(MachineInstr &MI) {
7316 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7317 unsigned Offset = MI.getOperand(i: 2).getImm();
7318
7319 // Extract sub-vector or one element
7320 if (SrcTy.isVector()) {
7321 unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
7322 unsigned DstSize = DstTy.getSizeInBits();
7323
7324 if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
7325 (Offset + DstSize <= SrcTy.getSizeInBits())) {
7326 // Unmerge and allow access to each Src element for the artifact combiner.
7327 auto Unmerge = MIRBuilder.buildUnmerge(Res: SrcTy.getElementType(), Op: SrcReg);
7328
7329 // Take element(s) we need to extract and copy it (merge them).
7330 SmallVector<Register, 8> SubVectorElts;
7331 for (unsigned Idx = Offset / SrcEltSize;
7332 Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
7333 SubVectorElts.push_back(Elt: Unmerge.getReg(Idx));
7334 }
7335 if (SubVectorElts.size() == 1)
7336 MIRBuilder.buildCopy(Res: DstReg, Op: SubVectorElts[0]);
7337 else
7338 MIRBuilder.buildMergeLikeInstr(Res: DstReg, Ops: SubVectorElts);
7339
7340 MI.eraseFromParent();
7341 return Legalized;
7342 }
7343 }
7344
7345 if (DstTy.isScalar() &&
7346 (SrcTy.isScalar() ||
7347 (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
7348 LLT SrcIntTy = SrcTy;
7349 if (!SrcTy.isScalar()) {
7350 SrcIntTy = LLT::scalar(SizeInBits: SrcTy.getSizeInBits());
7351 SrcReg = MIRBuilder.buildBitcast(Dst: SrcIntTy, Src: SrcReg).getReg(Idx: 0);
7352 }
7353
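    // Extract by shifting the (possibly bitcast) source right by Offset bits
    // and truncating. E.g. extracting an s16 at offset 16 from an s64 becomes
    //   %dst = trunc(%src >> 16)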
7354 if (Offset == 0)
7355 MIRBuilder.buildTrunc(Res: DstReg, Op: SrcReg);
7356 else {
7357 auto ShiftAmt = MIRBuilder.buildConstant(Res: SrcIntTy, Val: Offset);
7358 auto Shr = MIRBuilder.buildLShr(Dst: SrcIntTy, Src0: SrcReg, Src1: ShiftAmt);
7359 MIRBuilder.buildTrunc(Res: DstReg, Op: Shr);
7360 }
7361
7362 MI.eraseFromParent();
7363 return Legalized;
7364 }
7365
7366 return UnableToLegalize;
7367}
7368
7369LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
7370 auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
7371 uint64_t Offset = MI.getOperand(i: 3).getImm();
7372
7373 LLT DstTy = MRI.getType(Reg: Src);
7374 LLT InsertTy = MRI.getType(Reg: InsertSrc);
7375
7376 // Insert sub-vector or one element
7377 if (DstTy.isVector() && !InsertTy.isPointer()) {
7378 LLT EltTy = DstTy.getElementType();
7379 unsigned EltSize = EltTy.getSizeInBits();
7380 unsigned InsertSize = InsertTy.getSizeInBits();
7381
7382 if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
7383 (Offset + InsertSize <= DstTy.getSizeInBits())) {
7384 auto UnmergeSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: Src);
7385 SmallVector<Register, 8> DstElts;
7386 unsigned Idx = 0;
7387 // Elements from Src before the insert start offset.
7388 for (; Idx < Offset / EltSize; ++Idx) {
7389 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
7390 }
7391
7392 // Replace elements in Src with elements from InsertSrc
7393 if (InsertTy.getSizeInBits() > EltSize) {
7394 auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(Res: EltTy, Op: InsertSrc);
7395 for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
7396 ++Idx, ++i) {
7397 DstElts.push_back(Elt: UnmergeInsertSrc.getReg(Idx: i));
7398 }
7399 } else {
7400 DstElts.push_back(Elt: InsertSrc);
7401 ++Idx;
7402 }
7403
7404 // Remaining elements from Src after insert
7405 for (; Idx < DstTy.getNumElements(); ++Idx) {
7406 DstElts.push_back(Elt: UnmergeSrc.getReg(Idx));
7407 }
7408
7409 MIRBuilder.buildMergeLikeInstr(Res: Dst, Ops: DstElts);
7410 MI.eraseFromParent();
7411 return Legalized;
7412 }
7413 }
7414
7415 if (InsertTy.isVector() ||
7416 (DstTy.isVector() && DstTy.getElementType() != InsertTy))
7417 return UnableToLegalize;
7418
7419 const DataLayout &DL = MIRBuilder.getDataLayout();
7420 if ((DstTy.isPointer() &&
7421 DL.isNonIntegralAddressSpace(AddrSpace: DstTy.getAddressSpace())) ||
7422 (InsertTy.isPointer() &&
7423 DL.isNonIntegralAddressSpace(AddrSpace: InsertTy.getAddressSpace()))) {
7424 LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
7425 return UnableToLegalize;
7426 }
7427
7428 LLT IntDstTy = DstTy;
7429
7430 if (!DstTy.isScalar()) {
7431 IntDstTy = LLT::scalar(SizeInBits: DstTy.getSizeInBits());
7432 Src = MIRBuilder.buildCast(Dst: IntDstTy, Src).getReg(Idx: 0);
7433 }
7434
7435 if (!InsertTy.isScalar()) {
7436 const LLT IntInsertTy = LLT::scalar(SizeInBits: InsertTy.getSizeInBits());
7437 InsertSrc = MIRBuilder.buildPtrToInt(Dst: IntInsertTy, Src: InsertSrc).getReg(Idx: 0);
7438 }
7439
7440 Register ExtInsSrc = MIRBuilder.buildZExt(Res: IntDstTy, Op: InsertSrc).getReg(Idx: 0);
7441 if (Offset != 0) {
7442 auto ShiftAmt = MIRBuilder.buildConstant(Res: IntDstTy, Val: Offset);
7443 ExtInsSrc = MIRBuilder.buildShl(Dst: IntDstTy, Src0: ExtInsSrc, Src1: ShiftAmt).getReg(Idx: 0);
7444 }
7445
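  // Clear the destination bits being replaced, then OR in the shifted insert
  // value. E.g. inserting an s8 at offset 8 into an s32 uses the wrapped mask
  // 0xffff00ff:  %dst = (%src & 0xffff00ff) | (zext(%ins) << 8)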
7446 APInt MaskVal = APInt::getBitsSetWithWrap(
7447 numBits: DstTy.getSizeInBits(), loBit: Offset + InsertTy.getSizeInBits(), hiBit: Offset);
7448
7449 auto Mask = MIRBuilder.buildConstant(Res: IntDstTy, Val: MaskVal);
7450 auto MaskedSrc = MIRBuilder.buildAnd(Dst: IntDstTy, Src0: Src, Src1: Mask);
7451 auto Or = MIRBuilder.buildOr(Dst: IntDstTy, Src0: MaskedSrc, Src1: ExtInsSrc);
7452
7453 MIRBuilder.buildCast(Dst, Src: Or);
7454 MI.eraseFromParent();
7455 return Legalized;
7456}
7457
7458LegalizerHelper::LegalizeResult
7459LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7460 auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
7461 MI.getFirst4RegLLTs();
7462 const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7463
7464 LLT Ty = Dst0Ty;
7465 LLT BoolTy = Dst1Ty;
7466
7467 if (IsAdd)
7468 MIRBuilder.buildAdd(Dst: Dst0, Src0: LHS, Src1: RHS);
7469 else
7470 MIRBuilder.buildSub(Dst: Dst0, Src0: LHS, Src1: RHS);
7471
7472 // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7473
7474 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
7475
7476 // For an addition, the result should be less than one of the operands (LHS)
7477 // if and only if the other operand (RHS) is negative, otherwise there will
7478 // be overflow.
7479 // For a subtraction, the result should be less than one of the operands
7480 // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7481 // otherwise there will be overflow.
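  // E.g. for an s8 G_SADDO with %lhs = 100 and %rhs = 50, the add wraps to
  // -106, so (res < lhs) is true while (rhs < 0) is false, and their XOR
  // reports the overflow.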
7482 auto ResultLowerThanLHS =
7483 MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: Dst0, Op1: LHS);
7484 auto ConditionRHS = MIRBuilder.buildICmp(
7485 Pred: IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, Res: BoolTy, Op0: RHS, Op1: Zero);
7486
7487 MIRBuilder.buildXor(Dst: Dst1, Src0: ConditionRHS, Src1: ResultLowerThanLHS);
7488 MI.eraseFromParent();
7489 return Legalized;
7490}
7491
7492LegalizerHelper::LegalizeResult
7493LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7494 auto [Res, LHS, RHS] = MI.getFirst3Regs();
7495 LLT Ty = MRI.getType(Reg: Res);
7496 bool IsSigned;
7497 bool IsAdd;
7498 unsigned BaseOp;
7499 switch (MI.getOpcode()) {
7500 default:
7501 llvm_unreachable("unexpected addsat/subsat opcode");
7502 case TargetOpcode::G_UADDSAT:
7503 IsSigned = false;
7504 IsAdd = true;
7505 BaseOp = TargetOpcode::G_ADD;
7506 break;
7507 case TargetOpcode::G_SADDSAT:
7508 IsSigned = true;
7509 IsAdd = true;
7510 BaseOp = TargetOpcode::G_ADD;
7511 break;
7512 case TargetOpcode::G_USUBSAT:
7513 IsSigned = false;
7514 IsAdd = false;
7515 BaseOp = TargetOpcode::G_SUB;
7516 break;
7517 case TargetOpcode::G_SSUBSAT:
7518 IsSigned = true;
7519 IsAdd = false;
7520 BaseOp = TargetOpcode::G_SUB;
7521 break;
7522 }
7523
7524 if (IsSigned) {
7525 // sadd.sat(a, b) ->
7526 // hi = 0x7fffffff - smax(a, 0)
7527 // lo = 0x80000000 - smin(a, 0)
7528 // a + smin(smax(lo, b), hi)
7529 // ssub.sat(a, b) ->
7530 // lo = smax(a, -1) - 0x7fffffff
7531 // hi = smin(a, -1) - 0x80000000
7532 // a - smin(smax(lo, b), hi)
7533 // TODO: AMDGPU can use a "median of 3" instruction here:
7534 // a +/- med3(lo, b, hi)
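    // E.g. for s8 sadd.sat(100, 50): hi = 127 - smax(100, 0) = 27,
    // lo = -128 - smin(100, 0) = -128, the clamped RHS is
    // smin(smax(-128, 50), 27) = 27, and the result is 100 + 27 = 127.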
7535 uint64_t NumBits = Ty.getScalarSizeInBits();
7536 auto MaxVal =
7537 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: NumBits));
7538 auto MinVal =
7539 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
7540 MachineInstrBuilder Hi, Lo;
7541 if (IsAdd) {
7542 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0);
7543 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MaxVal, Src1: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: Zero));
7544 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MinVal, Src1: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: Zero));
7545 } else {
7546 auto NegOne = MIRBuilder.buildConstant(Res: Ty, Val: -1);
7547 Lo = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: LHS, Src1: NegOne),
7548 Src1: MaxVal);
7549 Hi = MIRBuilder.buildSub(Dst: Ty, Src0: MIRBuilder.buildSMin(Dst: Ty, Src0: LHS, Src1: NegOne),
7550 Src1: MinVal);
7551 }
7552 auto RHSClamped =
7553 MIRBuilder.buildSMin(Dst: Ty, Src0: MIRBuilder.buildSMax(Dst: Ty, Src0: Lo, Src1: RHS), Src1: Hi);
7554 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, RHSClamped});
7555 } else {
7556 // uadd.sat(a, b) -> a + umin(~a, b)
7557 // usub.sat(a, b) -> a - umin(a, b)
7558 Register Not = IsAdd ? MIRBuilder.buildNot(Dst: Ty, Src0: LHS).getReg(Idx: 0) : LHS;
7559 auto Min = MIRBuilder.buildUMin(Dst: Ty, Src0: Not, Src1: RHS);
7560 MIRBuilder.buildInstr(Opc: BaseOp, DstOps: {Res}, SrcOps: {LHS, Min});
7561 }
7562
7563 MI.eraseFromParent();
7564 return Legalized;
7565}
7566
7567LegalizerHelper::LegalizeResult
7568LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7569 auto [Res, LHS, RHS] = MI.getFirst3Regs();
7570 LLT Ty = MRI.getType(Reg: Res);
7571 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
7572 bool IsSigned;
7573 bool IsAdd;
7574 unsigned OverflowOp;
7575 switch (MI.getOpcode()) {
7576 default:
7577 llvm_unreachable("unexpected addsat/subsat opcode");
7578 case TargetOpcode::G_UADDSAT:
7579 IsSigned = false;
7580 IsAdd = true;
7581 OverflowOp = TargetOpcode::G_UADDO;
7582 break;
7583 case TargetOpcode::G_SADDSAT:
7584 IsSigned = true;
7585 IsAdd = true;
7586 OverflowOp = TargetOpcode::G_SADDO;
7587 break;
7588 case TargetOpcode::G_USUBSAT:
7589 IsSigned = false;
7590 IsAdd = false;
7591 OverflowOp = TargetOpcode::G_USUBO;
7592 break;
7593 case TargetOpcode::G_SSUBSAT:
7594 IsSigned = true;
7595 IsAdd = false;
7596 OverflowOp = TargetOpcode::G_SSUBO;
7597 break;
7598 }
7599
7600 auto OverflowRes =
7601 MIRBuilder.buildInstr(Opc: OverflowOp, DstOps: {Ty, BoolTy}, SrcOps: {LHS, RHS});
7602 Register Tmp = OverflowRes.getReg(Idx: 0);
7603 Register Ov = OverflowRes.getReg(Idx: 1);
7604 MachineInstrBuilder Clamp;
7605 if (IsSigned) {
7606 // sadd.sat(a, b) ->
7607 // {tmp, ov} = saddo(a, b)
7608 // ov ? (tmp >>s 31) + 0x80000000 : tmp
7609 // ssub.sat(a, b) ->
7610 // {tmp, ov} = ssubo(a, b)
7611 // ov ? (tmp >>s 31) + 0x80000000 : tmp
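    // For 32 bits, (tmp >>s 31) is 0 or -1 depending on the sign of the
    // wrapped result, so adding 0x80000000 produces INT_MIN after a negative
    // overflow and INT_MAX after a positive one.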
7612 uint64_t NumBits = Ty.getScalarSizeInBits();
7613 auto ShiftAmount = MIRBuilder.buildConstant(Res: Ty, Val: NumBits - 1);
7614 auto Sign = MIRBuilder.buildAShr(Dst: Ty, Src0: Tmp, Src1: ShiftAmount);
7615 auto MinVal =
7616 MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: NumBits));
7617 Clamp = MIRBuilder.buildAdd(Dst: Ty, Src0: Sign, Src1: MinVal);
7618 } else {
7619 // uadd.sat(a, b) ->
7620 // {tmp, ov} = uaddo(a, b)
7621 // ov ? 0xffffffff : tmp
7622 // usub.sat(a, b) ->
7623 // {tmp, ov} = usubo(a, b)
7624 // ov ? 0 : tmp
7625 Clamp = MIRBuilder.buildConstant(Res: Ty, Val: IsAdd ? -1 : 0);
7626 }
7627 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: Clamp, Op1: Tmp);
7628
7629 MI.eraseFromParent();
7630 return Legalized;
7631}
7632
7633LegalizerHelper::LegalizeResult
7634LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7635 assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7636 MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7637 "Expected shlsat opcode!");
7638 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7639 auto [Res, LHS, RHS] = MI.getFirst3Regs();
7640 LLT Ty = MRI.getType(Reg: Res);
7641 LLT BoolTy = Ty.changeElementSize(NewEltSize: 1);
7642
7643 unsigned BW = Ty.getScalarSizeInBits();
7644 auto Result = MIRBuilder.buildShl(Dst: Ty, Src0: LHS, Src1: RHS);
7645 auto Orig = IsSigned ? MIRBuilder.buildAShr(Dst: Ty, Src0: Result, Src1: RHS)
7646 : MIRBuilder.buildLShr(Dst: Ty, Src0: Result, Src1: RHS);
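  // If shifting the result back does not reproduce the original LHS, bits
  // were shifted out and the shift overflowed, so the saturation value is
  // selected below instead of the raw shift result.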
7647
7648 MachineInstrBuilder SatVal;
7649 if (IsSigned) {
7650 auto SatMin = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMinValue(numBits: BW));
7651 auto SatMax = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getSignedMaxValue(numBits: BW));
7652 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_SLT, Res: BoolTy, Op0: LHS,
7653 Op1: MIRBuilder.buildConstant(Res: Ty, Val: 0));
7654 SatVal = MIRBuilder.buildSelect(Res: Ty, Tst: Cmp, Op0: SatMin, Op1: SatMax);
7655 } else {
7656 SatVal = MIRBuilder.buildConstant(Res: Ty, Val: APInt::getMaxValue(numBits: BW));
7657 }
7658 auto Ov = MIRBuilder.buildICmp(Pred: CmpInst::ICMP_NE, Res: BoolTy, Op0: LHS, Op1: Orig);
7659 MIRBuilder.buildSelect(Res, Tst: Ov, Op0: SatVal, Op1: Result);
7660
7661 MI.eraseFromParent();
7662 return Legalized;
7663}
7664
7665LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
7666 auto [Dst, Src] = MI.getFirst2Regs();
7667 const LLT Ty = MRI.getType(Reg: Src);
7668 unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7669 unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
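  // Overall, for an s32 input this builds
  //   %res = (%src << 24) | (%src >> 24)
  //        | ((%src & 0x0000ff00) << 8) | ((%src >> 8) & 0x0000ff00)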
7670
7671 // Swap most and least significant byte, set remaining bytes in Res to zero.
7672 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt);
7673 auto LSByteShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: Src, Src1: ShiftAmt);
7674 auto MSByteShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
7675 auto Res = MIRBuilder.buildOr(Dst: Ty, Src0: MSByteShiftedRight, Src1: LSByteShiftedLeft);
7676
7677 // Set i-th high/low byte in Res to i-th low/high byte from Src.
7678 for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7679 // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7680 APInt APMask = APInt(SizeInBytes * 8, 0xFF).shl(shiftAmt: i * 8);
7681 auto Mask = MIRBuilder.buildConstant(Res: Ty, Val: APMask);
7682 auto ShiftAmt = MIRBuilder.buildConstant(Res: Ty, Val: BaseShiftAmt - 16 * i);
7683 // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7684 auto LoByte = MIRBuilder.buildAnd(Dst: Ty, Src0: Src, Src1: Mask);
7685 auto LoShiftedLeft = MIRBuilder.buildShl(Dst: Ty, Src0: LoByte, Src1: ShiftAmt);
7686 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: LoShiftedLeft);
7687 // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7688 auto SrcShiftedRight = MIRBuilder.buildLShr(Dst: Ty, Src0: Src, Src1: ShiftAmt);
7689 auto HiShiftedRight = MIRBuilder.buildAnd(Dst: Ty, Src0: SrcShiftedRight, Src1: Mask);
7690 Res = MIRBuilder.buildOr(Dst: Ty, Src0: Res, Src1: HiShiftedRight);
7691 }
7692 Res.getInstr()->getOperand(i: 0).setReg(Dst);
7693
7694 MI.eraseFromParent();
7695 return Legalized;
7696}
7697
7698// { (Src & Mask) >> N } | { (Src << N) & Mask }
7699static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7700 MachineInstrBuilder Src, APInt Mask) {
7701 const LLT Ty = Dst.getLLTTy(MRI: *B.getMRI());
7702 MachineInstrBuilder C_N = B.buildConstant(Res: Ty, Val: N);
7703 MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Res: Ty, Val: Mask);
7704 auto LHS = B.buildLShr(Dst: Ty, Src0: B.buildAnd(Dst: Ty, Src0: Src, Src1: MaskLoNTo0), Src1: C_N);
7705 auto RHS = B.buildAnd(Dst: Ty, Src0: B.buildShl(Dst: Ty, Src0: Src, Src1: C_N), Src1: MaskLoNTo0);
7706 return B.buildOr(Dst, Src0: LHS, Src1: RHS);
7707}
7708
7709LegalizerHelper::LegalizeResult
7710LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7711 auto [Dst, Src] = MI.getFirst2Regs();
7712 const LLT Ty = MRI.getType(Reg: Src);
7713 unsigned Size = Ty.getSizeInBits();
7714
7715 MachineInstrBuilder BSWAP =
7716 MIRBuilder.buildInstr(Opc: TargetOpcode::G_BSWAP, DstOps: {Ty}, SrcOps: {Src});
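  // The byte swap above reverses the byte order; the three SwapN stages below
  // then reverse the bits within each byte by swapping 4-, 2- and 1-bit
  // groups in turn.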
7717
7718 // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7719 // [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7720 // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7721 MachineInstrBuilder Swap4 =
7722 SwapN(N: 4, Dst: Ty, B&: MIRBuilder, Src: BSWAP, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xF0)));
7723
7724 // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7725 // [(val & 0xCCCCCCCC) >> 2] | [(val & 0x33333333) << 2]
7726 // -> [(val & 0xCCCCCCCC) >> 2] | [(val << 2) & 0xCCCCCCCC]
7727 MachineInstrBuilder Swap2 =
7728 SwapN(N: 2, Dst: Ty, B&: MIRBuilder, Src: Swap4, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xCC)));
7729
7730 // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7731 // [(val & 0xAAAAAAAA) >> 1] | [(val & 0x55555555) << 1]
7732 // -> [(val & 0xAAAAAAAA) >> 1] | [(val << 1) & 0xAAAAAAAA]
7733 SwapN(N: 1, Dst, B&: MIRBuilder, Src: Swap2, Mask: APInt::getSplat(NewLen: Size, V: APInt(8, 0xAA)));
7734
7735 MI.eraseFromParent();
7736 return Legalized;
7737}
7738
7739LegalizerHelper::LegalizeResult
7740LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7741 MachineFunction &MF = MIRBuilder.getMF();
7742
7743 bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7744 int NameOpIdx = IsRead ? 1 : 0;
7745 int ValRegIndex = IsRead ? 0 : 1;
7746
7747 Register ValReg = MI.getOperand(i: ValRegIndex).getReg();
7748 const LLT Ty = MRI.getType(Reg: ValReg);
7749 const MDString *RegStr = cast<MDString>(
7750 Val: cast<MDNode>(Val: MI.getOperand(i: NameOpIdx).getMetadata())->getOperand(I: 0));
7751
7752 Register PhysReg = TLI.getRegisterByName(RegName: RegStr->getString().data(), Ty, MF);
7753 if (!PhysReg.isValid())
7754 return UnableToLegalize;
7755
7756 if (IsRead)
7757 MIRBuilder.buildCopy(Res: ValReg, Op: PhysReg);
7758 else
7759 MIRBuilder.buildCopy(Res: PhysReg, Op: ValReg);
7760
7761 MI.eraseFromParent();
7762 return Legalized;
7763}
7764
7765LegalizerHelper::LegalizeResult
7766LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7767 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7768 unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7769 Register Result = MI.getOperand(i: 0).getReg();
7770 LLT OrigTy = MRI.getType(Reg: Result);
7771 auto SizeInBits = OrigTy.getScalarSizeInBits();
7772 LLT WideTy = OrigTy.changeElementSize(NewEltSize: SizeInBits * 2);
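  // E.g. an s32 G_SMULH becomes: sign-extend both operands to s64, multiply,
  // arithmetic-shift right by 32, and truncate the high half back to s32.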
7773
7774 auto LHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 1)});
7775 auto RHS = MIRBuilder.buildInstr(Opc: ExtOp, DstOps: {WideTy}, SrcOps: {MI.getOperand(i: 2)});
7776 auto Mul = MIRBuilder.buildMul(Dst: WideTy, Src0: LHS, Src1: RHS);
7777 unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7778
7779 auto ShiftAmt = MIRBuilder.buildConstant(Res: WideTy, Val: SizeInBits);
7780 auto Shifted = MIRBuilder.buildInstr(Opc: ShiftOp, DstOps: {WideTy}, SrcOps: {Mul, ShiftAmt});
7781 MIRBuilder.buildTrunc(Res: Result, Op: Shifted);
7782
7783 MI.eraseFromParent();
7784 return Legalized;
7785}
7786
7787LegalizerHelper::LegalizeResult
7788LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
7789 auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7790 FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(i: 2).getImm());
7791
7792 if (Mask == fcNone) {
7793 MIRBuilder.buildConstant(Res: DstReg, Val: 0);
7794 MI.eraseFromParent();
7795 return Legalized;
7796 }
7797 if (Mask == fcAllFlags) {
7798 MIRBuilder.buildConstant(Res: DstReg, Val: 1);
7799 MI.eraseFromParent();
7800 return Legalized;
7801 }
7802
7803 // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
7804 // version
7805
7806 unsigned BitSize = SrcTy.getScalarSizeInBits();
7807 const fltSemantics &Semantics = getFltSemanticForLLT(Ty: SrcTy.getScalarType());
7808
7809 LLT IntTy = LLT::scalar(SizeInBits: BitSize);
7810 if (SrcTy.isVector())
7811 IntTy = LLT::vector(EC: SrcTy.getElementCount(), ScalarTy: IntTy);
7812 auto AsInt = MIRBuilder.buildCopy(Res: IntTy, Op: SrcReg);
7813
7814 // Various masks.
7815 APInt SignBit = APInt::getSignMask(BitWidth: BitSize);
7816 APInt ValueMask = APInt::getSignedMaxValue(numBits: BitSize); // All bits but sign.
7817 APInt Inf = APFloat::getInf(Sem: Semantics).bitcastToAPInt(); // Exp and int bit.
7818 APInt ExpMask = Inf;
7819 APInt AllOneMantissa = APFloat::getLargest(Sem: Semantics).bitcastToAPInt() & ~Inf;
7820 APInt QNaNBitMask =
7821 APInt::getOneBitSet(numBits: BitSize, BitNo: AllOneMantissa.getActiveBits() - 1);
7822 APInt InvertionMask = APInt::getAllOnes(numBits: DstTy.getScalarSizeInBits());
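  // For f32 these are: SignBit = 0x80000000, ValueMask = 0x7fffffff,
  // Inf/ExpMask = 0x7f800000, AllOneMantissa = 0x007fffff and
  // QNaNBitMask = 0x00400000.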
7823
7824 auto SignBitC = MIRBuilder.buildConstant(Res: IntTy, Val: SignBit);
7825 auto ValueMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ValueMask);
7826 auto InfC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf);
7827 auto ExpMaskC = MIRBuilder.buildConstant(Res: IntTy, Val: ExpMask);
7828 auto ZeroC = MIRBuilder.buildConstant(Res: IntTy, Val: 0);
7829
7830 auto Abs = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ValueMaskC);
7831 auto Sign =
7832 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_NE, Res: DstTy, Op0: AsInt, Op1: Abs);
7833
7834 auto Res = MIRBuilder.buildConstant(Res: DstTy, Val: 0);
7835 // Clang doesn't support capture of structured bindings:
7836 LLT DstTyCopy = DstTy;
7837 const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
7838 Res = MIRBuilder.buildOr(Dst: DstTyCopy, Src0: Res, Src1: ToAppend);
7839 };
7840
7841 // Tests that involve more than one class should be processed first.
7842 if ((Mask & fcFinite) == fcFinite) {
7843 // finite(V) ==> abs(V) u< exp_mask
7844 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
7845 Op1: ExpMaskC));
7846 Mask &= ~fcFinite;
7847 } else if ((Mask & fcFinite) == fcPosFinite) {
7848 // finite(V) && V > 0 ==> V u< exp_mask
7849 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: AsInt,
7850 Op1: ExpMaskC));
7851 Mask &= ~fcPosFinite;
7852 } else if ((Mask & fcFinite) == fcNegFinite) {
7853 // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
7854 auto Cmp = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: Abs,
7855 Op1: ExpMaskC);
7856 auto And = MIRBuilder.buildAnd(Dst: DstTy, Src0: Cmp, Src1: Sign);
7857 appendToRes(And);
7858 Mask &= ~fcNegFinite;
7859 }
7860
7861 if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
7862 // fcZero | fcSubnormal => test all exponent bits are 0
7863 // TODO: Handle sign bit specific cases
7864 // TODO: Handle inverted case
7865 if (PartialCheck == (fcZero | fcSubnormal)) {
7866 auto ExpBits = MIRBuilder.buildAnd(Dst: IntTy, Src0: AsInt, Src1: ExpMaskC);
7867 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
7868 Op0: ExpBits, Op1: ZeroC));
7869 Mask &= ~PartialCheck;
7870 }
7871 }
7872
7873 // Check for individual classes.
7874 if (FPClassTest PartialCheck = Mask & fcZero) {
7875 if (PartialCheck == fcPosZero)
7876 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
7877 Op0: AsInt, Op1: ZeroC));
7878 else if (PartialCheck == fcZero)
7879 appendToRes(
7880 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: ZeroC));
7881 else // fcNegZero
7882 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
7883 Op0: AsInt, Op1: SignBitC));
7884 }
7885
7886 if (FPClassTest PartialCheck = Mask & fcSubnormal) {
7887 // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
7888 // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
7889 auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
7890 auto OneC = MIRBuilder.buildConstant(Res: IntTy, Val: 1);
7891 auto VMinusOne = MIRBuilder.buildSub(Dst: IntTy, Src0: V, Src1: OneC);
7892 auto SubnormalRes =
7893 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: VMinusOne,
7894 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: AllOneMantissa));
7895 if (PartialCheck == fcNegSubnormal)
7896 SubnormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: SubnormalRes, Src1: Sign);
7897 appendToRes(SubnormalRes);
7898 }
7899
7900 if (FPClassTest PartialCheck = Mask & fcInf) {
7901 if (PartialCheck == fcPosInf)
7902 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
7903 Op0: AsInt, Op1: InfC));
7904 else if (PartialCheck == fcInf)
7905 appendToRes(
7906 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy, Op0: Abs, Op1: InfC));
7907 else { // fcNegInf
7908 APInt NegInf = APFloat::getInf(Sem: Semantics, Negative: true).bitcastToAPInt();
7909 auto NegInfC = MIRBuilder.buildConstant(Res: IntTy, Val: NegInf);
7910 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_EQ, Res: DstTy,
7911 Op0: AsInt, Op1: NegInfC));
7912 }
7913 }
7914
7915 if (FPClassTest PartialCheck = Mask & fcNan) {
7916 auto InfWithQnanBitC = MIRBuilder.buildConstant(Res: IntTy, Val: Inf | QNaNBitMask);
7917 if (PartialCheck == fcNan) {
7918 // isnan(V) ==> abs(V) u> int(inf)
7919 appendToRes(
7920 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC));
7921 } else if (PartialCheck == fcQNan) {
7922 // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
7923 appendToRes(MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGE, Res: DstTy, Op0: Abs,
7924 Op1: InfWithQnanBitC));
7925 } else { // fcSNan
7926 // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
7927 // abs(V) u< (unsigned(Inf) | quiet_bit)
7928 auto IsNan =
7929 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_UGT, Res: DstTy, Op0: Abs, Op1: InfC);
7930 auto IsNotQnan = MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy,
7931 Op0: Abs, Op1: InfWithQnanBitC);
7932 appendToRes(MIRBuilder.buildAnd(Dst: DstTy, Src0: IsNan, Src1: IsNotQnan));
7933 }
7934 }
7935
7936 if (FPClassTest PartialCheck = Mask & fcNormal) {
7937 // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
7938 // (max_exp-1))
7939 APInt ExpLSB = ExpMask & ~(ExpMask.shl(shiftAmt: 1));
7940 auto ExpMinusOne = MIRBuilder.buildSub(
7941 Dst: IntTy, Src0: Abs, Src1: MIRBuilder.buildConstant(Res: IntTy, Val: ExpLSB));
7942 APInt MaxExpMinusOne = ExpMask - ExpLSB;
7943 auto NormalRes =
7944 MIRBuilder.buildICmp(Pred: CmpInst::Predicate::ICMP_ULT, Res: DstTy, Op0: ExpMinusOne,
7945 Op1: MIRBuilder.buildConstant(Res: IntTy, Val: MaxExpMinusOne));
7946 if (PartialCheck == fcNegNormal)
7947 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: Sign);
7948 else if (PartialCheck == fcPosNormal) {
7949 auto PosSign = MIRBuilder.buildXor(
7950 Dst: DstTy, Src0: Sign, Src1: MIRBuilder.buildConstant(Res: DstTy, Val: InvertionMask));
7951 NormalRes = MIRBuilder.buildAnd(Dst: DstTy, Src0: NormalRes, Src1: PosSign);
7952 }
7953 appendToRes(NormalRes);
7954 }
7955
7956 MIRBuilder.buildCopy(Res: DstReg, Op: Res);
7957 MI.eraseFromParent();
7958 return Legalized;
7959}
7960
7961LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7962 // Implement G_SELECT in terms of XOR, AND, OR.
7963 auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
7964 MI.getFirst4RegLLTs();
7965
7966 bool IsEltPtr = DstTy.isPointerOrPointerVector();
7967 if (IsEltPtr) {
7968 LLT ScalarPtrTy = LLT::scalar(SizeInBits: DstTy.getScalarSizeInBits());
7969 LLT NewTy = DstTy.changeElementType(NewEltTy: ScalarPtrTy);
7970 Op1Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op1Reg).getReg(Idx: 0);
7971 Op2Reg = MIRBuilder.buildPtrToInt(Dst: NewTy, Src: Op2Reg).getReg(Idx: 0);
7972 DstTy = NewTy;
7973 }
7974
7975 if (MaskTy.isScalar()) {
7976 // Turn the scalar condition into a vector condition mask if needed.
7977
7978 Register MaskElt = MaskReg;
7979
7980 // The condition was potentially zero extended before, but we want a sign
7981 // extended boolean.
7982 if (MaskTy != LLT::scalar(SizeInBits: 1))
7983 MaskElt = MIRBuilder.buildSExtInReg(Res: MaskTy, Op: MaskElt, ImmOp: 1).getReg(Idx: 0);
7984
7985 // Continue the sign extension (or truncate) to match the data type.
7986 MaskElt =
7987 MIRBuilder.buildSExtOrTrunc(Res: DstTy.getScalarType(), Op: MaskElt).getReg(Idx: 0);
7988
7989 if (DstTy.isVector()) {
7990 // Generate a vector splat idiom.
7991 auto ShufSplat = MIRBuilder.buildShuffleSplat(Res: DstTy, Src: MaskElt);
7992 MaskReg = ShufSplat.getReg(Idx: 0);
7993 } else {
7994 MaskReg = MaskElt;
7995 }
7996 MaskTy = DstTy;
7997 } else if (!DstTy.isVector()) {
7998 // Cannot handle the case that mask is a vector and dst is a scalar.
7999 return UnableToLegalize;
8000 }
8001
8002 if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
8003 return UnableToLegalize;
8004 }
8005
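  // Blend the two operands as (op1 & mask) | (op2 & ~mask); the scalar mask
  // was sign-extended above so that each lane is all-ones or all-zeros.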
8006 auto NotMask = MIRBuilder.buildNot(Dst: MaskTy, Src0: MaskReg);
8007 auto NewOp1 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op1Reg, Src1: MaskReg);
8008 auto NewOp2 = MIRBuilder.buildAnd(Dst: MaskTy, Src0: Op2Reg, Src1: NotMask);
8009 if (IsEltPtr) {
8010 auto Or = MIRBuilder.buildOr(Dst: DstTy, Src0: NewOp1, Src1: NewOp2);
8011 MIRBuilder.buildIntToPtr(Dst: DstReg, Src: Or);
8012 } else {
8013 MIRBuilder.buildOr(Dst: DstReg, Src0: NewOp1, Src1: NewOp2);
8014 }
8015 MI.eraseFromParent();
8016 return Legalized;
8017}
8018
8019LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
8020 // Split DIVREM into individual instructions.
8021 unsigned Opcode = MI.getOpcode();
8022
8023 MIRBuilder.buildInstr(
8024 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
8025 : TargetOpcode::G_UDIV,
8026 DstOps: {MI.getOperand(i: 0).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
8027 MIRBuilder.buildInstr(
8028 Opc: Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
8029 : TargetOpcode::G_UREM,
8030 DstOps: {MI.getOperand(i: 1).getReg()}, SrcOps: {MI.getOperand(i: 2), MI.getOperand(i: 3)});
8031 MI.eraseFromParent();
8032 return Legalized;
8033}
8034
8035LegalizerHelper::LegalizeResult
8036LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
8037 // Expand %res = G_ABS %a into:
8038 // %v1 = G_ASHR %a, scalar_size-1
8039 // %v2 = G_ADD %a, %v1
8040 // %res = G_XOR %v2, %v1
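  // E.g. for s32 %a = -5: %v1 = -1, %v2 = -6, and -6 xor -1 = 5. For
  // non-negative %a, %v1 is 0 and the add/xor leave the value unchanged.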
8041 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
8042 Register OpReg = MI.getOperand(i: 1).getReg();
8043 auto ShiftAmt =
8044 MIRBuilder.buildConstant(Res: DstTy, Val: DstTy.getScalarSizeInBits() - 1);
8045 auto Shift = MIRBuilder.buildAShr(Dst: DstTy, Src0: OpReg, Src1: ShiftAmt);
8046 auto Add = MIRBuilder.buildAdd(Dst: DstTy, Src0: OpReg, Src1: Shift);
8047 MIRBuilder.buildXor(Dst: MI.getOperand(i: 0).getReg(), Src0: Add, Src1: Shift);
8048 MI.eraseFromParent();
8049 return Legalized;
8050}
8051
8052LegalizerHelper::LegalizeResult
8053LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
8054 // Expand %res = G_ABS %a into:
8055 // %v1 = G_CONSTANT 0
8056 // %v2 = G_SUB %v1, %a
8057 // %res = G_SMAX %a, %v2
8058 Register SrcReg = MI.getOperand(i: 1).getReg();
8059 LLT Ty = MRI.getType(Reg: SrcReg);
8060 auto Zero = MIRBuilder.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
8061 auto Sub = MIRBuilder.buildSub(Dst: Ty, Src0: Zero, Src1: SrcReg).getReg(Idx: 0);
8062 MIRBuilder.buildSMax(Dst: MI.getOperand(i: 0), Src0: SrcReg, Src1: Sub);
8063 MI.eraseFromParent();
8064 return Legalized;
8065}
8066
8067LegalizerHelper::LegalizeResult
8068LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
8069 Register SrcReg = MI.getOperand(i: 1).getReg();
8070 LLT SrcTy = MRI.getType(Reg: SrcReg);
8071 LLT DstTy = MRI.getType(Reg: MI.getOperand(i: 0).getReg());
8072
8073 // The source could be a scalar if the IR type was <1 x sN>.
8074 if (SrcTy.isScalar()) {
8075 if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
8076 return UnableToLegalize; // FIXME: handle extension.
8077 // This can be just a plain copy.
8078 Observer.changingInstr(MI);
8079 MI.setDesc(MIRBuilder.getTII().get(Opcode: TargetOpcode::COPY));
8080 Observer.changedInstr(MI);
8081 return Legalized;
8082 }
8083 return UnableToLegalize;
8084}
8085
8086static Type *getTypeForLLT(LLT Ty, LLVMContext &C);
8087
8088LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
8089 MachineFunction &MF = *MI.getMF();
8090 const DataLayout &DL = MIRBuilder.getDataLayout();
8091 LLVMContext &Ctx = MF.getFunction().getContext();
8092 Register ListPtr = MI.getOperand(i: 1).getReg();
8093 LLT PtrTy = MRI.getType(Reg: ListPtr);
8094
8095 // ListPtr is a pointer to the head of the list. Load the address of
8096 // the current head of the list.
8097 Align PtrAlignment = DL.getABITypeAlign(Ty: getTypeForLLT(Ty: PtrTy, C&: Ctx));
8098 MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
8099 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: PtrTy, base_alignment: PtrAlignment);
8100 auto VAList = MIRBuilder.buildLoad(Res: PtrTy, Addr: ListPtr, MMO&: *PtrLoadMMO).getReg(Idx: 0);
8101
8102 const Align A(MI.getOperand(i: 2).getImm());
8103 LLT PtrTyAsScalarTy = LLT::scalar(SizeInBits: PtrTy.getSizeInBits());
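  // If the requested alignment exceeds the minimum stack argument alignment,
  // round VAList up: add (align - 1) and then clear the low log2(align) bits.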
8104 if (A > TLI.getMinStackArgumentAlignment()) {
8105 Register AlignAmt =
8106 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: A.value() - 1).getReg(Idx: 0);
8107 auto AddDst = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: AlignAmt);
8108 auto AndDst = MIRBuilder.buildMaskLowPtrBits(Res: PtrTy, Op0: AddDst, NumBits: Log2(A));
8109 VAList = AndDst.getReg(Idx: 0);
8110 }
8111
8112 // Increment the pointer, VAList, to the next vaarg.
8113 // The list should be bumped by the size of the element in the current head
8114 // of the list.
8115 Register Dst = MI.getOperand(i: 0).getReg();
8116 LLT LLTTy = MRI.getType(Reg: Dst);
8117 Type *Ty = getTypeForLLT(Ty: LLTTy, C&: Ctx);
8118 auto IncAmt =
8119 MIRBuilder.buildConstant(Res: PtrTyAsScalarTy, Val: DL.getTypeAllocSize(Ty));
8120 auto Succ = MIRBuilder.buildPtrAdd(Res: PtrTy, Op0: VAList, Op1: IncAmt);
8121
8122 // Store the incremented VAList to the legalized pointer
8123 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
8124 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOStore, MemTy: PtrTy, base_alignment: PtrAlignment);
8125 MIRBuilder.buildStore(Val: Succ, Addr: ListPtr, MMO&: *StoreMMO);
8126 // Load the actual argument out of the pointer VAList
8127 Align EltAlignment = DL.getABITypeAlign(Ty);
8128 MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
8129 PtrInfo: MachinePointerInfo(), f: MachineMemOperand::MOLoad, MemTy: LLTTy, base_alignment: EltAlignment);
8130 MIRBuilder.buildLoad(Res: Dst, Addr: VAList, MMO&: *EltLoadMMO);
8131
8132 MI.eraseFromParent();
8133 return Legalized;
8134}
8135
8136static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
8137 // On Darwin, -Os means optimize for size without hurting performance, so
8138 // only really optimize for size when -Oz (MinSize) is used.
8139 if (MF.getTarget().getTargetTriple().isOSDarwin())
8140 return MF.getFunction().hasMinSize();
8141 return MF.getFunction().hasOptSize();
8142}
8143
8144// Returns a list of types to use for memory op lowering in MemOps. A partial
8145// port of findOptimalMemOpLowering in TargetLowering.
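// For example, a 7-byte copy whose preferred type is s64 typically yields
// MemOps = {s32, s16, s8}, or {s32, s32} with the second access overlapping
// the first when fast misaligned accesses are permitted.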
8146static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
8147 unsigned Limit, const MemOp &Op,
8148 unsigned DstAS, unsigned SrcAS,
8149 const AttributeList &FuncAttributes,
8150 const TargetLowering &TLI) {
8151 if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
8152 return false;
8153
8154 LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
8155
8156 if (Ty == LLT()) {
8157 // Use the largest scalar type whose alignment constraints are satisfied.
8158 // We only need to check DstAlign here as SrcAlign is always greater or
8159 // equal to DstAlign (or zero).
8160 Ty = LLT::scalar(SizeInBits: 64);
8161 if (Op.isFixedDstAlign())
8162 while (Op.getDstAlign() < Ty.getSizeInBytes() &&
8163 !TLI.allowsMisalignedMemoryAccesses(Ty, AddrSpace: DstAS, Alignment: Op.getDstAlign()))
8164 Ty = LLT::scalar(SizeInBits: Ty.getSizeInBytes());
8165 assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
8166 // FIXME: check for the largest legal type we can load/store to.
8167 }
8168
8169 unsigned NumMemOps = 0;
8170 uint64_t Size = Op.size();
8171 while (Size) {
8172 unsigned TySize = Ty.getSizeInBytes();
8173 while (TySize > Size) {
8174 // For now, only use non-vector loads / stores for the left-over pieces.
8175 LLT NewTy = Ty;
8176 // FIXME: check for mem op safety and legality of the types. Not all of
8177 // SDAGisms map cleanly to GISel concepts.
8178 if (NewTy.isVector())
8179 NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(SizeInBits: 64) : LLT::scalar(SizeInBits: 32);
8180 NewTy = LLT::scalar(SizeInBits: llvm::bit_floor(Value: NewTy.getSizeInBits() - 1));
8181 unsigned NewTySize = NewTy.getSizeInBytes();
8182 assert(NewTySize > 0 && "Could not find appropriate type");
8183
8184 // If the new LLT cannot cover all of the remaining bits, then consider
8185 // issuing a (or a pair of) unaligned and overlapping load / store.
8186 unsigned Fast;
8187 // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
8188 MVT VT = getMVTForLLT(Ty);
8189 if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
8190 TLI.allowsMisalignedMemoryAccesses(
8191 VT, AddrSpace: DstAS, Alignment: Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
8192 Flags: MachineMemOperand::MONone, &Fast) &&
8193 Fast)
8194 TySize = Size;
8195 else {
8196 Ty = NewTy;
8197 TySize = NewTySize;
8198 }
8199 }
8200
8201 if (++NumMemOps > Limit)
8202 return false;
8203
8204 MemOps.push_back(x: Ty);
8205 Size -= TySize;
8206 }
8207
8208 return true;
8209}
8210
8211static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
8212 if (Ty.isVector())
8213 return FixedVectorType::get(ElementType: IntegerType::get(C, NumBits: Ty.getScalarSizeInBits()),
8214 NumElts: Ty.getNumElements());
8215 return IntegerType::get(C, NumBits: Ty.getSizeInBits());
8216}
8217
8218// Get a vectorized representation of the memset value operand, GISel edition.
8219static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
8220 MachineRegisterInfo &MRI = *MIB.getMRI();
8221 unsigned NumBits = Ty.getScalarSizeInBits();
8222 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
8223 if (!Ty.isVector() && ValVRegAndVal) {
8224 APInt Scalar = ValVRegAndVal->Value.trunc(width: 8);
8225 APInt SplatVal = APInt::getSplat(NewLen: NumBits, V: Scalar);
8226 return MIB.buildConstant(Res: Ty, Val: SplatVal).getReg(Idx: 0);
8227 }
8228
8229 // Extend the byte value to the larger type, and then multiply by a magic
8230 // value 0x010101... in order to replicate it across every byte.
8231 // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
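  // E.g. a byte value of 0xAB replicated into an s32 becomes
  // 0xAB * 0x01010101 = 0xABABABAB.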
8232 if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
8233 return MIB.buildConstant(Res: Ty, Val: 0).getReg(Idx: 0);
8234 }
8235
8236 LLT ExtType = Ty.getScalarType();
8237 auto ZExt = MIB.buildZExtOrTrunc(Res: ExtType, Op: Val);
8238 if (NumBits > 8) {
8239 APInt Magic = APInt::getSplat(NewLen: NumBits, V: APInt(8, 0x01));
8240 auto MagicMI = MIB.buildConstant(Res: ExtType, Val: Magic);
8241 Val = MIB.buildMul(Dst: ExtType, Src0: ZExt, Src1: MagicMI).getReg(Idx: 0);
8242 }
8243
8244 // For vector types create a G_BUILD_VECTOR.
8245 if (Ty.isVector())
8246 Val = MIB.buildSplatVector(Res: Ty, Src: Val).getReg(Idx: 0);
8247
8248 return Val;
8249}
8250
8251LegalizerHelper::LegalizeResult
8252LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
8253 uint64_t KnownLen, Align Alignment,
8254 bool IsVolatile) {
8255 auto &MF = *MI.getParent()->getParent();
8256 const auto &TLI = *MF.getSubtarget().getTargetLowering();
8257 auto &DL = MF.getDataLayout();
8258 LLVMContext &C = MF.getFunction().getContext();
8259
8260 assert(KnownLen != 0 && "Have a zero length memset length!");
8261
8262 bool DstAlignCanChange = false;
8263 MachineFrameInfo &MFI = MF.getFrameInfo();
8264 bool OptSize = shouldLowerMemFuncForSize(MF);
8265
8266 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
8267 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
8268 DstAlignCanChange = true;
8269
8270 unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
8271 std::vector<LLT> MemOps;
8272
8273 const auto &DstMMO = **MI.memoperands_begin();
8274 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8275
8276 auto ValVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Val, MRI);
8277 bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
8278
8279 if (!findGISelOptimalMemOpLowering(MemOps, Limit,
8280 Op: MemOp::Set(Size: KnownLen, DstAlignCanChange,
8281 DstAlign: Alignment,
8282 /*IsZeroMemset=*/IsZeroVal,
8283 /*IsVolatile=*/IsVolatile),
8284 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: ~0u,
8285 FuncAttributes: MF.getFunction().getAttributes(), TLI))
8286 return UnableToLegalize;
8287
8288 if (DstAlignCanChange) {
8289 // Get an estimate of the type from the LLT.
8290 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
8291 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
8292 if (NewAlign > Alignment) {
8293 Alignment = NewAlign;
8294 unsigned FI = FIDef->getOperand(i: 1).getIndex();
8295 // Give the stack frame object a larger alignment if needed.
8296 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
8297 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
8298 }
8299 }
8300
8301 MachineIRBuilder MIB(MI);
8302 // Find the largest store and generate the bit pattern for it.
8303 LLT LargestTy = MemOps[0];
8304 for (unsigned i = 1; i < MemOps.size(); i++)
8305 if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
8306 LargestTy = MemOps[i];
8307
8308 // The memset stored value is always defined as an s8, so in order to make it
8309 // work with larger store types we need to repeat the bit pattern across the
8310 // wider type.
8311 Register MemSetValue = getMemsetValue(Val, Ty: LargestTy, MIB);
8312
8313 if (!MemSetValue)
8314 return UnableToLegalize;
8315
8316 // Generate the stores. For each store type in the list, we generate the
8317 // matching store of that type to the destination address.
8318 LLT PtrTy = MRI.getType(Reg: Dst);
8319 unsigned DstOff = 0;
8320 unsigned Size = KnownLen;
8321 for (unsigned I = 0; I < MemOps.size(); I++) {
8322 LLT Ty = MemOps[I];
8323 unsigned TySize = Ty.getSizeInBytes();
8324 if (TySize > Size) {
8325 // Issuing an unaligned load / store pair that overlaps with the previous
8326 // pair. Adjust the offset accordingly.
8327 assert(I == MemOps.size() - 1 && I != 0);
8328 DstOff -= TySize - Size;
8329 }
8330
8331 // If this store is smaller than the largest store see whether we can get
8332 // the smaller value for free with a truncate.
8333 Register Value = MemSetValue;
8334 if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
8335 MVT VT = getMVTForLLT(Ty);
8336 MVT LargestVT = getMVTForLLT(Ty: LargestTy);
8337 if (!LargestTy.isVector() && !Ty.isVector() &&
8338 TLI.isTruncateFree(FromVT: LargestVT, ToVT: VT))
8339 Value = MIB.buildTrunc(Res: Ty, Op: MemSetValue).getReg(Idx: 0);
8340 else
8341 Value = getMemsetValue(Val, Ty, MIB);
8342 if (!Value)
8343 return UnableToLegalize;
8344 }
8345
8346 auto *StoreMMO = MF.getMachineMemOperand(MMO: &DstMMO, Offset: DstOff, Ty);
8347
8348 Register Ptr = Dst;
8349 if (DstOff != 0) {
8350 auto Offset =
8351 MIB.buildConstant(Res: LLT::scalar(SizeInBits: PtrTy.getSizeInBits()), Val: DstOff);
8352 Ptr = MIB.buildPtrAdd(Res: PtrTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
8353 }
8354
8355 MIB.buildStore(Val: Value, Addr: Ptr, MMO&: *StoreMMO);
8356 DstOff += Ty.getSizeInBytes();
8357 Size -= TySize;
8358 }
8359
8360 MI.eraseFromParent();
8361 return Legalized;
8362}
8363
8364LegalizerHelper::LegalizeResult
8365LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
8366 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8367
8368 auto [Dst, Src, Len] = MI.getFirst3Regs();
8369
8370 const auto *MMOIt = MI.memoperands_begin();
8371 const MachineMemOperand *MemOp = *MMOIt;
8372 bool IsVolatile = MemOp->isVolatile();
8373
8374 // See if this is a constant length copy
8375 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
8376 // FIXME: support dynamically sized G_MEMCPY_INLINE
8377 assert(LenVRegAndVal &&
8378 "inline memcpy with dynamic size is not yet supported");
8379 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8380 if (KnownLen == 0) {
8381 MI.eraseFromParent();
8382 return Legalized;
8383 }
8384
8385 const auto &DstMMO = **MI.memoperands_begin();
8386 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
8387 Align DstAlign = DstMMO.getBaseAlign();
8388 Align SrcAlign = SrcMMO.getBaseAlign();
8389
8390 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8391 IsVolatile);
8392}
8393
8394LegalizerHelper::LegalizeResult
8395LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
8396 uint64_t KnownLen, Align DstAlign,
8397 Align SrcAlign, bool IsVolatile) {
8398 assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8399 return lowerMemcpy(MI, Dst, Src, KnownLen,
8400 Limit: std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
8401 IsVolatile);
8402}
8403
8404LegalizerHelper::LegalizeResult
8405LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
8406 uint64_t KnownLen, uint64_t Limit, Align DstAlign,
8407 Align SrcAlign, bool IsVolatile) {
8408 auto &MF = *MI.getParent()->getParent();
8409 const auto &TLI = *MF.getSubtarget().getTargetLowering();
8410 auto &DL = MF.getDataLayout();
8411 LLVMContext &C = MF.getFunction().getContext();
8412
8413 assert(KnownLen != 0 && "Have a zero length memcpy length!");
8414
8415 bool DstAlignCanChange = false;
8416 MachineFrameInfo &MFI = MF.getFrameInfo();
8417 Align Alignment = std::min(a: DstAlign, b: SrcAlign);
8418
8419 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
8420 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
8421 DstAlignCanChange = true;
8422
8423 // FIXME: infer better src pointer alignment like SelectionDAG does here.
8424 // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
8425 // if the memcpy is in a tail call position.
8426
8427 std::vector<LLT> MemOps;
8428
8429 const auto &DstMMO = **MI.memoperands_begin();
8430 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
8431 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8432 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8433
8434 if (!findGISelOptimalMemOpLowering(
8435 MemOps, Limit,
8436 Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
8437 IsVolatile),
8438 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
8439 FuncAttributes: MF.getFunction().getAttributes(), TLI))
8440 return UnableToLegalize;
8441
8442 if (DstAlignCanChange) {
8443 // Get an estimate of the type from the LLT.
8444 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
8445 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
8446
8447 // Don't promote to an alignment that would require dynamic stack
8448 // realignment.
8449 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8450 if (!TRI->hasStackRealignment(MF))
8451 while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(Alignment: NewAlign))
8452 NewAlign = NewAlign.previous();
8453
8454 if (NewAlign > Alignment) {
8455 Alignment = NewAlign;
8456 unsigned FI = FIDef->getOperand(i: 1).getIndex();
8457 // Give the stack frame object a larger alignment if needed.
8458 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
8459 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
8460 }
8461 }
8462
8463 LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
8464
8465 MachineIRBuilder MIB(MI);
8466 // Now we need to emit a pair of load and stores for each of the types we've
8467 // collected. I.e. for each type, generate a load from the source pointer of
8468 // that type width, and then generate a corresponding store of the loaded value
8469 // to the dest buffer. This can result in a sequence of loads and stores of
8470 // mixed types, depending on what the target specifies as good types to use.
8471 unsigned CurrOffset = 0;
8472 unsigned Size = KnownLen;
8473 for (auto CopyTy : MemOps) {
8474 // Issuing an unaligned load / store pair that overlaps with the previous
8475 // pair. Adjust the offset accordingly.
8476 if (CopyTy.getSizeInBytes() > Size)
8477 CurrOffset -= CopyTy.getSizeInBytes() - Size;
8478
8479 // Construct MMOs for the accesses.
8480 auto *LoadMMO =
8481 MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
8482 auto *StoreMMO =
8483 MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
8484
8485 // Create the load.
8486 Register LoadPtr = Src;
8487 Register Offset;
8488 if (CurrOffset != 0) {
8489 LLT SrcTy = MRI.getType(Reg: Src);
8490 Offset = MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset)
8491 .getReg(Idx: 0);
8492 LoadPtr = MIB.buildPtrAdd(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
8493 }
8494 auto LdVal = MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO);
8495
8496 // Create the store.
8497 Register StorePtr = Dst;
8498 if (CurrOffset != 0) {
8499 LLT DstTy = MRI.getType(Reg: Dst);
8500 StorePtr = MIB.buildPtrAdd(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
8501 }
8502 MIB.buildStore(Val: LdVal, Addr: StorePtr, MMO&: *StoreMMO);
8503 CurrOffset += CopyTy.getSizeInBytes();
8504 Size -= CopyTy.getSizeInBytes();
8505 }
8506
8507 MI.eraseFromParent();
8508 return Legalized;
8509}
8510
8511LegalizerHelper::LegalizeResult
8512LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
8513 uint64_t KnownLen, Align DstAlign, Align SrcAlign,
8514 bool IsVolatile) {
8515 auto &MF = *MI.getParent()->getParent();
8516 const auto &TLI = *MF.getSubtarget().getTargetLowering();
8517 auto &DL = MF.getDataLayout();
8518 LLVMContext &C = MF.getFunction().getContext();
8519
8520 assert(KnownLen != 0 && "Have a zero length memmove length!");
8521
8522 bool DstAlignCanChange = false;
8523 MachineFrameInfo &MFI = MF.getFrameInfo();
8524 bool OptSize = shouldLowerMemFuncForSize(MF);
8525 Align Alignment = std::min(a: DstAlign, b: SrcAlign);
8526
8527 MachineInstr *FIDef = getOpcodeDef(Opcode: TargetOpcode::G_FRAME_INDEX, Reg: Dst, MRI);
8528 if (FIDef && !MFI.isFixedObjectIndex(ObjectIdx: FIDef->getOperand(i: 1).getIndex()))
8529 DstAlignCanChange = true;
8530
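// The target bounds how many store instructions it is willing to emit when
// expanding a memmove inline.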
8531 unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
8532 std::vector<LLT> MemOps;
8533
8534 const auto &DstMMO = **MI.memoperands_begin();
8535 const auto &SrcMMO = **std::next(x: MI.memoperands_begin());
8536 MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8537 MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8538
8539 // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
8540 // to a bug in its findOptimalMemOpLowering implementation. For now do the
8541 // same thing here.
8542 if (!findGISelOptimalMemOpLowering(
8543 MemOps, Limit,
8544 Op: MemOp::Copy(Size: KnownLen, DstAlignCanChange, DstAlign: Alignment, SrcAlign,
8545 /*IsVolatile*/ true),
8546 DstAS: DstPtrInfo.getAddrSpace(), SrcAS: SrcPtrInfo.getAddrSpace(),
8547 FuncAttributes: MF.getFunction().getAttributes(), TLI))
8548 return UnableToLegalize;
8549
8550 if (DstAlignCanChange) {
8551 // Get an estimate of the type from the LLT.
8552 Type *IRTy = getTypeForLLT(Ty: MemOps[0], C);
8553 Align NewAlign = DL.getABITypeAlign(Ty: IRTy);
8554
8555 // Don't promote to an alignment that would require dynamic stack
8556 // realignment.
8557 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8558 if (!TRI->hasStackRealignment(MF))
8559 while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(Alignment: NewAlign))
8560 NewAlign = NewAlign.previous();
8561
8562 if (NewAlign > Alignment) {
8563 Alignment = NewAlign;
8564 unsigned FI = FIDef->getOperand(i: 1).getIndex();
8565 // Give the stack frame object a larger alignment if needed.
8566 if (MFI.getObjectAlign(ObjectIdx: FI) < Alignment)
8567 MFI.setObjectAlignment(ObjectIdx: FI, Alignment);
8568 }
8569 }
8570
8571 LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
8572
8573 MachineIRBuilder MIB(MI);
8574 // Memmove requires that all of the loads are emitted before any of the
8575 // stores, since the source and destination may overlap. Apart from that,
8576 // this loop does the same thing as the memcpy codegen function.
8577 unsigned CurrOffset = 0;
8578 SmallVector<Register, 16> LoadVals;
8579 for (auto CopyTy : MemOps) {
8580 // Construct MMO for the load.
8581 auto *LoadMMO =
8582 MF.getMachineMemOperand(MMO: &SrcMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
8583
8584 // Create the load.
8585 Register LoadPtr = Src;
8586 if (CurrOffset != 0) {
8587 LLT SrcTy = MRI.getType(Reg: Src);
8588 auto Offset =
8589 MIB.buildConstant(Res: LLT::scalar(SizeInBits: SrcTy.getSizeInBits()), Val: CurrOffset);
8590 LoadPtr = MIB.buildPtrAdd(Res: SrcTy, Op0: Src, Op1: Offset).getReg(Idx: 0);
8591 }
8592 LoadVals.push_back(Elt: MIB.buildLoad(Res: CopyTy, Addr: LoadPtr, MMO&: *LoadMMO).getReg(Idx: 0));
8593 CurrOffset += CopyTy.getSizeInBytes();
8594 }
8595
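// Second pass: emit the stores, replaying the same sequence of offsets.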
8596 CurrOffset = 0;
8597 for (unsigned I = 0; I < MemOps.size(); ++I) {
8598 LLT CopyTy = MemOps[I];
8599 // Now store the values loaded.
8600 auto *StoreMMO =
8601 MF.getMachineMemOperand(MMO: &DstMMO, Offset: CurrOffset, Size: CopyTy.getSizeInBytes());
8602
8603 Register StorePtr = Dst;
8604 if (CurrOffset != 0) {
8605 LLT DstTy = MRI.getType(Reg: Dst);
8606 auto Offset =
8607 MIB.buildConstant(Res: LLT::scalar(SizeInBits: DstTy.getSizeInBits()), Val: CurrOffset);
8608 StorePtr = MIB.buildPtrAdd(Res: DstTy, Op0: Dst, Op1: Offset).getReg(Idx: 0);
8609 }
8610 MIB.buildStore(Val: LoadVals[I], Addr: StorePtr, MMO&: *StoreMMO);
8611 CurrOffset += CopyTy.getSizeInBytes();
8612 }
8613 MI.eraseFromParent();
8614 return Legalized;
8615}
8616
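// Lower a G_MEMCPY, G_MEMMOVE or G_MEMSET with a known constant length by
// dispatching to the opcode-specific inline expansions above. Illustrative
// sketch only (not part of this file): a target can opt into this path by
// marking the opcodes as lowered in its LegalizerInfo rules, e.g.
//
//   getActionDefinitionsBuilder({TargetOpcode::G_MEMCPY,
//                                TargetOpcode::G_MEMMOVE,
//                                TargetOpcode::G_MEMSET})
//       .lower();
//
// It can also be reached from the pre-legalizer combiner for constant-length
// calls that fall under the inlining threshold.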
8617LegalizerHelper::LegalizeResult
8618LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
8619 const unsigned Opc = MI.getOpcode();
8620 // This combine is fairly complex so it's not written with a separate
8621 // matcher function.
8622 assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
8623 Opc == TargetOpcode::G_MEMSET) &&
8624 "Expected memcpy like instruction");
8625
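// G_MEMSET carries a single memoperand describing the destination; G_MEMCPY
// and G_MEMMOVE carry the destination memoperand first and the source second.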
8626 auto MMOIt = MI.memoperands_begin();
8627 const MachineMemOperand *MemOp = *MMOIt;
8628
8629 Align DstAlign = MemOp->getBaseAlign();
8630 Align SrcAlign;
8631 auto [Dst, Src, Len] = MI.getFirst3Regs();
8632
8633 if (Opc != TargetOpcode::G_MEMSET) {
8634 assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
8635 MemOp = *(++MMOIt);
8636 SrcAlign = MemOp->getBaseAlign();
8637 }
8638
8639 // See if this is a constant-length copy.
8640 auto LenVRegAndVal = getIConstantVRegValWithLookThrough(VReg: Len, MRI);
8641 if (!LenVRegAndVal)
8642 return UnableToLegalize;
8643 uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8644
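// A zero-length operation has no effect; just delete the instruction.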
8645 if (KnownLen == 0) {
8646 MI.eraseFromParent();
8647 return Legalized;
8648 }
8649
8650 bool IsVolatile = MemOp->isVolatile();
8651 if (Opc == TargetOpcode::G_MEMCPY_INLINE)
8652 return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8653 IsVolatile);
8654
8655 // Don't try to optimize volatile.
8656 if (IsVolatile)
8657 return UnableToLegalize;
8658
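// Respect the caller-imposed cap on the constant lengths we are willing to
// expand inline.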
8659 if (MaxLen && KnownLen > MaxLen)
8660 return UnableToLegalize;
8661
8662 if (Opc == TargetOpcode::G_MEMCPY) {
8663 auto &MF = *MI.getParent()->getParent();
8664 const auto &TLI = *MF.getSubtarget().getTargetLowering();
8665 bool OptSize = shouldLowerMemFuncForSize(MF);
8666 uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
8667 return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
8668 IsVolatile);
8669 }
8670 if (Opc == TargetOpcode::G_MEMMOVE)
8671 return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
8672 if (Opc == TargetOpcode::G_MEMSET)
8673 return lowerMemset(MI, Dst, Val: Src, KnownLen, Alignment: DstAlign, IsVolatile);
8674 return UnableToLegalize;
8675}
8676
