1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
15#include "MCTargetDesc/NVPTXBaseInfo.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
19#include "NVPTXTargetObjectFile.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/STLExtras.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/StringRef.h"
25#include "llvm/CodeGen/Analysis.h"
26#include "llvm/CodeGen/ISDOpcodes.h"
27#include "llvm/CodeGen/MachineFunction.h"
28#include "llvm/CodeGen/MachineMemOperand.h"
29#include "llvm/CodeGen/SelectionDAG.h"
30#include "llvm/CodeGen/SelectionDAGNodes.h"
31#include "llvm/CodeGen/TargetCallingConv.h"
32#include "llvm/CodeGen/TargetLowering.h"
33#include "llvm/CodeGen/ValueTypes.h"
34#include "llvm/CodeGenTypes/MachineValueType.h"
35#include "llvm/IR/Argument.h"
36#include "llvm/IR/Attributes.h"
37#include "llvm/IR/Constants.h"
38#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/DerivedTypes.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/FPEnv.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/GlobalValue.h"
44#include "llvm/IR/Instruction.h"
45#include "llvm/IR/Instructions.h"
46#include "llvm/IR/IntrinsicsNVPTX.h"
47#include "llvm/IR/Module.h"
48#include "llvm/IR/Type.h"
49#include "llvm/IR/Value.h"
50#include "llvm/Support/Alignment.h"
51#include "llvm/Support/Casting.h"
52#include "llvm/Support/CodeGen.h"
53#include "llvm/Support/CommandLine.h"
54#include "llvm/Support/ErrorHandling.h"
55#include "llvm/Support/raw_ostream.h"
56#include "llvm/Target/TargetMachine.h"
57#include "llvm/Target/TargetOptions.h"
58#include <algorithm>
59#include <cassert>
60#include <cmath>
61#include <cstdint>
62#include <iterator>
63#include <optional>
64#include <sstream>
65#include <string>
66#include <utility>
67#include <vector>
68
69#define DEBUG_TYPE "nvptx-lower"
70
71using namespace llvm;
72
73static std::atomic<unsigned> GlobalUniqueCallSite;
74
75static cl::opt<bool> sched4reg(
76 "nvptx-sched4reg",
77 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(Val: false));
78
79static cl::opt<unsigned> FMAContractLevelOpt(
80 "nvptx-fma-level", cl::Hidden,
81 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
82 " 1: do it 2: do it aggressively"),
83 cl::init(Val: 2));
84
85static cl::opt<int> UsePrecDivF32(
86 "nvptx-prec-divf32", cl::Hidden,
87 cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
88 " IEEE Compliant F32 div.rnd if available."),
89 cl::init(Val: 2));
90
91static cl::opt<bool> UsePrecSqrtF32(
92 "nvptx-prec-sqrtf32", cl::Hidden,
93 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
94 cl::init(Val: true));
95
96static cl::opt<bool> ForceMinByValParamAlign(
97 "nvptx-force-min-byval-param-align", cl::Hidden,
98 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
99 " params of device functions."),
100 cl::init(Val: false));
101
102int NVPTXTargetLowering::getDivF32Level() const {
103 if (UsePrecDivF32.getNumOccurrences() > 0) {
104 // If nvptx-prec-div32=N is used on the command-line, always honor it
105 return UsePrecDivF32;
106 } else {
107 // Otherwise, use div.approx if fast math is enabled
108 if (getTargetMachine().Options.UnsafeFPMath)
109 return 0;
110 else
111 return 2;
112 }
113}
114
115bool NVPTXTargetLowering::usePrecSqrtF32() const {
116 if (UsePrecSqrtF32.getNumOccurrences() > 0) {
117 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
118 return UsePrecSqrtF32;
119 } else {
120 // Otherwise, use sqrt.approx if fast math is enabled
121 return !getTargetMachine().Options.UnsafeFPMath;
122 }
123}
124
125bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
126 return MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Output ==
127 DenormalMode::PreserveSign;
128}
129
130static bool IsPTXVectorType(MVT VT) {
131 switch (VT.SimpleTy) {
132 default:
133 return false;
134 case MVT::v2i1:
135 case MVT::v4i1:
136 case MVT::v2i8:
137 case MVT::v4i8:
138 case MVT::v2i16:
139 case MVT::v4i16:
140 case MVT::v8i16: // <4 x i16x2>
141 case MVT::v2i32:
142 case MVT::v4i32:
143 case MVT::v2i64:
144 case MVT::v2f16:
145 case MVT::v4f16:
146 case MVT::v8f16: // <4 x f16x2>
147 case MVT::v2bf16:
148 case MVT::v4bf16:
149 case MVT::v8bf16: // <4 x bf16x2>
150 case MVT::v2f32:
151 case MVT::v4f32:
152 case MVT::v2f64:
153 return true;
154 }
155}
156
157static bool Is16bitsType(MVT VT) {
158 return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
159 VT.SimpleTy == MVT::i16);
160}
161
162/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
163/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
164/// into their primitive components.
165/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
166/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
167/// LowerCall, and LowerReturn.
168static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
169 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
170 SmallVectorImpl<uint64_t> *Offsets = nullptr,
171 uint64_t StartingOffset = 0) {
172 SmallVector<EVT, 16> TempVTs;
173 SmallVector<uint64_t, 16> TempOffsets;
174
175 // Special case for i128 - decompose to (i64, i64)
176 if (Ty->isIntegerTy(Bitwidth: 128)) {
177 ValueVTs.push_back(EVT(MVT::i64));
178 ValueVTs.push_back(EVT(MVT::i64));
179
180 if (Offsets) {
181 Offsets->push_back(Elt: StartingOffset + 0);
182 Offsets->push_back(Elt: StartingOffset + 8);
183 }
184
185 return;
186 }
187
188 // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
189 if (StructType *STy = dyn_cast<StructType>(Val: Ty)) {
190 auto const *SL = DL.getStructLayout(Ty: STy);
191 auto ElementNum = 0;
192 for(auto *EI : STy->elements()) {
193 ComputePTXValueVTs(TLI, DL, Ty: EI, ValueVTs, Offsets,
194 StartingOffset: StartingOffset + SL->getElementOffset(Idx: ElementNum));
195 ++ElementNum;
196 }
197 return;
198 }
199
200 ComputeValueVTs(TLI, DL, Ty, ValueVTs&: TempVTs, FixedOffsets: &TempOffsets, StartingOffset);
201 for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
202 EVT VT = TempVTs[i];
203 uint64_t Off = TempOffsets[i];
204 // Split vectors into individual elements, except for v2f16, which
205 // we will pass as a single scalar.
206 if (VT.isVector()) {
207 unsigned NumElts = VT.getVectorNumElements();
208 EVT EltVT = VT.getVectorElementType();
209 // Vectors with an even number of f16 elements will be passed to
210 // us as an array of v2f16/v2bf16 elements. We must match this so we
211 // stay in sync with Ins/Outs.
212 if ((Is16bitsType(VT: EltVT.getSimpleVT())) && NumElts % 2 == 0) {
213 switch (EltVT.getSimpleVT().SimpleTy) {
214 case MVT::f16:
215 EltVT = MVT::v2f16;
216 break;
217 case MVT::bf16:
218 EltVT = MVT::v2bf16;
219 break;
220 case MVT::i16:
221 EltVT = MVT::v2i16;
222 break;
223 default:
224 llvm_unreachable("Unexpected type");
225 }
226 NumElts /= 2;
227 } else if (EltVT.getSimpleVT() == MVT::i8 &&
228 (NumElts % 4 == 0 || NumElts == 3)) {
229 // v*i8 are formally lowered as v4i8
230 EltVT = MVT::v4i8;
231 NumElts = (NumElts + 3) / 4;
232 }
233 for (unsigned j = 0; j != NumElts; ++j) {
234 ValueVTs.push_back(Elt: EltVT);
235 if (Offsets)
236 Offsets->push_back(Elt: Off + j * EltVT.getStoreSize());
237 }
238 } else {
239 ValueVTs.push_back(Elt: VT);
240 if (Offsets)
241 Offsets->push_back(Elt: Off);
242 }
243 }
244}
245
246/// PromoteScalarIntegerPTX
247/// Used to make sure the arguments/returns are suitable for passing
248/// and promote them to a larger size if they're not.
249///
250/// The promoted type is placed in \p PromoteVT if the function returns true.
251static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
252 if (VT.isScalarInteger()) {
253 switch (PowerOf2Ceil(A: VT.getFixedSizeInBits())) {
254 default:
255 llvm_unreachable(
256 "Promotion is not suitable for scalars of size larger than 64-bits");
257 case 1:
258 *PromotedVT = MVT::i1;
259 break;
260 case 2:
261 case 4:
262 case 8:
263 *PromotedVT = MVT::i8;
264 break;
265 case 16:
266 *PromotedVT = MVT::i16;
267 break;
268 case 32:
269 *PromotedVT = MVT::i32;
270 break;
271 case 64:
272 *PromotedVT = MVT::i64;
273 break;
274 }
275 return EVT(*PromotedVT) != VT;
276 }
277 return false;
278}
279
280// Check whether we can merge loads/stores of some of the pieces of a
281// flattened function parameter or return value into a single vector
282// load/store.
283//
284// The flattened parameter is represented as a list of EVTs and
285// offsets, and the whole structure is aligned to ParamAlignment. This
286// function determines whether we can load/store pieces of the
287// parameter starting at index Idx using a single vectorized op of
288// size AccessSize. If so, it returns the number of param pieces
289// covered by the vector op. Otherwise, it returns 1.
290static unsigned CanMergeParamLoadStoresStartingAt(
291 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
292 const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
293
294 // Can't vectorize if param alignment is not sufficient.
295 if (ParamAlignment < AccessSize)
296 return 1;
297 // Can't vectorize if offset is not aligned.
298 if (Offsets[Idx] & (AccessSize - 1))
299 return 1;
300
301 EVT EltVT = ValueVTs[Idx];
302 unsigned EltSize = EltVT.getStoreSize();
303
304 // Element is too large to vectorize.
305 if (EltSize >= AccessSize)
306 return 1;
307
308 unsigned NumElts = AccessSize / EltSize;
309 // Can't vectorize if AccessBytes if not a multiple of EltSize.
310 if (AccessSize != EltSize * NumElts)
311 return 1;
312
313 // We don't have enough elements to vectorize.
314 if (Idx + NumElts > ValueVTs.size())
315 return 1;
316
317 // PTX ISA can only deal with 2- and 4-element vector ops.
318 if (NumElts != 4 && NumElts != 2)
319 return 1;
320
321 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
322 // Types do not match.
323 if (ValueVTs[j] != EltVT)
324 return 1;
325
326 // Elements are not contiguous.
327 if (Offsets[j] - Offsets[j - 1] != EltSize)
328 return 1;
329 }
330 // OK. We can vectorize ValueVTs[i..i+NumElts)
331 return NumElts;
332}
333
334// Flags for tracking per-element vectorization state of loads/stores
335// of a flattened function parameter or return value.
336enum ParamVectorizationFlags {
337 PVF_INNER = 0x0, // Middle elements of a vector.
338 PVF_FIRST = 0x1, // First element of the vector.
339 PVF_LAST = 0x2, // Last element of the vector.
340 // Scalar is effectively a 1-element vector.
341 PVF_SCALAR = PVF_FIRST | PVF_LAST
342};
343
344// Computes whether and how we can vectorize the loads/stores of a
345// flattened function parameter or return value.
346//
347// The flattened parameter is represented as the list of ValueVTs and
348// Offsets, and is aligned to ParamAlignment bytes. We return a vector
349// of the same size as ValueVTs indicating how each piece should be
350// loaded/stored (i.e. as a scalar, or as part of a vector
351// load/store).
352static SmallVector<ParamVectorizationFlags, 16>
353VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
354 const SmallVectorImpl<uint64_t> &Offsets,
355 Align ParamAlignment, bool IsVAArg = false) {
356 // Set vector size to match ValueVTs and mark all elements as
357 // scalars by default.
358 SmallVector<ParamVectorizationFlags, 16> VectorInfo;
359 VectorInfo.assign(NumElts: ValueVTs.size(), Elt: PVF_SCALAR);
360
361 if (IsVAArg)
362 return VectorInfo;
363
364 // Check what we can vectorize using 128/64/32-bit accesses.
365 for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
366 // Skip elements we've already processed.
367 assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
368 for (unsigned AccessSize : {16, 8, 4, 2}) {
369 unsigned NumElts = CanMergeParamLoadStoresStartingAt(
370 Idx: I, AccessSize, ValueVTs, Offsets, ParamAlignment);
371 // Mark vectorized elements.
372 switch (NumElts) {
373 default:
374 llvm_unreachable("Unexpected return value");
375 case 1:
376 // Can't vectorize using this size, try next smaller size.
377 continue;
378 case 2:
379 assert(I + 1 < E && "Not enough elements.");
380 VectorInfo[I] = PVF_FIRST;
381 VectorInfo[I + 1] = PVF_LAST;
382 I += 1;
383 break;
384 case 4:
385 assert(I + 3 < E && "Not enough elements.");
386 VectorInfo[I] = PVF_FIRST;
387 VectorInfo[I + 1] = PVF_INNER;
388 VectorInfo[I + 2] = PVF_INNER;
389 VectorInfo[I + 3] = PVF_LAST;
390 I += 3;
391 break;
392 }
393 // Break out of the inner loop because we've already succeeded
394 // using largest possible AccessSize.
395 break;
396 }
397 }
398 return VectorInfo;
399}
400
401// NVPTXTargetLowering Constructor.
402NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
403 const NVPTXSubtarget &STI)
404 : TargetLowering(TM), nvTM(&TM), STI(STI) {
405 // always lower memset, memcpy, and memmove intrinsics to load/store
406 // instructions, rather
407 // then generating calls to memset, mempcy or memmove.
408 MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
409 MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF;
410 MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF;
411
412 setBooleanContents(ZeroOrNegativeOneBooleanContent);
413 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
414
415 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
416 // condition branches.
417 setJumpIsExpensive(true);
418
419 // Wide divides are _very_ slow. Try to reduce the width of the divide if
420 // possible.
421 addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32);
422
423 // By default, use the Source scheduling
424 if (sched4reg)
425 setSchedulingPreference(Sched::RegPressure);
426 else
427 setSchedulingPreference(Sched::Source);
428
429 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
430 LegalizeAction NoF16Action) {
431 setOperationAction(Op, VT, Action: STI.allowFP16Math() ? Action : NoF16Action);
432 };
433
434 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
435 LegalizeAction NoBF16Action) {
436 bool IsOpSupported = STI.hasBF16Math();
437 // Few instructions are available on sm_90 only
438 switch(Op) {
439 case ISD::FADD:
440 case ISD::FMUL:
441 case ISD::FSUB:
442 case ISD::SELECT:
443 case ISD::SELECT_CC:
444 case ISD::SETCC:
445 case ISD::FEXP2:
446 case ISD::FCEIL:
447 case ISD::FFLOOR:
448 case ISD::FNEARBYINT:
449 case ISD::FRINT:
450 case ISD::FROUNDEVEN:
451 case ISD::FTRUNC:
452 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78;
453 break;
454 }
455 setOperationAction(
456 Op, VT, Action: IsOpSupported ? Action : NoBF16Action);
457 };
458
459 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
460 LegalizeAction NoI16x2Action) {
461 bool IsOpSupported = false;
462 // instructions are available on sm_90 only
463 switch (Op) {
464 case ISD::ADD:
465 case ISD::SMAX:
466 case ISD::SMIN:
467 case ISD::UMIN:
468 case ISD::UMAX:
469 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
470 break;
471 }
472 setOperationAction(Op, VT, Action: IsOpSupported ? Action : NoI16x2Action);
473 };
474
475 addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
476 addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
477 addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
478 addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
479 addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
480 addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
481 addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
482 addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
483 addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
484 addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
485 addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
486 addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);
487
488 // Conversion to/from FP16/FP16x2 is always legal.
489 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
490 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
491 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
492 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
493
494 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
495 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
496 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
497
498 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
499 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
500
501 // Conversion to/from BFP16/BFP16x2 is always legal.
502 setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
503 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
505 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);
506
507 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
508 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
509 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
510 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
511
512 // Conversion to/from i16/i16x2 is always legal.
513 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
514 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
515 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
516 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);
517
518 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
519 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
520 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
521 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
522 // Only logical ops can be done on v4i8 directly, others must be done
523 // elementwise.
524 setOperationAction(
525 {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE,
526 ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ,
527 ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR,
528 ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY,
529 ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY,
530 ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC,
531 ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX,
532 ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA,
533 ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO,
534 ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC,
535 ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT,
536 ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX,
537 ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM,
538 ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT,
539 ISD::USUBSAT},
540 MVT::v4i8, Expand);
541
542 // Operations not directly supported by NVPTX.
543 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
544 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
545 MVT::i32, MVT::i64}) {
546 setOperationAction(ISD::SELECT_CC, VT, Expand);
547 setOperationAction(ISD::BR_CC, VT, Expand);
548 }
549
550 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
551 // For others we will expand to a SHL/SRA pair.
552 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
553 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
554 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
555 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
556 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
557 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
558
559 setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom);
560 setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom);
561 setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom);
562 setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom);
563 setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom);
564 setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom);
565
566 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
567 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
568
569 // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
570 // that don't have h/w rotation we lower them to multi-instruction assembly.
571 // See ROT*_sw in NVPTXIntrInfo.td
572 setOperationAction(ISD::ROTL, MVT::i64, Legal);
573 setOperationAction(ISD::ROTR, MVT::i64, Legal);
574 setOperationAction(ISD::ROTL, MVT::i32, Legal);
575 setOperationAction(ISD::ROTR, MVT::i32, Legal);
576
577 setOperationAction(ISD::ROTL, MVT::i16, Expand);
578 setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
579 setOperationAction(ISD::ROTR, MVT::i16, Expand);
580 setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
581 setOperationAction(ISD::ROTL, MVT::i8, Expand);
582 setOperationAction(ISD::ROTR, MVT::i8, Expand);
583 setOperationAction(ISD::BSWAP, MVT::i16, Expand);
584
585 // Indirect branch is not supported.
586 // This also disables Jump Table creation.
587 setOperationAction(ISD::BR_JT, MVT::Other, Expand);
588 setOperationAction(ISD::BRIND, MVT::Other, Expand);
589
590 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
591 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
592
593 // We want to legalize constant related memmove and memcopy
594 // intrinsics.
595 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
596
597 // Turn FP extload into load/fpextend
598 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
599 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
600 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
601 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
602 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
603 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
604 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
605 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
606 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
607 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
608 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
609 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
610 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
611 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
612 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
613 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
614 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
615 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
616 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
617 // Turn FP truncstore into trunc + store.
618 // FIXME: vector types should also be expanded
619 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
620 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
621 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
622 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
623 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
624
625 // PTX does not support load / store predicate registers
626 setOperationAction(ISD::LOAD, MVT::i1, Custom);
627 setOperationAction(ISD::STORE, MVT::i1, Custom);
628
629 for (MVT VT : MVT::integer_valuetypes()) {
630 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
631 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
632 setTruncStoreAction(VT, MVT::i1, Expand);
633 }
634
635 // expand extload of vector of integers.
636 setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
637 MVT::v2i8, Expand);
638 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
639
640 // This is legal in NVPTX
641 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
642 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
643 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
644 setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
645
646 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
647 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
648
649 // TRAP can be lowered to PTX trap
650 setOperationAction(ISD::TRAP, MVT::Other, Legal);
651
652 // Register custom handling for vector loads/stores
653 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
654 if (IsPTXVectorType(VT)) {
655 setOperationAction(ISD::LOAD, VT, Custom);
656 setOperationAction(ISD::STORE, VT, Custom);
657 setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
658 }
659 }
660
661 // Support varargs.
662 setOperationAction(ISD::VASTART, MVT::Other, Custom);
663 setOperationAction(ISD::VAARG, MVT::Other, Custom);
664 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
665 setOperationAction(ISD::VAEND, MVT::Other, Expand);
666
667 // Custom handling for i8 intrinsics
668 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
669
670 for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
671 setOperationAction(ISD::ABS, Ty, Legal);
672 setOperationAction(ISD::SMIN, Ty, Legal);
673 setOperationAction(ISD::SMAX, Ty, Legal);
674 setOperationAction(ISD::UMIN, Ty, Legal);
675 setOperationAction(ISD::UMAX, Ty, Legal);
676
677 setOperationAction(ISD::CTPOP, Ty, Legal);
678 setOperationAction(ISD::CTLZ, Ty, Legal);
679 }
680
681 setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
682 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
683 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
684 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
685 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
686 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
687 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
688
689 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
690 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
691 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
692 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
693 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
694 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
695
696 // Other arithmetic and logic ops are unsupported.
697 setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
698 ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
699 ISD::SINT_TO_FP, ISD::UINT_TO_FP},
700 MVT::v2i16, Expand);
701
702 setOperationAction(ISD::ADDC, MVT::i32, Legal);
703 setOperationAction(ISD::ADDE, MVT::i32, Legal);
704 setOperationAction(ISD::SUBC, MVT::i32, Legal);
705 setOperationAction(ISD::SUBE, MVT::i32, Legal);
706 if (STI.getPTXVersion() >= 43) {
707 setOperationAction(ISD::ADDC, MVT::i64, Legal);
708 setOperationAction(ISD::ADDE, MVT::i64, Legal);
709 setOperationAction(ISD::SUBC, MVT::i64, Legal);
710 setOperationAction(ISD::SUBE, MVT::i64, Legal);
711 }
712
713 setOperationAction(ISD::CTTZ, MVT::i16, Expand);
714 setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
715 setOperationAction(ISD::CTTZ, MVT::i32, Expand);
716 setOperationAction(ISD::CTTZ, MVT::i64, Expand);
717
718 // PTX does not directly support SELP of i1, so promote to i32 first
719 setOperationAction(ISD::SELECT, MVT::i1, Custom);
720
721 // PTX cannot multiply two i64s in a single instruction.
722 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
723 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
724
725 // We have some custom DAG combine patterns for these nodes
726 setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
727 ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
728 ISD::VSELECT});
729
730 // setcc for f16x2 and bf16x2 needs special handling to prevent
731 // legalizer's attempt to scalarize it due to v2i1 not being legal.
732 if (STI.allowFP16Math() || STI.hasBF16Math())
733 setTargetDAGCombine(ISD::SETCC);
734
735 // Promote fp16 arithmetic if fp16 hardware isn't available or the
736 // user passed --nvptx-no-fp16-math. The flag is useful because,
737 // although sm_53+ GPUs have some sort of FP16 support in
738 // hardware, only sm_53 and sm_60 have full implementation. Others
739 // only have token amount of hardware and are likely to run faster
740 // by using fp32 units instead.
741 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
742 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
743 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
744 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
745 // bf16 must be promoted to f32.
746 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
747 if (getOperationAction(Op, MVT::bf16) == Promote)
748 AddPromotedToType(Op, MVT::bf16, MVT::f32);
749 }
750
751 // f16/f16x2 neg was introduced in PTX 60, SM_53.
752 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
753 STI.getPTXVersion() >= 60 &&
754 STI.allowFP16Math();
755 for (const auto &VT : {MVT::f16, MVT::v2f16})
756 setOperationAction(ISD::FNEG, VT,
757 IsFP16FP16x2NegAvailable ? Legal : Expand);
758
759 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
760 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
761 // (would be) Library functions.
762
763 // These map to conversion instructions for scalar FP types.
764 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
765 ISD::FROUNDEVEN, ISD::FTRUNC}) {
766 setOperationAction(Op, MVT::f16, Legal);
767 setOperationAction(Op, MVT::f32, Legal);
768 setOperationAction(Op, MVT::f64, Legal);
769 setOperationAction(Op, MVT::v2f16, Expand);
770 setOperationAction(Op, MVT::v2bf16, Expand);
771 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
772 if (getOperationAction(Op, MVT::bf16) == Promote)
773 AddPromotedToType(Op, MVT::bf16, MVT::f32);
774 }
775
776 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
777 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
778 }
779 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
780 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
781 setOperationAction(ISD::FP_EXTEND, VT, Custom);
782 setOperationAction(ISD::FP_ROUND, VT, Custom);
783 }
784 }
785
786 // sm_80 only has conversions between f32 and bf16. Custom lower all other
787 // bf16 conversions.
788 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
789 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
790 setOperationAction(
791 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
792 VT, Custom);
793 }
794 setOperationAction(
795 {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
796 MVT::bf16, Custom);
797 }
798
799 setOperationAction(ISD::FROUND, MVT::f16, Promote);
800 setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
801 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
802 setOperationAction(ISD::FROUND, MVT::f32, Custom);
803 setOperationAction(ISD::FROUND, MVT::f64, Custom);
804 setOperationAction(ISD::FROUND, MVT::bf16, Promote);
805 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
806
807 // 'Expand' implements FCOPYSIGN without calling an external library.
808 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
809 setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
810 setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
811 setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
812 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
813 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
814
815 // These map to corresponding instructions for f32/f64. f16 must be
816 // promoted to f32. v2f16 is expanded to f16, which is then promoted
817 // to f32.
818 for (const auto &Op :
819 {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
820 setOperationAction(Op, MVT::f16, Promote);
821 setOperationAction(Op, MVT::f32, Legal);
822 setOperationAction(Op, MVT::f64, Legal);
823 setOperationAction(Op, MVT::v2f16, Expand);
824 setOperationAction(Op, MVT::v2bf16, Expand);
825 setOperationAction(Op, MVT::bf16, Promote);
826 AddPromotedToType(Op, MVT::bf16, MVT::f32);
827 }
828 for (const auto &Op : {ISD::FABS}) {
829 setOperationAction(Op, MVT::f16, Promote);
830 setOperationAction(Op, MVT::f32, Legal);
831 setOperationAction(Op, MVT::f64, Legal);
832 setOperationAction(Op, MVT::v2f16, Expand);
833 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
834 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
835 if (getOperationAction(Op, MVT::bf16) == Promote)
836 AddPromotedToType(Op, MVT::bf16, MVT::f32);
837 }
838
839 // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
840 auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
841 bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
842 return IsAtLeastSm80 ? Legal : NotSm80Action;
843 };
844 for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
845 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
846 setOperationAction(Op, MVT::f32, Legal);
847 setOperationAction(Op, MVT::f64, Legal);
848 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
849 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
850 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
851 if (getOperationAction(Op, MVT::bf16) == Promote)
852 AddPromotedToType(Op, MVT::bf16, MVT::f32);
853 }
854 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
855 setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
856 setFP16OperationAction(Op, MVT::bf16, Legal, Expand);
857 setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
858 setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
859 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
860 }
861
862 // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
863 // No FPOW or FREM in PTX.
864
865 // Now deduce the information based on the above mentioned
866 // actions
867 computeRegisterProperties(STI.getRegisterInfo());
868
869 setMinCmpXchgSizeInBits(32);
870 setMaxAtomicSizeInBitsSupported(64);
871}
872
873const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
874
875#define MAKE_CASE(V) \
876 case V: \
877 return #V;
878
879 switch ((NVPTXISD::NodeType)Opcode) {
880 case NVPTXISD::FIRST_NUMBER:
881 break;
882
883 MAKE_CASE(NVPTXISD::CALL)
884 MAKE_CASE(NVPTXISD::RET_GLUE)
885 MAKE_CASE(NVPTXISD::LOAD_PARAM)
886 MAKE_CASE(NVPTXISD::Wrapper)
887 MAKE_CASE(NVPTXISD::DeclareParam)
888 MAKE_CASE(NVPTXISD::DeclareScalarParam)
889 MAKE_CASE(NVPTXISD::DeclareRet)
890 MAKE_CASE(NVPTXISD::DeclareScalarRet)
891 MAKE_CASE(NVPTXISD::DeclareRetParam)
892 MAKE_CASE(NVPTXISD::PrintCall)
893 MAKE_CASE(NVPTXISD::PrintConvergentCall)
894 MAKE_CASE(NVPTXISD::PrintCallUni)
895 MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
896 MAKE_CASE(NVPTXISD::LoadParam)
897 MAKE_CASE(NVPTXISD::LoadParamV2)
898 MAKE_CASE(NVPTXISD::LoadParamV4)
899 MAKE_CASE(NVPTXISD::StoreParam)
900 MAKE_CASE(NVPTXISD::StoreParamV2)
901 MAKE_CASE(NVPTXISD::StoreParamV4)
902 MAKE_CASE(NVPTXISD::StoreParamS32)
903 MAKE_CASE(NVPTXISD::StoreParamU32)
904 MAKE_CASE(NVPTXISD::CallArgBegin)
905 MAKE_CASE(NVPTXISD::CallArg)
906 MAKE_CASE(NVPTXISD::LastCallArg)
907 MAKE_CASE(NVPTXISD::CallArgEnd)
908 MAKE_CASE(NVPTXISD::CallVoid)
909 MAKE_CASE(NVPTXISD::CallVal)
910 MAKE_CASE(NVPTXISD::CallSymbol)
911 MAKE_CASE(NVPTXISD::Prototype)
912 MAKE_CASE(NVPTXISD::MoveParam)
913 MAKE_CASE(NVPTXISD::StoreRetval)
914 MAKE_CASE(NVPTXISD::StoreRetvalV2)
915 MAKE_CASE(NVPTXISD::StoreRetvalV4)
916 MAKE_CASE(NVPTXISD::PseudoUseParam)
917 MAKE_CASE(NVPTXISD::RETURN)
918 MAKE_CASE(NVPTXISD::CallSeqBegin)
919 MAKE_CASE(NVPTXISD::CallSeqEnd)
920 MAKE_CASE(NVPTXISD::CallPrototype)
921 MAKE_CASE(NVPTXISD::ProxyReg)
922 MAKE_CASE(NVPTXISD::LoadV2)
923 MAKE_CASE(NVPTXISD::LoadV4)
924 MAKE_CASE(NVPTXISD::LDGV2)
925 MAKE_CASE(NVPTXISD::LDGV4)
926 MAKE_CASE(NVPTXISD::LDUV2)
927 MAKE_CASE(NVPTXISD::LDUV4)
928 MAKE_CASE(NVPTXISD::StoreV2)
929 MAKE_CASE(NVPTXISD::StoreV4)
930 MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
931 MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
932 MAKE_CASE(NVPTXISD::IMAD)
933 MAKE_CASE(NVPTXISD::BFE)
934 MAKE_CASE(NVPTXISD::BFI)
935 MAKE_CASE(NVPTXISD::PRMT)
936 MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
937 MAKE_CASE(NVPTXISD::SETP_F16X2)
938 MAKE_CASE(NVPTXISD::SETP_BF16X2)
939 MAKE_CASE(NVPTXISD::Dummy)
940 MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
941 MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
942 MAKE_CASE(NVPTXISD::Tex1DFloatS32)
943 MAKE_CASE(NVPTXISD::Tex1DFloatFloat)
944 MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel)
945 MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad)
946 MAKE_CASE(NVPTXISD::Tex1DS32S32)
947 MAKE_CASE(NVPTXISD::Tex1DS32Float)
948 MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel)
949 MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad)
950 MAKE_CASE(NVPTXISD::Tex1DU32S32)
951 MAKE_CASE(NVPTXISD::Tex1DU32Float)
952 MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel)
953 MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad)
954 MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32)
955 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat)
956 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel)
957 MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad)
958 MAKE_CASE(NVPTXISD::Tex1DArrayS32S32)
959 MAKE_CASE(NVPTXISD::Tex1DArrayS32Float)
960 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel)
961 MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad)
962 MAKE_CASE(NVPTXISD::Tex1DArrayU32S32)
963 MAKE_CASE(NVPTXISD::Tex1DArrayU32Float)
964 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel)
965 MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad)
966 MAKE_CASE(NVPTXISD::Tex2DFloatS32)
967 MAKE_CASE(NVPTXISD::Tex2DFloatFloat)
968 MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel)
969 MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad)
970 MAKE_CASE(NVPTXISD::Tex2DS32S32)
971 MAKE_CASE(NVPTXISD::Tex2DS32Float)
972 MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel)
973 MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad)
974 MAKE_CASE(NVPTXISD::Tex2DU32S32)
975 MAKE_CASE(NVPTXISD::Tex2DU32Float)
976 MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel)
977 MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad)
978 MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32)
979 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat)
980 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel)
981 MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad)
982 MAKE_CASE(NVPTXISD::Tex2DArrayS32S32)
983 MAKE_CASE(NVPTXISD::Tex2DArrayS32Float)
984 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel)
985 MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad)
986 MAKE_CASE(NVPTXISD::Tex2DArrayU32S32)
987 MAKE_CASE(NVPTXISD::Tex2DArrayU32Float)
988 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel)
989 MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad)
990 MAKE_CASE(NVPTXISD::Tex3DFloatS32)
991 MAKE_CASE(NVPTXISD::Tex3DFloatFloat)
992 MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel)
993 MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad)
994 MAKE_CASE(NVPTXISD::Tex3DS32S32)
995 MAKE_CASE(NVPTXISD::Tex3DS32Float)
996 MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel)
997 MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad)
998 MAKE_CASE(NVPTXISD::Tex3DU32S32)
999 MAKE_CASE(NVPTXISD::Tex3DU32Float)
1000 MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel)
1001 MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad)
1002 MAKE_CASE(NVPTXISD::TexCubeFloatFloat)
1003 MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel)
1004 MAKE_CASE(NVPTXISD::TexCubeS32Float)
1005 MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel)
1006 MAKE_CASE(NVPTXISD::TexCubeU32Float)
1007 MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel)
1008 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat)
1009 MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel)
1010 MAKE_CASE(NVPTXISD::TexCubeArrayS32Float)
1011 MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel)
1012 MAKE_CASE(NVPTXISD::TexCubeArrayU32Float)
1013 MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel)
1014 MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat)
1015 MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat)
1016 MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat)
1017 MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat)
1018 MAKE_CASE(NVPTXISD::Tld4R2DS64Float)
1019 MAKE_CASE(NVPTXISD::Tld4G2DS64Float)
1020 MAKE_CASE(NVPTXISD::Tld4B2DS64Float)
1021 MAKE_CASE(NVPTXISD::Tld4A2DS64Float)
1022 MAKE_CASE(NVPTXISD::Tld4R2DU64Float)
1023 MAKE_CASE(NVPTXISD::Tld4G2DU64Float)
1024 MAKE_CASE(NVPTXISD::Tld4B2DU64Float)
1025 MAKE_CASE(NVPTXISD::Tld4A2DU64Float)
1026
1027 MAKE_CASE(NVPTXISD::TexUnified1DFloatS32)
1028 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat)
1029 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel)
1030 MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad)
1031 MAKE_CASE(NVPTXISD::TexUnified1DS32S32)
1032 MAKE_CASE(NVPTXISD::TexUnified1DS32Float)
1033 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel)
1034 MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad)
1035 MAKE_CASE(NVPTXISD::TexUnified1DU32S32)
1036 MAKE_CASE(NVPTXISD::TexUnified1DU32Float)
1037 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel)
1038 MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad)
1039 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32)
1040 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat)
1041 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel)
1042 MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad)
1043 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32)
1044 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float)
1045 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel)
1046 MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad)
1047 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32)
1048 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float)
1049 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel)
1050 MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad)
1051 MAKE_CASE(NVPTXISD::TexUnified2DFloatS32)
1052 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat)
1053 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel)
1054 MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad)
1055 MAKE_CASE(NVPTXISD::TexUnified2DS32S32)
1056 MAKE_CASE(NVPTXISD::TexUnified2DS32Float)
1057 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel)
1058 MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad)
1059 MAKE_CASE(NVPTXISD::TexUnified2DU32S32)
1060 MAKE_CASE(NVPTXISD::TexUnified2DU32Float)
1061 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel)
1062 MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad)
1063 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32)
1064 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat)
1065 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel)
1066 MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad)
1067 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32)
1068 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float)
1069 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel)
1070 MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad)
1071 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32)
1072 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float)
1073 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel)
1074 MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad)
1075 MAKE_CASE(NVPTXISD::TexUnified3DFloatS32)
1076 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat)
1077 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel)
1078 MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad)
1079 MAKE_CASE(NVPTXISD::TexUnified3DS32S32)
1080 MAKE_CASE(NVPTXISD::TexUnified3DS32Float)
1081 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel)
1082 MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad)
1083 MAKE_CASE(NVPTXISD::TexUnified3DU32S32)
1084 MAKE_CASE(NVPTXISD::TexUnified3DU32Float)
1085 MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel)
1086 MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad)
1087 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat)
1088 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel)
1089 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float)
1090 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel)
1091 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float)
1092 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel)
1093 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat)
1094 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel)
1095 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float)
1096 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel)
1097 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float)
1098 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel)
1099 MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad)
1100 MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad)
1101 MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad)
1102 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad)
1103 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad)
1104 MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad)
1105 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat)
1106 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat)
1107 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat)
1108 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat)
1109 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float)
1110 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float)
1111 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float)
1112 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float)
1113 MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float)
1114 MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float)
1115 MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float)
1116 MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float)
1117
1118 MAKE_CASE(NVPTXISD::Suld1DI8Clamp)
1119 MAKE_CASE(NVPTXISD::Suld1DI16Clamp)
1120 MAKE_CASE(NVPTXISD::Suld1DI32Clamp)
1121 MAKE_CASE(NVPTXISD::Suld1DI64Clamp)
1122 MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp)
1123 MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp)
1124 MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp)
1125 MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp)
1126 MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp)
1127 MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp)
1128 MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp)
1129
1130 MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp)
1131 MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp)
1132 MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp)
1133 MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp)
1134 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp)
1135 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp)
1136 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp)
1137 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp)
1138 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp)
1139 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp)
1140 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp)
1141
1142 MAKE_CASE(NVPTXISD::Suld2DI8Clamp)
1143 MAKE_CASE(NVPTXISD::Suld2DI16Clamp)
1144 MAKE_CASE(NVPTXISD::Suld2DI32Clamp)
1145 MAKE_CASE(NVPTXISD::Suld2DI64Clamp)
1146 MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp)
1147 MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp)
1148 MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp)
1149 MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp)
1150 MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp)
1151 MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp)
1152 MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp)
1153
1154 MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp)
1155 MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp)
1156 MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp)
1157 MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp)
1158 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp)
1159 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp)
1160 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp)
1161 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp)
1162 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp)
1163 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp)
1164 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp)
1165
1166 MAKE_CASE(NVPTXISD::Suld3DI8Clamp)
1167 MAKE_CASE(NVPTXISD::Suld3DI16Clamp)
1168 MAKE_CASE(NVPTXISD::Suld3DI32Clamp)
1169 MAKE_CASE(NVPTXISD::Suld3DI64Clamp)
1170 MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp)
1171 MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp)
1172 MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp)
1173 MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp)
1174 MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp)
1175 MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp)
1176 MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp)
1177
1178 MAKE_CASE(NVPTXISD::Suld1DI8Trap)
1179 MAKE_CASE(NVPTXISD::Suld1DI16Trap)
1180 MAKE_CASE(NVPTXISD::Suld1DI32Trap)
1181 MAKE_CASE(NVPTXISD::Suld1DI64Trap)
1182 MAKE_CASE(NVPTXISD::Suld1DV2I8Trap)
1183 MAKE_CASE(NVPTXISD::Suld1DV2I16Trap)
1184 MAKE_CASE(NVPTXISD::Suld1DV2I32Trap)
1185 MAKE_CASE(NVPTXISD::Suld1DV2I64Trap)
1186 MAKE_CASE(NVPTXISD::Suld1DV4I8Trap)
1187 MAKE_CASE(NVPTXISD::Suld1DV4I16Trap)
1188 MAKE_CASE(NVPTXISD::Suld1DV4I32Trap)
1189
1190 MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap)
1191 MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap)
1192 MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap)
1193 MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap)
1194 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap)
1195 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap)
1196 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap)
1197 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap)
1198 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap)
1199 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap)
1200 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap)
1201
1202 MAKE_CASE(NVPTXISD::Suld2DI8Trap)
1203 MAKE_CASE(NVPTXISD::Suld2DI16Trap)
1204 MAKE_CASE(NVPTXISD::Suld2DI32Trap)
1205 MAKE_CASE(NVPTXISD::Suld2DI64Trap)
1206 MAKE_CASE(NVPTXISD::Suld2DV2I8Trap)
1207 MAKE_CASE(NVPTXISD::Suld2DV2I16Trap)
1208 MAKE_CASE(NVPTXISD::Suld2DV2I32Trap)
1209 MAKE_CASE(NVPTXISD::Suld2DV2I64Trap)
1210 MAKE_CASE(NVPTXISD::Suld2DV4I8Trap)
1211 MAKE_CASE(NVPTXISD::Suld2DV4I16Trap)
1212 MAKE_CASE(NVPTXISD::Suld2DV4I32Trap)
1213
1214 MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap)
1215 MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap)
1216 MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap)
1217 MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap)
1218 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap)
1219 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap)
1220 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap)
1221 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap)
1222 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap)
1223 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap)
1224 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap)
1225
1226 MAKE_CASE(NVPTXISD::Suld3DI8Trap)
1227 MAKE_CASE(NVPTXISD::Suld3DI16Trap)
1228 MAKE_CASE(NVPTXISD::Suld3DI32Trap)
1229 MAKE_CASE(NVPTXISD::Suld3DI64Trap)
1230 MAKE_CASE(NVPTXISD::Suld3DV2I8Trap)
1231 MAKE_CASE(NVPTXISD::Suld3DV2I16Trap)
1232 MAKE_CASE(NVPTXISD::Suld3DV2I32Trap)
1233 MAKE_CASE(NVPTXISD::Suld3DV2I64Trap)
1234 MAKE_CASE(NVPTXISD::Suld3DV4I8Trap)
1235 MAKE_CASE(NVPTXISD::Suld3DV4I16Trap)
1236 MAKE_CASE(NVPTXISD::Suld3DV4I32Trap)
1237
1238 MAKE_CASE(NVPTXISD::Suld1DI8Zero)
1239 MAKE_CASE(NVPTXISD::Suld1DI16Zero)
1240 MAKE_CASE(NVPTXISD::Suld1DI32Zero)
1241 MAKE_CASE(NVPTXISD::Suld1DI64Zero)
1242 MAKE_CASE(NVPTXISD::Suld1DV2I8Zero)
1243 MAKE_CASE(NVPTXISD::Suld1DV2I16Zero)
1244 MAKE_CASE(NVPTXISD::Suld1DV2I32Zero)
1245 MAKE_CASE(NVPTXISD::Suld1DV2I64Zero)
1246 MAKE_CASE(NVPTXISD::Suld1DV4I8Zero)
1247 MAKE_CASE(NVPTXISD::Suld1DV4I16Zero)
1248 MAKE_CASE(NVPTXISD::Suld1DV4I32Zero)
1249
1250 MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero)
1251 MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero)
1252 MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero)
1253 MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero)
1254 MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero)
1255 MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero)
1256 MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero)
1257 MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero)
1258 MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero)
1259 MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero)
1260 MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero)
1261
1262 MAKE_CASE(NVPTXISD::Suld2DI8Zero)
1263 MAKE_CASE(NVPTXISD::Suld2DI16Zero)
1264 MAKE_CASE(NVPTXISD::Suld2DI32Zero)
1265 MAKE_CASE(NVPTXISD::Suld2DI64Zero)
1266 MAKE_CASE(NVPTXISD::Suld2DV2I8Zero)
1267 MAKE_CASE(NVPTXISD::Suld2DV2I16Zero)
1268 MAKE_CASE(NVPTXISD::Suld2DV2I32Zero)
1269 MAKE_CASE(NVPTXISD::Suld2DV2I64Zero)
1270 MAKE_CASE(NVPTXISD::Suld2DV4I8Zero)
1271 MAKE_CASE(NVPTXISD::Suld2DV4I16Zero)
1272 MAKE_CASE(NVPTXISD::Suld2DV4I32Zero)
1273
1274 MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero)
1275 MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero)
1276 MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero)
1277 MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero)
1278 MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero)
1279 MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero)
1280 MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero)
1281 MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero)
1282 MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero)
1283 MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero)
1284 MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero)
1285
1286 MAKE_CASE(NVPTXISD::Suld3DI8Zero)
1287 MAKE_CASE(NVPTXISD::Suld3DI16Zero)
1288 MAKE_CASE(NVPTXISD::Suld3DI32Zero)
1289 MAKE_CASE(NVPTXISD::Suld3DI64Zero)
1290 MAKE_CASE(NVPTXISD::Suld3DV2I8Zero)
1291 MAKE_CASE(NVPTXISD::Suld3DV2I16Zero)
1292 MAKE_CASE(NVPTXISD::Suld3DV2I32Zero)
1293 MAKE_CASE(NVPTXISD::Suld3DV2I64Zero)
1294 MAKE_CASE(NVPTXISD::Suld3DV4I8Zero)
1295 MAKE_CASE(NVPTXISD::Suld3DV4I16Zero)
1296 MAKE_CASE(NVPTXISD::Suld3DV4I32Zero)
1297 }
1298 return nullptr;
1299
1300#undef MAKE_CASE
1301}
1302
1303TargetLoweringBase::LegalizeTypeAction
1304NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1305 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1306 VT.getScalarType() == MVT::i1)
1307 return TypeSplitVector;
1308 if (Isv2x16VT(VT))
1309 return TypeLegal;
1310 return TargetLoweringBase::getPreferredVectorAction(VT);
1311}
1312
1313SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1314 int Enabled, int &ExtraSteps,
1315 bool &UseOneConst,
1316 bool Reciprocal) const {
1317 if (!(Enabled == ReciprocalEstimate::Enabled ||
1318 (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1319 return SDValue();
1320
1321 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1322 ExtraSteps = 0;
1323
1324 SDLoc DL(Operand);
1325 EVT VT = Operand.getValueType();
1326 bool Ftz = useF32FTZ(MF: DAG.getMachineFunction());
1327
1328 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1329 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1330 DAG.getConstant(IID, DL, MVT::i32), Operand);
1331 };
1332
1333 // The sqrt and rsqrt refinement processes assume we always start out with an
1334 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1335 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1336 // any refinement, we must return a regular sqrt.
1337 if (Reciprocal || ExtraSteps > 0) {
1338 if (VT == MVT::f32)
1339 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1340 : Intrinsic::nvvm_rsqrt_approx_f);
1341 else if (VT == MVT::f64)
1342 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1343 else
1344 return SDValue();
1345 } else {
1346 if (VT == MVT::f32)
1347 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1348 : Intrinsic::nvvm_sqrt_approx_f);
1349 else {
1350 // There's no sqrt.approx.f64 instruction, so we emit
1351 // reciprocal(rsqrt(x)). This is faster than
1352 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1353 // x * rsqrt(x).)
1354 return DAG.getNode(
1355 ISD::INTRINSIC_WO_CHAIN, DL, VT,
1356 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1357 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1358 }
1359 }
1360}
1361
1362SDValue
1363NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1364 SDLoc dl(Op);
1365 const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Val&: Op);
1366 auto PtrVT = getPointerTy(DL: DAG.getDataLayout(), AS: GAN->getAddressSpace());
1367 Op = DAG.getTargetGlobalAddress(GV: GAN->getGlobal(), DL: dl, VT: PtrVT);
1368 return DAG.getNode(Opcode: NVPTXISD::Wrapper, DL: dl, VT: PtrVT, Operand: Op);
1369}
1370
1371static bool IsTypePassedAsArray(const Type *Ty) {
1372 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(Bitwidth: 128) ||
1373 Ty->isHalfTy() || Ty->isBFloatTy();
1374}
1375
1376std::string NVPTXTargetLowering::getPrototype(
1377 const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1378 const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
1379 std::optional<std::pair<unsigned, const APInt &>> VAInfo,
1380 const CallBase &CB, unsigned UniqueCallSite) const {
1381 auto PtrVT = getPointerTy(DL);
1382
1383 bool isABI = (STI.getSmVersion() >= 20);
1384 assert(isABI && "Non-ABI compilation is not supported");
1385 if (!isABI)
1386 return "";
1387
1388 std::string Prototype;
1389 raw_string_ostream O(Prototype);
1390 O << "prototype_" << UniqueCallSite << " : .callprototype ";
1391
1392 if (retTy->getTypeID() == Type::VoidTyID) {
1393 O << "()";
1394 } else {
1395 O << "(";
1396 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
1397 !IsTypePassedAsArray(Ty: retTy)) {
1398 unsigned size = 0;
1399 if (auto *ITy = dyn_cast<IntegerType>(Val: retTy)) {
1400 size = ITy->getBitWidth();
1401 } else {
1402 assert(retTy->isFloatingPointTy() &&
1403 "Floating point type expected here");
1404 size = retTy->getPrimitiveSizeInBits();
1405 }
1406 // PTX ABI requires all scalar return values to be at least 32
1407 // bits in size. fp16 normally uses .b16 as its storage type in
1408 // PTX, so its size must be adjusted here, too.
1409 size = promoteScalarArgumentSize(size);
1410
1411 O << ".param .b" << size << " _";
1412 } else if (isa<PointerType>(Val: retTy)) {
1413 O << ".param .b" << PtrVT.getSizeInBits() << " _";
1414 } else if (IsTypePassedAsArray(Ty: retTy)) {
1415 O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
1416 << " .b8 _[" << DL.getTypeAllocSize(Ty: retTy) << "]";
1417 } else {
1418 llvm_unreachable("Unknown return type");
1419 }
1420 O << ") ";
1421 }
1422 O << "_ (";
1423
1424 bool first = true;
1425
1426 const Function *F = CB.getFunction();
1427 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1428 for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1429 Type *Ty = Args[i].Ty;
1430 if (!first) {
1431 O << ", ";
1432 }
1433 first = false;
1434
1435 if (!Outs[OIdx].Flags.isByVal()) {
1436 if (IsTypePassedAsArray(Ty)) {
1437 unsigned ParamAlign = 0;
1438 const CallInst *CallI = cast<CallInst>(Val: &CB);
1439 // +1 because index 0 is reserved for return type alignment
1440 if (!getAlign(*CallI, index: i + 1, ParamAlign))
1441 ParamAlign = getFunctionParamOptimizedAlign(F, ArgTy: Ty, DL).value();
1442 O << ".param .align " << ParamAlign << " .b8 ";
1443 O << "_";
1444 O << "[" << DL.getTypeAllocSize(Ty) << "]";
1445 // update the index for Outs
1446 SmallVector<EVT, 16> vtparts;
1447 ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs&: vtparts);
1448 if (unsigned len = vtparts.size())
1449 OIdx += len - 1;
1450 continue;
1451 }
1452 // i8 types in IR will be i16 types in SDAG
1453 assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1454 (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1455 "type mismatch between callee prototype and arguments");
1456 // scalar type
1457 unsigned sz = 0;
1458 if (isa<IntegerType>(Val: Ty)) {
1459 sz = cast<IntegerType>(Val: Ty)->getBitWidth();
1460 sz = promoteScalarArgumentSize(size: sz);
1461 } else if (isa<PointerType>(Val: Ty)) {
1462 sz = PtrVT.getSizeInBits();
1463 } else {
1464 sz = Ty->getPrimitiveSizeInBits();
1465 }
1466 O << ".param .b" << sz << " ";
1467 O << "_";
1468 continue;
1469 }
1470
1471 Type *ETy = Args[i].IndirectType;
1472 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1473 Align ParamByValAlign =
1474 getFunctionByValParamAlign(F, ArgTy: ETy, InitialAlign, DL);
1475
1476 O << ".param .align " << ParamByValAlign.value() << " .b8 ";
1477 O << "_";
1478 O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
1479 }
1480
1481 if (VAInfo)
1482 O << (first ? "" : ",") << " .param .align " << VAInfo->second
1483 << " .b8 _[]\n";
1484 O << ")";
1485 if (shouldEmitPTXNoReturn(V: &CB, TM: *nvTM))
1486 O << " .noreturn";
1487 O << ";";
1488
1489 return Prototype;
1490}
1491
1492Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
1493 unsigned Idx,
1494 const DataLayout &DL) const {
1495 if (!CB) {
1496 // CallSite is null; fall back to the ABI type alignment.
1497 return DL.getABITypeAlign(Ty);
1498 }
1499
1500 unsigned Alignment = 0;
1501 const Function *DirectCallee = CB->getCalledFunction();
1502
1503 if (!DirectCallee) {
1504 // We don't have a direct function symbol, but that may be because of
1505 // constant cast instructions in the call.
1506
1507 // With bitcast'd call targets, the instruction will be the call
1508 if (const auto *CI = dyn_cast<CallInst>(Val: CB)) {
1509 // Check if we have call alignment metadata
1510 if (getAlign(*CI, index: Idx, Alignment))
1511 return Align(Alignment);
1512 }
1513 DirectCallee = getMaybeBitcastedCallee(CB);
1514 }
1515
1516 // Check for function alignment information if we found that the
1517 // ultimate target is a Function
1518 if (DirectCallee) {
1519 if (getAlign(*DirectCallee, index: Idx, Alignment))
1520 return Align(Alignment);
1521 // If alignment information is not available, fall back to the
1522 // default function param optimized type alignment
1523 return getFunctionParamOptimizedAlign(F: DirectCallee, ArgTy: Ty, DL);
1524 }
1525
1526 // Call is indirect, fall back to the ABI type alignment
1527 return DL.getABITypeAlign(Ty);
1528}
1529
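// Map an FP (or packed 2 x FP) element type to the integer type of the same
// width so the byte-wise shift/mask logic below can operate on plain
// integers. Returns true if the caller needs a bitcast to/from that type.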
1530static bool adjustElementType(EVT &ElementType) {
1531 switch (ElementType.getSimpleVT().SimpleTy) {
1532 default:
1533 return false;
1534 case MVT::f16:
1535 case MVT::bf16:
1536 ElementType = MVT::i16;
1537 return true;
1538 case MVT::f32:
1539 case MVT::v2f16:
1540 case MVT::v2bf16:
1541 ElementType = MVT::i32;
1542 return true;
1543 case MVT::f64:
1544 ElementType = MVT::i64;
1545 return true;
1546 }
1547}
1548
1549// Use byte-store when the param address of the argument value is unaligned.
1550// This may happen when the argument value is a field of a packed structure.
1551//
1552// This is called in LowerCall() when passing the param values.
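//
// Illustrative sketch (hypothetical offsets, not verbatim output): a 32-bit
// value landing at 1-byte-aligned param offset 5 becomes four byte stores,
// with the value shifted right by 8*i before each st.param.b8 [param<n>+5+i].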
1553static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
1554 uint64_t Offset, EVT ElementType,
1555 SDValue StVal, SDValue &InGlue,
1556 unsigned ArgID, const SDLoc &dl) {
1557 // Bit logic only works on integer types
1558 if (adjustElementType(ElementType))
1559 StVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: StVal);
1560
1561 // Store each byte
1562 SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1563 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1564 // Shift the byte to the last byte position
1565 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
1566 DAG.getConstant(i * 8, dl, MVT::i32));
1567 SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
1568 DAG.getConstant(Offset + i, dl, MVT::i32),
1569 ShiftVal, InGlue};
1570 // Emit a truncating store of just the low byte by using
1571 // st.param.b8
1572 // (the register type can be larger than b8).
1573 Chain = DAG.getMemIntrinsicNode(
1574 NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
1575 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
1576 InGlue = Chain.getValue(R: 1);
1577 }
1578 return Chain;
1579}
1580
1581// Use byte-load when the param address of the returned value is unaligned.
1582// This may happen when the returned value is a field of a packed structure.
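//
// Illustrative sketch (hypothetical names): each byte is fetched with an
// ld.param.b8 into an i16 register, zero-extended, masked to 8 bits, shifted
// left by 8*i, and OR'ed into the accumulated result.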
1583static SDValue
1584LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
1585 EVT ElementType, SDValue &InGlue,
1586 SmallVectorImpl<SDValue> &TempProxyRegOps,
1587 const SDLoc &dl) {
1588 // Bit logic only works on integer types
1589 EVT MergedType = ElementType;
1590 adjustElementType(ElementType&: MergedType);
1591
1592 // Load each byte and reconstruct the whole value. Initialize the result to 0.
1593 SDValue RetVal = DAG.getConstant(Val: 0, DL: dl, VT: MergedType);
1594 // LoadParamMemI8 loads into i16 register only
1595 SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
1596 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1597 SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1598 DAG.getConstant(Offset + i, dl, MVT::i32),
1599 InGlue};
1600 // This will be selected to LoadParamMemI8
1601 SDValue LdVal =
1602 DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
1603 MVT::i8, MachinePointerInfo(), Align(1));
1604 SDValue TmpLdVal = LdVal.getValue(R: 0);
1605 Chain = LdVal.getValue(R: 1);
1606 InGlue = LdVal.getValue(R: 2);
1607
1608 TmpLdVal = DAG.getNode(Opcode: NVPTXISD::ProxyReg, DL: dl,
1609 VT: TmpLdVal.getSimpleValueType(), Operand: TmpLdVal);
1610 TempProxyRegOps.push_back(Elt: TmpLdVal);
1611
1612 SDValue CMask = DAG.getConstant(Val: 255, DL: dl, VT: MergedType);
1613 SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
1614 // Need to extend the i16 register to the whole width.
1615 TmpLdVal = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MergedType, Operand: TmpLdVal);
1616 // Mask off the high bits. Leave only the lower 8 bits.
1617 // Do this because we are using loadparam.b8.
1618 TmpLdVal = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CMask);
1619 // Shift and merge
1620 TmpLdVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CShift);
1621 RetVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MergedType, N1: RetVal, N2: TmpLdVal);
1622 }
1623 if (ElementType != MergedType)
1624 RetVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: RetVal);
1625
1626 return RetVal;
1627}
1628
1629SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1630 SmallVectorImpl<SDValue> &InVals) const {
1631
1632 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1633 report_fatal_error(
1634 reason: "Support for variadic functions (unsized array parameter) was introduced "
1635 "in PTX ISA version 6.0 and requires target sm_30.");
1636
1637 SelectionDAG &DAG = CLI.DAG;
1638 SDLoc dl = CLI.DL;
1639 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1640 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1641 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1642 SDValue Chain = CLI.Chain;
1643 SDValue Callee = CLI.Callee;
1644 bool &isTailCall = CLI.IsTailCall;
1645 ArgListTy &Args = CLI.getArgs();
1646 Type *RetTy = CLI.RetTy;
1647 const CallBase *CB = CLI.CB;
1648 const DataLayout &DL = DAG.getDataLayout();
1649
1650 bool isABI = (STI.getSmVersion() >= 20);
1651 assert(isABI && "Non-ABI compilation is not supported");
1652 if (!isABI)
1653 return Chain;
1654
1655 // Variadic arguments.
1656 //
1657 // Normally, for each argument, we declare a param scalar or a param
1658 // byte array in the .param space, and store the argument value to that
1659 // param scalar or array starting at offset 0.
1660 //
1661 // In the case of the first variadic argument, we declare a vararg byte array
1662 // with size 0. The exact size of this array isn't known at this point, so
1663 // it'll be patched later. All the variadic arguments will be stored to this
1664 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1665 // initially set to 0, so it can be used for non-variadic arguments (which use
1666 // 0 offset) to simplify the code.
1667 //
1668 // After all varargs are processed, 'VAOffset' holds the size of the
1669 // vararg byte array.
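//
// For illustration only (assumed names and an assumed 8-byte alignment, not
// verbatim output), a variadic call like foo(fmt, 1, 2.0) might produce
//   .param .align 8 .b8 param1[];      // size patched to VAOffset later
//   st.param.b32 [param1+0], %r1;      // first vararg stored at offset 0
//   st.param.b64 [param1+8], %fd1;     // next vararg at its aligned offset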
1670
1671 SDValue VADeclareParam; // vararg byte array
1672 unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
1673 unsigned VAOffset = 0; // current offset in the param array
1674
1675 unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(i: 1);
1676 SDValue TempChain = Chain;
1677 Chain = DAG.getCALLSEQ_START(Chain, InSize: UniqueCallSite, OutSize: 0, DL: dl);
1678 SDValue InGlue = Chain.getValue(R: 1);
1679
1680 unsigned ParamCount = 0;
1681 // Args.size() and Outs.size() need not match.
1682 // Outs.size() will be larger
1683 // * if there is an aggregate argument with multiple fields (each field
1684 // showing up separately in Outs)
1685 // * if there is a vector argument with more than typical vector-length
1686 // elements (generally if more than 4) where each vector element is
1687 // individually present in Outs.
1688 // So a different index should be used for indexing into Outs/OutVals.
1689 // See similar issue in LowerFormalArguments.
1690 unsigned OIdx = 0;
1691 // Declare the .param or .reg spaces needed to pass values
1692 // to the function
1693 for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1694 EVT VT = Outs[OIdx].VT;
1695 Type *Ty = Args[i].Ty;
1696 bool IsVAArg = (i >= CLI.NumFixedArgs);
1697 bool IsByVal = Outs[OIdx].Flags.isByVal();
1698
1699 SmallVector<EVT, 16> VTs;
1700 SmallVector<uint64_t, 16> Offsets;
1701
1702 assert((!IsByVal || Args[i].IndirectType) &&
1703 "byval arg must have indirect type");
1704 Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1705 ComputePTXValueVTs(TLI: *this, DL, Ty: ETy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: IsByVal ? 0 : VAOffset);
1706
1707 Align ArgAlign;
1708 if (IsByVal) {
1709 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1710 // so we don't need to worry whether it's naturally aligned or not.
1711 // See TargetLowering::LowerCallTo().
1712 Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1713 ArgAlign = getFunctionByValParamAlign(F: CB->getCalledFunction(), ArgTy: ETy,
1714 InitialAlign, DL);
1715 if (IsVAArg)
1716 VAOffset = alignTo(Size: VAOffset, A: ArgAlign);
1717 } else {
1718 ArgAlign = getArgumentAlignment(CB, Ty, Idx: ParamCount + 1, DL);
1719 }
1720
1721 unsigned TypeSize =
1722 (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty));
1723 SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1724
1725 bool NeedAlign; // Does argument declaration specify alignment?
1726 bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty);
1727 if (IsVAArg) {
1728 if (ParamCount == FirstVAArg) {
1729 SDValue DeclareParamOps[] = {
1730 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1731 DAG.getConstant(ParamCount, dl, MVT::i32),
1732 DAG.getConstant(1, dl, MVT::i32), InGlue};
1733 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1734 DeclareParamVTs, DeclareParamOps);
1735 }
1736 NeedAlign = PassAsArray;
1737 } else if (PassAsArray) {
1738 // declare .param .align <align> .b8 .param<n>[<size>];
1739 SDValue DeclareParamOps[] = {
1740 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1741 DAG.getConstant(ParamCount, dl, MVT::i32),
1742 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1743 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1744 DeclareParamOps);
1745 NeedAlign = true;
1746 } else {
1747 // declare .param .b<size> .param<n>;
1748 if (VT.isInteger() || VT.isFloatingPoint()) {
1749 // PTX ABI requires integral types to be at least 32 bits in
1750 // size. FP16 is loaded/stored using i16, so it's handled
1751 // here as well.
1752 TypeSize = promoteScalarArgumentSize(size: TypeSize * 8) / 8;
1753 }
1754 SDValue DeclareScalarParamOps[] = {
1755 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1756 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1757 DAG.getConstant(0, dl, MVT::i32), InGlue};
1758 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1759 DeclareScalarParamOps);
1760 NeedAlign = false;
1761 }
1762 InGlue = Chain.getValue(R: 1);
1763
1764 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1765 // than 32-bits are sign extended or zero extended, depending on
1766 // whether they are signed or unsigned types. This case applies
1767 // only to scalar parameters and not to aggregate values.
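 // For example, an i8 or i16 scalar argument is widened and stored as a
 // 32-bit .param value; this widening does not apply to aggregate arguments.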
1768 bool ExtendIntegerParam =
1769 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
1770
1771 auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: ArgAlign, IsVAArg);
1772 SmallVector<SDValue, 6> StoreOperands;
1773 for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1774 EVT EltVT = VTs[j];
1775 int CurOffset = Offsets[j];
1776 MaybeAlign PartAlign;
1777 if (NeedAlign)
1778 PartAlign = commonAlignment(A: ArgAlign, Offset: CurOffset);
1779
1780 SDValue StVal = OutVals[OIdx];
1781
1782 MVT PromotedVT;
1783 if (PromoteScalarIntegerPTX(VT: EltVT, PromotedVT: &PromotedVT)) {
1784 EltVT = EVT(PromotedVT);
1785 }
1786 if (PromoteScalarIntegerPTX(VT: StVal.getValueType(), PromotedVT: &PromotedVT)) {
1787 llvm::ISD::NodeType Ext =
1788 Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1789 StVal = DAG.getNode(Opcode: Ext, DL: dl, VT: PromotedVT, Operand: StVal);
1790 }
1791
1792 if (IsByVal) {
1793 auto PtrVT = getPointerTy(DL);
1794 SDValue srcAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StVal,
1795 N2: DAG.getConstant(Val: CurOffset, DL: dl, VT: PtrVT));
1796 StVal = DAG.getLoad(VT: EltVT, dl, Chain: TempChain, Ptr: srcAddr, PtrInfo: MachinePointerInfo(),
1797 Alignment: PartAlign);
1798 } else if (ExtendIntegerParam) {
1799 assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1800 // zext/sext to i32
1801 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1802 : ISD::ZERO_EXTEND,
1803 dl, MVT::i32, StVal);
1804 }
1805
1806 if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1807 // Use 16-bit registers for small stores as it's the
1808 // smallest general purpose register size supported by NVPTX.
1809 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1810 }
1811
1812 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1813 // scalar store. In such cases, fall back to byte stores.
1814 if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1815 PartAlign.value() <
1816 DL.getABITypeAlign(Ty: EltVT.getTypeForEVT(Context&: *DAG.getContext()))) {
1817 assert(StoreOperands.empty() && "Unfinished preceding store.");
1818 Chain = LowerUnalignedStoreParam(
1819 DAG, Chain, Offset: IsByVal ? CurOffset + VAOffset : CurOffset, ElementType: EltVT,
1820 StVal, InGlue, ArgID: ParamCount, dl);
1821
1822 // LowerUnalignedStoreParam took care of inserting the necessary nodes
1823 // into the SDAG, so just move on to the next element.
1824 if (!IsByVal)
1825 ++OIdx;
1826 continue;
1827 }
1828
1829 // New store.
1830 if (VectorInfo[j] & PVF_FIRST) {
1831 assert(StoreOperands.empty() && "Unfinished preceding store.");
1832 StoreOperands.push_back(Elt: Chain);
1833 StoreOperands.push_back(
1834 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1835
1836 StoreOperands.push_back(DAG.getConstant(
1837 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1838 dl, MVT::i32));
1839 }
1840
1841 // Record the value to store.
1842 StoreOperands.push_back(Elt: StVal);
1843
1844 if (VectorInfo[j] & PVF_LAST) {
1845 unsigned NumElts = StoreOperands.size() - 3;
1846 NVPTXISD::NodeType Op;
1847 switch (NumElts) {
1848 case 1:
1849 Op = NVPTXISD::StoreParam;
1850 break;
1851 case 2:
1852 Op = NVPTXISD::StoreParamV2;
1853 break;
1854 case 4:
1855 Op = NVPTXISD::StoreParamV4;
1856 break;
1857 default:
1858 llvm_unreachable("Invalid vector info.");
1859 }
1860
1861 StoreOperands.push_back(Elt: InGlue);
1862
1863 // Adjust type of the store op if we've extended the scalar
1864 // return value.
1865 EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1866
1867 Chain = DAG.getMemIntrinsicNode(
1868 Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1869 TheStoreType, MachinePointerInfo(), PartAlign,
1870 MachineMemOperand::MOStore);
1871 InGlue = Chain.getValue(R: 1);
1872
1873 // Cleanup.
1874 StoreOperands.clear();
1875
1876 // TODO: We may need to support vector types that can be passed
1877 // as scalars in variadic arguments.
1878 if (!IsByVal && IsVAArg) {
1879 assert(NumElts == 1 &&
1880 "Vectorization is expected to be disabled for variadics.");
1881 VAOffset += DL.getTypeAllocSize(
1882 Ty: TheStoreType.getTypeForEVT(Context&: *DAG.getContext()));
1883 }
1884 }
1885 if (!IsByVal)
1886 ++OIdx;
1887 }
1888 assert(StoreOperands.empty() && "Unfinished parameter store.");
1889 if (!IsByVal && VTs.size() > 0)
1890 --OIdx;
1891 ++ParamCount;
1892 if (IsByVal && IsVAArg)
1893 VAOffset += TypeSize;
1894 }
1895
1896 GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Val: Callee.getNode());
1897 MaybeAlign retAlignment = std::nullopt;
1898
1899 // Handle Result
1900 if (Ins.size() > 0) {
1901 SmallVector<EVT, 16> resvtparts;
1902 ComputeValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: resvtparts);
1903
1904 // Declare
1905 // .param .align N .b8 retval0[<size-in-bytes>], or
1906 // .param .b<size-in-bits> retval0
1907 unsigned resultsz = DL.getTypeAllocSizeInBits(Ty: RetTy);
1908 if (!IsTypePassedAsArray(Ty: RetTy)) {
1909 resultsz = promoteScalarArgumentSize(size: resultsz);
1910 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1911 SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1912 DAG.getConstant(resultsz, dl, MVT::i32),
1913 DAG.getConstant(0, dl, MVT::i32), InGlue };
1914 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1915 DeclareRetOps);
1916 InGlue = Chain.getValue(R: 1);
1917 } else {
1918 retAlignment = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL);
1919 assert(retAlignment && "retAlignment is guaranteed to be set");
1920 SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1921 SDValue DeclareRetOps[] = {
1922 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1923 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1924 DAG.getConstant(0, dl, MVT::i32), InGlue};
1925 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1926 DeclareRetOps);
1927 InGlue = Chain.getValue(R: 1);
1928 }
1929 }
1930
1931 bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1932 // Set the size of the vararg param byte array if the callee is a variadic
1933 // function and the variadic part is not empty.
1934 if (HasVAArgs) {
1935 SDValue DeclareParamOps[] = {
1936 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1937 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1938 VADeclareParam.getOperand(4)};
1939 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1940 VADeclareParam->getVTList(), DeclareParamOps);
1941 }
1942
1943 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1944 // between them we must rely on the call site value which is valid for
1945 // indirect calls but is always null for libcalls.
1946 bool isIndirectCall = !Func && CB;
1947
1948 if (isa<ExternalSymbolSDNode>(Val: Callee)) {
1949 Function* CalleeFunc = nullptr;
1950
1951 // Try to find the callee in the current module.
1952 Callee = DAG.getSymbolFunctionGlobalAddress(Op: Callee, TargetFunction: &CalleeFunc);
1953 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1954
1955 // Set the "libcall callee" attribute to indicate that the function
1956 // must always have a declaration.
1957 CalleeFunc->addFnAttr(Kind: "nvptx-libcall-callee", Val: "true");
1958 }
1959
1960 if (isIndirectCall) {
1961 // This is the indirect function call case: PTX requires a prototype of the
1962 // form
1963 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1964 // to be emitted, and the label has to be used as the last arg of the call
1965 // instruction.
1966 // The prototype is embedded in a string and put as the operand for a
1967 // CallPrototype SDNode which will print out to the value of the string.
1968 SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1969 std::string Proto = getPrototype(
1970 DL, retTy: RetTy, Args, Outs, retAlignment,
1971 VAInfo: HasVAArgs
1972 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1973 x&: CLI.NumFixedArgs, y: VADeclareParam->getConstantOperandAPInt(Num: 1)))
1974 : std::nullopt,
1975 CB: *CB, UniqueCallSite);
1976 const char *ProtoStr = nvTM->getStrPool().save(S: Proto).data();
1977 SDValue ProtoOps[] = {
1978 Chain,
1979 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1980 InGlue,
1981 };
1982 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1983 InGlue = Chain.getValue(R: 1);
1984 }
1985 // Op to just print "call"
1986 SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1987 SDValue PrintCallOps[] = {
1988 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1989 };
1990 // We model convergent calls as separate opcodes.
1991 unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1992 if (CLI.IsConvergent)
1993 Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1994 : NVPTXISD::PrintConvergentCall;
1995 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1996 InGlue = Chain.getValue(R: 1);
1997
1998 // Ops to print out the function name
1999 SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2000 SDValue CallVoidOps[] = { Chain, Callee, InGlue };
2001 Chain = DAG.getNode(Opcode: NVPTXISD::CallVoid, DL: dl, VTList: CallVoidVTs, Ops: CallVoidOps);
2002 InGlue = Chain.getValue(R: 1);
2003
2004 // Ops to print out the param list
2005 SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2006 SDValue CallArgBeginOps[] = { Chain, InGlue };
2007 Chain = DAG.getNode(Opcode: NVPTXISD::CallArgBegin, DL: dl, VTList: CallArgBeginVTs,
2008 Ops: CallArgBeginOps);
2009 InGlue = Chain.getValue(R: 1);
2010
2011 for (unsigned i = 0, e = std::min(a: CLI.NumFixedArgs + 1, b: ParamCount); i != e;
2012 ++i) {
2013 unsigned opcode;
2014 if (i == (e - 1))
2015 opcode = NVPTXISD::LastCallArg;
2016 else
2017 opcode = NVPTXISD::CallArg;
2018 SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2019 SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
2020 DAG.getConstant(i, dl, MVT::i32), InGlue };
2021 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
2022 InGlue = Chain.getValue(R: 1);
2023 }
2024 SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2025 SDValue CallArgEndOps[] = { Chain,
2026 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
2027 InGlue };
2028 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
2029 InGlue = Chain.getValue(R: 1);
2030
2031 if (isIndirectCall) {
2032 SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
2033 SDValue PrototypeOps[] = {
2034 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
2035 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
2036 InGlue = Chain.getValue(R: 1);
2037 }
2038
2039 SmallVector<SDValue, 16> ProxyRegOps;
2040 SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
2041 // An item of the vector is filled if the element does not need a ProxyReg
2042 // operation on it and should be added to InVals as is. ProxyRegOps and
2043 // ProxyRegTruncates contain empty/none items at the same index.
2044 SmallVector<SDValue, 16> RetElts;
2045 // Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
2046 // to consume the values of the `LoadParam`s; they are replaced later, once
2047 // `CALLSEQ_END` has been added.
2048 SmallVector<SDValue, 16> TempProxyRegOps;
2049
2050 // Generate loads from param memory/moves from registers for result
2051 if (Ins.size() > 0) {
2052 SmallVector<EVT, 16> VTs;
2053 SmallVector<uint64_t, 16> Offsets;
2054 ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0);
2055 assert(VTs.size() == Ins.size() && "Bad value decomposition");
2056
2057 Align RetAlign = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL);
2058 auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: RetAlign);
2059
2060 SmallVector<EVT, 6> LoadVTs;
2061 int VecIdx = -1; // Index of the first element of the vector.
2062
2063 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2064 // 32-bits are sign extended or zero extended, depending on whether
2065 // they are signed or unsigned types.
2066 bool ExtendIntegerRetVal =
2067 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32;
2068
2069 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2070 bool needTruncate = false;
2071 EVT TheLoadType = VTs[i];
2072 EVT EltType = Ins[i].VT;
2073 Align EltAlign = commonAlignment(A: RetAlign, Offset: Offsets[i]);
2074 MVT PromotedVT;
2075
2076 if (PromoteScalarIntegerPTX(VT: TheLoadType, PromotedVT: &PromotedVT)) {
2077 TheLoadType = EVT(PromotedVT);
2078 EltType = EVT(PromotedVT);
2079 needTruncate = true;
2080 }
2081
2082 if (ExtendIntegerRetVal) {
2083 TheLoadType = MVT::i32;
2084 EltType = MVT::i32;
2085 needTruncate = true;
2086 } else if (TheLoadType.getSizeInBits() < 16) {
2087 if (VTs[i].isInteger())
2088 needTruncate = true;
2089 EltType = MVT::i16;
2090 }
2091
2092 // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
2093 // scalar load. In such cases, fall back to byte loads.
2094 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() &&
2095 EltAlign < DL.getABITypeAlign(
2096 Ty: TheLoadType.getTypeForEVT(Context&: *DAG.getContext()))) {
2097 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2098 SDValue Ret = LowerUnalignedLoadRetParam(
2099 DAG, Chain, Offset: Offsets[i], ElementType: TheLoadType, InGlue, TempProxyRegOps, dl);
2100 ProxyRegOps.push_back(Elt: SDValue());
2101 ProxyRegTruncates.push_back(Elt: std::optional<MVT>());
2102 RetElts.resize(N: i);
2103 RetElts.push_back(Elt: Ret);
2104
2105 continue;
2106 }
2107
2108 // Record index of the very first element of the vector.
2109 if (VectorInfo[i] & PVF_FIRST) {
2110 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
2111 VecIdx = i;
2112 }
2113
2114 LoadVTs.push_back(Elt: EltType);
2115
2116 if (VectorInfo[i] & PVF_LAST) {
2117 unsigned NumElts = LoadVTs.size();
2118 LoadVTs.push_back(MVT::Other);
2119 LoadVTs.push_back(MVT::Glue);
2120 NVPTXISD::NodeType Op;
2121 switch (NumElts) {
2122 case 1:
2123 Op = NVPTXISD::LoadParam;
2124 break;
2125 case 2:
2126 Op = NVPTXISD::LoadParamV2;
2127 break;
2128 case 4:
2129 Op = NVPTXISD::LoadParamV4;
2130 break;
2131 default:
2132 llvm_unreachable("Invalid vector info.");
2133 }
2134
2135 SDValue LoadOperands[] = {
2136 Chain, DAG.getConstant(1, dl, MVT::i32),
2137 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
2138 SDValue RetVal = DAG.getMemIntrinsicNode(
2139 Op, dl, DAG.getVTList(VTs: LoadVTs), LoadOperands, TheLoadType,
2140 MachinePointerInfo(), EltAlign,
2141 MachineMemOperand::MOLoad);
2142
2143 for (unsigned j = 0; j < NumElts; ++j) {
2144 ProxyRegOps.push_back(Elt: RetVal.getValue(R: j));
2145
2146 if (needTruncate)
2147 ProxyRegTruncates.push_back(Elt: std::optional<MVT>(Ins[VecIdx + j].VT));
2148 else
2149 ProxyRegTruncates.push_back(Elt: std::optional<MVT>());
2150 }
2151
2152 Chain = RetVal.getValue(R: NumElts);
2153 InGlue = RetVal.getValue(R: NumElts + 1);
2154
2155 // Cleanup
2156 VecIdx = -1;
2157 LoadVTs.clear();
2158 }
2159 }
2160 }
2161
2162 Chain =
2163 DAG.getCALLSEQ_END(Chain, Size1: UniqueCallSite, Size2: UniqueCallSite + 1, Glue: InGlue, DL: dl);
2164 InGlue = Chain.getValue(R: 1);
2165
2166 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
2167 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
2168 // dangling.
2169 for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
2170 if (i < RetElts.size() && RetElts[i]) {
2171 InVals.push_back(Elt: RetElts[i]);
2172 continue;
2173 }
2174
2175 SDValue Ret = DAG.getNode(
2176 NVPTXISD::ProxyReg, dl,
2177 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
2178 { Chain, ProxyRegOps[i], InGlue }
2179 );
2180
2181 Chain = Ret.getValue(R: 1);
2182 InGlue = Ret.getValue(R: 2);
2183
2184 if (ProxyRegTruncates[i]) {
2185 Ret = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: *ProxyRegTruncates[i], Operand: Ret);
2186 }
2187
2188 InVals.push_back(Elt: Ret);
2189 }
2190
2191 for (SDValue &T : TempProxyRegOps) {
2192 SDValue Repl = DAG.getNode(
2193 NVPTXISD::ProxyReg, dl,
2194 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
2195 {Chain, T.getOperand(0), InGlue});
2196 DAG.ReplaceAllUsesWith(From: T, To: Repl);
2197 DAG.RemoveDeadNode(N: T.getNode());
2198
2199 Chain = Repl.getValue(R: 1);
2200 InGlue = Repl.getValue(R: 2);
2201 }
2202
2203 // set isTailCall to false for now, until we figure out how to express
2204 // tail call optimization in PTX
2205 isTailCall = false;
2206 return Chain;
2207}
2208
2209SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
2210 SelectionDAG &DAG) const {
2211
2212 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2213 const Function &Fn = DAG.getMachineFunction().getFunction();
2214
2215 DiagnosticInfoUnsupported NoDynamicAlloca(
2216 Fn,
2217 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2218 "requires target sm_52.",
2219 SDLoc(Op).getDebugLoc());
2220 DAG.getContext()->diagnose(DI: NoDynamicAlloca);
2221 auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()),
2222 Op.getOperand(i: 0)};
2223 return DAG.getMergeValues(Ops, dl: SDLoc());
2224 }
2225
2226 SDValue Chain = Op.getOperand(i: 0);
2227 SDValue Size = Op.getOperand(i: 1);
2228 uint64_t Align = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getZExtValue();
2229 SDLoc DL(Op.getNode());
2230
2231 // The PTX alloca size operand is 64-bit for m64 and 32-bit for m32.
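 // A hedged sketch of the eventual PTX (see the PTX ISA description of
 // `alloca` for the exact operand forms): roughly `alloca.u64 %rd2, %rd1, 16;`
 // for m64, or an `alloca.u32` form for m32.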
2232 if (nvTM->is64Bit())
2233 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
2234 else
2235 Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
2236
2237 SDValue AllocOps[] = {Chain, Size,
2238 DAG.getTargetConstant(Align, DL, MVT::i32)};
2239 SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
2240 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
2241
2242 SDValue MergeOps[] = {Alloca, Chain};
2243 return DAG.getMergeValues(Ops: MergeOps, dl: DL);
2244}
2245
2246// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2247// (see LegalizeDAG.cpp). This is slow and uses local memory.
2248// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
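// For example, concatenating two v2f16 operands extracts the four f16
// elements and rebuilds them as a single v4f16 BUILD_VECTOR.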
2249SDValue
2250NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
2251 SDNode *Node = Op.getNode();
2252 SDLoc dl(Node);
2253 SmallVector<SDValue, 8> Ops;
2254 unsigned NumOperands = Node->getNumOperands();
2255 for (unsigned i = 0; i < NumOperands; ++i) {
2256 SDValue SubOp = Node->getOperand(Num: i);
2257 EVT VVT = SubOp.getNode()->getValueType(ResNo: 0);
2258 EVT EltVT = VVT.getVectorElementType();
2259 unsigned NumSubElem = VVT.getVectorNumElements();
2260 for (unsigned j = 0; j < NumSubElem; ++j) {
2261 Ops.push_back(Elt: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: SubOp,
2262 N2: DAG.getIntPtrConstant(Val: j, DL: dl)));
2263 }
2264 }
2265 return DAG.getBuildVector(VT: Node->getValueType(ResNo: 0), DL: dl, Ops);
2266}
2267
2268// We can initialize a constant f16x2/v2i16/v4i8 with a single .b32 move. Normally
2269// it would get lowered as two constant loads and a vector-packing move.
2270// Instead we want just a constant move:
2271// mov.b32 %r2, 0x40003C00
2272SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
2273 SelectionDAG &DAG) const {
2274 EVT VT = Op->getValueType(ResNo: 0);
2275 if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2276 return Op;
2277
2278 SDLoc DL(Op);
2279
2280 if (!llvm::all_of(Range: Op->ops(), P: [](SDValue Operand) {
2281 return Operand->isUndef() || isa<ConstantSDNode>(Val: Operand) ||
2282 isa<ConstantFPSDNode>(Val: Operand);
2283 })) {
2284 // Lower a non-constant v4i8 vector as a byte-wise constructed i32, which
2285 // allows us to optimize the calculation of its constant parts.
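 // Each NVPTXISD::BFI below corresponds to a PTX bfi.b32 that inserts the low
 // 8 bits of its first operand into its second operand at the given bit
 // position (a sketch of the intent; see the PTX ISA for exact semantics).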
2286 if (VT == MVT::v4i8) {
2287 SDValue C8 = DAG.getConstant(8, DL, MVT::i32);
2288 SDValue E01 = DAG.getNode(
2289 NVPTXISD::BFI, DL, MVT::i32,
2290 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32),
2291 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8);
2292 SDValue E012 =
2293 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2294 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32),
2295 E01, DAG.getConstant(16, DL, MVT::i32), C8);
2296 SDValue E0123 =
2297 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2298 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32),
2299 E012, DAG.getConstant(24, DL, MVT::i32), C8);
2300 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: E0123);
2301 }
2302 return Op;
2303 }
2304
2305 // Get the Nth operand's value as an APInt(32). Undef values are treated as 0.
2306 auto GetOperand = [](SDValue Op, int N) -> APInt {
2307 const SDValue &Operand = Op->getOperand(Num: N);
2308 EVT VT = Op->getValueType(ResNo: 0);
2309 if (Operand->isUndef())
2310 return APInt(32, 0);
2311 APInt Value;
2312 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2313 Value = cast<ConstantFPSDNode>(Val: Operand)->getValueAPF().bitcastToAPInt();
2314 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
2315 Value = Operand->getAsAPIntVal();
2316 else
2317 llvm_unreachable("Unsupported type");
2318 // i8 values are carried around as i16, so we need to zero out the upper
2319 // bits so they do not get in the way when combining individual byte values.
2320 if (VT == MVT::v4i8)
2321 Value = Value.trunc(width: 8);
2322 return Value.zext(width: 32);
2323 };
2324 APInt Value;
2325 if (Isv2x16VT(VT)) {
2326 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 16);
2327 } else if (VT == MVT::v4i8) {
2328 Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 8) |
2329 GetOperand(Op, 2).shl(shiftAmt: 16) | GetOperand(Op, 3).shl(shiftAmt: 24);
2330 } else {
2331 llvm_unreachable("Unsupported type");
2332 }
2333 SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32);
2334 return DAG.getNode(Opcode: ISD::BITCAST, DL: SDLoc(Op), VT: Op->getValueType(ResNo: 0), Operand: Const);
2335}
2336
2337SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
2338 SelectionDAG &DAG) const {
2339 SDValue Index = Op->getOperand(Num: 1);
2340 SDValue Vector = Op->getOperand(Num: 0);
2341 SDLoc DL(Op);
2342 EVT VectorVT = Vector.getValueType();
2343
2344 if (VectorVT == MVT::v4i8) {
2345 SDValue BFE =
2346 DAG.getNode(NVPTXISD::BFE, DL, MVT::i32,
2347 {Vector,
2348 DAG.getNode(ISD::MUL, DL, MVT::i32,
2349 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2350 DAG.getConstant(8, DL, MVT::i32)),
2351 DAG.getConstant(8, DL, MVT::i32)});
2352 return DAG.getAnyExtOrTrunc(Op: BFE, DL, VT: Op->getValueType(ResNo: 0));
2353 }
2354
2355 // Constant index will be matched by tablegen.
2356 if (isa<ConstantSDNode>(Val: Index.getNode()))
2357 return Op;
2358
2359 // Extract individual elements and select one of them.
2360 assert(Isv2x16VT(VectorVT) && "Unexpected vector type.");
2361 EVT EltVT = VectorVT.getVectorElementType();
2362
2363 SDLoc dl(Op.getNode());
2364 SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector,
2365 N2: DAG.getIntPtrConstant(Val: 0, DL: dl));
2366 SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector,
2367 N2: DAG.getIntPtrConstant(Val: 1, DL: dl));
2368 return DAG.getSelectCC(DL: dl, LHS: Index, RHS: DAG.getIntPtrConstant(Val: 0, DL: dl), True: E0, False: E1,
2369 Cond: ISD::CondCode::SETEQ);
2370}
2371
2372SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2373 SelectionDAG &DAG) const {
2374 SDValue Vector = Op->getOperand(Num: 0);
2375 EVT VectorVT = Vector.getValueType();
2376
2377 if (VectorVT != MVT::v4i8)
2378 return Op;
2379 SDLoc DL(Op);
2380 SDValue Value = Op->getOperand(Num: 1);
2381 if (Value->isUndef())
2382 return Vector;
2383
2384 SDValue Index = Op->getOperand(Num: 2);
2385
2386 SDValue BFI =
2387 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2388 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2389 DAG.getNode(ISD::MUL, DL, MVT::i32,
2390 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2391 DAG.getConstant(8, DL, MVT::i32)),
2392 DAG.getConstant(8, DL, MVT::i32)});
2393 return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op->getValueType(ResNo: 0), Operand: BFI);
2394}
2395
2396SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2397 SelectionDAG &DAG) const {
2398 SDValue V1 = Op.getOperand(i: 0);
2399 EVT VectorVT = V1.getValueType();
2400 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2401 return Op;
2402
2403 // Lower shuffle to PRMT instruction.
2404 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode());
2405 SDValue V2 = Op.getOperand(i: 1);
2406 uint32_t Selector = 0;
2407 for (auto I : llvm::enumerate(First: SVN->getMask())) {
2408 if (I.value() != -1) // -1 is a placeholder for undef.
2409 Selector |= (I.value() << (I.index() * 4));
2410 }
2411
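  // Illustrative example (assuming the usual PRMT byte-selector encoding): a
  // mask of {4, 1, 6, 3} packs into selector 0x3614, so result byte 0 comes
  // from byte 4 of the combined {V1, V2} byte pool (V2's low byte), and so on.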
2412 SDLoc DL(Op);
2413 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2,
2414 DAG.getConstant(Selector, DL, MVT::i32),
2415 DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32));
2416}
2417/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2418/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2419/// amount, or
2420/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2421/// amount.
2422SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2423 SelectionDAG &DAG) const {
2424 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2425 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2426
2427 EVT VT = Op.getValueType();
2428 unsigned VTBits = VT.getSizeInBits();
2429 SDLoc dl(Op);
2430 SDValue ShOpLo = Op.getOperand(i: 0);
2431 SDValue ShOpHi = Op.getOperand(i: 1);
2432 SDValue ShAmt = Op.getOperand(i: 2);
2433 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2434
2435 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2436 // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
2437 // {dHi, dLo} = {aHi, aLo} >> Amt
2438 // dHi = aHi >> Amt
2439 // dLo = shf.r.clamp aLo, aHi, Amt
2440
2441 SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt);
2442 SDValue Lo = DAG.getNode(Opcode: NVPTXISD::FUN_SHFR_CLAMP, DL: dl, VT, N1: ShOpLo, N2: ShOpHi,
2443 N3: ShAmt);
2444
2445 SDValue Ops[2] = { Lo, Hi };
2446 return DAG.getMergeValues(Ops, dl);
2447 } else {
2449 // {dHi, dLo} = {aHi, aLo} >> Amt
2450 // - if (Amt>=size) then
2451 // dLo = aHi >> (Amt-size)
2452 // dHi = aHi >> Amt (this is either all 0 or all 1)
2453 // else
2454 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2455 // dHi = aHi >> Amt
2456
2457 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2458 DAG.getConstant(VTBits, dl, MVT::i32),
2459 ShAmt);
2460 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: ShAmt);
2461 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2462 DAG.getConstant(VTBits, dl, MVT::i32));
2463 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: RevShAmt);
2464 SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2);
2465 SDValue TrueVal = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ExtraShAmt);
2466
2467 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2468 DAG.getConstant(VTBits, dl, MVT::i32),
2469 ISD::SETGE);
2470 SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt);
2471 SDValue Lo = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal);
2472
2473 SDValue Ops[2] = { Lo, Hi };
2474 return DAG.getMergeValues(Ops, dl);
2475 }
2476}
2477
2478/// LowerShiftLeftParts - Lower SHL_PARTS, which
2479/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2480/// amount, or
2481/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2482/// amount.
2483SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2484 SelectionDAG &DAG) const {
2485 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2486 assert(Op.getOpcode() == ISD::SHL_PARTS);
2487
2488 EVT VT = Op.getValueType();
2489 unsigned VTBits = VT.getSizeInBits();
2490 SDLoc dl(Op);
2491 SDValue ShOpLo = Op.getOperand(i: 0);
2492 SDValue ShOpHi = Op.getOperand(i: 1);
2493 SDValue ShAmt = Op.getOperand(i: 2);
2494
2495 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2496 // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
2497 // {dHi, dLo} = {aHi, aLo} << Amt
2498 // dHi = shf.l.clamp aLo, aHi, Amt
2499 // dLo = aLo << Amt
2500
2501 SDValue Hi = DAG.getNode(Opcode: NVPTXISD::FUN_SHFL_CLAMP, DL: dl, VT, N1: ShOpLo, N2: ShOpHi,
2502 N3: ShAmt);
2503 SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt);
2504
2505 SDValue Ops[2] = { Lo, Hi };
2506 return DAG.getMergeValues(Ops, dl);
2507 } else {
2509 // {dHi, dLo} = {aHi, aLo} << Amt
2510 // - if (Amt>=size) then
2511 // dLo = aLo << Amt (all 0)
2512 // dLo = aLo << (Amt-size)
2513 // else
2514 // dLo = aLo << Amt
2515 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2516
2517 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2518 DAG.getConstant(VTBits, dl, MVT::i32),
2519 ShAmt);
2520 SDValue Tmp1 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: ShAmt);
2521 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2522 DAG.getConstant(VTBits, dl, MVT::i32));
2523 SDValue Tmp2 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: RevShAmt);
2524 SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2);
2525 SDValue TrueVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ExtraShAmt);
2526
2527 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2528 DAG.getConstant(VTBits, dl, MVT::i32),
2529 ISD::SETGE);
2530 SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt);
2531 SDValue Hi = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal);
2532
2533 SDValue Ops[2] = { Lo, Hi };
2534 return DAG.getMergeValues(Ops, dl);
2535 }
2536}
2537
2538SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2539 EVT VT = Op.getValueType();
2540
2541 if (VT == MVT::f32)
2542 return LowerFROUND32(Op, DAG);
2543
2544 if (VT == MVT::f64)
2545 return LowerFROUND64(Op, DAG);
2546
2547 llvm_unreachable("unhandled type");
2548}
2549
2550// This is the rounding method used in CUDA libdevice, in C-like code:
2551// float roundf(float A)
2552// {
2553// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2554// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2555// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2556// }
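// Worked example: roundf(2.5f) takes the first branch, trunc(2.5 + 0.5) = 3.0,
// while roundf(0.3f) hits the abs(A) < 0.5 case and returns trunc(0.3) = 0.0.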
2557SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2558 SelectionDAG &DAG) const {
2559 SDLoc SL(Op);
2560 SDValue A = Op.getOperand(i: 0);
2561 EVT VT = Op.getValueType();
2562
2563 SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A);
2564
2565 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2566 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2567 const int SignBitMask = 0x80000000;
2568 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2569 DAG.getConstant(SignBitMask, SL, MVT::i32));
2570 const int PointFiveInBits = 0x3F000000;
2571 SDValue PointFiveWithSignRaw =
2572 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2573 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2574 SDValue PointFiveWithSign =
2575 DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: PointFiveWithSignRaw);
2576 SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: A, N2: PointFiveWithSign);
2577 SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA);
2578
2579 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2580 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT);
2581 SDValue IsLarge =
2582 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 23.0), DL: SL, VT),
2583 Cond: ISD::SETOGT);
2584 RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA);
2585
2586 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2587 SDValue IsSmall =DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA,
2588 RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT);
2589 SDValue RoundedAForSmallA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: A);
2590 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall, N2: RoundedAForSmallA, N3: RoundedA);
2591}
2592
2593// The implementation of round(double) is similar to that of round(float) in
2594// that they both separate the value range into three regions and use a method
2595// specific to the region to round the values. However, round(double) first
2596// calculates the round of the absolute value and then adds the sign back while
2597// round(float) directly rounds the value with sign.
2598SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2599 SelectionDAG &DAG) const {
2600 SDLoc SL(Op);
2601 SDValue A = Op.getOperand(i: 0);
2602 EVT VT = Op.getValueType();
2603
2604 SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A);
2605
2606 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2607 SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: AbsA,
2608 N2: DAG.getConstantFP(Val: 0.5, DL: SL, VT));
2609 SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA);
2610
2611 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2612 EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT);
2613 SDValue IsSmall =DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA,
2614 RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT);
2615 RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall,
2616 N2: DAG.getConstantFP(Val: 0, DL: SL, VT),
2617 N3: RoundedA);
2618
2619 // Add sign to rounded_A
2620 RoundedA = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: RoundedA, N2: A);
2622
2623 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2624 SDValue IsLarge =
2625 DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 52.0), DL: SL, VT),
2626 Cond: ISD::SETOGT);
2627 return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA);
2628}
2629
2630SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2631 SelectionDAG &DAG) const {
2632 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2633
2634 if (Op.getValueType() == MVT::bf16) {
2635 SDLoc Loc(Op);
2636 return DAG.getNode(
2637 ISD::FP_ROUND, Loc, MVT::bf16,
2638 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2639 DAG.getIntPtrConstant(0, Loc));
2640 }
2641
2642 // Everything else is considered legal.
2643 return Op;
2644}
2645
2646SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2647 SelectionDAG &DAG) const {
2648 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2649
2650 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2651 SDLoc Loc(Op);
2652 return DAG.getNode(
2653 Op.getOpcode(), Loc, Op.getValueType(),
2654 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2655 }
2656
2657 // Everything else is considered legal.
2658 return Op;
2659}
2660
2661SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2662 SelectionDAG &DAG) const {
2663 EVT NarrowVT = Op.getValueType();
2664 SDValue Wide = Op.getOperand(i: 0);
2665 EVT WideVT = Wide.getValueType();
2666 if (NarrowVT.getScalarType() == MVT::bf16) {
2667 const TargetLowering *TLI = STI.getTargetLowering();
2668 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2669 return TLI->expandFP_ROUND(Node: Op.getNode(), DAG);
2670 }
2671 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2672 // This combination was the first to support f32 -> bf16.
2673 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2674 if (WideVT.getScalarType() == MVT::f32) {
2675 return Op;
2676 }
2677 if (WideVT.getScalarType() == MVT::f64) {
2678 SDLoc Loc(Op);
2679 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2680 // the hardware f32 -> bf16 instruction.
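          // (Rounding to odd on the f64 -> f32 step avoids double-rounding
          // errors in the subsequent f32 -> bf16 rounding.)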
2681 SDValue rod = TLI->expandRoundInexactToOdd(
2682 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2683 : MVT::f32,
2684 Wide, Loc, DAG);
2685 return DAG.getFPExtendOrRound(Op: rod, DL: Loc, VT: NarrowVT);
2686 }
2687 }
2688 return TLI->expandFP_ROUND(Node: Op.getNode(), DAG);
2689 }
2690 }
2691
2692 // Everything else is considered legal.
2693 return Op;
2694}
2695
2696SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2697 SelectionDAG &DAG) const {
2698 SDValue Narrow = Op.getOperand(i: 0);
2699 EVT NarrowVT = Narrow.getValueType();
2700 EVT WideVT = Op.getValueType();
2701 if (NarrowVT.getScalarType() == MVT::bf16) {
2702 if (WideVT.getScalarType() == MVT::f32 &&
2703 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2704 SDLoc Loc(Op);
2705 return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: WideVT, Operand: Narrow);
2706 }
2707 if (WideVT.getScalarType() == MVT::f64 &&
2708 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2709 EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2710 : MVT::f32;
2711 SDLoc Loc(Op);
2712 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2713 Op = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: F32, Operand: Narrow);
2714 } else {
2715 Op = DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: F32, Operand: Narrow);
2716 }
2717 return DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: WideVT, Operand: Op);
2718 }
2719 }
2720
2721 // Everything else is considered legal.
2722 return Op;
2723}
2724
2725static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) {
2726 SDLoc DL(Op);
2727 if (Op.getValueType() != MVT::v2i16)
2728 return Op;
2729 EVT EltVT = Op.getValueType().getVectorElementType();
2730 SmallVector<SDValue> VecElements;
2731 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2732 SmallVector<SDValue> ScalarArgs;
2733 llvm::transform(Range: Op->ops(), d_first: std::back_inserter(x&: ScalarArgs),
2734 F: [&](const SDUse &O) {
2735 return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT,
2736 N1: O.get(), N2: DAG.getIntPtrConstant(Val: I, DL));
2737 });
2738 VecElements.push_back(Elt: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: EltVT, Ops: ScalarArgs));
2739 }
2740 SDValue V =
2741 DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: Op.getValueType(), Ops: VecElements);
2742 return V;
2743}
2744
2745SDValue
2746NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2747 switch (Op.getOpcode()) {
2748 case ISD::RETURNADDR:
2749 return SDValue();
2750 case ISD::FRAMEADDR:
2751 return SDValue();
2752 case ISD::GlobalAddress:
2753 return LowerGlobalAddress(Op, DAG);
2754 case ISD::INTRINSIC_W_CHAIN:
2755 return Op;
2756 case ISD::BUILD_VECTOR:
2757 return LowerBUILD_VECTOR(Op, DAG);
2758 case ISD::EXTRACT_SUBVECTOR:
2759 return Op;
2760 case ISD::EXTRACT_VECTOR_ELT:
2761 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2762 case ISD::INSERT_VECTOR_ELT:
2763 return LowerINSERT_VECTOR_ELT(Op, DAG);
2764 case ISD::VECTOR_SHUFFLE:
2765 return LowerVECTOR_SHUFFLE(Op, DAG);
2766 case ISD::CONCAT_VECTORS:
2767 return LowerCONCAT_VECTORS(Op, DAG);
2768 case ISD::STORE:
2769 return LowerSTORE(Op, DAG);
2770 case ISD::LOAD:
2771 return LowerLOAD(Op, DAG);
2772 case ISD::SHL_PARTS:
2773 return LowerShiftLeftParts(Op, DAG);
2774 case ISD::SRA_PARTS:
2775 case ISD::SRL_PARTS:
2776 return LowerShiftRightParts(Op, DAG);
2777 case ISD::SELECT:
2778 return LowerSelect(Op, DAG);
2779 case ISD::FROUND:
2780 return LowerFROUND(Op, DAG);
2781 case ISD::SINT_TO_FP:
2782 case ISD::UINT_TO_FP:
2783 return LowerINT_TO_FP(Op, DAG);
2784 case ISD::FP_TO_SINT:
2785 case ISD::FP_TO_UINT:
2786 return LowerFP_TO_INT(Op, DAG);
2787 case ISD::FP_ROUND:
2788 return LowerFP_ROUND(Op, DAG);
2789 case ISD::FP_EXTEND:
2790 return LowerFP_EXTEND(Op, DAG);
2791 case ISD::VAARG:
2792 return LowerVAARG(Op, DAG);
2793 case ISD::VASTART:
2794 return LowerVASTART(Op, DAG);
2795 case ISD::ABS:
2796 case ISD::SMIN:
2797 case ISD::SMAX:
2798 case ISD::UMIN:
2799 case ISD::UMAX:
2800 case ISD::ADD:
2801 case ISD::SUB:
2802 case ISD::MUL:
2803 case ISD::SHL:
2804 case ISD::SREM:
2805 case ISD::UREM:
2806 return LowerVectorArith(Op, DAG);
2807 case ISD::DYNAMIC_STACKALLOC:
2808 return LowerDYNAMIC_STACKALLOC(Op, DAG);
2809 default:
2810 llvm_unreachable("Custom lowering not defined for operation");
2811 }
2812}
2813
2814// This function is almost a copy of SelectionDAG::expandVAArg().
2815// The only difference is that this one produces loads from the local address space.
2816SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
2817 const TargetLowering *TLI = STI.getTargetLowering();
2818 SDLoc DL(Op);
2819
2820 SDNode *Node = Op.getNode();
2821 const Value *V = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue();
2822 EVT VT = Node->getValueType(ResNo: 0);
2823 auto *Ty = VT.getTypeForEVT(Context&: *DAG.getContext());
2824 SDValue Tmp1 = Node->getOperand(Num: 0);
2825 SDValue Tmp2 = Node->getOperand(Num: 1);
2826 const MaybeAlign MA(Node->getConstantOperandVal(Num: 3));
2827
2828 SDValue VAListLoad = DAG.getLoad(VT: TLI->getPointerTy(DL: DAG.getDataLayout()), dl: DL,
2829 Chain: Tmp1, Ptr: Tmp2, PtrInfo: MachinePointerInfo(V));
2830 SDValue VAList = VAListLoad;
2831
2832 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2833 VAList = DAG.getNode(
2834 Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList,
2835 N2: DAG.getConstant(Val: MA->value() - 1, DL, VT: VAList.getValueType()));
2836
2837 VAList = DAG.getNode(
2838 Opcode: ISD::AND, DL, VT: VAList.getValueType(), N1: VAList,
2839 N2: DAG.getConstant(Val: -(int64_t)MA->value(), DL, VT: VAList.getValueType()));
2840 }
2841
2842 // Increment the pointer, VAList, to the next vaarg
2843 Tmp1 = DAG.getNode(Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList,
2844 N2: DAG.getConstant(Val: DAG.getDataLayout().getTypeAllocSize(Ty),
2845 DL, VT: VAList.getValueType()));
2846
2847 // Store the incremented VAList to the legalized pointer
2848 Tmp1 = DAG.getStore(Chain: VAListLoad.getValue(R: 1), dl: DL, Val: Tmp1, Ptr: Tmp2,
2849 PtrInfo: MachinePointerInfo(V));
2850
2851 const Value *SrcV =
2852 Constant::getNullValue(Ty: PointerType::get(ElementType: Ty, AddressSpace: ADDRESS_SPACE_LOCAL));
2853
2854 // Load the actual argument out of the pointer VAList
2855 return DAG.getLoad(VT, dl: DL, Chain: Tmp1, Ptr: VAList, PtrInfo: MachinePointerInfo(SrcV));
2856}
2857
2858SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
2859 const TargetLowering *TLI = STI.getTargetLowering();
2860 SDLoc DL(Op);
2861 EVT PtrVT = TLI->getPointerTy(DL: DAG.getDataLayout());
2862
2863 // Store the address of unsized array <function>_vararg[] in the ap object.
2864 SDValue Arg = getParamSymbol(DAG, /* vararg */ idx: -1, PtrVT);
2865 SDValue VAReg = DAG.getNode(Opcode: NVPTXISD::Wrapper, DL, VT: PtrVT, Operand: Arg);
2866
2867 const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue();
2868 return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: VAReg, Ptr: Op.getOperand(i: 1),
2869 PtrInfo: MachinePointerInfo(SV));
2870}
2871
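// i1 SELECT is lowered by widening the operands to i32, selecting on the i32
// values, and truncating the result back to i1.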
2872SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2873 SDValue Op0 = Op->getOperand(Num: 0);
2874 SDValue Op1 = Op->getOperand(Num: 1);
2875 SDValue Op2 = Op->getOperand(Num: 2);
2876 SDLoc DL(Op.getNode());
2877
2878 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2879
2880 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2881 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2882 SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2883 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2884
2885 return Trunc;
2886}
2887
2888SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2889 if (Op.getValueType() == MVT::i1)
2890 return LowerLOADi1(Op, DAG);
2891
2892  // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2893  // handle unaligned loads and have to handle them here.
2894 EVT VT = Op.getValueType();
2895 if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2896 LoadSDNode *Load = cast<LoadSDNode>(Val&: Op);
2897 EVT MemVT = Load->getMemoryVT();
2898 if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(),
2899 VT: MemVT, MMO: *Load->getMemOperand())) {
2900 SDValue Ops[2];
2901 std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG);
2902 return DAG.getMergeValues(Ops, dl: SDLoc(Op));
2903 }
2904 }
2905
2906 return SDValue();
2907}
2908
2909// v = ld i1* addr
2910// =>
2911// v1 = ld i8* addr (-> i16)
2912// v = trunc i16 to i1
2913SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2914 SDNode *Node = Op.getNode();
2915 LoadSDNode *LD = cast<LoadSDNode>(Val: Node);
2916 SDLoc dl(Node);
2917 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2918 assert(Node->getValueType(0) == MVT::i1 &&
2919 "Custom lowering for i1 load only");
2920 SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2921 LD->getPointerInfo(), LD->getAlign(),
2922 LD->getMemOperand()->getFlags());
2923 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2924 // The legalizer (the caller) is expecting two values from the legalized
2925 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2926 // in LegalizeDAG.cpp which also uses MergeValues.
2927 SDValue Ops[] = { result, LD->getChain() };
2928 return DAG.getMergeValues(Ops, dl);
2929}
2930
2931SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2932 StoreSDNode *Store = cast<StoreSDNode>(Val&: Op);
2933 EVT VT = Store->getMemoryVT();
2934
2935 if (VT == MVT::i1)
2936 return LowerSTOREi1(Op, DAG);
2937
2938  // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2939  // handle unaligned stores and have to handle them here.
2940 if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2941 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2942 VT, *Store->getMemOperand()))
2943 return expandUnalignedStore(ST: Store, DAG);
2944
2945  // v2f16, v2bf16, v2i16, and v4i8 need no further special handling.
2946 if (Isv2x16VT(VT) || VT == MVT::v4i8)
2947 return SDValue();
2948
2949 if (VT.isVector())
2950 return LowerSTOREVector(Op, DAG);
2951
2952 return SDValue();
2953}
2954
2955SDValue
2956NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2957 SDNode *N = Op.getNode();
2958 SDValue Val = N->getOperand(Num: 1);
2959 SDLoc DL(N);
2960 EVT ValVT = Val.getValueType();
2961
2962 if (ValVT.isVector()) {
2963 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2964 // legal. We can (and should) split that into 2 stores of <2 x double> here
2965 // but I'm leaving that as a TODO for now.
2966 if (!ValVT.isSimple())
2967 return SDValue();
2968 switch (ValVT.getSimpleVT().SimpleTy) {
2969 default:
2970 return SDValue();
2971 case MVT::v2i8:
2972 case MVT::v2i16:
2973 case MVT::v2i32:
2974 case MVT::v2i64:
2975 case MVT::v2f16:
2976 case MVT::v2bf16:
2977 case MVT::v2f32:
2978 case MVT::v2f64:
2979 case MVT::v4i8:
2980 case MVT::v4i16:
2981 case MVT::v4i32:
2982 case MVT::v4f16:
2983 case MVT::v4bf16:
2984 case MVT::v4f32:
2985 case MVT::v8f16: // <4 x f16x2>
2986 case MVT::v8bf16: // <4 x bf16x2>
2987 case MVT::v8i16: // <4 x i16x2>
2988 // This is a "native" vector type
2989 break;
2990 }
2991
2992 MemSDNode *MemSD = cast<MemSDNode>(Val: N);
2993 const DataLayout &TD = DAG.getDataLayout();
2994
2995 Align Alignment = MemSD->getAlign();
2996 Align PrefAlign =
2997 TD.getPrefTypeAlign(Ty: ValVT.getTypeForEVT(Context&: *DAG.getContext()));
2998 if (Alignment < PrefAlign) {
2999 // This store is not sufficiently aligned, so bail out and let this vector
3000 // store be scalarized. Note that we may still be able to emit smaller
3001 // vector stores. For example, if we are storing a <4 x float> with an
3002 // alignment of 8, this check will fail but the legalizer will try again
3003 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3004 return SDValue();
3005 }
3006
3007 unsigned Opcode = 0;
3008 EVT EltVT = ValVT.getVectorElementType();
3009 unsigned NumElts = ValVT.getVectorNumElements();
3010
3011 // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
3012 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3013 // stored type to i16 and propagate the "real" type as the memory type.
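    // For example, a <2 x i8> store becomes a StoreV2 whose value operands are
    // any-extended to i16 while the memory VT remains v2i8.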
3014 bool NeedExt = false;
3015 if (EltVT.getSizeInBits() < 16)
3016 NeedExt = true;
3017
3018 bool StoreF16x2 = false;
3019 switch (NumElts) {
3020 default:
3021 return SDValue();
3022 case 2:
3023 Opcode = NVPTXISD::StoreV2;
3024 break;
3025 case 4:
3026 Opcode = NVPTXISD::StoreV4;
3027 break;
3028 case 8:
3029      // v8f16 (and the other 8-element 16-bit types) is a special case. PTX
3030      // doesn't have an st.v8.f16 instruction. Instead, we split the vector
3031      // into v2x16 chunks and store them with st.v4.b32.
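      // For example, an <8 x half> store is emitted as a StoreV4 whose four
      // value operands are <2 x half> vectors built from adjacent element pairs.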
3032 assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
3033 Opcode = NVPTXISD::StoreV4;
3034 StoreF16x2 = true;
3035 break;
3036 }
3037
3038 SmallVector<SDValue, 8> Ops;
3039
3040 // First is the chain
3041 Ops.push_back(Elt: N->getOperand(Num: 0));
3042
3043 if (StoreF16x2) {
3044 // Combine f16,f16 -> v2f16
3045 NumElts /= 2;
3046 for (unsigned i = 0; i < NumElts; ++i) {
3047 SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val,
3048 N2: DAG.getIntPtrConstant(Val: i * 2, DL));
3049 SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val,
3050 N2: DAG.getIntPtrConstant(Val: i * 2 + 1, DL));
3051 EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: 2);
3052 SDValue V2 = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, N1: E0, N2: E1);
3053 Ops.push_back(Elt: V2);
3054 }
3055 } else {
3056 // Then the split values
3057 for (unsigned i = 0; i < NumElts; ++i) {
3058 SDValue ExtVal = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val,
3059 N2: DAG.getIntPtrConstant(Val: i, DL));
3060 if (NeedExt)
3061 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3062 Ops.push_back(Elt: ExtVal);
3063 }
3064 }
3065
3066 // Then any remaining arguments
3067 Ops.append(in_start: N->op_begin() + 2, in_end: N->op_end());
3068
3069 SDValue NewSt =
3070 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3071 MemSD->getMemoryVT(), MemSD->getMemOperand());
3072
3073 // return DCI.CombineTo(N, NewSt, true);
3074 return NewSt;
3075 }
3076
3077 return SDValue();
3078}
3079
3080// st i1 v, addr
3081// =>
3082// v1 = zxt v to i16
3083// st.u8 i16, addr
3084SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3085 SDNode *Node = Op.getNode();
3086 SDLoc dl(Node);
3087 StoreSDNode *ST = cast<StoreSDNode>(Val: Node);
3088 SDValue Tmp1 = ST->getChain();
3089 SDValue Tmp2 = ST->getBasePtr();
3090 SDValue Tmp3 = ST->getValue();
3091 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3092 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3093 SDValue Result =
3094 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3095 ST->getAlign(), ST->getMemOperand()->getFlags());
3096 return Result;
3097}
3098
3099// This creates a target external symbol for a function parameter.
3100// The name of the symbol is composed from the parameter's index and the
3101// function name. A negative index corresponds to the special parameter
3102// (unsized array) used for passing variable arguments.
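// For a function "foo", the generated symbol is typically of the form
// "foo_param_<idx>" (with a dedicated "foo_vararg" name for the vararg array);
// the exact spelling comes from getParamName().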
3103SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
3104 EVT v) const {
3105 StringRef SavedStr = nvTM->getStrPool().save(
3106 S: getParamName(F: &DAG.getMachineFunction().getFunction(), Idx: idx));
3107 return DAG.getTargetExternalSymbol(Sym: SavedStr.data(), VT: v);
3108}
3109
3110SDValue NVPTXTargetLowering::LowerFormalArguments(
3111 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3112 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3113 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3114 MachineFunction &MF = DAG.getMachineFunction();
3115 const DataLayout &DL = DAG.getDataLayout();
3116 auto PtrVT = getPointerTy(DL: DAG.getDataLayout());
3117
3118 const Function *F = &MF.getFunction();
3119 const AttributeList &PAL = F->getAttributes();
3120 const TargetLowering *TLI = STI.getTargetLowering();
3121
3122 SDValue Root = DAG.getRoot();
3123 std::vector<SDValue> OutChains;
3124
3125 bool isABI = (STI.getSmVersion() >= 20);
3126 assert(isABI && "Non-ABI compilation is not supported");
3127 if (!isABI)
3128 return Chain;
3129
3130 std::vector<Type *> argTypes;
3131 std::vector<const Argument *> theArgs;
3132 for (const Argument &I : F->args()) {
3133 theArgs.push_back(x: &I);
3134 argTypes.push_back(x: I.getType());
3135 }
3136 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
3137 // Ins.size() will be larger
3138 // * if there is an aggregate argument with multiple fields (each field
3139 // showing up separately in Ins)
3140 // * if there is a vector argument with more than typical vector-length
3141 // elements (generally if more than 4) where each vector element is
3142 // individually present in Ins.
3143 // So a different index should be used for indexing into Ins.
3144 // See similar issue in LowerCall.
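  // For example, an aggregate argument of type {i32, float} contributes one
  // entry to theArgs but two entries (one per field) to Ins.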
3145 unsigned InsIdx = 0;
3146
3147 for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3148 Type *Ty = argTypes[i];
3149
3150 if (theArgs[i]->use_empty()) {
3151 // argument is dead
3152 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
3153 SmallVector<EVT, 16> vtparts;
3154
3155 ComputePTXValueVTs(TLI: *this, DL: DAG.getDataLayout(), Ty, ValueVTs&: vtparts);
3156 if (vtparts.empty())
3157 report_fatal_error(reason: "Empty parameter types are not supported");
3158
3159 for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
3160 ++parti) {
3161 InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3162 ++InsIdx;
3163 }
3164 if (vtparts.size() > 0)
3165 --InsIdx;
3166 continue;
3167 }
3168 if (Ty->isVectorTy()) {
3169 EVT ObjectVT = getValueType(DL, Ty);
3170 unsigned NumRegs = TLI->getNumRegisters(Context&: F->getContext(), VT: ObjectVT);
3171 for (unsigned parti = 0; parti < NumRegs; ++parti) {
3172 InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3173 ++InsIdx;
3174 }
3175 if (NumRegs > 0)
3176 --InsIdx;
3177 continue;
3178 }
3179 InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
3180 continue;
3181 }
3182
3183    // In the following cases, assign a node order of "i+1" to newly created
3184    // nodes. The SDNodes for params have to appear in the same order as the
3185    // parameters appear in the original function, and "i+1" preserves that
3186    // order.
3187 if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
3188 bool aggregateIsPacked = false;
3189 if (StructType *STy = dyn_cast<StructType>(Val: Ty))
3190 aggregateIsPacked = STy->isPacked();
3191
3192 SmallVector<EVT, 16> VTs;
3193 SmallVector<uint64_t, 16> Offsets;
3194 ComputePTXValueVTs(TLI: *this, DL, Ty, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0);
3195 if (VTs.empty())
3196 report_fatal_error(reason: "Empty parameter types are not supported");
3197
3198 auto VectorInfo =
3199 VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: DL.getABITypeAlign(Ty));
3200
3201 SDValue Arg = getParamSymbol(DAG, idx: i, v: PtrVT);
3202 int VecIdx = -1; // Index of the first element of the current vector.
3203 for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
3204 if (VectorInfo[parti] & PVF_FIRST) {
3205 assert(VecIdx == -1 && "Orphaned vector.");
3206 VecIdx = parti;
3207 }
3208
3209        // That's the last element of this load op.
3210 if (VectorInfo[parti] & PVF_LAST) {
3211 unsigned NumElts = parti - VecIdx + 1;
3212 EVT EltVT = VTs[parti];
3213 // i1 is loaded/stored as i8.
3214 EVT LoadVT = EltVT;
3215 if (EltVT == MVT::i1)
3216 LoadVT = MVT::i8;
3217 else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3218 // getLoad needs a vector type, but it can't handle
3219 // vectors which contain v2f16 or v2bf16 elements. So we must load
3220 // using i32 here and then bitcast back.
3221 LoadVT = MVT::i32;
3222
3223 EVT VecVT = EVT::getVectorVT(Context&: F->getContext(), VT: LoadVT, NumElements: NumElts);
3224 SDValue VecAddr =
3225 DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg,
3226 N2: DAG.getConstant(Val: Offsets[VecIdx], DL: dl, VT: PtrVT));
3227 Value *srcValue = Constant::getNullValue(Ty: PointerType::get(
3228 ElementType: EltVT.getTypeForEVT(Context&: F->getContext()), AddressSpace: ADDRESS_SPACE_PARAM));
3229
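          // Compute the alignment of this piece of the parameter: packed
          // aggregates are only byte-aligned, vectorized pieces fall back to
          // the default alignment of the loaded vector type, and scalar pieces
          // use the explicit parameter alignment (when at offset 0) or the ABI
          // alignment of the element type, adjusted for the piece's offset.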
3230 const MaybeAlign PartAlign = [&]() -> MaybeAlign {
3231 if (aggregateIsPacked)
3232 return Align(1);
3233 if (NumElts != 1)
3234 return std::nullopt;
3235 Align PartAlign =
3236 (Offsets[parti] == 0 && PAL.getParamAlignment(ArgNo: i))
3237 ? PAL.getParamAlignment(ArgNo: i).value()
3238 : DL.getABITypeAlign(Ty: EltVT.getTypeForEVT(Context&: F->getContext()));
3239 return commonAlignment(A: PartAlign, Offset: Offsets[parti]);
3240 }();
3241 SDValue P = DAG.getLoad(VT: VecVT, dl, Chain: Root, Ptr: VecAddr,
3242 PtrInfo: MachinePointerInfo(srcValue), Alignment: PartAlign,
3243 MMOFlags: MachineMemOperand::MODereferenceable |
3244 MachineMemOperand::MOInvariant);
3245 if (P.getNode())
3246 P.getNode()->setIROrder(i + 1);
3247 for (unsigned j = 0; j < NumElts; ++j) {
3248 SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: LoadVT, N1: P,
3249 N2: DAG.getIntPtrConstant(Val: j, DL: dl));
3250 // We've loaded i1 as an i8 and now must truncate it back to i1
3251 if (EltVT == MVT::i1)
3252 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
3253 // v2f16 was loaded as an i32. Now we must bitcast it back.
3254 else if (EltVT != LoadVT)
3255 Elt = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: EltVT, Operand: Elt);
3256
3257          // If a promoted integer type is used, truncate down to the original type.
3258 MVT PromotedVT;
3259 if (PromoteScalarIntegerPTX(VT: EltVT, PromotedVT: &PromotedVT)) {
3260 Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: EltVT, Operand: Elt);
3261 }
3262
3263 // Extend the element if necessary (e.g. an i8 is loaded
3264 // into an i16 register)
3265 if (Ins[InsIdx].VT.isInteger() &&
3266 Ins[InsIdx].VT.getFixedSizeInBits() >
3267 LoadVT.getFixedSizeInBits()) {
3268 unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
3269 : ISD::ZERO_EXTEND;
3270 Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3271 }
3272 InVals.push_back(Elt);
3273 }
3274
3275 // Reset vector tracking state.
3276 VecIdx = -1;
3277 }
3278 ++InsIdx;
3279 }
3280 if (VTs.size() > 0)
3281 --InsIdx;
3282 continue;
3283 }
3284
3285    // The param has the ByVal attribute.
3286    // Return MoveParam(param symbol).
3287    // Ideally, the param symbol could be returned directly,
3288    // but when the SDNode builder decides to use it in a CopyToReg(),
3289    // the machine instruction fails because a TargetExternalSymbol
3290    // (not lowered) is target dependent, and CopyToReg assumes
3291    // the source is lowered.
3292 EVT ObjectVT = getValueType(DL, Ty);
3293 assert(ObjectVT == Ins[InsIdx].VT &&
3294 "Ins type did not match function type");
3295 SDValue Arg = getParamSymbol(DAG, idx: i, v: PtrVT);
3296 SDValue p = DAG.getNode(Opcode: NVPTXISD::MoveParam, DL: dl, VT: ObjectVT, Operand: Arg);
3297 if (p.getNode())
3298 p.getNode()->setIROrder(i + 1);
3299 InVals.push_back(Elt: p);
3300 }
3301
3302 if (!OutChains.empty())
3303 DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
3304
3305 return Chain;
3306}
3307
3308// Use byte-stores when the param address of the return value is unaligned.
3309// This may happen when the return value is a field of a packed structure.
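// For example, when returning a packed struct <{ i8, i32 }>, the i32 field
// lives at offset 1 and must be written with four individual st.param.b8
// stores.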
3310static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
3311 uint64_t Offset, EVT ElementType,
3312 SDValue RetVal, const SDLoc &dl) {
3313 // Bit logic only works on integer types
3314 if (adjustElementType(ElementType))
3315 RetVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: RetVal);
3316
3317 // Store each byte
3318 for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3319    // Shift byte i of the value down into the least-significant byte.
3320 SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
3321 DAG.getConstant(i * 8, dl, MVT::i32));
3322 SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
3323 ShiftVal};
3324    // Truncating store of only the low byte by using st.param.b8.
3325    // The register type feeding the store can be larger than b8,
3326    // but only the lowest byte is written to memory.
3327 Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
3328 DAG.getVTList(MVT::Other), StoreOperands,
3329 MVT::i8, MachinePointerInfo(), std::nullopt,
3330 MachineMemOperand::MOStore);
3331 }
3332 return Chain;
3333}
3334
3335SDValue
3336NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3337 bool isVarArg,
3338 const SmallVectorImpl<ISD::OutputArg> &Outs,
3339 const SmallVectorImpl<SDValue> &OutVals,
3340 const SDLoc &dl, SelectionDAG &DAG) const {
3341 const MachineFunction &MF = DAG.getMachineFunction();
3342 const Function &F = MF.getFunction();
3343 Type *RetTy = MF.getFunction().getReturnType();
3344
3345 bool isABI = (STI.getSmVersion() >= 20);
3346 assert(isABI && "Non-ABI compilation is not supported");
3347 if (!isABI)
3348 return Chain;
3349
3350 const DataLayout &DL = DAG.getDataLayout();
3351 SmallVector<SDValue, 16> PromotedOutVals;
3352 SmallVector<EVT, 16> VTs;
3353 SmallVector<uint64_t, 16> Offsets;
3354 ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets);
3355 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
3356
3357 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3358 SDValue PromotedOutVal = OutVals[i];
3359 MVT PromotedVT;
3360 if (PromoteScalarIntegerPTX(VT: VTs[i], PromotedVT: &PromotedVT)) {
3361 VTs[i] = EVT(PromotedVT);
3362 }
3363 if (PromoteScalarIntegerPTX(VT: PromotedOutVal.getValueType(), PromotedVT: &PromotedVT)) {
3364 llvm::ISD::NodeType Ext =
3365 Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3366 PromotedOutVal = DAG.getNode(Opcode: Ext, DL: dl, VT: PromotedVT, Operand: PromotedOutVal);
3367 }
3368 PromotedOutVals.push_back(Elt: PromotedOutVal);
3369 }
3370
3371 auto VectorInfo = VectorizePTXValueVTs(
3372 ValueVTs: VTs, Offsets,
3373 ParamAlignment: RetTy->isSized() ? getFunctionParamOptimizedAlign(F: &F, ArgTy: RetTy, DL)
3374 : Align(1));
3375
3376 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
3377 // 32-bits are sign extended or zero extended, depending on whether
3378 // they are signed or unsigned types.
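  // For example, an i8 or i16 return value is widened to i32 before being
  // stored back to the return parameter.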
3379 bool ExtendIntegerRetVal =
3380 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32;
3381
3382 SmallVector<SDValue, 6> StoreOperands;
3383 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
3384 SDValue OutVal = OutVals[i];
3385 SDValue RetVal = PromotedOutVals[i];
3386
3387 if (ExtendIntegerRetVal) {
3388 RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
3389 : ISD::ZERO_EXTEND,
3390 dl, MVT::i32, RetVal);
3391 } else if (OutVal.getValueSizeInBits() < 16) {
3392      // Use 16-bit registers for small values, as i16 is the smallest
3393      // general-purpose register size supported by NVPTX.
3394 RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
3395 }
3396
3397 // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
3398 // for a scalar store. In such cases, fall back to byte stores.
3399 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
3400 EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3401 Align ElementTypeAlign =
3402 DL.getABITypeAlign(Ty: ElementType.getTypeForEVT(Context&: RetTy->getContext()));
3403 Align ElementAlign =
3404 commonAlignment(A: DL.getABITypeAlign(Ty: RetTy), Offset: Offsets[i]);
3405 if (ElementAlign < ElementTypeAlign) {
3406 assert(StoreOperands.empty() && "Orphaned operand list.");
3407 Chain = LowerUnalignedStoreRet(DAG, Chain, Offset: Offsets[i], ElementType,
3408 RetVal, dl);
3409
3410 // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
3411 // into the graph, so just move on to the next element.
3412 continue;
3413 }
3414 }
3415
3416 // New load/store. Record chain and offset operands.
3417 if (VectorInfo[i] & PVF_FIRST) {
3418 assert(StoreOperands.empty() && "Orphaned operand list.");
3419 StoreOperands.push_back(Elt: Chain);
3420 StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
3421 }
3422
3423 // Record the value to return.
3424 StoreOperands.push_back(Elt: RetVal);
3425
3426 // That's the last element of this store op.
3427 if (VectorInfo[i] & PVF_LAST) {
3428 NVPTXISD::NodeType Op;
3429 unsigned NumElts = StoreOperands.size() - 2;
3430 switch (NumElts) {
3431 case 1:
3432 Op = NVPTXISD::StoreRetval;
3433 break;
3434 case 2:
3435 Op = NVPTXISD::StoreRetvalV2;
3436 break;
3437 case 4:
3438 Op = NVPTXISD::StoreRetvalV4;
3439 break;
3440 default:
3441 llvm_unreachable("Invalid vector info.");
3442 }
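      // For example, four values vectorized together are emitted as a single
      // StoreRetvalV4 node with operands {Chain, offset, v0, v1, v2, v3}.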
3443
3444 // Adjust type of load/store op if we've extended the scalar
3445 // return value.
3446 EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3447 Chain = DAG.getMemIntrinsicNode(
3448 Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
3449 MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
3450 // Cleanup vector state.
3451 StoreOperands.clear();
3452 }
3453 }
3454
3455 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
3456}
3457
3458void NVPTXTargetLowering::LowerAsmOperandForConstraint(
3459 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
3460 SelectionDAG &DAG) const {
3461 if (Constraint.size() > 1)
3462 return;
3463 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3464}
3465
3466static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
3467 switch (Intrinsic) {
3468 default:
3469 return 0;
3470
3471 case Intrinsic::nvvm_tex_1d_v4f32_s32:
3472 return NVPTXISD::Tex1DFloatS32;
3473 case Intrinsic::nvvm_tex_1d_v4f32_f32:
3474 return NVPTXISD::Tex1DFloatFloat;
3475 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3476 return NVPTXISD::Tex1DFloatFloatLevel;
3477 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3478 return NVPTXISD::Tex1DFloatFloatGrad;
3479 case Intrinsic::nvvm_tex_1d_v4s32_s32:
3480 return NVPTXISD::Tex1DS32S32;
3481 case Intrinsic::nvvm_tex_1d_v4s32_f32:
3482 return NVPTXISD::Tex1DS32Float;
3483 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3484 return NVPTXISD::Tex1DS32FloatLevel;
3485 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3486 return NVPTXISD::Tex1DS32FloatGrad;
3487 case Intrinsic::nvvm_tex_1d_v4u32_s32:
3488 return NVPTXISD::Tex1DU32S32;
3489 case Intrinsic::nvvm_tex_1d_v4u32_f32:
3490 return NVPTXISD::Tex1DU32Float;
3491 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3492 return NVPTXISD::Tex1DU32FloatLevel;
3493 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3494 return NVPTXISD::Tex1DU32FloatGrad;
3495
3496 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3497 return NVPTXISD::Tex1DArrayFloatS32;
3498 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3499 return NVPTXISD::Tex1DArrayFloatFloat;
3500 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3501 return NVPTXISD::Tex1DArrayFloatFloatLevel;
3502 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3503 return NVPTXISD::Tex1DArrayFloatFloatGrad;
3504 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3505 return NVPTXISD::Tex1DArrayS32S32;
3506 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3507 return NVPTXISD::Tex1DArrayS32Float;
3508 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3509 return NVPTXISD::Tex1DArrayS32FloatLevel;
3510 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3511 return NVPTXISD::Tex1DArrayS32FloatGrad;
3512 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3513 return NVPTXISD::Tex1DArrayU32S32;
3514 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3515 return NVPTXISD::Tex1DArrayU32Float;
3516 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3517 return NVPTXISD::Tex1DArrayU32FloatLevel;
3518 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3519 return NVPTXISD::Tex1DArrayU32FloatGrad;
3520
3521 case Intrinsic::nvvm_tex_2d_v4f32_s32:
3522 return NVPTXISD::Tex2DFloatS32;
3523 case Intrinsic::nvvm_tex_2d_v4f32_f32:
3524 return NVPTXISD::Tex2DFloatFloat;
3525 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3526 return NVPTXISD::Tex2DFloatFloatLevel;
3527 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3528 return NVPTXISD::Tex2DFloatFloatGrad;
3529 case Intrinsic::nvvm_tex_2d_v4s32_s32:
3530 return NVPTXISD::Tex2DS32S32;
3531 case Intrinsic::nvvm_tex_2d_v4s32_f32:
3532 return NVPTXISD::Tex2DS32Float;
3533 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3534 return NVPTXISD::Tex2DS32FloatLevel;
3535 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3536 return NVPTXISD::Tex2DS32FloatGrad;
3537 case Intrinsic::nvvm_tex_2d_v4u32_s32:
3538 return NVPTXISD::Tex2DU32S32;
3539 case Intrinsic::nvvm_tex_2d_v4u32_f32:
3540 return NVPTXISD::Tex2DU32Float;
3541 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3542 return NVPTXISD::Tex2DU32FloatLevel;
3543 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3544 return NVPTXISD::Tex2DU32FloatGrad;
3545
3546 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3547 return NVPTXISD::Tex2DArrayFloatS32;
3548 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3549 return NVPTXISD::Tex2DArrayFloatFloat;
3550 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3551 return NVPTXISD::Tex2DArrayFloatFloatLevel;
3552 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3553 return NVPTXISD::Tex2DArrayFloatFloatGrad;
3554 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3555 return NVPTXISD::Tex2DArrayS32S32;
3556 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3557 return NVPTXISD::Tex2DArrayS32Float;
3558 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3559 return NVPTXISD::Tex2DArrayS32FloatLevel;
3560 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3561 return NVPTXISD::Tex2DArrayS32FloatGrad;
3562 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3563 return NVPTXISD::Tex2DArrayU32S32;
3564 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3565 return NVPTXISD::Tex2DArrayU32Float;
3566 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3567 return NVPTXISD::Tex2DArrayU32FloatLevel;
3568 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3569 return NVPTXISD::Tex2DArrayU32FloatGrad;
3570
3571 case Intrinsic::nvvm_tex_3d_v4f32_s32:
3572 return NVPTXISD::Tex3DFloatS32;
3573 case Intrinsic::nvvm_tex_3d_v4f32_f32:
3574 return NVPTXISD::Tex3DFloatFloat;
3575 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3576 return NVPTXISD::Tex3DFloatFloatLevel;
3577 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3578 return NVPTXISD::Tex3DFloatFloatGrad;
3579 case Intrinsic::nvvm_tex_3d_v4s32_s32:
3580 return NVPTXISD::Tex3DS32S32;
3581 case Intrinsic::nvvm_tex_3d_v4s32_f32:
3582 return NVPTXISD::Tex3DS32Float;
3583 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3584 return NVPTXISD::Tex3DS32FloatLevel;
3585 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3586 return NVPTXISD::Tex3DS32FloatGrad;
3587 case Intrinsic::nvvm_tex_3d_v4u32_s32:
3588 return NVPTXISD::Tex3DU32S32;
3589 case Intrinsic::nvvm_tex_3d_v4u32_f32:
3590 return NVPTXISD::Tex3DU32Float;
3591 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3592 return NVPTXISD::Tex3DU32FloatLevel;
3593 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3594 return NVPTXISD::Tex3DU32FloatGrad;
3595
3596 case Intrinsic::nvvm_tex_cube_v4f32_f32:
3597 return NVPTXISD::TexCubeFloatFloat;
3598 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3599 return NVPTXISD::TexCubeFloatFloatLevel;
3600 case Intrinsic::nvvm_tex_cube_v4s32_f32:
3601 return NVPTXISD::TexCubeS32Float;
3602 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3603 return NVPTXISD::TexCubeS32FloatLevel;
3604 case Intrinsic::nvvm_tex_cube_v4u32_f32:
3605 return NVPTXISD::TexCubeU32Float;
3606 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3607 return NVPTXISD::TexCubeU32FloatLevel;
3608
3609 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3610 return NVPTXISD::TexCubeArrayFloatFloat;
3611 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3612 return NVPTXISD::TexCubeArrayFloatFloatLevel;
3613 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3614 return NVPTXISD::TexCubeArrayS32Float;
3615 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3616 return NVPTXISD::TexCubeArrayS32FloatLevel;
3617 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3618 return NVPTXISD::TexCubeArrayU32Float;
3619 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3620 return NVPTXISD::TexCubeArrayU32FloatLevel;
3621
3622 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3623 return NVPTXISD::Tld4R2DFloatFloat;
3624 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3625 return NVPTXISD::Tld4G2DFloatFloat;
3626 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3627 return NVPTXISD::Tld4B2DFloatFloat;
3628 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3629 return NVPTXISD::Tld4A2DFloatFloat;
3630 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3631 return NVPTXISD::Tld4R2DS64Float;
3632 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3633 return NVPTXISD::Tld4G2DS64Float;
3634 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3635 return NVPTXISD::Tld4B2DS64Float;
3636 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3637 return NVPTXISD::Tld4A2DS64Float;
3638 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3639 return NVPTXISD::Tld4R2DU64Float;
3640 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3641 return NVPTXISD::Tld4G2DU64Float;
3642 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3643 return NVPTXISD::Tld4B2DU64Float;
3644 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3645 return NVPTXISD::Tld4A2DU64Float;
3646
3647 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3648 return NVPTXISD::TexUnified1DFloatS32;
3649 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3650 return NVPTXISD::TexUnified1DFloatFloat;
3651 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3652 return NVPTXISD::TexUnified1DFloatFloatLevel;
3653 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3654 return NVPTXISD::TexUnified1DFloatFloatGrad;
3655 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3656 return NVPTXISD::TexUnified1DS32S32;
3657 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3658 return NVPTXISD::TexUnified1DS32Float;
3659 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3660 return NVPTXISD::TexUnified1DS32FloatLevel;
3661 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3662 return NVPTXISD::TexUnified1DS32FloatGrad;
3663 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3664 return NVPTXISD::TexUnified1DU32S32;
3665 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3666 return NVPTXISD::TexUnified1DU32Float;
3667 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3668 return NVPTXISD::TexUnified1DU32FloatLevel;
3669 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3670 return NVPTXISD::TexUnified1DU32FloatGrad;
3671
3672 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3673 return NVPTXISD::TexUnified1DArrayFloatS32;
3674 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3675 return NVPTXISD::TexUnified1DArrayFloatFloat;
3676 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3677 return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
3678 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3679 return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
3680 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3681 return NVPTXISD::TexUnified1DArrayS32S32;
3682 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3683 return NVPTXISD::TexUnified1DArrayS32Float;
3684 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3685 return NVPTXISD::TexUnified1DArrayS32FloatLevel;
3686 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3687 return NVPTXISD::TexUnified1DArrayS32FloatGrad;
3688 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3689 return NVPTXISD::TexUnified1DArrayU32S32;
3690 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3691 return NVPTXISD::TexUnified1DArrayU32Float;
3692 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3693 return NVPTXISD::TexUnified1DArrayU32FloatLevel;
3694 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3695 return NVPTXISD::TexUnified1DArrayU32FloatGrad;
3696
3697 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3698 return NVPTXISD::TexUnified2DFloatS32;
3699 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3700 return NVPTXISD::TexUnified2DFloatFloat;
3701 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3702 return NVPTXISD::TexUnified2DFloatFloatLevel;
3703 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3704 return NVPTXISD::TexUnified2DFloatFloatGrad;
3705 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3706 return NVPTXISD::TexUnified2DS32S32;
3707 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3708 return NVPTXISD::TexUnified2DS32Float;
3709 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3710 return NVPTXISD::TexUnified2DS32FloatLevel;
3711 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3712 return NVPTXISD::TexUnified2DS32FloatGrad;
3713 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3714 return NVPTXISD::TexUnified2DU32S32;
3715 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3716 return NVPTXISD::TexUnified2DU32Float;
3717 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3718 return NVPTXISD::TexUnified2DU32FloatLevel;
3719 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3720 return NVPTXISD::TexUnified2DU32FloatGrad;
3721
3722 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3723 return NVPTXISD::TexUnified2DArrayFloatS32;
3724 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3725 return NVPTXISD::TexUnified2DArrayFloatFloat;
3726 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3727 return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3728 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3729 return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3730 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3731 return NVPTXISD::TexUnified2DArrayS32S32;
3732 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3733 return NVPTXISD::TexUnified2DArrayS32Float;
3734 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3735 return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3736 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3737 return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3738 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3739 return NVPTXISD::TexUnified2DArrayU32S32;
3740 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3741 return NVPTXISD::TexUnified2DArrayU32Float;
3742 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3743 return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3744 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3745 return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3746
3747 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3748 return NVPTXISD::TexUnified3DFloatS32;
3749 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3750 return NVPTXISD::TexUnified3DFloatFloat;
3751 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3752 return NVPTXISD::TexUnified3DFloatFloatLevel;
3753 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3754 return NVPTXISD::TexUnified3DFloatFloatGrad;
3755 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3756 return NVPTXISD::TexUnified3DS32S32;
3757 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3758 return NVPTXISD::TexUnified3DS32Float;
3759 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3760 return NVPTXISD::TexUnified3DS32FloatLevel;
3761 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3762 return NVPTXISD::TexUnified3DS32FloatGrad;
3763 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3764 return NVPTXISD::TexUnified3DU32S32;
3765 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3766 return NVPTXISD::TexUnified3DU32Float;
3767 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3768 return NVPTXISD::TexUnified3DU32FloatLevel;
3769 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3770 return NVPTXISD::TexUnified3DU32FloatGrad;
3771
3772 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3773 return NVPTXISD::TexUnifiedCubeFloatFloat;
3774 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3775 return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3776 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3777 return NVPTXISD::TexUnifiedCubeS32Float;
3778 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3779 return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3780 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3781 return NVPTXISD::TexUnifiedCubeU32Float;
3782 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3783 return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3784
3785 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3786 return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3787 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3788 return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3789 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3790 return NVPTXISD::TexUnifiedCubeArrayS32Float;
3791 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3792 return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3793 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3794 return NVPTXISD::TexUnifiedCubeArrayU32Float;
3795 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3796 return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3797
3798 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3799 return NVPTXISD::TexUnifiedCubeFloatFloatGrad;
3800 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
3801 return NVPTXISD::TexUnifiedCubeS32FloatGrad;
3802 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
3803 return NVPTXISD::TexUnifiedCubeU32FloatGrad;
3804 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3805 return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad;
3806 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
3807 return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad;
3808 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
3809 return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad;
3810
3811 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3812 return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3813 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3814 return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3815 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3816 return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3817 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3818 return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3819 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3820 return NVPTXISD::Tld4UnifiedR2DS64Float;
3821 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3822 return NVPTXISD::Tld4UnifiedG2DS64Float;
3823 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3824 return NVPTXISD::Tld4UnifiedB2DS64Float;
3825 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3826 return NVPTXISD::Tld4UnifiedA2DS64Float;
3827 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3828 return NVPTXISD::Tld4UnifiedR2DU64Float;
3829 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3830 return NVPTXISD::Tld4UnifiedG2DU64Float;
3831 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3832 return NVPTXISD::Tld4UnifiedB2DU64Float;
3833 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3834 return NVPTXISD::Tld4UnifiedA2DU64Float;
3835 }
3836}
3837
3838static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3839 switch (Intrinsic) {
3840 default:
3841 return 0;
3842 case Intrinsic::nvvm_suld_1d_i8_clamp:
3843 return NVPTXISD::Suld1DI8Clamp;
3844 case Intrinsic::nvvm_suld_1d_i16_clamp:
3845 return NVPTXISD::Suld1DI16Clamp;
3846 case Intrinsic::nvvm_suld_1d_i32_clamp:
3847 return NVPTXISD::Suld1DI32Clamp;
3848 case Intrinsic::nvvm_suld_1d_i64_clamp:
3849 return NVPTXISD::Suld1DI64Clamp;
3850 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3851 return NVPTXISD::Suld1DV2I8Clamp;
3852 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3853 return NVPTXISD::Suld1DV2I16Clamp;
3854 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3855 return NVPTXISD::Suld1DV2I32Clamp;
3856 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3857 return NVPTXISD::Suld1DV2I64Clamp;
3858 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3859 return NVPTXISD::Suld1DV4I8Clamp;
3860 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3861 return NVPTXISD::Suld1DV4I16Clamp;
3862 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3863 return NVPTXISD::Suld1DV4I32Clamp;
3864 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3865 return NVPTXISD::Suld1DArrayI8Clamp;
3866 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3867 return NVPTXISD::Suld1DArrayI16Clamp;
3868 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3869 return NVPTXISD::Suld1DArrayI32Clamp;
3870 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3871 return NVPTXISD::Suld1DArrayI64Clamp;
3872 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3873 return NVPTXISD::Suld1DArrayV2I8Clamp;
3874 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3875 return NVPTXISD::Suld1DArrayV2I16Clamp;
3876 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3877 return NVPTXISD::Suld1DArrayV2I32Clamp;
3878 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3879 return NVPTXISD::Suld1DArrayV2I64Clamp;
3880 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3881 return NVPTXISD::Suld1DArrayV4I8Clamp;
3882 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3883 return NVPTXISD::Suld1DArrayV4I16Clamp;
3884 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3885 return NVPTXISD::Suld1DArrayV4I32Clamp;
3886 case Intrinsic::nvvm_suld_2d_i8_clamp:
3887 return NVPTXISD::Suld2DI8Clamp;
3888 case Intrinsic::nvvm_suld_2d_i16_clamp:
3889 return NVPTXISD::Suld2DI16Clamp;
3890 case Intrinsic::nvvm_suld_2d_i32_clamp:
3891 return NVPTXISD::Suld2DI32Clamp;
3892 case Intrinsic::nvvm_suld_2d_i64_clamp:
3893 return NVPTXISD::Suld2DI64Clamp;
3894 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3895 return NVPTXISD::Suld2DV2I8Clamp;
3896 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3897 return NVPTXISD::Suld2DV2I16Clamp;
3898 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3899 return NVPTXISD::Suld2DV2I32Clamp;
3900 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3901 return NVPTXISD::Suld2DV2I64Clamp;
3902 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3903 return NVPTXISD::Suld2DV4I8Clamp;
3904 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3905 return NVPTXISD::Suld2DV4I16Clamp;
3906 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3907 return NVPTXISD::Suld2DV4I32Clamp;
3908 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3909 return NVPTXISD::Suld2DArrayI8Clamp;
3910 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3911 return NVPTXISD::Suld2DArrayI16Clamp;
3912 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3913 return NVPTXISD::Suld2DArrayI32Clamp;
3914 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3915 return NVPTXISD::Suld2DArrayI64Clamp;
3916 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3917 return NVPTXISD::Suld2DArrayV2I8Clamp;
3918 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3919 return NVPTXISD::Suld2DArrayV2I16Clamp;
3920 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3921 return NVPTXISD::Suld2DArrayV2I32Clamp;
3922 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3923 return NVPTXISD::Suld2DArrayV2I64Clamp;
3924 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3925 return NVPTXISD::Suld2DArrayV4I8Clamp;
3926 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3927 return NVPTXISD::Suld2DArrayV4I16Clamp;
3928 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3929 return NVPTXISD::Suld2DArrayV4I32Clamp;
3930 case Intrinsic::nvvm_suld_3d_i8_clamp:
3931 return NVPTXISD::Suld3DI8Clamp;
3932 case Intrinsic::nvvm_suld_3d_i16_clamp:
3933 return NVPTXISD::Suld3DI16Clamp;
3934 case Intrinsic::nvvm_suld_3d_i32_clamp:
3935 return NVPTXISD::Suld3DI32Clamp;
3936 case Intrinsic::nvvm_suld_3d_i64_clamp:
3937 return NVPTXISD::Suld3DI64Clamp;
3938 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3939 return NVPTXISD::Suld3DV2I8Clamp;
3940 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3941 return NVPTXISD::Suld3DV2I16Clamp;
3942 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3943 return NVPTXISD::Suld3DV2I32Clamp;
3944 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3945 return NVPTXISD::Suld3DV2I64Clamp;
3946 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3947 return NVPTXISD::Suld3DV4I8Clamp;
3948 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3949 return NVPTXISD::Suld3DV4I16Clamp;
3950 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3951 return NVPTXISD::Suld3DV4I32Clamp;
3952 case Intrinsic::nvvm_suld_1d_i8_trap:
3953 return NVPTXISD::Suld1DI8Trap;
3954 case Intrinsic::nvvm_suld_1d_i16_trap:
3955 return NVPTXISD::Suld1DI16Trap;
3956 case Intrinsic::nvvm_suld_1d_i32_trap:
3957 return NVPTXISD::Suld1DI32Trap;
3958 case Intrinsic::nvvm_suld_1d_i64_trap:
3959 return NVPTXISD::Suld1DI64Trap;
3960 case Intrinsic::nvvm_suld_1d_v2i8_trap:
3961 return NVPTXISD::Suld1DV2I8Trap;
3962 case Intrinsic::nvvm_suld_1d_v2i16_trap:
3963 return NVPTXISD::Suld1DV2I16Trap;
3964 case Intrinsic::nvvm_suld_1d_v2i32_trap:
3965 return NVPTXISD::Suld1DV2I32Trap;
3966 case Intrinsic::nvvm_suld_1d_v2i64_trap:
3967 return NVPTXISD::Suld1DV2I64Trap;
3968 case Intrinsic::nvvm_suld_1d_v4i8_trap:
3969 return NVPTXISD::Suld1DV4I8Trap;
3970 case Intrinsic::nvvm_suld_1d_v4i16_trap:
3971 return NVPTXISD::Suld1DV4I16Trap;
3972 case Intrinsic::nvvm_suld_1d_v4i32_trap:
3973 return NVPTXISD::Suld1DV4I32Trap;
3974 case Intrinsic::nvvm_suld_1d_array_i8_trap:
3975 return NVPTXISD::Suld1DArrayI8Trap;
3976 case Intrinsic::nvvm_suld_1d_array_i16_trap:
3977 return NVPTXISD::Suld1DArrayI16Trap;
3978 case Intrinsic::nvvm_suld_1d_array_i32_trap:
3979 return NVPTXISD::Suld1DArrayI32Trap;
3980 case Intrinsic::nvvm_suld_1d_array_i64_trap:
3981 return NVPTXISD::Suld1DArrayI64Trap;
3982 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3983 return NVPTXISD::Suld1DArrayV2I8Trap;
3984 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3985 return NVPTXISD::Suld1DArrayV2I16Trap;
3986 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3987 return NVPTXISD::Suld1DArrayV2I32Trap;
3988 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3989 return NVPTXISD::Suld1DArrayV2I64Trap;
3990 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3991 return NVPTXISD::Suld1DArrayV4I8Trap;
3992 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3993 return NVPTXISD::Suld1DArrayV4I16Trap;
3994 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3995 return NVPTXISD::Suld1DArrayV4I32Trap;
3996 case Intrinsic::nvvm_suld_2d_i8_trap:
3997 return NVPTXISD::Suld2DI8Trap;
3998 case Intrinsic::nvvm_suld_2d_i16_trap:
3999 return NVPTXISD::Suld2DI16Trap;
4000 case Intrinsic::nvvm_suld_2d_i32_trap:
4001 return NVPTXISD::Suld2DI32Trap;
4002 case Intrinsic::nvvm_suld_2d_i64_trap:
4003 return NVPTXISD::Suld2DI64Trap;
4004 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4005 return NVPTXISD::Suld2DV2I8Trap;
4006 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4007 return NVPTXISD::Suld2DV2I16Trap;
4008 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4009 return NVPTXISD::Suld2DV2I32Trap;
4010 case Intrinsic::nvvm_suld_2d_v2i64_trap:
4011 return NVPTXISD::Suld2DV2I64Trap;
4012 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4013 return NVPTXISD::Suld2DV4I8Trap;
4014 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4015 return NVPTXISD::Suld2DV4I16Trap;
4016 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4017 return NVPTXISD::Suld2DV4I32Trap;
4018 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4019 return NVPTXISD::Suld2DArrayI8Trap;
4020 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4021 return NVPTXISD::Suld2DArrayI16Trap;
4022 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4023 return NVPTXISD::Suld2DArrayI32Trap;
4024 case Intrinsic::nvvm_suld_2d_array_i64_trap:
4025 return NVPTXISD::Suld2DArrayI64Trap;
4026 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4027 return NVPTXISD::Suld2DArrayV2I8Trap;
4028 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4029 return NVPTXISD::Suld2DArrayV2I16Trap;
4030 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4031 return NVPTXISD::Suld2DArrayV2I32Trap;
4032 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4033 return NVPTXISD::Suld2DArrayV2I64Trap;
4034 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4035 return NVPTXISD::Suld2DArrayV4I8Trap;
4036 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4037 return NVPTXISD::Suld2DArrayV4I16Trap;
4038 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4039 return NVPTXISD::Suld2DArrayV4I32Trap;
4040 case Intrinsic::nvvm_suld_3d_i8_trap:
4041 return NVPTXISD::Suld3DI8Trap;
4042 case Intrinsic::nvvm_suld_3d_i16_trap:
4043 return NVPTXISD::Suld3DI16Trap;
4044 case Intrinsic::nvvm_suld_3d_i32_trap:
4045 return NVPTXISD::Suld3DI32Trap;
4046 case Intrinsic::nvvm_suld_3d_i64_trap:
4047 return NVPTXISD::Suld3DI64Trap;
4048 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4049 return NVPTXISD::Suld3DV2I8Trap;
4050 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4051 return NVPTXISD::Suld3DV2I16Trap;
4052 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4053 return NVPTXISD::Suld3DV2I32Trap;
4054 case Intrinsic::nvvm_suld_3d_v2i64_trap:
4055 return NVPTXISD::Suld3DV2I64Trap;
4056 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4057 return NVPTXISD::Suld3DV4I8Trap;
4058 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4059 return NVPTXISD::Suld3DV4I16Trap;
4060 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4061 return NVPTXISD::Suld3DV4I32Trap;
4062 case Intrinsic::nvvm_suld_1d_i8_zero:
4063 return NVPTXISD::Suld1DI8Zero;
4064 case Intrinsic::nvvm_suld_1d_i16_zero:
4065 return NVPTXISD::Suld1DI16Zero;
4066 case Intrinsic::nvvm_suld_1d_i32_zero:
4067 return NVPTXISD::Suld1DI32Zero;
4068 case Intrinsic::nvvm_suld_1d_i64_zero:
4069 return NVPTXISD::Suld1DI64Zero;
4070 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4071 return NVPTXISD::Suld1DV2I8Zero;
4072 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4073 return NVPTXISD::Suld1DV2I16Zero;
4074 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4075 return NVPTXISD::Suld1DV2I32Zero;
4076 case Intrinsic::nvvm_suld_1d_v2i64_zero:
4077 return NVPTXISD::Suld1DV2I64Zero;
4078 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4079 return NVPTXISD::Suld1DV4I8Zero;
4080 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4081 return NVPTXISD::Suld1DV4I16Zero;
4082 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4083 return NVPTXISD::Suld1DV4I32Zero;
4084 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4085 return NVPTXISD::Suld1DArrayI8Zero;
4086 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4087 return NVPTXISD::Suld1DArrayI16Zero;
4088 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4089 return NVPTXISD::Suld1DArrayI32Zero;
4090 case Intrinsic::nvvm_suld_1d_array_i64_zero:
4091 return NVPTXISD::Suld1DArrayI64Zero;
4092 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4093 return NVPTXISD::Suld1DArrayV2I8Zero;
4094 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4095 return NVPTXISD::Suld1DArrayV2I16Zero;
4096 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4097 return NVPTXISD::Suld1DArrayV2I32Zero;
4098 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4099 return NVPTXISD::Suld1DArrayV2I64Zero;
4100 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4101 return NVPTXISD::Suld1DArrayV4I8Zero;
4102 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4103 return NVPTXISD::Suld1DArrayV4I16Zero;
4104 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4105 return NVPTXISD::Suld1DArrayV4I32Zero;
4106 case Intrinsic::nvvm_suld_2d_i8_zero:
4107 return NVPTXISD::Suld2DI8Zero;
4108 case Intrinsic::nvvm_suld_2d_i16_zero:
4109 return NVPTXISD::Suld2DI16Zero;
4110 case Intrinsic::nvvm_suld_2d_i32_zero:
4111 return NVPTXISD::Suld2DI32Zero;
4112 case Intrinsic::nvvm_suld_2d_i64_zero:
4113 return NVPTXISD::Suld2DI64Zero;
4114 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4115 return NVPTXISD::Suld2DV2I8Zero;
4116 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4117 return NVPTXISD::Suld2DV2I16Zero;
4118 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4119 return NVPTXISD::Suld2DV2I32Zero;
4120 case Intrinsic::nvvm_suld_2d_v2i64_zero:
4121 return NVPTXISD::Suld2DV2I64Zero;
4122 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4123 return NVPTXISD::Suld2DV4I8Zero;
4124 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4125 return NVPTXISD::Suld2DV4I16Zero;
4126 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4127 return NVPTXISD::Suld2DV4I32Zero;
4128 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4129 return NVPTXISD::Suld2DArrayI8Zero;
4130 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4131 return NVPTXISD::Suld2DArrayI16Zero;
4132 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4133 return NVPTXISD::Suld2DArrayI32Zero;
4134 case Intrinsic::nvvm_suld_2d_array_i64_zero:
4135 return NVPTXISD::Suld2DArrayI64Zero;
4136 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4137 return NVPTXISD::Suld2DArrayV2I8Zero;
4138 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4139 return NVPTXISD::Suld2DArrayV2I16Zero;
4140 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4141 return NVPTXISD::Suld2DArrayV2I32Zero;
4142 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4143 return NVPTXISD::Suld2DArrayV2I64Zero;
4144 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4145 return NVPTXISD::Suld2DArrayV4I8Zero;
4146 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4147 return NVPTXISD::Suld2DArrayV4I16Zero;
4148 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4149 return NVPTXISD::Suld2DArrayV4I32Zero;
4150 case Intrinsic::nvvm_suld_3d_i8_zero:
4151 return NVPTXISD::Suld3DI8Zero;
4152 case Intrinsic::nvvm_suld_3d_i16_zero:
4153 return NVPTXISD::Suld3DI16Zero;
4154 case Intrinsic::nvvm_suld_3d_i32_zero:
4155 return NVPTXISD::Suld3DI32Zero;
4156 case Intrinsic::nvvm_suld_3d_i64_zero:
4157 return NVPTXISD::Suld3DI64Zero;
4158 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4159 return NVPTXISD::Suld3DV2I8Zero;
4160 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4161 return NVPTXISD::Suld3DV2I16Zero;
4162 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4163 return NVPTXISD::Suld3DV2I32Zero;
4164 case Intrinsic::nvvm_suld_3d_v2i64_zero:
4165 return NVPTXISD::Suld3DV2I64Zero;
4166 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4167 return NVPTXISD::Suld3DV4I8Zero;
4168 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4169 return NVPTXISD::Suld3DV4I16Zero;
4170 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4171 return NVPTXISD::Suld3DV4I32Zero;
4172 }
4173}
4174
4175// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled
4176// as TgtMemIntrinsic because we need information that is only available
4177// in the "Value" type of the destination pointer.
4178// In particular, we need the address space information of the
4179// destination pointer.
4180bool NVPTXTargetLowering::getTgtMemIntrinsic(
4181 IntrinsicInfo &Info, const CallInst &I,
4182 MachineFunction &MF, unsigned Intrinsic) const {
4183 switch (Intrinsic) {
4184 default:
4185 return false;
4186 case Intrinsic::nvvm_match_all_sync_i32p:
4187 case Intrinsic::nvvm_match_all_sync_i64p:
4188 Info.opc = ISD::INTRINSIC_W_CHAIN;
4189 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4190 // in order to model data exchange with other threads, but perform no real
4191 // memory accesses.
4192 Info.memVT = MVT::i1;
4193
4194    // Our result depends on both our own and the other threads' arguments.
4195 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4196 return true;
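  // WMMA loads of f16 A/B fragments: each thread holds eight f16 elements
  // (16 bytes), so these are modeled as a 16-byte-aligned v8f16 load.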
4197 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4198 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4199 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4200 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4201 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4202 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4203 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4204 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4205 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4206 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4207 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4208 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4209 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4210 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4211 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4212 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4213 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4214 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4215 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4216 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4217 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4218 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4219 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4220 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4221 Info.opc = ISD::INTRINSIC_W_CHAIN;
4222 Info.memVT = MVT::v8f16;
4223 Info.ptrVal = I.getArgOperand(i: 0);
4224 Info.offset = 0;
4225 Info.flags = MachineMemOperand::MOLoad;
4226 Info.align = Align(16);
4227 return true;
4228 }
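  // WMMA fragments that occupy two 32-bit registers per thread (8 bytes),
  // modeled as an 8-byte-aligned v2i32 load.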
4229 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4230 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4231 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4232 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4233 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4234 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4235 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4236 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4237 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4238 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4239 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4240 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4241 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4242 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4243 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4244 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4245 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4246 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4247 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4248 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4249 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4250 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4251 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4252 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4253 Info.opc = ISD::INTRINSIC_W_CHAIN;
4254 Info.memVT = MVT::v2i32;
4255 Info.ptrVal = I.getArgOperand(i: 0);
4256 Info.offset = 0;
4257 Info.flags = MachineMemOperand::MOLoad;
4258 Info.align = Align(8);
4259 return true;
4260 }
4261
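  // WMMA/ldmatrix fragments that occupy four 32-bit registers per thread
  // (16 bytes), modeled as a 16-byte-aligned v4i32 load.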
4262 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4263 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4264 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4265 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4266 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4267 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4268 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4269 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4270 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4271 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4272 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4273 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4274 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4275 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4276 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4277 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4278
4279 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4280 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4281 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4282 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4283 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4284 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4285 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4286 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4287 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4288 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4289 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4290 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4291 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4292 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4293 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4294 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4295 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4296 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
4297 Info.opc = ISD::INTRINSIC_W_CHAIN;
4298 Info.memVT = MVT::v4i32;
4299 Info.ptrVal = I.getArgOperand(i: 0);
4300 Info.offset = 0;
4301 Info.flags = MachineMemOperand::MOLoad;
4302 Info.align = Align(16);
4303 return true;
4304 }
4305
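  // WMMA/ldmatrix fragments that fit in a single 32-bit register per thread,
  // modeled as a 4-byte-aligned i32 load.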
4306 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4307 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4308 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4309 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4310 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4311 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4312 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4313 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4314
4315 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4316 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4317 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4318 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4319 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4320 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4321 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4322 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4323 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4324 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4325 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4326 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4327 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4328 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4329 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4330 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4331 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4332 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4333 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4334 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4335 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4336 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
4337 Info.opc = ISD::INTRINSIC_W_CHAIN;
4338 Info.memVT = MVT::i32;
4339 Info.ptrVal = I.getArgOperand(i: 0);
4340 Info.offset = 0;
4341 Info.flags = MachineMemOperand::MOLoad;
4342 Info.align = Align(4);
4343 return true;
4344 }
4345
4346 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4347 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4348 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4349 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4350 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4351 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4352 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4353 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4354 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4355 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4356 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4357 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4358 Info.opc = ISD::INTRINSIC_W_CHAIN;
4359 Info.memVT = MVT::v4f16;
4360 Info.ptrVal = I.getArgOperand(i: 0);
4361 Info.offset = 0;
4362 Info.flags = MachineMemOperand::MOLoad;
4363 Info.align = Align(16);
4364 return true;
4365 }
4366
4367 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4368 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4369 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4370 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4371 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4372 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4373 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4374 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4375 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4376 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4377 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4378 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4379 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4380 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4381 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4382 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4383 Info.opc = ISD::INTRINSIC_W_CHAIN;
4384 Info.memVT = MVT::v8f32;
4385 Info.ptrVal = I.getArgOperand(i: 0);
4386 Info.offset = 0;
4387 Info.flags = MachineMemOperand::MOLoad;
4388 Info.align = Align(16);
4389 return true;
4390 }
4391
4392 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4393 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4394 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4395 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4396
4397 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4398 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4399 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4400 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4401
4402 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4403 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4404 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4405 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4406 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4407 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4408 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4409 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4410 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4411 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4412 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4413 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4414 Info.opc = ISD::INTRINSIC_W_CHAIN;
4415 Info.memVT = MVT::v8i32;
4416 Info.ptrVal = I.getArgOperand(i: 0);
4417 Info.offset = 0;
4418 Info.flags = MachineMemOperand::MOLoad;
4419 Info.align = Align(16);
4420 return true;
4421 }
4422
4423 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4424 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4425 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4426 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4427 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4428 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4429 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4430 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4431 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4432 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
4433 Info.opc = ISD::INTRINSIC_W_CHAIN;
4434 Info.memVT = MVT::v2i32;
4435 Info.ptrVal = I.getArgOperand(i: 0);
4436 Info.offset = 0;
4437 Info.flags = MachineMemOperand::MOLoad;
4438 Info.align = Align(8);
4439 return true;
4440 }
4441
4442 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4443 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4444 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4445 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4446
4447 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4448 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4449 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4450 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4451 Info.opc = ISD::INTRINSIC_W_CHAIN;
4452 Info.memVT = MVT::f64;
4453 Info.ptrVal = I.getArgOperand(i: 0);
4454 Info.offset = 0;
4455 Info.flags = MachineMemOperand::MOLoad;
4456 Info.align = Align(8);
4457 return true;
4458 }
4459
4460 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4461 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4462 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4463 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4464 Info.opc = ISD::INTRINSIC_W_CHAIN;
4465 Info.memVT = MVT::v2f64;
4466 Info.ptrVal = I.getArgOperand(i: 0);
4467 Info.offset = 0;
4468 Info.flags = MachineMemOperand::MOLoad;
4469 Info.align = Align(16);
4470 return true;
4471 }
4472
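  // WMMA store.d variants mirror the corresponding load_c fragment layouts,
  // but are modeled as INTRINSIC_VOID nodes with an MOStore memory operand.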
4473 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4474 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4475 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4476 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4477 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4478 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4479 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4480 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4481 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4482 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4483 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4484 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4485 Info.opc = ISD::INTRINSIC_VOID;
4486 Info.memVT = MVT::v4f16;
4487 Info.ptrVal = I.getArgOperand(i: 0);
4488 Info.offset = 0;
4489 Info.flags = MachineMemOperand::MOStore;
4490 Info.align = Align(16);
4491 return true;
4492 }
4493
4494 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4495 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4496 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4497 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4498 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4499 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4500 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4501 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4502 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4503 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4504 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4505 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4506 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4507 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4508 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4509 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4510 Info.opc = ISD::INTRINSIC_VOID;
4511 Info.memVT = MVT::v8f32;
4512 Info.ptrVal = I.getArgOperand(i: 0);
4513 Info.offset = 0;
4514 Info.flags = MachineMemOperand::MOStore;
4515 Info.align = Align(16);
4516 return true;
4517 }
4518
4519 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4520 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4521 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4522 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4523 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4524 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4525 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4526 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4527 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4528 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4529 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4530 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4531 Info.opc = ISD::INTRINSIC_VOID;
4532 Info.memVT = MVT::v8i32;
4533 Info.ptrVal = I.getArgOperand(i: 0);
4534 Info.offset = 0;
4535 Info.flags = MachineMemOperand::MOStore;
4536 Info.align = Align(16);
4537 return true;
4538 }
4539
4540 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4541 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4542 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4543 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4544 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4545 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4546 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4547 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
4548 Info.opc = ISD::INTRINSIC_VOID;
4549 Info.memVT = MVT::v2i32;
4550 Info.ptrVal = I.getArgOperand(i: 0);
4551 Info.offset = 0;
4552 Info.flags = MachineMemOperand::MOStore;
4553 Info.align = Align(8);
4554 return true;
4555 }
4556
4557 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4558 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4559 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4560 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4561 Info.opc = ISD::INTRINSIC_VOID;
4562 Info.memVT = MVT::v2f64;
4563 Info.ptrVal = I.getArgOperand(i: 0);
4564 Info.offset = 0;
4565 Info.flags = MachineMemOperand::MOStore;
4566 Info.align = Align(16);
4567 return true;
4568 }
4569
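  // Atomics on generic address-space pointers: modeled as a combined
  // load+store of the intrinsic's result type, with no alignment recorded.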
4570 case Intrinsic::nvvm_atomic_load_inc_32:
4571 case Intrinsic::nvvm_atomic_load_dec_32:
4572
4573 case Intrinsic::nvvm_atomic_add_gen_f_cta:
4574 case Intrinsic::nvvm_atomic_add_gen_f_sys:
4575 case Intrinsic::nvvm_atomic_add_gen_i_cta:
4576 case Intrinsic::nvvm_atomic_add_gen_i_sys:
4577 case Intrinsic::nvvm_atomic_and_gen_i_cta:
4578 case Intrinsic::nvvm_atomic_and_gen_i_sys:
4579 case Intrinsic::nvvm_atomic_cas_gen_i_cta:
4580 case Intrinsic::nvvm_atomic_cas_gen_i_sys:
4581 case Intrinsic::nvvm_atomic_dec_gen_i_cta:
4582 case Intrinsic::nvvm_atomic_dec_gen_i_sys:
4583 case Intrinsic::nvvm_atomic_inc_gen_i_cta:
4584 case Intrinsic::nvvm_atomic_inc_gen_i_sys:
4585 case Intrinsic::nvvm_atomic_max_gen_i_cta:
4586 case Intrinsic::nvvm_atomic_max_gen_i_sys:
4587 case Intrinsic::nvvm_atomic_min_gen_i_cta:
4588 case Intrinsic::nvvm_atomic_min_gen_i_sys:
4589 case Intrinsic::nvvm_atomic_or_gen_i_cta:
4590 case Intrinsic::nvvm_atomic_or_gen_i_sys:
4591 case Intrinsic::nvvm_atomic_exch_gen_i_cta:
4592 case Intrinsic::nvvm_atomic_exch_gen_i_sys:
4593 case Intrinsic::nvvm_atomic_xor_gen_i_cta:
4594 case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
4595 auto &DL = I.getModule()->getDataLayout();
4596 Info.opc = ISD::INTRINSIC_W_CHAIN;
4597 Info.memVT = getValueType(DL, Ty: I.getType());
4598 Info.ptrVal = I.getArgOperand(i: 0);
4599 Info.offset = 0;
4600 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
4601 Info.align.reset();
4602 return true;
4603 }
4604
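  // ldu/ldg: the loaded type is the intrinsic's result type (or the target
  // pointer type for the _p variants), and the alignment is taken from the
  // constant second argument.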
4605 case Intrinsic::nvvm_ldu_global_i:
4606 case Intrinsic::nvvm_ldu_global_f:
4607 case Intrinsic::nvvm_ldu_global_p: {
4608 auto &DL = I.getModule()->getDataLayout();
4609 Info.opc = ISD::INTRINSIC_W_CHAIN;
4610 if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
4611 Info.memVT = getValueType(DL, Ty: I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
4613 Info.memVT = getPointerTy(DL);
4614 else
4615 Info.memVT = getValueType(DL, Ty: I.getType());
4616 Info.ptrVal = I.getArgOperand(i: 0);
4617 Info.offset = 0;
4618 Info.flags = MachineMemOperand::MOLoad;
4619 Info.align = cast<ConstantInt>(Val: I.getArgOperand(i: 1))->getMaybeAlignValue();
4620
4621 return true;
4622 }
4623 case Intrinsic::nvvm_ldg_global_i:
4624 case Intrinsic::nvvm_ldg_global_f:
4625 case Intrinsic::nvvm_ldg_global_p: {
4626 auto &DL = I.getModule()->getDataLayout();
4627
4628 Info.opc = ISD::INTRINSIC_W_CHAIN;
4629 if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
4630 Info.memVT = getValueType(DL, Ty: I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
4632 Info.memVT = getPointerTy(DL);
4633 else
4634 Info.memVT = getValueType(DL, Ty: I.getType());
4635 Info.ptrVal = I.getArgOperand(i: 0);
4636 Info.offset = 0;
4637 Info.flags = MachineMemOperand::MOLoad;
4638 Info.align = cast<ConstantInt>(Val: I.getArgOperand(i: 1))->getMaybeAlignValue();
4639
4640 return true;
4641 }
4642
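  // Texture and tld4 reads returning v4f32: modeled as a 16-byte load with no
  // meaningful pointer operand.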
4643 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4644 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4645 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4646 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4647 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4648 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4649 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4650 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4651 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4652 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4653 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4654 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4655 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4656 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4657 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4658 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4659 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4660 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4661 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4662 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4663 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4664 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4665 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4666 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4667 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4668 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4669 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4670 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4671 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4672 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4673 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4674 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4675 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4676 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4677 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4678 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4679 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4680 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4681 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4682 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4683 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4684 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4685 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4686 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4687 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4688 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4689 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4690 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4691 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4692 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4693 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4694 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4695 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4696 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4697 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4698 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4699 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4700 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4701 Info.opc = getOpcForTextureInstr(Intrinsic);
4702 Info.memVT = MVT::v4f32;
4703 Info.ptrVal = nullptr;
4704 Info.offset = 0;
4705 Info.flags = MachineMemOperand::MOLoad;
4706 Info.align = Align(16);
4707 return true;
4708
4709 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4710 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4711 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4712 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4713 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4714 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4715 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4716 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4717 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4718 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4719 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4720 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4721 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4722 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4723 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4724 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4725 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4726 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4727 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4728 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4729 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4730 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4731 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4732 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4733 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4734 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4735 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4736 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4737 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4738 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4739 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4740 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4741 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4742 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4743 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4744 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4745 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4746 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4747 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4748 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4749 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4750 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4751 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4752 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4753 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4754 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4755 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4756 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4757 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4758 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4759 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4760 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4761 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4762 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4763 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4764 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4765 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4766 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4767 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4768 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4769 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4770 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4771 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4772 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4773 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4774 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4775 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4776 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4777 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4778 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4779 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4780 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4781 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4782 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4783 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4784 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4785 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4786 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4787 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4788 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4789 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4790 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4791 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4792 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4793 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4794 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4795 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4796 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4797 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4798 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4799 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4800 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4801 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4802 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4803 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4804 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4805 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4806 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4807 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4808 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4809 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4810 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4811 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4812 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4813 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4814 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4815 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4816 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4817 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4818 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4819 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4820 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4821 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4822 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4823 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4824 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4825 Info.opc = getOpcForTextureInstr(Intrinsic);
4826 Info.memVT = MVT::v4i32;
4827 Info.ptrVal = nullptr;
4828 Info.offset = 0;
4829 Info.flags = MachineMemOperand::MOLoad;
4830 Info.align = Align(16);
4831 return true;
4832
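  // Surface loads (suld). Only the element width matters for the memory VT,
  // so the v2/v4 variants share the scalar element type below.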
4833 case Intrinsic::nvvm_suld_1d_i8_clamp:
4834 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4835 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4836 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4837 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4838 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4839 case Intrinsic::nvvm_suld_2d_i8_clamp:
4840 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4841 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4842 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4843 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4844 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4845 case Intrinsic::nvvm_suld_3d_i8_clamp:
4846 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4847 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4848 case Intrinsic::nvvm_suld_1d_i8_trap:
4849 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4850 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4851 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4852 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4853 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4854 case Intrinsic::nvvm_suld_2d_i8_trap:
4855 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4856 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4857 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4858 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4859 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4860 case Intrinsic::nvvm_suld_3d_i8_trap:
4861 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4862 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4863 case Intrinsic::nvvm_suld_1d_i8_zero:
4864 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4865 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4866 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4867 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4868 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4869 case Intrinsic::nvvm_suld_2d_i8_zero:
4870 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4871 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4872 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4873 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4874 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4875 case Intrinsic::nvvm_suld_3d_i8_zero:
4876 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4877 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4878 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4879 Info.memVT = MVT::i8;
4880 Info.ptrVal = nullptr;
4881 Info.offset = 0;
4882 Info.flags = MachineMemOperand::MOLoad;
4883 Info.align = Align(16);
4884 return true;
4885
4886 case Intrinsic::nvvm_suld_1d_i16_clamp:
4887 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4888 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4889 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4890 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4891 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4892 case Intrinsic::nvvm_suld_2d_i16_clamp:
4893 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4894 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4895 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4896 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4897 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4898 case Intrinsic::nvvm_suld_3d_i16_clamp:
4899 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4900 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4901 case Intrinsic::nvvm_suld_1d_i16_trap:
4902 case Intrinsic::nvvm_suld_1d_v2i16_trap:
4903 case Intrinsic::nvvm_suld_1d_v4i16_trap:
4904 case Intrinsic::nvvm_suld_1d_array_i16_trap:
4905 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4906 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4907 case Intrinsic::nvvm_suld_2d_i16_trap:
4908 case Intrinsic::nvvm_suld_2d_v2i16_trap:
4909 case Intrinsic::nvvm_suld_2d_v4i16_trap:
4910 case Intrinsic::nvvm_suld_2d_array_i16_trap:
4911 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4912 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4913 case Intrinsic::nvvm_suld_3d_i16_trap:
4914 case Intrinsic::nvvm_suld_3d_v2i16_trap:
4915 case Intrinsic::nvvm_suld_3d_v4i16_trap:
4916 case Intrinsic::nvvm_suld_1d_i16_zero:
4917 case Intrinsic::nvvm_suld_1d_v2i16_zero:
4918 case Intrinsic::nvvm_suld_1d_v4i16_zero:
4919 case Intrinsic::nvvm_suld_1d_array_i16_zero:
4920 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4921 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4922 case Intrinsic::nvvm_suld_2d_i16_zero:
4923 case Intrinsic::nvvm_suld_2d_v2i16_zero:
4924 case Intrinsic::nvvm_suld_2d_v4i16_zero:
4925 case Intrinsic::nvvm_suld_2d_array_i16_zero:
4926 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4927 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4928 case Intrinsic::nvvm_suld_3d_i16_zero:
4929 case Intrinsic::nvvm_suld_3d_v2i16_zero:
4930 case Intrinsic::nvvm_suld_3d_v4i16_zero:
4931 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4932 Info.memVT = MVT::i16;
4933 Info.ptrVal = nullptr;
4934 Info.offset = 0;
4935 Info.flags = MachineMemOperand::MOLoad;
4936 Info.align = Align(16);
4937 return true;
4938
4939 case Intrinsic::nvvm_suld_1d_i32_clamp:
4940 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4941 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4942 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4943 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4944 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4945 case Intrinsic::nvvm_suld_2d_i32_clamp:
4946 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4947 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4948 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4949 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4950 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4951 case Intrinsic::nvvm_suld_3d_i32_clamp:
4952 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4953 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4954 case Intrinsic::nvvm_suld_1d_i32_trap:
4955 case Intrinsic::nvvm_suld_1d_v2i32_trap:
4956 case Intrinsic::nvvm_suld_1d_v4i32_trap:
4957 case Intrinsic::nvvm_suld_1d_array_i32_trap:
4958 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4959 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4960 case Intrinsic::nvvm_suld_2d_i32_trap:
4961 case Intrinsic::nvvm_suld_2d_v2i32_trap:
4962 case Intrinsic::nvvm_suld_2d_v4i32_trap:
4963 case Intrinsic::nvvm_suld_2d_array_i32_trap:
4964 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4965 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4966 case Intrinsic::nvvm_suld_3d_i32_trap:
4967 case Intrinsic::nvvm_suld_3d_v2i32_trap:
4968 case Intrinsic::nvvm_suld_3d_v4i32_trap:
4969 case Intrinsic::nvvm_suld_1d_i32_zero:
4970 case Intrinsic::nvvm_suld_1d_v2i32_zero:
4971 case Intrinsic::nvvm_suld_1d_v4i32_zero:
4972 case Intrinsic::nvvm_suld_1d_array_i32_zero:
4973 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4974 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4975 case Intrinsic::nvvm_suld_2d_i32_zero:
4976 case Intrinsic::nvvm_suld_2d_v2i32_zero:
4977 case Intrinsic::nvvm_suld_2d_v4i32_zero:
4978 case Intrinsic::nvvm_suld_2d_array_i32_zero:
4979 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4980 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4981 case Intrinsic::nvvm_suld_3d_i32_zero:
4982 case Intrinsic::nvvm_suld_3d_v2i32_zero:
4983 case Intrinsic::nvvm_suld_3d_v4i32_zero:
4984 Info.opc = getOpcForSurfaceInstr(Intrinsic);
4985 Info.memVT = MVT::i32;
4986 Info.ptrVal = nullptr;
4987 Info.offset = 0;
4988 Info.flags = MachineMemOperand::MOLoad;
4989 Info.align = Align(16);
4990 return true;
4991
4992 case Intrinsic::nvvm_suld_1d_i64_clamp:
4993 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4994 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4995 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4996 case Intrinsic::nvvm_suld_2d_i64_clamp:
4997 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4998 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4999 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5000 case Intrinsic::nvvm_suld_3d_i64_clamp:
5001 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5002 case Intrinsic::nvvm_suld_1d_i64_trap:
5003 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5004 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5005 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5006 case Intrinsic::nvvm_suld_2d_i64_trap:
5007 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5008 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5009 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5010 case Intrinsic::nvvm_suld_3d_i64_trap:
5011 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5012 case Intrinsic::nvvm_suld_1d_i64_zero:
5013 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5014 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5015 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5016 case Intrinsic::nvvm_suld_2d_i64_zero:
5017 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5018 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5019 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5020 case Intrinsic::nvvm_suld_3d_i64_zero:
5021 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5022 Info.opc = getOpcForSurfaceInstr(Intrinsic);
5023 Info.memVT = MVT::i64;
5024 Info.ptrVal = nullptr;
5025 Info.offset = 0;
5026 Info.flags = MachineMemOperand::MOLoad;
5027 Info.align = Align(16);
5028 return true;
5029 }
5030 return false;
5031}
5032
5033/// getFunctionParamOptimizedAlign - since function arguments are passed via
5034/// .param space, we may want to increase their alignment in a way that
5035/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// since for other linkage types callers may already rely on the default
/// alignment. To allow 128-bit vectorized loads/stores, this function
5039/// ensures that alignment is 16 or greater.
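///
/// For example, an i32 argument with 4-byte ABI alignment is reported as
/// 16-byte aligned when the function has local linkage and its address is not
/// taken, which permits 128-bit vectorized .param accesses.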
5040Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
5041 const Function *F, Type *ArgTy, const DataLayout &DL) const {
5042 const uint64_t ABITypeAlign = DL.getABITypeAlign(Ty: ArgTy).value();
5043
  // If a function has linkage different from internal or private, we must
  // use the default ABI alignment, as external users rely on it. The same
  // applies to a function that may be called through a function pointer.
5047 if (!F || !F->hasLocalLinkage() ||
5048 F->hasAddressTaken(/*Users=*/nullptr,
5049 /*IgnoreCallbackUses=*/false,
5050 /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
5052 return Align(ABITypeAlign);
5053
5054 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
5055 return Align(std::max(a: uint64_t(16), b: ABITypeAlign));
5056}
5057
5058/// Helper for computing alignment of a device function byval parameter.
5059Align NVPTXTargetLowering::getFunctionByValParamAlign(
5060 const Function *F, Type *ArgTy, Align InitialAlign,
5061 const DataLayout &DL) const {
5062 Align ArgAlign = InitialAlign;
5063 // Try to increase alignment to enhance vectorization options.
5064 if (F)
5065 ArgAlign = std::max(a: ArgAlign, b: getFunctionParamOptimizedAlign(F, ArgTy, DL));
5066
  // Old ptx versions have a bug. When PTX code takes the address of a byval
  // parameter with alignment < 4, ptxas generates code to spill the argument
  // into memory. Alas, on sm_50+ ptxas generates SASS code that fails with a
  // misaligned access. To work around the problem, make sure that we align
  // byval parameters by at least 4. This bug appears to be fixed at least
  // starting from ptxas > 9.0.
5074 // TODO: remove this after verifying the bug is not reproduced
5075 // on non-deprecated ptxas versions.
5076 if (ForceMinByValParamAlign)
5077 ArgAlign = std::max(a: ArgAlign, b: Align(4));
5078
5079 return ArgAlign;
5080}
5081
// Helper for getting a function parameter name. The name is composed from
// the function name and the parameter's index. A negative index corresponds
// to the special parameter (an unsized array) used for passing variable
// arguments.
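// For example, for a function whose symbol is "foo" (name chosen purely for
// illustration), parameter 0 is named "foo_param_0" and the vararg parameter
// is named "foo_vararg".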
5085std::string NVPTXTargetLowering::getParamName(const Function *F,
5086 int Idx) const {
5087 std::string ParamName;
5088 raw_string_ostream ParamStr(ParamName);
5089
5090 ParamStr << getTargetMachine().getSymbol(GV: F)->getName();
5091 if (Idx < 0)
5092 ParamStr << "_vararg";
5093 else
5094 ParamStr << "_param_" << Idx;
5095
5096 return ParamName;
5097}
5098
5099/// isLegalAddressingMode - Return true if the addressing mode represented
5100/// by AM is legal for this target, for a load/store of the specified type.
5101/// Used to guide target specific optimizations, like loop strength reduction
5102/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5103/// (CodeGenPrepare.cpp)
5104bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
5105 const AddrMode &AM, Type *Ty,
5106 unsigned AS, Instruction *I) const {
5107 // AddrMode - This represents an addressing mode of:
5108 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5109 //
5110 // The legal address modes are
5111 // - [avar]
5112 // - [areg]
5113 // - [areg+immoff]
5114 // - [immAddr]
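  //
  // For example, [%r1] and [%r1+16] are accepted below (Scale == 0 with an
  // optional base register and immediate offset), while a reg+reg form such
  // as [%r1+%r2] (Scale == 1 with a base register) is rejected.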
5115
5116 if (AM.BaseGV) {
5117 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5118 }
5119
5120 switch (AM.Scale) {
5121 case 0: // "r", "r+i" or "i" is allowed
5122 break;
5123 case 1:
5124 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5125 return false;
5126 // Otherwise we have r+i.
5127 break;
5128 default:
5129 // No scale > 1 is allowed
5130 return false;
5131 }
5132 return true;
5133}
5134
5135//===----------------------------------------------------------------------===//
5136// NVPTX Inline Assembly Support
5137//===----------------------------------------------------------------------===//
5138
5139/// getConstraintType - Given a constraint letter, return the type of
5140/// constraint it is for this target.
5141NVPTXTargetLowering::ConstraintType
5142NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
5143 if (Constraint.size() == 1) {
5144 switch (Constraint[0]) {
5145 default:
5146 break;
5147 case 'b':
5148 case 'r':
5149 case 'h':
5150 case 'c':
5151 case 'l':
5152 case 'f':
5153 case 'd':
5154 case '0':
5155 case 'N':
5156 return C_RegisterClass;
5157 }
5158 }
5159 return TargetLowering::getConstraintType(Constraint);
5160}
5161
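// Map single-letter constraints to NVPTX register classes: 'b' selects the
// 1-bit predicate registers, 'c' and 'h' the 16-bit integer registers, 'r'
// the 32-bit integer registers, 'l' and 'N' the 64-bit integer registers,
// and 'f'/'d' the 32-/64-bit float registers.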
5162std::pair<unsigned, const TargetRegisterClass *>
5163NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
5164 StringRef Constraint,
5165 MVT VT) const {
5166 if (Constraint.size() == 1) {
5167 switch (Constraint[0]) {
5168 case 'b':
5169 return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
5170 case 'c':
5171 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5172 case 'h':
5173 return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
5174 case 'r':
5175 return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
5176 case 'l':
5177 case 'N':
5178 return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
5179 case 'f':
5180 return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
5181 case 'd':
5182 return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
5183 }
5184 }
5185 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5186}
5187
5188//===----------------------------------------------------------------------===//
5189// NVPTX DAG Combining
5190//===----------------------------------------------------------------------===//
5191
5192bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
5193 CodeGenOptLevel OptLevel) const {
5194 // Always honor command-line argument
5195 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5196 return FMAContractLevelOpt > 0;
5197
5198 // Do not contract if we're not optimizing the code.
5199 if (OptLevel == CodeGenOptLevel::None)
5200 return false;
5201
5202 // Honor TargetOptions flags that explicitly say fusion is okay.
5203 if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
5204 return true;
5205
5206 return allowUnsafeFPMath(MF);
5207}
5208
5209bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
5210 // Honor TargetOptions flags that explicitly say unsafe math is okay.
5211 if (MF.getTarget().Options.UnsafeFPMath)
5212 return true;
5213
5214 // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
5215 const Function &F = MF.getFunction();
5216 return F.getFnAttribute(Kind: "unsafe-fp-math").getValueAsBool();
5217}
5218
5219/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5220/// operands N0 and N1. This is a helper for PerformADDCombine that is
5221/// called with the default operands, and if that fails, with commuted
5222/// operands.
5223static SDValue PerformADDCombineWithOperands(
5224 SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI,
5225 const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) {
5226 SelectionDAG &DAG = DCI.DAG;
  // Skip the vector case; the integer mad and FP fma folds below only apply
  // to scalar values.
  EVT VT = N0.getValueType();
5229 if (VT.isVector())
5230 return SDValue();
5231
5232 // fold (add (mul a, b), c) -> (mad a, b, c)
5233 //
5234 if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
5236 // For integer:
5237 // Since integer multiply-add costs the same as integer multiply
5238 // but is more costly than integer add, do the fusion only when
5239 // the mul is only used in the add.
5240 if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 ||
5241 !N0.getNode()->hasOneUse())
5242 return SDValue();
5243
5244 // Do the folding
5245 return DAG.getNode(Opcode: NVPTXISD::IMAD, DL: SDLoc(N), VT,
5246 N1: N0.getOperand(i: 0), N2: N0.getOperand(i: 1), N3: N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
5249 if (VT == MVT::f32 || VT == MVT::f64) {
5250 const auto *TLI = static_cast<const NVPTXTargetLowering *>(
5251 &DAG.getTargetLoweringInfo());
5252 if (!TLI->allowFMA(MF&: DAG.getMachineFunction(), OptLevel))
5253 return SDValue();
5254
5255 // For floating point:
      // Do the fusion only when the mul has fewer than 5 uses, all of which
      // are adds.
      // The heuristic is that if a use is not an add, that use cannot be
      // fused into an fma, so the mul is still needed anyway. If there are
      // more than 4 uses, even if they are all adds, fusing them will
      // increase register pressure.
5262 //
5263 int numUses = 0;
5264 int nonAddCount = 0;
5265 for (const SDNode *User : N0.getNode()->uses()) {
5266 numUses++;
5267 if (User->getOpcode() != ISD::FADD)
5268 ++nonAddCount;
5269 }
5270 if (numUses >= 5)
5271 return SDValue();
5272 if (nonAddCount) {
5273 int orderNo = N->getIROrder();
5274 int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating potential register pressure: the
        // IR-order difference approximates the distance between the def and
        // this use, and a longer distance is more likely to cause register
        // pressure.
5279 if (orderNo - orderNo2 < 500)
5280 return SDValue();
5281
        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
5284 bool opIsLive = false;
5285 const SDNode *left = N0.getOperand(i: 0).getNode();
5286 const SDNode *right = N0.getOperand(i: 1).getNode();
5287
5288 if (isa<ConstantSDNode>(Val: left) || isa<ConstantSDNode>(Val: right))
5289 opIsLive = true;
5290
5291 if (!opIsLive)
5292 for (const SDNode *User : left->uses()) {
5293 int orderNo3 = User->getIROrder();
5294 if (orderNo3 > orderNo) {
5295 opIsLive = true;
5296 break;
5297 }
5298 }
5299
5300 if (!opIsLive)
5301 for (const SDNode *User : right->uses()) {
5302 int orderNo3 = User->getIROrder();
5303 if (orderNo3 > orderNo) {
5304 opIsLive = true;
5305 break;
5306 }
5307 }
5308
5309 if (!opIsLive)
5310 return SDValue();
5311 }
5312
5313 return DAG.getNode(Opcode: ISD::FMA, DL: SDLoc(N), VT,
5314 N1: N0.getOperand(i: 0), N2: N0.getOperand(i: 1), N3: N1);
5315 }
5316 }
5317
5318 return SDValue();
5319}
5320
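// Fold a StoreRetval-style node whose stored values are all undef by
// forwarding its incoming chain, so the dead store disappears.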
5321static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands at index 2 and beyond are the values to be stored.
5323 for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I)
5324 if (!N->getOperand(Num: I).isUndef())
5325 return SDValue();
5326
5327 // Operand 0 is the previous value in the chain. Cannot return EntryToken
5328 // as the previous value will become unused and eliminated later.
5329 return N->getOperand(Num: 0);
5330}
5331
5332/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5333///
5334static SDValue PerformADDCombine(SDNode *N,
5335 TargetLowering::DAGCombinerInfo &DCI,
5336 const NVPTXSubtarget &Subtarget,
5337 CodeGenOptLevel OptLevel) {
5338 SDValue N0 = N->getOperand(Num: 0);
5339 SDValue N1 = N->getOperand(Num: 1);
5340
5341 // First try with the default operand order.
5342 if (SDValue Result =
5343 PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
5344 return Result;
5345
5346 // If that didn't work, try again with the operands commuted.
5347 return PerformADDCombineWithOperands(N, N0: N1, N1: N0, DCI, Subtarget, OptLevel);
5348}
5349
5350static SDValue PerformANDCombine(SDNode *N,
5351 TargetLowering::DAGCombinerInfo &DCI) {
5352 // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if the target type is an integer),
5354 // and ANDs off the high 8 bits. Since we turn this load into a
5355 // target-specific DAG node, the DAG combiner fails to eliminate these AND
5356 // nodes. Do that here.
5357 SDValue Val = N->getOperand(Num: 0);
5358 SDValue Mask = N->getOperand(Num: 1);
5359
5360 if (isa<ConstantSDNode>(Val)) {
5361 std::swap(a&: Val, b&: Mask);
5362 }
5363
5364 SDValue AExt;
5365
  // Convert "BFE -> truncate i16 -> and 255" to just "BFE -> truncate i16",
  // as the value already has all the bits in the right places.
5369 if (Val.getOpcode() == ISD::TRUNCATE) {
5370 SDValue BFE = Val.getOperand(i: 0);
5371 if (BFE.getOpcode() != NVPTXISD::BFE)
5372 return SDValue();
5373
5374 ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(Val: BFE.getOperand(i: 0));
5375 if (!BFEBits)
5376 return SDValue();
5377 uint64_t BFEBitsVal = BFEBits->getZExtValue();
5378
5379 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask);
5380 if (!MaskCnst) {
5381 // Not an AND with a constant
5382 return SDValue();
5383 }
5384 uint64_t MaskVal = MaskCnst->getZExtValue();
5385
5386 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
5387 return SDValue();
5388 // If we get here, the AND is unnecessary. Just replace it with the trunc
5389 DCI.CombineTo(N, Res: Val, AddTo: false);
5390 }
5391 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
5392 if (Val.getOpcode() == ISD::ANY_EXTEND) {
5393 AExt = Val;
5394 Val = Val->getOperand(Num: 0);
5395 }
5396
5397 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
5398 Val = Val->getOperand(Num: 0);
5399 }
5400
5401 if (Val->getOpcode() == NVPTXISD::LoadV2 ||
5402 Val->getOpcode() == NVPTXISD::LoadV4) {
5403 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask);
5404 if (!MaskCnst) {
5405 // Not an AND with a constant
5406 return SDValue();
5407 }
5408
5409 uint64_t MaskVal = MaskCnst->getZExtValue();
5410 if (MaskVal != 0xff) {
5411 // Not an AND that chops off top 8 bits
5412 return SDValue();
5413 }
5414
5415 MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
5416 if (!Mem) {
5417 // Not a MemSDNode?!?
5418 return SDValue();
5419 }
5420
5421 EVT MemVT = Mem->getMemoryVT();
5422 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
5423 // We only handle the i8 case
5424 return SDValue();
5425 }
5426
5427 unsigned ExtType = Val->getConstantOperandVal(Num: Val->getNumOperands() - 1);
5428 if (ExtType == ISD::SEXTLOAD) {
5429 // If for some reason the load is a sextload, the and is needed to zero
5430 // out the high 8 bits
5431 return SDValue();
5432 }
5433
5434 bool AddTo = false;
5435 if (AExt.getNode() != nullptr) {
5436 // Re-insert the ext as a zext.
5437 Val = DCI.DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N),
5438 VT: AExt.getValueType(), Operand: Val);
5439 AddTo = true;
5440 }
5441
5442 // If we get here, the AND is unnecessary. Just replace it with the load
5443 DCI.CombineTo(N, Res: Val, AddTo);
5444 }
5445
5446 return SDValue();
5447}
5448
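// If the matching quotient (Num / Den) is computed elsewhere in the DAG,
// rewrite Num % Den as Num - (Num / Den) * Den so the division is reused and
// no separate remainder operation is emitted.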
5449static SDValue PerformREMCombine(SDNode *N,
5450 TargetLowering::DAGCombinerInfo &DCI,
5451 CodeGenOptLevel OptLevel) {
5452 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
5453
5454 // Don't do anything at less than -O2.
5455 if (OptLevel < CodeGenOptLevel::Default)
5456 return SDValue();
5457
5458 SelectionDAG &DAG = DCI.DAG;
5459 SDLoc DL(N);
5460 EVT VT = N->getValueType(ResNo: 0);
5461 bool IsSigned = N->getOpcode() == ISD::SREM;
5462 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
5463
5464 const SDValue &Num = N->getOperand(Num: 0);
5465 const SDValue &Den = N->getOperand(Num: 1);
5466
5467 for (const SDNode *U : Num->uses()) {
5468 if (U->getOpcode() == DivOpc && U->getOperand(Num: 0) == Num &&
5469 U->getOperand(Num: 1) == Den) {
5470 // Num % Den -> Num - (Num / Den) * Den
5471 return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Num,
5472 N2: DAG.getNode(Opcode: ISD::MUL, DL, VT,
5473 N1: DAG.getNode(Opcode: DivOpc, DL, VT, N1: Num, N2: Den),
5474 N2: Den));
5475 }
5476 }
5477 return SDValue();
5478}
5479
5480enum OperandSignedness {
5481 Signed = 0,
5482 Unsigned,
5483 Unknown
5484};
5485
5486/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5487/// that can be demoted to \p OptSize bits without loss of information. The
5488/// signedness of the operand, if determinable, is placed in \p S.
5489static bool IsMulWideOperandDemotable(SDValue Op,
5490 unsigned OptSize,
5491 OperandSignedness &S) {
5492 S = Unknown;
5493
5494 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
5495 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
5496 EVT OrigVT = Op.getOperand(i: 0).getValueType();
5497 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5498 S = Signed;
5499 return true;
5500 }
5501 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
5502 EVT OrigVT = Op.getOperand(i: 0).getValueType();
5503 if (OrigVT.getFixedSizeInBits() <= OptSize) {
5504 S = Unsigned;
5505 return true;
5506 }
5507 }
5508
5509 return false;
5510}
5511
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned) {
      return Val.isIntN(OptSize);
    } else {
      return Val.isSignedIntN(OptSize);
    }
  } else {
    OperandSignedness RHSSign;
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
      return false;

    return LHSSign == RHSSign;
  }
}

/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
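///
/// For example, when both operands are sign-extended from i32:
///   (i64 (mul (sext i32 %a), (sext i32 %b)))
///     --> (i64 (NVPTXISD::MUL_WIDE_SIGNED %a, %b))
/// which can then be matched to a single mul.wide.s32 instruction.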
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS =
      DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel > CodeGenOptLevel::None) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel > CodeGenOptLevel::None) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

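/// PerformSETCCCombine - Lower v2f16/v2bf16 comparisons producing a v2i1
/// result to the packed NVPTXISD::SETP_F16X2/SETP_BF16X2 nodes, so that a
/// single setp instruction compares both lanes at once. The bf16 form is only
/// used on sm_90 and newer.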
static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   unsigned int SmVersion) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
    return SDValue();

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

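/// PerformEXTRACTCombine - Fold extract_vector_elt on small, non-native
/// vectors into shift/truncate operations on the bitcast integer value, so the
/// element is extracted in-register instead of going through memory. Roughly,
/// extracting element 1 of a v2i16 becomes:
///   (i16 (trunc (sra (i32 (bitcast %v)), 16)))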
static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types; we already
  // handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by the generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}

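/// PerformVSELECTCombine - Split a v4i8 vselect into four per-element selects
/// performed as i32 operations, then rebuild the v4i8 result. See the comment
/// in the function body for why i32 is used for the element-wise selects.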
static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we end
  // up with 32-bit values anyway, so we may as well do the comparison as i32
  // to avoid the conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}

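/// PerformLOADCombine - Turn a v16i8 load into a single LoadV4 of i32 results
/// before type legalization gets a chance to split it into many small loads.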
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  LoadSDNode *LD = cast<LoadSDNode>(N);

  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
  // letting ReplaceLoadVector split it into smaller loads during legalization.
  // This is done at dag-combine1 time, so that vector operations with i8
  // elements can be optimised away instead of being needlessly split during
  // legalization, which involves storing to the stack and loading it back.
  EVT VT = N->getValueType(0);
  if (VT != MVT::v16i8)
    return SDValue();

  SDLoc DL(N);

  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
  unsigned Opc = NVPTXISD::LoadV4;
  EVT NewVT = MVT::v4i32;
  EVT EltVT = NewVT.getVectorElementType();
  unsigned NumElts = NewVT.getVectorNumElements();
  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
  SDVTList RetVTList = DAG.getVTList(RetVTs);
  SmallVector<SDValue, 8> Ops(N->ops());
  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
                                            LD->getMemOperand());
  SDValue NewChain = NewLoad.getValue(NumElts);

  // Create a vector of the same type returned by the original load.
  SmallVector<SDValue, 4> Elts;
  for (unsigned i = 0; i < NumElts; i++)
    Elts.push_back(NewLoad.getValue(i));
  return DCI.DAG.getMergeValues(
      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
       NewChain},
      DL);
}

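/// PerformDAGCombine - Dispatch to the target-specific combines implemented
/// above based on the opcode of \p N.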
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::FADD:
      return PerformADDCombine(N, DCI, STI, OptLevel);
    case ISD::MUL:
      return PerformMULCombine(N, DCI, OptLevel);
    case ISD::SHL:
      return PerformSHLCombine(N, DCI, OptLevel);
    case ISD::AND:
      return PerformANDCombine(N, DCI);
    case ISD::UREM:
    case ISD::SREM:
      return PerformREMCombine(N, DCI, OptLevel);
    case ISD::SETCC:
      return PerformSETCCCombine(N, DCI, STI.getSmVersion());
    case ISD::LOAD:
      return PerformLOADCombine(N, DCI);
    case NVPTXISD::StoreRetval:
    case NVPTXISD::StoreRetvalV2:
    case NVPTXISD::StoreRetvalV4:
      return PerformStoreRetvalCombine(N);
    case ISD::EXTRACT_VECTOR_ELT:
      return PerformEXTRACTCombine(N, DCI);
    case ISD::VSELECT:
      return PerformVSELECTCombine(N, DCI);
  }
  return SDValue();
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
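/// For example, a <4 x float> load becomes a single NVPTXISD::LoadV4 node with
/// four f32 results plus a chain, which is then matched to a single ld.v4
/// instruction.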
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 loads of <2 x double> here
  // but I'm leaving that as a TODO for now.
  assert(ResVT.isSimple() && "Can only handle simple types");
  switch (ResVT.getSimpleVT().SimpleTy) {
  default:
    return;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4f32:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
    // This is a "native" vector type
    break;
  }

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  EVT EltVT = ResVT.getVectorElementType();
  unsigned NumElts = ResVT.getVectorNumElements();

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;
  bool Load16x2 = false;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  case 8: {
    // v8f16 is a special case. PTX doesn't have an ld.v8.f16
    // instruction. Instead, we split the vector into v2f16 chunks and
    // load them with ld.v4.b32.
    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
    Load16x2 = true;
    Opcode = NVPTXISD::LoadV4;
    EVT VVT;
    switch (EltVT.getSimpleVT().SimpleTy) {
    case MVT::f16:
      VVT = MVT::v2f16;
      break;
    case MVT::bf16:
      VVT = MVT::v2bf16;
      break;
    case MVT::i16:
      VVT = MVT::v2i16;
      break;
    default:
      llvm_unreachable("Unsupported v8 vector type.");
    }
    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue, 8> ScalarRes;
  if (Load16x2) {
    // Split the v2x16 subvectors back into individual elements.
    NumElts /= 2;
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(0, DL));
      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
                               DAG.getIntPtrConstant(1, DL));
      ScalarRes.push_back(E0);
      ScalarRes.push_back(E1);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}

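/// ReplaceINTRINSIC_W_CHAIN - Custom-legalize the results of the
/// llvm.nvvm.ldg.* and llvm.nvvm.ldu.* intrinsics: vector results become
/// LDGV2/LDGV4 (or LDUV2/LDUV4) target nodes, and i8 results are widened to
/// i16 while keeping the memory type at i8.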
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as the
      // memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV2;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV2;
          break;
        }
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        switch (IntrinNo) {
        default:
          return;
        case Intrinsic::nvvm_ldg_global_i:
        case Intrinsic::nvvm_ldg_global_f:
        case Intrinsic::nvvm_ldg_global_p:
          Opcode = NVPTXISD::LDGV4;
          break;
        case Intrinsic::nvvm_ldu_global_i:
        case Intrinsic::nvvm_ldu_global_f:
        case Intrinsic::nvvm_ldu_global_p:
          Opcode = NVPTXISD::LDUV4;
          break;
        }
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res =
              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec =
          DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

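/// ReplaceNodeResults - Replace the results of nodes that require custom
/// result legalization; currently only vector loads and the LDG/LDU
/// intrinsics are handled here.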
void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  }
}

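/// shouldExpandAtomicRMWInIR - Decide how each atomicrmw is lowered: return
/// None for operations PTX supports natively (e.g. 32-bit integer
/// add/sub/min/max and bitwise ops, f32 atom.add, and the 64-bit or f16/f64
/// forms when the subtarget has them), and CmpXChg to expand everything else
/// into a compare-and-swap loop.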
NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}