1 | //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file defines the interfaces that NVPTX uses to lower LLVM code into a |
10 | // selection DAG. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "NVPTXISelLowering.h" |
15 | #include "MCTargetDesc/NVPTXBaseInfo.h" |
16 | #include "NVPTX.h" |
17 | #include "NVPTXSubtarget.h" |
18 | #include "NVPTXTargetMachine.h" |
19 | #include "NVPTXTargetObjectFile.h" |
20 | #include "NVPTXUtilities.h" |
21 | #include "llvm/ADT/APInt.h" |
22 | #include "llvm/ADT/STLExtras.h" |
23 | #include "llvm/ADT/SmallVector.h" |
24 | #include "llvm/ADT/StringRef.h" |
25 | #include "llvm/CodeGen/Analysis.h" |
26 | #include "llvm/CodeGen/ISDOpcodes.h" |
27 | #include "llvm/CodeGen/MachineFunction.h" |
28 | #include "llvm/CodeGen/MachineMemOperand.h" |
29 | #include "llvm/CodeGen/SelectionDAG.h" |
30 | #include "llvm/CodeGen/SelectionDAGNodes.h" |
31 | #include "llvm/CodeGen/TargetCallingConv.h" |
32 | #include "llvm/CodeGen/TargetLowering.h" |
33 | #include "llvm/CodeGen/ValueTypes.h" |
34 | #include "llvm/CodeGenTypes/MachineValueType.h" |
35 | #include "llvm/IR/Argument.h" |
36 | #include "llvm/IR/Attributes.h" |
37 | #include "llvm/IR/Constants.h" |
38 | #include "llvm/IR/DataLayout.h" |
39 | #include "llvm/IR/DerivedTypes.h" |
40 | #include "llvm/IR/DiagnosticInfo.h" |
41 | #include "llvm/IR/FPEnv.h" |
42 | #include "llvm/IR/Function.h" |
43 | #include "llvm/IR/GlobalValue.h" |
44 | #include "llvm/IR/Instruction.h" |
45 | #include "llvm/IR/Instructions.h" |
46 | #include "llvm/IR/IntrinsicsNVPTX.h" |
47 | #include "llvm/IR/Module.h" |
48 | #include "llvm/IR/Type.h" |
49 | #include "llvm/IR/Value.h" |
50 | #include "llvm/Support/Alignment.h" |
51 | #include "llvm/Support/Casting.h" |
52 | #include "llvm/Support/CodeGen.h" |
53 | #include "llvm/Support/CommandLine.h" |
54 | #include "llvm/Support/ErrorHandling.h" |
55 | #include "llvm/Support/raw_ostream.h" |
56 | #include "llvm/Target/TargetMachine.h" |
57 | #include "llvm/Target/TargetOptions.h" |
58 | #include <algorithm> |
59 | #include <cassert> |
60 | #include <cmath> |
61 | #include <cstdint> |
62 | #include <iterator> |
63 | #include <optional> |
64 | #include <sstream> |
65 | #include <string> |
66 | #include <utility> |
67 | #include <vector> |
68 | |
69 | #define DEBUG_TYPE "nvptx-lower" |
70 | |
71 | using namespace llvm; |
72 | |
73 | static std::atomic<unsigned> GlobalUniqueCallSite; |
74 | |
75 | static cl::opt<bool> sched4reg( |
76 | "nvptx-sched4reg" , |
77 | cl::desc("NVPTX Specific: schedule for register pressue" ), cl::init(Val: false)); |
78 | |
79 | static cl::opt<unsigned> FMAContractLevelOpt( |
80 | "nvptx-fma-level" , cl::Hidden, |
81 | cl::desc("NVPTX Specific: FMA contraction (0: don't do it" |
82 | " 1: do it 2: do it aggressively" ), |
83 | cl::init(Val: 2)); |
84 | |
85 | static cl::opt<int> UsePrecDivF32( |
86 | "nvptx-prec-divf32" , cl::Hidden, |
87 | cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" |
88 | " IEEE Compliant F32 div.rnd if available." ), |
89 | cl::init(Val: 2)); |
90 | |
91 | static cl::opt<bool> UsePrecSqrtF32( |
92 | "nvptx-prec-sqrtf32" , cl::Hidden, |
93 | cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn." ), |
94 | cl::init(Val: true)); |
95 | |
96 | static cl::opt<bool> ForceMinByValParamAlign( |
97 | "nvptx-force-min-byval-param-align" , cl::Hidden, |
98 | cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" |
99 | " params of device functions." ), |
100 | cl::init(Val: false)); |
101 | |
102 | int NVPTXTargetLowering::getDivF32Level() const { |
103 | if (UsePrecDivF32.getNumOccurrences() > 0) { |
104 | // If nvptx-prec-div32=N is used on the command-line, always honor it |
105 | return UsePrecDivF32; |
106 | } else { |
107 | // Otherwise, use div.approx if fast math is enabled |
108 | if (getTargetMachine().Options.UnsafeFPMath) |
109 | return 0; |
110 | else |
111 | return 2; |
112 | } |
113 | } |
114 | |
115 | bool NVPTXTargetLowering::usePrecSqrtF32() const { |
116 | if (UsePrecSqrtF32.getNumOccurrences() > 0) { |
117 | // If nvptx-prec-sqrtf32 is used on the command-line, always honor it |
118 | return UsePrecSqrtF32; |
119 | } else { |
120 | // Otherwise, use sqrt.approx if fast math is enabled |
121 | return !getTargetMachine().Options.UnsafeFPMath; |
122 | } |
123 | } |
124 | |
125 | bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const { |
126 | return MF.getDenormalMode(FPType: APFloat::IEEEsingle()).Output == |
127 | DenormalMode::PreserveSign; |
128 | } |
129 | |
130 | static bool IsPTXVectorType(MVT VT) { |
131 | switch (VT.SimpleTy) { |
132 | default: |
133 | return false; |
134 | case MVT::v2i1: |
135 | case MVT::v4i1: |
136 | case MVT::v2i8: |
137 | case MVT::v4i8: |
138 | case MVT::v2i16: |
139 | case MVT::v4i16: |
140 | case MVT::v8i16: // <4 x i16x2> |
141 | case MVT::v2i32: |
142 | case MVT::v4i32: |
143 | case MVT::v2i64: |
144 | case MVT::v2f16: |
145 | case MVT::v4f16: |
146 | case MVT::v8f16: // <4 x f16x2> |
147 | case MVT::v2bf16: |
148 | case MVT::v4bf16: |
149 | case MVT::v8bf16: // <4 x bf16x2> |
150 | case MVT::v2f32: |
151 | case MVT::v4f32: |
152 | case MVT::v2f64: |
153 | return true; |
154 | } |
155 | } |
156 | |
157 | static bool Is16bitsType(MVT VT) { |
158 | return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 || |
159 | VT.SimpleTy == MVT::i16); |
160 | } |
161 | |
162 | /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive |
163 | /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors |
164 | /// into their primitive components. |
165 | /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the |
166 | /// same number of types as the Ins/Outs arrays in LowerFormalArguments, |
167 | /// LowerCall, and LowerReturn. |
168 | static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, |
169 | Type *Ty, SmallVectorImpl<EVT> &ValueVTs, |
170 | SmallVectorImpl<uint64_t> *Offsets = nullptr, |
171 | uint64_t StartingOffset = 0) { |
172 | SmallVector<EVT, 16> TempVTs; |
173 | SmallVector<uint64_t, 16> TempOffsets; |
174 | |
175 | // Special case for i128 - decompose to (i64, i64) |
176 | if (Ty->isIntegerTy(Bitwidth: 128)) { |
177 | ValueVTs.push_back(EVT(MVT::i64)); |
178 | ValueVTs.push_back(EVT(MVT::i64)); |
179 | |
180 | if (Offsets) { |
181 | Offsets->push_back(Elt: StartingOffset + 0); |
182 | Offsets->push_back(Elt: StartingOffset + 8); |
183 | } |
184 | |
185 | return; |
186 | } |
187 | |
188 | // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs. |
189 | if (StructType *STy = dyn_cast<StructType>(Val: Ty)) { |
190 | auto const *SL = DL.getStructLayout(Ty: STy); |
191 | auto ElementNum = 0; |
192 | for(auto *EI : STy->elements()) { |
193 | ComputePTXValueVTs(TLI, DL, Ty: EI, ValueVTs, Offsets, |
194 | StartingOffset: StartingOffset + SL->getElementOffset(Idx: ElementNum)); |
195 | ++ElementNum; |
196 | } |
197 | return; |
198 | } |
199 | |
200 | ComputeValueVTs(TLI, DL, Ty, ValueVTs&: TempVTs, FixedOffsets: &TempOffsets, StartingOffset); |
201 | for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { |
202 | EVT VT = TempVTs[i]; |
203 | uint64_t Off = TempOffsets[i]; |
204 | // Split vectors into individual elements, except for v2f16, which |
205 | // we will pass as a single scalar. |
206 | if (VT.isVector()) { |
207 | unsigned NumElts = VT.getVectorNumElements(); |
208 | EVT EltVT = VT.getVectorElementType(); |
209 | // Vectors with an even number of f16 elements will be passed to |
210 | // us as an array of v2f16/v2bf16 elements. We must match this so we |
211 | // stay in sync with Ins/Outs. |
212 | if ((Is16bitsType(VT: EltVT.getSimpleVT())) && NumElts % 2 == 0) { |
213 | switch (EltVT.getSimpleVT().SimpleTy) { |
214 | case MVT::f16: |
215 | EltVT = MVT::v2f16; |
216 | break; |
217 | case MVT::bf16: |
218 | EltVT = MVT::v2bf16; |
219 | break; |
220 | case MVT::i16: |
221 | EltVT = MVT::v2i16; |
222 | break; |
223 | default: |
224 | llvm_unreachable("Unexpected type" ); |
225 | } |
226 | NumElts /= 2; |
227 | } else if (EltVT.getSimpleVT() == MVT::i8 && |
228 | (NumElts % 4 == 0 || NumElts == 3)) { |
229 | // v*i8 are formally lowered as v4i8 |
230 | EltVT = MVT::v4i8; |
231 | NumElts = (NumElts + 3) / 4; |
232 | } |
233 | for (unsigned j = 0; j != NumElts; ++j) { |
234 | ValueVTs.push_back(Elt: EltVT); |
235 | if (Offsets) |
236 | Offsets->push_back(Elt: Off + j * EltVT.getStoreSize()); |
237 | } |
238 | } else { |
239 | ValueVTs.push_back(Elt: VT); |
240 | if (Offsets) |
241 | Offsets->push_back(Elt: Off); |
242 | } |
243 | } |
244 | } |
245 | |
246 | /// PromoteScalarIntegerPTX |
247 | /// Used to make sure the arguments/returns are suitable for passing |
248 | /// and promote them to a larger size if they're not. |
249 | /// |
250 | /// The promoted type is placed in \p PromoteVT if the function returns true. |
251 | static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) { |
252 | if (VT.isScalarInteger()) { |
253 | switch (PowerOf2Ceil(A: VT.getFixedSizeInBits())) { |
254 | default: |
255 | llvm_unreachable( |
256 | "Promotion is not suitable for scalars of size larger than 64-bits" ); |
257 | case 1: |
258 | *PromotedVT = MVT::i1; |
259 | break; |
260 | case 2: |
261 | case 4: |
262 | case 8: |
263 | *PromotedVT = MVT::i8; |
264 | break; |
265 | case 16: |
266 | *PromotedVT = MVT::i16; |
267 | break; |
268 | case 32: |
269 | *PromotedVT = MVT::i32; |
270 | break; |
271 | case 64: |
272 | *PromotedVT = MVT::i64; |
273 | break; |
274 | } |
275 | return EVT(*PromotedVT) != VT; |
276 | } |
277 | return false; |
278 | } |
279 | |
280 | // Check whether we can merge loads/stores of some of the pieces of a |
281 | // flattened function parameter or return value into a single vector |
282 | // load/store. |
283 | // |
284 | // The flattened parameter is represented as a list of EVTs and |
285 | // offsets, and the whole structure is aligned to ParamAlignment. This |
286 | // function determines whether we can load/store pieces of the |
287 | // parameter starting at index Idx using a single vectorized op of |
288 | // size AccessSize. If so, it returns the number of param pieces |
289 | // covered by the vector op. Otherwise, it returns 1. |
290 | static unsigned CanMergeParamLoadStoresStartingAt( |
291 | unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs, |
292 | const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) { |
293 | |
294 | // Can't vectorize if param alignment is not sufficient. |
295 | if (ParamAlignment < AccessSize) |
296 | return 1; |
297 | // Can't vectorize if offset is not aligned. |
298 | if (Offsets[Idx] & (AccessSize - 1)) |
299 | return 1; |
300 | |
301 | EVT EltVT = ValueVTs[Idx]; |
302 | unsigned EltSize = EltVT.getStoreSize(); |
303 | |
304 | // Element is too large to vectorize. |
305 | if (EltSize >= AccessSize) |
306 | return 1; |
307 | |
308 | unsigned NumElts = AccessSize / EltSize; |
309 | // Can't vectorize if AccessBytes if not a multiple of EltSize. |
310 | if (AccessSize != EltSize * NumElts) |
311 | return 1; |
312 | |
313 | // We don't have enough elements to vectorize. |
314 | if (Idx + NumElts > ValueVTs.size()) |
315 | return 1; |
316 | |
317 | // PTX ISA can only deal with 2- and 4-element vector ops. |
318 | if (NumElts != 4 && NumElts != 2) |
319 | return 1; |
320 | |
321 | for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) { |
322 | // Types do not match. |
323 | if (ValueVTs[j] != EltVT) |
324 | return 1; |
325 | |
326 | // Elements are not contiguous. |
327 | if (Offsets[j] - Offsets[j - 1] != EltSize) |
328 | return 1; |
329 | } |
330 | // OK. We can vectorize ValueVTs[i..i+NumElts) |
331 | return NumElts; |
332 | } |
333 | |
334 | // Flags for tracking per-element vectorization state of loads/stores |
335 | // of a flattened function parameter or return value. |
336 | enum ParamVectorizationFlags { |
337 | PVF_INNER = 0x0, // Middle elements of a vector. |
338 | PVF_FIRST = 0x1, // First element of the vector. |
339 | PVF_LAST = 0x2, // Last element of the vector. |
340 | // Scalar is effectively a 1-element vector. |
341 | PVF_SCALAR = PVF_FIRST | PVF_LAST |
342 | }; |
343 | |
344 | // Computes whether and how we can vectorize the loads/stores of a |
345 | // flattened function parameter or return value. |
346 | // |
347 | // The flattened parameter is represented as the list of ValueVTs and |
348 | // Offsets, and is aligned to ParamAlignment bytes. We return a vector |
349 | // of the same size as ValueVTs indicating how each piece should be |
350 | // loaded/stored (i.e. as a scalar, or as part of a vector |
351 | // load/store). |
352 | static SmallVector<ParamVectorizationFlags, 16> |
353 | VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs, |
354 | const SmallVectorImpl<uint64_t> &Offsets, |
355 | Align ParamAlignment, bool IsVAArg = false) { |
356 | // Set vector size to match ValueVTs and mark all elements as |
357 | // scalars by default. |
358 | SmallVector<ParamVectorizationFlags, 16> VectorInfo; |
359 | VectorInfo.assign(NumElts: ValueVTs.size(), Elt: PVF_SCALAR); |
360 | |
361 | if (IsVAArg) |
362 | return VectorInfo; |
363 | |
364 | // Check what we can vectorize using 128/64/32-bit accesses. |
365 | for (int I = 0, E = ValueVTs.size(); I != E; ++I) { |
366 | // Skip elements we've already processed. |
367 | assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state." ); |
368 | for (unsigned AccessSize : {16, 8, 4, 2}) { |
369 | unsigned NumElts = CanMergeParamLoadStoresStartingAt( |
370 | Idx: I, AccessSize, ValueVTs, Offsets, ParamAlignment); |
371 | // Mark vectorized elements. |
372 | switch (NumElts) { |
373 | default: |
374 | llvm_unreachable("Unexpected return value" ); |
375 | case 1: |
376 | // Can't vectorize using this size, try next smaller size. |
377 | continue; |
378 | case 2: |
379 | assert(I + 1 < E && "Not enough elements." ); |
380 | VectorInfo[I] = PVF_FIRST; |
381 | VectorInfo[I + 1] = PVF_LAST; |
382 | I += 1; |
383 | break; |
384 | case 4: |
385 | assert(I + 3 < E && "Not enough elements." ); |
386 | VectorInfo[I] = PVF_FIRST; |
387 | VectorInfo[I + 1] = PVF_INNER; |
388 | VectorInfo[I + 2] = PVF_INNER; |
389 | VectorInfo[I + 3] = PVF_LAST; |
390 | I += 3; |
391 | break; |
392 | } |
393 | // Break out of the inner loop because we've already succeeded |
394 | // using largest possible AccessSize. |
395 | break; |
396 | } |
397 | } |
398 | return VectorInfo; |
399 | } |
400 | |
401 | // NVPTXTargetLowering Constructor. |
402 | NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, |
403 | const NVPTXSubtarget &STI) |
404 | : TargetLowering(TM), nvTM(&TM), STI(STI) { |
405 | // always lower memset, memcpy, and memmove intrinsics to load/store |
406 | // instructions, rather |
407 | // then generating calls to memset, mempcy or memmove. |
408 | MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF; |
409 | MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned) 0xFFFFFFFF; |
410 | MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned) 0xFFFFFFFF; |
411 | |
412 | setBooleanContents(ZeroOrNegativeOneBooleanContent); |
413 | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); |
414 | |
415 | // Jump is Expensive. Don't create extra control flow for 'and', 'or' |
416 | // condition branches. |
417 | setJumpIsExpensive(true); |
418 | |
419 | // Wide divides are _very_ slow. Try to reduce the width of the divide if |
420 | // possible. |
421 | addBypassSlowDiv(SlowBitWidth: 64, FastBitWidth: 32); |
422 | |
423 | // By default, use the Source scheduling |
424 | if (sched4reg) |
425 | setSchedulingPreference(Sched::RegPressure); |
426 | else |
427 | setSchedulingPreference(Sched::Source); |
428 | |
429 | auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
430 | LegalizeAction NoF16Action) { |
431 | setOperationAction(Op, VT, Action: STI.allowFP16Math() ? Action : NoF16Action); |
432 | }; |
433 | |
434 | auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
435 | LegalizeAction NoBF16Action) { |
436 | bool IsOpSupported = STI.hasBF16Math(); |
437 | // Few instructions are available on sm_90 only |
438 | switch(Op) { |
439 | case ISD::FADD: |
440 | case ISD::FMUL: |
441 | case ISD::FSUB: |
442 | case ISD::SELECT: |
443 | case ISD::SELECT_CC: |
444 | case ISD::SETCC: |
445 | case ISD::FEXP2: |
446 | case ISD::FCEIL: |
447 | case ISD::FFLOOR: |
448 | case ISD::FNEARBYINT: |
449 | case ISD::FRINT: |
450 | case ISD::FROUNDEVEN: |
451 | case ISD::FTRUNC: |
452 | IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 78; |
453 | break; |
454 | } |
455 | setOperationAction( |
456 | Op, VT, Action: IsOpSupported ? Action : NoBF16Action); |
457 | }; |
458 | |
459 | auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action, |
460 | LegalizeAction NoI16x2Action) { |
461 | bool IsOpSupported = false; |
462 | // instructions are available on sm_90 only |
463 | switch (Op) { |
464 | case ISD::ADD: |
465 | case ISD::SMAX: |
466 | case ISD::SMIN: |
467 | case ISD::UMIN: |
468 | case ISD::UMAX: |
469 | IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80; |
470 | break; |
471 | } |
472 | setOperationAction(Op, VT, Action: IsOpSupported ? Action : NoI16x2Action); |
473 | }; |
474 | |
475 | addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); |
476 | addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); |
477 | addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass); |
478 | addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass); |
479 | addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); |
480 | addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); |
481 | addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass); |
482 | addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass); |
483 | addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass); |
484 | addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass); |
485 | addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass); |
486 | addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass); |
487 | |
488 | // Conversion to/from FP16/FP16x2 is always legal. |
489 | setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); |
490 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); |
491 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand); |
492 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand); |
493 | |
494 | setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); |
495 | if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31) |
496 | setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal); |
497 | |
498 | setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote); |
499 | setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand); |
500 | |
501 | // Conversion to/from BFP16/BFP16x2 is always legal. |
502 | setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom); |
503 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom); |
504 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand); |
505 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand); |
506 | |
507 | setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand); |
508 | setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote); |
509 | if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote) |
510 | AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32); |
511 | |
512 | // Conversion to/from i16/i16x2 is always legal. |
513 | setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom); |
514 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); |
515 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand); |
516 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand); |
517 | |
518 | setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom); |
519 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); |
520 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); |
521 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); |
522 | // Only logical ops can be done on v4i8 directly, others must be done |
523 | // elementwise. |
524 | setOperationAction( |
525 | {ISD::ABS, ISD::ADD, ISD::ADDC, ISD::ADDE, |
526 | ISD::BITREVERSE, ISD::CTLZ, ISD::CTPOP, ISD::CTTZ, |
527 | ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FSHL, ISD::FSHR, |
528 | ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::PARITY, |
529 | ISD::ROTL, ISD::ROTR, ISD::SADDO, ISD::SADDO_CARRY, |
530 | ISD::SADDSAT, ISD::SDIV, ISD::SDIVREM, ISD::SELECT_CC, |
531 | ISD::SETCC, ISD::SHL, ISD::SINT_TO_FP, ISD::SMAX, |
532 | ISD::SMIN, ISD::SMULO, ISD::SMUL_LOHI, ISD::SRA, |
533 | ISD::SREM, ISD::SRL, ISD::SSHLSAT, ISD::SSUBO, |
534 | ISD::SSUBO_CARRY, ISD::SSUBSAT, ISD::SUB, ISD::SUBC, |
535 | ISD::SUBE, ISD::UADDO, ISD::UADDO_CARRY, ISD::UADDSAT, |
536 | ISD::UDIV, ISD::UDIVREM, ISD::UINT_TO_FP, ISD::UMAX, |
537 | ISD::UMIN, ISD::UMULO, ISD::UMUL_LOHI, ISD::UREM, |
538 | ISD::USHLSAT, ISD::USUBO, ISD::USUBO_CARRY, ISD::VSELECT, |
539 | ISD::USUBSAT}, |
540 | MVT::v4i8, Expand); |
541 | |
542 | // Operations not directly supported by NVPTX. |
543 | for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, |
544 | MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8, |
545 | MVT::i32, MVT::i64}) { |
546 | setOperationAction(ISD::SELECT_CC, VT, Expand); |
547 | setOperationAction(ISD::BR_CC, VT, Expand); |
548 | } |
549 | |
550 | // Some SIGN_EXTEND_INREG can be done using cvt instruction. |
551 | // For others we will expand to a SHL/SRA pair. |
552 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal); |
553 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); |
554 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); |
555 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); |
556 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); |
557 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand); |
558 | |
559 | setOperationAction(ISD::SHL_PARTS, MVT::i32 , Custom); |
560 | setOperationAction(ISD::SRA_PARTS, MVT::i32 , Custom); |
561 | setOperationAction(ISD::SRL_PARTS, MVT::i32 , Custom); |
562 | setOperationAction(ISD::SHL_PARTS, MVT::i64 , Custom); |
563 | setOperationAction(ISD::SRA_PARTS, MVT::i64 , Custom); |
564 | setOperationAction(ISD::SRL_PARTS, MVT::i64 , Custom); |
565 | |
566 | setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); |
567 | setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); |
568 | |
569 | // TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs |
570 | // that don't have h/w rotation we lower them to multi-instruction assembly. |
571 | // See ROT*_sw in NVPTXIntrInfo.td |
572 | setOperationAction(ISD::ROTL, MVT::i64, Legal); |
573 | setOperationAction(ISD::ROTR, MVT::i64, Legal); |
574 | setOperationAction(ISD::ROTL, MVT::i32, Legal); |
575 | setOperationAction(ISD::ROTR, MVT::i32, Legal); |
576 | |
577 | setOperationAction(ISD::ROTL, MVT::i16, Expand); |
578 | setOperationAction(ISD::ROTL, MVT::v2i16, Expand); |
579 | setOperationAction(ISD::ROTR, MVT::i16, Expand); |
580 | setOperationAction(ISD::ROTR, MVT::v2i16, Expand); |
581 | setOperationAction(ISD::ROTL, MVT::i8, Expand); |
582 | setOperationAction(ISD::ROTR, MVT::i8, Expand); |
583 | setOperationAction(ISD::BSWAP, MVT::i16, Expand); |
584 | |
585 | // Indirect branch is not supported. |
586 | // This also disables Jump Table creation. |
587 | setOperationAction(ISD::BR_JT, MVT::Other, Expand); |
588 | setOperationAction(ISD::BRIND, MVT::Other, Expand); |
589 | |
590 | setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); |
591 | setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); |
592 | |
593 | // We want to legalize constant related memmove and memcopy |
594 | // intrinsics. |
595 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); |
596 | |
597 | // Turn FP extload into load/fpextend |
598 | setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); |
599 | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); |
600 | setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); |
601 | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); |
602 | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); |
603 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); |
604 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); |
605 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); |
606 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); |
607 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); |
608 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); |
609 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); |
610 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); |
611 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); |
612 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); |
613 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); |
614 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); |
615 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); |
616 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); |
617 | // Turn FP truncstore into trunc + store. |
618 | // FIXME: vector types should also be expanded |
619 | setTruncStoreAction(MVT::f32, MVT::f16, Expand); |
620 | setTruncStoreAction(MVT::f64, MVT::f16, Expand); |
621 | setTruncStoreAction(MVT::f32, MVT::bf16, Expand); |
622 | setTruncStoreAction(MVT::f64, MVT::bf16, Expand); |
623 | setTruncStoreAction(MVT::f64, MVT::f32, Expand); |
624 | |
625 | // PTX does not support load / store predicate registers |
626 | setOperationAction(ISD::LOAD, MVT::i1, Custom); |
627 | setOperationAction(ISD::STORE, MVT::i1, Custom); |
628 | |
629 | for (MVT VT : MVT::integer_valuetypes()) { |
630 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); |
631 | setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); |
632 | setTruncStoreAction(VT, MVT::i1, Expand); |
633 | } |
634 | |
635 | // expand extload of vector of integers. |
636 | setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, |
637 | MVT::v2i8, Expand); |
638 | setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); |
639 | |
640 | // This is legal in NVPTX |
641 | setOperationAction(ISD::ConstantFP, MVT::f64, Legal); |
642 | setOperationAction(ISD::ConstantFP, MVT::f32, Legal); |
643 | setOperationAction(ISD::ConstantFP, MVT::f16, Legal); |
644 | setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); |
645 | |
646 | setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); |
647 | setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); |
648 | |
649 | // TRAP can be lowered to PTX trap |
650 | setOperationAction(ISD::TRAP, MVT::Other, Legal); |
651 | |
652 | // Register custom handling for vector loads/stores |
653 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { |
654 | if (IsPTXVectorType(VT)) { |
655 | setOperationAction(ISD::LOAD, VT, Custom); |
656 | setOperationAction(ISD::STORE, VT, Custom); |
657 | setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); |
658 | } |
659 | } |
660 | |
661 | // Support varargs. |
662 | setOperationAction(ISD::VASTART, MVT::Other, Custom); |
663 | setOperationAction(ISD::VAARG, MVT::Other, Custom); |
664 | setOperationAction(ISD::VACOPY, MVT::Other, Expand); |
665 | setOperationAction(ISD::VAEND, MVT::Other, Expand); |
666 | |
667 | // Custom handling for i8 intrinsics |
668 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); |
669 | |
670 | for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) { |
671 | setOperationAction(ISD::ABS, Ty, Legal); |
672 | setOperationAction(ISD::SMIN, Ty, Legal); |
673 | setOperationAction(ISD::SMAX, Ty, Legal); |
674 | setOperationAction(ISD::UMIN, Ty, Legal); |
675 | setOperationAction(ISD::UMAX, Ty, Legal); |
676 | |
677 | setOperationAction(ISD::CTPOP, Ty, Legal); |
678 | setOperationAction(ISD::CTLZ, Ty, Legal); |
679 | } |
680 | |
681 | setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom); |
682 | setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom); |
683 | setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom); |
684 | setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom); |
685 | setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom); |
686 | setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand); |
687 | setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand); |
688 | |
689 | setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom); |
690 | setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom); |
691 | setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom); |
692 | setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom); |
693 | setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom); |
694 | setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom); |
695 | |
696 | // Other arithmetic and logic ops are unsupported. |
697 | setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, |
698 | ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, |
699 | ISD::SINT_TO_FP, ISD::UINT_TO_FP}, |
700 | MVT::v2i16, Expand); |
701 | |
702 | setOperationAction(ISD::ADDC, MVT::i32, Legal); |
703 | setOperationAction(ISD::ADDE, MVT::i32, Legal); |
704 | setOperationAction(ISD::SUBC, MVT::i32, Legal); |
705 | setOperationAction(ISD::SUBE, MVT::i32, Legal); |
706 | if (STI.getPTXVersion() >= 43) { |
707 | setOperationAction(ISD::ADDC, MVT::i64, Legal); |
708 | setOperationAction(ISD::ADDE, MVT::i64, Legal); |
709 | setOperationAction(ISD::SUBC, MVT::i64, Legal); |
710 | setOperationAction(ISD::SUBE, MVT::i64, Legal); |
711 | } |
712 | |
713 | setOperationAction(ISD::CTTZ, MVT::i16, Expand); |
714 | setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); |
715 | setOperationAction(ISD::CTTZ, MVT::i32, Expand); |
716 | setOperationAction(ISD::CTTZ, MVT::i64, Expand); |
717 | |
718 | // PTX does not directly support SELP of i1, so promote to i32 first |
719 | setOperationAction(ISD::SELECT, MVT::i1, Custom); |
720 | |
721 | // PTX cannot multiply two i64s in a single instruction. |
722 | setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); |
723 | setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); |
724 | |
725 | // We have some custom DAG combine patterns for these nodes |
726 | setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD, |
727 | ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, |
728 | ISD::VSELECT}); |
729 | |
730 | // setcc for f16x2 and bf16x2 needs special handling to prevent |
731 | // legalizer's attempt to scalarize it due to v2i1 not being legal. |
732 | if (STI.allowFP16Math() || STI.hasBF16Math()) |
733 | setTargetDAGCombine(ISD::SETCC); |
734 | |
735 | // Promote fp16 arithmetic if fp16 hardware isn't available or the |
736 | // user passed --nvptx-no-fp16-math. The flag is useful because, |
737 | // although sm_53+ GPUs have some sort of FP16 support in |
738 | // hardware, only sm_53 and sm_60 have full implementation. Others |
739 | // only have token amount of hardware and are likely to run faster |
740 | // by using fp32 units instead. |
741 | for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) { |
742 | setFP16OperationAction(Op, MVT::f16, Legal, Promote); |
743 | setFP16OperationAction(Op, MVT::v2f16, Legal, Expand); |
744 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
745 | // bf16 must be promoted to f32. |
746 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
747 | if (getOperationAction(Op, MVT::bf16) == Promote) |
748 | AddPromotedToType(Op, MVT::bf16, MVT::f32); |
749 | } |
750 | |
751 | // f16/f16x2 neg was introduced in PTX 60, SM_53. |
752 | const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 && |
753 | STI.getPTXVersion() >= 60 && |
754 | STI.allowFP16Math(); |
755 | for (const auto &VT : {MVT::f16, MVT::v2f16}) |
756 | setOperationAction(ISD::FNEG, VT, |
757 | IsFP16FP16x2NegAvailable ? Legal : Expand); |
758 | |
759 | setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand); |
760 | setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand); |
761 | // (would be) Library functions. |
762 | |
763 | // These map to conversion instructions for scalar FP types. |
764 | for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT, |
765 | ISD::FROUNDEVEN, ISD::FTRUNC}) { |
766 | setOperationAction(Op, MVT::f16, Legal); |
767 | setOperationAction(Op, MVT::f32, Legal); |
768 | setOperationAction(Op, MVT::f64, Legal); |
769 | setOperationAction(Op, MVT::v2f16, Expand); |
770 | setOperationAction(Op, MVT::v2bf16, Expand); |
771 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
772 | if (getOperationAction(Op, MVT::bf16) == Promote) |
773 | AddPromotedToType(Op, MVT::bf16, MVT::f32); |
774 | } |
775 | |
776 | if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) { |
777 | setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand); |
778 | } |
779 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
780 | for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) { |
781 | setOperationAction(ISD::FP_EXTEND, VT, Custom); |
782 | setOperationAction(ISD::FP_ROUND, VT, Custom); |
783 | } |
784 | } |
785 | |
786 | // sm_80 only has conversions between f32 and bf16. Custom lower all other |
787 | // bf16 conversions. |
788 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
789 | for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) { |
790 | setOperationAction( |
791 | {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, |
792 | VT, Custom); |
793 | } |
794 | setOperationAction( |
795 | {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, |
796 | MVT::bf16, Custom); |
797 | } |
798 | |
799 | setOperationAction(ISD::FROUND, MVT::f16, Promote); |
800 | setOperationAction(ISD::FROUND, MVT::v2f16, Expand); |
801 | setOperationAction(ISD::FROUND, MVT::v2bf16, Expand); |
802 | setOperationAction(ISD::FROUND, MVT::f32, Custom); |
803 | setOperationAction(ISD::FROUND, MVT::f64, Custom); |
804 | setOperationAction(ISD::FROUND, MVT::bf16, Promote); |
805 | AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32); |
806 | |
807 | // 'Expand' implements FCOPYSIGN without calling an external library. |
808 | setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); |
809 | setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand); |
810 | setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); |
811 | setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand); |
812 | setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); |
813 | setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); |
814 | |
815 | // These map to corresponding instructions for f32/f64. f16 must be |
816 | // promoted to f32. v2f16 is expanded to f16, which is then promoted |
817 | // to f32. |
818 | for (const auto &Op : |
819 | {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) { |
820 | setOperationAction(Op, MVT::f16, Promote); |
821 | setOperationAction(Op, MVT::f32, Legal); |
822 | setOperationAction(Op, MVT::f64, Legal); |
823 | setOperationAction(Op, MVT::v2f16, Expand); |
824 | setOperationAction(Op, MVT::v2bf16, Expand); |
825 | setOperationAction(Op, MVT::bf16, Promote); |
826 | AddPromotedToType(Op, MVT::bf16, MVT::f32); |
827 | } |
828 | for (const auto &Op : {ISD::FABS}) { |
829 | setOperationAction(Op, MVT::f16, Promote); |
830 | setOperationAction(Op, MVT::f32, Legal); |
831 | setOperationAction(Op, MVT::f64, Legal); |
832 | setOperationAction(Op, MVT::v2f16, Expand); |
833 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
834 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
835 | if (getOperationAction(Op, MVT::bf16) == Promote) |
836 | AddPromotedToType(Op, MVT::bf16, MVT::f32); |
837 | } |
838 | |
839 | // max.f16, max.f16x2 and max.NaN are supported on sm_80+. |
840 | auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) { |
841 | bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70; |
842 | return IsAtLeastSm80 ? Legal : NotSm80Action; |
843 | }; |
844 | for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) { |
845 | setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote); |
846 | setOperationAction(Op, MVT::f32, Legal); |
847 | setOperationAction(Op, MVT::f64, Legal); |
848 | setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); |
849 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
850 | setBF16OperationAction(Op, MVT::bf16, Legal, Promote); |
851 | if (getOperationAction(Op, MVT::bf16) == Promote) |
852 | AddPromotedToType(Op, MVT::bf16, MVT::f32); |
853 | } |
854 | for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) { |
855 | setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand); |
856 | setFP16OperationAction(Op, MVT::bf16, Legal, Expand); |
857 | setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand)); |
858 | setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand); |
859 | setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand); |
860 | } |
861 | |
862 | // No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate. |
863 | // No FPOW or FREM in PTX. |
864 | |
865 | // Now deduce the information based on the above mentioned |
866 | // actions |
867 | computeRegisterProperties(STI.getRegisterInfo()); |
868 | |
869 | setMinCmpXchgSizeInBits(32); |
870 | setMaxAtomicSizeInBitsSupported(64); |
871 | } |
872 | |
873 | const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { |
874 | |
875 | #define MAKE_CASE(V) \ |
876 | case V: \ |
877 | return #V; |
878 | |
879 | switch ((NVPTXISD::NodeType)Opcode) { |
880 | case NVPTXISD::FIRST_NUMBER: |
881 | break; |
882 | |
883 | MAKE_CASE(NVPTXISD::CALL) |
884 | MAKE_CASE(NVPTXISD::RET_GLUE) |
885 | MAKE_CASE(NVPTXISD::LOAD_PARAM) |
886 | MAKE_CASE(NVPTXISD::Wrapper) |
887 | MAKE_CASE(NVPTXISD::DeclareParam) |
888 | MAKE_CASE(NVPTXISD::DeclareScalarParam) |
889 | MAKE_CASE(NVPTXISD::DeclareRet) |
890 | MAKE_CASE(NVPTXISD::DeclareScalarRet) |
891 | MAKE_CASE(NVPTXISD::DeclareRetParam) |
892 | MAKE_CASE(NVPTXISD::PrintCall) |
893 | MAKE_CASE(NVPTXISD::PrintConvergentCall) |
894 | MAKE_CASE(NVPTXISD::PrintCallUni) |
895 | MAKE_CASE(NVPTXISD::PrintConvergentCallUni) |
896 | MAKE_CASE(NVPTXISD::LoadParam) |
897 | MAKE_CASE(NVPTXISD::LoadParamV2) |
898 | MAKE_CASE(NVPTXISD::LoadParamV4) |
899 | MAKE_CASE(NVPTXISD::StoreParam) |
900 | MAKE_CASE(NVPTXISD::StoreParamV2) |
901 | MAKE_CASE(NVPTXISD::StoreParamV4) |
902 | MAKE_CASE(NVPTXISD::StoreParamS32) |
903 | MAKE_CASE(NVPTXISD::StoreParamU32) |
904 | MAKE_CASE(NVPTXISD::CallArgBegin) |
905 | MAKE_CASE(NVPTXISD::CallArg) |
906 | MAKE_CASE(NVPTXISD::LastCallArg) |
907 | MAKE_CASE(NVPTXISD::CallArgEnd) |
908 | MAKE_CASE(NVPTXISD::CallVoid) |
909 | MAKE_CASE(NVPTXISD::CallVal) |
910 | MAKE_CASE(NVPTXISD::CallSymbol) |
911 | MAKE_CASE(NVPTXISD::Prototype) |
912 | MAKE_CASE(NVPTXISD::MoveParam) |
913 | MAKE_CASE(NVPTXISD::StoreRetval) |
914 | MAKE_CASE(NVPTXISD::StoreRetvalV2) |
915 | MAKE_CASE(NVPTXISD::StoreRetvalV4) |
916 | MAKE_CASE(NVPTXISD::PseudoUseParam) |
917 | MAKE_CASE(NVPTXISD::RETURN) |
918 | MAKE_CASE(NVPTXISD::CallSeqBegin) |
919 | MAKE_CASE(NVPTXISD::CallSeqEnd) |
920 | MAKE_CASE(NVPTXISD::CallPrototype) |
921 | MAKE_CASE(NVPTXISD::ProxyReg) |
922 | MAKE_CASE(NVPTXISD::LoadV2) |
923 | MAKE_CASE(NVPTXISD::LoadV4) |
924 | MAKE_CASE(NVPTXISD::LDGV2) |
925 | MAKE_CASE(NVPTXISD::LDGV4) |
926 | MAKE_CASE(NVPTXISD::LDUV2) |
927 | MAKE_CASE(NVPTXISD::LDUV4) |
928 | MAKE_CASE(NVPTXISD::StoreV2) |
929 | MAKE_CASE(NVPTXISD::StoreV4) |
930 | MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP) |
931 | MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP) |
932 | MAKE_CASE(NVPTXISD::IMAD) |
933 | MAKE_CASE(NVPTXISD::BFE) |
934 | MAKE_CASE(NVPTXISD::BFI) |
935 | MAKE_CASE(NVPTXISD::PRMT) |
936 | MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC) |
937 | MAKE_CASE(NVPTXISD::SETP_F16X2) |
938 | MAKE_CASE(NVPTXISD::SETP_BF16X2) |
939 | MAKE_CASE(NVPTXISD::Dummy) |
940 | MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) |
941 | MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) |
942 | MAKE_CASE(NVPTXISD::Tex1DFloatS32) |
943 | MAKE_CASE(NVPTXISD::Tex1DFloatFloat) |
944 | MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) |
945 | MAKE_CASE(NVPTXISD::Tex1DFloatFloatGrad) |
946 | MAKE_CASE(NVPTXISD::Tex1DS32S32) |
947 | MAKE_CASE(NVPTXISD::Tex1DS32Float) |
948 | MAKE_CASE(NVPTXISD::Tex1DS32FloatLevel) |
949 | MAKE_CASE(NVPTXISD::Tex1DS32FloatGrad) |
950 | MAKE_CASE(NVPTXISD::Tex1DU32S32) |
951 | MAKE_CASE(NVPTXISD::Tex1DU32Float) |
952 | MAKE_CASE(NVPTXISD::Tex1DU32FloatLevel) |
953 | MAKE_CASE(NVPTXISD::Tex1DU32FloatGrad) |
954 | MAKE_CASE(NVPTXISD::Tex1DArrayFloatS32) |
955 | MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloat) |
956 | MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatLevel) |
957 | MAKE_CASE(NVPTXISD::Tex1DArrayFloatFloatGrad) |
958 | MAKE_CASE(NVPTXISD::Tex1DArrayS32S32) |
959 | MAKE_CASE(NVPTXISD::Tex1DArrayS32Float) |
960 | MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatLevel) |
961 | MAKE_CASE(NVPTXISD::Tex1DArrayS32FloatGrad) |
962 | MAKE_CASE(NVPTXISD::Tex1DArrayU32S32) |
963 | MAKE_CASE(NVPTXISD::Tex1DArrayU32Float) |
964 | MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatLevel) |
965 | MAKE_CASE(NVPTXISD::Tex1DArrayU32FloatGrad) |
966 | MAKE_CASE(NVPTXISD::Tex2DFloatS32) |
967 | MAKE_CASE(NVPTXISD::Tex2DFloatFloat) |
968 | MAKE_CASE(NVPTXISD::Tex2DFloatFloatLevel) |
969 | MAKE_CASE(NVPTXISD::Tex2DFloatFloatGrad) |
970 | MAKE_CASE(NVPTXISD::Tex2DS32S32) |
971 | MAKE_CASE(NVPTXISD::Tex2DS32Float) |
972 | MAKE_CASE(NVPTXISD::Tex2DS32FloatLevel) |
973 | MAKE_CASE(NVPTXISD::Tex2DS32FloatGrad) |
974 | MAKE_CASE(NVPTXISD::Tex2DU32S32) |
975 | MAKE_CASE(NVPTXISD::Tex2DU32Float) |
976 | MAKE_CASE(NVPTXISD::Tex2DU32FloatLevel) |
977 | MAKE_CASE(NVPTXISD::Tex2DU32FloatGrad) |
978 | MAKE_CASE(NVPTXISD::Tex2DArrayFloatS32) |
979 | MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloat) |
980 | MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatLevel) |
981 | MAKE_CASE(NVPTXISD::Tex2DArrayFloatFloatGrad) |
982 | MAKE_CASE(NVPTXISD::Tex2DArrayS32S32) |
983 | MAKE_CASE(NVPTXISD::Tex2DArrayS32Float) |
984 | MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatLevel) |
985 | MAKE_CASE(NVPTXISD::Tex2DArrayS32FloatGrad) |
986 | MAKE_CASE(NVPTXISD::Tex2DArrayU32S32) |
987 | MAKE_CASE(NVPTXISD::Tex2DArrayU32Float) |
988 | MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatLevel) |
989 | MAKE_CASE(NVPTXISD::Tex2DArrayU32FloatGrad) |
990 | MAKE_CASE(NVPTXISD::Tex3DFloatS32) |
991 | MAKE_CASE(NVPTXISD::Tex3DFloatFloat) |
992 | MAKE_CASE(NVPTXISD::Tex3DFloatFloatLevel) |
993 | MAKE_CASE(NVPTXISD::Tex3DFloatFloatGrad) |
994 | MAKE_CASE(NVPTXISD::Tex3DS32S32) |
995 | MAKE_CASE(NVPTXISD::Tex3DS32Float) |
996 | MAKE_CASE(NVPTXISD::Tex3DS32FloatLevel) |
997 | MAKE_CASE(NVPTXISD::Tex3DS32FloatGrad) |
998 | MAKE_CASE(NVPTXISD::Tex3DU32S32) |
999 | MAKE_CASE(NVPTXISD::Tex3DU32Float) |
1000 | MAKE_CASE(NVPTXISD::Tex3DU32FloatLevel) |
1001 | MAKE_CASE(NVPTXISD::Tex3DU32FloatGrad) |
1002 | MAKE_CASE(NVPTXISD::TexCubeFloatFloat) |
1003 | MAKE_CASE(NVPTXISD::TexCubeFloatFloatLevel) |
1004 | MAKE_CASE(NVPTXISD::TexCubeS32Float) |
1005 | MAKE_CASE(NVPTXISD::TexCubeS32FloatLevel) |
1006 | MAKE_CASE(NVPTXISD::TexCubeU32Float) |
1007 | MAKE_CASE(NVPTXISD::TexCubeU32FloatLevel) |
1008 | MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloat) |
1009 | MAKE_CASE(NVPTXISD::TexCubeArrayFloatFloatLevel) |
1010 | MAKE_CASE(NVPTXISD::TexCubeArrayS32Float) |
1011 | MAKE_CASE(NVPTXISD::TexCubeArrayS32FloatLevel) |
1012 | MAKE_CASE(NVPTXISD::TexCubeArrayU32Float) |
1013 | MAKE_CASE(NVPTXISD::TexCubeArrayU32FloatLevel) |
1014 | MAKE_CASE(NVPTXISD::Tld4R2DFloatFloat) |
1015 | MAKE_CASE(NVPTXISD::Tld4G2DFloatFloat) |
1016 | MAKE_CASE(NVPTXISD::Tld4B2DFloatFloat) |
1017 | MAKE_CASE(NVPTXISD::Tld4A2DFloatFloat) |
1018 | MAKE_CASE(NVPTXISD::Tld4R2DS64Float) |
1019 | MAKE_CASE(NVPTXISD::Tld4G2DS64Float) |
1020 | MAKE_CASE(NVPTXISD::Tld4B2DS64Float) |
1021 | MAKE_CASE(NVPTXISD::Tld4A2DS64Float) |
1022 | MAKE_CASE(NVPTXISD::Tld4R2DU64Float) |
1023 | MAKE_CASE(NVPTXISD::Tld4G2DU64Float) |
1024 | MAKE_CASE(NVPTXISD::Tld4B2DU64Float) |
1025 | MAKE_CASE(NVPTXISD::Tld4A2DU64Float) |
1026 | |
1027 | MAKE_CASE(NVPTXISD::TexUnified1DFloatS32) |
1028 | MAKE_CASE(NVPTXISD::TexUnified1DFloatFloat) |
1029 | MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatLevel) |
1030 | MAKE_CASE(NVPTXISD::TexUnified1DFloatFloatGrad) |
1031 | MAKE_CASE(NVPTXISD::TexUnified1DS32S32) |
1032 | MAKE_CASE(NVPTXISD::TexUnified1DS32Float) |
1033 | MAKE_CASE(NVPTXISD::TexUnified1DS32FloatLevel) |
1034 | MAKE_CASE(NVPTXISD::TexUnified1DS32FloatGrad) |
1035 | MAKE_CASE(NVPTXISD::TexUnified1DU32S32) |
1036 | MAKE_CASE(NVPTXISD::TexUnified1DU32Float) |
1037 | MAKE_CASE(NVPTXISD::TexUnified1DU32FloatLevel) |
1038 | MAKE_CASE(NVPTXISD::TexUnified1DU32FloatGrad) |
1039 | MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatS32) |
1040 | MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloat) |
1041 | MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatLevel) |
1042 | MAKE_CASE(NVPTXISD::TexUnified1DArrayFloatFloatGrad) |
1043 | MAKE_CASE(NVPTXISD::TexUnified1DArrayS32S32) |
1044 | MAKE_CASE(NVPTXISD::TexUnified1DArrayS32Float) |
1045 | MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatLevel) |
1046 | MAKE_CASE(NVPTXISD::TexUnified1DArrayS32FloatGrad) |
1047 | MAKE_CASE(NVPTXISD::TexUnified1DArrayU32S32) |
1048 | MAKE_CASE(NVPTXISD::TexUnified1DArrayU32Float) |
1049 | MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatLevel) |
1050 | MAKE_CASE(NVPTXISD::TexUnified1DArrayU32FloatGrad) |
1051 | MAKE_CASE(NVPTXISD::TexUnified2DFloatS32) |
1052 | MAKE_CASE(NVPTXISD::TexUnified2DFloatFloat) |
1053 | MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatLevel) |
1054 | MAKE_CASE(NVPTXISD::TexUnified2DFloatFloatGrad) |
1055 | MAKE_CASE(NVPTXISD::TexUnified2DS32S32) |
1056 | MAKE_CASE(NVPTXISD::TexUnified2DS32Float) |
1057 | MAKE_CASE(NVPTXISD::TexUnified2DS32FloatLevel) |
1058 | MAKE_CASE(NVPTXISD::TexUnified2DS32FloatGrad) |
1059 | MAKE_CASE(NVPTXISD::TexUnified2DU32S32) |
1060 | MAKE_CASE(NVPTXISD::TexUnified2DU32Float) |
1061 | MAKE_CASE(NVPTXISD::TexUnified2DU32FloatLevel) |
1062 | MAKE_CASE(NVPTXISD::TexUnified2DU32FloatGrad) |
1063 | MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatS32) |
1064 | MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloat) |
1065 | MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatLevel) |
1066 | MAKE_CASE(NVPTXISD::TexUnified2DArrayFloatFloatGrad) |
1067 | MAKE_CASE(NVPTXISD::TexUnified2DArrayS32S32) |
1068 | MAKE_CASE(NVPTXISD::TexUnified2DArrayS32Float) |
1069 | MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatLevel) |
1070 | MAKE_CASE(NVPTXISD::TexUnified2DArrayS32FloatGrad) |
1071 | MAKE_CASE(NVPTXISD::TexUnified2DArrayU32S32) |
1072 | MAKE_CASE(NVPTXISD::TexUnified2DArrayU32Float) |
1073 | MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatLevel) |
1074 | MAKE_CASE(NVPTXISD::TexUnified2DArrayU32FloatGrad) |
1075 | MAKE_CASE(NVPTXISD::TexUnified3DFloatS32) |
1076 | MAKE_CASE(NVPTXISD::TexUnified3DFloatFloat) |
1077 | MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatLevel) |
1078 | MAKE_CASE(NVPTXISD::TexUnified3DFloatFloatGrad) |
1079 | MAKE_CASE(NVPTXISD::TexUnified3DS32S32) |
1080 | MAKE_CASE(NVPTXISD::TexUnified3DS32Float) |
1081 | MAKE_CASE(NVPTXISD::TexUnified3DS32FloatLevel) |
1082 | MAKE_CASE(NVPTXISD::TexUnified3DS32FloatGrad) |
1083 | MAKE_CASE(NVPTXISD::TexUnified3DU32S32) |
1084 | MAKE_CASE(NVPTXISD::TexUnified3DU32Float) |
1085 | MAKE_CASE(NVPTXISD::TexUnified3DU32FloatLevel) |
1086 | MAKE_CASE(NVPTXISD::TexUnified3DU32FloatGrad) |
1087 | MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloat) |
1088 | MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatLevel) |
1089 | MAKE_CASE(NVPTXISD::TexUnifiedCubeS32Float) |
1090 | MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatLevel) |
1091 | MAKE_CASE(NVPTXISD::TexUnifiedCubeU32Float) |
1092 | MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatLevel) |
1093 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloat) |
1094 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel) |
1095 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32Float) |
1096 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatLevel) |
1097 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32Float) |
1098 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatLevel) |
1099 | MAKE_CASE(NVPTXISD::TexUnifiedCubeFloatFloatGrad) |
1100 | MAKE_CASE(NVPTXISD::TexUnifiedCubeS32FloatGrad) |
1101 | MAKE_CASE(NVPTXISD::TexUnifiedCubeU32FloatGrad) |
1102 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad) |
1103 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayS32FloatGrad) |
1104 | MAKE_CASE(NVPTXISD::TexUnifiedCubeArrayU32FloatGrad) |
1105 | MAKE_CASE(NVPTXISD::Tld4UnifiedR2DFloatFloat) |
1106 | MAKE_CASE(NVPTXISD::Tld4UnifiedG2DFloatFloat) |
1107 | MAKE_CASE(NVPTXISD::Tld4UnifiedB2DFloatFloat) |
1108 | MAKE_CASE(NVPTXISD::Tld4UnifiedA2DFloatFloat) |
1109 | MAKE_CASE(NVPTXISD::Tld4UnifiedR2DS64Float) |
1110 | MAKE_CASE(NVPTXISD::Tld4UnifiedG2DS64Float) |
1111 | MAKE_CASE(NVPTXISD::Tld4UnifiedB2DS64Float) |
1112 | MAKE_CASE(NVPTXISD::Tld4UnifiedA2DS64Float) |
1113 | MAKE_CASE(NVPTXISD::Tld4UnifiedR2DU64Float) |
1114 | MAKE_CASE(NVPTXISD::Tld4UnifiedG2DU64Float) |
1115 | MAKE_CASE(NVPTXISD::Tld4UnifiedB2DU64Float) |
1116 | MAKE_CASE(NVPTXISD::Tld4UnifiedA2DU64Float) |
1117 | |
1118 | MAKE_CASE(NVPTXISD::Suld1DI8Clamp) |
1119 | MAKE_CASE(NVPTXISD::Suld1DI16Clamp) |
1120 | MAKE_CASE(NVPTXISD::Suld1DI32Clamp) |
1121 | MAKE_CASE(NVPTXISD::Suld1DI64Clamp) |
1122 | MAKE_CASE(NVPTXISD::Suld1DV2I8Clamp) |
1123 | MAKE_CASE(NVPTXISD::Suld1DV2I16Clamp) |
1124 | MAKE_CASE(NVPTXISD::Suld1DV2I32Clamp) |
1125 | MAKE_CASE(NVPTXISD::Suld1DV2I64Clamp) |
1126 | MAKE_CASE(NVPTXISD::Suld1DV4I8Clamp) |
1127 | MAKE_CASE(NVPTXISD::Suld1DV4I16Clamp) |
1128 | MAKE_CASE(NVPTXISD::Suld1DV4I32Clamp) |
1129 | |
1130 | MAKE_CASE(NVPTXISD::Suld1DArrayI8Clamp) |
1131 | MAKE_CASE(NVPTXISD::Suld1DArrayI16Clamp) |
1132 | MAKE_CASE(NVPTXISD::Suld1DArrayI32Clamp) |
1133 | MAKE_CASE(NVPTXISD::Suld1DArrayI64Clamp) |
1134 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Clamp) |
1135 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Clamp) |
1136 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Clamp) |
1137 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Clamp) |
1138 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Clamp) |
1139 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Clamp) |
1140 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Clamp) |
1141 | |
1142 | MAKE_CASE(NVPTXISD::Suld2DI8Clamp) |
1143 | MAKE_CASE(NVPTXISD::Suld2DI16Clamp) |
1144 | MAKE_CASE(NVPTXISD::Suld2DI32Clamp) |
1145 | MAKE_CASE(NVPTXISD::Suld2DI64Clamp) |
1146 | MAKE_CASE(NVPTXISD::Suld2DV2I8Clamp) |
1147 | MAKE_CASE(NVPTXISD::Suld2DV2I16Clamp) |
1148 | MAKE_CASE(NVPTXISD::Suld2DV2I32Clamp) |
1149 | MAKE_CASE(NVPTXISD::Suld2DV2I64Clamp) |
1150 | MAKE_CASE(NVPTXISD::Suld2DV4I8Clamp) |
1151 | MAKE_CASE(NVPTXISD::Suld2DV4I16Clamp) |
1152 | MAKE_CASE(NVPTXISD::Suld2DV4I32Clamp) |
1153 | |
1154 | MAKE_CASE(NVPTXISD::Suld2DArrayI8Clamp) |
1155 | MAKE_CASE(NVPTXISD::Suld2DArrayI16Clamp) |
1156 | MAKE_CASE(NVPTXISD::Suld2DArrayI32Clamp) |
1157 | MAKE_CASE(NVPTXISD::Suld2DArrayI64Clamp) |
1158 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Clamp) |
1159 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Clamp) |
1160 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Clamp) |
1161 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Clamp) |
1162 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Clamp) |
1163 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Clamp) |
1164 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Clamp) |
1165 | |
1166 | MAKE_CASE(NVPTXISD::Suld3DI8Clamp) |
1167 | MAKE_CASE(NVPTXISD::Suld3DI16Clamp) |
1168 | MAKE_CASE(NVPTXISD::Suld3DI32Clamp) |
1169 | MAKE_CASE(NVPTXISD::Suld3DI64Clamp) |
1170 | MAKE_CASE(NVPTXISD::Suld3DV2I8Clamp) |
1171 | MAKE_CASE(NVPTXISD::Suld3DV2I16Clamp) |
1172 | MAKE_CASE(NVPTXISD::Suld3DV2I32Clamp) |
1173 | MAKE_CASE(NVPTXISD::Suld3DV2I64Clamp) |
1174 | MAKE_CASE(NVPTXISD::Suld3DV4I8Clamp) |
1175 | MAKE_CASE(NVPTXISD::Suld3DV4I16Clamp) |
1176 | MAKE_CASE(NVPTXISD::Suld3DV4I32Clamp) |
1177 | |
1178 | MAKE_CASE(NVPTXISD::Suld1DI8Trap) |
1179 | MAKE_CASE(NVPTXISD::Suld1DI16Trap) |
1180 | MAKE_CASE(NVPTXISD::Suld1DI32Trap) |
1181 | MAKE_CASE(NVPTXISD::Suld1DI64Trap) |
1182 | MAKE_CASE(NVPTXISD::Suld1DV2I8Trap) |
1183 | MAKE_CASE(NVPTXISD::Suld1DV2I16Trap) |
1184 | MAKE_CASE(NVPTXISD::Suld1DV2I32Trap) |
1185 | MAKE_CASE(NVPTXISD::Suld1DV2I64Trap) |
1186 | MAKE_CASE(NVPTXISD::Suld1DV4I8Trap) |
1187 | MAKE_CASE(NVPTXISD::Suld1DV4I16Trap) |
1188 | MAKE_CASE(NVPTXISD::Suld1DV4I32Trap) |
1189 | |
1190 | MAKE_CASE(NVPTXISD::Suld1DArrayI8Trap) |
1191 | MAKE_CASE(NVPTXISD::Suld1DArrayI16Trap) |
1192 | MAKE_CASE(NVPTXISD::Suld1DArrayI32Trap) |
1193 | MAKE_CASE(NVPTXISD::Suld1DArrayI64Trap) |
1194 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Trap) |
1195 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Trap) |
1196 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Trap) |
1197 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Trap) |
1198 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Trap) |
1199 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Trap) |
1200 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Trap) |
1201 | |
1202 | MAKE_CASE(NVPTXISD::Suld2DI8Trap) |
1203 | MAKE_CASE(NVPTXISD::Suld2DI16Trap) |
1204 | MAKE_CASE(NVPTXISD::Suld2DI32Trap) |
1205 | MAKE_CASE(NVPTXISD::Suld2DI64Trap) |
1206 | MAKE_CASE(NVPTXISD::Suld2DV2I8Trap) |
1207 | MAKE_CASE(NVPTXISD::Suld2DV2I16Trap) |
1208 | MAKE_CASE(NVPTXISD::Suld2DV2I32Trap) |
1209 | MAKE_CASE(NVPTXISD::Suld2DV2I64Trap) |
1210 | MAKE_CASE(NVPTXISD::Suld2DV4I8Trap) |
1211 | MAKE_CASE(NVPTXISD::Suld2DV4I16Trap) |
1212 | MAKE_CASE(NVPTXISD::Suld2DV4I32Trap) |
1213 | |
1214 | MAKE_CASE(NVPTXISD::Suld2DArrayI8Trap) |
1215 | MAKE_CASE(NVPTXISD::Suld2DArrayI16Trap) |
1216 | MAKE_CASE(NVPTXISD::Suld2DArrayI32Trap) |
1217 | MAKE_CASE(NVPTXISD::Suld2DArrayI64Trap) |
1218 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Trap) |
1219 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Trap) |
1220 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Trap) |
1221 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Trap) |
1222 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Trap) |
1223 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Trap) |
1224 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Trap) |
1225 | |
1226 | MAKE_CASE(NVPTXISD::Suld3DI8Trap) |
1227 | MAKE_CASE(NVPTXISD::Suld3DI16Trap) |
1228 | MAKE_CASE(NVPTXISD::Suld3DI32Trap) |
1229 | MAKE_CASE(NVPTXISD::Suld3DI64Trap) |
1230 | MAKE_CASE(NVPTXISD::Suld3DV2I8Trap) |
1231 | MAKE_CASE(NVPTXISD::Suld3DV2I16Trap) |
1232 | MAKE_CASE(NVPTXISD::Suld3DV2I32Trap) |
1233 | MAKE_CASE(NVPTXISD::Suld3DV2I64Trap) |
1234 | MAKE_CASE(NVPTXISD::Suld3DV4I8Trap) |
1235 | MAKE_CASE(NVPTXISD::Suld3DV4I16Trap) |
1236 | MAKE_CASE(NVPTXISD::Suld3DV4I32Trap) |
1237 | |
1238 | MAKE_CASE(NVPTXISD::Suld1DI8Zero) |
1239 | MAKE_CASE(NVPTXISD::Suld1DI16Zero) |
1240 | MAKE_CASE(NVPTXISD::Suld1DI32Zero) |
1241 | MAKE_CASE(NVPTXISD::Suld1DI64Zero) |
1242 | MAKE_CASE(NVPTXISD::Suld1DV2I8Zero) |
1243 | MAKE_CASE(NVPTXISD::Suld1DV2I16Zero) |
1244 | MAKE_CASE(NVPTXISD::Suld1DV2I32Zero) |
1245 | MAKE_CASE(NVPTXISD::Suld1DV2I64Zero) |
1246 | MAKE_CASE(NVPTXISD::Suld1DV4I8Zero) |
1247 | MAKE_CASE(NVPTXISD::Suld1DV4I16Zero) |
1248 | MAKE_CASE(NVPTXISD::Suld1DV4I32Zero) |
1249 | |
1250 | MAKE_CASE(NVPTXISD::Suld1DArrayI8Zero) |
1251 | MAKE_CASE(NVPTXISD::Suld1DArrayI16Zero) |
1252 | MAKE_CASE(NVPTXISD::Suld1DArrayI32Zero) |
1253 | MAKE_CASE(NVPTXISD::Suld1DArrayI64Zero) |
1254 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I8Zero) |
1255 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I16Zero) |
1256 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I32Zero) |
1257 | MAKE_CASE(NVPTXISD::Suld1DArrayV2I64Zero) |
1258 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I8Zero) |
1259 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I16Zero) |
1260 | MAKE_CASE(NVPTXISD::Suld1DArrayV4I32Zero) |
1261 | |
1262 | MAKE_CASE(NVPTXISD::Suld2DI8Zero) |
1263 | MAKE_CASE(NVPTXISD::Suld2DI16Zero) |
1264 | MAKE_CASE(NVPTXISD::Suld2DI32Zero) |
1265 | MAKE_CASE(NVPTXISD::Suld2DI64Zero) |
1266 | MAKE_CASE(NVPTXISD::Suld2DV2I8Zero) |
1267 | MAKE_CASE(NVPTXISD::Suld2DV2I16Zero) |
1268 | MAKE_CASE(NVPTXISD::Suld2DV2I32Zero) |
1269 | MAKE_CASE(NVPTXISD::Suld2DV2I64Zero) |
1270 | MAKE_CASE(NVPTXISD::Suld2DV4I8Zero) |
1271 | MAKE_CASE(NVPTXISD::Suld2DV4I16Zero) |
1272 | MAKE_CASE(NVPTXISD::Suld2DV4I32Zero) |
1273 | |
1274 | MAKE_CASE(NVPTXISD::Suld2DArrayI8Zero) |
1275 | MAKE_CASE(NVPTXISD::Suld2DArrayI16Zero) |
1276 | MAKE_CASE(NVPTXISD::Suld2DArrayI32Zero) |
1277 | MAKE_CASE(NVPTXISD::Suld2DArrayI64Zero) |
1278 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I8Zero) |
1279 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I16Zero) |
1280 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I32Zero) |
1281 | MAKE_CASE(NVPTXISD::Suld2DArrayV2I64Zero) |
1282 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I8Zero) |
1283 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I16Zero) |
1284 | MAKE_CASE(NVPTXISD::Suld2DArrayV4I32Zero) |
1285 | |
1286 | MAKE_CASE(NVPTXISD::Suld3DI8Zero) |
1287 | MAKE_CASE(NVPTXISD::Suld3DI16Zero) |
1288 | MAKE_CASE(NVPTXISD::Suld3DI32Zero) |
1289 | MAKE_CASE(NVPTXISD::Suld3DI64Zero) |
1290 | MAKE_CASE(NVPTXISD::Suld3DV2I8Zero) |
1291 | MAKE_CASE(NVPTXISD::Suld3DV2I16Zero) |
1292 | MAKE_CASE(NVPTXISD::Suld3DV2I32Zero) |
1293 | MAKE_CASE(NVPTXISD::Suld3DV2I64Zero) |
1294 | MAKE_CASE(NVPTXISD::Suld3DV4I8Zero) |
1295 | MAKE_CASE(NVPTXISD::Suld3DV4I16Zero) |
1296 | MAKE_CASE(NVPTXISD::Suld3DV4I32Zero) |
1297 | } |
1298 | return nullptr; |
1299 | |
1300 | #undef MAKE_CASE |
1301 | } |
1302 | |
1303 | TargetLoweringBase::LegalizeTypeAction |
1304 | NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const { |
1305 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && |
1306 | VT.getScalarType() == MVT::i1) |
1307 | return TypeSplitVector; |
1308 | if (Isv2x16VT(VT)) |
1309 | return TypeLegal; |
1310 | return TargetLoweringBase::getPreferredVectorAction(VT); |
1311 | } |
1312 | |
1313 | SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, |
1314 | int Enabled, int &, |
1315 | bool &UseOneConst, |
1316 | bool Reciprocal) const { |
1317 | if (!(Enabled == ReciprocalEstimate::Enabled || |
1318 | (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32()))) |
1319 | return SDValue(); |
1320 | |
1321 | if (ExtraSteps == ReciprocalEstimate::Unspecified) |
1322 | ExtraSteps = 0; |
1323 | |
1324 | SDLoc DL(Operand); |
1325 | EVT VT = Operand.getValueType(); |
1326 | bool Ftz = useF32FTZ(MF: DAG.getMachineFunction()); |
1327 | |
1328 | auto MakeIntrinsicCall = [&](Intrinsic::ID IID) { |
1329 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, |
1330 | DAG.getConstant(IID, DL, MVT::i32), Operand); |
1331 | }; |
1332 | |
1333 | // The sqrt and rsqrt refinement processes assume we always start out with an |
1334 | // approximation of the rsqrt. Therefore, if we're going to do any refinement |
1335 | // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing |
1336 | // any refinement, we must return a regular sqrt. |
1337 | if (Reciprocal || ExtraSteps > 0) { |
1338 | if (VT == MVT::f32) |
1339 | return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f |
1340 | : Intrinsic::nvvm_rsqrt_approx_f); |
1341 | else if (VT == MVT::f64) |
1342 | return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d); |
1343 | else |
1344 | return SDValue(); |
1345 | } else { |
1346 | if (VT == MVT::f32) |
1347 | return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f |
1348 | : Intrinsic::nvvm_sqrt_approx_f); |
1349 | else { |
1350 | // There's no sqrt.approx.f64 instruction, so we emit |
1351 | // reciprocal(rsqrt(x)). This is faster than |
1352 | // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain |
1353 | // x * rsqrt(x).) |
1354 | return DAG.getNode( |
1355 | ISD::INTRINSIC_WO_CHAIN, DL, VT, |
1356 | DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32), |
1357 | MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d)); |
1358 | } |
1359 | } |
1360 | } |
1361 | |
1362 | SDValue |
1363 | NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { |
1364 | SDLoc dl(Op); |
1365 | const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Val&: Op); |
1366 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout(), AS: GAN->getAddressSpace()); |
1367 | Op = DAG.getTargetGlobalAddress(GV: GAN->getGlobal(), DL: dl, VT: PtrVT); |
1368 | return DAG.getNode(Opcode: NVPTXISD::Wrapper, DL: dl, VT: PtrVT, Operand: Op); |
1369 | } |
1370 | |
1371 | static bool IsTypePassedAsArray(const Type *Ty) { |
1372 | return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(Bitwidth: 128) || |
1373 | Ty->isHalfTy() || Ty->isBFloatTy(); |
1374 | } |
1375 | |
1376 | std::string NVPTXTargetLowering::getPrototype( |
1377 | const DataLayout &DL, Type *retTy, const ArgListTy &Args, |
1378 | const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment, |
1379 | std::optional<std::pair<unsigned, const APInt &>> VAInfo, |
1380 | const CallBase &CB, unsigned UniqueCallSite) const { |
1381 | auto PtrVT = getPointerTy(DL); |
1382 | |
1383 | bool isABI = (STI.getSmVersion() >= 20); |
1384 | assert(isABI && "Non-ABI compilation is not supported" ); |
1385 | if (!isABI) |
1386 | return "" ; |
1387 | |
1388 | std::string Prototype; |
1389 | raw_string_ostream O(Prototype); |
1390 | O << "prototype_" << UniqueCallSite << " : .callprototype " ; |
1391 | |
1392 | if (retTy->getTypeID() == Type::VoidTyID) { |
1393 | O << "()" ; |
1394 | } else { |
1395 | O << "(" ; |
1396 | if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && |
1397 | !IsTypePassedAsArray(Ty: retTy)) { |
1398 | unsigned size = 0; |
1399 | if (auto *ITy = dyn_cast<IntegerType>(Val: retTy)) { |
1400 | size = ITy->getBitWidth(); |
1401 | } else { |
1402 | assert(retTy->isFloatingPointTy() && |
1403 | "Floating point type expected here" ); |
1404 | size = retTy->getPrimitiveSizeInBits(); |
1405 | } |
1406 | // PTX ABI requires all scalar return values to be at least 32 |
1407 | // bits in size. fp16 normally uses .b16 as its storage type in |
1408 | // PTX, so its size must be adjusted here, too. |
1409 | size = promoteScalarArgumentSize(size); |
1410 | |
1411 | O << ".param .b" << size << " _" ; |
1412 | } else if (isa<PointerType>(Val: retTy)) { |
1413 | O << ".param .b" << PtrVT.getSizeInBits() << " _" ; |
1414 | } else if (IsTypePassedAsArray(Ty: retTy)) { |
1415 | O << ".param .align " << (retAlignment ? retAlignment->value() : 0) |
1416 | << " .b8 _[" << DL.getTypeAllocSize(Ty: retTy) << "]" ; |
1417 | } else { |
1418 | llvm_unreachable("Unknown return type" ); |
1419 | } |
1420 | O << ") " ; |
1421 | } |
1422 | O << "_ (" ; |
1423 | |
1424 | bool first = true; |
1425 | |
1426 | const Function *F = CB.getFunction(); |
1427 | unsigned NumArgs = VAInfo ? VAInfo->first : Args.size(); |
1428 | for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) { |
1429 | Type *Ty = Args[i].Ty; |
1430 | if (!first) { |
1431 | O << ", " ; |
1432 | } |
1433 | first = false; |
1434 | |
1435 | if (!Outs[OIdx].Flags.isByVal()) { |
1436 | if (IsTypePassedAsArray(Ty)) { |
1437 | unsigned ParamAlign = 0; |
1438 | const CallInst *CallI = cast<CallInst>(Val: &CB); |
1439 | // +1 because index 0 is reserved for return type alignment |
1440 | if (!getAlign(*CallI, index: i + 1, ParamAlign)) |
1441 | ParamAlign = getFunctionParamOptimizedAlign(F, ArgTy: Ty, DL).value(); |
1442 | O << ".param .align " << ParamAlign << " .b8 " ; |
1443 | O << "_" ; |
1444 | O << "[" << DL.getTypeAllocSize(Ty) << "]" ; |
1445 | // update the index for Outs |
1446 | SmallVector<EVT, 16> vtparts; |
1447 | ComputeValueVTs(TLI: *this, DL, Ty, ValueVTs&: vtparts); |
1448 | if (unsigned len = vtparts.size()) |
1449 | OIdx += len - 1; |
1450 | continue; |
1451 | } |
1452 | // i8 types in IR will be i16 types in SDAG |
1453 | assert((getValueType(DL, Ty) == Outs[OIdx].VT || |
1454 | (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) && |
1455 | "type mismatch between callee prototype and arguments" ); |
1456 | // scalar type |
1457 | unsigned sz = 0; |
1458 | if (isa<IntegerType>(Val: Ty)) { |
1459 | sz = cast<IntegerType>(Val: Ty)->getBitWidth(); |
1460 | sz = promoteScalarArgumentSize(size: sz); |
1461 | } else if (isa<PointerType>(Val: Ty)) { |
1462 | sz = PtrVT.getSizeInBits(); |
1463 | } else { |
1464 | sz = Ty->getPrimitiveSizeInBits(); |
1465 | } |
1466 | O << ".param .b" << sz << " " ; |
1467 | O << "_" ; |
1468 | continue; |
1469 | } |
1470 | |
1471 | Type *ETy = Args[i].IndirectType; |
1472 | Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); |
1473 | Align ParamByValAlign = |
1474 | getFunctionByValParamAlign(F, ArgTy: ETy, InitialAlign, DL); |
1475 | |
1476 | O << ".param .align " << ParamByValAlign.value() << " .b8 " ; |
1477 | O << "_" ; |
1478 | O << "[" << Outs[OIdx].Flags.getByValSize() << "]" ; |
1479 | } |
1480 | |
1481 | if (VAInfo) |
1482 | O << (first ? "" : "," ) << " .param .align " << VAInfo->second |
1483 | << " .b8 _[]\n" ; |
1484 | O << ")" ; |
1485 | if (shouldEmitPTXNoReturn(V: &CB, TM: *nvTM)) |
1486 | O << " .noreturn" ; |
1487 | O << ";" ; |
1488 | |
1489 | return Prototype; |
1490 | } |
1491 | |
1492 | Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty, |
1493 | unsigned Idx, |
1494 | const DataLayout &DL) const { |
1495 | if (!CB) { |
1496 | // CallSite is zero, fallback to ABI type alignment |
1497 | return DL.getABITypeAlign(Ty); |
1498 | } |
1499 | |
1500 | unsigned Alignment = 0; |
1501 | const Function *DirectCallee = CB->getCalledFunction(); |
1502 | |
1503 | if (!DirectCallee) { |
1504 | // We don't have a direct function symbol, but that may be because of |
1505 | // constant cast instructions in the call. |
1506 | |
1507 | // With bitcast'd call targets, the instruction will be the call |
1508 | if (const auto *CI = dyn_cast<CallInst>(Val: CB)) { |
1509 | // Check if we have call alignment metadata |
1510 | if (getAlign(*CI, index: Idx, Alignment)) |
1511 | return Align(Alignment); |
1512 | } |
1513 | DirectCallee = getMaybeBitcastedCallee(CB); |
1514 | } |
1515 | |
1516 | // Check for function alignment information if we found that the |
1517 | // ultimate target is a Function |
1518 | if (DirectCallee) { |
1519 | if (getAlign(*DirectCallee, index: Idx, Alignment)) |
1520 | return Align(Alignment); |
1521 | // If alignment information is not available, fall back to the |
1522 | // default function param optimized type alignment |
1523 | return getFunctionParamOptimizedAlign(F: DirectCallee, ArgTy: Ty, DL); |
1524 | } |
1525 | |
1526 | // Call is indirect, fall back to the ABI type alignment |
1527 | return DL.getABITypeAlign(Ty); |
1528 | } |
1529 | |
1530 | static bool adjustElementType(EVT &ElementType) { |
1531 | switch (ElementType.getSimpleVT().SimpleTy) { |
1532 | default: |
1533 | return false; |
1534 | case MVT::f16: |
1535 | case MVT::bf16: |
1536 | ElementType = MVT::i16; |
1537 | return true; |
1538 | case MVT::f32: |
1539 | case MVT::v2f16: |
1540 | case MVT::v2bf16: |
1541 | ElementType = MVT::i32; |
1542 | return true; |
1543 | case MVT::f64: |
1544 | ElementType = MVT::i64; |
1545 | return true; |
1546 | } |
1547 | } |
1548 | |
1549 | // Use byte-store when the param address of the argument value is unaligned. |
1550 | // This may happen when the return value is a field of a packed structure. |
1551 | // |
1552 | // This is called in LowerCall() when passing the param values. |
1553 | static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, |
1554 | uint64_t Offset, EVT ElementType, |
1555 | SDValue StVal, SDValue &InGlue, |
1556 | unsigned ArgID, const SDLoc &dl) { |
1557 | // Bit logic only works on integer types |
1558 | if (adjustElementType(ElementType)) |
1559 | StVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: StVal); |
1560 | |
1561 | // Store each byte |
1562 | SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1563 | for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { |
1564 | // Shift the byte to the last byte position |
1565 | SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal, |
1566 | DAG.getConstant(i * 8, dl, MVT::i32)); |
1567 | SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32), |
1568 | DAG.getConstant(Offset + i, dl, MVT::i32), |
1569 | ShiftVal, InGlue}; |
1570 | // Trunc store only the last byte by using |
1571 | // st.param.b8 |
1572 | // The register type can be larger than b8. |
1573 | Chain = DAG.getMemIntrinsicNode( |
1574 | NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8, |
1575 | MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); |
1576 | InGlue = Chain.getValue(R: 1); |
1577 | } |
1578 | return Chain; |
1579 | } |
1580 | |
1581 | // Use byte-load when the param adress of the returned value is unaligned. |
1582 | // This may happen when the returned value is a field of a packed structure. |
1583 | static SDValue |
1584 | LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, |
1585 | EVT ElementType, SDValue &InGlue, |
1586 | SmallVectorImpl<SDValue> &TempProxyRegOps, |
1587 | const SDLoc &dl) { |
1588 | // Bit logic only works on integer types |
1589 | EVT MergedType = ElementType; |
1590 | adjustElementType(ElementType&: MergedType); |
1591 | |
1592 | // Load each byte and construct the whole value. Initial value to 0 |
1593 | SDValue RetVal = DAG.getConstant(Val: 0, DL: dl, VT: MergedType); |
1594 | // LoadParamMemI8 loads into i16 register only |
1595 | SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue); |
1596 | for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { |
1597 | SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32), |
1598 | DAG.getConstant(Offset + i, dl, MVT::i32), |
1599 | InGlue}; |
1600 | // This will be selected to LoadParamMemI8 |
1601 | SDValue LdVal = |
1602 | DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands, |
1603 | MVT::i8, MachinePointerInfo(), Align(1)); |
1604 | SDValue TmpLdVal = LdVal.getValue(R: 0); |
1605 | Chain = LdVal.getValue(R: 1); |
1606 | InGlue = LdVal.getValue(R: 2); |
1607 | |
1608 | TmpLdVal = DAG.getNode(Opcode: NVPTXISD::ProxyReg, DL: dl, |
1609 | VT: TmpLdVal.getSimpleValueType(), Operand: TmpLdVal); |
1610 | TempProxyRegOps.push_back(Elt: TmpLdVal); |
1611 | |
1612 | SDValue CMask = DAG.getConstant(Val: 255, DL: dl, VT: MergedType); |
1613 | SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32); |
1614 | // Need to extend the i16 register to the whole width. |
1615 | TmpLdVal = DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: dl, VT: MergedType, Operand: TmpLdVal); |
1616 | // Mask off the high bits. Leave only the lower 8bits. |
1617 | // Do this because we are using loadparam.b8. |
1618 | TmpLdVal = DAG.getNode(Opcode: ISD::AND, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CMask); |
1619 | // Shift and merge |
1620 | TmpLdVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT: MergedType, N1: TmpLdVal, N2: CShift); |
1621 | RetVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT: MergedType, N1: RetVal, N2: TmpLdVal); |
1622 | } |
1623 | if (ElementType != MergedType) |
1624 | RetVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: RetVal); |
1625 | |
1626 | return RetVal; |
1627 | } |
1628 | |
1629 | SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, |
1630 | SmallVectorImpl<SDValue> &InVals) const { |
1631 | |
1632 | if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30)) |
1633 | report_fatal_error( |
1634 | reason: "Support for variadic functions (unsized array parameter) introduced " |
1635 | "in PTX ISA version 6.0 and requires target sm_30." ); |
1636 | |
1637 | SelectionDAG &DAG = CLI.DAG; |
1638 | SDLoc dl = CLI.DL; |
1639 | SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; |
1640 | SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; |
1641 | SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; |
1642 | SDValue Chain = CLI.Chain; |
1643 | SDValue Callee = CLI.Callee; |
1644 | bool &isTailCall = CLI.IsTailCall; |
1645 | ArgListTy &Args = CLI.getArgs(); |
1646 | Type *RetTy = CLI.RetTy; |
1647 | const CallBase *CB = CLI.CB; |
1648 | const DataLayout &DL = DAG.getDataLayout(); |
1649 | |
1650 | bool isABI = (STI.getSmVersion() >= 20); |
1651 | assert(isABI && "Non-ABI compilation is not supported" ); |
1652 | if (!isABI) |
1653 | return Chain; |
1654 | |
1655 | // Variadic arguments. |
1656 | // |
1657 | // Normally, for each argument, we declare a param scalar or a param |
1658 | // byte array in the .param space, and store the argument value to that |
1659 | // param scalar or array starting at offset 0. |
1660 | // |
1661 | // In the case of the first variadic argument, we declare a vararg byte array |
1662 | // with size 0. The exact size of this array isn't known at this point, so |
1663 | // it'll be patched later. All the variadic arguments will be stored to this |
1664 | // array at a certain offset (which gets tracked by 'VAOffset'). The offset is |
1665 | // initially set to 0, so it can be used for non-variadic arguments (which use |
1666 | // 0 offset) to simplify the code. |
1667 | // |
1668 | // After all vararg is processed, 'VAOffset' holds the size of the |
1669 | // vararg byte array. |
1670 | |
1671 | SDValue VADeclareParam; // vararg byte array |
1672 | unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic |
1673 | unsigned VAOffset = 0; // current offset in the param array |
1674 | |
1675 | unsigned UniqueCallSite = GlobalUniqueCallSite.fetch_add(i: 1); |
1676 | SDValue TempChain = Chain; |
1677 | Chain = DAG.getCALLSEQ_START(Chain, InSize: UniqueCallSite, OutSize: 0, DL: dl); |
1678 | SDValue InGlue = Chain.getValue(R: 1); |
1679 | |
1680 | unsigned ParamCount = 0; |
1681 | // Args.size() and Outs.size() need not match. |
1682 | // Outs.size() will be larger |
1683 | // * if there is an aggregate argument with multiple fields (each field |
1684 | // showing up separately in Outs) |
1685 | // * if there is a vector argument with more than typical vector-length |
1686 | // elements (generally if more than 4) where each vector element is |
1687 | // individually present in Outs. |
1688 | // So a different index should be used for indexing into Outs/OutVals. |
1689 | // See similar issue in LowerFormalArguments. |
1690 | unsigned OIdx = 0; |
1691 | // Declare the .params or .reg need to pass values |
1692 | // to the function |
1693 | for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { |
1694 | EVT VT = Outs[OIdx].VT; |
1695 | Type *Ty = Args[i].Ty; |
1696 | bool IsVAArg = (i >= CLI.NumFixedArgs); |
1697 | bool IsByVal = Outs[OIdx].Flags.isByVal(); |
1698 | |
1699 | SmallVector<EVT, 16> VTs; |
1700 | SmallVector<uint64_t, 16> Offsets; |
1701 | |
1702 | assert((!IsByVal || Args[i].IndirectType) && |
1703 | "byval arg must have indirect type" ); |
1704 | Type *ETy = (IsByVal ? Args[i].IndirectType : Ty); |
1705 | ComputePTXValueVTs(TLI: *this, DL, Ty: ETy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: IsByVal ? 0 : VAOffset); |
1706 | |
1707 | Align ArgAlign; |
1708 | if (IsByVal) { |
1709 | // The ByValAlign in the Outs[OIdx].Flags is always set at this point, |
1710 | // so we don't need to worry whether it's naturally aligned or not. |
1711 | // See TargetLowering::LowerCallTo(). |
1712 | Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign(); |
1713 | ArgAlign = getFunctionByValParamAlign(F: CB->getCalledFunction(), ArgTy: ETy, |
1714 | InitialAlign, DL); |
1715 | if (IsVAArg) |
1716 | VAOffset = alignTo(Size: VAOffset, A: ArgAlign); |
1717 | } else { |
1718 | ArgAlign = getArgumentAlignment(CB, Ty, Idx: ParamCount + 1, DL); |
1719 | } |
1720 | |
1721 | unsigned TypeSize = |
1722 | (IsByVal ? Outs[OIdx].Flags.getByValSize() : DL.getTypeAllocSize(Ty)); |
1723 | SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1724 | |
1725 | bool NeedAlign; // Does argument declaration specify alignment? |
1726 | bool PassAsArray = IsByVal || IsTypePassedAsArray(Ty); |
1727 | if (IsVAArg) { |
1728 | if (ParamCount == FirstVAArg) { |
1729 | SDValue DeclareParamOps[] = { |
1730 | Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32), |
1731 | DAG.getConstant(ParamCount, dl, MVT::i32), |
1732 | DAG.getConstant(1, dl, MVT::i32), InGlue}; |
1733 | VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, |
1734 | DeclareParamVTs, DeclareParamOps); |
1735 | } |
1736 | NeedAlign = PassAsArray; |
1737 | } else if (PassAsArray) { |
1738 | // declare .param .align <align> .b8 .param<n>[<size>]; |
1739 | SDValue DeclareParamOps[] = { |
1740 | Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32), |
1741 | DAG.getConstant(ParamCount, dl, MVT::i32), |
1742 | DAG.getConstant(TypeSize, dl, MVT::i32), InGlue}; |
1743 | Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, |
1744 | DeclareParamOps); |
1745 | NeedAlign = true; |
1746 | } else { |
1747 | // declare .param .b<size> .param<n>; |
1748 | if (VT.isInteger() || VT.isFloatingPoint()) { |
1749 | // PTX ABI requires integral types to be at least 32 bits in |
1750 | // size. FP16 is loaded/stored using i16, so it's handled |
1751 | // here as well. |
1752 | TypeSize = promoteScalarArgumentSize(size: TypeSize * 8) / 8; |
1753 | } |
1754 | SDValue DeclareScalarParamOps[] = { |
1755 | Chain, DAG.getConstant(ParamCount, dl, MVT::i32), |
1756 | DAG.getConstant(TypeSize * 8, dl, MVT::i32), |
1757 | DAG.getConstant(0, dl, MVT::i32), InGlue}; |
1758 | Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, |
1759 | DeclareScalarParamOps); |
1760 | NeedAlign = false; |
1761 | } |
1762 | InGlue = Chain.getValue(R: 1); |
1763 | |
1764 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter |
1765 | // than 32-bits are sign extended or zero extended, depending on |
1766 | // whether they are signed or unsigned types. This case applies |
1767 | // only to scalar parameters and not to aggregate values. |
1768 | bool ExtendIntegerParam = |
1769 | Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; |
1770 | |
1771 | auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: ArgAlign, IsVAArg); |
1772 | SmallVector<SDValue, 6> StoreOperands; |
1773 | for (unsigned j = 0, je = VTs.size(); j != je; ++j) { |
1774 | EVT EltVT = VTs[j]; |
1775 | int CurOffset = Offsets[j]; |
1776 | MaybeAlign PartAlign; |
1777 | if (NeedAlign) |
1778 | PartAlign = commonAlignment(A: ArgAlign, Offset: CurOffset); |
1779 | |
1780 | SDValue StVal = OutVals[OIdx]; |
1781 | |
1782 | MVT PromotedVT; |
1783 | if (PromoteScalarIntegerPTX(VT: EltVT, PromotedVT: &PromotedVT)) { |
1784 | EltVT = EVT(PromotedVT); |
1785 | } |
1786 | if (PromoteScalarIntegerPTX(VT: StVal.getValueType(), PromotedVT: &PromotedVT)) { |
1787 | llvm::ISD::NodeType Ext = |
1788 | Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
1789 | StVal = DAG.getNode(Opcode: Ext, DL: dl, VT: PromotedVT, Operand: StVal); |
1790 | } |
1791 | |
1792 | if (IsByVal) { |
1793 | auto PtrVT = getPointerTy(DL); |
1794 | SDValue srcAddr = DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: StVal, |
1795 | N2: DAG.getConstant(Val: CurOffset, DL: dl, VT: PtrVT)); |
1796 | StVal = DAG.getLoad(VT: EltVT, dl, Chain: TempChain, Ptr: srcAddr, PtrInfo: MachinePointerInfo(), |
1797 | Alignment: PartAlign); |
1798 | } else if (ExtendIntegerParam) { |
1799 | assert(VTs.size() == 1 && "Scalar can't have multiple parts." ); |
1800 | // zext/sext to i32 |
1801 | StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND |
1802 | : ISD::ZERO_EXTEND, |
1803 | dl, MVT::i32, StVal); |
1804 | } |
1805 | |
1806 | if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) { |
1807 | // Use 16-bit registers for small stores as it's the |
1808 | // smallest general purpose register size supported by NVPTX. |
1809 | StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal); |
1810 | } |
1811 | |
1812 | // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a |
1813 | // scalar store. In such cases, fall back to byte stores. |
1814 | if (VectorInfo[j] == PVF_SCALAR && !IsVAArg && PartAlign.has_value() && |
1815 | PartAlign.value() < |
1816 | DL.getABITypeAlign(Ty: EltVT.getTypeForEVT(Context&: *DAG.getContext()))) { |
1817 | assert(StoreOperands.empty() && "Unfinished preceeding store." ); |
1818 | Chain = LowerUnalignedStoreParam( |
1819 | DAG, Chain, Offset: IsByVal ? CurOffset + VAOffset : CurOffset, ElementType: EltVT, |
1820 | StVal, InGlue, ArgID: ParamCount, dl); |
1821 | |
1822 | // LowerUnalignedStoreParam took care of inserting the necessary nodes |
1823 | // into the SDAG, so just move on to the next element. |
1824 | if (!IsByVal) |
1825 | ++OIdx; |
1826 | continue; |
1827 | } |
1828 | |
1829 | // New store. |
1830 | if (VectorInfo[j] & PVF_FIRST) { |
1831 | assert(StoreOperands.empty() && "Unfinished preceding store." ); |
1832 | StoreOperands.push_back(Elt: Chain); |
1833 | StoreOperands.push_back( |
1834 | DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32)); |
1835 | |
1836 | StoreOperands.push_back(DAG.getConstant( |
1837 | IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset), |
1838 | dl, MVT::i32)); |
1839 | } |
1840 | |
1841 | // Record the value to store. |
1842 | StoreOperands.push_back(Elt: StVal); |
1843 | |
1844 | if (VectorInfo[j] & PVF_LAST) { |
1845 | unsigned NumElts = StoreOperands.size() - 3; |
1846 | NVPTXISD::NodeType Op; |
1847 | switch (NumElts) { |
1848 | case 1: |
1849 | Op = NVPTXISD::StoreParam; |
1850 | break; |
1851 | case 2: |
1852 | Op = NVPTXISD::StoreParamV2; |
1853 | break; |
1854 | case 4: |
1855 | Op = NVPTXISD::StoreParamV4; |
1856 | break; |
1857 | default: |
1858 | llvm_unreachable("Invalid vector info." ); |
1859 | } |
1860 | |
1861 | StoreOperands.push_back(Elt: InGlue); |
1862 | |
1863 | // Adjust type of the store op if we've extended the scalar |
1864 | // return value. |
1865 | EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT; |
1866 | |
1867 | Chain = DAG.getMemIntrinsicNode( |
1868 | Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, |
1869 | TheStoreType, MachinePointerInfo(), PartAlign, |
1870 | MachineMemOperand::MOStore); |
1871 | InGlue = Chain.getValue(R: 1); |
1872 | |
1873 | // Cleanup. |
1874 | StoreOperands.clear(); |
1875 | |
1876 | // TODO: We may need to support vector types that can be passed |
1877 | // as scalars in variadic arguments. |
1878 | if (!IsByVal && IsVAArg) { |
1879 | assert(NumElts == 1 && |
1880 | "Vectorization is expected to be disabled for variadics." ); |
1881 | VAOffset += DL.getTypeAllocSize( |
1882 | Ty: TheStoreType.getTypeForEVT(Context&: *DAG.getContext())); |
1883 | } |
1884 | } |
1885 | if (!IsByVal) |
1886 | ++OIdx; |
1887 | } |
1888 | assert(StoreOperands.empty() && "Unfinished parameter store." ); |
1889 | if (!IsByVal && VTs.size() > 0) |
1890 | --OIdx; |
1891 | ++ParamCount; |
1892 | if (IsByVal && IsVAArg) |
1893 | VAOffset += TypeSize; |
1894 | } |
1895 | |
1896 | GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Val: Callee.getNode()); |
1897 | MaybeAlign retAlignment = std::nullopt; |
1898 | |
1899 | // Handle Result |
1900 | if (Ins.size() > 0) { |
1901 | SmallVector<EVT, 16> resvtparts; |
1902 | ComputeValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: resvtparts); |
1903 | |
1904 | // Declare |
1905 | // .param .align N .b8 retval0[<size-in-bytes>], or |
1906 | // .param .b<size-in-bits> retval0 |
1907 | unsigned resultsz = DL.getTypeAllocSizeInBits(Ty: RetTy); |
1908 | if (!IsTypePassedAsArray(Ty: RetTy)) { |
1909 | resultsz = promoteScalarArgumentSize(size: resultsz); |
1910 | SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1911 | SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), |
1912 | DAG.getConstant(resultsz, dl, MVT::i32), |
1913 | DAG.getConstant(0, dl, MVT::i32), InGlue }; |
1914 | Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, |
1915 | DeclareRetOps); |
1916 | InGlue = Chain.getValue(R: 1); |
1917 | } else { |
1918 | retAlignment = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL); |
1919 | assert(retAlignment && "retAlignment is guaranteed to be set" ); |
1920 | SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1921 | SDValue DeclareRetOps[] = { |
1922 | Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), |
1923 | DAG.getConstant(resultsz / 8, dl, MVT::i32), |
1924 | DAG.getConstant(0, dl, MVT::i32), InGlue}; |
1925 | Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, |
1926 | DeclareRetOps); |
1927 | InGlue = Chain.getValue(R: 1); |
1928 | } |
1929 | } |
1930 | |
1931 | bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs); |
1932 | // Set the size of the vararg param byte array if the callee is a variadic |
1933 | // function and the variadic part is not empty. |
1934 | if (HasVAArgs) { |
1935 | SDValue DeclareParamOps[] = { |
1936 | VADeclareParam.getOperand(0), VADeclareParam.getOperand(1), |
1937 | VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32), |
1938 | VADeclareParam.getOperand(4)}; |
1939 | DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(), |
1940 | VADeclareParam->getVTList(), DeclareParamOps); |
1941 | } |
1942 | |
1943 | // Both indirect calls and libcalls have nullptr Func. In order to distinguish |
1944 | // between them we must rely on the call site value which is valid for |
1945 | // indirect calls but is always null for libcalls. |
1946 | bool isIndirectCall = !Func && CB; |
1947 | |
1948 | if (isa<ExternalSymbolSDNode>(Val: Callee)) { |
1949 | Function* CalleeFunc = nullptr; |
1950 | |
1951 | // Try to find the callee in the current module. |
1952 | Callee = DAG.getSymbolFunctionGlobalAddress(Op: Callee, TargetFunction: &CalleeFunc); |
1953 | assert(CalleeFunc != nullptr && "Libcall callee must be set." ); |
1954 | |
1955 | // Set the "libcall callee" attribute to indicate that the function |
1956 | // must always have a declaration. |
1957 | CalleeFunc->addFnAttr(Kind: "nvptx-libcall-callee" , Val: "true" ); |
1958 | } |
1959 | |
1960 | if (isIndirectCall) { |
1961 | // This is indirect function call case : PTX requires a prototype of the |
1962 | // form |
1963 | // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _); |
1964 | // to be emitted, and the label has to used as the last arg of call |
1965 | // instruction. |
1966 | // The prototype is embedded in a string and put as the operand for a |
1967 | // CallPrototype SDNode which will print out to the value of the string. |
1968 | SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1969 | std::string Proto = getPrototype( |
1970 | DL, retTy: RetTy, Args, Outs, retAlignment, |
1971 | VAInfo: HasVAArgs |
1972 | ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair( |
1973 | x&: CLI.NumFixedArgs, y: VADeclareParam->getConstantOperandAPInt(Num: 1))) |
1974 | : std::nullopt, |
1975 | CB: *CB, UniqueCallSite); |
1976 | const char *ProtoStr = nvTM->getStrPool().save(S: Proto).data(); |
1977 | SDValue ProtoOps[] = { |
1978 | Chain, |
1979 | DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), |
1980 | InGlue, |
1981 | }; |
1982 | Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps); |
1983 | InGlue = Chain.getValue(R: 1); |
1984 | } |
1985 | // Op to just print "call" |
1986 | SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
1987 | SDValue PrintCallOps[] = { |
1988 | Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue |
1989 | }; |
1990 | // We model convergent calls as separate opcodes. |
1991 | unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni; |
1992 | if (CLI.IsConvergent) |
1993 | Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni |
1994 | : NVPTXISD::PrintConvergentCall; |
1995 | Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps); |
1996 | InGlue = Chain.getValue(R: 1); |
1997 | |
1998 | // Ops to print out the function name |
1999 | SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
2000 | SDValue CallVoidOps[] = { Chain, Callee, InGlue }; |
2001 | Chain = DAG.getNode(Opcode: NVPTXISD::CallVoid, DL: dl, VTList: CallVoidVTs, Ops: CallVoidOps); |
2002 | InGlue = Chain.getValue(R: 1); |
2003 | |
2004 | // Ops to print out the param list |
2005 | SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
2006 | SDValue CallArgBeginOps[] = { Chain, InGlue }; |
2007 | Chain = DAG.getNode(Opcode: NVPTXISD::CallArgBegin, DL: dl, VTList: CallArgBeginVTs, |
2008 | Ops: CallArgBeginOps); |
2009 | InGlue = Chain.getValue(R: 1); |
2010 | |
2011 | for (unsigned i = 0, e = std::min(a: CLI.NumFixedArgs + 1, b: ParamCount); i != e; |
2012 | ++i) { |
2013 | unsigned opcode; |
2014 | if (i == (e - 1)) |
2015 | opcode = NVPTXISD::LastCallArg; |
2016 | else |
2017 | opcode = NVPTXISD::CallArg; |
2018 | SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
2019 | SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32), |
2020 | DAG.getConstant(i, dl, MVT::i32), InGlue }; |
2021 | Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps); |
2022 | InGlue = Chain.getValue(R: 1); |
2023 | } |
2024 | SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
2025 | SDValue CallArgEndOps[] = { Chain, |
2026 | DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32), |
2027 | InGlue }; |
2028 | Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps); |
2029 | InGlue = Chain.getValue(R: 1); |
2030 | |
2031 | if (isIndirectCall) { |
2032 | SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue); |
2033 | SDValue PrototypeOps[] = { |
2034 | Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue}; |
2035 | Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps); |
2036 | InGlue = Chain.getValue(R: 1); |
2037 | } |
2038 | |
2039 | SmallVector<SDValue, 16> ProxyRegOps; |
2040 | SmallVector<std::optional<MVT>, 16> ProxyRegTruncates; |
2041 | // An item of the vector is filled if the element does not need a ProxyReg |
2042 | // operation on it and should be added to InVals as is. ProxyRegOps and |
2043 | // ProxyRegTruncates contain empty/none items at the same index. |
2044 | SmallVector<SDValue, 16> RetElts; |
2045 | // A temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()` |
2046 | // to use the values of `LoadParam`s and to be replaced later then |
2047 | // `CALLSEQ_END` is added. |
2048 | SmallVector<SDValue, 16> TempProxyRegOps; |
2049 | |
2050 | // Generate loads from param memory/moves from registers for result |
2051 | if (Ins.size() > 0) { |
2052 | SmallVector<EVT, 16> VTs; |
2053 | SmallVector<uint64_t, 16> Offsets; |
2054 | ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0); |
2055 | assert(VTs.size() == Ins.size() && "Bad value decomposition" ); |
2056 | |
2057 | Align RetAlign = getArgumentAlignment(CB, Ty: RetTy, Idx: 0, DL); |
2058 | auto VectorInfo = VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: RetAlign); |
2059 | |
2060 | SmallVector<EVT, 6> LoadVTs; |
2061 | int VecIdx = -1; // Index of the first element of the vector. |
2062 | |
2063 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than |
2064 | // 32-bits are sign extended or zero extended, depending on whether |
2065 | // they are signed or unsigned types. |
2066 | bool ExtendIntegerRetVal = |
2067 | RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32; |
2068 | |
2069 | for (unsigned i = 0, e = VTs.size(); i != e; ++i) { |
2070 | bool needTruncate = false; |
2071 | EVT TheLoadType = VTs[i]; |
2072 | EVT EltType = Ins[i].VT; |
2073 | Align EltAlign = commonAlignment(A: RetAlign, Offset: Offsets[i]); |
2074 | MVT PromotedVT; |
2075 | |
2076 | if (PromoteScalarIntegerPTX(VT: TheLoadType, PromotedVT: &PromotedVT)) { |
2077 | TheLoadType = EVT(PromotedVT); |
2078 | EltType = EVT(PromotedVT); |
2079 | needTruncate = true; |
2080 | } |
2081 | |
2082 | if (ExtendIntegerRetVal) { |
2083 | TheLoadType = MVT::i32; |
2084 | EltType = MVT::i32; |
2085 | needTruncate = true; |
2086 | } else if (TheLoadType.getSizeInBits() < 16) { |
2087 | if (VTs[i].isInteger()) |
2088 | needTruncate = true; |
2089 | EltType = MVT::i16; |
2090 | } |
2091 | |
2092 | // If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a |
2093 | // scalar load. In such cases, fall back to byte loads. |
2094 | if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && |
2095 | EltAlign < DL.getABITypeAlign( |
2096 | Ty: TheLoadType.getTypeForEVT(Context&: *DAG.getContext()))) { |
2097 | assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list." ); |
2098 | SDValue Ret = LowerUnalignedLoadRetParam( |
2099 | DAG, Chain, Offset: Offsets[i], ElementType: TheLoadType, InGlue, TempProxyRegOps, dl); |
2100 | ProxyRegOps.push_back(Elt: SDValue()); |
2101 | ProxyRegTruncates.push_back(Elt: std::optional<MVT>()); |
2102 | RetElts.resize(N: i); |
2103 | RetElts.push_back(Elt: Ret); |
2104 | |
2105 | continue; |
2106 | } |
2107 | |
2108 | // Record index of the very first element of the vector. |
2109 | if (VectorInfo[i] & PVF_FIRST) { |
2110 | assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list." ); |
2111 | VecIdx = i; |
2112 | } |
2113 | |
2114 | LoadVTs.push_back(Elt: EltType); |
2115 | |
2116 | if (VectorInfo[i] & PVF_LAST) { |
2117 | unsigned NumElts = LoadVTs.size(); |
2118 | LoadVTs.push_back(MVT::Other); |
2119 | LoadVTs.push_back(MVT::Glue); |
2120 | NVPTXISD::NodeType Op; |
2121 | switch (NumElts) { |
2122 | case 1: |
2123 | Op = NVPTXISD::LoadParam; |
2124 | break; |
2125 | case 2: |
2126 | Op = NVPTXISD::LoadParamV2; |
2127 | break; |
2128 | case 4: |
2129 | Op = NVPTXISD::LoadParamV4; |
2130 | break; |
2131 | default: |
2132 | llvm_unreachable("Invalid vector info." ); |
2133 | } |
2134 | |
2135 | SDValue LoadOperands[] = { |
2136 | Chain, DAG.getConstant(1, dl, MVT::i32), |
2137 | DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue}; |
2138 | SDValue RetVal = DAG.getMemIntrinsicNode( |
2139 | Op, dl, DAG.getVTList(VTs: LoadVTs), LoadOperands, TheLoadType, |
2140 | MachinePointerInfo(), EltAlign, |
2141 | MachineMemOperand::MOLoad); |
2142 | |
2143 | for (unsigned j = 0; j < NumElts; ++j) { |
2144 | ProxyRegOps.push_back(Elt: RetVal.getValue(R: j)); |
2145 | |
2146 | if (needTruncate) |
2147 | ProxyRegTruncates.push_back(Elt: std::optional<MVT>(Ins[VecIdx + j].VT)); |
2148 | else |
2149 | ProxyRegTruncates.push_back(Elt: std::optional<MVT>()); |
2150 | } |
2151 | |
2152 | Chain = RetVal.getValue(R: NumElts); |
2153 | InGlue = RetVal.getValue(R: NumElts + 1); |
2154 | |
2155 | // Cleanup |
2156 | VecIdx = -1; |
2157 | LoadVTs.clear(); |
2158 | } |
2159 | } |
2160 | } |
2161 | |
2162 | Chain = |
2163 | DAG.getCALLSEQ_END(Chain, Size1: UniqueCallSite, Size2: UniqueCallSite + 1, Glue: InGlue, DL: dl); |
2164 | InGlue = Chain.getValue(R: 1); |
2165 | |
2166 | // Append ProxyReg instructions to the chain to make sure that `callseq_end` |
2167 | // will not get lost. Otherwise, during libcalls expansion, the nodes can become |
2168 | // dangling. |
2169 | for (unsigned i = 0; i < ProxyRegOps.size(); ++i) { |
2170 | if (i < RetElts.size() && RetElts[i]) { |
2171 | InVals.push_back(Elt: RetElts[i]); |
2172 | continue; |
2173 | } |
2174 | |
2175 | SDValue Ret = DAG.getNode( |
2176 | NVPTXISD::ProxyReg, dl, |
2177 | DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue), |
2178 | { Chain, ProxyRegOps[i], InGlue } |
2179 | ); |
2180 | |
2181 | Chain = Ret.getValue(R: 1); |
2182 | InGlue = Ret.getValue(R: 2); |
2183 | |
2184 | if (ProxyRegTruncates[i]) { |
2185 | Ret = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: *ProxyRegTruncates[i], Operand: Ret); |
2186 | } |
2187 | |
2188 | InVals.push_back(Elt: Ret); |
2189 | } |
2190 | |
2191 | for (SDValue &T : TempProxyRegOps) { |
2192 | SDValue Repl = DAG.getNode( |
2193 | NVPTXISD::ProxyReg, dl, |
2194 | DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue), |
2195 | {Chain, T.getOperand(0), InGlue}); |
2196 | DAG.ReplaceAllUsesWith(From: T, To: Repl); |
2197 | DAG.RemoveDeadNode(N: T.getNode()); |
2198 | |
2199 | Chain = Repl.getValue(R: 1); |
2200 | InGlue = Repl.getValue(R: 2); |
2201 | } |
2202 | |
2203 | // set isTailCall to false for now, until we figure out how to express |
2204 | // tail call optimization in PTX |
2205 | isTailCall = false; |
2206 | return Chain; |
2207 | } |
2208 | |
2209 | SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, |
2210 | SelectionDAG &DAG) const { |
2211 | |
2212 | if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) { |
2213 | const Function &Fn = DAG.getMachineFunction().getFunction(); |
2214 | |
2215 | DiagnosticInfoUnsupported NoDynamicAlloca( |
2216 | Fn, |
2217 | "Support for dynamic alloca introduced in PTX ISA version 7.3 and " |
2218 | "requires target sm_52." , |
2219 | SDLoc(Op).getDebugLoc()); |
2220 | DAG.getContext()->diagnose(DI: NoDynamicAlloca); |
2221 | auto Ops = {DAG.getConstant(Val: 0, DL: SDLoc(), VT: Op.getValueType()), |
2222 | Op.getOperand(i: 0)}; |
2223 | return DAG.getMergeValues(Ops, dl: SDLoc()); |
2224 | } |
2225 | |
2226 | SDValue Chain = Op.getOperand(i: 0); |
2227 | SDValue Size = Op.getOperand(i: 1); |
2228 | uint64_t Align = cast<ConstantSDNode>(Val: Op.getOperand(i: 2))->getZExtValue(); |
2229 | SDLoc DL(Op.getNode()); |
2230 | |
2231 | // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. |
2232 | if (nvTM->is64Bit()) |
2233 | Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64); |
2234 | else |
2235 | Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32); |
2236 | |
2237 | SDValue AllocOps[] = {Chain, Size, |
2238 | DAG.getTargetConstant(Align, DL, MVT::i32)}; |
2239 | SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, |
2240 | nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps); |
2241 | |
2242 | SDValue MergeOps[] = {Alloca, Chain}; |
2243 | return DAG.getMergeValues(Ops: MergeOps, dl: DL); |
2244 | } |
2245 | |
2246 | // By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() |
2247 | // (see LegalizeDAG.cpp). This is slow and uses local memory. |
2248 | // We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 |
2249 | SDValue |
2250 | NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { |
2251 | SDNode *Node = Op.getNode(); |
2252 | SDLoc dl(Node); |
2253 | SmallVector<SDValue, 8> Ops; |
2254 | unsigned NumOperands = Node->getNumOperands(); |
2255 | for (unsigned i = 0; i < NumOperands; ++i) { |
2256 | SDValue SubOp = Node->getOperand(Num: i); |
2257 | EVT VVT = SubOp.getNode()->getValueType(ResNo: 0); |
2258 | EVT EltVT = VVT.getVectorElementType(); |
2259 | unsigned NumSubElem = VVT.getVectorNumElements(); |
2260 | for (unsigned j = 0; j < NumSubElem; ++j) { |
2261 | Ops.push_back(Elt: DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: SubOp, |
2262 | N2: DAG.getIntPtrConstant(Val: j, DL: dl))); |
2263 | } |
2264 | } |
2265 | return DAG.getBuildVector(VT: Node->getValueType(ResNo: 0), DL: dl, Ops); |
2266 | } |
2267 | |
2268 | // We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it |
2269 | // would get lowered as two constant loads and vector-packing move. |
2270 | // Instead we want just a constant move: |
2271 | // mov.b32 %r2, 0x40003C00 |
2272 | SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, |
2273 | SelectionDAG &DAG) const { |
2274 | EVT VT = Op->getValueType(ResNo: 0); |
2275 | if (!(Isv2x16VT(VT) || VT == MVT::v4i8)) |
2276 | return Op; |
2277 | |
2278 | SDLoc DL(Op); |
2279 | |
2280 | if (!llvm::all_of(Range: Op->ops(), P: [](SDValue Operand) { |
2281 | return Operand->isUndef() || isa<ConstantSDNode>(Val: Operand) || |
2282 | isa<ConstantFPSDNode>(Val: Operand); |
2283 | })) { |
2284 | // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us |
2285 | // to optimize calculation of constant parts. |
2286 | if (VT == MVT::v4i8) { |
2287 | SDValue C8 = DAG.getConstant(8, DL, MVT::i32); |
2288 | SDValue E01 = DAG.getNode( |
2289 | NVPTXISD::BFI, DL, MVT::i32, |
2290 | DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), |
2291 | DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); |
2292 | SDValue E012 = |
2293 | DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, |
2294 | DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), |
2295 | E01, DAG.getConstant(16, DL, MVT::i32), C8); |
2296 | SDValue E0123 = |
2297 | DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, |
2298 | DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), |
2299 | E012, DAG.getConstant(24, DL, MVT::i32), C8); |
2300 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT, Operand: E0123); |
2301 | } |
2302 | return Op; |
2303 | } |
2304 | |
2305 | // Get value or the Nth operand as an APInt(32). Undef values treated as 0. |
2306 | auto GetOperand = [](SDValue Op, int N) -> APInt { |
2307 | const SDValue &Operand = Op->getOperand(Num: N); |
2308 | EVT VT = Op->getValueType(ResNo: 0); |
2309 | if (Operand->isUndef()) |
2310 | return APInt(32, 0); |
2311 | APInt Value; |
2312 | if (VT == MVT::v2f16 || VT == MVT::v2bf16) |
2313 | Value = cast<ConstantFPSDNode>(Val: Operand)->getValueAPF().bitcastToAPInt(); |
2314 | else if (VT == MVT::v2i16 || VT == MVT::v4i8) |
2315 | Value = Operand->getAsAPIntVal(); |
2316 | else |
2317 | llvm_unreachable("Unsupported type" ); |
2318 | // i8 values are carried around as i16, so we need to zero out upper bits, |
2319 | // so they do not get in the way of combining individual byte values |
2320 | if (VT == MVT::v4i8) |
2321 | Value = Value.trunc(width: 8); |
2322 | return Value.zext(width: 32); |
2323 | }; |
2324 | APInt Value; |
2325 | if (Isv2x16VT(VT)) { |
2326 | Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 16); |
2327 | } else if (VT == MVT::v4i8) { |
2328 | Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(shiftAmt: 8) | |
2329 | GetOperand(Op, 2).shl(shiftAmt: 16) | GetOperand(Op, 3).shl(shiftAmt: 24); |
2330 | } else { |
2331 | llvm_unreachable("Unsupported type" ); |
2332 | } |
2333 | SDValue Const = DAG.getConstant(Value, SDLoc(Op), MVT::i32); |
2334 | return DAG.getNode(Opcode: ISD::BITCAST, DL: SDLoc(Op), VT: Op->getValueType(ResNo: 0), Operand: Const); |
2335 | } |
2336 | |
2337 | SDValue NVPTXTargetLowering::(SDValue Op, |
2338 | SelectionDAG &DAG) const { |
2339 | SDValue Index = Op->getOperand(Num: 1); |
2340 | SDValue Vector = Op->getOperand(Num: 0); |
2341 | SDLoc DL(Op); |
2342 | EVT VectorVT = Vector.getValueType(); |
2343 | |
2344 | if (VectorVT == MVT::v4i8) { |
2345 | SDValue BFE = |
2346 | DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, |
2347 | {Vector, |
2348 | DAG.getNode(ISD::MUL, DL, MVT::i32, |
2349 | DAG.getZExtOrTrunc(Index, DL, MVT::i32), |
2350 | DAG.getConstant(8, DL, MVT::i32)), |
2351 | DAG.getConstant(8, DL, MVT::i32)}); |
2352 | return DAG.getAnyExtOrTrunc(Op: BFE, DL, VT: Op->getValueType(ResNo: 0)); |
2353 | } |
2354 | |
2355 | // Constant index will be matched by tablegen. |
2356 | if (isa<ConstantSDNode>(Val: Index.getNode())) |
2357 | return Op; |
2358 | |
2359 | // Extract individual elements and select one of them. |
2360 | assert(Isv2x16VT(VectorVT) && "Unexpected vector type." ); |
2361 | EVT EltVT = VectorVT.getVectorElementType(); |
2362 | |
2363 | SDLoc dl(Op.getNode()); |
2364 | SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector, |
2365 | N2: DAG.getIntPtrConstant(Val: 0, DL: dl)); |
2366 | SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: EltVT, N1: Vector, |
2367 | N2: DAG.getIntPtrConstant(Val: 1, DL: dl)); |
2368 | return DAG.getSelectCC(DL: dl, LHS: Index, RHS: DAG.getIntPtrConstant(Val: 0, DL: dl), True: E0, False: E1, |
2369 | Cond: ISD::CondCode::SETEQ); |
2370 | } |
2371 | |
2372 | SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, |
2373 | SelectionDAG &DAG) const { |
2374 | SDValue Vector = Op->getOperand(Num: 0); |
2375 | EVT VectorVT = Vector.getValueType(); |
2376 | |
2377 | if (VectorVT != MVT::v4i8) |
2378 | return Op; |
2379 | SDLoc DL(Op); |
2380 | SDValue Value = Op->getOperand(Num: 1); |
2381 | if (Value->isUndef()) |
2382 | return Vector; |
2383 | |
2384 | SDValue Index = Op->getOperand(Num: 2); |
2385 | |
2386 | SDValue BFI = |
2387 | DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, |
2388 | {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector, |
2389 | DAG.getNode(ISD::MUL, DL, MVT::i32, |
2390 | DAG.getZExtOrTrunc(Index, DL, MVT::i32), |
2391 | DAG.getConstant(8, DL, MVT::i32)), |
2392 | DAG.getConstant(8, DL, MVT::i32)}); |
2393 | return DAG.getNode(Opcode: ISD::BITCAST, DL, VT: Op->getValueType(ResNo: 0), Operand: BFI); |
2394 | } |
2395 | |
2396 | SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, |
2397 | SelectionDAG &DAG) const { |
2398 | SDValue V1 = Op.getOperand(i: 0); |
2399 | EVT VectorVT = V1.getValueType(); |
2400 | if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8) |
2401 | return Op; |
2402 | |
2403 | // Lower shuffle to PRMT instruction. |
2404 | const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Val: Op.getNode()); |
2405 | SDValue V2 = Op.getOperand(i: 1); |
2406 | uint32_t Selector = 0; |
2407 | for (auto I : llvm::enumerate(First: SVN->getMask())) { |
2408 | if (I.value() != -1) // -1 is a placeholder for undef. |
2409 | Selector |= (I.value() << (I.index() * 4)); |
2410 | } |
2411 | |
2412 | SDLoc DL(Op); |
2413 | return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, |
2414 | DAG.getConstant(Selector, DL, MVT::i32), |
2415 | DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); |
2416 | } |
2417 | /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which |
2418 | /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift |
2419 | /// amount, or |
2420 | /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift |
2421 | /// amount. |
2422 | SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op, |
2423 | SelectionDAG &DAG) const { |
2424 | assert(Op.getNumOperands() == 3 && "Not a double-shift!" ); |
2425 | assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); |
2426 | |
2427 | EVT VT = Op.getValueType(); |
2428 | unsigned VTBits = VT.getSizeInBits(); |
2429 | SDLoc dl(Op); |
2430 | SDValue ShOpLo = Op.getOperand(i: 0); |
2431 | SDValue ShOpHi = Op.getOperand(i: 1); |
2432 | SDValue ShAmt = Op.getOperand(i: 2); |
2433 | unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; |
2434 | |
2435 | if (VTBits == 32 && STI.getSmVersion() >= 35) { |
2436 | // For 32bit and sm35, we can use the funnel shift 'shf' instruction. |
2437 | // {dHi, dLo} = {aHi, aLo} >> Amt |
2438 | // dHi = aHi >> Amt |
2439 | // dLo = shf.r.clamp aLo, aHi, Amt |
2440 | |
2441 | SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2442 | SDValue Lo = DAG.getNode(Opcode: NVPTXISD::FUN_SHFR_CLAMP, DL: dl, VT, N1: ShOpLo, N2: ShOpHi, |
2443 | N3: ShAmt); |
2444 | |
2445 | SDValue Ops[2] = { Lo, Hi }; |
2446 | return DAG.getMergeValues(Ops, dl); |
2447 | } |
2448 | else { |
2449 | // {dHi, dLo} = {aHi, aLo} >> Amt |
2450 | // - if (Amt>=size) then |
2451 | // dLo = aHi >> (Amt-size) |
2452 | // dHi = aHi >> Amt (this is either all 0 or all 1) |
2453 | // else |
2454 | // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) |
2455 | // dHi = aHi >> Amt |
2456 | |
2457 | SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, |
2458 | DAG.getConstant(VTBits, dl, MVT::i32), |
2459 | ShAmt); |
2460 | SDValue Tmp1 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2461 | SDValue = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, |
2462 | DAG.getConstant(VTBits, dl, MVT::i32)); |
2463 | SDValue Tmp2 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: RevShAmt); |
2464 | SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2); |
2465 | SDValue TrueVal = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ExtraShAmt); |
2466 | |
2467 | SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, |
2468 | DAG.getConstant(VTBits, dl, MVT::i32), |
2469 | ISD::SETGE); |
2470 | SDValue Hi = DAG.getNode(Opcode: Opc, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2471 | SDValue Lo = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal); |
2472 | |
2473 | SDValue Ops[2] = { Lo, Hi }; |
2474 | return DAG.getMergeValues(Ops, dl); |
2475 | } |
2476 | } |
2477 | |
2478 | /// LowerShiftLeftParts - Lower SHL_PARTS, which |
2479 | /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift |
2480 | /// amount, or |
2481 | /// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift |
2482 | /// amount. |
2483 | SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op, |
2484 | SelectionDAG &DAG) const { |
2485 | assert(Op.getNumOperands() == 3 && "Not a double-shift!" ); |
2486 | assert(Op.getOpcode() == ISD::SHL_PARTS); |
2487 | |
2488 | EVT VT = Op.getValueType(); |
2489 | unsigned VTBits = VT.getSizeInBits(); |
2490 | SDLoc dl(Op); |
2491 | SDValue ShOpLo = Op.getOperand(i: 0); |
2492 | SDValue ShOpHi = Op.getOperand(i: 1); |
2493 | SDValue ShAmt = Op.getOperand(i: 2); |
2494 | |
2495 | if (VTBits == 32 && STI.getSmVersion() >= 35) { |
2496 | // For 32bit and sm35, we can use the funnel shift 'shf' instruction. |
2497 | // {dHi, dLo} = {aHi, aLo} << Amt |
2498 | // dHi = shf.l.clamp aLo, aHi, Amt |
2499 | // dLo = aLo << Amt |
2500 | |
2501 | SDValue Hi = DAG.getNode(Opcode: NVPTXISD::FUN_SHFL_CLAMP, DL: dl, VT, N1: ShOpLo, N2: ShOpHi, |
2502 | N3: ShAmt); |
2503 | SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2504 | |
2505 | SDValue Ops[2] = { Lo, Hi }; |
2506 | return DAG.getMergeValues(Ops, dl); |
2507 | } |
2508 | else { |
2509 | // {dHi, dLo} = {aHi, aLo} << Amt |
2510 | // - if (Amt>=size) then |
2511 | // dLo = aLo << Amt (all 0) |
2512 | // dLo = aLo << (Amt-size) |
2513 | // else |
2514 | // dLo = aLo << Amt |
2515 | // dHi = (aHi << Amt) | (aLo >> (size-Amt)) |
2516 | |
2517 | SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, |
2518 | DAG.getConstant(VTBits, dl, MVT::i32), |
2519 | ShAmt); |
2520 | SDValue Tmp1 = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpHi, N2: ShAmt); |
2521 | SDValue = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, |
2522 | DAG.getConstant(VTBits, dl, MVT::i32)); |
2523 | SDValue Tmp2 = DAG.getNode(Opcode: ISD::SRL, DL: dl, VT, N1: ShOpLo, N2: RevShAmt); |
2524 | SDValue FalseVal = DAG.getNode(Opcode: ISD::OR, DL: dl, VT, N1: Tmp1, N2: Tmp2); |
2525 | SDValue TrueVal = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ExtraShAmt); |
2526 | |
2527 | SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt, |
2528 | DAG.getConstant(VTBits, dl, MVT::i32), |
2529 | ISD::SETGE); |
2530 | SDValue Lo = DAG.getNode(Opcode: ISD::SHL, DL: dl, VT, N1: ShOpLo, N2: ShAmt); |
2531 | SDValue Hi = DAG.getNode(Opcode: ISD::SELECT, DL: dl, VT, N1: Cmp, N2: TrueVal, N3: FalseVal); |
2532 | |
2533 | SDValue Ops[2] = { Lo, Hi }; |
2534 | return DAG.getMergeValues(Ops, dl); |
2535 | } |
2536 | } |
2537 | |
2538 | SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { |
2539 | EVT VT = Op.getValueType(); |
2540 | |
2541 | if (VT == MVT::f32) |
2542 | return LowerFROUND32(Op, DAG); |
2543 | |
2544 | if (VT == MVT::f64) |
2545 | return LowerFROUND64(Op, DAG); |
2546 | |
2547 | llvm_unreachable("unhandled type" ); |
2548 | } |
2549 | |
2550 | // This is the the rounding method used in CUDA libdevice in C like code: |
2551 | // float roundf(float A) |
2552 | // { |
2553 | // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)); |
2554 | // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; |
2555 | // return abs(A) < 0.5 ? (float)(int)A : RoundedA; |
2556 | // } |
2557 | SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op, |
2558 | SelectionDAG &DAG) const { |
2559 | SDLoc SL(Op); |
2560 | SDValue A = Op.getOperand(i: 0); |
2561 | EVT VT = Op.getValueType(); |
2562 | |
2563 | SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A); |
2564 | |
2565 | // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) |
2566 | SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A); |
2567 | const int SignBitMask = 0x80000000; |
2568 | SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast, |
2569 | DAG.getConstant(SignBitMask, SL, MVT::i32)); |
2570 | const int PointFiveInBits = 0x3F000000; |
2571 | SDValue PointFiveWithSignRaw = |
2572 | DAG.getNode(ISD::OR, SL, MVT::i32, Sign, |
2573 | DAG.getConstant(PointFiveInBits, SL, MVT::i32)); |
2574 | SDValue PointFiveWithSign = |
2575 | DAG.getNode(Opcode: ISD::BITCAST, DL: SL, VT, Operand: PointFiveWithSignRaw); |
2576 | SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: A, N2: PointFiveWithSign); |
2577 | SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA); |
2578 | |
2579 | // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; |
2580 | EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT); |
2581 | SDValue IsLarge = |
2582 | DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 23.0), DL: SL, VT), |
2583 | Cond: ISD::SETOGT); |
2584 | RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA); |
2585 | |
2586 | // return abs(A) < 0.5 ? (float)(int)A : RoundedA; |
2587 | SDValue IsSmall =DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, |
2588 | RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT); |
2589 | SDValue RoundedAForSmallA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: A); |
2590 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall, N2: RoundedAForSmallA, N3: RoundedA); |
2591 | } |
2592 | |
2593 | // The implementation of round(double) is similar to that of round(float) in |
2594 | // that they both separate the value range into three regions and use a method |
2595 | // specific to the region to round the values. However, round(double) first |
2596 | // calculates the round of the absolute value and then adds the sign back while |
2597 | // round(float) directly rounds the value with sign. |
2598 | SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op, |
2599 | SelectionDAG &DAG) const { |
2600 | SDLoc SL(Op); |
2601 | SDValue A = Op.getOperand(i: 0); |
2602 | EVT VT = Op.getValueType(); |
2603 | |
2604 | SDValue AbsA = DAG.getNode(Opcode: ISD::FABS, DL: SL, VT, Operand: A); |
2605 | |
2606 | // double RoundedA = (double) (int) (abs(A) + 0.5f); |
2607 | SDValue AdjustedA = DAG.getNode(Opcode: ISD::FADD, DL: SL, VT, N1: AbsA, |
2608 | N2: DAG.getConstantFP(Val: 0.5, DL: SL, VT)); |
2609 | SDValue RoundedA = DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: AdjustedA); |
2610 | |
2611 | // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; |
2612 | EVT SetCCVT = getSetCCResultType(DL: DAG.getDataLayout(), Ctx&: *DAG.getContext(), VT); |
2613 | SDValue IsSmall =DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, |
2614 | RHS: DAG.getConstantFP(Val: 0.5, DL: SL, VT), Cond: ISD::SETOLT); |
2615 | RoundedA = DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsSmall, |
2616 | N2: DAG.getConstantFP(Val: 0, DL: SL, VT), |
2617 | N3: RoundedA); |
2618 | |
2619 | // Add sign to rounded_A |
2620 | RoundedA = DAG.getNode(Opcode: ISD::FCOPYSIGN, DL: SL, VT, N1: RoundedA, N2: A); |
2621 | DAG.getNode(Opcode: ISD::FTRUNC, DL: SL, VT, Operand: A); |
2622 | |
2623 | // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; |
2624 | SDValue IsLarge = |
2625 | DAG.getSetCC(DL: SL, VT: SetCCVT, LHS: AbsA, RHS: DAG.getConstantFP(Val: pow(x: 2.0, y: 52.0), DL: SL, VT), |
2626 | Cond: ISD::SETOGT); |
2627 | return DAG.getNode(Opcode: ISD::SELECT, DL: SL, VT, N1: IsLarge, N2: A, N3: RoundedA); |
2628 | } |
2629 | |
2630 | SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op, |
2631 | SelectionDAG &DAG) const { |
2632 | assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); |
2633 | |
2634 | if (Op.getValueType() == MVT::bf16) { |
2635 | SDLoc Loc(Op); |
2636 | return DAG.getNode( |
2637 | ISD::FP_ROUND, Loc, MVT::bf16, |
2638 | DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)), |
2639 | DAG.getIntPtrConstant(0, Loc)); |
2640 | } |
2641 | |
2642 | // Everything else is considered legal. |
2643 | return Op; |
2644 | } |
2645 | |
2646 | SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op, |
2647 | SelectionDAG &DAG) const { |
2648 | assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78); |
2649 | |
2650 | if (Op.getOperand(0).getValueType() == MVT::bf16) { |
2651 | SDLoc Loc(Op); |
2652 | return DAG.getNode( |
2653 | Op.getOpcode(), Loc, Op.getValueType(), |
2654 | DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0))); |
2655 | } |
2656 | |
2657 | // Everything else is considered legal. |
2658 | return Op; |
2659 | } |
2660 | |
2661 | SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op, |
2662 | SelectionDAG &DAG) const { |
2663 | EVT NarrowVT = Op.getValueType(); |
2664 | SDValue Wide = Op.getOperand(i: 0); |
2665 | EVT WideVT = Wide.getValueType(); |
2666 | if (NarrowVT.getScalarType() == MVT::bf16) { |
2667 | const TargetLowering *TLI = STI.getTargetLowering(); |
2668 | if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) { |
2669 | return TLI->expandFP_ROUND(Node: Op.getNode(), DAG); |
2670 | } |
2671 | if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) { |
2672 | // This combination was the first to support f32 -> bf16. |
2673 | if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) { |
2674 | if (WideVT.getScalarType() == MVT::f32) { |
2675 | return Op; |
2676 | } |
2677 | if (WideVT.getScalarType() == MVT::f64) { |
2678 | SDLoc Loc(Op); |
2679 | // Round-inexact-to-odd f64 to f32, then do the final rounding using |
2680 | // the hardware f32 -> bf16 instruction. |
2681 | SDValue rod = TLI->expandRoundInexactToOdd( |
2682 | WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32) |
2683 | : MVT::f32, |
2684 | Wide, Loc, DAG); |
2685 | return DAG.getFPExtendOrRound(Op: rod, DL: Loc, VT: NarrowVT); |
2686 | } |
2687 | } |
2688 | return TLI->expandFP_ROUND(Node: Op.getNode(), DAG); |
2689 | } |
2690 | } |
2691 | |
2692 | // Everything else is considered legal. |
2693 | return Op; |
2694 | } |
2695 | |
2696 | SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op, |
2697 | SelectionDAG &DAG) const { |
2698 | SDValue Narrow = Op.getOperand(i: 0); |
2699 | EVT NarrowVT = Narrow.getValueType(); |
2700 | EVT WideVT = Op.getValueType(); |
2701 | if (NarrowVT.getScalarType() == MVT::bf16) { |
2702 | if (WideVT.getScalarType() == MVT::f32 && |
2703 | (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) { |
2704 | SDLoc Loc(Op); |
2705 | return DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: WideVT, Operand: Narrow); |
2706 | } |
2707 | if (WideVT.getScalarType() == MVT::f64 && |
2708 | (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) { |
2709 | EVT F32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32) |
2710 | : MVT::f32; |
2711 | SDLoc Loc(Op); |
2712 | if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) { |
2713 | Op = DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: F32, Operand: Narrow); |
2714 | } else { |
2715 | Op = DAG.getNode(Opcode: ISD::BF16_TO_FP, DL: Loc, VT: F32, Operand: Narrow); |
2716 | } |
2717 | return DAG.getNode(Opcode: ISD::FP_EXTEND, DL: Loc, VT: WideVT, Operand: Op); |
2718 | } |
2719 | } |
2720 | |
2721 | // Everything else is considered legal. |
2722 | return Op; |
2723 | } |
2724 | |
2725 | static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { |
2726 | SDLoc DL(Op); |
2727 | if (Op.getValueType() != MVT::v2i16) |
2728 | return Op; |
2729 | EVT EltVT = Op.getValueType().getVectorElementType(); |
2730 | SmallVector<SDValue> VecElements; |
2731 | for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) { |
2732 | SmallVector<SDValue> ScalarArgs; |
2733 | llvm::transform(Range: Op->ops(), d_first: std::back_inserter(x&: ScalarArgs), |
2734 | F: [&](const SDUse &O) { |
2735 | return DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, |
2736 | N1: O.get(), N2: DAG.getIntPtrConstant(Val: I, DL)); |
2737 | }); |
2738 | VecElements.push_back(Elt: DAG.getNode(Opcode: Op.getOpcode(), DL, VT: EltVT, Ops: ScalarArgs)); |
2739 | } |
2740 | SDValue V = |
2741 | DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: Op.getValueType(), Ops: VecElements); |
2742 | return V; |
2743 | } |
2744 | |
2745 | SDValue |
2746 | NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { |
2747 | switch (Op.getOpcode()) { |
2748 | case ISD::RETURNADDR: |
2749 | return SDValue(); |
2750 | case ISD::FRAMEADDR: |
2751 | return SDValue(); |
2752 | case ISD::GlobalAddress: |
2753 | return LowerGlobalAddress(Op, DAG); |
2754 | case ISD::INTRINSIC_W_CHAIN: |
2755 | return Op; |
2756 | case ISD::BUILD_VECTOR: |
2757 | return LowerBUILD_VECTOR(Op, DAG); |
2758 | case ISD::EXTRACT_SUBVECTOR: |
2759 | return Op; |
2760 | case ISD::EXTRACT_VECTOR_ELT: |
2761 | return LowerEXTRACT_VECTOR_ELT(Op, DAG); |
2762 | case ISD::INSERT_VECTOR_ELT: |
2763 | return LowerINSERT_VECTOR_ELT(Op, DAG); |
2764 | case ISD::VECTOR_SHUFFLE: |
2765 | return LowerVECTOR_SHUFFLE(Op, DAG); |
2766 | case ISD::CONCAT_VECTORS: |
2767 | return LowerCONCAT_VECTORS(Op, DAG); |
2768 | case ISD::STORE: |
2769 | return LowerSTORE(Op, DAG); |
2770 | case ISD::LOAD: |
2771 | return LowerLOAD(Op, DAG); |
2772 | case ISD::SHL_PARTS: |
2773 | return LowerShiftLeftParts(Op, DAG); |
2774 | case ISD::SRA_PARTS: |
2775 | case ISD::SRL_PARTS: |
2776 | return LowerShiftRightParts(Op, DAG); |
2777 | case ISD::SELECT: |
2778 | return LowerSelect(Op, DAG); |
2779 | case ISD::FROUND: |
2780 | return LowerFROUND(Op, DAG); |
2781 | case ISD::SINT_TO_FP: |
2782 | case ISD::UINT_TO_FP: |
2783 | return LowerINT_TO_FP(Op, DAG); |
2784 | case ISD::FP_TO_SINT: |
2785 | case ISD::FP_TO_UINT: |
2786 | return LowerFP_TO_INT(Op, DAG); |
2787 | case ISD::FP_ROUND: |
2788 | return LowerFP_ROUND(Op, DAG); |
2789 | case ISD::FP_EXTEND: |
2790 | return LowerFP_EXTEND(Op, DAG); |
2791 | case ISD::VAARG: |
2792 | return LowerVAARG(Op, DAG); |
2793 | case ISD::VASTART: |
2794 | return LowerVASTART(Op, DAG); |
2795 | case ISD::ABS: |
2796 | case ISD::SMIN: |
2797 | case ISD::SMAX: |
2798 | case ISD::UMIN: |
2799 | case ISD::UMAX: |
2800 | case ISD::ADD: |
2801 | case ISD::SUB: |
2802 | case ISD::MUL: |
2803 | case ISD::SHL: |
2804 | case ISD::SREM: |
2805 | case ISD::UREM: |
2806 | return LowerVectorArith(Op, DAG); |
2807 | case ISD::DYNAMIC_STACKALLOC: |
2808 | return LowerDYNAMIC_STACKALLOC(Op, DAG); |
2809 | default: |
2810 | llvm_unreachable("Custom lowering not defined for operation" ); |
2811 | } |
2812 | } |
2813 | |
2814 | // This function is almost a copy of SelectionDAG::expandVAArg(). |
2815 | // The only diff is that this one produces loads from local address space. |
2816 | SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { |
2817 | const TargetLowering *TLI = STI.getTargetLowering(); |
2818 | SDLoc DL(Op); |
2819 | |
2820 | SDNode *Node = Op.getNode(); |
2821 | const Value *V = cast<SrcValueSDNode>(Val: Node->getOperand(Num: 2))->getValue(); |
2822 | EVT VT = Node->getValueType(ResNo: 0); |
2823 | auto *Ty = VT.getTypeForEVT(Context&: *DAG.getContext()); |
2824 | SDValue Tmp1 = Node->getOperand(Num: 0); |
2825 | SDValue Tmp2 = Node->getOperand(Num: 1); |
2826 | const MaybeAlign MA(Node->getConstantOperandVal(Num: 3)); |
2827 | |
2828 | SDValue VAListLoad = DAG.getLoad(VT: TLI->getPointerTy(DL: DAG.getDataLayout()), dl: DL, |
2829 | Chain: Tmp1, Ptr: Tmp2, PtrInfo: MachinePointerInfo(V)); |
2830 | SDValue VAList = VAListLoad; |
2831 | |
2832 | if (MA && *MA > TLI->getMinStackArgumentAlignment()) { |
2833 | VAList = DAG.getNode( |
2834 | Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList, |
2835 | N2: DAG.getConstant(Val: MA->value() - 1, DL, VT: VAList.getValueType())); |
2836 | |
2837 | VAList = DAG.getNode( |
2838 | Opcode: ISD::AND, DL, VT: VAList.getValueType(), N1: VAList, |
2839 | N2: DAG.getConstant(Val: -(int64_t)MA->value(), DL, VT: VAList.getValueType())); |
2840 | } |
2841 | |
2842 | // Increment the pointer, VAList, to the next vaarg |
2843 | Tmp1 = DAG.getNode(Opcode: ISD::ADD, DL, VT: VAList.getValueType(), N1: VAList, |
2844 | N2: DAG.getConstant(Val: DAG.getDataLayout().getTypeAllocSize(Ty), |
2845 | DL, VT: VAList.getValueType())); |
2846 | |
2847 | // Store the incremented VAList to the legalized pointer |
2848 | Tmp1 = DAG.getStore(Chain: VAListLoad.getValue(R: 1), dl: DL, Val: Tmp1, Ptr: Tmp2, |
2849 | PtrInfo: MachinePointerInfo(V)); |
2850 | |
2851 | const Value *SrcV = |
2852 | Constant::getNullValue(Ty: PointerType::get(ElementType: Ty, AddressSpace: ADDRESS_SPACE_LOCAL)); |
2853 | |
2854 | // Load the actual argument out of the pointer VAList |
2855 | return DAG.getLoad(VT, dl: DL, Chain: Tmp1, Ptr: VAList, PtrInfo: MachinePointerInfo(SrcV)); |
2856 | } |
2857 | |
2858 | SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { |
2859 | const TargetLowering *TLI = STI.getTargetLowering(); |
2860 | SDLoc DL(Op); |
2861 | EVT PtrVT = TLI->getPointerTy(DL: DAG.getDataLayout()); |
2862 | |
2863 | // Store the address of unsized array <function>_vararg[] in the ap object. |
2864 | SDValue Arg = getParamSymbol(DAG, /* vararg */ idx: -1, PtrVT); |
2865 | SDValue VAReg = DAG.getNode(Opcode: NVPTXISD::Wrapper, DL, VT: PtrVT, Operand: Arg); |
2866 | |
2867 | const Value *SV = cast<SrcValueSDNode>(Val: Op.getOperand(i: 2))->getValue(); |
2868 | return DAG.getStore(Chain: Op.getOperand(i: 0), dl: DL, Val: VAReg, Ptr: Op.getOperand(i: 1), |
2869 | PtrInfo: MachinePointerInfo(SV)); |
2870 | } |
2871 | |
2872 | SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const { |
2873 | SDValue Op0 = Op->getOperand(Num: 0); |
2874 | SDValue Op1 = Op->getOperand(Num: 1); |
2875 | SDValue Op2 = Op->getOperand(Num: 2); |
2876 | SDLoc DL(Op.getNode()); |
2877 | |
2878 | assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1" ); |
2879 | |
2880 | Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); |
2881 | Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); |
2882 | SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2); |
2883 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select); |
2884 | |
2885 | return Trunc; |
2886 | } |
2887 | |
2888 | SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { |
2889 | if (Op.getValueType() == MVT::i1) |
2890 | return LowerLOADi1(Op, DAG); |
2891 | |
2892 | // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle |
2893 | // unaligned loads and have to handle it here. |
2894 | EVT VT = Op.getValueType(); |
2895 | if (Isv2x16VT(VT) || VT == MVT::v4i8) { |
2896 | LoadSDNode *Load = cast<LoadSDNode>(Val&: Op); |
2897 | EVT MemVT = Load->getMemoryVT(); |
2898 | if (!allowsMemoryAccessForAlignment(Context&: *DAG.getContext(), DL: DAG.getDataLayout(), |
2899 | VT: MemVT, MMO: *Load->getMemOperand())) { |
2900 | SDValue Ops[2]; |
2901 | std::tie(args&: Ops[0], args&: Ops[1]) = expandUnalignedLoad(LD: Load, DAG); |
2902 | return DAG.getMergeValues(Ops, dl: SDLoc(Op)); |
2903 | } |
2904 | } |
2905 | |
2906 | return SDValue(); |
2907 | } |
2908 | |
2909 | // v = ld i1* addr |
2910 | // => |
2911 | // v1 = ld i8* addr (-> i16) |
2912 | // v = trunc i16 to i1 |
2913 | SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { |
2914 | SDNode *Node = Op.getNode(); |
2915 | LoadSDNode *LD = cast<LoadSDNode>(Val: Node); |
2916 | SDLoc dl(Node); |
2917 | assert(LD->getExtensionType() == ISD::NON_EXTLOAD); |
2918 | assert(Node->getValueType(0) == MVT::i1 && |
2919 | "Custom lowering for i1 load only" ); |
2920 | SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), |
2921 | LD->getPointerInfo(), LD->getAlign(), |
2922 | LD->getMemOperand()->getFlags()); |
2923 | SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); |
2924 | // The legalizer (the caller) is expecting two values from the legalized |
2925 | // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() |
2926 | // in LegalizeDAG.cpp which also uses MergeValues. |
2927 | SDValue Ops[] = { result, LD->getChain() }; |
2928 | return DAG.getMergeValues(Ops, dl); |
2929 | } |
2930 | |
2931 | SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { |
2932 | StoreSDNode *Store = cast<StoreSDNode>(Val&: Op); |
2933 | EVT VT = Store->getMemoryVT(); |
2934 | |
2935 | if (VT == MVT::i1) |
2936 | return LowerSTOREi1(Op, DAG); |
2937 | |
2938 | // v2f16 is legal, so we can't rely on legalizer to handle unaligned |
2939 | // stores and have to handle it here. |
2940 | if ((Isv2x16VT(VT) || VT == MVT::v4i8) && |
2941 | !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), |
2942 | VT, *Store->getMemOperand())) |
2943 | return expandUnalignedStore(ST: Store, DAG); |
2944 | |
2945 | // v2f16, v2bf16 and v2i16 don't need special handling. |
2946 | if (Isv2x16VT(VT) || VT == MVT::v4i8) |
2947 | return SDValue(); |
2948 | |
2949 | if (VT.isVector()) |
2950 | return LowerSTOREVector(Op, DAG); |
2951 | |
2952 | return SDValue(); |
2953 | } |
2954 | |
2955 | SDValue |
2956 | NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { |
2957 | SDNode *N = Op.getNode(); |
2958 | SDValue Val = N->getOperand(Num: 1); |
2959 | SDLoc DL(N); |
2960 | EVT ValVT = Val.getValueType(); |
2961 | |
2962 | if (ValVT.isVector()) { |
2963 | // We only handle "native" vector sizes for now, e.g. <4 x double> is not |
2964 | // legal. We can (and should) split that into 2 stores of <2 x double> here |
2965 | // but I'm leaving that as a TODO for now. |
2966 | if (!ValVT.isSimple()) |
2967 | return SDValue(); |
2968 | switch (ValVT.getSimpleVT().SimpleTy) { |
2969 | default: |
2970 | return SDValue(); |
2971 | case MVT::v2i8: |
2972 | case MVT::v2i16: |
2973 | case MVT::v2i32: |
2974 | case MVT::v2i64: |
2975 | case MVT::v2f16: |
2976 | case MVT::v2bf16: |
2977 | case MVT::v2f32: |
2978 | case MVT::v2f64: |
2979 | case MVT::v4i8: |
2980 | case MVT::v4i16: |
2981 | case MVT::v4i32: |
2982 | case MVT::v4f16: |
2983 | case MVT::v4bf16: |
2984 | case MVT::v4f32: |
2985 | case MVT::v8f16: // <4 x f16x2> |
2986 | case MVT::v8bf16: // <4 x bf16x2> |
2987 | case MVT::v8i16: // <4 x i16x2> |
2988 | // This is a "native" vector type |
2989 | break; |
2990 | } |
2991 | |
2992 | MemSDNode *MemSD = cast<MemSDNode>(Val: N); |
2993 | const DataLayout &TD = DAG.getDataLayout(); |
2994 | |
2995 | Align Alignment = MemSD->getAlign(); |
2996 | Align PrefAlign = |
2997 | TD.getPrefTypeAlign(Ty: ValVT.getTypeForEVT(Context&: *DAG.getContext())); |
2998 | if (Alignment < PrefAlign) { |
2999 | // This store is not sufficiently aligned, so bail out and let this vector |
3000 | // store be scalarized. Note that we may still be able to emit smaller |
3001 | // vector stores. For example, if we are storing a <4 x float> with an |
3002 | // alignment of 8, this check will fail but the legalizer will try again |
3003 | // with 2 x <2 x float>, which will succeed with an alignment of 8. |
3004 | return SDValue(); |
3005 | } |
3006 | |
3007 | unsigned Opcode = 0; |
3008 | EVT EltVT = ValVT.getVectorElementType(); |
3009 | unsigned NumElts = ValVT.getVectorNumElements(); |
3010 | |
3011 | // Since StoreV2 is a target node, we cannot rely on DAG type legalization. |
3012 | // Therefore, we must ensure the type is legal. For i1 and i8, we set the |
3013 | // stored type to i16 and propagate the "real" type as the memory type. |
3014 | bool NeedExt = false; |
3015 | if (EltVT.getSizeInBits() < 16) |
3016 | NeedExt = true; |
3017 | |
3018 | bool StoreF16x2 = false; |
3019 | switch (NumElts) { |
3020 | default: |
3021 | return SDValue(); |
3022 | case 2: |
3023 | Opcode = NVPTXISD::StoreV2; |
3024 | break; |
3025 | case 4: |
3026 | Opcode = NVPTXISD::StoreV4; |
3027 | break; |
3028 | case 8: |
3029 | // v8f16 is a special case. PTX doesn't have st.v8.f16 |
3030 | // instruction. Instead, we split the vector into v2f16 chunks and |
3031 | // store them with st.v4.b32. |
3032 | assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector." ); |
3033 | Opcode = NVPTXISD::StoreV4; |
3034 | StoreF16x2 = true; |
3035 | break; |
3036 | } |
3037 | |
3038 | SmallVector<SDValue, 8> Ops; |
3039 | |
3040 | // First is the chain |
3041 | Ops.push_back(Elt: N->getOperand(Num: 0)); |
3042 | |
3043 | if (StoreF16x2) { |
3044 | // Combine f16,f16 -> v2f16 |
3045 | NumElts /= 2; |
3046 | for (unsigned i = 0; i < NumElts; ++i) { |
3047 | SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val, |
3048 | N2: DAG.getIntPtrConstant(Val: i * 2, DL)); |
3049 | SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val, |
3050 | N2: DAG.getIntPtrConstant(Val: i * 2 + 1, DL)); |
3051 | EVT VecVT = EVT::getVectorVT(Context&: *DAG.getContext(), VT: EltVT, NumElements: 2); |
3052 | SDValue V2 = DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: VecVT, N1: E0, N2: E1); |
3053 | Ops.push_back(Elt: V2); |
3054 | } |
3055 | } else { |
3056 | // Then the split values |
3057 | for (unsigned i = 0; i < NumElts; ++i) { |
3058 | SDValue ExtVal = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: Val, |
3059 | N2: DAG.getIntPtrConstant(Val: i, DL)); |
3060 | if (NeedExt) |
3061 | ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); |
3062 | Ops.push_back(Elt: ExtVal); |
3063 | } |
3064 | } |
3065 | |
3066 | // Then any remaining arguments |
3067 | Ops.append(in_start: N->op_begin() + 2, in_end: N->op_end()); |
3068 | |
3069 | SDValue NewSt = |
3070 | DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops, |
3071 | MemSD->getMemoryVT(), MemSD->getMemOperand()); |
3072 | |
3073 | // return DCI.CombineTo(N, NewSt, true); |
3074 | return NewSt; |
3075 | } |
3076 | |
3077 | return SDValue(); |
3078 | } |
3079 | |
3080 | // st i1 v, addr |
3081 | // => |
3082 | // v1 = zxt v to i16 |
3083 | // st.u8 i16, addr |
3084 | SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { |
3085 | SDNode *Node = Op.getNode(); |
3086 | SDLoc dl(Node); |
3087 | StoreSDNode *ST = cast<StoreSDNode>(Val: Node); |
3088 | SDValue Tmp1 = ST->getChain(); |
3089 | SDValue Tmp2 = ST->getBasePtr(); |
3090 | SDValue Tmp3 = ST->getValue(); |
3091 | assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only" ); |
3092 | Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); |
3093 | SDValue Result = |
3094 | DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, |
3095 | ST->getAlign(), ST->getMemOperand()->getFlags()); |
3096 | return Result; |
3097 | } |
3098 | |
3099 | // This creates target external symbol for a function parameter. |
3100 | // Name of the symbol is composed from its index and the function name. |
3101 | // Negative index corresponds to special parameter (unsized array) used for |
3102 | // passing variable arguments. |
3103 | SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, |
3104 | EVT v) const { |
3105 | StringRef SavedStr = nvTM->getStrPool().save( |
3106 | S: getParamName(F: &DAG.getMachineFunction().getFunction(), Idx: idx)); |
3107 | return DAG.getTargetExternalSymbol(Sym: SavedStr.data(), VT: v); |
3108 | } |
3109 | |
3110 | SDValue NVPTXTargetLowering::LowerFormalArguments( |
3111 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, |
3112 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, |
3113 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { |
3114 | MachineFunction &MF = DAG.getMachineFunction(); |
3115 | const DataLayout &DL = DAG.getDataLayout(); |
3116 | auto PtrVT = getPointerTy(DL: DAG.getDataLayout()); |
3117 | |
3118 | const Function *F = &MF.getFunction(); |
3119 | const AttributeList &PAL = F->getAttributes(); |
3120 | const TargetLowering *TLI = STI.getTargetLowering(); |
3121 | |
3122 | SDValue Root = DAG.getRoot(); |
3123 | std::vector<SDValue> OutChains; |
3124 | |
3125 | bool isABI = (STI.getSmVersion() >= 20); |
3126 | assert(isABI && "Non-ABI compilation is not supported" ); |
3127 | if (!isABI) |
3128 | return Chain; |
3129 | |
3130 | std::vector<Type *> argTypes; |
3131 | std::vector<const Argument *> theArgs; |
3132 | for (const Argument &I : F->args()) { |
3133 | theArgs.push_back(x: &I); |
3134 | argTypes.push_back(x: I.getType()); |
3135 | } |
3136 | // argTypes.size() (or theArgs.size()) and Ins.size() need not match. |
3137 | // Ins.size() will be larger |
3138 | // * if there is an aggregate argument with multiple fields (each field |
3139 | // showing up separately in Ins) |
3140 | // * if there is a vector argument with more than typical vector-length |
3141 | // elements (generally if more than 4) where each vector element is |
3142 | // individually present in Ins. |
3143 | // So a different index should be used for indexing into Ins. |
3144 | // See similar issue in LowerCall. |
3145 | unsigned InsIdx = 0; |
3146 | |
3147 | for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) { |
3148 | Type *Ty = argTypes[i]; |
3149 | |
3150 | if (theArgs[i]->use_empty()) { |
3151 | // argument is dead |
3152 | if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { |
3153 | SmallVector<EVT, 16> vtparts; |
3154 | |
3155 | ComputePTXValueVTs(TLI: *this, DL: DAG.getDataLayout(), Ty, ValueVTs&: vtparts); |
3156 | if (vtparts.empty()) |
3157 | report_fatal_error(reason: "Empty parameter types are not supported" ); |
3158 | |
3159 | for (unsigned parti = 0, parte = vtparts.size(); parti != parte; |
3160 | ++parti) { |
3161 | InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); |
3162 | ++InsIdx; |
3163 | } |
3164 | if (vtparts.size() > 0) |
3165 | --InsIdx; |
3166 | continue; |
3167 | } |
3168 | if (Ty->isVectorTy()) { |
3169 | EVT ObjectVT = getValueType(DL, Ty); |
3170 | unsigned NumRegs = TLI->getNumRegisters(Context&: F->getContext(), VT: ObjectVT); |
3171 | for (unsigned parti = 0; parti < NumRegs; ++parti) { |
3172 | InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); |
3173 | ++InsIdx; |
3174 | } |
3175 | if (NumRegs > 0) |
3176 | --InsIdx; |
3177 | continue; |
3178 | } |
3179 | InVals.push_back(Elt: DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT)); |
3180 | continue; |
3181 | } |
3182 | |
3183 | // In the following cases, assign a node order of "i+1" |
3184 | // to newly created nodes. The SDNodes for params have to |
3185 | // appear in the same order as their order of appearance |
3186 | // in the original function. "i+1" holds that order. |
3187 | if (!PAL.hasParamAttr(i, Attribute::ByVal)) { |
3188 | bool aggregateIsPacked = false; |
3189 | if (StructType *STy = dyn_cast<StructType>(Val: Ty)) |
3190 | aggregateIsPacked = STy->isPacked(); |
3191 | |
3192 | SmallVector<EVT, 16> VTs; |
3193 | SmallVector<uint64_t, 16> Offsets; |
3194 | ComputePTXValueVTs(TLI: *this, DL, Ty, ValueVTs&: VTs, Offsets: &Offsets, StartingOffset: 0); |
3195 | if (VTs.empty()) |
3196 | report_fatal_error(reason: "Empty parameter types are not supported" ); |
3197 | |
3198 | auto VectorInfo = |
3199 | VectorizePTXValueVTs(ValueVTs: VTs, Offsets, ParamAlignment: DL.getABITypeAlign(Ty)); |
3200 | |
3201 | SDValue Arg = getParamSymbol(DAG, idx: i, v: PtrVT); |
3202 | int VecIdx = -1; // Index of the first element of the current vector. |
3203 | for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) { |
3204 | if (VectorInfo[parti] & PVF_FIRST) { |
3205 | assert(VecIdx == -1 && "Orphaned vector." ); |
3206 | VecIdx = parti; |
3207 | } |
3208 | |
3209 | // That's the last element of this store op. |
3210 | if (VectorInfo[parti] & PVF_LAST) { |
3211 | unsigned NumElts = parti - VecIdx + 1; |
3212 | EVT EltVT = VTs[parti]; |
3213 | // i1 is loaded/stored as i8. |
3214 | EVT LoadVT = EltVT; |
3215 | if (EltVT == MVT::i1) |
3216 | LoadVT = MVT::i8; |
3217 | else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8) |
3218 | // getLoad needs a vector type, but it can't handle |
3219 | // vectors which contain v2f16 or v2bf16 elements. So we must load |
3220 | // using i32 here and then bitcast back. |
3221 | LoadVT = MVT::i32; |
3222 | |
3223 | EVT VecVT = EVT::getVectorVT(Context&: F->getContext(), VT: LoadVT, NumElements: NumElts); |
3224 | SDValue VecAddr = |
3225 | DAG.getNode(Opcode: ISD::ADD, DL: dl, VT: PtrVT, N1: Arg, |
3226 | N2: DAG.getConstant(Val: Offsets[VecIdx], DL: dl, VT: PtrVT)); |
3227 | Value *srcValue = Constant::getNullValue(Ty: PointerType::get( |
3228 | ElementType: EltVT.getTypeForEVT(Context&: F->getContext()), AddressSpace: ADDRESS_SPACE_PARAM)); |
3229 | |
3230 | const MaybeAlign PartAlign = [&]() -> MaybeAlign { |
3231 | if (aggregateIsPacked) |
3232 | return Align(1); |
3233 | if (NumElts != 1) |
3234 | return std::nullopt; |
3235 | Align PartAlign = |
3236 | (Offsets[parti] == 0 && PAL.getParamAlignment(ArgNo: i)) |
3237 | ? PAL.getParamAlignment(ArgNo: i).value() |
3238 | : DL.getABITypeAlign(Ty: EltVT.getTypeForEVT(Context&: F->getContext())); |
3239 | return commonAlignment(A: PartAlign, Offset: Offsets[parti]); |
3240 | }(); |
3241 | SDValue P = DAG.getLoad(VT: VecVT, dl, Chain: Root, Ptr: VecAddr, |
3242 | PtrInfo: MachinePointerInfo(srcValue), Alignment: PartAlign, |
3243 | MMOFlags: MachineMemOperand::MODereferenceable | |
3244 | MachineMemOperand::MOInvariant); |
3245 | if (P.getNode()) |
3246 | P.getNode()->setIROrder(i + 1); |
3247 | for (unsigned j = 0; j < NumElts; ++j) { |
3248 | SDValue Elt = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL: dl, VT: LoadVT, N1: P, |
3249 | N2: DAG.getIntPtrConstant(Val: j, DL: dl)); |
3250 | // We've loaded i1 as an i8 and now must truncate it back to i1 |
3251 | if (EltVT == MVT::i1) |
3252 | Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt); |
3253 | // v2f16 was loaded as an i32. Now we must bitcast it back. |
3254 | else if (EltVT != LoadVT) |
3255 | Elt = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: EltVT, Operand: Elt); |
3256 | |
3257 | // If a promoted integer type is used, truncate down to the original |
3258 | MVT PromotedVT; |
3259 | if (PromoteScalarIntegerPTX(VT: EltVT, PromotedVT: &PromotedVT)) { |
3260 | Elt = DAG.getNode(Opcode: ISD::TRUNCATE, DL: dl, VT: EltVT, Operand: Elt); |
3261 | } |
3262 | |
3263 | // Extend the element if necessary (e.g. an i8 is loaded |
3264 | // into an i16 register) |
3265 | if (Ins[InsIdx].VT.isInteger() && |
3266 | Ins[InsIdx].VT.getFixedSizeInBits() > |
3267 | LoadVT.getFixedSizeInBits()) { |
3268 | unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND |
3269 | : ISD::ZERO_EXTEND; |
3270 | Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt); |
3271 | } |
3272 | InVals.push_back(Elt); |
3273 | } |
3274 | |
3275 | // Reset vector tracking state. |
3276 | VecIdx = -1; |
3277 | } |
3278 | ++InsIdx; |
3279 | } |
3280 | if (VTs.size() > 0) |
3281 | --InsIdx; |
3282 | continue; |
3283 | } |
3284 | |
3285 | // Param has ByVal attribute |
3286 | // Return MoveParam(param symbol). |
3287 | // Ideally, the param symbol can be returned directly, |
3288 | // but when SDNode builder decides to use it in a CopyToReg(), |
3289 | // machine instruction fails because TargetExternalSymbol |
3290 | // (not lowered) is target dependent, and CopyToReg assumes |
3291 | // the source is lowered. |
3292 | EVT ObjectVT = getValueType(DL, Ty); |
3293 | assert(ObjectVT == Ins[InsIdx].VT && |
3294 | "Ins type did not match function type" ); |
3295 | SDValue Arg = getParamSymbol(DAG, idx: i, v: PtrVT); |
3296 | SDValue p = DAG.getNode(Opcode: NVPTXISD::MoveParam, DL: dl, VT: ObjectVT, Operand: Arg); |
3297 | if (p.getNode()) |
3298 | p.getNode()->setIROrder(i + 1); |
3299 | InVals.push_back(Elt: p); |
3300 | } |
3301 | |
3302 | if (!OutChains.empty()) |
3303 | DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains)); |
3304 | |
3305 | return Chain; |
3306 | } |
3307 | |
3308 | // Use byte-store when the param adress of the return value is unaligned. |
3309 | // This may happen when the return value is a field of a packed structure. |
3310 | static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, |
3311 | uint64_t Offset, EVT ElementType, |
3312 | SDValue RetVal, const SDLoc &dl) { |
3313 | // Bit logic only works on integer types |
3314 | if (adjustElementType(ElementType)) |
3315 | RetVal = DAG.getNode(Opcode: ISD::BITCAST, DL: dl, VT: ElementType, Operand: RetVal); |
3316 | |
3317 | // Store each byte |
3318 | for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) { |
3319 | // Shift the byte to the last byte position |
3320 | SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal, |
3321 | DAG.getConstant(i * 8, dl, MVT::i32)); |
3322 | SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32), |
3323 | ShiftVal}; |
3324 | // Trunc store only the last byte by using |
3325 | // st.param.b8 |
3326 | // The register type can be larger than b8. |
3327 | Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, |
3328 | DAG.getVTList(MVT::Other), StoreOperands, |
3329 | MVT::i8, MachinePointerInfo(), std::nullopt, |
3330 | MachineMemOperand::MOStore); |
3331 | } |
3332 | return Chain; |
3333 | } |
3334 | |
3335 | SDValue |
3336 | NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
3337 | bool isVarArg, |
3338 | const SmallVectorImpl<ISD::OutputArg> &Outs, |
3339 | const SmallVectorImpl<SDValue> &OutVals, |
3340 | const SDLoc &dl, SelectionDAG &DAG) const { |
3341 | const MachineFunction &MF = DAG.getMachineFunction(); |
3342 | const Function &F = MF.getFunction(); |
3343 | Type *RetTy = MF.getFunction().getReturnType(); |
3344 | |
3345 | bool isABI = (STI.getSmVersion() >= 20); |
3346 | assert(isABI && "Non-ABI compilation is not supported" ); |
3347 | if (!isABI) |
3348 | return Chain; |
3349 | |
3350 | const DataLayout &DL = DAG.getDataLayout(); |
3351 | SmallVector<SDValue, 16> PromotedOutVals; |
3352 | SmallVector<EVT, 16> VTs; |
3353 | SmallVector<uint64_t, 16> Offsets; |
3354 | ComputePTXValueVTs(TLI: *this, DL, Ty: RetTy, ValueVTs&: VTs, Offsets: &Offsets); |
3355 | assert(VTs.size() == OutVals.size() && "Bad return value decomposition" ); |
3356 | |
3357 | for (unsigned i = 0, e = VTs.size(); i != e; ++i) { |
3358 | SDValue PromotedOutVal = OutVals[i]; |
3359 | MVT PromotedVT; |
3360 | if (PromoteScalarIntegerPTX(VT: VTs[i], PromotedVT: &PromotedVT)) { |
3361 | VTs[i] = EVT(PromotedVT); |
3362 | } |
3363 | if (PromoteScalarIntegerPTX(VT: PromotedOutVal.getValueType(), PromotedVT: &PromotedVT)) { |
3364 | llvm::ISD::NodeType Ext = |
3365 | Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; |
3366 | PromotedOutVal = DAG.getNode(Opcode: Ext, DL: dl, VT: PromotedVT, Operand: PromotedOutVal); |
3367 | } |
3368 | PromotedOutVals.push_back(Elt: PromotedOutVal); |
3369 | } |
3370 | |
3371 | auto VectorInfo = VectorizePTXValueVTs( |
3372 | ValueVTs: VTs, Offsets, |
3373 | ParamAlignment: RetTy->isSized() ? getFunctionParamOptimizedAlign(F: &F, ArgTy: RetTy, DL) |
3374 | : Align(1)); |
3375 | |
3376 | // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than |
3377 | // 32-bits are sign extended or zero extended, depending on whether |
3378 | // they are signed or unsigned types. |
3379 | bool ExtendIntegerRetVal = |
3380 | RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty: RetTy) < 32; |
3381 | |
3382 | SmallVector<SDValue, 6> StoreOperands; |
3383 | for (unsigned i = 0, e = VTs.size(); i != e; ++i) { |
3384 | SDValue OutVal = OutVals[i]; |
3385 | SDValue RetVal = PromotedOutVals[i]; |
3386 | |
3387 | if (ExtendIntegerRetVal) { |
3388 | RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND |
3389 | : ISD::ZERO_EXTEND, |
3390 | dl, MVT::i32, RetVal); |
3391 | } else if (OutVal.getValueSizeInBits() < 16) { |
3392 | // Use 16-bit registers for small load-stores as it's the |
3393 | // smallest general purpose register size supported by NVPTX. |
3394 | RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal); |
3395 | } |
3396 | |
3397 | // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned |
3398 | // for a scalar store. In such cases, fall back to byte stores. |
3399 | if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { |
3400 | EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; |
3401 | Align ElementTypeAlign = |
3402 | DL.getABITypeAlign(Ty: ElementType.getTypeForEVT(Context&: RetTy->getContext())); |
3403 | Align ElementAlign = |
3404 | commonAlignment(A: DL.getABITypeAlign(Ty: RetTy), Offset: Offsets[i]); |
3405 | if (ElementAlign < ElementTypeAlign) { |
3406 | assert(StoreOperands.empty() && "Orphaned operand list." ); |
3407 | Chain = LowerUnalignedStoreRet(DAG, Chain, Offset: Offsets[i], ElementType, |
3408 | RetVal, dl); |
3409 | |
3410 | // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes |
3411 | // into the graph, so just move on to the next element. |
3412 | continue; |
3413 | } |
3414 | } |
3415 | |
3416 | // New load/store. Record chain and offset operands. |
3417 | if (VectorInfo[i] & PVF_FIRST) { |
3418 | assert(StoreOperands.empty() && "Orphaned operand list." ); |
3419 | StoreOperands.push_back(Elt: Chain); |
3420 | StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32)); |
3421 | } |
3422 | |
3423 | // Record the value to return. |
3424 | StoreOperands.push_back(Elt: RetVal); |
3425 | |
3426 | // That's the last element of this store op. |
3427 | if (VectorInfo[i] & PVF_LAST) { |
3428 | NVPTXISD::NodeType Op; |
3429 | unsigned NumElts = StoreOperands.size() - 2; |
3430 | switch (NumElts) { |
3431 | case 1: |
3432 | Op = NVPTXISD::StoreRetval; |
3433 | break; |
3434 | case 2: |
3435 | Op = NVPTXISD::StoreRetvalV2; |
3436 | break; |
3437 | case 4: |
3438 | Op = NVPTXISD::StoreRetvalV4; |
3439 | break; |
3440 | default: |
3441 | llvm_unreachable("Invalid vector info." ); |
3442 | } |
3443 | |
3444 | // Adjust type of load/store op if we've extended the scalar |
3445 | // return value. |
3446 | EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; |
3447 | Chain = DAG.getMemIntrinsicNode( |
3448 | Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, |
3449 | MachinePointerInfo(), Align(1), MachineMemOperand::MOStore); |
3450 | // Cleanup vector state. |
3451 | StoreOperands.clear(); |
3452 | } |
3453 | } |
3454 | |
3455 | return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain); |
3456 | } |
3457 | |
3458 | void NVPTXTargetLowering::LowerAsmOperandForConstraint( |
3459 | SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops, |
3460 | SelectionDAG &DAG) const { |
3461 | if (Constraint.size() > 1) |
3462 | return; |
3463 | TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); |
3464 | } |
3465 | |
3466 | static unsigned getOpcForTextureInstr(unsigned Intrinsic) { |
3467 | switch (Intrinsic) { |
3468 | default: |
3469 | return 0; |
3470 | |
3471 | case Intrinsic::nvvm_tex_1d_v4f32_s32: |
3472 | return NVPTXISD::Tex1DFloatS32; |
3473 | case Intrinsic::nvvm_tex_1d_v4f32_f32: |
3474 | return NVPTXISD::Tex1DFloatFloat; |
3475 | case Intrinsic::nvvm_tex_1d_level_v4f32_f32: |
3476 | return NVPTXISD::Tex1DFloatFloatLevel; |
3477 | case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: |
3478 | return NVPTXISD::Tex1DFloatFloatGrad; |
3479 | case Intrinsic::nvvm_tex_1d_v4s32_s32: |
3480 | return NVPTXISD::Tex1DS32S32; |
3481 | case Intrinsic::nvvm_tex_1d_v4s32_f32: |
3482 | return NVPTXISD::Tex1DS32Float; |
3483 | case Intrinsic::nvvm_tex_1d_level_v4s32_f32: |
3484 | return NVPTXISD::Tex1DS32FloatLevel; |
3485 | case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: |
3486 | return NVPTXISD::Tex1DS32FloatGrad; |
3487 | case Intrinsic::nvvm_tex_1d_v4u32_s32: |
3488 | return NVPTXISD::Tex1DU32S32; |
3489 | case Intrinsic::nvvm_tex_1d_v4u32_f32: |
3490 | return NVPTXISD::Tex1DU32Float; |
3491 | case Intrinsic::nvvm_tex_1d_level_v4u32_f32: |
3492 | return NVPTXISD::Tex1DU32FloatLevel; |
3493 | case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: |
3494 | return NVPTXISD::Tex1DU32FloatGrad; |
3495 | |
3496 | case Intrinsic::nvvm_tex_1d_array_v4f32_s32: |
3497 | return NVPTXISD::Tex1DArrayFloatS32; |
3498 | case Intrinsic::nvvm_tex_1d_array_v4f32_f32: |
3499 | return NVPTXISD::Tex1DArrayFloatFloat; |
3500 | case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: |
3501 | return NVPTXISD::Tex1DArrayFloatFloatLevel; |
3502 | case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: |
3503 | return NVPTXISD::Tex1DArrayFloatFloatGrad; |
3504 | case Intrinsic::nvvm_tex_1d_array_v4s32_s32: |
3505 | return NVPTXISD::Tex1DArrayS32S32; |
3506 | case Intrinsic::nvvm_tex_1d_array_v4s32_f32: |
3507 | return NVPTXISD::Tex1DArrayS32Float; |
3508 | case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: |
3509 | return NVPTXISD::Tex1DArrayS32FloatLevel; |
3510 | case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: |
3511 | return NVPTXISD::Tex1DArrayS32FloatGrad; |
3512 | case Intrinsic::nvvm_tex_1d_array_v4u32_s32: |
3513 | return NVPTXISD::Tex1DArrayU32S32; |
3514 | case Intrinsic::nvvm_tex_1d_array_v4u32_f32: |
3515 | return NVPTXISD::Tex1DArrayU32Float; |
3516 | case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: |
3517 | return NVPTXISD::Tex1DArrayU32FloatLevel; |
3518 | case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: |
3519 | return NVPTXISD::Tex1DArrayU32FloatGrad; |
3520 | |
3521 | case Intrinsic::nvvm_tex_2d_v4f32_s32: |
3522 | return NVPTXISD::Tex2DFloatS32; |
3523 | case Intrinsic::nvvm_tex_2d_v4f32_f32: |
3524 | return NVPTXISD::Tex2DFloatFloat; |
3525 | case Intrinsic::nvvm_tex_2d_level_v4f32_f32: |
3526 | return NVPTXISD::Tex2DFloatFloatLevel; |
3527 | case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: |
3528 | return NVPTXISD::Tex2DFloatFloatGrad; |
3529 | case Intrinsic::nvvm_tex_2d_v4s32_s32: |
3530 | return NVPTXISD::Tex2DS32S32; |
3531 | case Intrinsic::nvvm_tex_2d_v4s32_f32: |
3532 | return NVPTXISD::Tex2DS32Float; |
3533 | case Intrinsic::nvvm_tex_2d_level_v4s32_f32: |
3534 | return NVPTXISD::Tex2DS32FloatLevel; |
3535 | case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: |
3536 | return NVPTXISD::Tex2DS32FloatGrad; |
3537 | case Intrinsic::nvvm_tex_2d_v4u32_s32: |
3538 | return NVPTXISD::Tex2DU32S32; |
3539 | case Intrinsic::nvvm_tex_2d_v4u32_f32: |
3540 | return NVPTXISD::Tex2DU32Float; |
3541 | case Intrinsic::nvvm_tex_2d_level_v4u32_f32: |
3542 | return NVPTXISD::Tex2DU32FloatLevel; |
3543 | case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: |
3544 | return NVPTXISD::Tex2DU32FloatGrad; |
3545 | |
3546 | case Intrinsic::nvvm_tex_2d_array_v4f32_s32: |
3547 | return NVPTXISD::Tex2DArrayFloatS32; |
3548 | case Intrinsic::nvvm_tex_2d_array_v4f32_f32: |
3549 | return NVPTXISD::Tex2DArrayFloatFloat; |
3550 | case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: |
3551 | return NVPTXISD::Tex2DArrayFloatFloatLevel; |
3552 | case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: |
3553 | return NVPTXISD::Tex2DArrayFloatFloatGrad; |
3554 | case Intrinsic::nvvm_tex_2d_array_v4s32_s32: |
3555 | return NVPTXISD::Tex2DArrayS32S32; |
3556 | case Intrinsic::nvvm_tex_2d_array_v4s32_f32: |
3557 | return NVPTXISD::Tex2DArrayS32Float; |
3558 | case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: |
3559 | return NVPTXISD::Tex2DArrayS32FloatLevel; |
3560 | case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: |
3561 | return NVPTXISD::Tex2DArrayS32FloatGrad; |
3562 | case Intrinsic::nvvm_tex_2d_array_v4u32_s32: |
3563 | return NVPTXISD::Tex2DArrayU32S32; |
3564 | case Intrinsic::nvvm_tex_2d_array_v4u32_f32: |
3565 | return NVPTXISD::Tex2DArrayU32Float; |
3566 | case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: |
3567 | return NVPTXISD::Tex2DArrayU32FloatLevel; |
3568 | case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: |
3569 | return NVPTXISD::Tex2DArrayU32FloatGrad; |
3570 | |
3571 | case Intrinsic::nvvm_tex_3d_v4f32_s32: |
3572 | return NVPTXISD::Tex3DFloatS32; |
3573 | case Intrinsic::nvvm_tex_3d_v4f32_f32: |
3574 | return NVPTXISD::Tex3DFloatFloat; |
3575 | case Intrinsic::nvvm_tex_3d_level_v4f32_f32: |
3576 | return NVPTXISD::Tex3DFloatFloatLevel; |
3577 | case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: |
3578 | return NVPTXISD::Tex3DFloatFloatGrad; |
3579 | case Intrinsic::nvvm_tex_3d_v4s32_s32: |
3580 | return NVPTXISD::Tex3DS32S32; |
3581 | case Intrinsic::nvvm_tex_3d_v4s32_f32: |
3582 | return NVPTXISD::Tex3DS32Float; |
3583 | case Intrinsic::nvvm_tex_3d_level_v4s32_f32: |
3584 | return NVPTXISD::Tex3DS32FloatLevel; |
3585 | case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: |
3586 | return NVPTXISD::Tex3DS32FloatGrad; |
3587 | case Intrinsic::nvvm_tex_3d_v4u32_s32: |
3588 | return NVPTXISD::Tex3DU32S32; |
3589 | case Intrinsic::nvvm_tex_3d_v4u32_f32: |
3590 | return NVPTXISD::Tex3DU32Float; |
3591 | case Intrinsic::nvvm_tex_3d_level_v4u32_f32: |
3592 | return NVPTXISD::Tex3DU32FloatLevel; |
3593 | case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: |
3594 | return NVPTXISD::Tex3DU32FloatGrad; |
3595 | |
3596 | case Intrinsic::nvvm_tex_cube_v4f32_f32: |
3597 | return NVPTXISD::TexCubeFloatFloat; |
3598 | case Intrinsic::nvvm_tex_cube_level_v4f32_f32: |
3599 | return NVPTXISD::TexCubeFloatFloatLevel; |
3600 | case Intrinsic::nvvm_tex_cube_v4s32_f32: |
3601 | return NVPTXISD::TexCubeS32Float; |
3602 | case Intrinsic::nvvm_tex_cube_level_v4s32_f32: |
3603 | return NVPTXISD::TexCubeS32FloatLevel; |
3604 | case Intrinsic::nvvm_tex_cube_v4u32_f32: |
3605 | return NVPTXISD::TexCubeU32Float; |
3606 | case Intrinsic::nvvm_tex_cube_level_v4u32_f32: |
3607 | return NVPTXISD::TexCubeU32FloatLevel; |
3608 | |
3609 | case Intrinsic::nvvm_tex_cube_array_v4f32_f32: |
3610 | return NVPTXISD::TexCubeArrayFloatFloat; |
3611 | case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: |
3612 | return NVPTXISD::TexCubeArrayFloatFloatLevel; |
3613 | case Intrinsic::nvvm_tex_cube_array_v4s32_f32: |
3614 | return NVPTXISD::TexCubeArrayS32Float; |
3615 | case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: |
3616 | return NVPTXISD::TexCubeArrayS32FloatLevel; |
3617 | case Intrinsic::nvvm_tex_cube_array_v4u32_f32: |
3618 | return NVPTXISD::TexCubeArrayU32Float; |
3619 | case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: |
3620 | return NVPTXISD::TexCubeArrayU32FloatLevel; |
3621 | |
3622 | case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: |
3623 | return NVPTXISD::Tld4R2DFloatFloat; |
3624 | case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: |
3625 | return NVPTXISD::Tld4G2DFloatFloat; |
3626 | case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: |
3627 | return NVPTXISD::Tld4B2DFloatFloat; |
3628 | case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: |
3629 | return NVPTXISD::Tld4A2DFloatFloat; |
3630 | case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: |
3631 | return NVPTXISD::Tld4R2DS64Float; |
3632 | case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: |
3633 | return NVPTXISD::Tld4G2DS64Float; |
3634 | case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: |
3635 | return NVPTXISD::Tld4B2DS64Float; |
3636 | case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: |
3637 | return NVPTXISD::Tld4A2DS64Float; |
3638 | case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: |
3639 | return NVPTXISD::Tld4R2DU64Float; |
3640 | case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: |
3641 | return NVPTXISD::Tld4G2DU64Float; |
3642 | case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: |
3643 | return NVPTXISD::Tld4B2DU64Float; |
3644 | case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: |
3645 | return NVPTXISD::Tld4A2DU64Float; |
3646 | |
3647 | case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: |
3648 | return NVPTXISD::TexUnified1DFloatS32; |
3649 | case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: |
3650 | return NVPTXISD::TexUnified1DFloatFloat; |
3651 | case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: |
3652 | return NVPTXISD::TexUnified1DFloatFloatLevel; |
3653 | case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: |
3654 | return NVPTXISD::TexUnified1DFloatFloatGrad; |
3655 | case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: |
3656 | return NVPTXISD::TexUnified1DS32S32; |
3657 | case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: |
3658 | return NVPTXISD::TexUnified1DS32Float; |
3659 | case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: |
3660 | return NVPTXISD::TexUnified1DS32FloatLevel; |
3661 | case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: |
3662 | return NVPTXISD::TexUnified1DS32FloatGrad; |
3663 | case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: |
3664 | return NVPTXISD::TexUnified1DU32S32; |
3665 | case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: |
3666 | return NVPTXISD::TexUnified1DU32Float; |
3667 | case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: |
3668 | return NVPTXISD::TexUnified1DU32FloatLevel; |
3669 | case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: |
3670 | return NVPTXISD::TexUnified1DU32FloatGrad; |
3671 | |
3672 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: |
3673 | return NVPTXISD::TexUnified1DArrayFloatS32; |
3674 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: |
3675 | return NVPTXISD::TexUnified1DArrayFloatFloat; |
3676 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: |
3677 | return NVPTXISD::TexUnified1DArrayFloatFloatLevel; |
3678 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: |
3679 | return NVPTXISD::TexUnified1DArrayFloatFloatGrad; |
3680 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: |
3681 | return NVPTXISD::TexUnified1DArrayS32S32; |
3682 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: |
3683 | return NVPTXISD::TexUnified1DArrayS32Float; |
3684 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: |
3685 | return NVPTXISD::TexUnified1DArrayS32FloatLevel; |
3686 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: |
3687 | return NVPTXISD::TexUnified1DArrayS32FloatGrad; |
3688 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: |
3689 | return NVPTXISD::TexUnified1DArrayU32S32; |
3690 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: |
3691 | return NVPTXISD::TexUnified1DArrayU32Float; |
3692 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: |
3693 | return NVPTXISD::TexUnified1DArrayU32FloatLevel; |
3694 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: |
3695 | return NVPTXISD::TexUnified1DArrayU32FloatGrad; |
3696 | |
3697 | case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: |
3698 | return NVPTXISD::TexUnified2DFloatS32; |
3699 | case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: |
3700 | return NVPTXISD::TexUnified2DFloatFloat; |
3701 | case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: |
3702 | return NVPTXISD::TexUnified2DFloatFloatLevel; |
3703 | case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: |
3704 | return NVPTXISD::TexUnified2DFloatFloatGrad; |
3705 | case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: |
3706 | return NVPTXISD::TexUnified2DS32S32; |
3707 | case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: |
3708 | return NVPTXISD::TexUnified2DS32Float; |
3709 | case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: |
3710 | return NVPTXISD::TexUnified2DS32FloatLevel; |
3711 | case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: |
3712 | return NVPTXISD::TexUnified2DS32FloatGrad; |
3713 | case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: |
3714 | return NVPTXISD::TexUnified2DU32S32; |
3715 | case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: |
3716 | return NVPTXISD::TexUnified2DU32Float; |
3717 | case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: |
3718 | return NVPTXISD::TexUnified2DU32FloatLevel; |
3719 | case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: |
3720 | return NVPTXISD::TexUnified2DU32FloatGrad; |
3721 | |
3722 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: |
3723 | return NVPTXISD::TexUnified2DArrayFloatS32; |
3724 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: |
3725 | return NVPTXISD::TexUnified2DArrayFloatFloat; |
3726 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: |
3727 | return NVPTXISD::TexUnified2DArrayFloatFloatLevel; |
3728 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: |
3729 | return NVPTXISD::TexUnified2DArrayFloatFloatGrad; |
3730 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: |
3731 | return NVPTXISD::TexUnified2DArrayS32S32; |
3732 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: |
3733 | return NVPTXISD::TexUnified2DArrayS32Float; |
3734 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: |
3735 | return NVPTXISD::TexUnified2DArrayS32FloatLevel; |
3736 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: |
3737 | return NVPTXISD::TexUnified2DArrayS32FloatGrad; |
3738 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: |
3739 | return NVPTXISD::TexUnified2DArrayU32S32; |
3740 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: |
3741 | return NVPTXISD::TexUnified2DArrayU32Float; |
3742 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: |
3743 | return NVPTXISD::TexUnified2DArrayU32FloatLevel; |
3744 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: |
3745 | return NVPTXISD::TexUnified2DArrayU32FloatGrad; |
3746 | |
3747 | case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: |
3748 | return NVPTXISD::TexUnified3DFloatS32; |
3749 | case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: |
3750 | return NVPTXISD::TexUnified3DFloatFloat; |
3751 | case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: |
3752 | return NVPTXISD::TexUnified3DFloatFloatLevel; |
3753 | case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: |
3754 | return NVPTXISD::TexUnified3DFloatFloatGrad; |
3755 | case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: |
3756 | return NVPTXISD::TexUnified3DS32S32; |
3757 | case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: |
3758 | return NVPTXISD::TexUnified3DS32Float; |
3759 | case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: |
3760 | return NVPTXISD::TexUnified3DS32FloatLevel; |
3761 | case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: |
3762 | return NVPTXISD::TexUnified3DS32FloatGrad; |
3763 | case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: |
3764 | return NVPTXISD::TexUnified3DU32S32; |
3765 | case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: |
3766 | return NVPTXISD::TexUnified3DU32Float; |
3767 | case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: |
3768 | return NVPTXISD::TexUnified3DU32FloatLevel; |
3769 | case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: |
3770 | return NVPTXISD::TexUnified3DU32FloatGrad; |
3771 | |
3772 | case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: |
3773 | return NVPTXISD::TexUnifiedCubeFloatFloat; |
3774 | case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: |
3775 | return NVPTXISD::TexUnifiedCubeFloatFloatLevel; |
3776 | case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: |
3777 | return NVPTXISD::TexUnifiedCubeS32Float; |
3778 | case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: |
3779 | return NVPTXISD::TexUnifiedCubeS32FloatLevel; |
3780 | case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: |
3781 | return NVPTXISD::TexUnifiedCubeU32Float; |
3782 | case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: |
3783 | return NVPTXISD::TexUnifiedCubeU32FloatLevel; |
3784 | |
3785 | case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: |
3786 | return NVPTXISD::TexUnifiedCubeArrayFloatFloat; |
3787 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: |
3788 | return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel; |
3789 | case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: |
3790 | return NVPTXISD::TexUnifiedCubeArrayS32Float; |
3791 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: |
3792 | return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel; |
3793 | case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: |
3794 | return NVPTXISD::TexUnifiedCubeArrayU32Float; |
3795 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: |
3796 | return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel; |
3797 | |
3798 | case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: |
3799 | return NVPTXISD::TexUnifiedCubeFloatFloatGrad; |
3800 | case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: |
3801 | return NVPTXISD::TexUnifiedCubeS32FloatGrad; |
3802 | case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: |
3803 | return NVPTXISD::TexUnifiedCubeU32FloatGrad; |
3804 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: |
3805 | return NVPTXISD::TexUnifiedCubeArrayFloatFloatGrad; |
3806 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: |
3807 | return NVPTXISD::TexUnifiedCubeArrayS32FloatGrad; |
3808 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: |
3809 | return NVPTXISD::TexUnifiedCubeArrayU32FloatGrad; |
3810 | |
3811 | case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: |
3812 | return NVPTXISD::Tld4UnifiedR2DFloatFloat; |
3813 | case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: |
3814 | return NVPTXISD::Tld4UnifiedG2DFloatFloat; |
3815 | case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: |
3816 | return NVPTXISD::Tld4UnifiedB2DFloatFloat; |
3817 | case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: |
3818 | return NVPTXISD::Tld4UnifiedA2DFloatFloat; |
3819 | case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: |
3820 | return NVPTXISD::Tld4UnifiedR2DS64Float; |
3821 | case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: |
3822 | return NVPTXISD::Tld4UnifiedG2DS64Float; |
3823 | case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: |
3824 | return NVPTXISD::Tld4UnifiedB2DS64Float; |
3825 | case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: |
3826 | return NVPTXISD::Tld4UnifiedA2DS64Float; |
3827 | case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: |
3828 | return NVPTXISD::Tld4UnifiedR2DU64Float; |
3829 | case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: |
3830 | return NVPTXISD::Tld4UnifiedG2DU64Float; |
3831 | case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: |
3832 | return NVPTXISD::Tld4UnifiedB2DU64Float; |
3833 | case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: |
3834 | return NVPTXISD::Tld4UnifiedA2DU64Float; |
3835 | } |
3836 | } |
3837 | |
3838 | static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) { |
3839 | switch (Intrinsic) { |
3840 | default: |
3841 | return 0; |
3842 | case Intrinsic::nvvm_suld_1d_i8_clamp: |
3843 | return NVPTXISD::Suld1DI8Clamp; |
3844 | case Intrinsic::nvvm_suld_1d_i16_clamp: |
3845 | return NVPTXISD::Suld1DI16Clamp; |
3846 | case Intrinsic::nvvm_suld_1d_i32_clamp: |
3847 | return NVPTXISD::Suld1DI32Clamp; |
3848 | case Intrinsic::nvvm_suld_1d_i64_clamp: |
3849 | return NVPTXISD::Suld1DI64Clamp; |
3850 | case Intrinsic::nvvm_suld_1d_v2i8_clamp: |
3851 | return NVPTXISD::Suld1DV2I8Clamp; |
3852 | case Intrinsic::nvvm_suld_1d_v2i16_clamp: |
3853 | return NVPTXISD::Suld1DV2I16Clamp; |
3854 | case Intrinsic::nvvm_suld_1d_v2i32_clamp: |
3855 | return NVPTXISD::Suld1DV2I32Clamp; |
3856 | case Intrinsic::nvvm_suld_1d_v2i64_clamp: |
3857 | return NVPTXISD::Suld1DV2I64Clamp; |
3858 | case Intrinsic::nvvm_suld_1d_v4i8_clamp: |
3859 | return NVPTXISD::Suld1DV4I8Clamp; |
3860 | case Intrinsic::nvvm_suld_1d_v4i16_clamp: |
3861 | return NVPTXISD::Suld1DV4I16Clamp; |
3862 | case Intrinsic::nvvm_suld_1d_v4i32_clamp: |
3863 | return NVPTXISD::Suld1DV4I32Clamp; |
3864 | case Intrinsic::nvvm_suld_1d_array_i8_clamp: |
3865 | return NVPTXISD::Suld1DArrayI8Clamp; |
3866 | case Intrinsic::nvvm_suld_1d_array_i16_clamp: |
3867 | return NVPTXISD::Suld1DArrayI16Clamp; |
3868 | case Intrinsic::nvvm_suld_1d_array_i32_clamp: |
3869 | return NVPTXISD::Suld1DArrayI32Clamp; |
3870 | case Intrinsic::nvvm_suld_1d_array_i64_clamp: |
3871 | return NVPTXISD::Suld1DArrayI64Clamp; |
3872 | case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: |
3873 | return NVPTXISD::Suld1DArrayV2I8Clamp; |
3874 | case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: |
3875 | return NVPTXISD::Suld1DArrayV2I16Clamp; |
3876 | case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: |
3877 | return NVPTXISD::Suld1DArrayV2I32Clamp; |
3878 | case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: |
3879 | return NVPTXISD::Suld1DArrayV2I64Clamp; |
3880 | case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: |
3881 | return NVPTXISD::Suld1DArrayV4I8Clamp; |
3882 | case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: |
3883 | return NVPTXISD::Suld1DArrayV4I16Clamp; |
3884 | case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: |
3885 | return NVPTXISD::Suld1DArrayV4I32Clamp; |
3886 | case Intrinsic::nvvm_suld_2d_i8_clamp: |
3887 | return NVPTXISD::Suld2DI8Clamp; |
3888 | case Intrinsic::nvvm_suld_2d_i16_clamp: |
3889 | return NVPTXISD::Suld2DI16Clamp; |
3890 | case Intrinsic::nvvm_suld_2d_i32_clamp: |
3891 | return NVPTXISD::Suld2DI32Clamp; |
3892 | case Intrinsic::nvvm_suld_2d_i64_clamp: |
3893 | return NVPTXISD::Suld2DI64Clamp; |
3894 | case Intrinsic::nvvm_suld_2d_v2i8_clamp: |
3895 | return NVPTXISD::Suld2DV2I8Clamp; |
3896 | case Intrinsic::nvvm_suld_2d_v2i16_clamp: |
3897 | return NVPTXISD::Suld2DV2I16Clamp; |
3898 | case Intrinsic::nvvm_suld_2d_v2i32_clamp: |
3899 | return NVPTXISD::Suld2DV2I32Clamp; |
3900 | case Intrinsic::nvvm_suld_2d_v2i64_clamp: |
3901 | return NVPTXISD::Suld2DV2I64Clamp; |
3902 | case Intrinsic::nvvm_suld_2d_v4i8_clamp: |
3903 | return NVPTXISD::Suld2DV4I8Clamp; |
3904 | case Intrinsic::nvvm_suld_2d_v4i16_clamp: |
3905 | return NVPTXISD::Suld2DV4I16Clamp; |
3906 | case Intrinsic::nvvm_suld_2d_v4i32_clamp: |
3907 | return NVPTXISD::Suld2DV4I32Clamp; |
3908 | case Intrinsic::nvvm_suld_2d_array_i8_clamp: |
3909 | return NVPTXISD::Suld2DArrayI8Clamp; |
3910 | case Intrinsic::nvvm_suld_2d_array_i16_clamp: |
3911 | return NVPTXISD::Suld2DArrayI16Clamp; |
3912 | case Intrinsic::nvvm_suld_2d_array_i32_clamp: |
3913 | return NVPTXISD::Suld2DArrayI32Clamp; |
3914 | case Intrinsic::nvvm_suld_2d_array_i64_clamp: |
3915 | return NVPTXISD::Suld2DArrayI64Clamp; |
3916 | case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: |
3917 | return NVPTXISD::Suld2DArrayV2I8Clamp; |
3918 | case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: |
3919 | return NVPTXISD::Suld2DArrayV2I16Clamp; |
3920 | case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: |
3921 | return NVPTXISD::Suld2DArrayV2I32Clamp; |
3922 | case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: |
3923 | return NVPTXISD::Suld2DArrayV2I64Clamp; |
3924 | case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: |
3925 | return NVPTXISD::Suld2DArrayV4I8Clamp; |
3926 | case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: |
3927 | return NVPTXISD::Suld2DArrayV4I16Clamp; |
3928 | case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: |
3929 | return NVPTXISD::Suld2DArrayV4I32Clamp; |
3930 | case Intrinsic::nvvm_suld_3d_i8_clamp: |
3931 | return NVPTXISD::Suld3DI8Clamp; |
3932 | case Intrinsic::nvvm_suld_3d_i16_clamp: |
3933 | return NVPTXISD::Suld3DI16Clamp; |
3934 | case Intrinsic::nvvm_suld_3d_i32_clamp: |
3935 | return NVPTXISD::Suld3DI32Clamp; |
3936 | case Intrinsic::nvvm_suld_3d_i64_clamp: |
3937 | return NVPTXISD::Suld3DI64Clamp; |
3938 | case Intrinsic::nvvm_suld_3d_v2i8_clamp: |
3939 | return NVPTXISD::Suld3DV2I8Clamp; |
3940 | case Intrinsic::nvvm_suld_3d_v2i16_clamp: |
3941 | return NVPTXISD::Suld3DV2I16Clamp; |
3942 | case Intrinsic::nvvm_suld_3d_v2i32_clamp: |
3943 | return NVPTXISD::Suld3DV2I32Clamp; |
3944 | case Intrinsic::nvvm_suld_3d_v2i64_clamp: |
3945 | return NVPTXISD::Suld3DV2I64Clamp; |
3946 | case Intrinsic::nvvm_suld_3d_v4i8_clamp: |
3947 | return NVPTXISD::Suld3DV4I8Clamp; |
3948 | case Intrinsic::nvvm_suld_3d_v4i16_clamp: |
3949 | return NVPTXISD::Suld3DV4I16Clamp; |
3950 | case Intrinsic::nvvm_suld_3d_v4i32_clamp: |
3951 | return NVPTXISD::Suld3DV4I32Clamp; |
3952 | case Intrinsic::nvvm_suld_1d_i8_trap: |
3953 | return NVPTXISD::Suld1DI8Trap; |
3954 | case Intrinsic::nvvm_suld_1d_i16_trap: |
3955 | return NVPTXISD::Suld1DI16Trap; |
3956 | case Intrinsic::nvvm_suld_1d_i32_trap: |
3957 | return NVPTXISD::Suld1DI32Trap; |
3958 | case Intrinsic::nvvm_suld_1d_i64_trap: |
3959 | return NVPTXISD::Suld1DI64Trap; |
3960 | case Intrinsic::nvvm_suld_1d_v2i8_trap: |
3961 | return NVPTXISD::Suld1DV2I8Trap; |
3962 | case Intrinsic::nvvm_suld_1d_v2i16_trap: |
3963 | return NVPTXISD::Suld1DV2I16Trap; |
3964 | case Intrinsic::nvvm_suld_1d_v2i32_trap: |
3965 | return NVPTXISD::Suld1DV2I32Trap; |
3966 | case Intrinsic::nvvm_suld_1d_v2i64_trap: |
3967 | return NVPTXISD::Suld1DV2I64Trap; |
3968 | case Intrinsic::nvvm_suld_1d_v4i8_trap: |
3969 | return NVPTXISD::Suld1DV4I8Trap; |
3970 | case Intrinsic::nvvm_suld_1d_v4i16_trap: |
3971 | return NVPTXISD::Suld1DV4I16Trap; |
3972 | case Intrinsic::nvvm_suld_1d_v4i32_trap: |
3973 | return NVPTXISD::Suld1DV4I32Trap; |
3974 | case Intrinsic::nvvm_suld_1d_array_i8_trap: |
3975 | return NVPTXISD::Suld1DArrayI8Trap; |
3976 | case Intrinsic::nvvm_suld_1d_array_i16_trap: |
3977 | return NVPTXISD::Suld1DArrayI16Trap; |
3978 | case Intrinsic::nvvm_suld_1d_array_i32_trap: |
3979 | return NVPTXISD::Suld1DArrayI32Trap; |
3980 | case Intrinsic::nvvm_suld_1d_array_i64_trap: |
3981 | return NVPTXISD::Suld1DArrayI64Trap; |
3982 | case Intrinsic::nvvm_suld_1d_array_v2i8_trap: |
3983 | return NVPTXISD::Suld1DArrayV2I8Trap; |
3984 | case Intrinsic::nvvm_suld_1d_array_v2i16_trap: |
3985 | return NVPTXISD::Suld1DArrayV2I16Trap; |
3986 | case Intrinsic::nvvm_suld_1d_array_v2i32_trap: |
3987 | return NVPTXISD::Suld1DArrayV2I32Trap; |
3988 | case Intrinsic::nvvm_suld_1d_array_v2i64_trap: |
3989 | return NVPTXISD::Suld1DArrayV2I64Trap; |
3990 | case Intrinsic::nvvm_suld_1d_array_v4i8_trap: |
3991 | return NVPTXISD::Suld1DArrayV4I8Trap; |
3992 | case Intrinsic::nvvm_suld_1d_array_v4i16_trap: |
3993 | return NVPTXISD::Suld1DArrayV4I16Trap; |
3994 | case Intrinsic::nvvm_suld_1d_array_v4i32_trap: |
3995 | return NVPTXISD::Suld1DArrayV4I32Trap; |
3996 | case Intrinsic::nvvm_suld_2d_i8_trap: |
3997 | return NVPTXISD::Suld2DI8Trap; |
3998 | case Intrinsic::nvvm_suld_2d_i16_trap: |
3999 | return NVPTXISD::Suld2DI16Trap; |
4000 | case Intrinsic::nvvm_suld_2d_i32_trap: |
4001 | return NVPTXISD::Suld2DI32Trap; |
4002 | case Intrinsic::nvvm_suld_2d_i64_trap: |
4003 | return NVPTXISD::Suld2DI64Trap; |
4004 | case Intrinsic::nvvm_suld_2d_v2i8_trap: |
4005 | return NVPTXISD::Suld2DV2I8Trap; |
4006 | case Intrinsic::nvvm_suld_2d_v2i16_trap: |
4007 | return NVPTXISD::Suld2DV2I16Trap; |
4008 | case Intrinsic::nvvm_suld_2d_v2i32_trap: |
4009 | return NVPTXISD::Suld2DV2I32Trap; |
4010 | case Intrinsic::nvvm_suld_2d_v2i64_trap: |
4011 | return NVPTXISD::Suld2DV2I64Trap; |
4012 | case Intrinsic::nvvm_suld_2d_v4i8_trap: |
4013 | return NVPTXISD::Suld2DV4I8Trap; |
4014 | case Intrinsic::nvvm_suld_2d_v4i16_trap: |
4015 | return NVPTXISD::Suld2DV4I16Trap; |
4016 | case Intrinsic::nvvm_suld_2d_v4i32_trap: |
4017 | return NVPTXISD::Suld2DV4I32Trap; |
4018 | case Intrinsic::nvvm_suld_2d_array_i8_trap: |
4019 | return NVPTXISD::Suld2DArrayI8Trap; |
4020 | case Intrinsic::nvvm_suld_2d_array_i16_trap: |
4021 | return NVPTXISD::Suld2DArrayI16Trap; |
4022 | case Intrinsic::nvvm_suld_2d_array_i32_trap: |
4023 | return NVPTXISD::Suld2DArrayI32Trap; |
4024 | case Intrinsic::nvvm_suld_2d_array_i64_trap: |
4025 | return NVPTXISD::Suld2DArrayI64Trap; |
4026 | case Intrinsic::nvvm_suld_2d_array_v2i8_trap: |
4027 | return NVPTXISD::Suld2DArrayV2I8Trap; |
4028 | case Intrinsic::nvvm_suld_2d_array_v2i16_trap: |
4029 | return NVPTXISD::Suld2DArrayV2I16Trap; |
4030 | case Intrinsic::nvvm_suld_2d_array_v2i32_trap: |
4031 | return NVPTXISD::Suld2DArrayV2I32Trap; |
4032 | case Intrinsic::nvvm_suld_2d_array_v2i64_trap: |
4033 | return NVPTXISD::Suld2DArrayV2I64Trap; |
4034 | case Intrinsic::nvvm_suld_2d_array_v4i8_trap: |
4035 | return NVPTXISD::Suld2DArrayV4I8Trap; |
4036 | case Intrinsic::nvvm_suld_2d_array_v4i16_trap: |
4037 | return NVPTXISD::Suld2DArrayV4I16Trap; |
4038 | case Intrinsic::nvvm_suld_2d_array_v4i32_trap: |
4039 | return NVPTXISD::Suld2DArrayV4I32Trap; |
4040 | case Intrinsic::nvvm_suld_3d_i8_trap: |
4041 | return NVPTXISD::Suld3DI8Trap; |
4042 | case Intrinsic::nvvm_suld_3d_i16_trap: |
4043 | return NVPTXISD::Suld3DI16Trap; |
4044 | case Intrinsic::nvvm_suld_3d_i32_trap: |
4045 | return NVPTXISD::Suld3DI32Trap; |
4046 | case Intrinsic::nvvm_suld_3d_i64_trap: |
4047 | return NVPTXISD::Suld3DI64Trap; |
4048 | case Intrinsic::nvvm_suld_3d_v2i8_trap: |
4049 | return NVPTXISD::Suld3DV2I8Trap; |
4050 | case Intrinsic::nvvm_suld_3d_v2i16_trap: |
4051 | return NVPTXISD::Suld3DV2I16Trap; |
4052 | case Intrinsic::nvvm_suld_3d_v2i32_trap: |
4053 | return NVPTXISD::Suld3DV2I32Trap; |
4054 | case Intrinsic::nvvm_suld_3d_v2i64_trap: |
4055 | return NVPTXISD::Suld3DV2I64Trap; |
4056 | case Intrinsic::nvvm_suld_3d_v4i8_trap: |
4057 | return NVPTXISD::Suld3DV4I8Trap; |
4058 | case Intrinsic::nvvm_suld_3d_v4i16_trap: |
4059 | return NVPTXISD::Suld3DV4I16Trap; |
4060 | case Intrinsic::nvvm_suld_3d_v4i32_trap: |
4061 | return NVPTXISD::Suld3DV4I32Trap; |
4062 | case Intrinsic::nvvm_suld_1d_i8_zero: |
4063 | return NVPTXISD::Suld1DI8Zero; |
4064 | case Intrinsic::nvvm_suld_1d_i16_zero: |
4065 | return NVPTXISD::Suld1DI16Zero; |
4066 | case Intrinsic::nvvm_suld_1d_i32_zero: |
4067 | return NVPTXISD::Suld1DI32Zero; |
4068 | case Intrinsic::nvvm_suld_1d_i64_zero: |
4069 | return NVPTXISD::Suld1DI64Zero; |
4070 | case Intrinsic::nvvm_suld_1d_v2i8_zero: |
4071 | return NVPTXISD::Suld1DV2I8Zero; |
4072 | case Intrinsic::nvvm_suld_1d_v2i16_zero: |
4073 | return NVPTXISD::Suld1DV2I16Zero; |
4074 | case Intrinsic::nvvm_suld_1d_v2i32_zero: |
4075 | return NVPTXISD::Suld1DV2I32Zero; |
4076 | case Intrinsic::nvvm_suld_1d_v2i64_zero: |
4077 | return NVPTXISD::Suld1DV2I64Zero; |
4078 | case Intrinsic::nvvm_suld_1d_v4i8_zero: |
4079 | return NVPTXISD::Suld1DV4I8Zero; |
4080 | case Intrinsic::nvvm_suld_1d_v4i16_zero: |
4081 | return NVPTXISD::Suld1DV4I16Zero; |
4082 | case Intrinsic::nvvm_suld_1d_v4i32_zero: |
4083 | return NVPTXISD::Suld1DV4I32Zero; |
4084 | case Intrinsic::nvvm_suld_1d_array_i8_zero: |
4085 | return NVPTXISD::Suld1DArrayI8Zero; |
4086 | case Intrinsic::nvvm_suld_1d_array_i16_zero: |
4087 | return NVPTXISD::Suld1DArrayI16Zero; |
4088 | case Intrinsic::nvvm_suld_1d_array_i32_zero: |
4089 | return NVPTXISD::Suld1DArrayI32Zero; |
4090 | case Intrinsic::nvvm_suld_1d_array_i64_zero: |
4091 | return NVPTXISD::Suld1DArrayI64Zero; |
4092 | case Intrinsic::nvvm_suld_1d_array_v2i8_zero: |
4093 | return NVPTXISD::Suld1DArrayV2I8Zero; |
4094 | case Intrinsic::nvvm_suld_1d_array_v2i16_zero: |
4095 | return NVPTXISD::Suld1DArrayV2I16Zero; |
4096 | case Intrinsic::nvvm_suld_1d_array_v2i32_zero: |
4097 | return NVPTXISD::Suld1DArrayV2I32Zero; |
4098 | case Intrinsic::nvvm_suld_1d_array_v2i64_zero: |
4099 | return NVPTXISD::Suld1DArrayV2I64Zero; |
4100 | case Intrinsic::nvvm_suld_1d_array_v4i8_zero: |
4101 | return NVPTXISD::Suld1DArrayV4I8Zero; |
4102 | case Intrinsic::nvvm_suld_1d_array_v4i16_zero: |
4103 | return NVPTXISD::Suld1DArrayV4I16Zero; |
4104 | case Intrinsic::nvvm_suld_1d_array_v4i32_zero: |
4105 | return NVPTXISD::Suld1DArrayV4I32Zero; |
4106 | case Intrinsic::nvvm_suld_2d_i8_zero: |
4107 | return NVPTXISD::Suld2DI8Zero; |
4108 | case Intrinsic::nvvm_suld_2d_i16_zero: |
4109 | return NVPTXISD::Suld2DI16Zero; |
4110 | case Intrinsic::nvvm_suld_2d_i32_zero: |
4111 | return NVPTXISD::Suld2DI32Zero; |
4112 | case Intrinsic::nvvm_suld_2d_i64_zero: |
4113 | return NVPTXISD::Suld2DI64Zero; |
4114 | case Intrinsic::nvvm_suld_2d_v2i8_zero: |
4115 | return NVPTXISD::Suld2DV2I8Zero; |
4116 | case Intrinsic::nvvm_suld_2d_v2i16_zero: |
4117 | return NVPTXISD::Suld2DV2I16Zero; |
4118 | case Intrinsic::nvvm_suld_2d_v2i32_zero: |
4119 | return NVPTXISD::Suld2DV2I32Zero; |
4120 | case Intrinsic::nvvm_suld_2d_v2i64_zero: |
4121 | return NVPTXISD::Suld2DV2I64Zero; |
4122 | case Intrinsic::nvvm_suld_2d_v4i8_zero: |
4123 | return NVPTXISD::Suld2DV4I8Zero; |
4124 | case Intrinsic::nvvm_suld_2d_v4i16_zero: |
4125 | return NVPTXISD::Suld2DV4I16Zero; |
4126 | case Intrinsic::nvvm_suld_2d_v4i32_zero: |
4127 | return NVPTXISD::Suld2DV4I32Zero; |
4128 | case Intrinsic::nvvm_suld_2d_array_i8_zero: |
4129 | return NVPTXISD::Suld2DArrayI8Zero; |
4130 | case Intrinsic::nvvm_suld_2d_array_i16_zero: |
4131 | return NVPTXISD::Suld2DArrayI16Zero; |
4132 | case Intrinsic::nvvm_suld_2d_array_i32_zero: |
4133 | return NVPTXISD::Suld2DArrayI32Zero; |
4134 | case Intrinsic::nvvm_suld_2d_array_i64_zero: |
4135 | return NVPTXISD::Suld2DArrayI64Zero; |
4136 | case Intrinsic::nvvm_suld_2d_array_v2i8_zero: |
4137 | return NVPTXISD::Suld2DArrayV2I8Zero; |
4138 | case Intrinsic::nvvm_suld_2d_array_v2i16_zero: |
4139 | return NVPTXISD::Suld2DArrayV2I16Zero; |
4140 | case Intrinsic::nvvm_suld_2d_array_v2i32_zero: |
4141 | return NVPTXISD::Suld2DArrayV2I32Zero; |
4142 | case Intrinsic::nvvm_suld_2d_array_v2i64_zero: |
4143 | return NVPTXISD::Suld2DArrayV2I64Zero; |
4144 | case Intrinsic::nvvm_suld_2d_array_v4i8_zero: |
4145 | return NVPTXISD::Suld2DArrayV4I8Zero; |
4146 | case Intrinsic::nvvm_suld_2d_array_v4i16_zero: |
4147 | return NVPTXISD::Suld2DArrayV4I16Zero; |
4148 | case Intrinsic::nvvm_suld_2d_array_v4i32_zero: |
4149 | return NVPTXISD::Suld2DArrayV4I32Zero; |
4150 | case Intrinsic::nvvm_suld_3d_i8_zero: |
4151 | return NVPTXISD::Suld3DI8Zero; |
4152 | case Intrinsic::nvvm_suld_3d_i16_zero: |
4153 | return NVPTXISD::Suld3DI16Zero; |
4154 | case Intrinsic::nvvm_suld_3d_i32_zero: |
4155 | return NVPTXISD::Suld3DI32Zero; |
4156 | case Intrinsic::nvvm_suld_3d_i64_zero: |
4157 | return NVPTXISD::Suld3DI64Zero; |
4158 | case Intrinsic::nvvm_suld_3d_v2i8_zero: |
4159 | return NVPTXISD::Suld3DV2I8Zero; |
4160 | case Intrinsic::nvvm_suld_3d_v2i16_zero: |
4161 | return NVPTXISD::Suld3DV2I16Zero; |
4162 | case Intrinsic::nvvm_suld_3d_v2i32_zero: |
4163 | return NVPTXISD::Suld3DV2I32Zero; |
4164 | case Intrinsic::nvvm_suld_3d_v2i64_zero: |
4165 | return NVPTXISD::Suld3DV2I64Zero; |
4166 | case Intrinsic::nvvm_suld_3d_v4i8_zero: |
4167 | return NVPTXISD::Suld3DV4I8Zero; |
4168 | case Intrinsic::nvvm_suld_3d_v4i16_zero: |
4169 | return NVPTXISD::Suld3DV4I16Zero; |
4170 | case Intrinsic::nvvm_suld_3d_v4i32_zero: |
4171 | return NVPTXISD::Suld3DV4I32Zero; |
4172 | } |
4173 | } |
4174 | |
4175 | // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as |
4176 | // TgtMemIntrinsic |
4177 | // because we need the information that is only available in the "Value" type |
4178 | // of destination |
4179 | // pointer. In particular, the address space information. |
4180 | bool NVPTXTargetLowering::getTgtMemIntrinsic( |
4181 | IntrinsicInfo &Info, const CallInst &I, |
4182 | MachineFunction &MF, unsigned Intrinsic) const { |
4183 | switch (Intrinsic) { |
4184 | default: |
4185 | return false; |
4186 | case Intrinsic::nvvm_match_all_sync_i32p: |
4187 | case Intrinsic::nvvm_match_all_sync_i64p: |
4188 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4189 | // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute |
4190 | // in order to model data exchange with other threads, but perform no real |
4191 | // memory accesses. |
4192 | Info.memVT = MVT::i1; |
4193 | |
4194 | // Our result depends on both our and other thread's arguments. |
4195 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
4196 | return true; |
4197 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col: |
4198 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row: |
4199 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride: |
4200 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride: |
4201 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col: |
4202 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row: |
4203 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride: |
4204 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride: |
4205 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col: |
4206 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row: |
4207 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride: |
4208 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride: |
4209 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col: |
4210 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row: |
4211 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride: |
4212 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride: |
4213 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col: |
4214 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row: |
4215 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride: |
4216 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride: |
4217 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col: |
4218 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row: |
4219 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride: |
4220 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: { |
4221 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4222 | Info.memVT = MVT::v8f16; |
4223 | Info.ptrVal = I.getArgOperand(i: 0); |
4224 | Info.offset = 0; |
4225 | Info.flags = MachineMemOperand::MOLoad; |
4226 | Info.align = Align(16); |
4227 | return true; |
4228 | } |
4229 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: |
4230 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride: |
4231 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride: |
4232 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col: |
4233 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row: |
4234 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride: |
4235 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride: |
4236 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row: |
4237 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col: |
4238 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride: |
4239 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row: |
4240 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride: |
4241 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col: |
4242 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride: |
4243 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride: |
4244 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col: |
4245 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row: |
4246 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride: |
4247 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride: |
4248 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: |
4249 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col: |
4250 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride: |
4251 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row: |
4252 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: { |
4253 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4254 | Info.memVT = MVT::v2i32; |
4255 | Info.ptrVal = I.getArgOperand(i: 0); |
4256 | Info.offset = 0; |
4257 | Info.flags = MachineMemOperand::MOLoad; |
4258 | Info.align = Align(8); |
4259 | return true; |
4260 | } |
4261 | |
4262 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col: |
4263 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride: |
4264 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride: |
4265 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col: |
4266 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row: |
4267 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride: |
4268 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride: |
4269 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row: |
4270 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col: |
4271 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride: |
4272 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row: |
4273 | case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride: |
4274 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col: |
4275 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride: |
4276 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row: |
4277 | case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride: |
4278 | |
4279 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col: |
4280 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride: |
4281 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride: |
4282 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col: |
4283 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row: |
4284 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride: |
4285 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride: |
4286 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: |
4287 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col: |
4288 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride: |
4289 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row: |
4290 | case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride: |
4291 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col: |
4292 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride: |
4293 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row: |
4294 | case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride: |
4295 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16: |
4296 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: { |
4297 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4298 | Info.memVT = MVT::v4i32; |
4299 | Info.ptrVal = I.getArgOperand(i: 0); |
4300 | Info.offset = 0; |
4301 | Info.flags = MachineMemOperand::MOLoad; |
4302 | Info.align = Align(16); |
4303 | return true; |
4304 | } |
4305 | |
4306 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col: |
4307 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride: |
4308 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride: |
4309 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col: |
4310 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row: |
4311 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride: |
4312 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride: |
4313 | case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row: |
4314 | |
4315 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col: |
4316 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride: |
4317 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride: |
4318 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col: |
4319 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row: |
4320 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride: |
4321 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride: |
4322 | case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row: |
4323 | case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row: |
4324 | case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride: |
4325 | case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col: |
4326 | case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride: |
4327 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row: |
4328 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride: |
4329 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride: |
4330 | case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row: |
4331 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col: |
4332 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride: |
4333 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride: |
4334 | case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: |
4335 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16: |
4336 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: { |
4337 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4338 | Info.memVT = MVT::i32; |
4339 | Info.ptrVal = I.getArgOperand(i: 0); |
4340 | Info.offset = 0; |
4341 | Info.flags = MachineMemOperand::MOLoad; |
4342 | Info.align = Align(4); |
4343 | return true; |
4344 | } |
4345 | |
4346 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col: |
4347 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row: |
4348 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride: |
4349 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride: |
4350 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col: |
4351 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row: |
4352 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride: |
4353 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride: |
4354 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col: |
4355 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row: |
4356 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride: |
4357 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: { |
4358 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4359 | Info.memVT = MVT::v4f16; |
4360 | Info.ptrVal = I.getArgOperand(i: 0); |
4361 | Info.offset = 0; |
4362 | Info.flags = MachineMemOperand::MOLoad; |
4363 | Info.align = Align(16); |
4364 | return true; |
4365 | } |
4366 | |
4367 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col: |
4368 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row: |
4369 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride: |
4370 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride: |
4371 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col: |
4372 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row: |
4373 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride: |
4374 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride: |
4375 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col: |
4376 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row: |
4377 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride: |
4378 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: |
4379 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col: |
4380 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row: |
4381 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride: |
4382 | case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: { |
4383 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4384 | Info.memVT = MVT::v8f32; |
4385 | Info.ptrVal = I.getArgOperand(i: 0); |
4386 | Info.offset = 0; |
4387 | Info.flags = MachineMemOperand::MOLoad; |
4388 | Info.align = Align(16); |
4389 | return true; |
4390 | } |
4391 | |
4392 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col: |
4393 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride: |
4394 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row: |
4395 | case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride: |
4396 | |
4397 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col: |
4398 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride: |
4399 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row: |
4400 | case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride: |
4401 | |
4402 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col: |
4403 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride: |
4404 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row: |
4405 | case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride: |
4406 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col: |
4407 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride: |
4408 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row: |
4409 | case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride: |
4410 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col: |
4411 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride: |
4412 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row: |
4413 | case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: { |
4414 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4415 | Info.memVT = MVT::v8i32; |
4416 | Info.ptrVal = I.getArgOperand(i: 0); |
4417 | Info.offset = 0; |
4418 | Info.flags = MachineMemOperand::MOLoad; |
4419 | Info.align = Align(16); |
4420 | return true; |
4421 | } |
4422 | |
4423 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col: |
4424 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride: |
4425 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row: |
4426 | case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride: |
4427 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col: |
4428 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride: |
4429 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row: |
4430 | case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: |
4431 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16: |
4432 | case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: { |
4433 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4434 | Info.memVT = MVT::v2i32; |
4435 | Info.ptrVal = I.getArgOperand(i: 0); |
4436 | Info.offset = 0; |
4437 | Info.flags = MachineMemOperand::MOLoad; |
4438 | Info.align = Align(8); |
4439 | return true; |
4440 | } |
4441 | |
4442 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col: |
4443 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride: |
4444 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row: |
4445 | case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride: |
4446 | |
4447 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col: |
4448 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride: |
4449 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row: |
4450 | case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: { |
4451 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4452 | Info.memVT = MVT::f64; |
4453 | Info.ptrVal = I.getArgOperand(i: 0); |
4454 | Info.offset = 0; |
4455 | Info.flags = MachineMemOperand::MOLoad; |
4456 | Info.align = Align(8); |
4457 | return true; |
4458 | } |
4459 | |
4460 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col: |
4461 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride: |
4462 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row: |
4463 | case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: { |
4464 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4465 | Info.memVT = MVT::v2f64; |
4466 | Info.ptrVal = I.getArgOperand(i: 0); |
4467 | Info.offset = 0; |
4468 | Info.flags = MachineMemOperand::MOLoad; |
4469 | Info.align = Align(16); |
4470 | return true; |
4471 | } |
4472 | |
4473 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col: |
4474 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row: |
4475 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride: |
4476 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride: |
4477 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col: |
4478 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row: |
4479 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride: |
4480 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride: |
4481 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col: |
4482 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row: |
4483 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride: |
4484 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: { |
4485 | Info.opc = ISD::INTRINSIC_VOID; |
4486 | Info.memVT = MVT::v4f16; |
4487 | Info.ptrVal = I.getArgOperand(i: 0); |
4488 | Info.offset = 0; |
4489 | Info.flags = MachineMemOperand::MOStore; |
4490 | Info.align = Align(16); |
4491 | return true; |
4492 | } |
4493 | |
4494 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col: |
4495 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row: |
4496 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride: |
4497 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride: |
4498 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col: |
4499 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row: |
4500 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride: |
4501 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride: |
4502 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col: |
4503 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row: |
4504 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride: |
4505 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: |
4506 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col: |
4507 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row: |
4508 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride: |
4509 | case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: { |
4510 | Info.opc = ISD::INTRINSIC_VOID; |
4511 | Info.memVT = MVT::v8f32; |
4512 | Info.ptrVal = I.getArgOperand(i: 0); |
4513 | Info.offset = 0; |
4514 | Info.flags = MachineMemOperand::MOStore; |
4515 | Info.align = Align(16); |
4516 | return true; |
4517 | } |
4518 | |
4519 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col: |
4520 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride: |
4521 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row: |
4522 | case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride: |
4523 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col: |
4524 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride: |
4525 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row: |
4526 | case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride: |
4527 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col: |
4528 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride: |
4529 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row: |
4530 | case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: { |
4531 | Info.opc = ISD::INTRINSIC_VOID; |
4532 | Info.memVT = MVT::v8i32; |
4533 | Info.ptrVal = I.getArgOperand(i: 0); |
4534 | Info.offset = 0; |
4535 | Info.flags = MachineMemOperand::MOStore; |
4536 | Info.align = Align(16); |
4537 | return true; |
4538 | } |
4539 | |
4540 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col: |
4541 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride: |
4542 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row: |
4543 | case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride: |
4544 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col: |
4545 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride: |
4546 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row: |
4547 | case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: { |
4548 | Info.opc = ISD::INTRINSIC_VOID; |
4549 | Info.memVT = MVT::v2i32; |
4550 | Info.ptrVal = I.getArgOperand(i: 0); |
4551 | Info.offset = 0; |
4552 | Info.flags = MachineMemOperand::MOStore; |
4553 | Info.align = Align(8); |
4554 | return true; |
4555 | } |
4556 | |
4557 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col: |
4558 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride: |
4559 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row: |
4560 | case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: { |
4561 | Info.opc = ISD::INTRINSIC_VOID; |
4562 | Info.memVT = MVT::v2f64; |
4563 | Info.ptrVal = I.getArgOperand(i: 0); |
4564 | Info.offset = 0; |
4565 | Info.flags = MachineMemOperand::MOStore; |
4566 | Info.align = Align(16); |
4567 | return true; |
4568 | } |
4569 | |
4570 | case Intrinsic::nvvm_atomic_load_inc_32: |
4571 | case Intrinsic::nvvm_atomic_load_dec_32: |
4572 | |
4573 | case Intrinsic::nvvm_atomic_add_gen_f_cta: |
4574 | case Intrinsic::nvvm_atomic_add_gen_f_sys: |
4575 | case Intrinsic::nvvm_atomic_add_gen_i_cta: |
4576 | case Intrinsic::nvvm_atomic_add_gen_i_sys: |
4577 | case Intrinsic::nvvm_atomic_and_gen_i_cta: |
4578 | case Intrinsic::nvvm_atomic_and_gen_i_sys: |
4579 | case Intrinsic::nvvm_atomic_cas_gen_i_cta: |
4580 | case Intrinsic::nvvm_atomic_cas_gen_i_sys: |
4581 | case Intrinsic::nvvm_atomic_dec_gen_i_cta: |
4582 | case Intrinsic::nvvm_atomic_dec_gen_i_sys: |
4583 | case Intrinsic::nvvm_atomic_inc_gen_i_cta: |
4584 | case Intrinsic::nvvm_atomic_inc_gen_i_sys: |
4585 | case Intrinsic::nvvm_atomic_max_gen_i_cta: |
4586 | case Intrinsic::nvvm_atomic_max_gen_i_sys: |
4587 | case Intrinsic::nvvm_atomic_min_gen_i_cta: |
4588 | case Intrinsic::nvvm_atomic_min_gen_i_sys: |
4589 | case Intrinsic::nvvm_atomic_or_gen_i_cta: |
4590 | case Intrinsic::nvvm_atomic_or_gen_i_sys: |
4591 | case Intrinsic::nvvm_atomic_exch_gen_i_cta: |
4592 | case Intrinsic::nvvm_atomic_exch_gen_i_sys: |
4593 | case Intrinsic::nvvm_atomic_xor_gen_i_cta: |
4594 | case Intrinsic::nvvm_atomic_xor_gen_i_sys: { |
4595 | auto &DL = I.getModule()->getDataLayout(); |
4596 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4597 | Info.memVT = getValueType(DL, Ty: I.getType()); |
4598 | Info.ptrVal = I.getArgOperand(i: 0); |
4599 | Info.offset = 0; |
4600 | Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; |
4601 | Info.align.reset(); |
4602 | return true; |
4603 | } |
4604 | |
4605 | case Intrinsic::nvvm_ldu_global_i: |
4606 | case Intrinsic::nvvm_ldu_global_f: |
4607 | case Intrinsic::nvvm_ldu_global_p: { |
4608 | auto &DL = I.getModule()->getDataLayout(); |
4609 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4610 | if (Intrinsic == Intrinsic::nvvm_ldu_global_i) |
4611 | Info.memVT = getValueType(DL, Ty: I.getType()); |
4612 | else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) |
4613 | Info.memVT = getPointerTy(DL); |
4614 | else |
4615 | Info.memVT = getValueType(DL, Ty: I.getType()); |
4616 | Info.ptrVal = I.getArgOperand(i: 0); |
4617 | Info.offset = 0; |
4618 | Info.flags = MachineMemOperand::MOLoad; |
4619 | Info.align = cast<ConstantInt>(Val: I.getArgOperand(i: 1))->getMaybeAlignValue(); |
4620 | |
4621 | return true; |
4622 | } |
4623 | case Intrinsic::nvvm_ldg_global_i: |
4624 | case Intrinsic::nvvm_ldg_global_f: |
4625 | case Intrinsic::nvvm_ldg_global_p: { |
4626 | auto &DL = I.getModule()->getDataLayout(); |
4627 | |
4628 | Info.opc = ISD::INTRINSIC_W_CHAIN; |
4629 | if (Intrinsic == Intrinsic::nvvm_ldg_global_i) |
4630 | Info.memVT = getValueType(DL, Ty: I.getType()); |
4631 | else if(Intrinsic == Intrinsic::nvvm_ldg_global_p) |
4632 | Info.memVT = getPointerTy(DL); |
4633 | else |
4634 | Info.memVT = getValueType(DL, Ty: I.getType()); |
4635 | Info.ptrVal = I.getArgOperand(i: 0); |
4636 | Info.offset = 0; |
4637 | Info.flags = MachineMemOperand::MOLoad; |
4638 | Info.align = cast<ConstantInt>(Val: I.getArgOperand(i: 1))->getMaybeAlignValue(); |
4639 | |
4640 | return true; |
4641 | } |
4642 | |
4643 | case Intrinsic::nvvm_tex_1d_v4f32_s32: |
4644 | case Intrinsic::nvvm_tex_1d_v4f32_f32: |
4645 | case Intrinsic::nvvm_tex_1d_level_v4f32_f32: |
4646 | case Intrinsic::nvvm_tex_1d_grad_v4f32_f32: |
4647 | case Intrinsic::nvvm_tex_1d_array_v4f32_s32: |
4648 | case Intrinsic::nvvm_tex_1d_array_v4f32_f32: |
4649 | case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32: |
4650 | case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32: |
4651 | case Intrinsic::nvvm_tex_2d_v4f32_s32: |
4652 | case Intrinsic::nvvm_tex_2d_v4f32_f32: |
4653 | case Intrinsic::nvvm_tex_2d_level_v4f32_f32: |
4654 | case Intrinsic::nvvm_tex_2d_grad_v4f32_f32: |
4655 | case Intrinsic::nvvm_tex_2d_array_v4f32_s32: |
4656 | case Intrinsic::nvvm_tex_2d_array_v4f32_f32: |
4657 | case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32: |
4658 | case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32: |
4659 | case Intrinsic::nvvm_tex_3d_v4f32_s32: |
4660 | case Intrinsic::nvvm_tex_3d_v4f32_f32: |
4661 | case Intrinsic::nvvm_tex_3d_level_v4f32_f32: |
4662 | case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: |
4663 | case Intrinsic::nvvm_tex_cube_v4f32_f32: |
4664 | case Intrinsic::nvvm_tex_cube_level_v4f32_f32: |
4665 | case Intrinsic::nvvm_tex_cube_array_v4f32_f32: |
4666 | case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32: |
4667 | case Intrinsic::nvvm_tld4_r_2d_v4f32_f32: |
4668 | case Intrinsic::nvvm_tld4_g_2d_v4f32_f32: |
4669 | case Intrinsic::nvvm_tld4_b_2d_v4f32_f32: |
4670 | case Intrinsic::nvvm_tld4_a_2d_v4f32_f32: |
4671 | case Intrinsic::nvvm_tex_unified_1d_v4f32_s32: |
4672 | case Intrinsic::nvvm_tex_unified_1d_v4f32_f32: |
4673 | case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32: |
4674 | case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32: |
4675 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32: |
4676 | case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32: |
4677 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32: |
4678 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32: |
4679 | case Intrinsic::nvvm_tex_unified_2d_v4f32_s32: |
4680 | case Intrinsic::nvvm_tex_unified_2d_v4f32_f32: |
4681 | case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32: |
4682 | case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32: |
4683 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32: |
4684 | case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32: |
4685 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32: |
4686 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32: |
4687 | case Intrinsic::nvvm_tex_unified_3d_v4f32_s32: |
4688 | case Intrinsic::nvvm_tex_unified_3d_v4f32_f32: |
4689 | case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32: |
4690 | case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32: |
4691 | case Intrinsic::nvvm_tex_unified_cube_v4f32_f32: |
4692 | case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32: |
4693 | case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32: |
4694 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32: |
4695 | case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32: |
4696 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32: |
4697 | case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32: |
4698 | case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32: |
4699 | case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32: |
4700 | case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: |
4701 | Info.opc = getOpcForTextureInstr(Intrinsic); |
4702 | Info.memVT = MVT::v4f32; |
4703 | Info.ptrVal = nullptr; |
4704 | Info.offset = 0; |
4705 | Info.flags = MachineMemOperand::MOLoad; |
4706 | Info.align = Align(16); |
4707 | return true; |
4708 | |
4709 | case Intrinsic::nvvm_tex_1d_v4s32_s32: |
4710 | case Intrinsic::nvvm_tex_1d_v4s32_f32: |
4711 | case Intrinsic::nvvm_tex_1d_level_v4s32_f32: |
4712 | case Intrinsic::nvvm_tex_1d_grad_v4s32_f32: |
4713 | case Intrinsic::nvvm_tex_1d_array_v4s32_s32: |
4714 | case Intrinsic::nvvm_tex_1d_array_v4s32_f32: |
4715 | case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32: |
4716 | case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32: |
4717 | case Intrinsic::nvvm_tex_2d_v4s32_s32: |
4718 | case Intrinsic::nvvm_tex_2d_v4s32_f32: |
4719 | case Intrinsic::nvvm_tex_2d_level_v4s32_f32: |
4720 | case Intrinsic::nvvm_tex_2d_grad_v4s32_f32: |
4721 | case Intrinsic::nvvm_tex_2d_array_v4s32_s32: |
4722 | case Intrinsic::nvvm_tex_2d_array_v4s32_f32: |
4723 | case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32: |
4724 | case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32: |
4725 | case Intrinsic::nvvm_tex_3d_v4s32_s32: |
4726 | case Intrinsic::nvvm_tex_3d_v4s32_f32: |
4727 | case Intrinsic::nvvm_tex_3d_level_v4s32_f32: |
4728 | case Intrinsic::nvvm_tex_3d_grad_v4s32_f32: |
4729 | case Intrinsic::nvvm_tex_cube_v4s32_f32: |
4730 | case Intrinsic::nvvm_tex_cube_level_v4s32_f32: |
4731 | case Intrinsic::nvvm_tex_cube_array_v4s32_f32: |
4732 | case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32: |
4733 | case Intrinsic::nvvm_tex_cube_v4u32_f32: |
4734 | case Intrinsic::nvvm_tex_cube_level_v4u32_f32: |
4735 | case Intrinsic::nvvm_tex_cube_array_v4u32_f32: |
4736 | case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32: |
4737 | case Intrinsic::nvvm_tex_1d_v4u32_s32: |
4738 | case Intrinsic::nvvm_tex_1d_v4u32_f32: |
4739 | case Intrinsic::nvvm_tex_1d_level_v4u32_f32: |
4740 | case Intrinsic::nvvm_tex_1d_grad_v4u32_f32: |
4741 | case Intrinsic::nvvm_tex_1d_array_v4u32_s32: |
4742 | case Intrinsic::nvvm_tex_1d_array_v4u32_f32: |
4743 | case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32: |
4744 | case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32: |
4745 | case Intrinsic::nvvm_tex_2d_v4u32_s32: |
4746 | case Intrinsic::nvvm_tex_2d_v4u32_f32: |
4747 | case Intrinsic::nvvm_tex_2d_level_v4u32_f32: |
4748 | case Intrinsic::nvvm_tex_2d_grad_v4u32_f32: |
4749 | case Intrinsic::nvvm_tex_2d_array_v4u32_s32: |
4750 | case Intrinsic::nvvm_tex_2d_array_v4u32_f32: |
4751 | case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32: |
4752 | case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32: |
4753 | case Intrinsic::nvvm_tex_3d_v4u32_s32: |
4754 | case Intrinsic::nvvm_tex_3d_v4u32_f32: |
4755 | case Intrinsic::nvvm_tex_3d_level_v4u32_f32: |
4756 | case Intrinsic::nvvm_tex_3d_grad_v4u32_f32: |
4757 | case Intrinsic::nvvm_tld4_r_2d_v4s32_f32: |
4758 | case Intrinsic::nvvm_tld4_g_2d_v4s32_f32: |
4759 | case Intrinsic::nvvm_tld4_b_2d_v4s32_f32: |
4760 | case Intrinsic::nvvm_tld4_a_2d_v4s32_f32: |
4761 | case Intrinsic::nvvm_tld4_r_2d_v4u32_f32: |
4762 | case Intrinsic::nvvm_tld4_g_2d_v4u32_f32: |
4763 | case Intrinsic::nvvm_tld4_b_2d_v4u32_f32: |
4764 | case Intrinsic::nvvm_tld4_a_2d_v4u32_f32: |
4765 | case Intrinsic::nvvm_tex_unified_1d_v4s32_s32: |
4766 | case Intrinsic::nvvm_tex_unified_1d_v4s32_f32: |
4767 | case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32: |
4768 | case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32: |
4769 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32: |
4770 | case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32: |
4771 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32: |
4772 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32: |
4773 | case Intrinsic::nvvm_tex_unified_2d_v4s32_s32: |
4774 | case Intrinsic::nvvm_tex_unified_2d_v4s32_f32: |
4775 | case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32: |
4776 | case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32: |
4777 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32: |
4778 | case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32: |
4779 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32: |
4780 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32: |
4781 | case Intrinsic::nvvm_tex_unified_3d_v4s32_s32: |
4782 | case Intrinsic::nvvm_tex_unified_3d_v4s32_f32: |
4783 | case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32: |
4784 | case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32: |
4785 | case Intrinsic::nvvm_tex_unified_1d_v4u32_s32: |
4786 | case Intrinsic::nvvm_tex_unified_1d_v4u32_f32: |
4787 | case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32: |
4788 | case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32: |
4789 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32: |
4790 | case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32: |
4791 | case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32: |
4792 | case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32: |
4793 | case Intrinsic::nvvm_tex_unified_2d_v4u32_s32: |
4794 | case Intrinsic::nvvm_tex_unified_2d_v4u32_f32: |
4795 | case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32: |
4796 | case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32: |
4797 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32: |
4798 | case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32: |
4799 | case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32: |
4800 | case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32: |
4801 | case Intrinsic::nvvm_tex_unified_3d_v4u32_s32: |
4802 | case Intrinsic::nvvm_tex_unified_3d_v4u32_f32: |
4803 | case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32: |
4804 | case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32: |
4805 | case Intrinsic::nvvm_tex_unified_cube_v4s32_f32: |
4806 | case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32: |
4807 | case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32: |
4808 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32: |
4809 | case Intrinsic::nvvm_tex_unified_cube_v4u32_f32: |
4810 | case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32: |
4811 | case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32: |
4812 | case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32: |
4813 | case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32: |
4814 | case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32: |
4815 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32: |
4816 | case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32: |
4817 | case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32: |
4818 | case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32: |
4819 | case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32: |
4820 | case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32: |
4821 | case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32: |
4822 | case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32: |
4823 | case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32: |
4824 | case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: |
4825 | Info.opc = getOpcForTextureInstr(Intrinsic); |
4826 | Info.memVT = MVT::v4i32; |
4827 | Info.ptrVal = nullptr; |
4828 | Info.offset = 0; |
4829 | Info.flags = MachineMemOperand::MOLoad; |
4830 | Info.align = Align(16); |
4831 | return true; |
4832 | |
4833 | case Intrinsic::nvvm_suld_1d_i8_clamp: |
4834 | case Intrinsic::nvvm_suld_1d_v2i8_clamp: |
4835 | case Intrinsic::nvvm_suld_1d_v4i8_clamp: |
4836 | case Intrinsic::nvvm_suld_1d_array_i8_clamp: |
4837 | case Intrinsic::nvvm_suld_1d_array_v2i8_clamp: |
4838 | case Intrinsic::nvvm_suld_1d_array_v4i8_clamp: |
4839 | case Intrinsic::nvvm_suld_2d_i8_clamp: |
4840 | case Intrinsic::nvvm_suld_2d_v2i8_clamp: |
4841 | case Intrinsic::nvvm_suld_2d_v4i8_clamp: |
4842 | case Intrinsic::nvvm_suld_2d_array_i8_clamp: |
4843 | case Intrinsic::nvvm_suld_2d_array_v2i8_clamp: |
4844 | case Intrinsic::nvvm_suld_2d_array_v4i8_clamp: |
4845 | case Intrinsic::nvvm_suld_3d_i8_clamp: |
4846 | case Intrinsic::nvvm_suld_3d_v2i8_clamp: |
4847 | case Intrinsic::nvvm_suld_3d_v4i8_clamp: |
4848 | case Intrinsic::nvvm_suld_1d_i8_trap: |
4849 | case Intrinsic::nvvm_suld_1d_v2i8_trap: |
4850 | case Intrinsic::nvvm_suld_1d_v4i8_trap: |
4851 | case Intrinsic::nvvm_suld_1d_array_i8_trap: |
4852 | case Intrinsic::nvvm_suld_1d_array_v2i8_trap: |
4853 | case Intrinsic::nvvm_suld_1d_array_v4i8_trap: |
4854 | case Intrinsic::nvvm_suld_2d_i8_trap: |
4855 | case Intrinsic::nvvm_suld_2d_v2i8_trap: |
4856 | case Intrinsic::nvvm_suld_2d_v4i8_trap: |
4857 | case Intrinsic::nvvm_suld_2d_array_i8_trap: |
4858 | case Intrinsic::nvvm_suld_2d_array_v2i8_trap: |
4859 | case Intrinsic::nvvm_suld_2d_array_v4i8_trap: |
4860 | case Intrinsic::nvvm_suld_3d_i8_trap: |
4861 | case Intrinsic::nvvm_suld_3d_v2i8_trap: |
4862 | case Intrinsic::nvvm_suld_3d_v4i8_trap: |
4863 | case Intrinsic::nvvm_suld_1d_i8_zero: |
4864 | case Intrinsic::nvvm_suld_1d_v2i8_zero: |
4865 | case Intrinsic::nvvm_suld_1d_v4i8_zero: |
4866 | case Intrinsic::nvvm_suld_1d_array_i8_zero: |
4867 | case Intrinsic::nvvm_suld_1d_array_v2i8_zero: |
4868 | case Intrinsic::nvvm_suld_1d_array_v4i8_zero: |
4869 | case Intrinsic::nvvm_suld_2d_i8_zero: |
4870 | case Intrinsic::nvvm_suld_2d_v2i8_zero: |
4871 | case Intrinsic::nvvm_suld_2d_v4i8_zero: |
4872 | case Intrinsic::nvvm_suld_2d_array_i8_zero: |
4873 | case Intrinsic::nvvm_suld_2d_array_v2i8_zero: |
4874 | case Intrinsic::nvvm_suld_2d_array_v4i8_zero: |
4875 | case Intrinsic::nvvm_suld_3d_i8_zero: |
4876 | case Intrinsic::nvvm_suld_3d_v2i8_zero: |
4877 | case Intrinsic::nvvm_suld_3d_v4i8_zero: |
4878 | Info.opc = getOpcForSurfaceInstr(Intrinsic); |
4879 | Info.memVT = MVT::i8; |
4880 | Info.ptrVal = nullptr; |
4881 | Info.offset = 0; |
4882 | Info.flags = MachineMemOperand::MOLoad; |
4883 | Info.align = Align(16); |
4884 | return true; |
4885 | |
4886 | case Intrinsic::nvvm_suld_1d_i16_clamp: |
4887 | case Intrinsic::nvvm_suld_1d_v2i16_clamp: |
4888 | case Intrinsic::nvvm_suld_1d_v4i16_clamp: |
4889 | case Intrinsic::nvvm_suld_1d_array_i16_clamp: |
4890 | case Intrinsic::nvvm_suld_1d_array_v2i16_clamp: |
4891 | case Intrinsic::nvvm_suld_1d_array_v4i16_clamp: |
4892 | case Intrinsic::nvvm_suld_2d_i16_clamp: |
4893 | case Intrinsic::nvvm_suld_2d_v2i16_clamp: |
4894 | case Intrinsic::nvvm_suld_2d_v4i16_clamp: |
4895 | case Intrinsic::nvvm_suld_2d_array_i16_clamp: |
4896 | case Intrinsic::nvvm_suld_2d_array_v2i16_clamp: |
4897 | case Intrinsic::nvvm_suld_2d_array_v4i16_clamp: |
4898 | case Intrinsic::nvvm_suld_3d_i16_clamp: |
4899 | case Intrinsic::nvvm_suld_3d_v2i16_clamp: |
4900 | case Intrinsic::nvvm_suld_3d_v4i16_clamp: |
4901 | case Intrinsic::nvvm_suld_1d_i16_trap: |
4902 | case Intrinsic::nvvm_suld_1d_v2i16_trap: |
4903 | case Intrinsic::nvvm_suld_1d_v4i16_trap: |
4904 | case Intrinsic::nvvm_suld_1d_array_i16_trap: |
4905 | case Intrinsic::nvvm_suld_1d_array_v2i16_trap: |
4906 | case Intrinsic::nvvm_suld_1d_array_v4i16_trap: |
4907 | case Intrinsic::nvvm_suld_2d_i16_trap: |
4908 | case Intrinsic::nvvm_suld_2d_v2i16_trap: |
4909 | case Intrinsic::nvvm_suld_2d_v4i16_trap: |
4910 | case Intrinsic::nvvm_suld_2d_array_i16_trap: |
4911 | case Intrinsic::nvvm_suld_2d_array_v2i16_trap: |
4912 | case Intrinsic::nvvm_suld_2d_array_v4i16_trap: |
4913 | case Intrinsic::nvvm_suld_3d_i16_trap: |
4914 | case Intrinsic::nvvm_suld_3d_v2i16_trap: |
4915 | case Intrinsic::nvvm_suld_3d_v4i16_trap: |
4916 | case Intrinsic::nvvm_suld_1d_i16_zero: |
4917 | case Intrinsic::nvvm_suld_1d_v2i16_zero: |
4918 | case Intrinsic::nvvm_suld_1d_v4i16_zero: |
4919 | case Intrinsic::nvvm_suld_1d_array_i16_zero: |
4920 | case Intrinsic::nvvm_suld_1d_array_v2i16_zero: |
4921 | case Intrinsic::nvvm_suld_1d_array_v4i16_zero: |
4922 | case Intrinsic::nvvm_suld_2d_i16_zero: |
4923 | case Intrinsic::nvvm_suld_2d_v2i16_zero: |
4924 | case Intrinsic::nvvm_suld_2d_v4i16_zero: |
4925 | case Intrinsic::nvvm_suld_2d_array_i16_zero: |
4926 | case Intrinsic::nvvm_suld_2d_array_v2i16_zero: |
4927 | case Intrinsic::nvvm_suld_2d_array_v4i16_zero: |
4928 | case Intrinsic::nvvm_suld_3d_i16_zero: |
4929 | case Intrinsic::nvvm_suld_3d_v2i16_zero: |
4930 | case Intrinsic::nvvm_suld_3d_v4i16_zero: |
4931 | Info.opc = getOpcForSurfaceInstr(Intrinsic); |
4932 | Info.memVT = MVT::i16; |
4933 | Info.ptrVal = nullptr; |
4934 | Info.offset = 0; |
4935 | Info.flags = MachineMemOperand::MOLoad; |
4936 | Info.align = Align(16); |
4937 | return true; |
4938 | |
4939 | case Intrinsic::nvvm_suld_1d_i32_clamp: |
4940 | case Intrinsic::nvvm_suld_1d_v2i32_clamp: |
4941 | case Intrinsic::nvvm_suld_1d_v4i32_clamp: |
4942 | case Intrinsic::nvvm_suld_1d_array_i32_clamp: |
4943 | case Intrinsic::nvvm_suld_1d_array_v2i32_clamp: |
4944 | case Intrinsic::nvvm_suld_1d_array_v4i32_clamp: |
4945 | case Intrinsic::nvvm_suld_2d_i32_clamp: |
4946 | case Intrinsic::nvvm_suld_2d_v2i32_clamp: |
4947 | case Intrinsic::nvvm_suld_2d_v4i32_clamp: |
4948 | case Intrinsic::nvvm_suld_2d_array_i32_clamp: |
4949 | case Intrinsic::nvvm_suld_2d_array_v2i32_clamp: |
4950 | case Intrinsic::nvvm_suld_2d_array_v4i32_clamp: |
4951 | case Intrinsic::nvvm_suld_3d_i32_clamp: |
4952 | case Intrinsic::nvvm_suld_3d_v2i32_clamp: |
4953 | case Intrinsic::nvvm_suld_3d_v4i32_clamp: |
4954 | case Intrinsic::nvvm_suld_1d_i32_trap: |
4955 | case Intrinsic::nvvm_suld_1d_v2i32_trap: |
4956 | case Intrinsic::nvvm_suld_1d_v4i32_trap: |
4957 | case Intrinsic::nvvm_suld_1d_array_i32_trap: |
4958 | case Intrinsic::nvvm_suld_1d_array_v2i32_trap: |
4959 | case Intrinsic::nvvm_suld_1d_array_v4i32_trap: |
4960 | case Intrinsic::nvvm_suld_2d_i32_trap: |
4961 | case Intrinsic::nvvm_suld_2d_v2i32_trap: |
4962 | case Intrinsic::nvvm_suld_2d_v4i32_trap: |
4963 | case Intrinsic::nvvm_suld_2d_array_i32_trap: |
4964 | case Intrinsic::nvvm_suld_2d_array_v2i32_trap: |
4965 | case Intrinsic::nvvm_suld_2d_array_v4i32_trap: |
4966 | case Intrinsic::nvvm_suld_3d_i32_trap: |
4967 | case Intrinsic::nvvm_suld_3d_v2i32_trap: |
4968 | case Intrinsic::nvvm_suld_3d_v4i32_trap: |
4969 | case Intrinsic::nvvm_suld_1d_i32_zero: |
4970 | case Intrinsic::nvvm_suld_1d_v2i32_zero: |
4971 | case Intrinsic::nvvm_suld_1d_v4i32_zero: |
4972 | case Intrinsic::nvvm_suld_1d_array_i32_zero: |
4973 | case Intrinsic::nvvm_suld_1d_array_v2i32_zero: |
4974 | case Intrinsic::nvvm_suld_1d_array_v4i32_zero: |
4975 | case Intrinsic::nvvm_suld_2d_i32_zero: |
4976 | case Intrinsic::nvvm_suld_2d_v2i32_zero: |
4977 | case Intrinsic::nvvm_suld_2d_v4i32_zero: |
4978 | case Intrinsic::nvvm_suld_2d_array_i32_zero: |
4979 | case Intrinsic::nvvm_suld_2d_array_v2i32_zero: |
4980 | case Intrinsic::nvvm_suld_2d_array_v4i32_zero: |
4981 | case Intrinsic::nvvm_suld_3d_i32_zero: |
4982 | case Intrinsic::nvvm_suld_3d_v2i32_zero: |
4983 | case Intrinsic::nvvm_suld_3d_v4i32_zero: |
4984 | Info.opc = getOpcForSurfaceInstr(Intrinsic); |
4985 | Info.memVT = MVT::i32; |
4986 | Info.ptrVal = nullptr; |
4987 | Info.offset = 0; |
4988 | Info.flags = MachineMemOperand::MOLoad; |
4989 | Info.align = Align(16); |
4990 | return true; |
4991 | |
4992 | case Intrinsic::nvvm_suld_1d_i64_clamp: |
4993 | case Intrinsic::nvvm_suld_1d_v2i64_clamp: |
4994 | case Intrinsic::nvvm_suld_1d_array_i64_clamp: |
4995 | case Intrinsic::nvvm_suld_1d_array_v2i64_clamp: |
4996 | case Intrinsic::nvvm_suld_2d_i64_clamp: |
4997 | case Intrinsic::nvvm_suld_2d_v2i64_clamp: |
4998 | case Intrinsic::nvvm_suld_2d_array_i64_clamp: |
4999 | case Intrinsic::nvvm_suld_2d_array_v2i64_clamp: |
5000 | case Intrinsic::nvvm_suld_3d_i64_clamp: |
5001 | case Intrinsic::nvvm_suld_3d_v2i64_clamp: |
5002 | case Intrinsic::nvvm_suld_1d_i64_trap: |
5003 | case Intrinsic::nvvm_suld_1d_v2i64_trap: |
5004 | case Intrinsic::nvvm_suld_1d_array_i64_trap: |
5005 | case Intrinsic::nvvm_suld_1d_array_v2i64_trap: |
5006 | case Intrinsic::nvvm_suld_2d_i64_trap: |
5007 | case Intrinsic::nvvm_suld_2d_v2i64_trap: |
5008 | case Intrinsic::nvvm_suld_2d_array_i64_trap: |
5009 | case Intrinsic::nvvm_suld_2d_array_v2i64_trap: |
5010 | case Intrinsic::nvvm_suld_3d_i64_trap: |
5011 | case Intrinsic::nvvm_suld_3d_v2i64_trap: |
5012 | case Intrinsic::nvvm_suld_1d_i64_zero: |
5013 | case Intrinsic::nvvm_suld_1d_v2i64_zero: |
5014 | case Intrinsic::nvvm_suld_1d_array_i64_zero: |
5015 | case Intrinsic::nvvm_suld_1d_array_v2i64_zero: |
5016 | case Intrinsic::nvvm_suld_2d_i64_zero: |
5017 | case Intrinsic::nvvm_suld_2d_v2i64_zero: |
5018 | case Intrinsic::nvvm_suld_2d_array_i64_zero: |
5019 | case Intrinsic::nvvm_suld_2d_array_v2i64_zero: |
5020 | case Intrinsic::nvvm_suld_3d_i64_zero: |
5021 | case Intrinsic::nvvm_suld_3d_v2i64_zero: |
5022 | Info.opc = getOpcForSurfaceInstr(Intrinsic); |
5023 | Info.memVT = MVT::i64; |
5024 | Info.ptrVal = nullptr; |
5025 | Info.offset = 0; |
5026 | Info.flags = MachineMemOperand::MOLoad; |
5027 | Info.align = Align(16); |
5028 | return true; |
5029 | } |
5030 | return false; |
5031 | } |
5032 | |
5033 | /// getFunctionParamOptimizedAlign - since function arguments are passed via |
5034 | /// .param space, we may want to increase their alignment in a way that |
5035 | /// ensures that we can effectively vectorize their loads & stores. We can |
5036 | /// increase alignment only if the function has internal or has private |
5037 | /// linkage as for other linkage types callers may already rely on default |
5038 | /// alignment. To allow using 128-bit vectorized loads/stores, this function |
5039 | /// ensures that alignment is 16 or greater. |
5040 | Align NVPTXTargetLowering::getFunctionParamOptimizedAlign( |
5041 | const Function *F, Type *ArgTy, const DataLayout &DL) const { |
5042 | const uint64_t ABITypeAlign = DL.getABITypeAlign(Ty: ArgTy).value(); |
5043 | |
5044 | // If a function has linkage different from internal or private, we |
5045 | // must use default ABI alignment as external users rely on it. Same |
5046 | // for a function that may be called from a function pointer. |
5047 | if (!F || !F->hasLocalLinkage() || |
5048 | F->hasAddressTaken(/*Users=*/nullptr, |
5049 | /*IgnoreCallbackUses=*/false, |
5050 | /*IgnoreAssumeLikeCalls=*/true, |
5051 | /*IgnoreLLVMUsed=*/IngoreLLVMUsed: true)) |
5052 | return Align(ABITypeAlign); |
5053 | |
5054 | assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage" ); |
5055 | return Align(std::max(a: uint64_t(16), b: ABITypeAlign)); |
5056 | } |
5057 | |
5058 | /// Helper for computing alignment of a device function byval parameter. |
5059 | Align NVPTXTargetLowering::getFunctionByValParamAlign( |
5060 | const Function *F, Type *ArgTy, Align InitialAlign, |
5061 | const DataLayout &DL) const { |
5062 | Align ArgAlign = InitialAlign; |
5063 | // Try to increase alignment to enhance vectorization options. |
5064 | if (F) |
5065 | ArgAlign = std::max(a: ArgAlign, b: getFunctionParamOptimizedAlign(F, ArgTy, DL)); |
5066 | |
5067 | // Old ptx versions have a bug. When PTX code takes address of |
5068 | // byval parameter with alignment < 4, ptxas generates code to |
5069 | // spill argument into memory. Alas on sm_50+ ptxas generates |
5070 | // SASS code that fails with misaligned access. To work around |
5071 | // the problem, make sure that we align byval parameters by at |
5072 | // least 4. This bug seems to be fixed at least starting from |
5073 | // ptxas > 9.0. |
5074 | // TODO: remove this after verifying the bug is not reproduced |
5075 | // on non-deprecated ptxas versions. |
5076 | if (ForceMinByValParamAlign) |
5077 | ArgAlign = std::max(a: ArgAlign, b: Align(4)); |
5078 | |
5079 | return ArgAlign; |
5080 | } |
5081 | |
5082 | // Helper for getting a function parameter name. Name is composed from |
5083 | // its index and the function name. Negative index corresponds to special |
5084 | // parameter (unsized array) used for passing variable arguments. |
5085 | std::string NVPTXTargetLowering::getParamName(const Function *F, |
5086 | int Idx) const { |
5087 | std::string ParamName; |
5088 | raw_string_ostream ParamStr(ParamName); |
5089 | |
5090 | ParamStr << getTargetMachine().getSymbol(GV: F)->getName(); |
5091 | if (Idx < 0) |
5092 | ParamStr << "_vararg" ; |
5093 | else |
5094 | ParamStr << "_param_" << Idx; |
5095 | |
5096 | return ParamName; |
5097 | } |
5098 | |
5099 | /// isLegalAddressingMode - Return true if the addressing mode represented |
5100 | /// by AM is legal for this target, for a load/store of the specified type. |
5101 | /// Used to guide target specific optimizations, like loop strength reduction |
5102 | /// (LoopStrengthReduce.cpp) and memory optimization for address mode |
5103 | /// (CodeGenPrepare.cpp) |
5104 | bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, |
5105 | const AddrMode &AM, Type *Ty, |
5106 | unsigned AS, Instruction *I) const { |
5107 | // AddrMode - This represents an addressing mode of: |
5108 | // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg |
5109 | // |
5110 | // The legal address modes are |
5111 | // - [avar] |
5112 | // - [areg] |
5113 | // - [areg+immoff] |
5114 | // - [immAddr] |
5115 | |
5116 | if (AM.BaseGV) { |
5117 | return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; |
5118 | } |
5119 | |
5120 | switch (AM.Scale) { |
5121 | case 0: // "r", "r+i" or "i" is allowed |
5122 | break; |
5123 | case 1: |
5124 | if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed. |
5125 | return false; |
5126 | // Otherwise we have r+i. |
5127 | break; |
5128 | default: |
5129 | // No scale > 1 is allowed |
5130 | return false; |
5131 | } |
5132 | return true; |
5133 | } |
5134 | |
5135 | //===----------------------------------------------------------------------===// |
5136 | // NVPTX Inline Assembly Support |
5137 | //===----------------------------------------------------------------------===// |
5138 | |
5139 | /// getConstraintType - Given a constraint letter, return the type of |
5140 | /// constraint it is for this target. |
5141 | NVPTXTargetLowering::ConstraintType |
5142 | NVPTXTargetLowering::getConstraintType(StringRef Constraint) const { |
5143 | if (Constraint.size() == 1) { |
5144 | switch (Constraint[0]) { |
5145 | default: |
5146 | break; |
5147 | case 'b': |
5148 | case 'r': |
5149 | case 'h': |
5150 | case 'c': |
5151 | case 'l': |
5152 | case 'f': |
5153 | case 'd': |
5154 | case '0': |
5155 | case 'N': |
5156 | return C_RegisterClass; |
5157 | } |
5158 | } |
5159 | return TargetLowering::getConstraintType(Constraint); |
5160 | } |
5161 | |
5162 | std::pair<unsigned, const TargetRegisterClass *> |
5163 | NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, |
5164 | StringRef Constraint, |
5165 | MVT VT) const { |
5166 | if (Constraint.size() == 1) { |
5167 | switch (Constraint[0]) { |
5168 | case 'b': |
5169 | return std::make_pair(0U, &NVPTX::Int1RegsRegClass); |
5170 | case 'c': |
5171 | return std::make_pair(0U, &NVPTX::Int16RegsRegClass); |
5172 | case 'h': |
5173 | return std::make_pair(0U, &NVPTX::Int16RegsRegClass); |
5174 | case 'r': |
5175 | return std::make_pair(0U, &NVPTX::Int32RegsRegClass); |
5176 | case 'l': |
5177 | case 'N': |
5178 | return std::make_pair(0U, &NVPTX::Int64RegsRegClass); |
5179 | case 'f': |
5180 | return std::make_pair(0U, &NVPTX::Float32RegsRegClass); |
5181 | case 'd': |
5182 | return std::make_pair(0U, &NVPTX::Float64RegsRegClass); |
5183 | } |
5184 | } |
5185 | return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); |
5186 | } |
5187 | |
5188 | //===----------------------------------------------------------------------===// |
5189 | // NVPTX DAG Combining |
5190 | //===----------------------------------------------------------------------===// |
5191 | |
5192 | bool NVPTXTargetLowering::allowFMA(MachineFunction &MF, |
5193 | CodeGenOptLevel OptLevel) const { |
5194 | // Always honor command-line argument |
5195 | if (FMAContractLevelOpt.getNumOccurrences() > 0) |
5196 | return FMAContractLevelOpt > 0; |
5197 | |
5198 | // Do not contract if we're not optimizing the code. |
5199 | if (OptLevel == CodeGenOptLevel::None) |
5200 | return false; |
5201 | |
5202 | // Honor TargetOptions flags that explicitly say fusion is okay. |
5203 | if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast) |
5204 | return true; |
5205 | |
5206 | return allowUnsafeFPMath(MF); |
5207 | } |
5208 | |
5209 | bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const { |
5210 | // Honor TargetOptions flags that explicitly say unsafe math is okay. |
5211 | if (MF.getTarget().Options.UnsafeFPMath) |
5212 | return true; |
5213 | |
5214 | // Allow unsafe math if unsafe-fp-math attribute explicitly says so. |
5215 | const Function &F = MF.getFunction(); |
5216 | return F.getFnAttribute(Kind: "unsafe-fp-math" ).getValueAsBool(); |
5217 | } |
5218 | |
5219 | /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with |
5220 | /// operands N0 and N1. This is a helper for PerformADDCombine that is |
5221 | /// called with the default operands, and if that fails, with commuted |
5222 | /// operands. |
5223 | static SDValue PerformADDCombineWithOperands( |
5224 | SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, |
5225 | const NVPTXSubtarget &Subtarget, CodeGenOptLevel OptLevel) { |
5226 | SelectionDAG &DAG = DCI.DAG; |
5227 | // Skip non-integer, non-scalar case |
5228 | EVT VT=N0.getValueType(); |
5229 | if (VT.isVector()) |
5230 | return SDValue(); |
5231 | |
5232 | // fold (add (mul a, b), c) -> (mad a, b, c) |
5233 | // |
5234 | if (N0.getOpcode() == ISD::MUL) { |
5235 | assert (VT.isInteger()); |
5236 | // For integer: |
5237 | // Since integer multiply-add costs the same as integer multiply |
5238 | // but is more costly than integer add, do the fusion only when |
5239 | // the mul is only used in the add. |
5240 | if (OptLevel == CodeGenOptLevel::None || VT != MVT::i32 || |
5241 | !N0.getNode()->hasOneUse()) |
5242 | return SDValue(); |
5243 | |
5244 | // Do the folding |
5245 | return DAG.getNode(Opcode: NVPTXISD::IMAD, DL: SDLoc(N), VT, |
5246 | N1: N0.getOperand(i: 0), N2: N0.getOperand(i: 1), N3: N1); |
5247 | } |
5248 | else if (N0.getOpcode() == ISD::FMUL) { |
5249 | if (VT == MVT::f32 || VT == MVT::f64) { |
5250 | const auto *TLI = static_cast<const NVPTXTargetLowering *>( |
5251 | &DAG.getTargetLoweringInfo()); |
5252 | if (!TLI->allowFMA(MF&: DAG.getMachineFunction(), OptLevel)) |
5253 | return SDValue(); |
5254 | |
5255 | // For floating point: |
5256 | // Do the fusion only when the mul has less than 5 uses and all |
5257 | // are add. |
5258 | // The heuristic is that if a use is not an add, then that use |
5259 | // cannot be fused into fma, therefore mul is still needed anyway. |
5260 | // If there are more than 4 uses, even if they are all add, fusing |
5261 | // them will increase register pressue. |
5262 | // |
5263 | int numUses = 0; |
5264 | int nonAddCount = 0; |
5265 | for (const SDNode *User : N0.getNode()->uses()) { |
5266 | numUses++; |
5267 | if (User->getOpcode() != ISD::FADD) |
5268 | ++nonAddCount; |
5269 | } |
5270 | if (numUses >= 5) |
5271 | return SDValue(); |
5272 | if (nonAddCount) { |
5273 | int orderNo = N->getIROrder(); |
5274 | int orderNo2 = N0.getNode()->getIROrder(); |
5275 | // simple heuristics here for considering potential register |
5276 | // pressure, the logics here is that the differnce are used |
5277 | // to measure the distance between def and use, the longer distance |
5278 | // more likely cause register pressure. |
5279 | if (orderNo - orderNo2 < 500) |
5280 | return SDValue(); |
5281 | |
5282 | // Now, check if at least one of the FMUL's operands is live beyond the node N, |
5283 | // which guarantees that the FMA will not increase register pressure at node N. |
5284 | bool opIsLive = false; |
5285 | const SDNode *left = N0.getOperand(i: 0).getNode(); |
5286 | const SDNode *right = N0.getOperand(i: 1).getNode(); |
5287 | |
5288 | if (isa<ConstantSDNode>(Val: left) || isa<ConstantSDNode>(Val: right)) |
5289 | opIsLive = true; |
5290 | |
5291 | if (!opIsLive) |
5292 | for (const SDNode *User : left->uses()) { |
5293 | int orderNo3 = User->getIROrder(); |
5294 | if (orderNo3 > orderNo) { |
5295 | opIsLive = true; |
5296 | break; |
5297 | } |
5298 | } |
5299 | |
5300 | if (!opIsLive) |
5301 | for (const SDNode *User : right->uses()) { |
5302 | int orderNo3 = User->getIROrder(); |
5303 | if (orderNo3 > orderNo) { |
5304 | opIsLive = true; |
5305 | break; |
5306 | } |
5307 | } |
5308 | |
5309 | if (!opIsLive) |
5310 | return SDValue(); |
5311 | } |
5312 | |
5313 | return DAG.getNode(Opcode: ISD::FMA, DL: SDLoc(N), VT, |
5314 | N1: N0.getOperand(i: 0), N2: N0.getOperand(i: 1), N3: N1); |
5315 | } |
5316 | } |
5317 | |
5318 | return SDValue(); |
5319 | } |
5320 | |
5321 | static SDValue PerformStoreRetvalCombine(SDNode *N) { |
5322 | // Operands from the 2nd to the last one are the values to be stored |
5323 | for (std::size_t I = 2, OpsCount = N->ops().size(); I != OpsCount; ++I) |
5324 | if (!N->getOperand(Num: I).isUndef()) |
5325 | return SDValue(); |
5326 | |
5327 | // Operand 0 is the previous value in the chain. Cannot return EntryToken |
5328 | // as the previous value will become unused and eliminated later. |
5329 | return N->getOperand(Num: 0); |
5330 | } |
5331 | |
5332 | /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. |
5333 | /// |
5334 | static SDValue PerformADDCombine(SDNode *N, |
5335 | TargetLowering::DAGCombinerInfo &DCI, |
5336 | const NVPTXSubtarget &Subtarget, |
5337 | CodeGenOptLevel OptLevel) { |
5338 | SDValue N0 = N->getOperand(Num: 0); |
5339 | SDValue N1 = N->getOperand(Num: 1); |
5340 | |
5341 | // First try with the default operand order. |
5342 | if (SDValue Result = |
5343 | PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel)) |
5344 | return Result; |
5345 | |
5346 | // If that didn't work, try again with the operands commuted. |
5347 | return PerformADDCombineWithOperands(N, N0: N1, N1: N0, DCI, Subtarget, OptLevel); |
5348 | } |
5349 | |
5350 | static SDValue PerformANDCombine(SDNode *N, |
5351 | TargetLowering::DAGCombinerInfo &DCI) { |
5352 | // The type legalizer turns a vector load of i8 values into a zextload to i16 |
5353 | // registers, optionally ANY_EXTENDs it (if target type is integer), |
5354 | // and ANDs off the high 8 bits. Since we turn this load into a |
5355 | // target-specific DAG node, the DAG combiner fails to eliminate these AND |
5356 | // nodes. Do that here. |
5357 | SDValue Val = N->getOperand(Num: 0); |
5358 | SDValue Mask = N->getOperand(Num: 1); |
5359 | |
5360 | if (isa<ConstantSDNode>(Val)) { |
5361 | std::swap(a&: Val, b&: Mask); |
5362 | } |
5363 | |
5364 | SDValue AExt; |
5365 | |
5366 | // Convert BFE-> truncate i16 -> and 255 |
5367 | // To just BFE-> truncate i16, as the value already has all the bits in the |
5368 | // right places. |
5369 | if (Val.getOpcode() == ISD::TRUNCATE) { |
5370 | SDValue BFE = Val.getOperand(i: 0); |
5371 | if (BFE.getOpcode() != NVPTXISD::BFE) |
5372 | return SDValue(); |
5373 | |
5374 | ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(Val: BFE.getOperand(i: 0)); |
5375 | if (!BFEBits) |
5376 | return SDValue(); |
5377 | uint64_t BFEBitsVal = BFEBits->getZExtValue(); |
5378 | |
5379 | ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask); |
5380 | if (!MaskCnst) { |
5381 | // Not an AND with a constant |
5382 | return SDValue(); |
5383 | } |
5384 | uint64_t MaskVal = MaskCnst->getZExtValue(); |
5385 | |
5386 | if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) |
5387 | return SDValue(); |
5388 | // If we get here, the AND is unnecessary. Just replace it with the trunc |
5389 | DCI.CombineTo(N, Res: Val, AddTo: false); |
5390 | } |
5391 | // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and |
5392 | if (Val.getOpcode() == ISD::ANY_EXTEND) { |
5393 | AExt = Val; |
5394 | Val = Val->getOperand(Num: 0); |
5395 | } |
5396 | |
5397 | if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { |
5398 | Val = Val->getOperand(Num: 0); |
5399 | } |
5400 | |
5401 | if (Val->getOpcode() == NVPTXISD::LoadV2 || |
5402 | Val->getOpcode() == NVPTXISD::LoadV4) { |
5403 | ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Val&: Mask); |
5404 | if (!MaskCnst) { |
5405 | // Not an AND with a constant |
5406 | return SDValue(); |
5407 | } |
5408 | |
5409 | uint64_t MaskVal = MaskCnst->getZExtValue(); |
5410 | if (MaskVal != 0xff) { |
5411 | // Not an AND that chops off top 8 bits |
5412 | return SDValue(); |
5413 | } |
5414 | |
5415 | MemSDNode *Mem = dyn_cast<MemSDNode>(Val); |
5416 | if (!Mem) { |
5417 | // Not a MemSDNode?!? |
5418 | return SDValue(); |
5419 | } |
5420 | |
5421 | EVT MemVT = Mem->getMemoryVT(); |
5422 | if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) { |
5423 | // We only handle the i8 case |
5424 | return SDValue(); |
5425 | } |
5426 | |
5427 | unsigned ExtType = Val->getConstantOperandVal(Num: Val->getNumOperands() - 1); |
5428 | if (ExtType == ISD::SEXTLOAD) { |
5429 | // If for some reason the load is a sextload, the and is needed to zero |
5430 | // out the high 8 bits |
5431 | return SDValue(); |
5432 | } |
5433 | |
5434 | bool AddTo = false; |
5435 | if (AExt.getNode() != nullptr) { |
5436 | // Re-insert the ext as a zext. |
5437 | Val = DCI.DAG.getNode(Opcode: ISD::ZERO_EXTEND, DL: SDLoc(N), |
5438 | VT: AExt.getValueType(), Operand: Val); |
5439 | AddTo = true; |
5440 | } |
5441 | |
5442 | // If we get here, the AND is unnecessary. Just replace it with the load |
5443 | DCI.CombineTo(N, Res: Val, AddTo); |
5444 | } |
5445 | |
5446 | return SDValue(); |
5447 | } |
5448 | |
5449 | static SDValue PerformREMCombine(SDNode *N, |
5450 | TargetLowering::DAGCombinerInfo &DCI, |
5451 | CodeGenOptLevel OptLevel) { |
5452 | assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); |
5453 | |
5454 | // Don't do anything at less than -O2. |
5455 | if (OptLevel < CodeGenOptLevel::Default) |
5456 | return SDValue(); |
5457 | |
5458 | SelectionDAG &DAG = DCI.DAG; |
5459 | SDLoc DL(N); |
5460 | EVT VT = N->getValueType(ResNo: 0); |
5461 | bool IsSigned = N->getOpcode() == ISD::SREM; |
5462 | unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV; |
5463 | |
5464 | const SDValue &Num = N->getOperand(Num: 0); |
5465 | const SDValue &Den = N->getOperand(Num: 1); |
5466 | |
5467 | for (const SDNode *U : Num->uses()) { |
5468 | if (U->getOpcode() == DivOpc && U->getOperand(Num: 0) == Num && |
5469 | U->getOperand(Num: 1) == Den) { |
5470 | // Num % Den -> Num - (Num / Den) * Den |
5471 | return DAG.getNode(Opcode: ISD::SUB, DL, VT, N1: Num, |
5472 | N2: DAG.getNode(Opcode: ISD::MUL, DL, VT, |
5473 | N1: DAG.getNode(Opcode: DivOpc, DL, VT, N1: Num, N2: Den), |
5474 | N2: Den)); |
5475 | } |
5476 | } |
5477 | return SDValue(); |
5478 | } |
5479 | |
5480 | enum OperandSignedness { |
5481 | Signed = 0, |
5482 | Unsigned, |
5483 | Unknown |
5484 | }; |
5485 | |
5486 | /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand |
5487 | /// that can be demoted to \p OptSize bits without loss of information. The |
5488 | /// signedness of the operand, if determinable, is placed in \p S. |
5489 | static bool IsMulWideOperandDemotable(SDValue Op, |
5490 | unsigned OptSize, |
5491 | OperandSignedness &S) { |
5492 | S = Unknown; |
5493 | |
5494 | if (Op.getOpcode() == ISD::SIGN_EXTEND || |
5495 | Op.getOpcode() == ISD::SIGN_EXTEND_INREG) { |
5496 | EVT OrigVT = Op.getOperand(i: 0).getValueType(); |
5497 | if (OrigVT.getFixedSizeInBits() <= OptSize) { |
5498 | S = Signed; |
5499 | return true; |
5500 | } |
5501 | } else if (Op.getOpcode() == ISD::ZERO_EXTEND) { |
5502 | EVT OrigVT = Op.getOperand(i: 0).getValueType(); |
5503 | if (OrigVT.getFixedSizeInBits() <= OptSize) { |
5504 | S = Unsigned; |
5505 | return true; |
5506 | } |
5507 | } |
5508 | |
5509 | return false; |
5510 | } |
5511 | |
5512 | /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can |
5513 | /// be demoted to \p OptSize bits without loss of information. If the operands |
5514 | /// contain a constant, it should appear as the RHS operand. The signedness of |
5515 | /// the operands is placed in \p IsSigned. |
5516 | static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, |
5517 | unsigned OptSize, |
5518 | bool &IsSigned) { |
5519 | OperandSignedness LHSSign; |
5520 | |
5521 | // The LHS operand must be a demotable op |
5522 | if (!IsMulWideOperandDemotable(Op: LHS, OptSize, S&: LHSSign)) |
5523 | return false; |
5524 | |
5525 | // We should have been able to determine the signedness from the LHS |
5526 | if (LHSSign == Unknown) |
5527 | return false; |
5528 | |
5529 | IsSigned = (LHSSign == Signed); |
5530 | |
5531 | // The RHS can be a demotable op or a constant |
5532 | if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Val&: RHS)) { |
5533 | const APInt &Val = CI->getAPIntValue(); |
5534 | if (LHSSign == Unsigned) { |
5535 | return Val.isIntN(N: OptSize); |
5536 | } else { |
5537 | return Val.isSignedIntN(N: OptSize); |
5538 | } |
5539 | } else { |
5540 | OperandSignedness RHSSign; |
5541 | if (!IsMulWideOperandDemotable(Op: RHS, OptSize, S&: RHSSign)) |
5542 | return false; |
5543 | |
5544 | return LHSSign == RHSSign; |
5545 | } |
5546 | } |
5547 | |
5548 | /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply |
5549 | /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform |
5550 | /// works on both multiply DAG nodes and SHL DAG nodes with a constant shift |
5551 | /// amount. |
5552 | static SDValue TryMULWIDECombine(SDNode *N, |
5553 | TargetLowering::DAGCombinerInfo &DCI) { |
5554 | EVT MulType = N->getValueType(ResNo: 0); |
5555 | if (MulType != MVT::i32 && MulType != MVT::i64) { |
5556 | return SDValue(); |
5557 | } |
5558 | |
5559 | SDLoc DL(N); |
5560 | unsigned OptSize = MulType.getSizeInBits() >> 1; |
5561 | SDValue LHS = N->getOperand(Num: 0); |
5562 | SDValue RHS = N->getOperand(Num: 1); |
5563 | |
5564 | // Canonicalize the multiply so the constant (if any) is on the right |
5565 | if (N->getOpcode() == ISD::MUL) { |
5566 | if (isa<ConstantSDNode>(Val: LHS)) { |
5567 | std::swap(a&: LHS, b&: RHS); |
5568 | } |
5569 | } |
5570 | |
5571 | // If we have a SHL, determine the actual multiply amount |
5572 | if (N->getOpcode() == ISD::SHL) { |
5573 | ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(Val&: RHS); |
5574 | if (!ShlRHS) { |
5575 | return SDValue(); |
5576 | } |
5577 | |
5578 | APInt ShiftAmt = ShlRHS->getAPIntValue(); |
5579 | unsigned BitWidth = MulType.getSizeInBits(); |
5580 | if (ShiftAmt.sge(RHS: 0) && ShiftAmt.slt(RHS: BitWidth)) { |
5581 | APInt MulVal = APInt(BitWidth, 1) << ShiftAmt; |
5582 | RHS = DCI.DAG.getConstant(Val: MulVal, DL, VT: MulType); |
5583 | } else { |
5584 | return SDValue(); |
5585 | } |
5586 | } |
5587 | |
5588 | bool Signed; |
5589 | // Verify that our operands are demotable |
5590 | if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, IsSigned&: Signed)) { |
5591 | return SDValue(); |
5592 | } |
5593 | |
5594 | EVT DemotedVT; |
5595 | if (MulType == MVT::i32) { |
5596 | DemotedVT = MVT::i16; |
5597 | } else { |
5598 | DemotedVT = MVT::i32; |
5599 | } |
5600 | |
5601 | // Truncate the operands to the correct size. Note that these are just for |
5602 | // type consistency and will (likely) be eliminated in later phases. |
5603 | SDValue TruncLHS = |
5604 | DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DemotedVT, Operand: LHS); |
5605 | SDValue TruncRHS = |
5606 | DCI.DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: DemotedVT, Operand: RHS); |
5607 | |
5608 | unsigned Opc; |
5609 | if (Signed) { |
5610 | Opc = NVPTXISD::MUL_WIDE_SIGNED; |
5611 | } else { |
5612 | Opc = NVPTXISD::MUL_WIDE_UNSIGNED; |
5613 | } |
5614 | |
5615 | return DCI.DAG.getNode(Opcode: Opc, DL, VT: MulType, N1: TruncLHS, N2: TruncRHS); |
5616 | } |
5617 | |
5618 | /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes. |
5619 | static SDValue PerformMULCombine(SDNode *N, |
5620 | TargetLowering::DAGCombinerInfo &DCI, |
5621 | CodeGenOptLevel OptLevel) { |
5622 | if (OptLevel > CodeGenOptLevel::None) { |
5623 | // Try mul.wide combining at OptLevel > 0 |
5624 | if (SDValue Ret = TryMULWIDECombine(N, DCI)) |
5625 | return Ret; |
5626 | } |
5627 | |
5628 | return SDValue(); |
5629 | } |
5630 | |
5631 | /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes. |
5632 | static SDValue PerformSHLCombine(SDNode *N, |
5633 | TargetLowering::DAGCombinerInfo &DCI, |
5634 | CodeGenOptLevel OptLevel) { |
5635 | if (OptLevel > CodeGenOptLevel::None) { |
5636 | // Try mul.wide combining at OptLevel > 0 |
5637 | if (SDValue Ret = TryMULWIDECombine(N, DCI)) |
5638 | return Ret; |
5639 | } |
5640 | |
5641 | return SDValue(); |
5642 | } |
5643 | |
5644 | static SDValue PerformSETCCCombine(SDNode *N, |
5645 | TargetLowering::DAGCombinerInfo &DCI, |
5646 | unsigned int SmVersion) { |
5647 | EVT CCType = N->getValueType(ResNo: 0); |
5648 | SDValue A = N->getOperand(Num: 0); |
5649 | SDValue B = N->getOperand(Num: 1); |
5650 | |
5651 | EVT AType = A.getValueType(); |
5652 | if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16))) |
5653 | return SDValue(); |
5654 | |
5655 | if (A.getValueType() == MVT::v2bf16 && SmVersion < 90) |
5656 | return SDValue(); |
5657 | |
5658 | SDLoc DL(N); |
5659 | // setp.f16x2 returns two scalar predicates, which we need to |
5660 | // convert back to v2i1. The returned result will be scalarized by |
5661 | // the legalizer, but the comparison will remain a single vector |
5662 | // instruction. |
5663 | SDValue CCNode = DCI.DAG.getNode( |
5664 | A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2 |
5665 | : NVPTXISD::SETP_BF16X2, |
5666 | DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)}); |
5667 | return DCI.DAG.getNode(Opcode: ISD::BUILD_VECTOR, DL, VT: CCType, N1: CCNode.getValue(R: 0), |
5668 | N2: CCNode.getValue(R: 1)); |
5669 | } |
5670 | |
5671 | static SDValue (SDNode *N, |
5672 | TargetLowering::DAGCombinerInfo &DCI) { |
5673 | SDValue Vector = N->getOperand(Num: 0); |
5674 | SDLoc DL(N); |
5675 | EVT VectorVT = Vector.getValueType(); |
5676 | if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() && |
5677 | IsPTXVectorType(VT: VectorVT.getSimpleVT())) |
5678 | return SDValue(); // Native vector loads already combine nicely w/ |
5679 | // extract_vector_elt. |
5680 | // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already |
5681 | // handle them OK. |
5682 | if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) || |
5683 | VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8) |
5684 | return SDValue(); |
5685 | |
5686 | uint64_t VectorBits = VectorVT.getSizeInBits(); |
5687 | // We only handle the types we can extract in-register. |
5688 | if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64)) |
5689 | return SDValue(); |
5690 | |
5691 | ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Val: N->getOperand(Num: 1)); |
5692 | // Index == 0 is handled by generic DAG combiner. |
5693 | if (!Index || Index->getZExtValue() == 0) |
5694 | return SDValue(); |
5695 | |
5696 | MVT IVT = MVT::getIntegerVT(BitWidth: VectorBits); |
5697 | EVT EltVT = VectorVT.getVectorElementType(); |
5698 | EVT EltIVT = EltVT.changeTypeToInteger(); |
5699 | uint64_t EltBits = EltVT.getScalarSizeInBits(); |
5700 | |
5701 | SDValue Result = DCI.DAG.getNode( |
5702 | Opcode: ISD::TRUNCATE, DL, VT: EltIVT, |
5703 | Operand: DCI.DAG.getNode( |
5704 | Opcode: ISD::SRA, DL, VT: IVT, N1: DCI.DAG.getNode(Opcode: ISD::BITCAST, DL, VT: IVT, Operand: Vector), |
5705 | N2: DCI.DAG.getConstant(Val: Index->getZExtValue() * EltBits, DL, VT: IVT))); |
5706 | |
5707 | // If element has non-integer type, bitcast it back to the expected type. |
5708 | if (EltVT != EltIVT) |
5709 | Result = DCI.DAG.getNode(Opcode: ISD::BITCAST, DL, VT: EltVT, Operand: Result); |
5710 | // Past legalizer, we may need to extent i8 -> i16 to match the register type. |
5711 | if (EltVT != N->getValueType(ResNo: 0)) |
5712 | Result = DCI.DAG.getNode(Opcode: ISD::ANY_EXTEND, DL, VT: N->getValueType(ResNo: 0), Operand: Result); |
5713 | |
5714 | return Result; |
5715 | } |
5716 | |
5717 | static SDValue PerformVSELECTCombine(SDNode *N, |
5718 | TargetLowering::DAGCombinerInfo &DCI) { |
5719 | SDValue VA = N->getOperand(Num: 1); |
5720 | EVT VectorVT = VA.getValueType(); |
5721 | if (VectorVT != MVT::v4i8) |
5722 | return SDValue(); |
5723 | |
5724 | // We need to split vselect into individual per-element operations Because we |
5725 | // use BFE/BFI instruction for byte extraction/insertion, we do end up with |
5726 | // 32-bit values, so we may as well do comparison as i32 to avoid conversions |
5727 | // to/from i16 normally used for i8 values. |
5728 | SmallVector<SDValue, 4> E; |
5729 | SDLoc DL(N); |
5730 | SDValue VCond = N->getOperand(Num: 0); |
5731 | SDValue VB = N->getOperand(Num: 2); |
5732 | for (int I = 0; I < 4; ++I) { |
5733 | SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond, |
5734 | DCI.DAG.getConstant(I, DL, MVT::i32)); |
5735 | SDValue EA = DCI.DAG.getAnyExtOrTrunc( |
5736 | DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA, |
5737 | DCI.DAG.getConstant(I, DL, MVT::i32)), |
5738 | DL, MVT::i32); |
5739 | SDValue EB = DCI.DAG.getAnyExtOrTrunc( |
5740 | DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB, |
5741 | DCI.DAG.getConstant(I, DL, MVT::i32)), |
5742 | DL, MVT::i32); |
5743 | E.push_back(DCI.DAG.getAnyExtOrTrunc( |
5744 | DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8)); |
5745 | } |
5746 | return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E); |
5747 | } |
5748 | |
5749 | static SDValue PerformLOADCombine(SDNode *N, |
5750 | TargetLowering::DAGCombinerInfo &DCI) { |
5751 | SelectionDAG &DAG = DCI.DAG; |
5752 | LoadSDNode *LD = cast<LoadSDNode>(Val: N); |
5753 | |
5754 | // Lower a v16i8 load into a LoadV4 operation with i32 results instead of |
5755 | // letting ReplaceLoadVector split it into smaller loads during legalization. |
5756 | // This is done at dag-combine1 time, so that vector operations with i8 |
5757 | // elements can be optimised away instead of being needlessly split during |
5758 | // legalization, which involves storing to the stack and loading it back. |
5759 | EVT VT = N->getValueType(ResNo: 0); |
5760 | if (VT != MVT::v16i8) |
5761 | return SDValue(); |
5762 | |
5763 | SDLoc DL(N); |
5764 | |
5765 | // Create a v4i32 vector load operation, effectively <4 x v4i8>. |
5766 | unsigned Opc = NVPTXISD::LoadV4; |
5767 | EVT NewVT = MVT::v4i32; |
5768 | EVT EltVT = NewVT.getVectorElementType(); |
5769 | unsigned NumElts = NewVT.getVectorNumElements(); |
5770 | EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other}; |
5771 | SDVTList RetVTList = DAG.getVTList(RetVTs); |
5772 | SmallVector<SDValue, 8> Ops(N->ops()); |
5773 | Ops.push_back(Elt: DAG.getIntPtrConstant(Val: LD->getExtensionType(), DL)); |
5774 | SDValue NewLoad = DAG.getMemIntrinsicNode(Opcode: Opc, dl: DL, VTList: RetVTList, Ops, MemVT: NewVT, |
5775 | MMO: LD->getMemOperand()); |
5776 | SDValue NewChain = NewLoad.getValue(R: NumElts); |
5777 | |
5778 | // Create a vector of the same type returned by the original load. |
5779 | SmallVector<SDValue, 4> Elts; |
5780 | for (unsigned i = 0; i < NumElts; i++) |
5781 | Elts.push_back(Elt: NewLoad.getValue(R: i)); |
5782 | return DCI.DAG.getMergeValues( |
5783 | Ops: {DCI.DAG.getBitcast(VT, V: DCI.DAG.getBuildVector(VT: NewVT, DL, Ops: Elts)), |
5784 | NewChain}, |
5785 | dl: DL); |
5786 | } |
5787 | |
5788 | SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, |
5789 | DAGCombinerInfo &DCI) const { |
5790 | CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); |
5791 | switch (N->getOpcode()) { |
5792 | default: break; |
5793 | case ISD::ADD: |
5794 | case ISD::FADD: |
5795 | return PerformADDCombine(N, DCI, Subtarget: STI, OptLevel); |
5796 | case ISD::MUL: |
5797 | return PerformMULCombine(N, DCI, OptLevel); |
5798 | case ISD::SHL: |
5799 | return PerformSHLCombine(N, DCI, OptLevel); |
5800 | case ISD::AND: |
5801 | return PerformANDCombine(N, DCI); |
5802 | case ISD::UREM: |
5803 | case ISD::SREM: |
5804 | return PerformREMCombine(N, DCI, OptLevel); |
5805 | case ISD::SETCC: |
5806 | return PerformSETCCCombine(N, DCI, SmVersion: STI.getSmVersion()); |
5807 | case ISD::LOAD: |
5808 | return PerformLOADCombine(N, DCI); |
5809 | case NVPTXISD::StoreRetval: |
5810 | case NVPTXISD::StoreRetvalV2: |
5811 | case NVPTXISD::StoreRetvalV4: |
5812 | return PerformStoreRetvalCombine(N); |
5813 | case ISD::EXTRACT_VECTOR_ELT: |
5814 | return PerformEXTRACTCombine(N, DCI); |
5815 | case ISD::VSELECT: |
5816 | return PerformVSELECTCombine(N, DCI); |
5817 | } |
5818 | return SDValue(); |
5819 | } |
5820 | |
5821 | /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. |
5822 | static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, |
5823 | SmallVectorImpl<SDValue> &Results) { |
5824 | EVT ResVT = N->getValueType(ResNo: 0); |
5825 | SDLoc DL(N); |
5826 | |
5827 | assert(ResVT.isVector() && "Vector load must have vector type" ); |
5828 | |
5829 | // We only handle "native" vector sizes for now, e.g. <4 x double> is not |
5830 | // legal. We can (and should) split that into 2 loads of <2 x double> here |
5831 | // but I'm leaving that as a TODO for now. |
5832 | assert(ResVT.isSimple() && "Can only handle simple types" ); |
5833 | switch (ResVT.getSimpleVT().SimpleTy) { |
5834 | default: |
5835 | return; |
5836 | case MVT::v2i8: |
5837 | case MVT::v2i16: |
5838 | case MVT::v2i32: |
5839 | case MVT::v2i64: |
5840 | case MVT::v2f16: |
5841 | case MVT::v2f32: |
5842 | case MVT::v2f64: |
5843 | case MVT::v4i8: |
5844 | case MVT::v4i16: |
5845 | case MVT::v4i32: |
5846 | case MVT::v4f16: |
5847 | case MVT::v4f32: |
5848 | case MVT::v8f16: // <4 x f16x2> |
5849 | case MVT::v8bf16: // <4 x bf16x2> |
5850 | case MVT::v8i16: // <4 x i16x2> |
5851 | // This is a "native" vector type |
5852 | break; |
5853 | } |
5854 | |
5855 | LoadSDNode *LD = cast<LoadSDNode>(Val: N); |
5856 | |
5857 | Align Alignment = LD->getAlign(); |
5858 | auto &TD = DAG.getDataLayout(); |
5859 | Align PrefAlign = |
5860 | TD.getPrefTypeAlign(Ty: LD->getMemoryVT().getTypeForEVT(Context&: *DAG.getContext())); |
5861 | if (Alignment < PrefAlign) { |
5862 | // This load is not sufficiently aligned, so bail out and let this vector |
5863 | // load be scalarized. Note that we may still be able to emit smaller |
5864 | // vector loads. For example, if we are loading a <4 x float> with an |
5865 | // alignment of 8, this check will fail but the legalizer will try again |
5866 | // with 2 x <2 x float>, which will succeed with an alignment of 8. |
5867 | return; |
5868 | } |
5869 | |
5870 | EVT EltVT = ResVT.getVectorElementType(); |
5871 | unsigned NumElts = ResVT.getVectorNumElements(); |
5872 | |
5873 | // Since LoadV2 is a target node, we cannot rely on DAG type legalization. |
5874 | // Therefore, we must ensure the type is legal. For i1 and i8, we set the |
5875 | // loaded type to i16 and propagate the "real" type as the memory type. |
5876 | bool NeedTrunc = false; |
5877 | if (EltVT.getSizeInBits() < 16) { |
5878 | EltVT = MVT::i16; |
5879 | NeedTrunc = true; |
5880 | } |
5881 | |
5882 | unsigned Opcode = 0; |
5883 | SDVTList LdResVTs; |
5884 | bool Load16x2 = false; |
5885 | |
5886 | switch (NumElts) { |
5887 | default: |
5888 | return; |
5889 | case 2: |
5890 | Opcode = NVPTXISD::LoadV2; |
5891 | LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); |
5892 | break; |
5893 | case 4: { |
5894 | Opcode = NVPTXISD::LoadV4; |
5895 | EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; |
5896 | LdResVTs = DAG.getVTList(ListVTs); |
5897 | break; |
5898 | } |
5899 | case 8: { |
5900 | // v8f16 is a special case. PTX doesn't have ld.v8.f16 |
5901 | // instruction. Instead, we split the vector into v2f16 chunks and |
5902 | // load them with ld.v4.b32. |
5903 | assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type." ); |
5904 | Load16x2 = true; |
5905 | Opcode = NVPTXISD::LoadV4; |
5906 | EVT VVT; |
5907 | switch (EltVT.getSimpleVT().SimpleTy) { |
5908 | case MVT::f16: |
5909 | VVT = MVT::v2f16; |
5910 | break; |
5911 | case MVT::bf16: |
5912 | VVT = MVT::v2bf16; |
5913 | break; |
5914 | case MVT::i16: |
5915 | VVT = MVT::v2i16; |
5916 | break; |
5917 | default: |
5918 | llvm_unreachable("Unsupported v8 vector type." ); |
5919 | } |
5920 | EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other}; |
5921 | LdResVTs = DAG.getVTList(ListVTs); |
5922 | break; |
5923 | } |
5924 | } |
5925 | |
5926 | // Copy regular operands |
5927 | SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); |
5928 | |
5929 | // The select routine does not have access to the LoadSDNode instance, so |
5930 | // pass along the extension information |
5931 | OtherOps.push_back(Elt: DAG.getIntPtrConstant(Val: LD->getExtensionType(), DL)); |
5932 | |
5933 | SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: LdResVTs, Ops: OtherOps, |
5934 | MemVT: LD->getMemoryVT(), |
5935 | MMO: LD->getMemOperand()); |
5936 | |
5937 | SmallVector<SDValue, 8> ScalarRes; |
5938 | if (Load16x2) { |
5939 | // Split v2f16 subvectors back into individual elements. |
5940 | NumElts /= 2; |
5941 | for (unsigned i = 0; i < NumElts; ++i) { |
5942 | SDValue SubVector = NewLD.getValue(R: i); |
5943 | SDValue E0 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: SubVector, |
5944 | N2: DAG.getIntPtrConstant(Val: 0, DL)); |
5945 | SDValue E1 = DAG.getNode(Opcode: ISD::EXTRACT_VECTOR_ELT, DL, VT: EltVT, N1: SubVector, |
5946 | N2: DAG.getIntPtrConstant(Val: 1, DL)); |
5947 | ScalarRes.push_back(Elt: E0); |
5948 | ScalarRes.push_back(Elt: E1); |
5949 | } |
5950 | } else { |
5951 | for (unsigned i = 0; i < NumElts; ++i) { |
5952 | SDValue Res = NewLD.getValue(R: i); |
5953 | if (NeedTrunc) |
5954 | Res = DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT.getVectorElementType(), Operand: Res); |
5955 | ScalarRes.push_back(Elt: Res); |
5956 | } |
5957 | } |
5958 | |
5959 | SDValue LoadChain = NewLD.getValue(R: NumElts); |
5960 | |
5961 | SDValue BuildVec = DAG.getBuildVector(VT: ResVT, DL, Ops: ScalarRes); |
5962 | |
5963 | Results.push_back(Elt: BuildVec); |
5964 | Results.push_back(Elt: LoadChain); |
5965 | } |
5966 | |
5967 | static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, |
5968 | SmallVectorImpl<SDValue> &Results) { |
5969 | SDValue Chain = N->getOperand(Num: 0); |
5970 | SDValue Intrin = N->getOperand(Num: 1); |
5971 | SDLoc DL(N); |
5972 | |
5973 | // Get the intrinsic ID |
5974 | unsigned IntrinNo = Intrin.getNode()->getAsZExtVal(); |
5975 | switch (IntrinNo) { |
5976 | default: |
5977 | return; |
5978 | case Intrinsic::nvvm_ldg_global_i: |
5979 | case Intrinsic::nvvm_ldg_global_f: |
5980 | case Intrinsic::nvvm_ldg_global_p: |
5981 | case Intrinsic::nvvm_ldu_global_i: |
5982 | case Intrinsic::nvvm_ldu_global_f: |
5983 | case Intrinsic::nvvm_ldu_global_p: { |
5984 | EVT ResVT = N->getValueType(ResNo: 0); |
5985 | |
5986 | if (ResVT.isVector()) { |
5987 | // Vector LDG/LDU |
5988 | |
5989 | unsigned NumElts = ResVT.getVectorNumElements(); |
5990 | EVT EltVT = ResVT.getVectorElementType(); |
5991 | |
5992 | // Since LDU/LDG are target nodes, we cannot rely on DAG type |
5993 | // legalization. |
5994 | // Therefore, we must ensure the type is legal. For i1 and i8, we set the |
5995 | // loaded type to i16 and propagate the "real" type as the memory type. |
5996 | bool NeedTrunc = false; |
5997 | if (EltVT.getSizeInBits() < 16) { |
5998 | EltVT = MVT::i16; |
5999 | NeedTrunc = true; |
6000 | } |
6001 | |
6002 | unsigned Opcode = 0; |
6003 | SDVTList LdResVTs; |
6004 | |
6005 | switch (NumElts) { |
6006 | default: |
6007 | return; |
6008 | case 2: |
6009 | switch (IntrinNo) { |
6010 | default: |
6011 | return; |
6012 | case Intrinsic::nvvm_ldg_global_i: |
6013 | case Intrinsic::nvvm_ldg_global_f: |
6014 | case Intrinsic::nvvm_ldg_global_p: |
6015 | Opcode = NVPTXISD::LDGV2; |
6016 | break; |
6017 | case Intrinsic::nvvm_ldu_global_i: |
6018 | case Intrinsic::nvvm_ldu_global_f: |
6019 | case Intrinsic::nvvm_ldu_global_p: |
6020 | Opcode = NVPTXISD::LDUV2; |
6021 | break; |
6022 | } |
6023 | LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other); |
6024 | break; |
6025 | case 4: { |
6026 | switch (IntrinNo) { |
6027 | default: |
6028 | return; |
6029 | case Intrinsic::nvvm_ldg_global_i: |
6030 | case Intrinsic::nvvm_ldg_global_f: |
6031 | case Intrinsic::nvvm_ldg_global_p: |
6032 | Opcode = NVPTXISD::LDGV4; |
6033 | break; |
6034 | case Intrinsic::nvvm_ldu_global_i: |
6035 | case Intrinsic::nvvm_ldu_global_f: |
6036 | case Intrinsic::nvvm_ldu_global_p: |
6037 | Opcode = NVPTXISD::LDUV4; |
6038 | break; |
6039 | } |
6040 | EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other }; |
6041 | LdResVTs = DAG.getVTList(ListVTs); |
6042 | break; |
6043 | } |
6044 | } |
6045 | |
6046 | SmallVector<SDValue, 8> OtherOps; |
6047 | |
6048 | // Copy regular operands |
6049 | |
6050 | OtherOps.push_back(Elt: Chain); // Chain |
6051 | // Skip operand 1 (intrinsic ID) |
6052 | // Others |
6053 | OtherOps.append(in_start: N->op_begin() + 2, in_end: N->op_end()); |
6054 | |
6055 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
6056 | |
6057 | SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, dl: DL, VTList: LdResVTs, Ops: OtherOps, |
6058 | MemVT: MemSD->getMemoryVT(), |
6059 | MMO: MemSD->getMemOperand()); |
6060 | |
6061 | SmallVector<SDValue, 4> ScalarRes; |
6062 | |
6063 | for (unsigned i = 0; i < NumElts; ++i) { |
6064 | SDValue Res = NewLD.getValue(R: i); |
6065 | if (NeedTrunc) |
6066 | Res = |
6067 | DAG.getNode(Opcode: ISD::TRUNCATE, DL, VT: ResVT.getVectorElementType(), Operand: Res); |
6068 | ScalarRes.push_back(Elt: Res); |
6069 | } |
6070 | |
6071 | SDValue LoadChain = NewLD.getValue(R: NumElts); |
6072 | |
6073 | SDValue BuildVec = |
6074 | DAG.getBuildVector(VT: ResVT, DL, Ops: ScalarRes); |
6075 | |
6076 | Results.push_back(Elt: BuildVec); |
6077 | Results.push_back(Elt: LoadChain); |
6078 | } else { |
6079 | // i8 LDG/LDU |
6080 | assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 && |
6081 | "Custom handling of non-i8 ldu/ldg?" ); |
6082 | |
6083 | // Just copy all operands as-is |
6084 | SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); |
6085 | |
6086 | // Force output to i16 |
6087 | SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other); |
6088 | |
6089 | MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(Val: N); |
6090 | |
6091 | // We make sure the memory type is i8, which will be used during isel |
6092 | // to select the proper instruction. |
6093 | SDValue NewLD = |
6094 | DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops, |
6095 | MVT::i8, MemSD->getMemOperand()); |
6096 | |
6097 | Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, |
6098 | NewLD.getValue(0))); |
6099 | Results.push_back(Elt: NewLD.getValue(R: 1)); |
6100 | } |
6101 | } |
6102 | } |
6103 | } |
6104 | |
6105 | void NVPTXTargetLowering::ReplaceNodeResults( |
6106 | SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { |
6107 | switch (N->getOpcode()) { |
6108 | default: |
6109 | report_fatal_error(reason: "Unhandled custom legalization" ); |
6110 | case ISD::LOAD: |
6111 | ReplaceLoadVector(N, DAG, Results); |
6112 | return; |
6113 | case ISD::INTRINSIC_W_CHAIN: |
6114 | ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); |
6115 | return; |
6116 | } |
6117 | } |
6118 | |
6119 | NVPTXTargetLowering::AtomicExpansionKind |
6120 | NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { |
6121 | Type *Ty = AI->getValOperand()->getType(); |
6122 | |
6123 | if (AI->isFloatingPointOperation()) { |
6124 | if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { |
6125 | if (Ty->isHalfTy() && STI.getSmVersion() >= 70 && |
6126 | STI.getPTXVersion() >= 63) |
6127 | return AtomicExpansionKind::None; |
6128 | if (Ty->isFloatTy()) |
6129 | return AtomicExpansionKind::None; |
6130 | if (Ty->isDoubleTy() && STI.hasAtomAddF64()) |
6131 | return AtomicExpansionKind::None; |
6132 | } |
6133 | return AtomicExpansionKind::CmpXChg; |
6134 | } |
6135 | |
6136 | assert(Ty->isIntegerTy() && "Ty should be integer at this point" ); |
6137 | auto ITy = cast<llvm::IntegerType>(Val: Ty); |
6138 | |
6139 | switch (AI->getOperation()) { |
6140 | default: |
6141 | return AtomicExpansionKind::CmpXChg; |
6142 | case AtomicRMWInst::BinOp::And: |
6143 | case AtomicRMWInst::BinOp::Or: |
6144 | case AtomicRMWInst::BinOp::Xor: |
6145 | case AtomicRMWInst::BinOp::Xchg: |
6146 | switch (ITy->getBitWidth()) { |
6147 | case 8: |
6148 | case 16: |
6149 | return AtomicExpansionKind::CmpXChg; |
6150 | case 32: |
6151 | return AtomicExpansionKind::None; |
6152 | case 64: |
6153 | if (STI.hasAtomBitwise64()) |
6154 | return AtomicExpansionKind::None; |
6155 | return AtomicExpansionKind::CmpXChg; |
6156 | default: |
6157 | llvm_unreachable("unsupported width encountered" ); |
6158 | } |
6159 | case AtomicRMWInst::BinOp::Add: |
6160 | case AtomicRMWInst::BinOp::Sub: |
6161 | case AtomicRMWInst::BinOp::Max: |
6162 | case AtomicRMWInst::BinOp::Min: |
6163 | case AtomicRMWInst::BinOp::UMax: |
6164 | case AtomicRMWInst::BinOp::UMin: |
6165 | switch (ITy->getBitWidth()) { |
6166 | case 8: |
6167 | case 16: |
6168 | return AtomicExpansionKind::CmpXChg; |
6169 | case 32: |
6170 | return AtomicExpansionKind::None; |
6171 | case 64: |
6172 | if (STI.hasAtomMinMax64()) |
6173 | return AtomicExpansionKind::None; |
6174 | return AtomicExpansionKind::CmpXChg; |
6175 | default: |
6176 | llvm_unreachable("unsupported width encountered" ); |
6177 | } |
6178 | } |
6179 | |
6180 | return AtomicExpansionKind::CmpXChg; |
6181 | } |
6182 | |
6183 | // Pin NVPTXTargetObjectFile's vtables to this file. |
6184 | NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default; |
6185 | |
6186 | MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( |
6187 | const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { |
6188 | return getDataSection(); |
6189 | } |
6190 | |