1 | //===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "RISCVTargetTransformInfo.h" |
10 | #include "MCTargetDesc/RISCVMatInt.h" |
11 | #include "llvm/ADT/STLExtras.h" |
12 | #include "llvm/Analysis/TargetTransformInfo.h" |
13 | #include "llvm/CodeGen/BasicTTIImpl.h" |
14 | #include "llvm/CodeGen/CostTable.h" |
15 | #include "llvm/CodeGen/TargetLowering.h" |
16 | #include "llvm/IR/Instructions.h" |
17 | #include <cmath> |
18 | #include <optional> |
19 | using namespace llvm; |
20 | |
21 | #define DEBUG_TYPE "riscvtti" |
22 | |
static cl::opt<unsigned> RVVRegisterWidthLMUL(
    "riscv-v-register-bit-width-lmul",
    cl::desc(
        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
        "by autovectorized code. Fractional LMULs are not supported."),
    cl::init(2), cl::Hidden);

static cl::opt<unsigned> SLPMaxVF(
    "riscv-v-slp-max-vf",
    cl::desc(
        "Overrides result used for getMaximumVF query which is used "
        "exclusively by SLP vectorizer."),
    cl::Hidden);
36 | |
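// Estimate the cost of executing the sequence of RVV opcodes in OpCodes on
// vectors of type VT. Code size is modeled as the raw instruction count,
// size-and-latency scales that count by LMUL, and throughput/latency use the
// per-opcode estimates below.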
37 | InstructionCost |
38 | RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, |
39 | TTI::TargetCostKind CostKind) { |
40 | // Check if the type is valid for all CostKind |
41 | if (!VT.isVector()) |
42 | return InstructionCost::getInvalid(); |
43 | size_t NumInstr = OpCodes.size(); |
44 | if (CostKind == TTI::TCK_CodeSize) |
45 | return NumInstr; |
46 | InstructionCost LMULCost = TLI->getLMULCost(VT); |
47 | if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) |
48 | return LMULCost * NumInstr; |
49 | InstructionCost Cost = 0; |
50 | for (auto Op : OpCodes) { |
51 | switch (Op) { |
52 | case RISCV::VRGATHER_VI: |
53 | Cost += TLI->getVRGatherVICost(VT); |
54 | break; |
55 | case RISCV::VRGATHER_VV: |
56 | Cost += TLI->getVRGatherVVCost(VT); |
57 | break; |
58 | case RISCV::VSLIDEUP_VI: |
59 | case RISCV::VSLIDEDOWN_VI: |
60 | Cost += TLI->getVSlideVICost(VT); |
61 | break; |
62 | case RISCV::VSLIDEUP_VX: |
63 | case RISCV::VSLIDEDOWN_VX: |
64 | Cost += TLI->getVSlideVXCost(VT); |
65 | break; |
66 | case RISCV::VREDMAX_VS: |
67 | case RISCV::VREDMIN_VS: |
68 | case RISCV::VREDMAXU_VS: |
69 | case RISCV::VREDMINU_VS: |
70 | case RISCV::VREDSUM_VS: |
71 | case RISCV::VREDAND_VS: |
72 | case RISCV::VREDOR_VS: |
73 | case RISCV::VREDXOR_VS: |
74 | case RISCV::VFREDMAX_VS: |
75 | case RISCV::VFREDMIN_VS: |
76 | case RISCV::VFREDUSUM_VS: { |
77 | unsigned VL = VT.getVectorMinNumElements(); |
78 | if (!VT.isFixedLengthVector()) |
79 | VL *= *getVScaleForTuning(); |
80 | Cost += Log2_32_Ceil(VL); |
81 | break; |
82 | } |
83 | case RISCV::VFREDOSUM_VS: { |
84 | unsigned VL = VT.getVectorMinNumElements(); |
85 | if (!VT.isFixedLengthVector()) |
86 | VL *= *getVScaleForTuning(); |
87 | Cost += VL; |
88 | break; |
89 | } |
90 | case RISCV::VMV_X_S: |
91 | case RISCV::VMV_S_X: |
92 | case RISCV::VFMV_F_S: |
93 | case RISCV::VFMV_S_F: |
94 | case RISCV::VMOR_MM: |
95 | case RISCV::VMXOR_MM: |
96 | case RISCV::VMAND_MM: |
97 | case RISCV::VMANDN_MM: |
98 | case RISCV::VMNAND_MM: |
99 | case RISCV::VCPOP_M: |
100 | Cost += 1; |
101 | break; |
102 | default: |
103 | Cost += LMULCost; |
104 | } |
105 | } |
106 | return Cost; |
107 | } |
108 | |
109 | InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
110 | TTI::TargetCostKind CostKind) { |
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");
113 | |
114 | // We have a Zero register, so 0 is always free. |
115 | if (Imm == 0) |
116 | return TTI::TCC_Free; |
117 | |
118 | // Otherwise, we check how many instructions it will take to materialise. |
119 | const DataLayout &DL = getDataLayout(); |
120 | return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *getST()); |
121 | } |
122 | |
123 | // Look for patterns of shift followed by AND that can be turned into a pair of |
124 | // shifts. We won't need to materialize an immediate for the AND so these can |
125 | // be considered free. |
126 | static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) { |
127 | uint64_t Mask = Imm.getZExtValue(); |
  auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
129 | if (!BO || !BO->hasOneUse()) |
130 | return false; |
131 | |
132 | if (BO->getOpcode() != Instruction::Shl) |
133 | return false; |
134 | |
  if (!isa<ConstantInt>(BO->getOperand(1)))
    return false;

  unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
139 | // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1 |
140 | // is a mask shifted by c2 bits with c3 leading zeros. |
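  // For example, with ShAmt = 11 and Mask = 0x7f800 (a contiguous mask with
  // 11 trailing and 45 leading zeros on RV64), (and (shl x, 11), 0x7f800)
  // becomes (srli (slli x, 56), 45).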
  if (isShiftedMask_64(Mask)) {
    unsigned Trailing = llvm::countr_zero(Mask);
143 | if (ShAmt == Trailing) |
144 | return true; |
145 | } |
146 | |
147 | return false; |
148 | } |
149 | |
150 | InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
151 | const APInt &Imm, Type *Ty, |
152 | TTI::TargetCostKind CostKind, |
153 | Instruction *Inst) { |
  assert(Ty->isIntegerTy() &&
         "getIntImmCost can only estimate cost of materialising integers");
156 | |
157 | // We have a Zero register, so 0 is always free. |
158 | if (Imm == 0) |
159 | return TTI::TCC_Free; |
160 | |
161 | // Some instructions in RISC-V can take a 12-bit immediate. Some of these are |
162 | // commutative, in others the immediate comes from a specific argument index. |
163 | bool Takes12BitImm = false; |
164 | unsigned ImmArgIdx = ~0U; |
165 | |
166 | switch (Opcode) { |
167 | case Instruction::GetElementPtr: |
168 | // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will |
169 | // split up large offsets in GEP into better parts than ConstantHoisting |
170 | // can. |
171 | return TTI::TCC_Free; |
172 | case Instruction::Store: |
173 | // If the address is a constant, use the materialization cost. |
174 | if (Idx == 1) |
175 | return getIntImmCost(Imm, Ty, CostKind); |
176 | return TTI::TCC_Free; |
177 | case Instruction::Load: |
178 | // If the address is a constant, use the materialization cost. |
179 | return getIntImmCost(Imm, Ty, CostKind); |
180 | case Instruction::And: |
181 | // zext.h |
182 | if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb()) |
183 | return TTI::TCC_Free; |
184 | // zext.w |
185 | if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba()) |
186 | return TTI::TCC_Free; |
187 | // bclri |
188 | if (ST->hasStdExtZbs() && (~Imm).isPowerOf2()) |
189 | return TTI::TCC_Free; |
190 | if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() && |
191 | canUseShiftPair(Inst, Imm)) |
192 | return TTI::TCC_Free; |
193 | Takes12BitImm = true; |
194 | break; |
195 | case Instruction::Add: |
196 | Takes12BitImm = true; |
197 | break; |
198 | case Instruction::Or: |
199 | case Instruction::Xor: |
200 | // bseti/binvi |
201 | if (ST->hasStdExtZbs() && Imm.isPowerOf2()) |
202 | return TTI::TCC_Free; |
203 | Takes12BitImm = true; |
204 | break; |
205 | case Instruction::Mul: |
206 | // Power of 2 is a shift. Negated power of 2 is a shift and a negate. |
207 | if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2()) |
208 | return TTI::TCC_Free; |
209 | // One more or less than a power of 2 can use SLLI+ADD/SUB. |
210 | if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2()) |
211 | return TTI::TCC_Free; |
212 | // FIXME: There is no MULI instruction. |
213 | Takes12BitImm = true; |
214 | break; |
215 | case Instruction::Sub: |
216 | case Instruction::Shl: |
217 | case Instruction::LShr: |
218 | case Instruction::AShr: |
219 | Takes12BitImm = true; |
220 | ImmArgIdx = 1; |
221 | break; |
222 | default: |
223 | break; |
224 | } |
225 | |
226 | if (Takes12BitImm) { |
227 | // Check immediate is the correct argument... |
228 | if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) { |
229 | // ... and fits into the 12-bit immediate. |
230 | if (Imm.getSignificantBits() <= 64 && |
          getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
232 | return TTI::TCC_Free; |
233 | } |
234 | } |
235 | |
236 | // Otherwise, use the full materialisation cost. |
237 | return getIntImmCost(Imm, Ty, CostKind); |
238 | } |
239 | |
240 | // By default, prevent hoisting. |
241 | return TTI::TCC_Free; |
242 | } |
243 | |
244 | InstructionCost |
245 | RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
246 | const APInt &Imm, Type *Ty, |
247 | TTI::TargetCostKind CostKind) { |
248 | // Prevent hoisting in unknown cases. |
249 | return TTI::TCC_Free; |
250 | } |
251 | |
252 | bool RISCVTTIImpl::hasActiveVectorLength(unsigned, Type *DataTy, Align) const { |
253 | return ST->hasVInstructions(); |
254 | } |
255 | |
256 | TargetTransformInfo::PopcntSupportKind |
257 | RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) { |
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
259 | return ST->hasStdExtZbb() || ST->hasVendorXCVbitmanip() |
260 | ? TTI::PSK_FastHardware |
261 | : TTI::PSK_Software; |
262 | } |
263 | |
264 | bool RISCVTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { |
265 | // Currently, the ExpandReductions pass can't expand scalable-vector |
266 | // reductions, but we still request expansion as RVV doesn't support certain |
267 | // reductions and the SelectionDAG can't legalize them either. |
268 | switch (II->getIntrinsicID()) { |
269 | default: |
270 | return false; |
271 | // These reductions have no equivalent in RVV |
272 | case Intrinsic::vector_reduce_mul: |
273 | case Intrinsic::vector_reduce_fmul: |
274 | return true; |
275 | } |
276 | } |
277 | |
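// With V, the maximum vscale is the largest supported VLEN divided by the
// size of a single RVV register block.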
278 | std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const { |
279 | if (ST->hasVInstructions()) |
280 | return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock; |
281 | return BaseT::getMaxVScale(); |
282 | } |
283 | |
284 | std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const { |
285 | if (ST->hasVInstructions()) |
286 | if (unsigned MinVLen = ST->getRealMinVLen(); |
287 | MinVLen >= RISCV::RVVBitsPerBlock) |
288 | return MinVLen / RISCV::RVVBitsPerBlock; |
289 | return BaseT::getVScaleForTuning(); |
290 | } |
291 | |
292 | TypeSize |
293 | RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
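  // Clamp the configured LMUL to [1, 8] and round down to a power of two;
  // fractional LMULs are not supported here.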
  unsigned LMUL =
      llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->getXLen());
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(
        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(
        (ST->hasVInstructions() &&
         ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
            ? LMUL * RISCV::RVVBitsPerBlock
            : 0);
  }

  llvm_unreachable("Unsupported register kind");
311 | } |
312 | |
313 | InstructionCost |
314 | RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind) { |
315 | // Add a cost of address generation + the cost of the load. The address |
316 | // is expected to be a PC relative offset to a constant pool entry |
317 | // using auipc/addi. |
  return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
                             /*AddressSpace=*/0, CostKind);
320 | } |
321 | |
322 | static VectorType *getVRGatherIndexType(MVT DataVT, const RISCVSubtarget &ST, |
323 | LLVMContext &C) { |
  assert((DataVT.getScalarSizeInBits() != 8 ||
          DataVT.getVectorNumElements() <= 256) &&
         "unhandled case in lowering");
  MVT IndexVT = DataVT.changeTypeToInteger();
  if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
    IndexVT = IndexVT.changeVectorElementType(MVT::i16);
  return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
330 | } |
331 | |
332 | InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, |
333 | VectorType *Tp, ArrayRef<int> Mask, |
334 | TTI::TargetCostKind CostKind, |
335 | int Index, VectorType *SubTp, |
336 | ArrayRef<const Value *> Args, |
337 | const Instruction *CxtI) { |
  Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
339 | |
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
341 | |
342 | // First, handle cases where having a fixed length vector enables us to |
343 | // give a more accurate cost than falling back to generic scalable codegen. |
344 | // TODO: Each of these cases hints at a modeling gap around scalable vectors. |
  if (isa<FixedVectorType>(Tp)) {
346 | switch (Kind) { |
347 | default: |
348 | break; |
349 | case TTI::SK_PermuteSingleSrc: { |
350 | if (Mask.size() >= 2 && LT.second.isFixedLengthVector()) { |
351 | MVT EltTp = LT.second.getVectorElementType(); |
352 | // If the size of the element is < ELEN then shuffles of interleaves and |
353 | // deinterleaves of 2 vectors can be lowered into the following |
354 | // sequences |
355 | if (EltTp.getScalarSizeInBits() < ST->getELen()) { |
356 | // Example sequence: |
357 | // vsetivli zero, 4, e8, mf4, ta, ma (ignored) |
358 | // vwaddu.vv v10, v8, v9 |
359 | // li a0, -1 (ignored) |
360 | // vwmaccu.vx v10, a0, v9 |
          if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
            return 2 * LT.first * TLI->getLMULCost(LT.second);
363 | |
364 | if (Mask[0] == 0 || Mask[0] == 1) { |
            auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
366 | // Example sequence: |
367 | // vnsrl.wi v10, v8, 0 |
368 | if (equal(DeinterleaveMask, Mask)) |
369 | return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, |
370 | LT.second, CostKind); |
371 | } |
372 | } |
373 | } |
374 | // vrgather + cost of generating the mask constant. |
375 | // We model this for an unknown mask with a single vrgather. |
376 | if (LT.second.isFixedLengthVector() && LT.first == 1 && |
377 | (LT.second.getScalarSizeInBits() != 8 || |
378 | LT.second.getVectorNumElements() <= 256)) { |
        VectorType *IdxTy =
            getVRGatherIndexType(LT.second, *ST, Tp->getContext());
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
381 | return IndexCost + |
382 | getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); |
383 | } |
384 | [[fallthrough]]; |
385 | } |
386 | case TTI::SK_Transpose: |
387 | case TTI::SK_PermuteTwoSrc: { |
388 | // 2 x (vrgather + cost of generating the mask constant) + cost of mask |
389 | // register for the second vrgather. We model this for an unknown |
390 | // (shuffle) mask. |
391 | if (LT.second.isFixedLengthVector() && LT.first == 1 && |
392 | (LT.second.getScalarSizeInBits() != 8 || |
393 | LT.second.getVectorNumElements() <= 256)) { |
394 | auto &C = Tp->getContext(); |
395 | auto EC = Tp->getElementCount(); |
        VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
        VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);
        InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
        InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
400 | return 2 * IndexCost + |
401 | getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, |
402 | LT.second, CostKind) + |
403 | MaskCost; |
404 | } |
405 | [[fallthrough]]; |
406 | } |
407 | case TTI::SK_Select: { |
408 | // We are going to permute multiple sources and the result will be in |
409 | // multiple destinations. Providing an accurate cost only for splits where |
410 | // the element type remains the same. |
411 | if (!Mask.empty() && LT.first.isValid() && LT.first != 1 && |
412 | LT.second.isFixedLengthVector() && |
413 | LT.second.getVectorElementType().getSizeInBits() == |
414 | Tp->getElementType()->getPrimitiveSizeInBits() && |
415 | LT.second.getVectorNumElements() < |
              cast<FixedVectorType>(Tp)->getNumElements() &&
          divideCeil(Mask.size(),
                     cast<FixedVectorType>(Tp)->getNumElements()) ==
              static_cast<unsigned>(*LT.first.getValue())) {
        unsigned NumRegs = *LT.first.getValue();
        unsigned VF = cast<FixedVectorType>(Tp)->getNumElements();
        unsigned SubVF = PowerOf2Ceil(VF / NumRegs);
        auto *SubVecTy = FixedVectorType::get(Tp->getElementType(), SubVF);
424 | |
425 | InstructionCost Cost = 0; |
426 | for (unsigned I = 0; I < NumRegs; ++I) { |
427 | bool IsSingleVector = true; |
428 | SmallVector<int> SubMask(SubVF, PoisonMaskElem); |
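          // Remap global mask indices into sub-shuffle-local ones: index I
          // selects from the first source when I < VF and from the second
          // source otherwise.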
          transform(Mask.slice(I * SubVF,
                               I == NumRegs - 1 ? Mask.size() % SubVF : SubVF),
                    SubMask.begin(), [&](int I) {
                      bool SingleSubVector = I / VF == 0;
                      IsSingleVector &= SingleSubVector;
                      return (SingleSubVector ? 0 : 1) * SubVF + I % VF;
                    });
          Cost += getShuffleCost(IsSingleVector ? TTI::SK_PermuteSingleSrc
                                                : TTI::SK_PermuteTwoSrc,
                                 SubVecTy, SubMask, CostKind, 0, nullptr);
        }
        return Cost;
441 | } |
442 | break; |
443 | } |
444 | } |
445 | }; |
446 | |
447 | // Handle scalable vectors (and fixed vectors legalized to scalable vectors). |
448 | switch (Kind) { |
449 | default: |
450 | // Fallthrough to generic handling. |
451 | // TODO: Most of these cases will return getInvalid in generic code, and |
452 | // must be implemented here. |
453 | break; |
454 | case TTI::SK_ExtractSubvector: |
455 | // Extract at zero is always a subregister extract |
456 | if (Index == 0) |
457 | return TTI::TCC_Free; |
458 | |
459 | // If we're extracting a subvector of at most m1 size at a sub-register |
460 | // boundary - which unfortunately we need exact vlen to identify - this is |
461 | // a subregister extract at worst and thus won't require a vslidedown. |
462 | // TODO: Extend for aligned m2, m4 subvector extracts |
    // TODO: Extend for misaligned (but contained) extracts
464 | // TODO: Extend for scalable subvector types |
    if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
466 | SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) { |
467 | const unsigned MinVLen = ST->getRealMinVLen(); |
468 | const unsigned MaxVLen = ST->getRealMaxVLen(); |
469 | if (MinVLen == MaxVLen && |
470 | SubLT.second.getScalarSizeInBits() * Index % MinVLen == 0 && |
471 | SubLT.second.getSizeInBits() <= MinVLen) |
472 | return TTI::TCC_Free; |
473 | } |
474 | |
475 | // Example sequence: |
476 | // vsetivli zero, 4, e8, mf2, tu, ma (ignored) |
477 | // vslidedown.vi v8, v9, 2 |
478 | return LT.first * |
479 | getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind); |
480 | case TTI::SK_InsertSubvector: |
481 | // Example sequence: |
482 | // vsetivli zero, 4, e8, mf2, tu, ma (ignored) |
483 | // vslideup.vi v8, v9, 2 |
484 | return LT.first * |
485 | getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind); |
486 | case TTI::SK_Select: { |
487 | // Example sequence: |
488 | // li a0, 90 |
489 | // vsetivli zero, 8, e8, mf2, ta, ma (ignored) |
490 | // vmv.s.x v0, a0 |
491 | // vmerge.vvm v8, v9, v8, v0 |
492 | // We use 2 for the cost of the mask materialization as this is the true |
493 | // cost for small masks and most shuffles are small. At worst, this cost |
494 | // should be a very small constant for the constant pool load. As such, |
    // we may bias towards large selects slightly more than truly warranted.
496 | return LT.first * |
497 | (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, |
498 | LT.second, CostKind)); |
499 | } |
500 | case TTI::SK_Broadcast: { |
    bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
                                           Instruction::InsertElement);
503 | if (LT.second.getScalarSizeInBits() == 1) { |
504 | if (HasScalar) { |
505 | // Example sequence: |
506 | // andi a0, a0, 1 |
507 | // vsetivli zero, 2, e8, mf8, ta, ma (ignored) |
508 | // vmv.v.x v8, a0 |
509 | // vmsne.vi v0, v8, 0 |
510 | return LT.first * |
511 | (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, |
512 | LT.second, CostKind)); |
513 | } |
514 | // Example sequence: |
515 | // vsetivli zero, 2, e8, mf8, ta, mu (ignored) |
516 | // vmv.v.i v8, 0 |
517 | // vmerge.vim v8, v8, 1, v0 |
518 | // vmv.x.s a0, v8 |
519 | // andi a0, a0, 1 |
520 | // vmv.v.x v8, a0 |
521 | // vmsne.vi v0, v8, 0 |
522 | |
523 | return LT.first * |
524 | (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, |
525 | RISCV::VMV_X_S, RISCV::VMV_V_X, |
526 | RISCV::VMSNE_VI}, |
527 | LT.second, CostKind)); |
528 | } |
529 | |
530 | if (HasScalar) { |
531 | // Example sequence: |
532 | // vmv.v.x v8, a0 |
533 | return LT.first * |
534 | getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind); |
535 | } |
536 | |
537 | // Example sequence: |
538 | // vrgather.vi v9, v8, 0 |
539 | return LT.first * |
540 | getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind); |
541 | } |
542 | case TTI::SK_Splice: { |
543 | // vslidedown+vslideup. |
544 | // TODO: Multiplying by LT.first implies this legalizes into multiple copies |
545 | // of similar code, but I think we expand through memory. |
546 | unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; |
547 | if (Index >= 0 && Index < 32) |
548 | Opcodes[0] = RISCV::VSLIDEDOWN_VI; |
549 | else if (Index < 0 && Index > -32) |
550 | Opcodes[1] = RISCV::VSLIDEUP_VI; |
    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
552 | } |
553 | case TTI::SK_Reverse: { |
554 | // TODO: Cases to improve here: |
555 | // * Illegal vector types |
556 | // * i64 on RV32 |
557 | // * i1 vector |
558 | // At low LMUL, most of the cost is producing the vrgather index register. |
559 | // At high LMUL, the cost of the vrgather itself will dominate. |
560 | // Example sequence: |
561 | // csrr a0, vlenb |
562 | // srli a0, a0, 3 |
563 | // addi a0, a0, -1 |
564 | // vsetvli a1, zero, e8, mf8, ta, mu (ignored) |
565 | // vid.v v9 |
566 | // vrsub.vx v10, v9, a0 |
567 | // vrgather.vv v9, v8, v10 |
568 | InstructionCost LenCost = 3; |
569 | if (LT.second.isFixedLengthVector()) |
570 | // vrsub.vi has a 5 bit immediate field, otherwise an li suffices |
      LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
572 | unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV}; |
573 | if (LT.second.isFixedLengthVector() && |
574 | isInt<5>(LT.second.getVectorNumElements() - 1)) |
575 | Opcodes[1] = RISCV::VRSUB_VI; |
576 | InstructionCost GatherCost = |
577 | getRISCVInstructionCost(Opcodes, LT.second, CostKind); |
    // Mask operations additionally require an extend and a truncate.
    InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;
580 | return LT.first * (LenCost + GatherCost + ExtendCost); |
581 | } |
582 | } |
583 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); |
584 | } |
585 | |
586 | InstructionCost |
587 | RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, |
588 | unsigned AddressSpace, |
589 | TTI::TargetCostKind CostKind) { |
  if (!isLegalMaskedLoadStore(Src, Alignment) ||
      CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                        CostKind);
594 | |
595 | return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); |
596 | } |
597 | |
598 | InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( |
599 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
600 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
601 | bool UseMaskForCond, bool UseMaskForGaps) { |
  if (isa<ScalableVectorType>(VecTy))
    return InstructionCost::getInvalid();
  auto *FVTy = cast<FixedVectorType>(VecTy);
  InstructionCost MemCost =
      getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
607 | unsigned VF = FVTy->getNumElements() / Factor; |
608 | |
  // The interleaved memory access pass will lower interleaved memory ops (i.e.
  // a load or store followed by a specific shuffle) to vlseg/vsseg
  // intrinsics. In those cases we can treat it as if it's just one (legal)
  // memory op.
613 | if (!UseMaskForCond && !UseMaskForGaps && |
614 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
    // Need to make sure the type hasn't been scalarized.
    if (LT.second.isFixedLengthVector()) {
      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
                                             LT.second.getVectorNumElements());
      // FIXME: We use the memory op cost of the *legalized* type here because
      // getMemoryOpCost returns a really expensive cost for types like
      // <6 x i8>, which show up when doing interleaves of Factor=3 etc.
      // Should the memory op cost of these be cheaper?
      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, Alignment,
                                            AddressSpace, DL)) {
        InstructionCost LegalMemCost = getMemoryOpCost(
            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
628 | return LT.first + LegalMemCost; |
629 | } |
630 | } |
631 | } |
632 | |
633 | // An interleaved load will look like this for Factor=3: |
634 | // %wide.vec = load <12 x i32>, ptr %3, align 4 |
635 | // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
636 | // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
637 | // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask> |
638 | if (Opcode == Instruction::Load) { |
639 | InstructionCost Cost = MemCost; |
640 | for (unsigned Index : Indices) { |
      FixedVectorType *SubVecTy =
          FixedVectorType::get(FVTy->getElementType(), VF * Factor);
      auto Mask = createStrideMask(Index, Factor, VF);
      InstructionCost ShuffleCost =
          getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, SubVecTy, Mask,
                         CostKind, 0, nullptr, {});
647 | Cost += ShuffleCost; |
648 | } |
649 | return Cost; |
650 | } |
651 | |
652 | // TODO: Model for NF > 2 |
653 | // We'll need to enhance getShuffleCost to model shuffles that are just |
654 | // inserts and extracts into subvectors, since they won't have the full cost |
655 | // of a vrgather. |
656 | // An interleaved store for 3 vectors of 4 lanes will look like |
657 | // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7> |
658 | // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3> |
659 | // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11> |
660 | // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask> |
661 | // store <12 x i32> %interleaved.vec, ptr %10, align 4 |
662 | if (Factor != 2) |
663 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
664 | Alignment, AddressSpace, CostKind, |
665 | UseMaskForCond, UseMaskForGaps); |
666 | |
  assert(Opcode == Instruction::Store && "Opcode must be a store");
  // For an interleaving store of 2 vectors, we perform one large interleaving
  // shuffle that goes into the wide store.
  auto Mask = createInterleaveMask(VF, Factor);
  InstructionCost ShuffleCost =
      getShuffleCost(TTI::ShuffleKind::SK_PermuteSingleSrc, FVTy, Mask,
                     CostKind, 0, nullptr, {});
674 | return MemCost + ShuffleCost; |
675 | } |
676 | |
677 | InstructionCost RISCVTTIImpl::getGatherScatterOpCost( |
678 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
679 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
680 | if (CostKind != TTI::TCK_RecipThroughput) |
681 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
682 | Alignment, CostKind, I); |
683 | |
  if ((Opcode == Instruction::Load &&
       !isLegalMaskedGather(DataTy, Align(Alignment))) ||
      (Opcode == Instruction::Store &&
       !isLegalMaskedScatter(DataTy, Align(Alignment))))
    return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                         Alignment, CostKind, I);

  // Cost is proportional to the number of memory operations implied. For
  // scalable vectors, we use an estimate on that number since we don't
  // know exactly what VL will be.
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
699 | return NumLoads * MemOpCost; |
700 | } |
701 | |
702 | InstructionCost RISCVTTIImpl::getStridedMemoryOpCost( |
703 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
704 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
705 | if (((Opcode == Instruction::Load || Opcode == Instruction::Store) && |
       !isLegalStridedLoadStore(DataTy, Alignment)) ||
707 | (Opcode != Instruction::Load && Opcode != Instruction::Store)) |
708 | return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask, |
709 | Alignment, CostKind, I); |
710 | |
711 | if (CostKind == TTI::TCK_CodeSize) |
712 | return TTI::TCC_Basic; |
713 | |
714 | // Cost is proportional to the number of memory operations implied. For |
715 | // scalable vectors, we use an estimate on that number since we don't |
716 | // know exactly what VL will be. |
  auto &VTy = *cast<VectorType>(DataTy);
  InstructionCost MemOpCost =
      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
                      {TTI::OK_AnyValue, TTI::OP_None}, I);
  unsigned NumLoads = getEstimatedVLFor(&VTy);
722 | return NumLoads * MemOpCost; |
723 | } |
724 | |
725 | // Currently, these represent both throughput and codesize costs |
726 | // for the respective intrinsics. The costs in this table are simply |
727 | // instruction counts with the following adjustments made: |
728 | // * One vsetvli is considered free. |
729 | static const CostTblEntry VectorIntrinsicCostTable[]{ |
730 | {Intrinsic::floor, MVT::f32, 9}, |
731 | {Intrinsic::floor, MVT::f64, 9}, |
732 | {Intrinsic::ceil, MVT::f32, 9}, |
733 | {Intrinsic::ceil, MVT::f64, 9}, |
734 | {Intrinsic::trunc, MVT::f32, 7}, |
735 | {Intrinsic::trunc, MVT::f64, 7}, |
736 | {Intrinsic::round, MVT::f32, 9}, |
737 | {Intrinsic::round, MVT::f64, 9}, |
738 | {Intrinsic::roundeven, MVT::f32, 9}, |
739 | {Intrinsic::roundeven, MVT::f64, 9}, |
740 | {Intrinsic::rint, MVT::f32, 7}, |
741 | {Intrinsic::rint, MVT::f64, 7}, |
742 | {Intrinsic::lrint, MVT::i32, 1}, |
743 | {Intrinsic::lrint, MVT::i64, 1}, |
744 | {Intrinsic::llrint, MVT::i64, 1}, |
745 | {Intrinsic::nearbyint, MVT::f32, 9}, |
746 | {Intrinsic::nearbyint, MVT::f64, 9}, |
747 | {Intrinsic::bswap, MVT::i16, 3}, |
748 | {Intrinsic::bswap, MVT::i32, 12}, |
749 | {Intrinsic::bswap, MVT::i64, 31}, |
750 | {Intrinsic::vp_bswap, MVT::i16, 3}, |
751 | {Intrinsic::vp_bswap, MVT::i32, 12}, |
752 | {Intrinsic::vp_bswap, MVT::i64, 31}, |
753 | {Intrinsic::vp_fshl, MVT::i8, 7}, |
754 | {Intrinsic::vp_fshl, MVT::i16, 7}, |
755 | {Intrinsic::vp_fshl, MVT::i32, 7}, |
756 | {Intrinsic::vp_fshl, MVT::i64, 7}, |
757 | {Intrinsic::vp_fshr, MVT::i8, 7}, |
758 | {Intrinsic::vp_fshr, MVT::i16, 7}, |
759 | {Intrinsic::vp_fshr, MVT::i32, 7}, |
760 | {Intrinsic::vp_fshr, MVT::i64, 7}, |
761 | {Intrinsic::bitreverse, MVT::i8, 17}, |
762 | {Intrinsic::bitreverse, MVT::i16, 24}, |
763 | {Intrinsic::bitreverse, MVT::i32, 33}, |
764 | {Intrinsic::bitreverse, MVT::i64, 52}, |
765 | {Intrinsic::vp_bitreverse, MVT::i8, 17}, |
766 | {Intrinsic::vp_bitreverse, MVT::i16, 24}, |
767 | {Intrinsic::vp_bitreverse, MVT::i32, 33}, |
768 | {Intrinsic::vp_bitreverse, MVT::i64, 52}, |
769 | {Intrinsic::ctpop, MVT::i8, 12}, |
770 | {Intrinsic::ctpop, MVT::i16, 19}, |
771 | {Intrinsic::ctpop, MVT::i32, 20}, |
772 | {Intrinsic::ctpop, MVT::i64, 21}, |
773 | {Intrinsic::vp_ctpop, MVT::i8, 12}, |
774 | {Intrinsic::vp_ctpop, MVT::i16, 19}, |
775 | {Intrinsic::vp_ctpop, MVT::i32, 20}, |
776 | {Intrinsic::vp_ctpop, MVT::i64, 21}, |
777 | {Intrinsic::vp_ctlz, MVT::i8, 19}, |
778 | {Intrinsic::vp_ctlz, MVT::i16, 28}, |
779 | {Intrinsic::vp_ctlz, MVT::i32, 31}, |
780 | {Intrinsic::vp_ctlz, MVT::i64, 35}, |
781 | {Intrinsic::vp_cttz, MVT::i8, 16}, |
782 | {Intrinsic::vp_cttz, MVT::i16, 23}, |
783 | {Intrinsic::vp_cttz, MVT::i32, 24}, |
784 | {Intrinsic::vp_cttz, MVT::i64, 25}, |
785 | }; |
786 | |
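// Map a VP intrinsic ID to its SelectionDAG node number, or ISD::DELETED_NODE
// if the intrinsic has no corresponding VP SDNode.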
787 | static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) { |
788 | switch (ID) { |
789 | #define HELPER_MAP_VPID_TO_VPSD(VPID, VPSD) \ |
790 | case Intrinsic::VPID: \ |
791 | return ISD::VPSD; |
792 | #include "llvm/IR/VPIntrinsics.def" |
793 | #undef HELPER_MAP_VPID_TO_VPSD |
794 | } |
795 | return ISD::DELETED_NODE; |
796 | } |
797 | |
798 | InstructionCost |
799 | RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
800 | TTI::TargetCostKind CostKind) { |
801 | auto *RetTy = ICA.getReturnType(); |
802 | switch (ICA.getID()) { |
803 | case Intrinsic::ceil: |
804 | case Intrinsic::floor: |
805 | case Intrinsic::trunc: |
806 | case Intrinsic::rint: |
807 | case Intrinsic::lrint: |
808 | case Intrinsic::llrint: |
809 | case Intrinsic::round: |
810 | case Intrinsic::roundeven: { |
811 | // These all use the same code. |
    auto LT = getTypeLegalizationCost(RetTy);
    if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
814 | return LT.first * 8; |
815 | break; |
816 | } |
817 | case Intrinsic::umin: |
818 | case Intrinsic::umax: |
819 | case Intrinsic::smin: |
820 | case Intrinsic::smax: { |
    auto LT = getTypeLegalizationCost(RetTy);
822 | if (LT.second.isScalarInteger() && ST->hasStdExtZbb()) |
823 | return LT.first; |
824 | |
825 | if (ST->hasVInstructions() && LT.second.isVector()) { |
826 | unsigned Op; |
827 | switch (ICA.getID()) { |
828 | case Intrinsic::umin: |
829 | Op = RISCV::VMINU_VV; |
830 | break; |
831 | case Intrinsic::umax: |
832 | Op = RISCV::VMAXU_VV; |
833 | break; |
834 | case Intrinsic::smin: |
835 | Op = RISCV::VMIN_VV; |
836 | break; |
837 | case Intrinsic::smax: |
838 | Op = RISCV::VMAX_VV; |
839 | break; |
840 | } |
      return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
842 | } |
843 | break; |
844 | } |
845 | case Intrinsic::sadd_sat: |
846 | case Intrinsic::ssub_sat: |
847 | case Intrinsic::uadd_sat: |
848 | case Intrinsic::usub_sat: |
849 | case Intrinsic::fabs: |
850 | case Intrinsic::sqrt: { |
    auto LT = getTypeLegalizationCost(RetTy);
852 | if (ST->hasVInstructions() && LT.second.isVector()) |
853 | return LT.first; |
854 | break; |
855 | } |
856 | case Intrinsic::ctpop: { |
    auto LT = getTypeLegalizationCost(RetTy);
858 | if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) |
859 | return LT.first; |
860 | break; |
861 | } |
862 | case Intrinsic::abs: { |
    auto LT = getTypeLegalizationCost(RetTy);
864 | if (ST->hasVInstructions() && LT.second.isVector()) { |
865 | // vrsub.vi v10, v8, 0 |
866 | // vmax.vv v8, v8, v10 |
867 | return LT.first * 2; |
868 | } |
869 | break; |
870 | } |
871 | case Intrinsic::get_active_lane_mask: { |
872 | if (ST->hasVInstructions()) { |
      Type *ExpRetTy = VectorType::get(
          ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
      auto LT = getTypeLegalizationCost(ExpRetTy);
876 | |
877 | // vid.v v8 // considered hoisted |
878 | // vsaddu.vx v8, v8, a0 |
879 | // vmsltu.vx v0, v8, a1 |
880 | return LT.first * |
881 | getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX}, |
882 | LT.second, CostKind); |
883 | } |
884 | break; |
885 | } |
  // TODO: add more intrinsics
887 | case Intrinsic::experimental_stepvector: { |
    auto LT = getTypeLegalizationCost(RetTy);
889 | // Legalisation of illegal types involves an `index' instruction plus |
890 | // (LT.first - 1) vector adds. |
891 | if (ST->hasVInstructions()) |
892 | return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) + |
893 | (LT.first - 1) * |
894 | getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind); |
895 | return 1 + (LT.first - 1); |
896 | } |
897 | case Intrinsic::vp_rint: { |
898 | // RISC-V target uses at least 5 instructions to lower rounding intrinsics. |
899 | unsigned Cost = 5; |
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
902 | return Cost * LT.first; |
903 | break; |
904 | } |
905 | case Intrinsic::vp_nearbyint: { |
    // Costs one more fflags read and one more fflags write than vp_rint.
    unsigned Cost = 7;
    auto LT = getTypeLegalizationCost(RetTy);
    if (TLI->isOperationCustom(ISD::VP_FRINT, LT.second))
910 | return Cost * LT.first; |
911 | break; |
912 | } |
913 | case Intrinsic::vp_ceil: |
914 | case Intrinsic::vp_floor: |
915 | case Intrinsic::vp_round: |
916 | case Intrinsic::vp_roundeven: |
917 | case Intrinsic::vp_roundtozero: { |
918 | // Rounding with static rounding mode needs two more instructions to |
919 | // swap/write FRM than vp_rint. |
920 | unsigned Cost = 7; |
    auto LT = getTypeLegalizationCost(RetTy);
    unsigned VPISD = getISDForVPIntrinsicID(ICA.getID());
    if (TLI->isOperationCustom(VPISD, LT.second))
924 | return Cost * LT.first; |
925 | break; |
926 | } |
927 | } |
928 | |
929 | if (ST->hasVInstructions() && RetTy->isVectorTy()) { |
    if (auto LT = getTypeLegalizationCost(RetTy);
931 | LT.second.isVector()) { |
932 | MVT EltTy = LT.second.getVectorElementType(); |
933 | if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable, |
934 | ICA.getID(), EltTy)) |
935 | return LT.first * Entry->Cost; |
936 | } |
937 | } |
938 | |
939 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
940 | } |
941 | |
942 | InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
943 | Type *Src, |
944 | TTI::CastContextHint CCH, |
945 | TTI::TargetCostKind CostKind, |
946 | const Instruction *I) { |
  bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
948 | if (!IsVectorType) |
949 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
950 | |
  bool IsTypeLegal = isTypeLegal(Src) && isTypeLegal(Dst) &&
952 | (Src->getScalarSizeInBits() <= ST->getELen()) && |
953 | (Dst->getScalarSizeInBits() <= ST->getELen()); |
954 | |
955 | // FIXME: Need to compute legalizing cost for illegal types. |
956 | if (!IsTypeLegal) |
957 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
958 | |
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
  std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
961 | |
962 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
  assert(ISD && "Invalid opcode");
964 | |
  int PowDiff = (int)Log2_32(Dst->getScalarSizeInBits()) -
                (int)Log2_32(Src->getScalarSizeInBits());
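  // PowDiff is the log2 of the ratio between destination and source element
  // sizes, e.g. an i8 -> i32 extend has PowDiff == 2 and maps to
  // vsext.vf4/vzext.vf4 below.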
967 | switch (ISD) { |
968 | case ISD::SIGN_EXTEND: |
969 | case ISD::ZERO_EXTEND: { |
970 | const unsigned SrcEltSize = Src->getScalarSizeInBits(); |
971 | if (SrcEltSize == 1) { |
972 | // We do not use vsext/vzext to extend from mask vector. |
973 | // Instead we use the following instructions to extend from mask vector: |
974 | // vmv.v.i v8, 0 |
975 | // vmerge.vim v8, v8, -1, v0 |
976 | return getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM}, |
977 | DstLT.second, CostKind); |
978 | } |
979 | if ((PowDiff < 1) || (PowDiff > 3)) |
980 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
981 | unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8}; |
982 | unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8}; |
983 | unsigned Op = |
984 | (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1]; |
    return getRISCVInstructionCost(Op, DstLT.second, CostKind);
986 | } |
987 | case ISD::TRUNCATE: |
988 | if (Dst->getScalarSizeInBits() == 1) { |
      // We do not use several vncvt instructions to truncate to a mask
      // vector, so we cannot use PowDiff to calculate the cost.
      // Instead we use the following instructions to truncate to mask vector:
992 | // vand.vi v8, v8, 1 |
993 | // vmsne.vi v0, v8, 0 |
994 | return getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI}, |
995 | SrcLT.second, CostKind); |
996 | } |
997 | [[fallthrough]]; |
998 | case ISD::FP_EXTEND: |
999 | case ISD::FP_ROUND: { |
1000 | // Counts of narrow/widen instructions. |
1001 | unsigned SrcEltSize = Src->getScalarSizeInBits(); |
1002 | unsigned DstEltSize = Dst->getScalarSizeInBits(); |
1003 | |
1004 | unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI |
1005 | : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V |
1006 | : RISCV::VFNCVT_F_F_W; |
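    // One narrowing/widening instruction is needed per power-of-two step,
    // e.g. an f16 -> f64 extend takes two vfwcvt.f.f.v steps
    // (f16 -> f32 -> f64).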
1007 | InstructionCost Cost = 0; |
1008 | for (; SrcEltSize != DstEltSize;) { |
      MVT ElementMVT = (ISD == ISD::TRUNCATE)
                           ? MVT::getIntegerVT(DstEltSize)
                           : MVT::getFloatingPointVT(DstEltSize);
      MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
      DstEltSize =
          (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
      Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1016 | } |
1017 | return Cost; |
1018 | } |
1019 | case ISD::FP_TO_SINT: |
1020 | case ISD::FP_TO_UINT: |
1021 | case ISD::SINT_TO_FP: |
1022 | case ISD::UINT_TO_FP: |
1023 | if (Src->getScalarSizeInBits() == 1 || Dst->getScalarSizeInBits() == 1) { |
      // The cost of converting from or to a mask vector differs from the
      // other cases, so we cannot use PowDiff to calculate it.
1026 | // For mask vector to fp, we should use the following instructions: |
1027 | // vmv.v.i v8, 0 |
1028 | // vmerge.vim v8, v8, -1, v0 |
1029 | // vfcvt.f.x.v v8, v8 |
1030 | |
1031 | // And for fp vector to mask, we use: |
1032 | // vfncvt.rtz.x.f.w v9, v8 |
1033 | // vand.vi v8, v9, 1 |
1034 | // vmsne.vi v0, v8, 0 |
1035 | return 3; |
1036 | } |
    if (std::abs(PowDiff) <= 1)
1038 | return 1; |
    // The backend can lower (v[sz]ext i8 to double) to vfcvt(v[sz]ext.f8 i8),
    // so it only needs two conversions.
1041 | if (Src->isIntOrIntVectorTy()) |
1042 | return 2; |
1043 | // Counts of narrow/widen instructions. |
    return std::abs(PowDiff);
1045 | } |
1046 | return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
1047 | } |
1048 | |
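// Return an estimate of the active vector length (VL) for operations on
// Ty: exact for fixed-length vectors, and VLMAX computed from the tuning
// vscale for scalable vectors.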
1049 | unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) { |
  if (isa<ScalableVectorType>(Ty)) {
    const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
    const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
    const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
    return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
  }
  return cast<FixedVectorType>(Ty)->getNumElements();
1057 | } |
1058 | |
1059 | InstructionCost |
1060 | RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
1061 | FastMathFlags FMF, |
1062 | TTI::TargetCostKind CostKind) { |
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1064 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
1065 | |
1066 | // Skip if scalar size of Ty is bigger than ELEN. |
1067 | if (Ty->getScalarSizeInBits() > ST->getELen()) |
1068 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
1069 | |
1070 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
  if (Ty->getElementType()->isIntegerTy(1)) {
1072 | // SelectionDAGBuilder does following transforms: |
1073 | // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>) |
1074 | // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>) |
1075 | if (IID == Intrinsic::umax || IID == Intrinsic::smin) |
      return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
    else
      return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1079 | } |
1080 | |
1081 | if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) { |
1082 | SmallVector<unsigned, 3> Opcodes; |
    InstructionCost ExtraCost = 0;
1084 | switch (IID) { |
1085 | case Intrinsic::maximum: |
1086 | if (FMF.noNaNs()) { |
1087 | Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; |
1088 | } else { |
1089 | Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS, |
1090 | RISCV::VFMV_F_S}; |
1091 | // Cost of Canonical Nan + branch |
1092 | // lui a0, 523264 |
1093 | // fmv.w.x fa0, a0 |
1094 | Type *DstTy = Ty->getScalarType(); |
1095 | const unsigned EltTyBits = DstTy->getScalarSizeInBits(); |
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
1101 | } |
1102 | break; |
1103 | |
1104 | case Intrinsic::minimum: |
1105 | if (FMF.noNaNs()) { |
1106 | Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; |
1107 | } else { |
1108 | Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS, |
1109 | RISCV::VFMV_F_S}; |
1110 | // Cost of Canonical Nan + branch |
1111 | // lui a0, 523264 |
1112 | // fmv.w.x fa0, a0 |
1113 | Type *DstTy = Ty->getScalarType(); |
        const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
        Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
        ExtraCost = 1 +
                    getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
                                     TTI::CastContextHint::None, CostKind) +
                    getCFInstrCost(Instruction::Br, CostKind);
1120 | } |
1121 | break; |
1122 | } |
    return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1124 | } |
1125 | |
  // IR reductions are composed of two vmv and one RVV reduction instruction.
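  // e.g. vector_reduce_smax is costed as roughly:
  //   vmv.s.x    v9, a0        (move the start value into the accumulator)
  //   vredmax.vs v9, v8, v9
  //   vmv.x.s    a0, v9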
1127 | unsigned SplitOp; |
1128 | SmallVector<unsigned, 3> Opcodes; |
1129 | switch (IID) { |
1130 | default: |
    llvm_unreachable("Unsupported intrinsic");
1132 | case Intrinsic::smax: |
1133 | SplitOp = RISCV::VMAX_VV; |
1134 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAX_VS, RISCV::VMV_X_S}; |
1135 | break; |
1136 | case Intrinsic::smin: |
1137 | SplitOp = RISCV::VMIN_VV; |
1138 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMIN_VS, RISCV::VMV_X_S}; |
1139 | break; |
1140 | case Intrinsic::umax: |
1141 | SplitOp = RISCV::VMAXU_VV; |
1142 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMAXU_VS, RISCV::VMV_X_S}; |
1143 | break; |
1144 | case Intrinsic::umin: |
1145 | SplitOp = RISCV::VMINU_VV; |
1146 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDMINU_VS, RISCV::VMV_X_S}; |
1147 | break; |
1148 | case Intrinsic::maxnum: |
1149 | SplitOp = RISCV::VFMAX_VV; |
1150 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMAX_VS, RISCV::VFMV_F_S}; |
1151 | break; |
1152 | case Intrinsic::minnum: |
1153 | SplitOp = RISCV::VFMIN_VV; |
1154 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDMIN_VS, RISCV::VFMV_F_S}; |
1155 | break; |
1156 | } |
1157 | // Add a cost for data larger than LMUL8 |
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1163 | } |
1164 | |
1165 | InstructionCost |
1166 | RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, |
1167 | std::optional<FastMathFlags> FMF, |
1168 | TTI::TargetCostKind CostKind) { |
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1170 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1171 | |
1172 | // Skip if scalar size of Ty is bigger than ELEN. |
1173 | if (Ty->getScalarSizeInBits() > ST->getELen()) |
1174 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1175 | |
1176 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
  assert(ISD && "Invalid opcode");
1178 | |
1179 | if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND && |
1180 | ISD != ISD::FADD) |
1181 | return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); |
1182 | |
1183 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
1184 | SmallVector<unsigned, 3> Opcodes; |
1185 | Type *ElementTy = Ty->getElementType(); |
  if (ElementTy->isIntegerTy(1)) {
1187 | if (ISD == ISD::AND) { |
1188 | // Example sequences: |
1189 | // vsetvli a0, zero, e8, mf8, ta, ma |
1190 | // vmnot.m v8, v0 |
1191 | // vcpop.m a0, v8 |
1192 | // seqz a0, a0 |
1193 | Opcodes = {RISCV::VMNAND_MM, RISCV::VCPOP_M}; |
1194 | return (LT.first - 1) + |
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_EQ, CostKind);
1198 | } else { |
1199 | // Example sequences: |
1200 | // vsetvli a0, zero, e8, mf8, ta, ma |
1201 | // vcpop.m a0, v0 |
1202 | // snez a0, a0 |
1203 | Opcodes = {RISCV::VCPOP_M}; |
1204 | return (LT.first - 1) + |
             getRISCVInstructionCost(Opcodes, LT.second, CostKind) +
             getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
                                CmpInst::ICMP_NE, CostKind);
1208 | } |
1209 | } |
1210 | |
  // IR reductions are composed of two vmv and one RVV reduction instruction.
1212 | if (TTI::requiresOrderedReduction(FMF)) { |
1213 | Opcodes.push_back(RISCV::VFMV_S_F); |
1214 | for (unsigned i = 0; i < LT.first.getValue(); i++) |
1215 | Opcodes.push_back(RISCV::VFREDOSUM_VS); |
1216 | Opcodes.push_back(RISCV::VFMV_F_S); |
    return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1218 | } |
1219 | unsigned SplitOp; |
1220 | switch (ISD) { |
1221 | case ISD::ADD: |
1222 | SplitOp = RISCV::VADD_VV; |
1223 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S}; |
1224 | break; |
1225 | case ISD::OR: |
1226 | SplitOp = RISCV::VOR_VV; |
1227 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDOR_VS, RISCV::VMV_X_S}; |
1228 | break; |
1229 | case ISD::XOR: |
1230 | SplitOp = RISCV::VXOR_VV; |
1231 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S}; |
1232 | break; |
1233 | case ISD::AND: |
1234 | SplitOp = RISCV::VAND_VV; |
1235 | Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S}; |
1236 | break; |
1237 | case ISD::FADD: |
1238 | SplitOp = RISCV::VFADD_VV; |
1239 | Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S}; |
1240 | break; |
1241 | } |
1242 | // Add a cost for data larger than LMUL8 |
  InstructionCost SplitCost =
      (LT.first > 1) ? (LT.first - 1) *
                           getRISCVInstructionCost(SplitOp, LT.second, CostKind)
                     : 0;
  return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1248 | } |
1249 | |
1250 | InstructionCost RISCVTTIImpl::getExtendedReductionCost( |
1251 | unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, |
1252 | FastMathFlags FMF, TTI::TargetCostKind CostKind) { |
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  // Skip if scalar size of ResTy is bigger than ELEN.
  if (ResTy->getScalarSizeInBits() > ST->getELen())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

  if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
    return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
                                           FMF, CostKind);

  return (LT.first - 1) +
         getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1274 | } |
1275 | |
1276 | InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty, |
1277 | TTI::OperandValueInfo OpInfo, |
1278 | TTI::TargetCostKind CostKind) { |
  assert(OpInfo.isConstant() && "non constant operand?");
  if (!isa<VectorType>(Ty))
1281 | // FIXME: We need to account for immediate materialization here, but doing |
1282 | // a decent job requires more knowledge about the immediate than we |
1283 | // currently have here. |
1284 | return 0; |
1285 | |
1286 | if (OpInfo.isUniform()) |
    // vmv.v.i, vmv.v.x, or vfmv.v.f
1288 | // We ignore the cost of the scalar constant materialization to be consistent |
1289 | // with how we treat scalar constants themselves just above. |
1290 | return 1; |
1291 | |
1292 | return getConstantPoolLoadCost(Ty, CostKind); |
1293 | } |
1294 | |
1295 | |
1296 | InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, |
1297 | MaybeAlign Alignment, |
1298 | unsigned AddressSpace, |
1299 | TTI::TargetCostKind CostKind, |
1300 | TTI::OperandValueInfo OpInfo, |
1301 | const Instruction *I) { |
  EVT VT = TLI->getValueType(DL, Src, true);
1303 | // Type legalization can't handle structs |
1304 | if (VT == MVT::Other) |
1305 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1306 | CostKind, OpInfo, I); |
1307 | |
1308 | InstructionCost Cost = 0; |
1309 | if (Opcode == Instruction::Store && OpInfo.isConstant()) |
    Cost += getStoreImmCost(Src, OpInfo, CostKind);
1311 | InstructionCost BaseCost = |
1312 | BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, |
1313 | CostKind, OpInfo, I); |
  // Assume memory ops cost scale with the number of vector registers
  // possibly accessed by the instruction. Note that BasicTTI already
  // handles the LT.first term for us.
  if (std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
      LT.second.isVector() && CostKind != TTI::TCK_CodeSize)
    BaseCost *= TLI->getLMULCost(LT.second);
  return Cost + BaseCost;
}
1323 | |
1324 | InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
1325 | Type *CondTy, |
1326 | CmpInst::Predicate VecPred, |
1327 | TTI::TargetCostKind CostKind, |
1328 | const Instruction *I) { |
1329 | if (CostKind != TTI::TCK_RecipThroughput) |
1330 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1331 | I); |
1332 | |
  if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
1334 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1335 | I); |
1336 | |
1337 | // Skip if scalar size of ValTy is bigger than ELEN. |
1338 | if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen()) |
1339 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1340 | I); |
1341 | |
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1343 | if (Opcode == Instruction::Select && ValTy->isVectorTy()) { |
1344 | if (CondTy->isVectorTy()) { |
1345 | if (ValTy->getScalarSizeInBits() == 1) { |
1346 | // vmandn.mm v8, v8, v9 |
1347 | // vmand.mm v9, v0, v9 |
1348 | // vmor.mm v0, v9, v8 |
1349 | return LT.first * |
1350 | getRISCVInstructionCost( |
1351 | {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM}, |
1352 | LT.second, CostKind); |
1353 | } |
1354 | // vselect and max/min are supported natively. |
1355 | return LT.first * |
1356 | getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind); |
1357 | } |
1358 | |
1359 | if (ValTy->getScalarSizeInBits() == 1) { |
1360 | // vmv.v.x v9, a0 |
1361 | // vmsne.vi v9, v9, 0 |
1362 | // vmandn.mm v8, v8, v9 |
1363 | // vmand.mm v9, v0, v9 |
1364 | // vmor.mm v0, v9, v8 |
1365 | MVT InterimVT = LT.second.changeVectorElementType(MVT::i8); |
1366 | return LT.first * |
1367 | getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, |
1368 | InterimVT, CostKind) + |
1369 | LT.first * getRISCVInstructionCost( |
1370 | {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM}, |
1371 | LT.second, CostKind); |
1372 | } |
1373 | |
1374 | // vmv.v.x v10, a0 |
1375 | // vmsne.vi v0, v10, 0 |
1376 | // vmerge.vvm v8, v9, v8, v0 |
1377 | return LT.first * getRISCVInstructionCost( |
1378 | {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM}, |
1379 | LT.second, CostKind); |
1380 | } |
1381 | |
1382 | if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() && |
      CmpInst::isIntPredicate(VecPred)) {
1384 | // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE |
1385 | // provided they incur the same cost across all implementations |
1386 | return LT.first * |
1387 | getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind); |
1388 | } |
1389 | |
1390 | if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() && |
      CmpInst::isFPPredicate(VecPred)) {
1392 | |
1393 | // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask |
1394 | if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE)) |
1395 | return getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind); |
1396 | |
    // If we do not support the input floating point vector type, fall back to
    // the base implementation, which computes:
    //   ScalarizeCost + Num * Cost for fixed vectors,
    //   InvalidCost for scalable vectors.
1401 | if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) || |
1402 | (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) || |
1403 | (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64())) |
1404 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
1405 | I); |
1406 | |
1407 | // Assuming vector fp compare and mask instructions are all the same cost |
1408 | // until a need arises to differentiate them. |
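    // For example, under this assumption an FCMP_ONE on an m1 type costs 3
    // (vmflt.vv + vmflt.vv + vmor.mm), while a single-instruction predicate
    // such as FCMP_OLT costs 1.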
1409 | switch (VecPred) { |
1410 | case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm |
1411 | case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm |
1412 | case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm |
1413 | case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm |
1414 | return LT.first * getRISCVInstructionCost( |
1415 | {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM}, |
1416 | LT.second, CostKind); |
1417 | |
1418 | case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m |
1419 | case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m |
1420 | case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m |
1421 | case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m |
1422 | return LT.first * |
1423 | getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM}, |
1424 | LT.second, CostKind); |
1425 | |
1426 | case CmpInst::FCMP_OEQ: // vmfeq.vv |
1427 | case CmpInst::FCMP_OGT: // vmflt.vv |
1428 | case CmpInst::FCMP_OGE: // vmfle.vv |
1429 | case CmpInst::FCMP_OLT: // vmflt.vv |
1430 | case CmpInst::FCMP_OLE: // vmfle.vv |
1431 | case CmpInst::FCMP_UNE: // vmfne.vv |
1432 | return LT.first * |
1433 | getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind); |
1434 | default: |
1435 | break; |
1436 | } |
1437 | } |
1438 | |
1439 | // TODO: Add cost for scalar type. |
1440 | |
1441 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
1442 | } |
1443 | |
1444 | InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode, |
1445 | TTI::TargetCostKind CostKind, |
1446 | const Instruction *I) { |
1447 | if (CostKind != TTI::TCK_RecipThroughput) |
1448 | return Opcode == Instruction::PHI ? 0 : 1; |
1449 | // Branches are assumed to be predicted. |
1450 | return 0; |
1451 | } |
1452 | |
1453 | InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
1454 | TTI::TargetCostKind CostKind, |
1455 | unsigned Index, Value *Op0, |
1456 | Value *Op1) { |
  assert(Val->isVectorTy() && "This must be a vector type");
1458 | |
1459 | if (Opcode != Instruction::ExtractElement && |
1460 | Opcode != Instruction::InsertElement) |
1461 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1462 | |
1463 | // Legalize the type. |
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
1465 | |
1466 | // This type is legalized to a scalar type. |
1467 | if (!LT.second.isVector()) { |
1468 | auto *FixedVecTy = cast<FixedVectorType>(Val); |
1469 | // If Index is a known constant, cost is zero. |
1470 | if (Index != -1U) |
1471 | return 0; |
1472 | // Extract/InsertElement with non-constant index is very costly when |
1473 | // scalarized; estimate cost of loads/stores sequence via the stack: |
1474 | // ExtractElement cost: store vector to stack, load scalar; |
1475 | // InsertElement cost: store vector to stack, store scalar, load vector. |
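    // For example, for a <4 x i32> with an unknown index, an extract is
    // modeled as 4 element stores plus 1 scalar load, and an insert as
    // 4 element stores plus 1 scalar store plus 4 element loads.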
1476 | Type *ElemTy = FixedVecTy->getElementType(); |
1477 | auto NumElems = FixedVecTy->getNumElements(); |
    auto Align = DL.getPrefTypeAlign(ElemTy);
    InstructionCost LoadCost =
        getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
    InstructionCost StoreCost =
        getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
1483 | return Opcode == Instruction::ExtractElement |
1484 | ? StoreCost * NumElems + LoadCost |
1485 | : (StoreCost + LoadCost) * NumElems + StoreCost; |
1486 | } |
1487 | |
1488 | // For unsupported scalable vector. |
1489 | if (LT.second.isScalableVector() && !LT.first.isValid()) |
1490 | return LT.first; |
1491 | |
  if (!isTypeLegal(Val))
1493 | return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1); |
1494 | |
1495 | // Mask vector extract/insert is expanded via e8. |
1496 | if (Val->getScalarSizeInBits() == 1) { |
    VectorType *WideTy =
        VectorType::get(IntegerType::get(Val->getContext(), 8),
                        cast<VectorType>(Val)->getElementCount());
    if (Opcode == Instruction::ExtractElement) {
      InstructionCost ExtendCost
        = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                           TTI::CastContextHint::None, CostKind);
      InstructionCost ExtractCost
        = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
      return ExtendCost + ExtractCost;
    }
    InstructionCost ExtendCost
      = getCastInstrCost(Instruction::ZExt, WideTy, Val,
                         TTI::CastContextHint::None, CostKind);
    InstructionCost InsertCost
      = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
    InstructionCost TruncCost
      = getCastInstrCost(Instruction::Trunc, Val, WideTy,
                         TTI::CastContextHint::None, CostKind);
    return ExtendCost + InsertCost + TruncCost;
1517 | } |
1518 | |
1519 | |
  // In RVV, we could use vslidedown + vmv.x.s to extract an element from a
  // vector, and vslideup + vmv.s.x to insert an element into a vector.
  unsigned BaseCost = 1;
  // For insertelement with a variable index, an extra addi computing
  // Index + 1 is needed to set up the vslideup, hence the cost of 2.
  unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
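  // As a rough example with these numbers: extracting element 5 (a constant
  // index) costs 2 (vslidedown.vi + vmv.x.s), while extracting element 0
  // costs just 1, since the slide is skipped.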
1525 | |
1526 | if (Index != -1U) { |
1527 | // The type may be split. For fixed-width vectors we can normalize the |
1528 | // index to the new type. |
1529 | if (LT.second.isFixedLengthVector()) { |
1530 | unsigned Width = LT.second.getVectorNumElements(); |
1531 | Index = Index % Width; |
1532 | } |
1533 | |
1534 | // We could extract/insert the first element without vslidedown/vslideup. |
1535 | if (Index == 0) |
1536 | SlideCost = 0; |
1537 | else if (Opcode == Instruction::InsertElement) |
1538 | SlideCost = 1; // With a constant index, we do not need to use addi. |
1539 | } |
1540 | |
  // Extracting an i64 element on a target with XLEN=32 needs more
  // instructions.
1542 | if (Val->getScalarType()->isIntegerTy() && |
1543 | ST->getXLen() < Val->getScalarSizeInBits()) { |
1544 | // For extractelement, we need the following instructions: |
    //   vsetivli zero, 1, e64, m1, ta, mu (not counted)
1546 | // vslidedown.vx v8, v8, a0 |
1547 | // vmv.x.s a0, v8 |
1548 | // li a1, 32 |
1549 | // vsrl.vx v8, v8, a1 |
1550 | // vmv.x.s a1, v8 |
1551 | |
1552 | // For insertelement, we need the following instructions: |
    //   vsetivli zero, 2, e32, m4, ta, mu (not counted)
1554 | // vmv.v.i v12, 0 |
1555 | // vslide1up.vx v16, v12, a1 |
1556 | // vslide1up.vx v12, v16, a0 |
1557 | // addi a0, a2, 1 |
    //   vsetvli zero, a0, e64, m4, tu, mu (not counted)
1559 | // vslideup.vx v8, v12, a2 |
1560 | |
1561 | // TODO: should we count these special vsetvlis? |
1562 | BaseCost = Opcode == Instruction::InsertElement ? 3 : 4; |
1563 | } |
1564 | return BaseCost + SlideCost; |
1565 | } |
1566 | |
1567 | InstructionCost RISCVTTIImpl::getArithmeticInstrCost( |
1568 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
1569 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
1570 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
1571 | |
1572 | // TODO: Handle more cost kinds. |
1573 | if (CostKind != TTI::TCK_RecipThroughput) |
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);
1576 | |
  if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);
1580 | |
1581 | // Skip if scalar size of Ty is bigger than ELEN. |
  if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);
1585 | |
1586 | // Legalize the type. |
1587 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
1588 | |
1589 | // TODO: Handle scalar type. |
1590 | if (!LT.second.isVector()) |
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

1595 | auto getConstantMatCost = |
1596 | [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost { |
1597 | if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand)) |
1598 | // Two sub-cases: |
1599 | // * Has a 5 bit immediate operand which can be splatted. |
      // * Has a larger immediate which must be materialized in a scalar
      //   register.
1601 | // We return 0 for both as we currently ignore the cost of materializing |
1602 | // scalar constants in GPRs. |
1603 | return 0; |
1604 | |
1605 | return getConstantPoolLoadCost(Ty, CostKind); |
1606 | }; |
1607 | |
1608 | // Add the cost of materializing any constant vectors required. |
1609 | InstructionCost ConstantMatCost = 0; |
1610 | if (Op1Info.isConstant()) |
1611 | ConstantMatCost += getConstantMatCost(0, Op1Info); |
1612 | if (Op2Info.isConstant()) |
1613 | ConstantMatCost += getConstantMatCost(1, Op2Info); |
1614 | |
1615 | switch (TLI->InstructionOpcodeToISD(Opcode)) { |
1616 | case ISD::ADD: |
1617 | case ISD::SUB: |
1618 | case ISD::AND: |
1619 | case ISD::OR: |
1620 | case ISD::XOR: |
1621 | case ISD::SHL: |
1622 | case ISD::SRL: |
1623 | case ISD::SRA: |
1624 | case ISD::MUL: |
1625 | case ISD::MULHS: |
1626 | case ISD::MULHU: |
1627 | case ISD::FADD: |
1628 | case ISD::FSUB: |
1629 | case ISD::FMUL: |
1630 | case ISD::FNEG: { |
    return ConstantMatCost + TLI->getLMULCost(LT.second) * LT.first;
1632 | } |
1633 | default: |
    return ConstantMatCost +
           BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);
1637 | } |
1638 | } |
1639 | |
1640 | // TODO: Deduplicate from TargetTransformInfoImplCRTPBase. |
1641 | InstructionCost RISCVTTIImpl::getPointersChainCost( |
1642 | ArrayRef<const Value *> Ptrs, const Value *Base, |
1643 | const TTI::PointersChainInfo &Info, Type *AccessTy, |
1644 | TTI::TargetCostKind CostKind) { |
1645 | InstructionCost Cost = TTI::TCC_Free; |
  // In the basic model we only take GEP instructions into account (although a
  // pointer here may also come from an alloca, a plain value, a constant or
  // constant expression, a PHI, a bitcast, or anything else usable as a
  // pointer). Typically, if Base is not a GEP instruction and all the
  // pointers are relative to the same base address, the rest are either GEP
  // instructions, PHIs, bitcasts, or constants. When the base is shared, we
  // simply cost each non-Base GEP as an ADD operation if any of its indices
  // is non-constant.
  // If there are no known dependencies between the pointers, the cost is
  // calculated as the sum of the costs of the GEP instructions.
  for (auto [I, V] : enumerate(Ptrs)) {
    const auto *GEP = dyn_cast<GetElementPtrInst>(V);
1658 | if (!GEP) |
1659 | continue; |
1660 | if (Info.isSameBase() && V != Base) { |
1661 | if (GEP->hasAllConstantIndices()) |
1662 | continue; |
1663 | // If the chain is unit-stride and BaseReg + stride*i is a legal |
1664 | // addressing mode, then presume the base GEP is sitting around in a |
1665 | // register somewhere and check if we can fold the offset relative to |
1666 | // it. |
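      // For example, in a unit-stride chain of i32 accesses off the same
      // base, the GEP for element i becomes a reg+imm offset of 4*i on the
      // memory instruction and is free whenever that offset is legal.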
      unsigned Stride = DL.getTypeStoreSize(AccessTy);
      if (Info.isUnitStride() &&
          isLegalAddressingMode(AccessTy,
                                /* BaseGV */ nullptr,
                                /* BaseOffset */ Stride * I,
                                /* HasBaseReg */ true,
                                /* Scale */ 0,
                                GEP->getType()->getPointerAddressSpace()))
        continue;
      Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     {TTI::OK_AnyValue, TTI::OP_None},
                                     std::nullopt);
1680 | } else { |
1681 | SmallVector<const Value *> Indices(GEP->indices()); |
      Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
                         Indices, AccessTy, CostKind);
1684 | } |
1685 | } |
1686 | return Cost; |
1687 | } |
1688 | |
void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1690 | TTI::UnrollingPreferences &UP, |
1691 | OptimizationRemarkEmitter *ORE) { |
  // TODO: More tuning on benchmarks and metrics with changes as needed
  // would apply to all settings below to improve performance.

1696 | if (ST->enableDefaultUnroll()) |
1697 | return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); |
1698 | |
  // Enable upper-bound unrolling universally, independent of the conditions
  // below.
1701 | UP.UpperBound = true; |
1702 | |
1703 | // Disable loop unrolling for Oz and Os. |
1704 | UP.OptSizeThreshold = 0; |
1705 | UP.PartialOptSizeThreshold = 0; |
1706 | if (L->getHeader()->getParent()->hasOptSize()) |
1707 | return; |
1708 | |
1709 | SmallVector<BasicBlock *, 4> ExitingBlocks; |
1710 | L->getExitingBlocks(ExitingBlocks); |
1711 | LLVM_DEBUG(dbgs() << "Loop has:\n" |
1712 | << "Blocks: " << L->getNumBlocks() << "\n" |
1713 | << "Exit blocks: " << ExitingBlocks.size() << "\n" ); |
1714 | |
  // Allow at most one exit in addition to the latch. This acts as an early
  // exit check, mirroring the profitability calculation of the runtime
  // unroller.
1717 | if (ExitingBlocks.size() > 2) |
1718 | return; |
1719 | |
1720 | // Limit the CFG of the loop body for targets with a branch predictor. |
1721 | // Allowing 4 blocks permits if-then-else diamonds in the body. |
1722 | if (L->getNumBlocks() > 4) |
1723 | return; |
1724 | |
1725 | // Don't unroll vectorized loops, including the remainder loop |
  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
1727 | return; |
1728 | |
1729 | // Scan the loop: don't unroll loops with calls as this could prevent |
1730 | // inlining. |
1731 | InstructionCost Cost = 0; |
1732 | for (auto *BB : L->getBlocks()) { |
1733 | for (auto &I : *BB) { |
1734 | // Initial setting - Don't unroll loops containing vectorized |
1735 | // instructions. |
1736 | if (I.getType()->isVectorTy()) |
1737 | return; |
1738 | |
      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
1741 | if (!isLoweredToCall(F)) |
1742 | continue; |
1743 | } |
1744 | return; |
1745 | } |
1746 | |
1747 | SmallVector<const Value *> Operands(I.operand_values()); |
      Cost += getInstructionCost(&I, Operands,
                                 TargetTransformInfo::TCK_SizeAndLatency);
1750 | } |
1751 | } |
1752 | |
1753 | LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n" ); |
1754 | |
1755 | UP.Partial = true; |
1756 | UP.Runtime = true; |
1757 | UP.UnrollRemainder = true; |
1758 | UP.UnrollAndJam = true; |
1759 | UP.UnrollAndJamInnerLoopThreshold = 60; |
1760 | |
  // Forcing the unrolling of small loops can be very useful because of the
  // branch-taken cost of the backedge.
1763 | if (Cost < 12) |
1764 | UP.Force = true; |
1765 | } |
1766 | |
1767 | void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
1768 | TTI::PeelingPreferences &PP) { |
1769 | BaseT::getPeelingPreferences(L, SE, PP); |
1770 | } |
1771 | |
1772 | unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) { |
1773 | TypeSize Size = DL.getTypeSizeInBits(Ty); |
1774 | if (Ty->isVectorTy()) { |
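    // For example, with RVVBitsPerBlock = 64, a <vscale x 8 x i32> value has
    // a known-minimum size of 256 bits and is counted as 256/64 = 4 vector
    // register blocks (an LMUL=4 register group).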
1775 | if (Size.isScalable() && ST->hasVInstructions()) |
      return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
1777 | |
1778 | if (ST->useRVVForFixedLengthVectors()) |
      return divideCeil(Size, ST->getRealMinVLen());
1780 | } |
1781 | |
1782 | return BaseT::getRegUsageForType(Ty); |
1783 | } |
1784 | |
1785 | unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
1786 | if (SLPMaxVF.getNumOccurrences()) |
1787 | return SLPMaxVF; |
1788 | |
  // Return how many elements can fit in getRegisterBitWidth. This is the
1790 | // same routine as used in LoopVectorizer. We should probably be |
1791 | // accounting for whether we actually have instructions with the right |
1792 | // lane type, but we don't have enough information to do that without |
1793 | // some additional plumbing which hasn't been justified yet. |
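  // For example, with a 128-bit fixed-width vector register and 32-bit
  // elements, the maximum VF reported here is 4.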
1794 | TypeSize RegWidth = |
      getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
1796 | // If no vector registers, or absurd element widths, disable |
1797 | // vectorization by returning 1. |
  return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
1799 | } |
1800 | |
1801 | bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
1802 | const TargetTransformInfo::LSRCost &C2) { |
  // RISC-V specific: give instruction count first priority.
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
                  C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
                  C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
1810 | } |
1811 | |
1812 | bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { |
  auto *VTy = dyn_cast<VectorType>(DataTy);
1814 | if (!VTy || VTy->isScalableTy()) |
1815 | return false; |
1816 | |
  if (!isLegalMaskedLoadStore(DataTy, Alignment))
1818 | return false; |
1819 | return true; |
1820 | } |
1821 | |
1822 | bool RISCVTTIImpl::areInlineCompatible(const Function *Caller, |
1823 | const Function *Callee) const { |
1824 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
1825 | |
1826 | const FeatureBitset &CallerBits = |
1827 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
1828 | const FeatureBitset &CalleeBits = |
1829 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
1830 | |
  // Inline a callee if its target features are a subset of the caller's
  // target features.
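  // For example, a callee built with +v can only be inlined into a caller
  // that also enables +v, whereas a callee without +v can be inlined into a
  // +v caller.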
1833 | return (CallerBits & CalleeBits) == CalleeBits; |
1834 | } |
1835 | |