//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;

#define DEBUG_TYPE "x86tti"

// Declare the pass initialization routine locally as target-specific passes
// don't have a target-wide initialization entry point, and so we rely on the
// pass constructor initialization.
namespace llvm {
void initializeX86TTIPass(PassRegistry &);
}

namespace {

class X86TTI final : public ImmutablePass, public TargetTransformInfo {
  const X86Subtarget *ST;
  const X86TargetLowering *TLI;

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the result needs to be inserted and/or extracted from vectors.
  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;

public:
  X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
    llvm_unreachable("This pass cannot be directly constructed");
  }

  X86TTI(const X86TargetMachine *TM)
      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
        TLI(TM->getSubtargetImpl()->getTargetLowering()) {
    initializeX86TTIPass(*PassRegistry::getPassRegistry());
  }

  void initializePass() override {
    pushTTIStack(this);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    TargetTransformInfo::getAnalysisUsage(AU);
  }

  /// Pass identification.
  static char ID;

  /// Provide necessary pointer adjustments for the two base classes.
  void *getAdjustedAnalysisPointer(const void *ID) override {
    if (ID == &TargetTransformInfo::ID)
      return (TargetTransformInfo*)this;
    return this;
  }

  /// \name Scalar TTI Implementations
  /// @{
  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(bool Vector) const override;
  unsigned getRegisterBitWidth(bool Vector) const override;
  unsigned getMaxInterleaveFactor() const override;
  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
                                  OperandValueKind, OperandValueProperties,
                                  OperandValueProperties) const override;
  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
                          int Index, Type *SubTp) const override;
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
                            Type *Src) const override;
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                              Type *CondTy) const override;
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
                              unsigned Index) const override;
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                           unsigned AddressSpace) const override;

  unsigned getAddressComputationCost(Type *PtrTy,
                                     bool IsComplex) const override;

  unsigned getReductionCost(unsigned Opcode, Type *Ty,
                            bool IsPairwiseForm) const override;

  unsigned getIntImmCost(int64_t) const;

  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;

  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;
  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                         Type *Ty) const override;

  /// @}
};

} // end anonymous namespace

INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
                   "X86 Target Transform Info", true, true, false)
char X86TTI::ID = 0;

ImmutablePass *
llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
  return new X86TTI(TM);
}


//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
}

unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
  if (Vector) {
    if (ST->hasAVX512()) return 512;
    if (ST->hasAVX()) return 256;
    if (ST->hasSSE1()) return 128;
    return 0;
  }

  if (ST->is64Bit())
    return 64;
  return 32;
}

unsigned X86TTI::getMaxInterleaveFactor() const {
  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

unsigned X86TTI::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, OperandValueKind Op1Info,
    OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo,
    OperandValueProperties Opd2PropInfo) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::SDIV &&
      Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // On X86, vector signed division by a uniform power-of-two constant is
    // normally expanded to the sequence SRA + SRL + ADD + SRA.
    // The OperandValue properties may not be the same as those of the
    // previous operation; conservatively assume OP_None.
    unsigned Cost =
        2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);
    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
                                   TargetTransformInfo::OP_None,
                                   TargetTransformInfo::OP_None);

    return Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  AVX2UniformConstCostTable[] = {
    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = {
    { ISD::SHL, MVT::v16i32, 1 },
    { ISD::SRL, MVT::v16i32, 1 },
    { ISD::SRA, MVT::v16i32, 1 },
    { ISD::SHL, MVT::v8i64,  1 },
    { ISD::SRL, MVT::v8i64,  1 },
    { ISD::SRA, MVT::v8i64,  1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
    // Shifts on v4i64/v8i32 are legal on AVX2, even though we declare them
    // as custom in order to detect the cases where the shift amount is a
    // scalar.
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },

    { ISD::SHL, MVT::v32i8,  42 },    // cmpeqb sequence.
    { ISD::SHL, MVT::v16i16, 16*10 }, // Scalarized.

    { ISD::SRL, MVT::v32i8,  32*10 }, // Scalarized.
    { ISD::SRL, MVT::v16i16, 8*10 },  // Scalarized.

    { ISD::SRA, MVT::v32i8,  32*10 }, // Scalarized.
    { ISD::SRA, MVT::v16i16, 16*10 }, // Scalarized.
    { ISD::SRA, MVT::v4i64,  4*10 },  // Scalarized.

    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV, MVT::v32i8,  32*20 },
    { ISD::SDIV, MVT::v16i16, 16*20 },
    { ISD::SDIV, MVT::v8i32,  8*20 },
    { ISD::SDIV, MVT::v4i64,  4*20 },
    { ISD::UDIV, MVT::v32i8,  32*20 },
    { ISD::UDIV, MVT::v16i16, 16*20 },
    { ISD::UDIV, MVT::v8i32,  8*20 },
    { ISD::UDIV, MVT::v4i64,  4*20 },
  };

  if (ST->hasAVX512()) {
    int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX512CostTable[Idx].Cost;
  }
  // Look for AVX2 lowering tricks.
  if (ST->hasAVX2()) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return LT.first;

    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * AVX2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType>
  SSE2UniformConstCostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // Constant splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i8, 1 },  // psllw.
    { ISD::SHL, MVT::v8i16, 1 },  // psllw.
    { ISD::SHL, MVT::v4i32, 1 },  // pslld.
    { ISD::SHL, MVT::v2i64, 1 },  // psllq.

    { ISD::SRL, MVT::v16i8, 1 },  // psrlw.
    { ISD::SRL, MVT::v8i16, 1 },  // psrlw.
    { ISD::SRL, MVT::v4i32, 1 },  // psrld.
    { ISD::SRL, MVT::v2i64, 1 },  // psrlq.

    { ISD::SRA, MVT::v16i8, 4 },  // psrlw, pand, pxor, psubb.
    { ISD::SRA, MVT::v8i16, 1 },  // psraw.
    { ISD::SRA, MVT::v4i32, 1 },  // psrad.

    { ISD::SDIV, MVT::v8i16, 6 },  // pmulhw sequence
    { ISD::UDIV, MVT::v8i16, 6 },  // pmulhuw sequence
    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;

    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2UniformConstCostTable[Idx].Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    EVT VT = LT.second;
    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
        (VT == MVT::v4i32 && ST->hasSSE41()))
      // Vector shift left by non uniform constant can be lowered
      // into vector multiply (pmullw/pmulld).
      return LT.first;
    if (VT == MVT::v4i32 && ST->hasSSE2())
      // A vector shift left by non uniform constant is converted
      // into a vector multiply; the new multiply is eventually
      // lowered into a sequence of shuffles and 2 x pmuludq.
      ISD = ISD::MUL;
  }

  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    // For some cases, where the shift amount is a scalar, we would be able
    // to generate better code. Unfortunately, when this is the case the value
    // (the splat) will get hoisted out of the loop, thereby making it invisible
    // to ISel. The cost model must return worst case assumptions because it is
    // used for vectorization and we don't want to make vectorized code worse
    // than scalar code.
    { ISD::SHL, MVT::v16i8, 30 },    // cmpeqb sequence.
    { ISD::SHL, MVT::v8i16, 8*10 },  // Scalarized.
    { ISD::SHL, MVT::v4i32, 2*5 },   // We optimized this using mul.
    { ISD::SHL, MVT::v2i64, 2*10 },  // Scalarized.
    { ISD::SHL, MVT::v4i64, 4*10 },  // Scalarized.

    { ISD::SRL, MVT::v16i8, 16*10 }, // Scalarized.
    { ISD::SRL, MVT::v8i16, 8*10 },  // Scalarized.
    { ISD::SRL, MVT::v4i32, 4*10 },  // Scalarized.
    { ISD::SRL, MVT::v2i64, 2*10 },  // Scalarized.

    { ISD::SRA, MVT::v16i8, 16*10 }, // Scalarized.
    { ISD::SRA, MVT::v8i16, 8*10 },  // Scalarized.
    { ISD::SRA, MVT::v4i32, 4*10 },  // Scalarized.
    { ISD::SRA, MVT::v2i64, 2*10 },  // Scalarized.

    // It is not a good idea to vectorize division. We have to scalarize it and
    // in the process we will often end up having to spill regular
    // registers. The overhead of division is going to dominate most kernels
    // anyway, so try hard to prevent vectorization of division - it is
    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
    // to hide "20 cycles" for each lane.
    { ISD::SDIV, MVT::v16i8, 16*20 },
    { ISD::SDIV, MVT::v8i16, 8*20 },
    { ISD::SDIV, MVT::v4i32, 4*20 },
    { ISD::SDIV, MVT::v2i64, 2*20 },
    { ISD::UDIV, MVT::v16i8, 16*20 },
    { ISD::UDIV, MVT::v8i16, 8*20 },
    { ISD::UDIV, MVT::v4i32, 4*20 },
    { ISD::UDIV, MVT::v2i64, 2*20 },
  };

  if (ST->hasSSE2()) {
    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
    if (Idx != -1)
      return LT.first * SSE2CostTable[Idx].Cost;
  }

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v16i16, 4 },
    { ISD::MUL, MVT::v8i32,  4 },
    { ISD::SUB, MVT::v8i32,  4 },
    { ISD::ADD, MVT::v8i32,  4 },
    { ISD::SUB, MVT::v4i64,  4 },
    { ISD::ADD, MVT::v4i64,  4 },
    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
    // are lowered as a series of long multiplies (3), shifts (4) and adds (2).
    // Because we believe v4i64 to be a legal type, we must also include the
    // split factor of two in the cost table. Therefore, the cost here is 18
    // instead of 9.
    { ISD::MUL, MVT::v4i64, 18 },
  };

  // Look for AVX1 lowering tricks.
  if (ST->hasAVX() && !ST->hasAVX2()) {
    EVT VT = LT.second;

    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
    // sequence of extract + two vector multiply + insert.
    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
      ISD = ISD::MUL;

    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
    if (Idx != -1)
      return LT.first * AVX1CostTable[Idx].Cost;
  }

  // Custom lowering of vectors.
  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
    // A v2i64/v4i64 multiply is custom lowered as a series of long
    // multiplies (3), shifts (4) and adds (2).
    { ISD::MUL, MVT::v2i64, 9 },
    { ISD::MUL, MVT::v4i64, 9 },
  };
  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
  if (Idx != -1)
    return LT.first * CustomLowered[Idx].Cost;

  // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
  // 2x pmuludq, 2x shuffle.
  if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
      !ST->hasSSE41())
    return LT.first * 6;

  // Fallback to the default implementation.
  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
                                                     Op2Info);
}

unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) const {
  // We only estimate the cost of reverse and alternate shuffles.
  if (Kind != SK_Reverse && Kind != SK_Alternate)
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);

  if (Kind == SK_Reverse) {
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
    unsigned Cost = 1;
    if (LT.second.getSizeInBits() > 128)
      Cost = 3; // Extract + insert + copy.

    // Multiply by the number of parts.
    return Cost * LT.first;
  }

  if (Kind == SK_Alternate) {
    // 64-bit packed float vectors (v2f32) are widened to type v4f32.
    // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);

    // The backend knows how to generate a single VEX.256 version of
    // instruction VPBLENDW if the target supports AVX2.
    if (ST->hasAVX2() && LT.second == MVT::v16i16)
      return LT.first;

    static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1},  // vblendpd
      {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1},  // vblendpd

      {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1},  // vblendps
      {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1},  // vblendps

      // This shuffle is custom lowered into a sequence of:
      //   2x vextractf128 , 2x vpblendw , 1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},

      // This shuffle is custom lowered into a long sequence of:
      //   2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
      {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
    };

    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * AVXAltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = {
      // These are lowered into movsd.
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},

      // packed float vectors with four elements are lowered into BLENDI dag
      // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},

      // This shuffle generates a single pshufw.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},

      // There is no instruction that matches a v16i8 alternate shuffle.
      // The backend will expand it into the sequence 'pshufb + pshufb + or'.
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
    };

    if (ST->hasSSE41()) {
      int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSE41AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
      // the sequence 'shufps + pshufd'
      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},

      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3},  // pshufb + pshufb + or
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}   // pshufb + pshufb + or
    };

    if (ST->hasSSSE3()) {
      int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
      if (Idx != -1)
        return LT.first * SSSE3AltShuffleTbl[Idx].Cost;
    }

    static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = {
      {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},  // movsd
      {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},  // movsd

      {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},  // shufps + pshufd
      {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},  // shufps + pshufd

      // This is expanded into a long sequence of four extract + four insert.
      {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},  // 4 x pextrw + 4 pinsrw.

      // 8 x (pinsrw + pextrw + and + movb + movzb + or)
      {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
    };

    // Fall-back (SSE3 and SSE2).
    int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
    if (Idx != -1)
      return LT.first * SSEAltShuffleTbl[Idx].Cost;
    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
  }

  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
}

unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
  std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  SSE2ConvTbl[] = {
    // These are somewhat magic numbers justified by looking at the output of
    // Intel's IACA, running some kernels and making sure when we take
    // legalization into account the throughput will be overestimated.
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
    // There are faster sequences for float conversions.
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
  };

  if (ST->hasSSE2() && !ST->hasAVX()) {
    int Idx =
        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
    if (Idx != -1)
      return LTSrc.first * SSE2ConvTbl[Idx].Cost;
  }

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX512ConversionTbl[] = {
    { ISD::FP_EXTEND, MVT::v8f64,  MVT::v8f32,  1 },
    { ISD::FP_EXTEND, MVT::v8f64,  MVT::v16f32, 3 },
    { ISD::FP_ROUND,  MVT::v8f32,  MVT::v8f64,  1 },
    { ISD::FP_ROUND,  MVT::v16f32, MVT::v8f64,  3 },

    { ISD::TRUNCATE,  MVT::v16i8,  MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v16i16, MVT::v16i32, 1 },
    { ISD::TRUNCATE,  MVT::v8i16,  MVT::v8i64,  1 },
    { ISD::TRUNCATE,  MVT::v8i32,  MVT::v8i64,  1 },
    { ISD::TRUNCATE,  MVT::v16i32, MVT::v8i64,  4 },

    // v16i1 -> v16i32 - load + broadcast
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1,  2 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1,  2 },

    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8,  1 },
    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
    { ISD::ZERO_EXTEND, MVT::v8i64,  MVT::v16i32, 3 },
  };

  if (ST->hasAVX512()) {
    int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second,
                                     LTSrc.second);
    if (Idx != -1)
      return AVX512ConversionTbl[Idx].Cost;
  }
  EVT SrcTy = TLI->getValueType(Src);
  EVT DstTy = TLI->getValueType(Dst);

  // The function getSimpleVT only handles simple value types.
  if (!SrcTy.isSimple() || !DstTy.isSimple())
    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVX2ConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 1 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 1 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 1 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 1 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64, 2 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64, 2 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64, 2 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32, 2 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32, 2 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64, 4 },

    { ISD::FP_EXTEND,   MVT::v8f64,  MVT::v8f32, 3 },
    { ISD::FP_ROUND,    MVT::v8f32,  MVT::v8f64, 3 },
  };

  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
  AVXConversionTbl[] = {
    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i1,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i1,  4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i8,  7 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i8,  4 },
    { ISD::SIGN_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
    { ISD::ZERO_EXTEND, MVT::v8i32,  MVT::v8i16, 4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i1,  6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i1,  4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i8,  6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i8,  4 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i16, 6 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i16, 3 },
    { ISD::SIGN_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },
    { ISD::ZERO_EXTEND, MVT::v4i64,  MVT::v4i32, 4 },

    { ISD::TRUNCATE,    MVT::v4i8,   MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i16,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v4i32,  MVT::v4i64,  4 },
    { ISD::TRUNCATE,    MVT::v8i8,   MVT::v8i32,  4 },
    { ISD::TRUNCATE,    MVT::v8i16,  MVT::v8i32,  5 },
    { ISD::TRUNCATE,    MVT::v16i8,  MVT::v16i16, 4 },
    { ISD::TRUNCATE,    MVT::v8i32,  MVT::v8i64,  9 },

    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i1,  8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i8,  8 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i16, 5 },
    { ISD::SINT_TO_FP,  MVT::v8f32,  MVT::v8i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i8,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f32,  MVT::v4i32, 1 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i1,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i8,  3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i16, 3 },
    { ISD::SINT_TO_FP,  MVT::v4f64,  MVT::v4i32, 1 },

    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i1,  6 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i8,  5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i16, 5 },
    { ISD::UINT_TO_FP,  MVT::v8f32,  MVT::v8i32, 9 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i1,  7 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i8,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f32,  MVT::v4i32, 6 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i1,  7 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i8,  2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i16, 2 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i32, 6 },
    // The generic code to compute the scalar overhead is currently broken.
    // Workaround this limitation by estimating the scalarization overhead
    // here. We have roughly 10 instructions per scalar element.
    // Multiply that by the vector width.
    // FIXME: remove that when PR19268 is fixed.
    { ISD::UINT_TO_FP,  MVT::v2f64,  MVT::v2i64, 2*10 },
    { ISD::UINT_TO_FP,  MVT::v4f64,  MVT::v4i64, 4*10 },

    { ISD::FP_TO_SINT,  MVT::v8i8,   MVT::v8f32, 7 },
    { ISD::FP_TO_SINT,  MVT::v4i8,   MVT::v4f32, 1 },
    // This node is expanded into scalarized operations but BasicTTI is overly
    // optimistic estimating its cost. It computes 3 per element (one
    // vector-extract, one scalar conversion and one vector-insert). The
    // problem is that the inserts form a read-modify-write chain so latency
    // should be factored in too. Inflating the cost per element by 1.
    { ISD::FP_TO_UINT,  MVT::v8i32,  MVT::v8f32, 8*4 },
    { ISD::FP_TO_UINT,  MVT::v4i32,  MVT::v4f64, 4*4 },
  };

  if (ST->hasAVX2()) {
    int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVX2ConversionTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
                                     SrcTy.getSimpleVT());
    if (Idx != -1)
      return AVXConversionTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
}

unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                    Type *CondTy) const {
  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
    { ISD::SETCC, MVT::v2f64, 1 },
    { ISD::SETCC, MVT::v4f32, 1 },
    { ISD::SETCC, MVT::v2i64, 1 },
    { ISD::SETCC, MVT::v4i32, 1 },
    { ISD::SETCC, MVT::v8i16, 1 },
    { ISD::SETCC, MVT::v16i8, 1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
    { ISD::SETCC, MVT::v4f64, 1 },
    { ISD::SETCC, MVT::v8f32, 1 },
    // AVX1 does not support 8-wide integer compare.
    { ISD::SETCC, MVT::v4i64,  4 },
    { ISD::SETCC, MVT::v8i32,  4 },
    { ISD::SETCC, MVT::v16i16, 4 },
    { ISD::SETCC, MVT::v32i8,  4 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
    { ISD::SETCC, MVT::v4i64,  1 },
    { ISD::SETCC, MVT::v8i32,  1 },
    { ISD::SETCC, MVT::v16i16, 1 },
    { ISD::SETCC, MVT::v32i8,  1 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = {
    { ISD::SETCC, MVT::v8i64,  1 },
    { ISD::SETCC, MVT::v16i32, 1 },
    { ISD::SETCC, MVT::v8f64,  1 },
    { ISD::SETCC, MVT::v16f32, 1 },
  };

  if (ST->hasAVX512()) {
    int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX512CostTbl[Idx].Cost;
  }

  if (ST->hasAVX2()) {
    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX2CostTbl[Idx].Cost;
  }

  if (ST->hasAVX()) {
    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * AVX1CostTbl[Idx].Cost;
  }

  if (ST->hasSSE42()) {
    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
    if (Idx != -1)
      return LT.first * SSE42CostTbl[Idx].Cost;
  }

  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
}

unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
                                    unsigned Index) const {
  assert(Val->isVectorTy() && "This must be a vector type");

  if (Index != -1U) {
    // Legalize the type.
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned Width = LT.second.getVectorNumElements();
    Index = Index % Width;

    // Floating point scalars are already located in index #0.
    if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
      return 0;
  }

  return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
}

unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
                                          bool Extract) const {
  assert(Ty->isVectorTy() && "Can only scalarize vectors");
  unsigned Cost = 0;

  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
    if (Insert)
      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
    if (Extract)
      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
  }

  return Cost;
}

unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                 unsigned AddressSpace) const {
  // Handle non-power-of-two vectors such as <3 x float>
  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
    unsigned NumElem = VTy->getVectorNumElements();

    // Handle a few common cases:
    // <3 x float>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
      // Cost = 64 bit store + extract + 32 bit store.
      return 3;

    // <3 x double>
    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
      // Cost = 128 bit store + unpack + 64 bit store.
      return 3;

    // Assume that all other non-power-of-two numbers are scalarized.
    if (!isPowerOf2_32(NumElem)) {
      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
                                                           VTy->getScalarType(),
                                                           Alignment,
                                                           AddressSpace);
      unsigned SplitCost = getScalarizationOverhead(Src,
                                                    Opcode == Instruction::Load,
                                                    Opcode == Instruction::Store);
      return NumElem * Cost + SplitCost;
    }
  }

  // Legalize the type.
  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  // Each load/store unit costs 1.
  unsigned Cost = LT.first * 1;

  // On Sandybridge 256bit load/stores are double pumped
  // (but not on Haswell).
  if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
    Cost *= 2;

  return Cost;
}

unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
  // Address computations in vectorized code with non-consecutive addresses will
  // likely result in more instructions compared to scalar code where the
  // computation can more often be merged into the index mode. The resulting
  // extra micro-ops can significantly decrease throughput.
  unsigned NumVectorInstToHideOverhead = 10;

  if (Ty->isVectorTy() && IsComplex)
    return NumVectorInstToHideOverhead;

  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
}

unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
                                  bool IsPairwise) const {

  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);

  MVT MTy = LT.second;

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  // We use the Intel Architecture Code Analyzer (IACA) to measure the
  // throughput and use that as the cost.

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD,  MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD,  MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD,  MVT::v8i16, 5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::FADD, MVT::v4f64, 5 },
    { ISD::FADD, MVT::v8f32, 7 },
    { ISD::ADD,  MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD,  MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
    { ISD::ADD,  MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
    { ISD::ADD,  MVT::v8i16, 5 },
    { ISD::ADD,  MVT::v8i32, 5 },
  };

  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD,  MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD,  MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD,  MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
  };

  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v4f64, 3 },
    { ISD::FADD, MVT::v8f32, 4 },
    { ISD::ADD,  MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD,  MVT::v4i32, 3 }, // The data reported by the IACA tool is "2.8".
    { ISD::ADD,  MVT::v4i64, 3 },
    { ISD::ADD,  MVT::v8i16, 4 },
    { ISD::ADD,  MVT::v8i32, 5 },
  };

  if (IsPairwise) {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblPairWise[Idx].Cost;
    }
  } else {
    if (ST->hasAVX()) {
      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
    }

    if (ST->hasSSE42()) {
      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
      if (Idx != -1)
        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
    }
  }

  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
}

/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
unsigned X86TTI::getIntImmCost(int64_t Val) const {
  if (Val == 0)
    return TCC_Free;

  if (isInt<32>(Val))
    return TCC_Basic;

  return 2 * TCC_Basic;
}

unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  // Never hoist constants larger than 128bit, because this might lead to
  // incorrect code generation or assertions in codegen.
  // FIXME: Create a cost model for types larger than i128 once the codegen
  // issues have been fixed.
  if (BitSize > 128)
    return TCC_Free;

  if (Imm == 0)
    return TCC_Free;

  // Sign-extend all constants to a multiple of 64-bit.
  APInt ImmVal = Imm;
  if (BitSize & 0x3f)
    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  // Split the constant into 64-bit chunks and calculate the cost for each
  // chunk.
  unsigned Cost = 0;
  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
    int64_t Val = Tmp.getSExtValue();
    Cost += getIntImmCost(Val);
  }
  // We need at least one instruction to materialize the constant.
  return std::max(1U, Cost);
}

unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
                               Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default: return TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TCC_Basic;
    return TCC_Free;
  case Instruction::Store:
    ImmIdx = 0;
    break;
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    break;
  }

  if (Idx == ImmIdx) {
    unsigned NumConstants = (BitSize + 63) / 64;
    unsigned Cost = X86TTI::getIntImmCost(Imm, Ty);
    return (Cost <= NumConstants * TCC_Basic)
               ? static_cast<unsigned>(TCC_Free)
               : Cost;
  }

  return X86TTI::getIntImmCost(Imm, Ty);
}

unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
                               const APInt &Imm, Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TCC_Free;

  switch (IID) {
  default: return TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<32>(Imm.getSExtValue()))
      return TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TCC_Free;
    break;
  }
  return X86TTI::getIntImmCost(Imm, Ty);
}
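
// Illustrative usage (not part of this file): a minimal sketch of how a client
// pass would reach this cost model through the TargetTransformInfo analysis
// group in the LLVM API of this era. Clients never name X86TTI directly; they
// request TargetTransformInfo, and the TTI pass stack set up in
// initializePass() routes the queries to the most specific implementation
// (this one when targeting x86). The pass name MyVectorCostPass, the queried
// opcode, and the vector type below are hypothetical examples chosen for the
// sketch, not anything defined by LLVM.
//
//   #include "llvm/Analysis/TargetTransformInfo.h"
//   #include "llvm/IR/DerivedTypes.h"
//   #include "llvm/IR/Function.h"
//   #include "llvm/IR/Instruction.h"
//   #include "llvm/Pass.h"
//   using namespace llvm;
//
//   namespace {
//   struct MyVectorCostPass : public FunctionPass {
//     static char ID;
//     MyVectorCostPass() : FunctionPass(ID) {}
//
//     void getAnalysisUsage(AnalysisUsage &AU) const override {
//       // Request the TTI analysis group; on x86 it resolves to X86TTI.
//       AU.addRequired<TargetTransformInfo>();
//       AU.setPreservesAll();
//     }
//
//     bool runOnFunction(Function &F) override {
//       const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
//       // Ask what a <4 x i32> multiply would cost. On an SSE2-only target
//       // this hits the "2x shuffle + 2x pmuludq + 2x shuffle" path above.
//       Type *VecTy = VectorType::get(Type::getInt32Ty(F.getContext()), 4);
//       unsigned MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VecTy);
//       (void)MulCost; // A real client would compare this against scalar cost.
//       return false;
//     }
//   };
//   char MyVectorCostPass::ID = 0;
//   }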