LLVM API Documentation
00001 //===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // \file 00011 // This file implements a TargetTransformInfo analysis pass specific to the 00012 // AMDGPU target machine. It uses the target's detailed information to provide 00013 // more precise answers to certain TTI queries, while letting the target 00014 // independent and default TTI implementations handle the rest. 00015 // 00016 //===----------------------------------------------------------------------===// 00017 00018 #include "AMDGPU.h" 00019 #include "AMDGPUTargetMachine.h" 00020 #include "llvm/Analysis/LoopInfo.h" 00021 #include "llvm/Analysis/TargetTransformInfo.h" 00022 #include "llvm/Analysis/ValueTracking.h" 00023 #include "llvm/Support/Debug.h" 00024 #include "llvm/Target/CostTable.h" 00025 #include "llvm/Target/TargetLowering.h" 00026 using namespace llvm; 00027 00028 #define DEBUG_TYPE "AMDGPUtti" 00029 00030 // Declare the pass initialization routine locally as target-specific passes 00031 // don't have a target-wide initialization entry point, and so we rely on the 00032 // pass constructor initialization. 00033 namespace llvm { 00034 void initializeAMDGPUTTIPass(PassRegistry &); 00035 } 00036 00037 namespace { 00038 00039 class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo { 00040 const AMDGPUTargetMachine *TM; 00041 const AMDGPUSubtarget *ST; 00042 const AMDGPUTargetLowering *TLI; 00043 00044 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 00045 /// are set if the result needs to be inserted and/or extracted from vectors. 00046 unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; 00047 00048 public: 00049 AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { 00050 llvm_unreachable("This pass cannot be directly constructed"); 00051 } 00052 00053 AMDGPUTTI(const AMDGPUTargetMachine *TM) 00054 : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), 00055 TLI(TM->getSubtargetImpl()->getTargetLowering()) { 00056 initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); 00057 } 00058 00059 void initializePass() override { pushTTIStack(this); } 00060 00061 void getAnalysisUsage(AnalysisUsage &AU) const override { 00062 TargetTransformInfo::getAnalysisUsage(AU); 00063 } 00064 00065 /// Pass identification. 00066 static char ID; 00067 00068 /// Provide necessary pointer adjustments for the two base classes. 00069 void *getAdjustedAnalysisPointer(const void *ID) override { 00070 if (ID == &TargetTransformInfo::ID) 00071 return (TargetTransformInfo *)this; 00072 return this; 00073 } 00074 00075 bool hasBranchDivergence() const override; 00076 00077 void getUnrollingPreferences(const Function *F, Loop *L, 00078 UnrollingPreferences &UP) const override; 00079 00080 PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override; 00081 00082 unsigned getNumberOfRegisters(bool Vector) const override; 00083 unsigned getRegisterBitWidth(bool Vector) const override; 00084 unsigned getMaxInterleaveFactor() const override; 00085 00086 /// @} 00087 }; 00088 00089 } // end anonymous namespace 00090 00091 INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti", 00092 "AMDGPU Target Transform Info", true, true, false) 00093 char AMDGPUTTI::ID = 0; 00094 00095 ImmutablePass * 00096 llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { 00097 return new AMDGPUTTI(TM); 00098 } 00099 00100 bool AMDGPUTTI::hasBranchDivergence() const { return true; } 00101 00102 void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L, 00103 UnrollingPreferences &UP) const { 00104 UP.Threshold = 300; // Twice the default. 00105 UP.Count = UINT_MAX; 00106 UP.Partial = true; 00107 00108 // TODO: Do we want runtime unrolling? 00109 00110 for (const BasicBlock *BB : L->getBlocks()) { 00111 for (const Instruction &I : *BB) { 00112 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I); 00113 if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) 00114 continue; 00115 00116 const Value *Ptr = GEP->getPointerOperand(); 00117 const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr)); 00118 if (Alloca) { 00119 // We want to do whatever we can to limit the number of alloca 00120 // instructions that make it through to the code generator. allocas 00121 // require us to use indirect addressing, which is slow and prone to 00122 // compiler bugs. If this loop does an address calculation on an 00123 // alloca ptr, then we want to use a higher than normal loop unroll 00124 // threshold. This will give SROA a better chance to eliminate these 00125 // allocas. 00126 // 00127 // Don't use the maximum allowed value here as it will make some 00128 // programs way too big. 00129 UP.Threshold = 800; 00130 } 00131 } 00132 } 00133 } 00134 00135 AMDGPUTTI::PopcntSupportKind 00136 AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const { 00137 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); 00138 return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software; 00139 } 00140 00141 unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const { 00142 if (Vec) 00143 return 0; 00144 00145 // Number of VGPRs on SI. 00146 if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) 00147 return 256; 00148 00149 return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 00150 } 00151 00152 unsigned AMDGPUTTI::getRegisterBitWidth(bool) const { 00153 return 32; 00154 } 00155 00156 unsigned AMDGPUTTI::getMaxInterleaveFactor() const { 00157 // Semi-arbitrary large amount. 00158 return 64; 00159 }