LLVM API Documentation

AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
00001 //===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // \file
00011 // This file implements a TargetTransformInfo analysis pass specific to the
00012 // AMDGPU target machine. It uses the target's detailed information to provide
00013 // more precise answers to certain TTI queries, while letting the target
00014 // independent and default TTI implementations handle the rest.
00015 //
00016 //===----------------------------------------------------------------------===//
00017 
00018 #include "AMDGPU.h"
00019 #include "AMDGPUTargetMachine.h"
00020 #include "llvm/Analysis/LoopInfo.h"
00021 #include "llvm/Analysis/TargetTransformInfo.h"
00022 #include "llvm/Analysis/ValueTracking.h"
00023 #include "llvm/Support/Debug.h"
00024 #include "llvm/Target/CostTable.h"
00025 #include "llvm/Target/TargetLowering.h"
00026 using namespace llvm;
00027 
00028 #define DEBUG_TYPE "AMDGPUtti"
00029 
00030 // Declare the pass initialization routine locally as target-specific passes
00031 // don't have a target-wide initialization entry point, and so we rely on the
00032 // pass constructor initialization.
00033 namespace llvm {
00034 void initializeAMDGPUTTIPass(PassRegistry &);
00035 }
00036 
00037 namespace {
00038 
00039 class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
00040   const AMDGPUTargetMachine *TM;
00041   const AMDGPUSubtarget *ST;
00042   const AMDGPUTargetLowering *TLI;
00043 
00044   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
00045   /// are set if the result needs to be inserted and/or extracted from vectors.
00046   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
00047 
00048 public:
00049   AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
00050     llvm_unreachable("This pass cannot be directly constructed");
00051   }
00052 
00053   AMDGPUTTI(const AMDGPUTargetMachine *TM)
00054       : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
00055         TLI(TM->getSubtargetImpl()->getTargetLowering()) {
00056     initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
00057   }
00058 
00059   void initializePass() override { pushTTIStack(this); }
00060 
00061   void getAnalysisUsage(AnalysisUsage &AU) const override {
00062     TargetTransformInfo::getAnalysisUsage(AU);
00063   }
00064 
00065   /// Pass identification.
00066   static char ID;
00067 
00068   /// Provide necessary pointer adjustments for the two base classes.
00069   void *getAdjustedAnalysisPointer(const void *ID) override {
00070     if (ID == &TargetTransformInfo::ID)
00071       return (TargetTransformInfo *)this;
00072     return this;
00073   }
00074 
00075   bool hasBranchDivergence() const override;
00076 
00077   void getUnrollingPreferences(const Function *F, Loop *L,
00078                                UnrollingPreferences &UP) const override;
00079 
00080   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const override;
00081 
00082   unsigned getNumberOfRegisters(bool Vector) const override;
00083   unsigned getRegisterBitWidth(bool Vector) const override;
00084   unsigned getMaxInterleaveFactor() const override;
00085 
00086   /// @}
00087 };
00088 
00089 } // end anonymous namespace
00090 
00091 INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
00092                    "AMDGPU Target Transform Info", true, true, false)
00093 char AMDGPUTTI::ID = 0;
00094 
00095 ImmutablePass *
00096 llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
00097   return new AMDGPUTTI(TM);
00098 }
00099 
00100 bool AMDGPUTTI::hasBranchDivergence() const { return true; }
00101 
00102 void AMDGPUTTI::getUnrollingPreferences(const Function *, Loop *L,
00103                                         UnrollingPreferences &UP) const {
00104   UP.Threshold = 300; // Twice the default.
00105   UP.Count = UINT_MAX;
00106   UP.Partial = true;
00107 
00108   // TODO: Do we want runtime unrolling?
00109 
00110   for (const BasicBlock *BB : L->getBlocks()) {
00111     for (const Instruction &I : *BB) {
00112       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
00113       if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
00114         continue;
00115 
00116       const Value *Ptr = GEP->getPointerOperand();
00117       const AllocaInst *Alloca = dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr));
00118       if (Alloca) {
00119         // We want to do whatever we can to limit the number of alloca
00120         // instructions that make it through to the code generator.  allocas
00121         // require us to use indirect addressing, which is slow and prone to
00122         // compiler bugs.  If this loop does an address calculation on an
00123         // alloca ptr, then we want to use a higher than normal loop unroll
00124         // threshold. This will give SROA a better chance to eliminate these
00125         // allocas.
00126         //
00127         // Don't use the maximum allowed value here as it will make some
00128         // programs way too big.
00129         UP.Threshold = 800;
00130       }
00131     }
00132   }
00133 }
00134 
00135 AMDGPUTTI::PopcntSupportKind
00136 AMDGPUTTI::getPopcntSupport(unsigned TyWidth) const {
00137   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
00138   return ST->hasBCNT(TyWidth) ? PSK_FastHardware : PSK_Software;
00139 }
00140 
00141 unsigned AMDGPUTTI::getNumberOfRegisters(bool Vec) const {
00142   if (Vec)
00143     return 0;
00144 
00145   // Number of VGPRs on SI.
00146   if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
00147     return 256;
00148 
00149   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
00150 }
00151 
00152 unsigned AMDGPUTTI::getRegisterBitWidth(bool) const {
00153   return 32;
00154 }
00155 
00156 unsigned AMDGPUTTI::getMaxInterleaveFactor() const {
00157   // Semi-arbitrary large amount.
00158   return 64;
00159 }