LLVM API Documentation

AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
00001 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This pass eliminates allocas by either converting them into vectors or
00011 // by migrating them to local address space.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "AMDGPU.h"
00016 #include "AMDGPUSubtarget.h"
00017 #include "llvm/Analysis/ValueTracking.h"
00018 #include "llvm/IR/IRBuilder.h"
00019 #include "llvm/IR/InstVisitor.h"
00020 #include "llvm/Support/Debug.h"
00021 
00022 #define DEBUG_TYPE "amdgpu-promote-alloca"
00023 
00024 using namespace llvm;
00025 
00026 namespace {
00027 
00028 class AMDGPUPromoteAlloca : public FunctionPass,
00029                        public InstVisitor<AMDGPUPromoteAlloca> {
00030 
00031   static char ID;
00032   Module *Mod;
00033   const AMDGPUSubtarget &ST;
00034   int LocalMemAvailable;
00035 
00036 public:
00037   AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
00038                                                    LocalMemAvailable(0) { }
00039   bool doInitialization(Module &M) override;
00040   bool runOnFunction(Function &F) override;
00041   const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
00042   void visitAlloca(AllocaInst &I);
00043 };
00044 
00045 } // End anonymous namespace
00046 
00047 char AMDGPUPromoteAlloca::ID = 0;
00048 
00049 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
00050   Mod = &M;
00051   return false;
00052 }
00053 
00054 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
00055 
00056   const FunctionType *FTy = F.getFunctionType();
00057 
00058   LocalMemAvailable = ST.getLocalMemorySize();
00059 
00060 
00061   // If the function has any arguments in the local address space, then it's
00062   // possible these arguments require the entire local memory space, so
00063   // we cannot use local memory in the pass.
00064   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
00065     const Type *ParamTy = FTy->getParamType(i);
00066     if (ParamTy->isPointerTy() &&
00067         ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
00068       LocalMemAvailable = 0;
00069       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
00070                       "local memory disabled.\n");
00071       break;
00072     }
00073   }
00074 
00075   if (LocalMemAvailable > 0) {
00076     // Check how much local memory is being used by global objects
00077     for (Module::global_iterator I = Mod->global_begin(),
00078                                  E = Mod->global_end(); I != E; ++I) {
00079       GlobalVariable *GV = I;
00080       PointerType *GVTy = GV->getType();
00081       if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
00082         continue;
00083       for (Value::use_iterator U = GV->use_begin(),
00084                                UE = GV->use_end(); U != UE; ++U) {
00085         Instruction *Use = dyn_cast<Instruction>(*U);
00086         if (!Use)
00087           continue;
00088         if (Use->getParent()->getParent() == &F)
00089           LocalMemAvailable -=
00090               Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
00091       }
00092     }
00093   }
00094 
00095   LocalMemAvailable = std::max(0, LocalMemAvailable);
00096   DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
00097 
00098   visit(F);
00099 
00100   return false;
00101 }
00102 
00103 static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
00104   return VectorType::get(ArrayTy->getArrayElementType(),
00105                          ArrayTy->getArrayNumElements());
00106 }
00107 
00108 static Value* calculateVectorIndex(Value *Ptr,
00109                                   std::map<GetElementPtrInst*, Value*> GEPIdx) {
00110   if (isa<AllocaInst>(Ptr))
00111     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
00112 
00113   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
00114 
00115   return GEPIdx[GEP];
00116 }
00117 
00118 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
00119   // FIXME we only support simple cases
00120   if (GEP->getNumOperands() != 3)
00121     return NULL;
00122 
00123   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
00124   if (!I0 || !I0->isZero())
00125     return NULL;
00126 
00127   return GEP->getOperand(2);
00128 }
00129 
00130 // Not an instruction handled below to turn into a vector.
00131 //
00132 // TODO: Check isTriviallyVectorizable for calls and handle other
00133 // instructions.
00134 static bool canVectorizeInst(Instruction *Inst) {
00135   switch (Inst->getOpcode()) {
00136   case Instruction::Load:
00137   case Instruction::Store:
00138   case Instruction::BitCast:
00139   case Instruction::AddrSpaceCast:
00140     return true;
00141   default:
00142     return false;
00143   }
00144 }
00145 
00146 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
00147   Type *AllocaTy = Alloca->getAllocatedType();
00148 
00149   DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
00150 
00151   // FIXME: There is no reason why we can't support larger arrays, we
00152   // are just being conservative for now.
00153   if (!AllocaTy->isArrayTy() ||
00154       AllocaTy->getArrayElementType()->isVectorTy() ||
00155       AllocaTy->getArrayNumElements() > 4) {
00156 
00157     DEBUG(dbgs() << "  Cannot convert type to vector");
00158     return false;
00159   }
00160 
00161   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
00162   std::vector<Value*> WorkList;
00163   for (User *AllocaUser : Alloca->users()) {
00164     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
00165     if (!GEP) {
00166       if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
00167         return false;
00168 
00169       WorkList.push_back(AllocaUser);
00170       continue;
00171     }
00172 
00173     Value *Index = GEPToVectorIndex(GEP);
00174 
00175     // If we can't compute a vector index from this GEP, then we can't
00176     // promote this alloca to vector.
00177     if (!Index) {
00178       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
00179       return false;
00180     }
00181 
00182     GEPVectorIdx[GEP] = Index;
00183     for (User *GEPUser : AllocaUser->users()) {
00184       if (!canVectorizeInst(cast<Instruction>(GEPUser)))
00185         return false;
00186 
00187       WorkList.push_back(GEPUser);
00188     }
00189   }
00190 
00191   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
00192 
00193   DEBUG(dbgs() << "  Converting alloca to vector "
00194         << *AllocaTy << " -> " << *VectorTy << '\n');
00195 
00196   for (std::vector<Value*>::iterator I = WorkList.begin(),
00197                                      E = WorkList.end(); I != E; ++I) {
00198     Instruction *Inst = cast<Instruction>(*I);
00199     IRBuilder<> Builder(Inst);
00200     switch (Inst->getOpcode()) {
00201     case Instruction::Load: {
00202       Value *Ptr = Inst->getOperand(0);
00203       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
00204       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
00205       Value *VecValue = Builder.CreateLoad(BitCast);
00206       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
00207       Inst->replaceAllUsesWith(ExtractElement);
00208       Inst->eraseFromParent();
00209       break;
00210     }
00211     case Instruction::Store: {
00212       Value *Ptr = Inst->getOperand(1);
00213       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
00214       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
00215       Value *VecValue = Builder.CreateLoad(BitCast);
00216       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
00217                                                        Inst->getOperand(0),
00218                                                        Index);
00219       Builder.CreateStore(NewVecValue, BitCast);
00220       Inst->eraseFromParent();
00221       break;
00222     }
00223     case Instruction::BitCast:
00224     case Instruction::AddrSpaceCast:
00225       break;
00226 
00227     default:
00228       Inst->dump();
00229       llvm_unreachable("Inconsistency in instructions promotable to vector");
00230     }
00231   }
00232   return true;
00233 }
00234 
00235 static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
00236   for (User *User : Val->users()) {
00237     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
00238       continue;
00239     if (isa<CallInst>(User)) {
00240       WorkList.push_back(User);
00241       continue;
00242     }
00243     if (!User->getType()->isPointerTy())
00244       continue;
00245     WorkList.push_back(User);
00246     collectUsesWithPtrTypes(User, WorkList);
00247   }
00248 }
00249 
00250 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
00251   IRBuilder<> Builder(&I);
00252 
00253   // First try to replace the alloca with a vector
00254   Type *AllocaTy = I.getAllocatedType();
00255 
00256   DEBUG(dbgs() << "Trying to promote " << I << '\n');
00257 
00258   if (tryPromoteAllocaToVector(&I))
00259     return;
00260 
00261   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
00262 
00263   // FIXME: This is the maximum work group size.  We should try to get
00264   // value from the reqd_work_group_size function attribute if it is
00265   // available.
00266   unsigned WorkGroupSize = 256;
00267   int AllocaSize = WorkGroupSize *
00268       Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
00269 
00270   if (AllocaSize > LocalMemAvailable) {
00271     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
00272     return;
00273   }
00274 
00275   DEBUG(dbgs() << "Promoting alloca to local memory\n");
00276   LocalMemAvailable -= AllocaSize;
00277 
00278   GlobalVariable *GV = new GlobalVariable(
00279       *Mod, ArrayType::get(I.getAllocatedType(), 256), false,
00280       GlobalValue::ExternalLinkage, 0, I.getName(), 0,
00281       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
00282 
00283   FunctionType *FTy = FunctionType::get(
00284       Type::getInt32Ty(Mod->getContext()), false);
00285   AttributeSet AttrSet;
00286   AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
00287 
00288   Value *ReadLocalSizeY = Mod->getOrInsertFunction(
00289       "llvm.r600.read.local.size.y", FTy, AttrSet);
00290   Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
00291       "llvm.r600.read.local.size.z", FTy, AttrSet);
00292   Value *ReadTIDIGX = Mod->getOrInsertFunction(
00293       "llvm.r600.read.tidig.x", FTy, AttrSet);
00294   Value *ReadTIDIGY = Mod->getOrInsertFunction(
00295       "llvm.r600.read.tidig.y", FTy, AttrSet);
00296   Value *ReadTIDIGZ = Mod->getOrInsertFunction(
00297       "llvm.r600.read.tidig.z", FTy, AttrSet);
00298 
00299 
00300   Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
00301   Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
00302   Value *TIdX  = Builder.CreateCall(ReadTIDIGX);
00303   Value *TIdY  = Builder.CreateCall(ReadTIDIGY);
00304   Value *TIdZ  = Builder.CreateCall(ReadTIDIGZ);
00305 
00306   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
00307   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
00308   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
00309   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
00310   TID = Builder.CreateAdd(TID, TIdZ);
00311 
00312   std::vector<Value*> Indices;
00313   Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
00314   Indices.push_back(TID);
00315 
00316   Value *Offset = Builder.CreateGEP(GV, Indices);
00317   I.mutateType(Offset->getType());
00318   I.replaceAllUsesWith(Offset);
00319   I.eraseFromParent();
00320 
00321   std::vector<Value*> WorkList;
00322 
00323   collectUsesWithPtrTypes(Offset, WorkList);
00324 
00325   for (std::vector<Value*>::iterator i = WorkList.begin(),
00326                                      e = WorkList.end(); i != e; ++i) {
00327     Value *V = *i;
00328     CallInst *Call = dyn_cast<CallInst>(V);
00329     if (!Call) {
00330       Type *EltTy = V->getType()->getPointerElementType();
00331       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
00332 
00333       // The operand's value should be corrected on its own.
00334       if (isa<AddrSpaceCastInst>(V))
00335         continue;
00336 
00337       // FIXME: It doesn't really make sense to try to do this for all
00338       // instructions.
00339       V->mutateType(NewTy);
00340       continue;
00341     }
00342 
00343     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
00344     if (!Intr) {
00345       std::vector<Type*> ArgTypes;
00346       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
00347                                 ArgIdx != ArgEnd; ++ArgIdx) {
00348         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
00349       }
00350       Function *F = Call->getCalledFunction();
00351       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
00352                                                 F->isVarArg());
00353       Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
00354                                              F->getAttributes());
00355       Function *NewF = cast<Function>(C);
00356       Call->setCalledFunction(NewF);
00357       continue;
00358     }
00359 
00360     Builder.SetInsertPoint(Intr);
00361     switch (Intr->getIntrinsicID()) {
00362     case Intrinsic::lifetime_start:
00363     case Intrinsic::lifetime_end:
00364       // These intrinsics are for address space 0 only
00365       Intr->eraseFromParent();
00366       continue;
00367     case Intrinsic::memcpy: {
00368       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
00369       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
00370                            MemCpy->getLength(), MemCpy->getAlignment(),
00371                            MemCpy->isVolatile());
00372       Intr->eraseFromParent();
00373       continue;
00374     }
00375     case Intrinsic::memset: {
00376       MemSetInst *MemSet = cast<MemSetInst>(Intr);
00377       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
00378                            MemSet->getLength(), MemSet->getAlignment(),
00379                            MemSet->isVolatile());
00380       Intr->eraseFromParent();
00381       continue;
00382     }
00383     default:
00384       Intr->dump();
00385       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
00386     }
00387   }
00388 }
00389 
00390 FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
00391   return new AMDGPUPromoteAlloca(ST);
00392 }