LLVM API Documentation
//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

class AMDGPUPromoteAlloca : public FunctionPass,
                            public InstVisitor<AMDGPUPromoteAlloca> {

  static char ID;
  Module *Mod;
  const AMDGPUSubtarget &ST;
  int LocalMemAvailable;

public:
  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
                                                   LocalMemAvailable(0) { }
  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;
  const char *getPassName() const override { return "AMDGPU Promote Alloca"; }
  void visitAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

  const FunctionType *FTy = F.getFunctionType();

  LocalMemAvailable = ST.getLocalMemorySize();


  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    const Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is being used by global objects
    for (Module::global_iterator I = Mod->global_begin(),
                                 E = Mod->global_end(); I != E; ++I) {
      GlobalVariable *GV = I;
      PointerType *GVTy = GV->getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      for (Value::use_iterator U = GV->use_begin(),
                               UE = GV->use_end(); U != UE; ++U) {
        Instruction *Use = dyn_cast<Instruction>(*U);
        if (!Use)
          continue;
        if (Use->getParent()->getParent() == &F)
          LocalMemAvailable -=
            Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");

  visit(F);

  return false;
}

static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

static Value* calculateVectorIndex(Value *Ptr,
                                   std::map<GetElementPtrInst*, Value*> GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  return GEPIdx[GEP];
}

static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() != 3)
    return NULL;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return NULL;

  return GEP->getOperand(2);
}

// Not an instruction handled below to turn into a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::Store:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  default:
    return false;
  }
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {

    DEBUG(dbgs() << " Cannot convert type to vector");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser)))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << " Converting alloca to vector "
               << *AllocaTy << " -> " << *VectorTy << '\n');

  for (std::vector<Value*>::iterator I = WorkList.begin(),
                                     E = WorkList.end(); I != E; ++I) {
    Instruction *Inst = cast<Instruction>(*I);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *User : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
      continue;
    if (isa<CallInst>(User)) {
      WorkList.push_back(User);
      continue;
    }
    if (!User->getType()->isPointerTy())
      continue;
    WorkList.push_back(User);
    collectUsesWithPtrTypes(User, WorkList);
  }
}

void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size. We should try to get
  // value from the reqd_work_group_size function attribute if it is
  // available.
  unsigned WorkGroupSize = 256;
  int AllocaSize = WorkGroupSize *
      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);

  if (AllocaSize > LocalMemAvailable) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= AllocaSize;

  GlobalVariable *GV = new GlobalVariable(
      *Mod, ArrayType::get(I.getAllocatedType(), 256), false,
      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  FunctionType *FTy = FunctionType::get(
      Type::getInt32Ty(Mod->getContext()), false);
  AttributeSet AttrSet;
  AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);

  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.y", FTy, AttrSet);
  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.z", FTy, AttrSet);
  Value *ReadTIDIGX = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.x", FTy, AttrSet);
  Value *ReadTIDIGY = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.y", FTy, AttrSet);
  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.z", FTy, AttrSet);


  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
  Value *TIdX = Builder.CreateCall(ReadTIDIGX);
  Value *TIdY = Builder.CreateCall(ReadTIDIGY);
  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  Value *Offset = Builder.CreateGEP(GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  std::vector<Value*> WorkList;

  collectUsesWithPtrTypes(Offset, WorkList);

  for (std::vector<Value*>::iterator i = WorkList.begin(),
                                     e = WorkList.end(); i != e; ++i) {
    Value *V = *i;
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // The operand's value should be corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
           ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
                                             F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
  return new AMDGPUPromoteAlloca(ST);
}
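
The createAMDGPUPromoteAlloca() factory at the end of the file is the hook other parts of the backend use to construct this pass. Below is a minimal sketch, not part of AMDGPUPromoteAlloca.cpp, of how such a FunctionPass could be scheduled through the legacy pass manager; the helper name addPromoteAllocaPass and the surrounding setup are illustrative assumptions, and only createAMDGPUPromoteAlloca() (declared in AMDGPU.h) comes from this file.

// Minimal sketch, assuming a legacy pass manager and an already-constructed
// AMDGPUSubtarget. The helper below is hypothetical; it simply shows the pass
// being added so it runs over each function before instruction selection.
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/IR/LegacyPassManager.h"

static void addPromoteAllocaPass(llvm::legacy::PassManagerBase &PM,
                                 const AMDGPUSubtarget &ST) {
  PM.add(llvm::createAMDGPUPromoteAlloca(ST)); // factory defined in this file
}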