00001 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 /// \file 00010 /// This transformation implements the well known scalar replacement of 00011 /// aggregates transformation. It tries to identify promotable elements of an 00012 /// aggregate alloca, and promote them to registers. It will also try to 00013 /// convert uses of an element (or set of elements) of an alloca into a vector 00014 /// or bitfield-style integer scalar if appropriate. 00015 /// 00016 /// It works to do this with minimal slicing of the alloca so that regions 00017 /// which are merely transferred in and out of external memory remain unchanged 00018 /// and are not decomposed to scalar code. 00019 /// 00020 /// Because this also performs alloca promotion, it can be thought of as also 00021 /// serving the purpose of SSA formation. The algorithm iterates on the 00022 /// function until all opportunities for promotion have been realized. 00023 /// 00024 //===----------------------------------------------------------------------===// 00025 00026 #include "llvm/Transforms/Scalar.h" 00027 #include "llvm/ADT/STLExtras.h" 00028 #include "llvm/ADT/SetVector.h" 00029 #include "llvm/ADT/SmallVector.h" 00030 #include "llvm/ADT/Statistic.h" 00031 #include "llvm/Analysis/AssumptionTracker.h" 00032 #include "llvm/Analysis/Loads.h" 00033 #include "llvm/Analysis/PtrUseVisitor.h" 00034 #include "llvm/Analysis/ValueTracking.h" 00035 #include "llvm/IR/Constants.h" 00036 #include "llvm/IR/DIBuilder.h" 00037 #include "llvm/IR/DataLayout.h" 00038 #include "llvm/IR/DebugInfo.h" 00039 #include "llvm/IR/DerivedTypes.h" 00040 #include "llvm/IR/Dominators.h" 00041 #include "llvm/IR/Function.h" 00042 #include "llvm/IR/IRBuilder.h" 00043 #include "llvm/IR/InstVisitor.h" 00044 #include "llvm/IR/Instructions.h" 00045 #include "llvm/IR/IntrinsicInst.h" 00046 #include "llvm/IR/LLVMContext.h" 00047 #include "llvm/IR/Operator.h" 00048 #include "llvm/Pass.h" 00049 #include "llvm/Support/CommandLine.h" 00050 #include "llvm/Support/Compiler.h" 00051 #include "llvm/Support/Debug.h" 00052 #include "llvm/Support/ErrorHandling.h" 00053 #include "llvm/Support/MathExtras.h" 00054 #include "llvm/Support/TimeValue.h" 00055 #include "llvm/Support/raw_ostream.h" 00056 #include "llvm/Transforms/Utils/Local.h" 00057 #include "llvm/Transforms/Utils/PromoteMemToReg.h" 00058 #include "llvm/Transforms/Utils/SSAUpdater.h" 00059 00060 #if __cplusplus >= 201103L && !defined(NDEBUG) 00061 // We only use this for a debug check in C++11 00062 #include <random> 00063 #endif 00064 00065 using namespace llvm; 00066 00067 #define DEBUG_TYPE "sroa" 00068 00069 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 00070 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); 00071 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); 00072 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); 00073 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); 00074 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 00075 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 00076 STATISTIC(NumLoadsSpeculated, "Number of loads 
speculated to allow promotion"); 00077 STATISTIC(NumDeleted, "Number of instructions deleted"); 00078 STATISTIC(NumVectorized, "Number of vectorized aggregates"); 00079 00080 /// Hidden option to force the pass to not use DomTree and mem2reg, instead 00081 /// forming SSA values through the SSAUpdater infrastructure. 00082 static cl::opt<bool> 00083 ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); 00084 00085 /// Hidden option to enable randomly shuffling the slices to help uncover 00086 /// instability in their order. 00087 static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices", 00088 cl::init(false), cl::Hidden); 00089 00090 /// Hidden option to experiment with completely strict handling of inbounds 00091 /// GEPs. 00092 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", 00093 cl::init(false), cl::Hidden); 00094 00095 namespace { 00096 /// \brief A custom IRBuilder inserter which prefixes all names if they are 00097 /// preserved. 00098 template <bool preserveNames = true> 00099 class IRBuilderPrefixedInserter : 00100 public IRBuilderDefaultInserter<preserveNames> { 00101 std::string Prefix; 00102 00103 public: 00104 void SetNamePrefix(const Twine &P) { Prefix = P.str(); } 00105 00106 protected: 00107 void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, 00108 BasicBlock::iterator InsertPt) const { 00109 IRBuilderDefaultInserter<preserveNames>::InsertHelper( 00110 I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt); 00111 } 00112 }; 00113 00114 // Specialization for not preserving the name is trivial. 00115 template <> 00116 class IRBuilderPrefixedInserter<false> : 00117 public IRBuilderDefaultInserter<false> { 00118 public: 00119 void SetNamePrefix(const Twine &P) {} 00120 }; 00121 00122 /// \brief Provide a typedef for IRBuilder that drops names in release builds. 00123 #ifndef NDEBUG 00124 typedef llvm::IRBuilder<true, ConstantFolder, 00125 IRBuilderPrefixedInserter<true> > IRBuilderTy; 00126 #else 00127 typedef llvm::IRBuilder<false, ConstantFolder, 00128 IRBuilderPrefixedInserter<false> > IRBuilderTy; 00129 #endif 00130 } 00131 00132 namespace { 00133 /// \brief A used slice of an alloca. 00134 /// 00135 /// This structure represents a slice of an alloca used by some instruction. It 00136 /// stores both the begin and end offsets of this use, a pointer to the use 00137 /// itself, and a flag indicating whether we can classify the use as splittable 00138 /// or not when forming partitions of the alloca. 00139 class Slice { 00140 /// \brief The beginning offset of the range. 00141 uint64_t BeginOffset; 00142 00143 /// \brief The ending offset, not included in the range. 00144 uint64_t EndOffset; 00145 00146 /// \brief Storage for both the use of this slice and whether it can be 00147 /// split. 
00148 PointerIntPair<Use *, 1, bool> UseAndIsSplittable; 00149 00150 public: 00151 Slice() : BeginOffset(), EndOffset() {} 00152 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) 00153 : BeginOffset(BeginOffset), EndOffset(EndOffset), 00154 UseAndIsSplittable(U, IsSplittable) {} 00155 00156 uint64_t beginOffset() const { return BeginOffset; } 00157 uint64_t endOffset() const { return EndOffset; } 00158 00159 bool isSplittable() const { return UseAndIsSplittable.getInt(); } 00160 void makeUnsplittable() { UseAndIsSplittable.setInt(false); } 00161 00162 Use *getUse() const { return UseAndIsSplittable.getPointer(); } 00163 00164 bool isDead() const { return getUse() == nullptr; } 00165 void kill() { UseAndIsSplittable.setPointer(nullptr); } 00166 00167 /// \brief Support for ordering ranges. 00168 /// 00169 /// This provides an ordering over ranges such that start offsets are 00170 /// always increasing, and within equal start offsets, the end offsets are 00171 /// decreasing. Thus the spanning range comes first in a cluster with the 00172 /// same start position. 00173 bool operator<(const Slice &RHS) const { 00174 if (beginOffset() < RHS.beginOffset()) return true; 00175 if (beginOffset() > RHS.beginOffset()) return false; 00176 if (isSplittable() != RHS.isSplittable()) return !isSplittable(); 00177 if (endOffset() > RHS.endOffset()) return true; 00178 return false; 00179 } 00180 00181 /// \brief Support comparison with a single offset to allow binary searches. 00182 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, 00183 uint64_t RHSOffset) { 00184 return LHS.beginOffset() < RHSOffset; 00185 } 00186 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 00187 const Slice &RHS) { 00188 return LHSOffset < RHS.beginOffset(); 00189 } 00190 00191 bool operator==(const Slice &RHS) const { 00192 return isSplittable() == RHS.isSplittable() && 00193 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); 00194 } 00195 bool operator!=(const Slice &RHS) const { return !operator==(RHS); } 00196 }; 00197 } // end anonymous namespace 00198 00199 namespace llvm { 00200 template <typename T> struct isPodLike; 00201 template <> struct isPodLike<Slice> { 00202 static const bool value = true; 00203 }; 00204 } 00205 00206 namespace { 00207 /// \brief Representation of the alloca slices. 00208 /// 00209 /// This class represents the slices of an alloca which are formed by its 00210 /// various uses. If a pointer escapes, we can't fully build a representation 00211 /// for the slices used and we reflect that in this structure. The uses are 00212 /// stored, sorted by increasing beginning offset and with unsplittable slices 00213 /// starting at a particular offset before splittable slices. 00214 class AllocaSlices { 00215 public: 00216 /// \brief Construct the slices of a particular alloca. 00217 AllocaSlices(const DataLayout &DL, AllocaInst &AI); 00218 00219 /// \brief Test whether a pointer to the allocation escapes our analysis. 00220 /// 00221 /// If this is true, the slices are never fully built and should be 00222 /// ignored. 00223 bool isEscaped() const { return PointerEscapingInstr; } 00224 00225 /// \brief Support for iterating over the slices. 
00226 /// @{ 00227 typedef SmallVectorImpl<Slice>::iterator iterator; 00228 iterator begin() { return Slices.begin(); } 00229 iterator end() { return Slices.end(); } 00230 00231 typedef SmallVectorImpl<Slice>::const_iterator const_iterator; 00232 const_iterator begin() const { return Slices.begin(); } 00233 const_iterator end() const { return Slices.end(); } 00234 /// @} 00235 00236 /// \brief Allow iterating the dead users for this alloca. 00237 /// 00238 /// These are instructions which will never actually use the alloca as they 00239 /// are outside the allocated range. They are safe to replace with undef and 00240 /// delete. 00241 /// @{ 00242 typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator; 00243 dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); } 00244 dead_user_iterator dead_user_end() const { return DeadUsers.end(); } 00245 /// @} 00246 00247 /// \brief Allow iterating the dead expressions referring to this alloca. 00248 /// 00249 /// These are operands which have cannot actually be used to refer to the 00250 /// alloca as they are outside its range and the user doesn't correct for 00251 /// that. These mostly consist of PHI node inputs and the like which we just 00252 /// need to replace with undef. 00253 /// @{ 00254 typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator; 00255 dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); } 00256 dead_op_iterator dead_op_end() const { return DeadOperands.end(); } 00257 /// @} 00258 00259 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00260 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 00261 void printSlice(raw_ostream &OS, const_iterator I, 00262 StringRef Indent = " ") const; 00263 void printUse(raw_ostream &OS, const_iterator I, 00264 StringRef Indent = " ") const; 00265 void print(raw_ostream &OS) const; 00266 void dump(const_iterator I) const; 00267 void dump() const; 00268 #endif 00269 00270 private: 00271 template <typename DerivedT, typename RetT = void> class BuilderBase; 00272 class SliceBuilder; 00273 friend class AllocaSlices::SliceBuilder; 00274 00275 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00276 /// \brief Handle to alloca instruction to simplify method interfaces. 00277 AllocaInst &AI; 00278 #endif 00279 00280 /// \brief The instruction responsible for this alloca not having a known set 00281 /// of slices. 00282 /// 00283 /// When an instruction (potentially) escapes the pointer to the alloca, we 00284 /// store a pointer to that here and abort trying to form slices of the 00285 /// alloca. This will be null if the alloca slices are analyzed successfully. 00286 Instruction *PointerEscapingInstr; 00287 00288 /// \brief The slices of the alloca. 00289 /// 00290 /// We store a vector of the slices formed by uses of the alloca here. This 00291 /// vector is sorted by increasing begin offset, and then the unsplittable 00292 /// slices before the splittable ones. See the Slice inner class for more 00293 /// details. 00294 SmallVector<Slice, 8> Slices; 00295 00296 /// \brief Instructions which will become dead if we rewrite the alloca. 00297 /// 00298 /// Note that these are not separated by slice. This is because we expect an 00299 /// alloca to be completely rewritten or not rewritten at all. If rewritten, 00300 /// all these instructions can simply be removed and replaced with undef as 00301 /// they come from outside of the allocated space. 
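  ///
  /// For example (an illustrative sketch, not taken from a real test case):
  ///   %a = alloca [4 x i8]
  ///   %p = getelementptr inbounds [4 x i8]* %a, i64 0, i64 8
  ///   %v = load i8* %p
  /// The load reads entirely outside the 4 allocated bytes, so it would be
  /// recorded here rather than contributing a slice.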
00302 SmallVector<Instruction *, 8> DeadUsers; 00303 00304 /// \brief Operands which will become dead if we rewrite the alloca. 00305 /// 00306 /// These are operands that in their particular use can be replaced with 00307 /// undef when we rewrite the alloca. These show up in out-of-bounds inputs 00308 /// to PHI nodes and the like. They aren't entirely dead (there might be 00309 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 00310 /// want to swap this particular input for undef to simplify the use lists of 00311 /// the alloca. 00312 SmallVector<Use *, 8> DeadOperands; 00313 }; 00314 } 00315 00316 static Value *foldSelectInst(SelectInst &SI) { 00317 // If the condition being selected on is a constant or the same value is 00318 // being selected between, fold the select. Yes this does (rarely) happen 00319 // early on. 00320 if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition())) 00321 return SI.getOperand(1+CI->isZero()); 00322 if (SI.getOperand(1) == SI.getOperand(2)) 00323 return SI.getOperand(1); 00324 00325 return nullptr; 00326 } 00327 00328 /// \brief A helper that folds a PHI node or a select. 00329 static Value *foldPHINodeOrSelectInst(Instruction &I) { 00330 if (PHINode *PN = dyn_cast<PHINode>(&I)) { 00331 // If PN merges together the same value, return that value. 00332 return PN->hasConstantValue(); 00333 } 00334 return foldSelectInst(cast<SelectInst>(I)); 00335 } 00336 00337 /// \brief Builder for the alloca slices. 00338 /// 00339 /// This class builds a set of alloca slices by recursively visiting the uses 00340 /// of an alloca and making a slice for each load and store at each offset. 00341 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { 00342 friend class PtrUseVisitor<SliceBuilder>; 00343 friend class InstVisitor<SliceBuilder>; 00344 typedef PtrUseVisitor<SliceBuilder> Base; 00345 00346 const uint64_t AllocSize; 00347 AllocaSlices &S; 00348 00349 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; 00350 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; 00351 00352 /// \brief Set to de-duplicate dead instructions found in the use walk. 00353 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 00354 00355 public: 00356 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S) 00357 : PtrUseVisitor<SliceBuilder>(DL), 00358 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {} 00359 00360 private: 00361 void markAsDead(Instruction &I) { 00362 if (VisitedDeadInsts.insert(&I)) 00363 S.DeadUsers.push_back(&I); 00364 } 00365 00366 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 00367 bool IsSplittable = false) { 00368 // Completely skip uses which have a zero size or start either before or 00369 // past the end of the allocation. 00370 if (Size == 0 || Offset.uge(AllocSize)) { 00371 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset 00372 << " which has zero size or starts outside of the " 00373 << AllocSize << " byte alloca:\n" 00374 << " alloca: " << S.AI << "\n" 00375 << " use: " << I << "\n"); 00376 return markAsDead(I); 00377 } 00378 00379 uint64_t BeginOffset = Offset.getZExtValue(); 00380 uint64_t EndOffset = BeginOffset + Size; 00381 00382 // Clamp the end offset to the end of the allocation. Note that this is 00383 // formulated to handle even the case where "BeginOffset + Size" overflows. 00384 // This may appear superficially to be something we could ignore entirely, 00385 // but that is not so! 
There may be widened loads or PHI-node uses where 00386 // some instructions are dead but not others. We can't completely ignore 00387 // them, and so have to record at least the information here. 00388 assert(AllocSize >= BeginOffset); // Established above. 00389 if (Size > AllocSize - BeginOffset) { 00390 DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset 00391 << " to remain within the " << AllocSize << " byte alloca:\n" 00392 << " alloca: " << S.AI << "\n" 00393 << " use: " << I << "\n"); 00394 EndOffset = AllocSize; 00395 } 00396 00397 S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); 00398 } 00399 00400 void visitBitCastInst(BitCastInst &BC) { 00401 if (BC.use_empty()) 00402 return markAsDead(BC); 00403 00404 return Base::visitBitCastInst(BC); 00405 } 00406 00407 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 00408 if (GEPI.use_empty()) 00409 return markAsDead(GEPI); 00410 00411 if (SROAStrictInbounds && GEPI.isInBounds()) { 00412 // FIXME: This is a manually un-factored variant of the basic code inside 00413 // of GEPs with checking of the inbounds invariant specified in the 00414 // langref in a very strict sense. If we ever want to enable 00415 // SROAStrictInbounds, this code should be factored cleanly into 00416 // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds 00417 // by writing out the code here where we have tho underlying allocation 00418 // size readily available. 00419 APInt GEPOffset = Offset; 00420 for (gep_type_iterator GTI = gep_type_begin(GEPI), 00421 GTE = gep_type_end(GEPI); 00422 GTI != GTE; ++GTI) { 00423 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 00424 if (!OpC) 00425 break; 00426 00427 // Handle a struct index, which adds its field offset to the pointer. 00428 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 00429 unsigned ElementIdx = OpC->getZExtValue(); 00430 const StructLayout *SL = DL.getStructLayout(STy); 00431 GEPOffset += 00432 APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); 00433 } else { 00434 // For array or vector indices, scale the index by the size of the type. 00435 APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); 00436 GEPOffset += Index * APInt(Offset.getBitWidth(), 00437 DL.getTypeAllocSize(GTI.getIndexedType())); 00438 } 00439 00440 // If this index has computed an intermediate pointer which is not 00441 // inbounds, then the result of the GEP is a poison value and we can 00442 // delete it and all uses. 00443 if (GEPOffset.ugt(AllocSize)) 00444 return markAsDead(GEPI); 00445 } 00446 } 00447 00448 return Base::visitGetElementPtrInst(GEPI); 00449 } 00450 00451 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, 00452 uint64_t Size, bool IsVolatile) { 00453 // We allow splitting of loads and stores where the type is an integer type 00454 // and cover the entire alloca. This prevents us from splitting over 00455 // eagerly. 00456 // FIXME: In the great blue eventually, we should eagerly split all integer 00457 // loads and stores, and then have a separate step that merges adjacent 00458 // alloca partitions into a single partition suitable for integer widening. 00459 // Or we should skip the merge step and rely on GVN and other passes to 00460 // merge adjacent loads and stores that survive mem2reg. 
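    // As an illustrative example (hypothetical IR): for "%a = alloca i64"
    // (an 8 byte alloca), a non-volatile "load i64* %a" covers the whole
    // alloca starting at offset 0 and is treated as splittable, while a
    // "load i32* %p" of just the first 4 bytes is not.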
00461 bool IsSplittable = 00462 Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; 00463 00464 insertUse(I, Offset, Size, IsSplittable); 00465 } 00466 00467 void visitLoadInst(LoadInst &LI) { 00468 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 00469 "All simple FCA loads should have been pre-split"); 00470 00471 if (!IsOffsetKnown) 00472 return PI.setAborted(&LI); 00473 00474 uint64_t Size = DL.getTypeStoreSize(LI.getType()); 00475 return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); 00476 } 00477 00478 void visitStoreInst(StoreInst &SI) { 00479 Value *ValOp = SI.getValueOperand(); 00480 if (ValOp == *U) 00481 return PI.setEscapedAndAborted(&SI); 00482 if (!IsOffsetKnown) 00483 return PI.setAborted(&SI); 00484 00485 uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); 00486 00487 // If this memory access can be shown to *statically* extend outside the 00488 // bounds of of the allocation, it's behavior is undefined, so simply 00489 // ignore it. Note that this is more strict than the generic clamping 00490 // behavior of insertUse. We also try to handle cases which might run the 00491 // risk of overflow. 00492 // FIXME: We should instead consider the pointer to have escaped if this 00493 // function is being instrumented for addressing bugs or race conditions. 00494 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { 00495 DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset 00496 << " which extends past the end of the " << AllocSize 00497 << " byte alloca:\n" 00498 << " alloca: " << S.AI << "\n" 00499 << " use: " << SI << "\n"); 00500 return markAsDead(SI); 00501 } 00502 00503 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 00504 "All simple FCA stores should have been pre-split"); 00505 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); 00506 } 00507 00508 00509 void visitMemSetInst(MemSetInst &II) { 00510 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 00511 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 00512 if ((Length && Length->getValue() == 0) || 00513 (IsOffsetKnown && Offset.uge(AllocSize))) 00514 // Zero-length mem transfer intrinsics can be ignored entirely. 00515 return markAsDead(II); 00516 00517 if (!IsOffsetKnown) 00518 return PI.setAborted(&II); 00519 00520 insertUse(II, Offset, 00521 Length ? Length->getLimitedValue() 00522 : AllocSize - Offset.getLimitedValue(), 00523 (bool)Length); 00524 } 00525 00526 void visitMemTransferInst(MemTransferInst &II) { 00527 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 00528 if (Length && Length->getValue() == 0) 00529 // Zero-length mem transfer intrinsics can be ignored entirely. 00530 return markAsDead(II); 00531 00532 // Because we can visit these intrinsics twice, also check to see if the 00533 // first time marked this instruction as dead. If so, skip it. 00534 if (VisitedDeadInsts.count(&II)) 00535 return; 00536 00537 if (!IsOffsetKnown) 00538 return PI.setAborted(&II); 00539 00540 // This side of the transfer is completely out-of-bounds, and so we can 00541 // nuke the entire transfer. However, we also need to nuke the other side 00542 // if already added to our partitions. 00543 // FIXME: Yet another place we really should bypass this when 00544 // instrumenting for ASan. 
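    // A hypothetical sketch of the situation handled here (offsets invented
    // for illustration):
    //   %a   = alloca [4 x i8]
    //   %src = getelementptr inbounds [4 x i8]* %a, i64 0, i64 0
    //   %dst = getelementptr inbounds [4 x i8]* %a, i64 0, i64 8
    //   call void @llvm.memcpy...(i8* %dst, i8* %src, i32 4, ...)
    // The destination begins at offset 8 of a 4 byte alloca, so the transfer
    // is dead; if the in-bounds source side was already recorded as a slice,
    // that slice is killed as well.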
00545 if (Offset.uge(AllocSize)) { 00546 SmallDenseMap<Instruction *, unsigned>::iterator MTPI = MemTransferSliceMap.find(&II); 00547 if (MTPI != MemTransferSliceMap.end()) 00548 S.Slices[MTPI->second].kill(); 00549 return markAsDead(II); 00550 } 00551 00552 uint64_t RawOffset = Offset.getLimitedValue(); 00553 uint64_t Size = Length ? Length->getLimitedValue() 00554 : AllocSize - RawOffset; 00555 00556 // Check for the special case where the same exact value is used for both 00557 // source and dest. 00558 if (*U == II.getRawDest() && *U == II.getRawSource()) { 00559 // For non-volatile transfers this is a no-op. 00560 if (!II.isVolatile()) 00561 return markAsDead(II); 00562 00563 return insertUse(II, Offset, Size, /*IsSplittable=*/false); 00564 } 00565 00566 // If we have seen both source and destination for a mem transfer, then 00567 // they both point to the same alloca. 00568 bool Inserted; 00569 SmallDenseMap<Instruction *, unsigned>::iterator MTPI; 00570 std::tie(MTPI, Inserted) = 00571 MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size())); 00572 unsigned PrevIdx = MTPI->second; 00573 if (!Inserted) { 00574 Slice &PrevP = S.Slices[PrevIdx]; 00575 00576 // Check if the begin offsets match and this is a non-volatile transfer. 00577 // In that case, we can completely elide the transfer. 00578 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { 00579 PrevP.kill(); 00580 return markAsDead(II); 00581 } 00582 00583 // Otherwise we have an offset transfer within the same alloca. We can't 00584 // split those. 00585 PrevP.makeUnsplittable(); 00586 } 00587 00588 // Insert the use now that we've fixed up the splittable nature. 00589 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); 00590 00591 // Check that we ended up with a valid index in the map. 00592 assert(S.Slices[PrevIdx].getUse()->getUser() == &II && 00593 "Map index doesn't point back to a slice with this user."); 00594 } 00595 00596 // Disable SRoA for any intrinsics except for lifetime invariants. 00597 // FIXME: What about debug intrinsics? This matches old behavior, but 00598 // doesn't make sense. 00599 void visitIntrinsicInst(IntrinsicInst &II) { 00600 if (!IsOffsetKnown) 00601 return PI.setAborted(&II); 00602 00603 if (II.getIntrinsicID() == Intrinsic::lifetime_start || 00604 II.getIntrinsicID() == Intrinsic::lifetime_end) { 00605 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 00606 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 00607 Length->getLimitedValue()); 00608 insertUse(II, Offset, Size, true); 00609 return; 00610 } 00611 00612 Base::visitIntrinsicInst(II); 00613 } 00614 00615 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 00616 // We consider any PHI or select that results in a direct load or store of 00617 // the same offset to be a viable use for slicing purposes. These uses 00618 // are considered unsplittable and the size is the maximum loaded or stored 00619 // size. 00620 SmallPtrSet<Instruction *, 4> Visited; 00621 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 00622 Visited.insert(Root); 00623 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 00624 // If there are no loads or stores, the access is dead. We mark that as 00625 // a size zero access. 
00626 Size = 0; 00627 do { 00628 Instruction *I, *UsedI; 00629 std::tie(UsedI, I) = Uses.pop_back_val(); 00630 00631 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 00632 Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); 00633 continue; 00634 } 00635 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 00636 Value *Op = SI->getOperand(0); 00637 if (Op == UsedI) 00638 return SI; 00639 Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); 00640 continue; 00641 } 00642 00643 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 00644 if (!GEP->hasAllZeroIndices()) 00645 return GEP; 00646 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 00647 !isa<SelectInst>(I)) { 00648 return I; 00649 } 00650 00651 for (User *U : I->users()) 00652 if (Visited.insert(cast<Instruction>(U))) 00653 Uses.push_back(std::make_pair(I, cast<Instruction>(U))); 00654 } while (!Uses.empty()); 00655 00656 return nullptr; 00657 } 00658 00659 void visitPHINodeOrSelectInst(Instruction &I) { 00660 assert(isa<PHINode>(I) || isa<SelectInst>(I)); 00661 if (I.use_empty()) 00662 return markAsDead(I); 00663 00664 // TODO: We could use SimplifyInstruction here to fold PHINodes and 00665 // SelectInsts. However, doing so requires to change the current 00666 // dead-operand-tracking mechanism. For instance, suppose neither loading 00667 // from %U nor %other traps. Then "load (select undef, %U, %other)" does not 00668 // trap either. However, if we simply replace %U with undef using the 00669 // current dead-operand-tracking mechanism, "load (select undef, undef, 00670 // %other)" may trap because the select may return the first operand 00671 // "undef". 00672 if (Value *Result = foldPHINodeOrSelectInst(I)) { 00673 if (Result == *U) 00674 // If the result of the constant fold will be the pointer, recurse 00675 // through the PHI/select as if we had RAUW'ed it. 00676 enqueueUsers(I); 00677 else 00678 // Otherwise the operand to the PHI/select is dead, and we can replace 00679 // it with undef. 00680 S.DeadOperands.push_back(U); 00681 00682 return; 00683 } 00684 00685 if (!IsOffsetKnown) 00686 return PI.setAborted(&I); 00687 00688 // See if we already have computed info on this node. 00689 uint64_t &Size = PHIOrSelectSizes[&I]; 00690 if (!Size) { 00691 // This is a new PHI/Select, check for an unsafe use of it. 00692 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size)) 00693 return PI.setAborted(UnsafeI); 00694 } 00695 00696 // For PHI and select operands outside the alloca, we can't nuke the entire 00697 // phi or select -- the other side might still be relevant, so we special 00698 // case them here and use a separate structure to track the operands 00699 // themselves which should be replaced with undef. 00700 // FIXME: This should instead be escaped in the event we're instrumenting 00701 // for address sanitization. 00702 if (Offset.uge(AllocSize)) { 00703 S.DeadOperands.push_back(U); 00704 return; 00705 } 00706 00707 insertUse(I, Offset, Size); 00708 } 00709 00710 void visitPHINode(PHINode &PN) { 00711 visitPHINodeOrSelectInst(PN); 00712 } 00713 00714 void visitSelectInst(SelectInst &SI) { 00715 visitPHINodeOrSelectInst(SI); 00716 } 00717 00718 /// \brief Disable SROA entirely if there are unhandled users of the alloca. 
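  ///
  /// For instance (an illustrative case), an "atomicrmw" performed directly
  /// on the alloca pointer has no dedicated visitor above, so it lands here
  /// and aborts the analysis of the whole alloca.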
00719 void visitInstruction(Instruction &I) { 00720 PI.setAborted(&I); 00721 } 00722 }; 00723 00724 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) 00725 : 00726 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00727 AI(AI), 00728 #endif 00729 PointerEscapingInstr(nullptr) { 00730 SliceBuilder PB(DL, AI, *this); 00731 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); 00732 if (PtrI.isEscaped() || PtrI.isAborted()) { 00733 // FIXME: We should sink the escape vs. abort info into the caller nicely, 00734 // possibly by just storing the PtrInfo in the AllocaSlices. 00735 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 00736 : PtrI.getAbortingInst(); 00737 assert(PointerEscapingInstr && "Did not track a bad instruction"); 00738 return; 00739 } 00740 00741 Slices.erase(std::remove_if(Slices.begin(), Slices.end(), 00742 std::mem_fun_ref(&Slice::isDead)), 00743 Slices.end()); 00744 00745 #if __cplusplus >= 201103L && !defined(NDEBUG) 00746 if (SROARandomShuffleSlices) { 00747 std::mt19937 MT(static_cast<unsigned>(sys::TimeValue::now().msec())); 00748 std::shuffle(Slices.begin(), Slices.end(), MT); 00749 } 00750 #endif 00751 00752 // Sort the uses. This arranges for the offsets to be in ascending order, 00753 // and the sizes to be in descending order. 00754 std::sort(Slices.begin(), Slices.end()); 00755 } 00756 00757 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00758 00759 void AllocaSlices::print(raw_ostream &OS, const_iterator I, 00760 StringRef Indent) const { 00761 printSlice(OS, I, Indent); 00762 printUse(OS, I, Indent); 00763 } 00764 00765 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, 00766 StringRef Indent) const { 00767 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" 00768 << " slice #" << (I - begin()) 00769 << (I->isSplittable() ? " (splittable)" : "") << "\n"; 00770 } 00771 00772 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, 00773 StringRef Indent) const { 00774 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; 00775 } 00776 00777 void AllocaSlices::print(raw_ostream &OS) const { 00778 if (PointerEscapingInstr) { 00779 OS << "Can't analyze slices for alloca: " << AI << "\n" 00780 << " A pointer to this alloca escaped by:\n" 00781 << " " << *PointerEscapingInstr << "\n"; 00782 return; 00783 } 00784 00785 OS << "Slices of alloca: " << AI << "\n"; 00786 for (const_iterator I = begin(), E = end(); I != E; ++I) 00787 print(OS, I); 00788 } 00789 00790 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { 00791 print(dbgs(), I); 00792 } 00793 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } 00794 00795 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00796 00797 namespace { 00798 /// \brief Implementation of LoadAndStorePromoter for promoting allocas. 00799 /// 00800 /// This subclass of LoadAndStorePromoter adds overrides to handle promoting 00801 /// the loads and stores of an alloca instruction, as well as updating its 00802 /// debug information. This is used when a domtree is unavailable and thus 00803 /// mem2reg in its full form can't be used to handle promotion of allocas to 00804 /// scalar values. 
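///
/// A minimal usage sketch (names are placeholders, not code from this file):
///   SSAUpdater SSA;
///   DIBuilder DIB(M);
///   AllocaPromoter(Insts, SSA, AI, DIB).run(Insts);
/// where Insts is the list of loads and stores of AI collected by the caller.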
00805 class AllocaPromoter : public LoadAndStorePromoter { 00806 AllocaInst &AI; 00807 DIBuilder &DIB; 00808 00809 SmallVector<DbgDeclareInst *, 4> DDIs; 00810 SmallVector<DbgValueInst *, 4> DVIs; 00811 00812 public: 00813 AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S, 00814 AllocaInst &AI, DIBuilder &DIB) 00815 : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {} 00816 00817 void run(const SmallVectorImpl<Instruction*> &Insts) { 00818 // Retain the debug information attached to the alloca for use when 00819 // rewriting loads and stores. 00820 if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) { 00821 for (User *U : DebugNode->users()) 00822 if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U)) 00823 DDIs.push_back(DDI); 00824 else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U)) 00825 DVIs.push_back(DVI); 00826 } 00827 00828 LoadAndStorePromoter::run(Insts); 00829 00830 // While we have the debug information, clear it off of the alloca. The 00831 // caller takes care of deleting the alloca. 00832 while (!DDIs.empty()) 00833 DDIs.pop_back_val()->eraseFromParent(); 00834 while (!DVIs.empty()) 00835 DVIs.pop_back_val()->eraseFromParent(); 00836 } 00837 00838 bool isInstInList(Instruction *I, 00839 const SmallVectorImpl<Instruction*> &Insts) const override { 00840 Value *Ptr; 00841 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 00842 Ptr = LI->getOperand(0); 00843 else 00844 Ptr = cast<StoreInst>(I)->getPointerOperand(); 00845 00846 // Only used to detect cycles, which will be rare and quickly found as 00847 // we're walking up a chain of defs rather than down through uses. 00848 SmallPtrSet<Value *, 4> Visited; 00849 00850 do { 00851 if (Ptr == &AI) 00852 return true; 00853 00854 if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr)) 00855 Ptr = BCI->getOperand(0); 00856 else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) 00857 Ptr = GEPI->getPointerOperand(); 00858 else 00859 return false; 00860 00861 } while (Visited.insert(Ptr)); 00862 00863 return false; 00864 } 00865 00866 void updateDebugInfo(Instruction *Inst) const override { 00867 for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(), 00868 E = DDIs.end(); I != E; ++I) { 00869 DbgDeclareInst *DDI = *I; 00870 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) 00871 ConvertDebugDeclareToDebugValue(DDI, SI, DIB); 00872 else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) 00873 ConvertDebugDeclareToDebugValue(DDI, LI, DIB); 00874 } 00875 for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(), 00876 E = DVIs.end(); I != E; ++I) { 00877 DbgValueInst *DVI = *I; 00878 Value *Arg = nullptr; 00879 if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { 00880 // If an argument is zero extended then use argument directly. The ZExt 00881 // may be zapped by an optimization pass in future. 
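        // For example (hypothetical IR):
        //   %w = zext i8 %arg to i32
        //   store i32 %w, i32* %a
        // would emit a llvm.dbg.value describing the incoming argument %arg
        // rather than the possibly short-lived %w.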
00882 if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) 00883 Arg = dyn_cast<Argument>(ZExt->getOperand(0)); 00884 else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) 00885 Arg = dyn_cast<Argument>(SExt->getOperand(0)); 00886 if (!Arg) 00887 Arg = SI->getValueOperand(); 00888 } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { 00889 Arg = LI->getPointerOperand(); 00890 } else { 00891 continue; 00892 } 00893 Instruction *DbgVal = 00894 DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()), 00895 Inst); 00896 DbgVal->setDebugLoc(DVI->getDebugLoc()); 00897 } 00898 } 00899 }; 00900 } // end anon namespace 00901 00902 00903 namespace { 00904 /// \brief An optimization pass providing Scalar Replacement of Aggregates. 00905 /// 00906 /// This pass takes allocations which can be completely analyzed (that is, they 00907 /// don't escape) and tries to turn them into scalar SSA values. There are 00908 /// a few steps to this process. 00909 /// 00910 /// 1) It takes allocations of aggregates and analyzes the ways in which they 00911 /// are used to try to split them into smaller allocations, ideally of 00912 /// a single scalar data type. It will split up memcpy and memset accesses 00913 /// as necessary and try to isolate individual scalar accesses. 00914 /// 2) It will transform accesses into forms which are suitable for SSA value 00915 /// promotion. This can be replacing a memset with a scalar store of an 00916 /// integer value, or it can involve speculating operations on a PHI or 00917 /// select to be a PHI or select of the results. 00918 /// 3) Finally, this will try to detect a pattern of accesses which map cleanly 00919 /// onto insert and extract operations on a vector value, and convert them to 00920 /// this form. By doing so, it will enable promotion of vector aggregates to 00921 /// SSA vector values. 00922 class SROA : public FunctionPass { 00923 const bool RequiresDomTree; 00924 00925 LLVMContext *C; 00926 const DataLayout *DL; 00927 DominatorTree *DT; 00928 AssumptionTracker *AT; 00929 00930 /// \brief Worklist of alloca instructions to simplify. 00931 /// 00932 /// Each alloca in the function is added to this. Each new alloca formed gets 00933 /// added to it as well to recursively simplify unless that alloca can be 00934 /// directly promoted. Finally, each time we rewrite a use of an alloca other 00935 /// the one being actively rewritten, we add it back onto the list if not 00936 /// already present to ensure it is re-visited. 00937 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist; 00938 00939 /// \brief A collection of instructions to delete. 00940 /// We try to batch deletions to simplify code and make things a bit more 00941 /// efficient. 00942 SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts; 00943 00944 /// \brief Post-promotion worklist. 00945 /// 00946 /// Sometimes we discover an alloca which has a high probability of becoming 00947 /// viable for SROA after a round of promotion takes place. In those cases, 00948 /// the alloca is enqueued here for re-processing. 00949 /// 00950 /// Note that we have to be very careful to clear allocas out of this list in 00951 /// the event they are deleted. 00952 SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist; 00953 00954 /// \brief A collection of alloca instructions we can directly promote. 00955 std::vector<AllocaInst *> PromotableAllocas; 00956 00957 /// \brief A worklist of PHIs to speculate prior to promoting allocas. 
00958 /// 00959 /// All of these PHIs have been checked for the safety of speculation and by 00960 /// being speculated will allow promoting allocas currently in the promotable 00961 /// queue. 00962 SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs; 00963 00964 /// \brief A worklist of select instructions to speculate prior to promoting 00965 /// allocas. 00966 /// 00967 /// All of these select instructions have been checked for the safety of 00968 /// speculation and by being speculated will allow promoting allocas 00969 /// currently in the promotable queue. 00970 SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects; 00971 00972 public: 00973 SROA(bool RequiresDomTree = true) 00974 : FunctionPass(ID), RequiresDomTree(RequiresDomTree), 00975 C(nullptr), DL(nullptr), DT(nullptr) { 00976 initializeSROAPass(*PassRegistry::getPassRegistry()); 00977 } 00978 bool runOnFunction(Function &F) override; 00979 void getAnalysisUsage(AnalysisUsage &AU) const override; 00980 00981 const char *getPassName() const override { return "SROA"; } 00982 static char ID; 00983 00984 private: 00985 friend class PHIOrSelectSpeculator; 00986 friend class AllocaSliceRewriter; 00987 00988 bool rewritePartition(AllocaInst &AI, AllocaSlices &S, 00989 AllocaSlices::iterator B, AllocaSlices::iterator E, 00990 int64_t BeginOffset, int64_t EndOffset, 00991 ArrayRef<AllocaSlices::iterator> SplitUses); 00992 bool splitAlloca(AllocaInst &AI, AllocaSlices &S); 00993 bool runOnAlloca(AllocaInst &AI); 00994 void clobberUse(Use &U); 00995 void deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas); 00996 bool promoteAllocas(Function &F); 00997 }; 00998 } 00999 01000 char SROA::ID = 0; 01001 01002 FunctionPass *llvm::createSROAPass(bool RequiresDomTree) { 01003 return new SROA(RequiresDomTree); 01004 } 01005 01006 INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates", 01007 false, false) 01008 INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) 01009 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 01010 INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates", 01011 false, false) 01012 01013 /// Walk the range of a partitioning looking for a common type to cover this 01014 /// sequence of slices. 01015 static Type *findCommonType(AllocaSlices::const_iterator B, 01016 AllocaSlices::const_iterator E, 01017 uint64_t EndOffset) { 01018 Type *Ty = nullptr; 01019 bool TyIsCommon = true; 01020 IntegerType *ITy = nullptr; 01021 01022 // Note that we need to look at *every* alloca slice's Use to ensure we 01023 // always get consistent results regardless of the order of slices. 01024 for (AllocaSlices::const_iterator I = B; I != E; ++I) { 01025 Use *U = I->getUse(); 01026 if (isa<IntrinsicInst>(*U->getUser())) 01027 continue; 01028 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) 01029 continue; 01030 01031 Type *UserTy = nullptr; 01032 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 01033 UserTy = LI->getType(); 01034 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 01035 UserTy = SI->getValueOperand()->getType(); 01036 } 01037 01038 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { 01039 // If the type is larger than the partition, skip it. We only encounter 01040 // this for split integer operations where we want to use the type of the 01041 // entity causing the split. Also skip if the type is not a byte width 01042 // multiple. 
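      // For example (hypothetical numbers): over a 4 byte partition, an i64
      // user (8 bytes) is skipped, and so is an i20 user (20 % 8 != 0), but
      // an i16 user is considered and may become the fallback integer type.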
01043 if (UserITy->getBitWidth() % 8 != 0 || 01044 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) 01045 continue; 01046 01047 // Track the largest bitwidth integer type used in this way in case there 01048 // is no common type. 01049 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) 01050 ITy = UserITy; 01051 } 01052 01053 // To avoid depending on the order of slices, Ty and TyIsCommon must not 01054 // depend on types skipped above. 01055 if (!UserTy || (Ty && Ty != UserTy)) 01056 TyIsCommon = false; // Give up on anything but an iN type. 01057 else 01058 Ty = UserTy; 01059 } 01060 01061 return TyIsCommon ? Ty : ITy; 01062 } 01063 01064 /// PHI instructions that use an alloca and are subsequently loaded can be 01065 /// rewritten to load both input pointers in the pred blocks and then PHI the 01066 /// results, allowing the load of the alloca to be promoted. 01067 /// From this: 01068 /// %P2 = phi [i32* %Alloca, i32* %Other] 01069 /// %V = load i32* %P2 01070 /// to: 01071 /// %V1 = load i32* %Alloca -> will be mem2reg'd 01072 /// ... 01073 /// %V2 = load i32* %Other 01074 /// ... 01075 /// %V = phi [i32 %V1, i32 %V2] 01076 /// 01077 /// We can do this to a select if its only uses are loads and if the operands 01078 /// to the select can be loaded unconditionally. 01079 /// 01080 /// FIXME: This should be hoisted into a generic utility, likely in 01081 /// Transforms/Util/Local.h 01082 static bool isSafePHIToSpeculate(PHINode &PN, 01083 const DataLayout *DL = nullptr) { 01084 // For now, we can only do this promotion if the load is in the same block 01085 // as the PHI, and if there are no stores between the phi and load. 01086 // TODO: Allow recursive phi users. 01087 // TODO: Allow stores. 01088 BasicBlock *BB = PN.getParent(); 01089 unsigned MaxAlign = 0; 01090 bool HaveLoad = false; 01091 for (User *U : PN.users()) { 01092 LoadInst *LI = dyn_cast<LoadInst>(U); 01093 if (!LI || !LI->isSimple()) 01094 return false; 01095 01096 // For now we only allow loads in the same block as the PHI. This is 01097 // a common case that happens when instcombine merges two loads through 01098 // a PHI. 01099 if (LI->getParent() != BB) 01100 return false; 01101 01102 // Ensure that there are no instructions between the PHI and the load that 01103 // could store. 01104 for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI) 01105 if (BBI->mayWriteToMemory()) 01106 return false; 01107 01108 MaxAlign = std::max(MaxAlign, LI->getAlignment()); 01109 HaveLoad = true; 01110 } 01111 01112 if (!HaveLoad) 01113 return false; 01114 01115 // We can only transform this if it is safe to push the loads into the 01116 // predecessor blocks. The only thing to watch out for is that we can't put 01117 // a possibly trapping load in the predecessor if it is a critical edge. 01118 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 01119 TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator(); 01120 Value *InVal = PN.getIncomingValue(Idx); 01121 01122 // If the value is produced by the terminator of the predecessor (an 01123 // invoke) or it has side-effects, there is no valid place to put a load 01124 // in the predecessor. 01125 if (TI == InVal || TI->mayHaveSideEffects()) 01126 return false; 01127 01128 // If the predecessor has a single successor, then the edge isn't 01129 // critical. 
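    // For example (illustrative CFG): if the predecessor ends in
    //   br i1 %c, label %bb, label %other
    // the edge into the PHI's block may be critical, and a load hoisted into
    // that predecessor would also execute on the %other path, which we cannot
    // allow unless the pointer is known to be safe to load unconditionally.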
01130 if (TI->getNumSuccessors() == 1) 01131 continue; 01132 01133 // If this pointer is always safe to load, or if we can prove that there 01134 // is already a load in the block, then we can move the load to the pred 01135 // block. 01136 if (InVal->isDereferenceablePointer(DL) || 01137 isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL)) 01138 continue; 01139 01140 return false; 01141 } 01142 01143 return true; 01144 } 01145 01146 static void speculatePHINodeLoads(PHINode &PN) { 01147 DEBUG(dbgs() << " original: " << PN << "\n"); 01148 01149 Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); 01150 IRBuilderTy PHIBuilder(&PN); 01151 PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), 01152 PN.getName() + ".sroa.speculated"); 01153 01154 // Get the AA tags and alignment to use from one of the loads. It doesn't 01155 // matter which one we get and if any differ. 01156 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); 01157 01158 AAMDNodes AATags; 01159 SomeLoad->getAAMetadata(AATags); 01160 unsigned Align = SomeLoad->getAlignment(); 01161 01162 // Rewrite all loads of the PN to use the new PHI. 01163 while (!PN.use_empty()) { 01164 LoadInst *LI = cast<LoadInst>(PN.user_back()); 01165 LI->replaceAllUsesWith(NewPN); 01166 LI->eraseFromParent(); 01167 } 01168 01169 // Inject loads into all of the pred blocks. 01170 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 01171 BasicBlock *Pred = PN.getIncomingBlock(Idx); 01172 TerminatorInst *TI = Pred->getTerminator(); 01173 Value *InVal = PN.getIncomingValue(Idx); 01174 IRBuilderTy PredBuilder(TI); 01175 01176 LoadInst *Load = PredBuilder.CreateLoad( 01177 InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName())); 01178 ++NumLoadsSpeculated; 01179 Load->setAlignment(Align); 01180 if (AATags) 01181 Load->setAAMetadata(AATags); 01182 NewPN->addIncoming(Load, Pred); 01183 } 01184 01185 DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 01186 PN.eraseFromParent(); 01187 } 01188 01189 /// Select instructions that use an alloca and are subsequently loaded can be 01190 /// rewritten to load both input pointers and then select between the result, 01191 /// allowing the load of the alloca to be promoted. 01192 /// From this: 01193 /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other 01194 /// %V = load i32* %P2 01195 /// to: 01196 /// %V1 = load i32* %Alloca -> will be mem2reg'd 01197 /// %V2 = load i32* %Other 01198 /// %V = select i1 %cond, i32 %V1, i32 %V2 01199 /// 01200 /// We can do this to a select if its only uses are loads and if the operand 01201 /// to the select can be loaded unconditionally. 01202 static bool isSafeSelectToSpeculate(SelectInst &SI, 01203 const DataLayout *DL = nullptr) { 01204 Value *TValue = SI.getTrueValue(); 01205 Value *FValue = SI.getFalseValue(); 01206 bool TDerefable = TValue->isDereferenceablePointer(DL); 01207 bool FDerefable = FValue->isDereferenceablePointer(DL); 01208 01209 for (User *U : SI.users()) { 01210 LoadInst *LI = dyn_cast<LoadInst>(U); 01211 if (!LI || !LI->isSimple()) 01212 return false; 01213 01214 // Both operands to the select need to be dereferencable, either 01215 // absolutely (e.g. allocas) or at this point because we can see other 01216 // accesses to it. 
01217 if (!TDerefable && 01218 !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL)) 01219 return false; 01220 if (!FDerefable && 01221 !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL)) 01222 return false; 01223 } 01224 01225 return true; 01226 } 01227 01228 static void speculateSelectInstLoads(SelectInst &SI) { 01229 DEBUG(dbgs() << " original: " << SI << "\n"); 01230 01231 IRBuilderTy IRB(&SI); 01232 Value *TV = SI.getTrueValue(); 01233 Value *FV = SI.getFalseValue(); 01234 // Replace the loads of the select with a select of two loads. 01235 while (!SI.use_empty()) { 01236 LoadInst *LI = cast<LoadInst>(SI.user_back()); 01237 assert(LI->isSimple() && "We only speculate simple loads"); 01238 01239 IRB.SetInsertPoint(LI); 01240 LoadInst *TL = 01241 IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true"); 01242 LoadInst *FL = 01243 IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false"); 01244 NumLoadsSpeculated += 2; 01245 01246 // Transfer alignment and AA info if present. 01247 TL->setAlignment(LI->getAlignment()); 01248 FL->setAlignment(LI->getAlignment()); 01249 01250 AAMDNodes Tags; 01251 LI->getAAMetadata(Tags); 01252 if (Tags) { 01253 TL->setAAMetadata(Tags); 01254 FL->setAAMetadata(Tags); 01255 } 01256 01257 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 01258 LI->getName() + ".sroa.speculated"); 01259 01260 DEBUG(dbgs() << " speculated to: " << *V << "\n"); 01261 LI->replaceAllUsesWith(V); 01262 LI->eraseFromParent(); 01263 } 01264 SI.eraseFromParent(); 01265 } 01266 01267 /// \brief Build a GEP out of a base pointer and indices. 01268 /// 01269 /// This will return the BasePtr if that is valid, or build a new GEP 01270 /// instruction using the IRBuilder if GEP-ing is needed. 01271 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, 01272 SmallVectorImpl<Value *> &Indices, Twine NamePrefix) { 01273 if (Indices.empty()) 01274 return BasePtr; 01275 01276 // A single zero index is a no-op, so check for this and avoid building a GEP 01277 // in that case. 01278 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 01279 return BasePtr; 01280 01281 return IRB.CreateInBoundsGEP(BasePtr, Indices, NamePrefix + "sroa_idx"); 01282 } 01283 01284 /// \brief Get a natural GEP off of the BasePtr walking through Ty toward 01285 /// TargetTy without changing the offset of the pointer. 01286 /// 01287 /// This routine assumes we've already established a properly offset GEP with 01288 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with 01289 /// zero-indices down through type layers until we find one the same as 01290 /// TargetTy. If we can't find one with the same type, we at least try to use 01291 /// one with the same size. If none of that works, we just produce the GEP as 01292 /// indicated by Indices to have the correct offset. 01293 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, 01294 Value *BasePtr, Type *Ty, Type *TargetTy, 01295 SmallVectorImpl<Value *> &Indices, 01296 Twine NamePrefix) { 01297 if (Ty == TargetTy) 01298 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 01299 01300 // Pointer size to use for the indices. 01301 unsigned PtrSize = DL.getPointerTypeSizeInBits(BasePtr->getType()); 01302 01303 // See if we can descend into a struct and locate a field with the correct 01304 // type. 
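  // For example (an illustrative sketch): starting from
  //   { [2 x float], i32 }
  // and looking for a float, we step into the struct (index 0) and then into
  // the array (index 0), producing a GEP that points at the first float.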
01305 unsigned NumLayers = 0; 01306 Type *ElementTy = Ty; 01307 do { 01308 if (ElementTy->isPointerTy()) 01309 break; 01310 01311 if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) { 01312 ElementTy = ArrayTy->getElementType(); 01313 Indices.push_back(IRB.getIntN(PtrSize, 0)); 01314 } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) { 01315 ElementTy = VectorTy->getElementType(); 01316 Indices.push_back(IRB.getInt32(0)); 01317 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 01318 if (STy->element_begin() == STy->element_end()) 01319 break; // Nothing left to descend into. 01320 ElementTy = *STy->element_begin(); 01321 Indices.push_back(IRB.getInt32(0)); 01322 } else { 01323 break; 01324 } 01325 ++NumLayers; 01326 } while (ElementTy != TargetTy); 01327 if (ElementTy != TargetTy) 01328 Indices.erase(Indices.end() - NumLayers, Indices.end()); 01329 01330 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 01331 } 01332 01333 /// \brief Recursively compute indices for a natural GEP. 01334 /// 01335 /// This is the recursive step for getNaturalGEPWithOffset that walks down the 01336 /// element types adding appropriate indices for the GEP. 01337 static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, 01338 Value *Ptr, Type *Ty, APInt &Offset, 01339 Type *TargetTy, 01340 SmallVectorImpl<Value *> &Indices, 01341 Twine NamePrefix) { 01342 if (Offset == 0) 01343 return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices, NamePrefix); 01344 01345 // We can't recurse through pointer types. 01346 if (Ty->isPointerTy()) 01347 return nullptr; 01348 01349 // We try to analyze GEPs over vectors here, but note that these GEPs are 01350 // extremely poorly defined currently. The long-term goal is to remove GEPing 01351 // over a vector from the IR completely. 01352 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { 01353 unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); 01354 if (ElementSizeInBits % 8 != 0) { 01355 // GEPs over non-multiple of 8 size vector elements are invalid. 
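      // For example, <8 x i1> has 1-bit elements, so no byte-based offset
      // corresponds to a whole element index and we simply give up here.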
01356 return nullptr; 01357 } 01358 APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8); 01359 APInt NumSkippedElements = Offset.sdiv(ElementSize); 01360 if (NumSkippedElements.ugt(VecTy->getNumElements())) 01361 return nullptr; 01362 Offset -= NumSkippedElements * ElementSize; 01363 Indices.push_back(IRB.getInt(NumSkippedElements)); 01364 return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(), 01365 Offset, TargetTy, Indices, NamePrefix); 01366 } 01367 01368 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 01369 Type *ElementTy = ArrTy->getElementType(); 01370 APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); 01371 APInt NumSkippedElements = Offset.sdiv(ElementSize); 01372 if (NumSkippedElements.ugt(ArrTy->getNumElements())) 01373 return nullptr; 01374 01375 Offset -= NumSkippedElements * ElementSize; 01376 Indices.push_back(IRB.getInt(NumSkippedElements)); 01377 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 01378 Indices, NamePrefix); 01379 } 01380 01381 StructType *STy = dyn_cast<StructType>(Ty); 01382 if (!STy) 01383 return nullptr; 01384 01385 const StructLayout *SL = DL.getStructLayout(STy); 01386 uint64_t StructOffset = Offset.getZExtValue(); 01387 if (StructOffset >= SL->getSizeInBytes()) 01388 return nullptr; 01389 unsigned Index = SL->getElementContainingOffset(StructOffset); 01390 Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index)); 01391 Type *ElementTy = STy->getElementType(Index); 01392 if (Offset.uge(DL.getTypeAllocSize(ElementTy))) 01393 return nullptr; // The offset points into alignment padding. 01394 01395 Indices.push_back(IRB.getInt32(Index)); 01396 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 01397 Indices, NamePrefix); 01398 } 01399 01400 /// \brief Get a natural GEP from a base pointer to a particular offset and 01401 /// resulting in a particular type. 01402 /// 01403 /// The goal is to produce a "natural" looking GEP that works with the existing 01404 /// composite types to arrive at the appropriate offset and element type for 01405 /// a pointer. TargetTy is the element type the returned GEP should point-to if 01406 /// possible. We recurse by decreasing Offset, adding the appropriate index to 01407 /// Indices, and setting Ty to the result subtype. 01408 /// 01409 /// If no natural GEP can be constructed, this function returns null. 01410 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, 01411 Value *Ptr, APInt Offset, Type *TargetTy, 01412 SmallVectorImpl<Value *> &Indices, 01413 Twine NamePrefix) { 01414 PointerType *Ty = cast<PointerType>(Ptr->getType()); 01415 01416 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 01417 // an i8. 01418 if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8)) 01419 return nullptr; 01420 01421 Type *ElementTy = Ty->getElementType(); 01422 if (!ElementTy->isSized()) 01423 return nullptr; // We can't GEP through an unsized element. 01424 APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy)); 01425 if (ElementSize == 0) 01426 return nullptr; // Zero-length arrays can't help us build a natural GEP. 
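  // For example (hypothetical values): with an i32* base (ElementSize == 4)
  // and Offset == 12, we skip 3 elements, emit 3 as the leading GEP index,
  // and recurse with the remaining offset of 0.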
01427 APInt NumSkippedElements = Offset.sdiv(ElementSize); 01428 01429 Offset -= NumSkippedElements * ElementSize; 01430 Indices.push_back(IRB.getInt(NumSkippedElements)); 01431 return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy, 01432 Indices, NamePrefix); 01433 } 01434 01435 /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the 01436 /// resulting pointer has PointerTy. 01437 /// 01438 /// This tries very hard to compute a "natural" GEP which arrives at the offset 01439 /// and produces the pointer type desired. Where it cannot, it will try to use 01440 /// the natural GEP to arrive at the offset and bitcast to the type. Where that 01441 /// fails, it will try to use an existing i8* and GEP to the byte offset and 01442 /// bitcast to the type. 01443 /// 01444 /// The strategy for finding the more natural GEPs is to peel off layers of the 01445 /// pointer, walking back through bit casts and GEPs, searching for a base 01446 /// pointer from which we can compute a natural GEP with the desired 01447 /// properties. The algorithm tries to fold as many constant indices into 01448 /// a single GEP as possible, thus making each GEP more independent of the 01449 /// surrounding code. 01450 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, 01451 APInt Offset, Type *PointerTy, 01452 Twine NamePrefix) { 01453 // Even though we don't look through PHI nodes, we could be called on an 01454 // instruction in an unreachable block, which may be on a cycle. 01455 SmallPtrSet<Value *, 4> Visited; 01456 Visited.insert(Ptr); 01457 SmallVector<Value *, 4> Indices; 01458 01459 // We may end up computing an offset pointer that has the wrong type. If we 01460 // never are able to compute one directly that has the correct type, we'll 01461 // fall back to it, so keep it around here. 01462 Value *OffsetPtr = nullptr; 01463 01464 // Remember any i8 pointer we come across to re-use if we need to do a raw 01465 // byte offset. 01466 Value *Int8Ptr = nullptr; 01467 APInt Int8PtrOffset(Offset.getBitWidth(), 0); 01468 01469 Type *TargetTy = PointerTy->getPointerElementType(); 01470 01471 do { 01472 // First fold any existing GEPs into the offset. 01473 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 01474 APInt GEPOffset(Offset.getBitWidth(), 0); 01475 if (!GEP->accumulateConstantOffset(DL, GEPOffset)) 01476 break; 01477 Offset += GEPOffset; 01478 Ptr = GEP->getPointerOperand(); 01479 if (!Visited.insert(Ptr)) 01480 break; 01481 } 01482 01483 // See if we can perform a natural GEP here. 01484 Indices.clear(); 01485 if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, 01486 Indices, NamePrefix)) { 01487 if (P->getType() == PointerTy) { 01488 // Zap any offset pointer that we ended up computing in previous rounds. 01489 if (OffsetPtr && OffsetPtr->use_empty()) 01490 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) 01491 I->eraseFromParent(); 01492 return P; 01493 } 01494 if (!OffsetPtr) { 01495 OffsetPtr = P; 01496 } 01497 } 01498 01499 // Stash this pointer if we've found an i8*. 01500 if (Ptr->getType()->isIntegerTy(8)) { 01501 Int8Ptr = Ptr; 01502 Int8PtrOffset = Offset; 01503 } 01504 01505 // Peel off a layer of the pointer and update the offset appropriately. 
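// As an illustration (not from the original source): given
//   %raw = bitcast { i32, i32 }* %a to i8*
// and a request for byte offset 4 with an i32* result type, the first round
// only manages a raw i8-based GEP; peeling the bitcast back to %a lets the
// next round build
//   getelementptr inbounds { i32, i32 }* %a, i64 0, i32 1
// which already has the desired pointer type, so the provisional i8-based
// pointer is discarded.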
01506 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 01507 Ptr = cast<Operator>(Ptr)->getOperand(0); 01508 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 01509 if (GA->mayBeOverridden()) 01510 break; 01511 Ptr = GA->getAliasee(); 01512 } else { 01513 break; 01514 } 01515 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 01516 } while (Visited.insert(Ptr)); 01517 01518 if (!OffsetPtr) { 01519 if (!Int8Ptr) { 01520 Int8Ptr = IRB.CreateBitCast( 01521 Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), 01522 NamePrefix + "sroa_raw_cast"); 01523 Int8PtrOffset = Offset; 01524 } 01525 01526 OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : 01527 IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), 01528 NamePrefix + "sroa_raw_idx"); 01529 } 01530 Ptr = OffsetPtr; 01531 01532 // On the off chance we were targeting i8*, guard the bitcast here. 01533 if (Ptr->getType() != PointerTy) 01534 Ptr = IRB.CreateBitCast(Ptr, PointerTy, NamePrefix + "sroa_cast"); 01535 01536 return Ptr; 01537 } 01538 01539 /// \brief Test whether we can convert a value from the old to the new type. 01540 /// 01541 /// This predicate should be used to guard calls to convertValue in order to 01542 /// ensure that we only try to convert viable values. The strategy is that we 01543 /// will peel off single element struct and array wrappings to get to an 01544 /// underlying value, and convert that value. 01545 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 01546 if (OldTy == NewTy) 01547 return true; 01548 if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) 01549 if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) 01550 if (NewITy->getBitWidth() >= OldITy->getBitWidth()) 01551 return true; 01552 if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) 01553 return false; 01554 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 01555 return false; 01556 01557 // We can convert pointers to integers and vice-versa. Same for vectors 01558 // of pointers and integers. 01559 OldTy = OldTy->getScalarType(); 01560 NewTy = NewTy->getScalarType(); 01561 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 01562 if (NewTy->isPointerTy() && OldTy->isPointerTy()) 01563 return true; 01564 if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) 01565 return true; 01566 return false; 01567 } 01568 01569 return true; 01570 } 01571 01572 /// \brief Generic routine to convert an SSA value to a value of a different 01573 /// type. 01574 /// 01575 /// This will try various different casting techniques, such as bitcasts, 01576 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 01577 /// two types for viability with this routine. 01578 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 01579 Type *NewTy) { 01580 Type *OldTy = V->getType(); 01581 assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); 01582 01583 if (OldTy == NewTy) 01584 return V; 01585 01586 if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) 01587 if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) 01588 if (NewITy->getBitWidth() > OldITy->getBitWidth()) 01589 return IRB.CreateZExt(V, NewITy); 01590 01591 // See if we need inttoptr for this type pair. A cast involving both scalars 01592 // and vectors requires and additional bitcast. 
01593 if (OldTy->getScalarType()->isIntegerTy() &&
01594 NewTy->getScalarType()->isPointerTy()) {
01595 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
01596 if (OldTy->isVectorTy() && !NewTy->isVectorTy())
01597 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
01598 NewTy);
01599 
01600 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
01601 if (!OldTy->isVectorTy() && NewTy->isVectorTy())
01602 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
01603 NewTy);
01604 
01605 return IRB.CreateIntToPtr(V, NewTy);
01606 }
01607 
01608 // See if we need ptrtoint for this type pair. A cast involving both scalars
01609 // and vectors requires an additional bitcast.
01610 if (OldTy->getScalarType()->isPointerTy() &&
01611 NewTy->getScalarType()->isIntegerTy()) {
01612 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
01613 if (OldTy->isVectorTy() && !NewTy->isVectorTy())
01614 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
01615 NewTy);
01616 
01617 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
01618 if (!OldTy->isVectorTy() && NewTy->isVectorTy())
01619 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
01620 NewTy);
01621 
01622 return IRB.CreatePtrToInt(V, NewTy);
01623 }
01624 
01625 return IRB.CreateBitCast(V, NewTy);
01626 }
01627 
01628 /// \brief Test whether the given slice use can be promoted to a vector.
01629 ///
01630 /// This function is called to test each entry in a partitioning which is slated
01631 /// for a single slice.
01632 static bool isVectorPromotionViableForSlice(
01633 const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset,
01634 uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize,
01635 AllocaSlices::const_iterator I) {
01636 // First validate the slice offsets.
01637 uint64_t BeginOffset =
01638 std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset;
01639 uint64_t BeginIndex = BeginOffset / ElementSize;
01640 if (BeginIndex * ElementSize != BeginOffset ||
01641 BeginIndex >= Ty->getNumElements())
01642 return false;
01643 uint64_t EndOffset =
01644 std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset;
01645 uint64_t EndIndex = EndOffset / ElementSize;
01646 if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
01647 return false;
01648 
01649 assert(EndIndex > BeginIndex && "Empty vector!");
01650 uint64_t NumElements = EndIndex - BeginIndex;
01651 Type *SliceTy =
01652 (NumElements == 1) ? Ty->getElementType()
01653 : VectorType::get(Ty->getElementType(), NumElements);
01654 
01655 Type *SplitIntTy =
01656 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
01657 
01658 Use *U = I->getUse();
01659 
01660 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
01661 if (MI->isVolatile())
01662 return false;
01663 if (!I->isSplittable())
01664 return false; // Skip any unsplittable intrinsics.
01665 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
01666 if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
01667 II->getIntrinsicID() != Intrinsic::lifetime_end)
01668 return false;
01669 } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
01670 // Disable vector promotion when there are loads or stores of an FCA.
01671 return false;
01672 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
01673 if (LI->isVolatile())
01674 return false;
01675 Type *LTy = LI->getType();
01676 if (SliceBeginOffset > I->beginOffset() ||
01677 SliceEndOffset < I->endOffset()) {
01678 assert(LTy->isIntegerTy());
01679 LTy = SplitIntTy;
01680 }
01681 if (!canConvertValue(DL, SliceTy, LTy))
01682 return false;
01683 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
01684 if (SI->isVolatile())
01685 return false;
01686 Type *STy = SI->getValueOperand()->getType();
01687 if (SliceBeginOffset > I->beginOffset() ||
01688 SliceEndOffset < I->endOffset()) {
01689 assert(STy->isIntegerTy());
01690 STy = SplitIntTy;
01691 }
01692 if (!canConvertValue(DL, STy, SliceTy))
01693 return false;
01694 } else {
01695 return false;
01696 }
01697 
01698 return true;
01699 }
01700 
01701 /// \brief Test whether the given alloca partitioning and range of slices can be
01702 /// promoted to a vector.
01703 ///
01704 /// This is a quick test to check whether we can rewrite a particular alloca
01705 /// partition (and its newly formed alloca) into a vector alloca with only
01706 /// whole-vector loads and stores such that it could be promoted to a vector
01707 /// SSA value. We can only ensure this for a limited set of operations, and we
01708 /// don't want to do the rewrites unless we are confident that the result will
01709 /// be promotable, so we have an early test here.
01710 static bool
01711 isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S,
01712 uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
01713 AllocaSlices::const_iterator I,
01714 AllocaSlices::const_iterator E,
01715 ArrayRef<AllocaSlices::iterator> SplitUses) {
01716 VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
01717 if (!Ty)
01718 return false;
01719 
01720 uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType());
01721 
01722 // While LLVM vectors are defined as bit-packed, we don't support element
01723 // sizes that aren't byte sized.
01724 if (ElementSize % 8)
01725 return false;
01726 assert((DL.getTypeSizeInBits(Ty) % 8) == 0 &&
01727 "vector size not a multiple of element size?");
01728 ElementSize /= 8;
01729 
01730 for (; I != E; ++I)
01731 if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
01732 SliceEndOffset, Ty, ElementSize, I))
01733 return false;
01734 
01735 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
01736 SUE = SplitUses.end();
01737 SUI != SUE; ++SUI)
01738 if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
01739 SliceEndOffset, Ty, ElementSize, *SUI))
01740 return false;
01741 
01742 return true;
01743 }
01744 
01745 /// \brief Test whether a slice of an alloca is valid for integer widening.
01746 ///
01747 /// This implements the necessary checking for the \c isIntegerWideningViable
01748 /// test below on a single slice of the alloca.
01749 static bool isIntegerWideningViableForSlice(const DataLayout &DL,
01750 Type *AllocaTy,
01751 uint64_t AllocBeginOffset,
01752 uint64_t Size, AllocaSlices &S,
01753 AllocaSlices::const_iterator I,
01754 bool &WholeAllocaOp) {
01755 uint64_t RelBegin = I->beginOffset() - AllocBeginOffset;
01756 uint64_t RelEnd = I->endOffset() - AllocBeginOffset;
01757 
01758 // We can't reasonably handle cases where the load or store extends past
01759 // the end of the alloca's type and into its padding.
01760 if (RelEnd > Size) 01761 return false; 01762 01763 Use *U = I->getUse(); 01764 01765 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 01766 if (LI->isVolatile()) 01767 return false; 01768 if (RelBegin == 0 && RelEnd == Size) 01769 WholeAllocaOp = true; 01770 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 01771 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) 01772 return false; 01773 } else if (RelBegin != 0 || RelEnd != Size || 01774 !canConvertValue(DL, AllocaTy, LI->getType())) { 01775 // Non-integer loads need to be convertible from the alloca type so that 01776 // they are promotable. 01777 return false; 01778 } 01779 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 01780 Type *ValueTy = SI->getValueOperand()->getType(); 01781 if (SI->isVolatile()) 01782 return false; 01783 if (RelBegin == 0 && RelEnd == Size) 01784 WholeAllocaOp = true; 01785 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 01786 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy)) 01787 return false; 01788 } else if (RelBegin != 0 || RelEnd != Size || 01789 !canConvertValue(DL, ValueTy, AllocaTy)) { 01790 // Non-integer stores need to be convertible to the alloca type so that 01791 // they are promotable. 01792 return false; 01793 } 01794 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 01795 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 01796 return false; 01797 if (!I->isSplittable()) 01798 return false; // Skip any unsplittable intrinsics. 01799 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 01800 if (II->getIntrinsicID() != Intrinsic::lifetime_start && 01801 II->getIntrinsicID() != Intrinsic::lifetime_end) 01802 return false; 01803 } else { 01804 return false; 01805 } 01806 01807 return true; 01808 } 01809 01810 /// \brief Test whether the given alloca partition's integer operations can be 01811 /// widened to promotable ones. 01812 /// 01813 /// This is a quick test to check whether we can rewrite the integer loads and 01814 /// stores to a particular alloca into wider loads and stores and be able to 01815 /// promote the resulting alloca. 01816 static bool 01817 isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy, 01818 uint64_t AllocBeginOffset, AllocaSlices &S, 01819 AllocaSlices::const_iterator I, 01820 AllocaSlices::const_iterator E, 01821 ArrayRef<AllocaSlices::iterator> SplitUses) { 01822 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy); 01823 // Don't create integer types larger than the maximum bitwidth. 01824 if (SizeInBits > IntegerType::MAX_INT_BITS) 01825 return false; 01826 01827 // Don't try to handle allocas with bit-padding. 01828 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy)) 01829 return false; 01830 01831 // We need to ensure that an integer type with the appropriate bitwidth can 01832 // be converted to the alloca type, whatever that is. We don't want to force 01833 // the alloca itself to have an integer type if there is a more suitable one. 01834 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 01835 if (!canConvertValue(DL, AllocaTy, IntTy) || 01836 !canConvertValue(DL, IntTy, AllocaTy)) 01837 return false; 01838 01839 uint64_t Size = DL.getTypeStoreSize(AllocaTy); 01840 01841 // While examining uses, we ensure that the alloca has a covering load or 01842 // store. We don't want to widen the integer operations only to fail to 01843 // promote due to some other unsplittable entry (which we may make splittable 01844 // later). 
However, if there are only splittable uses, go ahead and assume 01845 // that we cover the alloca. 01846 bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits); 01847 01848 for (; I != E; ++I) 01849 if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, 01850 S, I, WholeAllocaOp)) 01851 return false; 01852 01853 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), 01854 SUE = SplitUses.end(); 01855 SUI != SUE; ++SUI) 01856 if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size, 01857 S, *SUI, WholeAllocaOp)) 01858 return false; 01859 01860 return WholeAllocaOp; 01861 } 01862 01863 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 01864 IntegerType *Ty, uint64_t Offset, 01865 const Twine &Name) { 01866 DEBUG(dbgs() << " start: " << *V << "\n"); 01867 IntegerType *IntTy = cast<IntegerType>(V->getType()); 01868 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 01869 "Element extends past full value"); 01870 uint64_t ShAmt = 8*Offset; 01871 if (DL.isBigEndian()) 01872 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 01873 if (ShAmt) { 01874 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 01875 DEBUG(dbgs() << " shifted: " << *V << "\n"); 01876 } 01877 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 01878 "Cannot extract to a larger integer!"); 01879 if (Ty != IntTy) { 01880 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 01881 DEBUG(dbgs() << " trunced: " << *V << "\n"); 01882 } 01883 return V; 01884 } 01885 01886 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, 01887 Value *V, uint64_t Offset, const Twine &Name) { 01888 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 01889 IntegerType *Ty = cast<IntegerType>(V->getType()); 01890 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 01891 "Cannot insert a larger integer!"); 01892 DEBUG(dbgs() << " start: " << *V << "\n"); 01893 if (Ty != IntTy) { 01894 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 01895 DEBUG(dbgs() << " extended: " << *V << "\n"); 01896 } 01897 assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) && 01898 "Element store outside of alloca store"); 01899 uint64_t ShAmt = 8*Offset; 01900 if (DL.isBigEndian()) 01901 ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset); 01902 if (ShAmt) { 01903 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 01904 DEBUG(dbgs() << " shifted: " << *V << "\n"); 01905 } 01906 01907 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 01908 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 01909 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 01910 DEBUG(dbgs() << " masked: " << *Old << "\n"); 01911 V = IRB.CreateOr(Old, V, Name + ".insert"); 01912 DEBUG(dbgs() << " inserted: " << *V << "\n"); 01913 } 01914 return V; 01915 } 01916 01917 static Value *extractVector(IRBuilderTy &IRB, Value *V, 01918 unsigned BeginIndex, unsigned EndIndex, 01919 const Twine &Name) { 01920 VectorType *VecTy = cast<VectorType>(V->getType()); 01921 unsigned NumElements = EndIndex - BeginIndex; 01922 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 01923 01924 if (NumElements == VecTy->getNumElements()) 01925 return V; 01926 01927 if (NumElements == 1) { 01928 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 01929 Name + ".extract"); 01930 DEBUG(dbgs() << " extract: " << *V << "\n"); 01931 return V; 01932 } 01933 01934 SmallVector<Constant*, 8> 
01934 Mask;
01935 Mask.reserve(NumElements);
01936 for (unsigned i = BeginIndex; i != EndIndex; ++i)
01937 Mask.push_back(IRB.getInt32(i));
01938 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
01939 ConstantVector::get(Mask),
01940 Name + ".extract");
01941 DEBUG(dbgs() << " shuffle: " << *V << "\n");
01942 return V;
01943 }
01944 
01945 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
01946 unsigned BeginIndex, const Twine &Name) {
01947 VectorType *VecTy = cast<VectorType>(Old->getType());
01948 assert(VecTy && "Can only insert a vector into a vector");
01949 
01950 VectorType *Ty = dyn_cast<VectorType>(V->getType());
01951 if (!Ty) {
01952 // Single element to insert.
01953 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
01954 Name + ".insert");
01955 DEBUG(dbgs() << " insert: " << *V << "\n");
01956 return V;
01957 }
01958 
01959 assert(Ty->getNumElements() <= VecTy->getNumElements() &&
01960 "Too many elements!");
01961 if (Ty->getNumElements() == VecTy->getNumElements()) {
01962 assert(V->getType() == VecTy && "Vector type mismatch");
01963 return V;
01964 }
01965 unsigned EndIndex = BeginIndex + Ty->getNumElements();
01966 
01967 // When inserting a smaller vector into the larger one to store, we first
01968 // use a shuffle vector to widen it with undef elements, and then
01969 // a second shuffle vector to select between the loaded vector and the
01970 // incoming vector.
01971 SmallVector<Constant*, 8> Mask;
01972 Mask.reserve(VecTy->getNumElements());
01973 for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
01974 if (i >= BeginIndex && i < EndIndex)
01975 Mask.push_back(IRB.getInt32(i - BeginIndex));
01976 else
01977 Mask.push_back(UndefValue::get(IRB.getInt32Ty()));
01978 V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
01979 ConstantVector::get(Mask),
01980 Name + ".expand");
01981 DEBUG(dbgs() << " shuffle: " << *V << "\n");
01982 
01983 Mask.clear();
01984 for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
01985 Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
01986 
01987 V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
01988 
01989 DEBUG(dbgs() << " blend: " << *V << "\n");
01990 return V;
01991 }
01992 
01993 namespace {
01994 /// \brief Visitor to rewrite instructions using a particular slice of an alloca
01995 /// to use a new alloca.
01996 ///
01997 /// Also implements the rewriting to vector-based accesses when the partition
01998 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
01999 /// lives here.
02000 class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
02001 // Befriend the base class so it can delegate to private visit methods.
02002 friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
02003 typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
02004 
02005 const DataLayout &DL;
02006 AllocaSlices &S;
02007 SROA &Pass;
02008 AllocaInst &OldAI, &NewAI;
02009 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
02010 Type *NewAllocaTy;
02011 
02012 // If we are rewriting an alloca partition which can be written as pure
02013 // vector operations, we stash extra information here. When VecTy is
02014 // non-null, we have some strict guarantees about the rewritten alloca:
02015 // - The new alloca is exactly the size of the vector type here.
02016 // - The accesses all either map to the entire vector or to a single
02017 // element.
02018 // - The set of accessing instructions is only one of those handled above 02019 // in isVectorPromotionViable. Generally these are the same access kinds 02020 // which are promotable via mem2reg. 02021 VectorType *VecTy; 02022 Type *ElementTy; 02023 uint64_t ElementSize; 02024 02025 // This is a convenience and flag variable that will be null unless the new 02026 // alloca's integer operations should be widened to this integer type due to 02027 // passing isIntegerWideningViable above. If it is non-null, the desired 02028 // integer type will be stored here for easy access during rewriting. 02029 IntegerType *IntTy; 02030 02031 // The original offset of the slice currently being rewritten relative to 02032 // the original alloca. 02033 uint64_t BeginOffset, EndOffset; 02034 // The new offsets of the slice currently being rewritten relative to the 02035 // original alloca. 02036 uint64_t NewBeginOffset, NewEndOffset; 02037 02038 uint64_t SliceSize; 02039 bool IsSplittable; 02040 bool IsSplit; 02041 Use *OldUse; 02042 Instruction *OldPtr; 02043 02044 // Track post-rewrite users which are PHI nodes and Selects. 02045 SmallPtrSetImpl<PHINode *> &PHIUsers; 02046 SmallPtrSetImpl<SelectInst *> &SelectUsers; 02047 02048 // Utility IR builder, whose name prefix is setup for each visited use, and 02049 // the insertion point is set to point to the user. 02050 IRBuilderTy IRB; 02051 02052 public: 02053 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass, 02054 AllocaInst &OldAI, AllocaInst &NewAI, 02055 uint64_t NewAllocaBeginOffset, 02056 uint64_t NewAllocaEndOffset, bool IsVectorPromotable, 02057 bool IsIntegerPromotable, 02058 SmallPtrSetImpl<PHINode *> &PHIUsers, 02059 SmallPtrSetImpl<SelectInst *> &SelectUsers) 02060 : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI), 02061 NewAllocaBeginOffset(NewAllocaBeginOffset), 02062 NewAllocaEndOffset(NewAllocaEndOffset), 02063 NewAllocaTy(NewAI.getAllocatedType()), 02064 VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : nullptr), 02065 ElementTy(VecTy ? VecTy->getElementType() : nullptr), 02066 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), 02067 IntTy(IsIntegerPromotable 02068 ? Type::getIntNTy( 02069 NewAI.getContext(), 02070 DL.getTypeSizeInBits(NewAI.getAllocatedType())) 02071 : nullptr), 02072 BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), 02073 OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), 02074 IRB(NewAI.getContext(), ConstantFolder()) { 02075 if (VecTy) { 02076 assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && 02077 "Only multiple-of-8 sized vector elements are viable"); 02078 ++NumVectorized; 02079 } 02080 assert((!IsVectorPromotable && !IsIntegerPromotable) || 02081 IsVectorPromotable != IsIntegerPromotable); 02082 } 02083 02084 bool visit(AllocaSlices::const_iterator I) { 02085 bool CanSROA = true; 02086 BeginOffset = I->beginOffset(); 02087 EndOffset = I->endOffset(); 02088 IsSplittable = I->isSplittable(); 02089 IsSplit = 02090 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; 02091 02092 // Compute the intersecting offset range. 
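// (Illustrative numbers, not from the original source: if the slice covers
// bytes [4, 16) of the original alloca and the new partition covers [8, 24),
// then NewBeginOffset == 8, NewEndOffset == 16, SliceSize == 8, and IsSplit
// is true because the slice starts before the new alloca does.)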
02093 assert(BeginOffset < NewAllocaEndOffset); 02094 assert(EndOffset > NewAllocaBeginOffset); 02095 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); 02096 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); 02097 02098 SliceSize = NewEndOffset - NewBeginOffset; 02099 02100 OldUse = I->getUse(); 02101 OldPtr = cast<Instruction>(OldUse->get()); 02102 02103 Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); 02104 IRB.SetInsertPoint(OldUserI); 02105 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); 02106 IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); 02107 02108 CanSROA &= visit(cast<Instruction>(OldUse->getUser())); 02109 if (VecTy || IntTy) 02110 assert(CanSROA); 02111 return CanSROA; 02112 } 02113 02114 private: 02115 // Make sure the other visit overloads are visible. 02116 using Base::visit; 02117 02118 // Every instruction which can end up as a user must have a rewrite rule. 02119 bool visitInstruction(Instruction &I) { 02120 DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 02121 llvm_unreachable("No rewrite rule for this instruction!"); 02122 } 02123 02124 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { 02125 // Note that the offset computation can use BeginOffset or NewBeginOffset 02126 // interchangeably for unsplit slices. 02127 assert(IsSplit || BeginOffset == NewBeginOffset); 02128 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 02129 02130 #ifndef NDEBUG 02131 StringRef OldName = OldPtr->getName(); 02132 // Skip through the last '.sroa.' component of the name. 02133 size_t LastSROAPrefix = OldName.rfind(".sroa."); 02134 if (LastSROAPrefix != StringRef::npos) { 02135 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); 02136 // Look for an SROA slice index. 02137 size_t IndexEnd = OldName.find_first_not_of("0123456789"); 02138 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { 02139 // Strip the index and look for the offset. 02140 OldName = OldName.substr(IndexEnd + 1); 02141 size_t OffsetEnd = OldName.find_first_not_of("0123456789"); 02142 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') 02143 // Strip the offset. 02144 OldName = OldName.substr(OffsetEnd + 1); 02145 } 02146 } 02147 // Strip any SROA suffixes as well. 02148 OldName = OldName.substr(0, OldName.find(".sroa_")); 02149 #endif 02150 02151 return getAdjustedPtr(IRB, DL, &NewAI, 02152 APInt(DL.getPointerSizeInBits(), Offset), PointerTy, 02153 #ifndef NDEBUG 02154 Twine(OldName) + "." 02155 #else 02156 Twine() 02157 #endif 02158 ); 02159 } 02160 02161 /// \brief Compute suitable alignment to access this slice of the *new* alloca. 02162 /// 02163 /// You can optionally pass a type to this routine and if that type's ABI 02164 /// alignment is itself suitable, this will return zero. 02165 unsigned getSliceAlign(Type *Ty = nullptr) { 02166 unsigned NewAIAlign = NewAI.getAlignment(); 02167 if (!NewAIAlign) 02168 NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); 02169 unsigned Align = MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); 02170 return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 
0 : Align; 02171 } 02172 02173 unsigned getIndex(uint64_t Offset) { 02174 assert(VecTy && "Can only call getIndex when rewriting a vector"); 02175 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 02176 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 02177 uint32_t Index = RelOffset / ElementSize; 02178 assert(Index * ElementSize == RelOffset); 02179 return Index; 02180 } 02181 02182 void deleteIfTriviallyDead(Value *V) { 02183 Instruction *I = cast<Instruction>(V); 02184 if (isInstructionTriviallyDead(I)) 02185 Pass.DeadInsts.insert(I); 02186 } 02187 02188 Value *rewriteVectorizedLoadInst() { 02189 unsigned BeginIndex = getIndex(NewBeginOffset); 02190 unsigned EndIndex = getIndex(NewEndOffset); 02191 assert(EndIndex > BeginIndex && "Empty vector!"); 02192 02193 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02194 "load"); 02195 return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); 02196 } 02197 02198 Value *rewriteIntegerLoad(LoadInst &LI) { 02199 assert(IntTy && "We cannot insert an integer to the alloca"); 02200 assert(!LI.isVolatile()); 02201 Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02202 "load"); 02203 V = convertValue(DL, IRB, V, IntTy); 02204 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 02205 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 02206 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) 02207 V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset, 02208 "extract"); 02209 return V; 02210 } 02211 02212 bool visitLoadInst(LoadInst &LI) { 02213 DEBUG(dbgs() << " original: " << LI << "\n"); 02214 Value *OldOp = LI.getOperand(0); 02215 assert(OldOp == OldPtr); 02216 02217 Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) 02218 : LI.getType(); 02219 bool IsPtrAdjusted = false; 02220 Value *V; 02221 if (VecTy) { 02222 V = rewriteVectorizedLoadInst(); 02223 } else if (IntTy && LI.getType()->isIntegerTy()) { 02224 V = rewriteIntegerLoad(LI); 02225 } else if (NewBeginOffset == NewAllocaBeginOffset && 02226 canConvertValue(DL, NewAllocaTy, LI.getType())) { 02227 V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02228 LI.isVolatile(), LI.getName()); 02229 } else { 02230 Type *LTy = TargetTy->getPointerTo(); 02231 V = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), 02232 getSliceAlign(TargetTy), LI.isVolatile(), 02233 LI.getName()); 02234 IsPtrAdjusted = true; 02235 } 02236 V = convertValue(DL, IRB, V, TargetTy); 02237 02238 if (IsSplit) { 02239 assert(!LI.isVolatile()); 02240 assert(LI.getType()->isIntegerTy() && 02241 "Only integer type loads and stores are split"); 02242 assert(SliceSize < DL.getTypeStoreSize(LI.getType()) && 02243 "Split load isn't smaller than original load"); 02244 assert(LI.getType()->getIntegerBitWidth() == 02245 DL.getTypeStoreSizeInBits(LI.getType()) && 02246 "Non-byte-multiple bit width"); 02247 // Move the insertion point just past the load so that we can refer to it. 02248 IRB.SetInsertPoint(std::next(BasicBlock::iterator(&LI))); 02249 // Create a placeholder value with the same type as LI to use as the 02250 // basis for the new value. This allows us to replace the uses of LI with 02251 // the computed value, and then replace the placeholder with LI, leaving 02252 // LI only used for this computation. 
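// A rough sketch of the resulting IR on a little-endian target (illustrative
// only, with simplified names): for an original `%v = load i64* %old` whose
// first four bytes now live in a new i32 alloca, the code below produces
// roughly
//   %part   = load i32* %new.alloca
//   %ext    = zext i32 %part to i64
//   %masked = and i64 %v, -4294967296
//   %v.new  = or i64 %masked, %ext
// and all former users of %v are redirected to %v.new, while %v itself only
// feeds the masking that preserves its untouched bytes.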
02253 Value *Placeholder 02254 = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); 02255 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset, 02256 "insert"); 02257 LI.replaceAllUsesWith(V); 02258 Placeholder->replaceAllUsesWith(&LI); 02259 delete Placeholder; 02260 } else { 02261 LI.replaceAllUsesWith(V); 02262 } 02263 02264 Pass.DeadInsts.insert(&LI); 02265 deleteIfTriviallyDead(OldOp); 02266 DEBUG(dbgs() << " to: " << *V << "\n"); 02267 return !LI.isVolatile() && !IsPtrAdjusted; 02268 } 02269 02270 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { 02271 if (V->getType() != VecTy) { 02272 unsigned BeginIndex = getIndex(NewBeginOffset); 02273 unsigned EndIndex = getIndex(NewEndOffset); 02274 assert(EndIndex > BeginIndex && "Empty vector!"); 02275 unsigned NumElements = EndIndex - BeginIndex; 02276 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 02277 Type *SliceTy = 02278 (NumElements == 1) ? ElementTy 02279 : VectorType::get(ElementTy, NumElements); 02280 if (V->getType() != SliceTy) 02281 V = convertValue(DL, IRB, V, SliceTy); 02282 02283 // Mix in the existing elements. 02284 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02285 "load"); 02286 V = insertVector(IRB, Old, V, BeginIndex, "vec"); 02287 } 02288 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 02289 Pass.DeadInsts.insert(&SI); 02290 02291 (void)Store; 02292 DEBUG(dbgs() << " to: " << *Store << "\n"); 02293 return true; 02294 } 02295 02296 bool rewriteIntegerStore(Value *V, StoreInst &SI) { 02297 assert(IntTy && "We cannot extract an integer from the alloca"); 02298 assert(!SI.isVolatile()); 02299 if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { 02300 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02301 "oldload"); 02302 Old = convertValue(DL, IRB, Old, IntTy); 02303 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 02304 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 02305 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, 02306 "insert"); 02307 } 02308 V = convertValue(DL, IRB, V, NewAllocaTy); 02309 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); 02310 Pass.DeadInsts.insert(&SI); 02311 (void)Store; 02312 DEBUG(dbgs() << " to: " << *Store << "\n"); 02313 return true; 02314 } 02315 02316 bool visitStoreInst(StoreInst &SI) { 02317 DEBUG(dbgs() << " original: " << SI << "\n"); 02318 Value *OldOp = SI.getOperand(1); 02319 assert(OldOp == OldPtr); 02320 02321 Value *V = SI.getValueOperand(); 02322 02323 // Strip all inbounds GEPs and pointer casts to try to dig out any root 02324 // alloca that should be re-examined after promoting this alloca. 
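// (Illustrative example, not from the original source: storing
//   %p = getelementptr inbounds [4 x i32]* %other, i64 0, i64 2
// through this slice makes stripInBoundsOffsets() walk back to %other, so
// that alloca is queued for another promotion attempt once this one has been
// rewritten.)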
02325 if (V->getType()->isPointerTy()) 02326 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 02327 Pass.PostPromotionWorklist.insert(AI); 02328 02329 if (SliceSize < DL.getTypeStoreSize(V->getType())) { 02330 assert(!SI.isVolatile()); 02331 assert(V->getType()->isIntegerTy() && 02332 "Only integer type loads and stores are split"); 02333 assert(V->getType()->getIntegerBitWidth() == 02334 DL.getTypeStoreSizeInBits(V->getType()) && 02335 "Non-byte-multiple bit width"); 02336 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); 02337 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset, 02338 "extract"); 02339 } 02340 02341 if (VecTy) 02342 return rewriteVectorizedStoreInst(V, SI, OldOp); 02343 if (IntTy && V->getType()->isIntegerTy()) 02344 return rewriteIntegerStore(V, SI); 02345 02346 StoreInst *NewSI; 02347 if (NewBeginOffset == NewAllocaBeginOffset && 02348 NewEndOffset == NewAllocaEndOffset && 02349 canConvertValue(DL, V->getType(), NewAllocaTy)) { 02350 V = convertValue(DL, IRB, V, NewAllocaTy); 02351 NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 02352 SI.isVolatile()); 02353 } else { 02354 Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); 02355 NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), 02356 SI.isVolatile()); 02357 } 02358 (void)NewSI; 02359 Pass.DeadInsts.insert(&SI); 02360 deleteIfTriviallyDead(OldOp); 02361 02362 DEBUG(dbgs() << " to: " << *NewSI << "\n"); 02363 return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); 02364 } 02365 02366 /// \brief Compute an integer value from splatting an i8 across the given 02367 /// number of bytes. 02368 /// 02369 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 02370 /// call this routine. 02371 /// FIXME: Heed the advice above. 02372 /// 02373 /// \param V The i8 value to splat. 02374 /// \param Size The number of bytes in the output (assuming i8 is one byte) 02375 Value *getIntegerSplat(Value *V, unsigned Size) { 02376 assert(Size > 0 && "Expected a positive number of bytes."); 02377 IntegerType *VTy = cast<IntegerType>(V->getType()); 02378 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 02379 if (Size == 1) 02380 return V; 02381 02382 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); 02383 V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), 02384 ConstantExpr::getUDiv( 02385 Constant::getAllOnesValue(SplatIntTy), 02386 ConstantExpr::getZExt( 02387 Constant::getAllOnesValue(V->getType()), 02388 SplatIntTy)), 02389 "isplat"); 02390 return V; 02391 } 02392 02393 /// \brief Compute a vector splat for a given element value. 02394 Value *getVectorSplat(Value *V, unsigned NumElements) { 02395 V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); 02396 DEBUG(dbgs() << " splat: " << *V << "\n"); 02397 return V; 02398 } 02399 02400 bool visitMemSetInst(MemSetInst &II) { 02401 DEBUG(dbgs() << " original: " << II << "\n"); 02402 assert(II.getRawDest() == OldPtr); 02403 02404 // If the memset has a variable size, it cannot be split, just adjust the 02405 // pointer to the new alloca. 
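// (For contrast, an illustrative constant-length case, not from the original
// source: a memset of four 0xAB bytes over a partition rewritten to a single
// i32 ends up below as a plain store of the 32-bit pattern 0xABABABAB, which
// the getIntegerSplat helper above computes as zext(0xAB) * (0xFFFFFFFF udiv
// 0xFF).)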
02406 if (!isa<Constant>(II.getLength())) { 02407 assert(!IsSplit); 02408 assert(NewBeginOffset == BeginOffset); 02409 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); 02410 Type *CstTy = II.getAlignmentCst()->getType(); 02411 II.setAlignment(ConstantInt::get(CstTy, getSliceAlign())); 02412 02413 deleteIfTriviallyDead(OldPtr); 02414 return false; 02415 } 02416 02417 // Record this instruction for deletion. 02418 Pass.DeadInsts.insert(&II); 02419 02420 Type *AllocaTy = NewAI.getAllocatedType(); 02421 Type *ScalarTy = AllocaTy->getScalarType(); 02422 02423 // If this doesn't map cleanly onto the alloca type, and that type isn't 02424 // a single value type, just emit a memset. 02425 if (!VecTy && !IntTy && 02426 (BeginOffset > NewAllocaBeginOffset || 02427 EndOffset < NewAllocaEndOffset || 02428 SliceSize != DL.getTypeStoreSize(AllocaTy) || 02429 !AllocaTy->isSingleValueType() || 02430 !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) || 02431 DL.getTypeSizeInBits(ScalarTy)%8 != 0)) { 02432 Type *SizeTy = II.getLength()->getType(); 02433 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 02434 CallInst *New = IRB.CreateMemSet( 02435 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, 02436 getSliceAlign(), II.isVolatile()); 02437 (void)New; 02438 DEBUG(dbgs() << " to: " << *New << "\n"); 02439 return false; 02440 } 02441 02442 // If we can represent this as a simple value, we have to build the actual 02443 // value to store, which requires expanding the byte present in memset to 02444 // a sensible representation for the alloca type. This is essentially 02445 // splatting the byte to a sufficiently wide integer, splatting it across 02446 // any desired vector width, and bitcasting to the final type. 02447 Value *V; 02448 02449 if (VecTy) { 02450 // If this is a memset of a vectorized alloca, insert it. 02451 assert(ElementTy == ScalarTy); 02452 02453 unsigned BeginIndex = getIndex(NewBeginOffset); 02454 unsigned EndIndex = getIndex(NewEndOffset); 02455 assert(EndIndex > BeginIndex && "Empty vector!"); 02456 unsigned NumElements = EndIndex - BeginIndex; 02457 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 02458 02459 Value *Splat = 02460 getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8); 02461 Splat = convertValue(DL, IRB, Splat, ElementTy); 02462 if (NumElements > 1) 02463 Splat = getVectorSplat(Splat, NumElements); 02464 02465 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02466 "oldload"); 02467 V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); 02468 } else if (IntTy) { 02469 // If this is a memset on an alloca where we can widen stores, insert the 02470 // set integer. 02471 assert(!II.isVolatile()); 02472 02473 uint64_t Size = NewEndOffset - NewBeginOffset; 02474 V = getIntegerSplat(II.getValue(), Size); 02475 02476 if (IntTy && (BeginOffset != NewAllocaBeginOffset || 02477 EndOffset != NewAllocaBeginOffset)) { 02478 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02479 "oldload"); 02480 Old = convertValue(DL, IRB, Old, IntTy); 02481 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 02482 V = insertInteger(DL, IRB, Old, V, Offset, "insert"); 02483 } else { 02484 assert(V->getType() == IntTy && 02485 "Wrong type for an alloca wide integer!"); 02486 } 02487 V = convertValue(DL, IRB, V, AllocaTy); 02488 } else { 02489 // Established these invariants above. 
02490 assert(NewBeginOffset == NewAllocaBeginOffset); 02491 assert(NewEndOffset == NewAllocaEndOffset); 02492 02493 V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8); 02494 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 02495 V = getVectorSplat(V, AllocaVecTy->getNumElements()); 02496 02497 V = convertValue(DL, IRB, V, AllocaTy); 02498 } 02499 02500 Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), 02501 II.isVolatile()); 02502 (void)New; 02503 DEBUG(dbgs() << " to: " << *New << "\n"); 02504 return !II.isVolatile(); 02505 } 02506 02507 bool visitMemTransferInst(MemTransferInst &II) { 02508 // Rewriting of memory transfer instructions can be a bit tricky. We break 02509 // them into two categories: split intrinsics and unsplit intrinsics. 02510 02511 DEBUG(dbgs() << " original: " << II << "\n"); 02512 02513 bool IsDest = &II.getRawDestUse() == OldUse; 02514 assert((IsDest && II.getRawDest() == OldPtr) || 02515 (!IsDest && II.getRawSource() == OldPtr)); 02516 02517 unsigned SliceAlign = getSliceAlign(); 02518 02519 // For unsplit intrinsics, we simply modify the source and destination 02520 // pointers in place. This isn't just an optimization, it is a matter of 02521 // correctness. With unsplit intrinsics we may be dealing with transfers 02522 // within a single alloca before SROA ran, or with transfers that have 02523 // a variable length. We may also be dealing with memmove instead of 02524 // memcpy, and so simply updating the pointers is the necessary for us to 02525 // update both source and dest of a single call. 02526 if (!IsSplittable) { 02527 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 02528 if (IsDest) 02529 II.setDest(AdjustedPtr); 02530 else 02531 II.setSource(AdjustedPtr); 02532 02533 if (II.getAlignment() > SliceAlign) { 02534 Type *CstTy = II.getAlignmentCst()->getType(); 02535 II.setAlignment( 02536 ConstantInt::get(CstTy, MinAlign(II.getAlignment(), SliceAlign))); 02537 } 02538 02539 DEBUG(dbgs() << " to: " << II << "\n"); 02540 deleteIfTriviallyDead(OldPtr); 02541 return false; 02542 } 02543 // For split transfer intrinsics we have an incredibly useful assurance: 02544 // the source and destination do not reside within the same alloca, and at 02545 // least one of them does not escape. This means that we can replace 02546 // memmove with memcpy, and we don't need to worry about all manner of 02547 // downsides to splitting and transforming the operations. 02548 02549 // If this doesn't map cleanly onto the alloca type, and that type isn't 02550 // a single value type, just emit a memcpy. 02551 bool EmitMemCpy = 02552 !VecTy && !IntTy && 02553 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || 02554 SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) || 02555 !NewAI.getAllocatedType()->isSingleValueType()); 02556 02557 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 02558 // size hasn't been shrunk based on analysis of the viable range, this is 02559 // a no-op. 02560 if (EmitMemCpy && &OldAI == &NewAI) { 02561 // Ensure the start lines up. 02562 assert(NewBeginOffset == BeginOffset); 02563 02564 // Rewrite the size as needed. 02565 if (NewEndOffset != EndOffset) 02566 II.setLength(ConstantInt::get(II.getLength()->getType(), 02567 NewEndOffset - NewBeginOffset)); 02568 return false; 02569 } 02570 // Record this instruction for deletion. 
02571 Pass.DeadInsts.insert(&II); 02572 02573 // Strip all inbounds GEPs and pointer casts to try to dig out any root 02574 // alloca that should be re-examined after rewriting this instruction. 02575 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 02576 if (AllocaInst *AI 02577 = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { 02578 assert(AI != &OldAI && AI != &NewAI && 02579 "Splittable transfers cannot reach the same alloca on both ends."); 02580 Pass.Worklist.insert(AI); 02581 } 02582 02583 Type *OtherPtrTy = OtherPtr->getType(); 02584 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); 02585 02586 // Compute the relative offset for the other pointer within the transfer. 02587 unsigned IntPtrWidth = DL.getPointerSizeInBits(OtherAS); 02588 APInt OtherOffset(IntPtrWidth, NewBeginOffset - BeginOffset); 02589 unsigned OtherAlign = MinAlign(II.getAlignment() ? II.getAlignment() : 1, 02590 OtherOffset.zextOrTrunc(64).getZExtValue()); 02591 02592 if (EmitMemCpy) { 02593 // Compute the other pointer, folding as much as possible to produce 02594 // a single, simple GEP in most cases. 02595 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 02596 OtherPtr->getName() + "."); 02597 02598 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 02599 Type *SizeTy = II.getLength()->getType(); 02600 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 02601 02602 CallInst *New = IRB.CreateMemCpy( 02603 IsDest ? OurPtr : OtherPtr, IsDest ? OtherPtr : OurPtr, Size, 02604 MinAlign(SliceAlign, OtherAlign), II.isVolatile()); 02605 (void)New; 02606 DEBUG(dbgs() << " to: " << *New << "\n"); 02607 return false; 02608 } 02609 02610 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && 02611 NewEndOffset == NewAllocaEndOffset; 02612 uint64_t Size = NewEndOffset - NewBeginOffset; 02613 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; 02614 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; 02615 unsigned NumElements = EndIndex - BeginIndex; 02616 IntegerType *SubIntTy 02617 = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : nullptr; 02618 02619 // Reset the other pointer type to match the register type we're going to 02620 // use, but using the address space of the original other pointer. 
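// (Illustrative example, not from the original source: for a partition
// rewritten to <4 x i32>, a split memcpy covering only elements 1 and 2 gets
// OtherPtrTy == <2 x i32>* in the other pointer's address space, so the other
// side of the transfer is simply loaded or stored as a <2 x i32> register
// below.)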
02621 if (VecTy && !IsWholeAlloca) { 02622 if (NumElements == 1) 02623 OtherPtrTy = VecTy->getElementType(); 02624 else 02625 OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); 02626 02627 OtherPtrTy = OtherPtrTy->getPointerTo(OtherAS); 02628 } else if (IntTy && !IsWholeAlloca) { 02629 OtherPtrTy = SubIntTy->getPointerTo(OtherAS); 02630 } else { 02631 OtherPtrTy = NewAllocaTy->getPointerTo(OtherAS); 02632 } 02633 02634 Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 02635 OtherPtr->getName() + "."); 02636 unsigned SrcAlign = OtherAlign; 02637 Value *DstPtr = &NewAI; 02638 unsigned DstAlign = SliceAlign; 02639 if (!IsDest) { 02640 std::swap(SrcPtr, DstPtr); 02641 std::swap(SrcAlign, DstAlign); 02642 } 02643 02644 Value *Src; 02645 if (VecTy && !IsWholeAlloca && !IsDest) { 02646 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02647 "load"); 02648 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); 02649 } else if (IntTy && !IsWholeAlloca && !IsDest) { 02650 Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02651 "load"); 02652 Src = convertValue(DL, IRB, Src, IntTy); 02653 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 02654 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); 02655 } else { 02656 Src = IRB.CreateAlignedLoad(SrcPtr, SrcAlign, II.isVolatile(), 02657 "copyload"); 02658 } 02659 02660 if (VecTy && !IsWholeAlloca && IsDest) { 02661 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02662 "oldload"); 02663 Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); 02664 } else if (IntTy && !IsWholeAlloca && IsDest) { 02665 Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), 02666 "oldload"); 02667 Old = convertValue(DL, IRB, Old, IntTy); 02668 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 02669 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); 02670 Src = convertValue(DL, IRB, Src, NewAllocaTy); 02671 } 02672 02673 StoreInst *Store = cast<StoreInst>( 02674 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); 02675 (void)Store; 02676 DEBUG(dbgs() << " to: " << *Store << "\n"); 02677 return !II.isVolatile(); 02678 } 02679 02680 bool visitIntrinsicInst(IntrinsicInst &II) { 02681 assert(II.getIntrinsicID() == Intrinsic::lifetime_start || 02682 II.getIntrinsicID() == Intrinsic::lifetime_end); 02683 DEBUG(dbgs() << " original: " << II << "\n"); 02684 assert(II.getArgOperand(1) == OldPtr); 02685 02686 // Record this instruction for deletion. 02687 Pass.DeadInsts.insert(&II); 02688 02689 ConstantInt *Size 02690 = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 02691 NewEndOffset - NewBeginOffset); 02692 Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 02693 Value *New; 02694 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 02695 New = IRB.CreateLifetimeStart(Ptr, Size); 02696 else 02697 New = IRB.CreateLifetimeEnd(Ptr, Size); 02698 02699 (void)New; 02700 DEBUG(dbgs() << " to: " << *New << "\n"); 02701 return true; 02702 } 02703 02704 bool visitPHINode(PHINode &PN) { 02705 DEBUG(dbgs() << " original: " << PN << "\n"); 02706 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); 02707 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); 02708 02709 // We would like to compute a new pointer in only one place, but have it be 02710 // as local as possible to the PHI. 
To do that, we re-use the location of 02711 // the old pointer, which necessarily must be in the right position to 02712 // dominate the PHI. 02713 IRBuilderTy PtrBuilder(IRB); 02714 if (isa<PHINode>(OldPtr)) 02715 PtrBuilder.SetInsertPoint(OldPtr->getParent()->getFirstInsertionPt()); 02716 else 02717 PtrBuilder.SetInsertPoint(OldPtr); 02718 PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc()); 02719 02720 Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType()); 02721 // Replace the operands which were using the old pointer. 02722 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 02723 02724 DEBUG(dbgs() << " to: " << PN << "\n"); 02725 deleteIfTriviallyDead(OldPtr); 02726 02727 // PHIs can't be promoted on their own, but often can be speculated. We 02728 // check the speculation outside of the rewriter so that we see the 02729 // fully-rewritten alloca. 02730 PHIUsers.insert(&PN); 02731 return true; 02732 } 02733 02734 bool visitSelectInst(SelectInst &SI) { 02735 DEBUG(dbgs() << " original: " << SI << "\n"); 02736 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && 02737 "Pointer isn't an operand!"); 02738 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); 02739 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); 02740 02741 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 02742 // Replace the operands which were using the old pointer. 02743 if (SI.getOperand(1) == OldPtr) 02744 SI.setOperand(1, NewPtr); 02745 if (SI.getOperand(2) == OldPtr) 02746 SI.setOperand(2, NewPtr); 02747 02748 DEBUG(dbgs() << " to: " << SI << "\n"); 02749 deleteIfTriviallyDead(OldPtr); 02750 02751 // Selects can't be promoted on their own, but often can be speculated. We 02752 // check the speculation outside of the rewriter so that we see the 02753 // fully-rewritten alloca. 02754 SelectUsers.insert(&SI); 02755 return true; 02756 } 02757 02758 }; 02759 } 02760 02761 namespace { 02762 /// \brief Visitor to rewrite aggregate loads and stores as scalar. 02763 /// 02764 /// This pass aggressively rewrites all aggregate loads and stores on 02765 /// a particular pointer (or any pointer derived from it which we can identify) 02766 /// with scalar loads and stores. 02767 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 02768 // Befriend the base class so it can delegate to private visit methods. 02769 friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>; 02770 02771 const DataLayout &DL; 02772 02773 /// Queue of pointer uses to analyze and potentially rewrite. 02774 SmallVector<Use *, 8> Queue; 02775 02776 /// Set to prevent us from cycling with phi nodes and loops. 02777 SmallPtrSet<User *, 8> Visited; 02778 02779 /// The current pointer use being rewritten. This is used to dig up the used 02780 /// value (as opposed to the user). 02781 Use *U; 02782 02783 public: 02784 AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {} 02785 02786 /// Rewrite loads and stores through a pointer and all pointers derived from 02787 /// it. 02788 bool rewrite(Instruction &I) { 02789 DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 02790 enqueueUsers(I); 02791 bool Changed = false; 02792 while (!Queue.empty()) { 02793 U = Queue.pop_back_val(); 02794 Changed |= visit(cast<Instruction>(U->getUser())); 02795 } 02796 return Changed; 02797 } 02798 02799 private: 02800 /// Enqueue all the users of the given instruction for further processing. 
02801 /// This uses a set to de-duplicate users.
02802 void enqueueUsers(Instruction &I) {
02803 for (Use &U : I.uses())
02804 if (Visited.insert(U.getUser()))
02805 Queue.push_back(&U);
02806 }
02807 
02808 // Conservative default is to not rewrite anything.
02809 bool visitInstruction(Instruction &I) { return false; }
02810 
02811 /// \brief Generic recursive split emission class.
02812 template <typename Derived>
02813 class OpSplitter {
02814 protected:
02815 /// The builder used to form new instructions.
02816 IRBuilderTy IRB;
02817 /// The indices which will be used with insert- or extractvalue to select the
02818 /// appropriate value within the aggregate.
02819 SmallVector<unsigned, 4> Indices;
02820 /// The indices to a GEP instruction which will move Ptr to the correct slot
02821 /// within the aggregate.
02822 SmallVector<Value *, 4> GEPIndices;
02823 /// The base pointer of the original op, used as a base for GEPing the
02824 /// split operations.
02825 Value *Ptr;
02826 
02827 /// Initialize the splitter with an insertion point and Ptr, and start with a
02828 /// single zero GEP index.
02829 OpSplitter(Instruction *InsertionPoint, Value *Ptr)
02830 : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
02831 
02832 public:
02833 /// \brief Generic recursive split emission routine.
02834 ///
02835 /// This method recursively splits an aggregate op (load or store) into
02836 /// scalar or vector ops. It splits recursively until it hits a single value
02837 /// and emits that single value operation via the template argument.
02838 ///
02839 /// The logic of this routine relies on GEPs and insertvalue and
02840 /// extractvalue all operating with the same fundamental index list, merely
02841 /// formatted differently (GEPs need actual values).
02842 ///
02843 /// \param Ty The type being split recursively into smaller ops.
02844 /// \param Agg The aggregate value being built up or stored, depending on
02845 /// whether this is splitting a load or a store respectively.
02846 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
02847 if (Ty->isSingleValueType())
02848 return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
02849 
02850 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
02851 unsigned OldSize = Indices.size();
02852 (void)OldSize;
02853 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
02854 ++Idx) {
02855 assert(Indices.size() == OldSize && "Did not return to the old size");
02856 Indices.push_back(Idx);
02857 GEPIndices.push_back(IRB.getInt32(Idx));
02858 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
02859 GEPIndices.pop_back();
02860 Indices.pop_back();
02861 }
02862 return;
02863 }
02864 
02865 if (StructType *STy = dyn_cast<StructType>(Ty)) {
02866 unsigned OldSize = Indices.size();
02867 (void)OldSize;
02868 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
02869 ++Idx) {
02870 assert(Indices.size() == OldSize && "Did not return to the old size");
02871 Indices.push_back(Idx);
02872 GEPIndices.push_back(IRB.getInt32(Idx));
02873 emitSplitOps(STy->getElementType(Idx), Agg, Name + "."
+ Twine(Idx)); 02874 GEPIndices.pop_back(); 02875 Indices.pop_back(); 02876 } 02877 return; 02878 } 02879 02880 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 02881 } 02882 }; 02883 02884 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 02885 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr) 02886 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {} 02887 02888 /// Emit a leaf load of a single value. This is called at the leaves of the 02889 /// recursive emission to actually load values. 02890 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 02891 assert(Ty->isSingleValueType()); 02892 // Load the single value and insert it using the indices. 02893 Value *GEP = IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"); 02894 Value *Load = IRB.CreateLoad(GEP, Name + ".load"); 02895 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 02896 DEBUG(dbgs() << " to: " << *Load << "\n"); 02897 } 02898 }; 02899 02900 bool visitLoadInst(LoadInst &LI) { 02901 assert(LI.getPointerOperand() == *U); 02902 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 02903 return false; 02904 02905 // We have an aggregate being loaded, split it apart. 02906 DEBUG(dbgs() << " original: " << LI << "\n"); 02907 LoadOpSplitter Splitter(&LI, *U); 02908 Value *V = UndefValue::get(LI.getType()); 02909 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 02910 LI.replaceAllUsesWith(V); 02911 LI.eraseFromParent(); 02912 return true; 02913 } 02914 02915 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 02916 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr) 02917 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {} 02918 02919 /// Emit a leaf store of a single value. This is called at the leaves of the 02920 /// recursive emission to actually produce stores. 02921 void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) { 02922 assert(Ty->isSingleValueType()); 02923 // Extract the single value and store it using the indices. 02924 Value *Store = IRB.CreateStore( 02925 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"), 02926 IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep")); 02927 (void)Store; 02928 DEBUG(dbgs() << " to: " << *Store << "\n"); 02929 } 02930 }; 02931 02932 bool visitStoreInst(StoreInst &SI) { 02933 if (!SI.isSimple() || SI.getPointerOperand() != *U) 02934 return false; 02935 Value *V = SI.getValueOperand(); 02936 if (V->getType()->isSingleValueType()) 02937 return false; 02938 02939 // We have an aggregate being stored, split it apart. 02940 DEBUG(dbgs() << " original: " << SI << "\n"); 02941 StoreOpSplitter Splitter(&SI, *U); 02942 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 02943 SI.eraseFromParent(); 02944 return true; 02945 } 02946 02947 bool visitBitCastInst(BitCastInst &BC) { 02948 enqueueUsers(BC); 02949 return false; 02950 } 02951 02952 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 02953 enqueueUsers(GEPI); 02954 return false; 02955 } 02956 02957 bool visitPHINode(PHINode &PN) { 02958 enqueueUsers(PN); 02959 return false; 02960 } 02961 02962 bool visitSelectInst(SelectInst &SI) { 02963 enqueueUsers(SI); 02964 return false; 02965 } 02966 }; 02967 } 02968 02969 /// \brief Strip aggregate type wrapping. 02970 /// 02971 /// This removes no-op aggregate types wrapping an underlying type. It will 02972 /// strip as many layers of types as it can without changing either the type 02973 /// size or the allocated size. 
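// ---------------------------------------------------------------------------
// Editor's aside (not part of the original SROA.cpp): the routine below is
// easiest to follow with a tiny, self-contained model. Everything in this
// block is an illustrative sketch; ToyType, its fields, and stripToyWrapping
// are invented names, and the model ignores struct layout entirely. It only
// shows the shape of the recursion: peel one level of "wrapping" aggregate
// ([1 x T] or a single-element struct) as long as doing so loses neither the
// allocated size nor the bit size of the type.
// ---------------------------------------------------------------------------
namespace {
struct ToyType {
  uint64_t AllocBytes;  // size in memory, including any padding
  uint64_t SizeBits;    // size of the value itself, in bits
  const ToyType *Inner; // the first contained element, or null for scalars
};

// Mirror of the control flow of stripAggregateTypeWrapping on the toy model:
// recurse into the inner type only while no size information would be lost,
// otherwise keep the wrapper.
const ToyType &stripToyWrapping(const ToyType &Ty) {
  if (!Ty.Inner)
    return Ty; // a scalar; nothing left to strip
  const ToyType &In = *Ty.Inner;
  if (Ty.AllocBytes > In.AllocBytes || Ty.SizeBits > In.SizeBits)
    return Ty; // the wrapper carries padding; stripping it would lose size
  return stripToyWrapping(In);
}
} // end anonymous namespace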
02974 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 02975 if (Ty->isSingleValueType()) 02976 return Ty; 02977 02978 uint64_t AllocSize = DL.getTypeAllocSize(Ty); 02979 uint64_t TypeSize = DL.getTypeSizeInBits(Ty); 02980 02981 Type *InnerTy; 02982 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 02983 InnerTy = ArrTy->getElementType(); 02984 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 02985 const StructLayout *SL = DL.getStructLayout(STy); 02986 unsigned Index = SL->getElementContainingOffset(0); 02987 InnerTy = STy->getElementType(Index); 02988 } else { 02989 return Ty; 02990 } 02991 02992 if (AllocSize > DL.getTypeAllocSize(InnerTy) || 02993 TypeSize > DL.getTypeSizeInBits(InnerTy)) 02994 return Ty; 02995 02996 return stripAggregateTypeWrapping(DL, InnerTy); 02997 } 02998 02999 /// \brief Try to find a partition of the aggregate type passed in for a given 03000 /// offset and size. 03001 /// 03002 /// This recurses through the aggregate type and tries to compute a subtype 03003 /// based on the offset and size. When the offset and size span a sub-section 03004 /// of an array, it will even compute a new array type for that sub-section, 03005 /// and the same for structs. 03006 /// 03007 /// Note that this routine is very strict and tries to find a partition of the 03008 /// type which produces the *exact* right offset and size. It is not forgiving 03009 /// when the size or offset cause either end of type-based partition to be off. 03010 /// Also, this is a best-effort routine. It is reasonable to give up and not 03011 /// return a type if necessary. 03012 static Type *getTypePartition(const DataLayout &DL, Type *Ty, 03013 uint64_t Offset, uint64_t Size) { 03014 if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size) 03015 return stripAggregateTypeWrapping(DL, Ty); 03016 if (Offset > DL.getTypeAllocSize(Ty) || 03017 (DL.getTypeAllocSize(Ty) - Offset) < Size) 03018 return nullptr; 03019 03020 if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { 03021 // We can't partition pointers... 03022 if (SeqTy->isPointerTy()) 03023 return nullptr; 03024 03025 Type *ElementTy = SeqTy->getElementType(); 03026 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); 03027 uint64_t NumSkippedElements = Offset / ElementSize; 03028 if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { 03029 if (NumSkippedElements >= ArrTy->getNumElements()) 03030 return nullptr; 03031 } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) { 03032 if (NumSkippedElements >= VecTy->getNumElements()) 03033 return nullptr; 03034 } 03035 Offset -= NumSkippedElements * ElementSize; 03036 03037 // First check if we need to recurse. 03038 if (Offset > 0 || Size < ElementSize) { 03039 // Bail if the partition ends in a different array element. 03040 if ((Offset + Size) > ElementSize) 03041 return nullptr; 03042 // Recurse through the element type trying to peel off offset bytes. 
03043 return getTypePartition(DL, ElementTy, Offset, Size); 03044 } 03045 assert(Offset == 0); 03046 03047 if (Size == ElementSize) 03048 return stripAggregateTypeWrapping(DL, ElementTy); 03049 assert(Size > ElementSize); 03050 uint64_t NumElements = Size / ElementSize; 03051 if (NumElements * ElementSize != Size) 03052 return nullptr; 03053 return ArrayType::get(ElementTy, NumElements); 03054 } 03055 03056 StructType *STy = dyn_cast<StructType>(Ty); 03057 if (!STy) 03058 return nullptr; 03059 03060 const StructLayout *SL = DL.getStructLayout(STy); 03061 if (Offset >= SL->getSizeInBytes()) 03062 return nullptr; 03063 uint64_t EndOffset = Offset + Size; 03064 if (EndOffset > SL->getSizeInBytes()) 03065 return nullptr; 03066 03067 unsigned Index = SL->getElementContainingOffset(Offset); 03068 Offset -= SL->getElementOffset(Index); 03069 03070 Type *ElementTy = STy->getElementType(Index); 03071 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy); 03072 if (Offset >= ElementSize) 03073 return nullptr; // The offset points into alignment padding. 03074 03075 // See if any partition must be contained by the element. 03076 if (Offset > 0 || Size < ElementSize) { 03077 if ((Offset + Size) > ElementSize) 03078 return nullptr; 03079 return getTypePartition(DL, ElementTy, Offset, Size); 03080 } 03081 assert(Offset == 0); 03082 03083 if (Size == ElementSize) 03084 return stripAggregateTypeWrapping(DL, ElementTy); 03085 03086 StructType::element_iterator EI = STy->element_begin() + Index, 03087 EE = STy->element_end(); 03088 if (EndOffset < SL->getSizeInBytes()) { 03089 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 03090 if (Index == EndIndex) 03091 return nullptr; // Within a single element and its padding. 03092 03093 // Don't try to form "natural" types if the elements don't line up with the 03094 // expected size. 03095 // FIXME: We could potentially recurse down through the last element in the 03096 // sub-struct to find a natural end point. 03097 if (SL->getElementOffset(EndIndex) != EndOffset) 03098 return nullptr; 03099 03100 assert(Index < EndIndex); 03101 EE = STy->element_begin() + EndIndex; 03102 } 03103 03104 // Try to build up a sub-structure. 03105 StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE), 03106 STy->isPacked()); 03107 const StructLayout *SubSL = DL.getStructLayout(SubTy); 03108 if (Size != SubSL->getSizeInBytes()) 03109 return nullptr; // The sub-struct doesn't have quite the size needed. 03110 03111 return SubTy; 03112 } 03113 03114 /// \brief Rewrite an alloca partition's users. 03115 /// 03116 /// This routine drives both of the rewriting goals of the SROA pass. It tries 03117 /// to rewrite uses of an alloca partition to be conducive for SSA value 03118 /// promotion. If the partition needs a new, more refined alloca, this will 03119 /// build that new alloca, preserving as much type information as possible, and 03120 /// rewrite the uses of the old alloca to point at the new one and have the 03121 /// appropriate new offsets. It also evaluates how successful the rewrite was 03122 /// at enabling promotion and if it was successful queues the alloca to be 03123 /// promoted. 
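// ---------------------------------------------------------------------------
// Editor's aside (not part of the original SROA.cpp): getTypePartition above
// is the main source of the "natural" partition types that rewritePartition
// below tries first. A tiny, self-contained sketch of its array case may
// help; toyArrayPartition and its parameters are invented names, and the
// sketch only answers "how many whole elements does [Offset, Offset + Size)
// cover", returning 0 where the real routine would either recurse into a
// single element or give up.
// ---------------------------------------------------------------------------
namespace {
uint64_t toyArrayPartition(uint64_t ElementSize, uint64_t NumElements,
                           uint64_t Offset, uint64_t Size) {
  uint64_t Skipped = Offset / ElementSize;
  if (Skipped >= NumElements)
    return 0; // starts past the end of the array
  Offset -= Skipped * ElementSize;
  if (Offset != 0)
    return 0; // starts inside an element; the real code recurses into that
              // element when the request fits within it, and bails otherwise
  if (Size % ElementSize != 0)
    return 0; // ends inside an element; no exact element-aligned partition
  uint64_t N = Size / ElementSize;
  return Skipped + N <= NumElements ? N : 0;
}
} // end anonymous namespace
// For example, with ElementSize = 4 and NumElements = 8 (think [8 x i32]),
// Offset = 8 and Size = 12 yield 3, matching the [3 x i32] sub-array the real
// routine would build, while Offset = 10 and Size = 12 yield 0.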
03124 bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S, 03125 AllocaSlices::iterator B, AllocaSlices::iterator E, 03126 int64_t BeginOffset, int64_t EndOffset, 03127 ArrayRef<AllocaSlices::iterator> SplitUses) { 03128 assert(BeginOffset < EndOffset); 03129 uint64_t SliceSize = EndOffset - BeginOffset; 03130 03131 // Try to compute a friendly type for this partition of the alloca. This 03132 // won't always succeed, in which case we fall back to a legal integer type 03133 // or an i8 array of an appropriate size. 03134 Type *SliceTy = nullptr; 03135 if (Type *CommonUseTy = findCommonType(B, E, EndOffset)) 03136 if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize) 03137 SliceTy = CommonUseTy; 03138 if (!SliceTy) 03139 if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(), 03140 BeginOffset, SliceSize)) 03141 SliceTy = TypePartitionTy; 03142 if ((!SliceTy || (SliceTy->isArrayTy() && 03143 SliceTy->getArrayElementType()->isIntegerTy())) && 03144 DL->isLegalInteger(SliceSize * 8)) 03145 SliceTy = Type::getIntNTy(*C, SliceSize * 8); 03146 if (!SliceTy) 03147 SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize); 03148 assert(DL->getTypeAllocSize(SliceTy) >= SliceSize); 03149 03150 bool IsVectorPromotable = isVectorPromotionViable( 03151 *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses); 03152 03153 bool IsIntegerPromotable = 03154 !IsVectorPromotable && 03155 isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses); 03156 03157 // Check for the case where we're going to rewrite to a new alloca of the 03158 // exact same type as the original, and with the same access offsets. In that 03159 // case, re-use the existing alloca, but still run through the rewriter to 03160 // perform phi and select speculation. 03161 AllocaInst *NewAI; 03162 if (SliceTy == AI.getAllocatedType()) { 03163 assert(BeginOffset == 0 && 03164 "Non-zero begin offset but same alloca type"); 03165 NewAI = &AI; 03166 // FIXME: We should be able to bail at this point with "nothing changed". 03167 // FIXME: We might want to defer PHI speculation until after here. 03168 } else { 03169 unsigned Alignment = AI.getAlignment(); 03170 if (!Alignment) { 03171 // The minimum alignment which users can rely on when the explicit 03172 // alignment is omitted or zero is that required by the ABI for this 03173 // type. 03174 Alignment = DL->getABITypeAlignment(AI.getAllocatedType()); 03175 } 03176 Alignment = MinAlign(Alignment, BeginOffset); 03177 // If we will get at least this much alignment from the type alone, leave 03178 // the alloca's alignment unconstrained. 03179 if (Alignment <= DL->getABITypeAlignment(SliceTy)) 03180 Alignment = 0; 03181 NewAI = new AllocaInst(SliceTy, nullptr, Alignment, 03182 AI.getName() + ".sroa." + Twine(B - S.begin()), &AI); 03183 ++NumNewAllocas; 03184 } 03185 03186 DEBUG(dbgs() << "Rewriting alloca partition " 03187 << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI 03188 << "\n"); 03189 03190 // Track the high watermark on the worklist as it is only relevant for 03191 // promoted allocas. We will reset it to this point if the alloca is not in 03192 // fact scheduled for promotion. 
03193 unsigned PPWOldSize = PostPromotionWorklist.size(); 03194 unsigned NumUses = 0; 03195 SmallPtrSet<PHINode *, 8> PHIUsers; 03196 SmallPtrSet<SelectInst *, 8> SelectUsers; 03197 03198 AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset, 03199 EndOffset, IsVectorPromotable, 03200 IsIntegerPromotable, PHIUsers, SelectUsers); 03201 bool Promotable = true; 03202 for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(), 03203 SUE = SplitUses.end(); 03204 SUI != SUE; ++SUI) { 03205 DEBUG(dbgs() << " rewriting split "); 03206 DEBUG(S.printSlice(dbgs(), *SUI, "")); 03207 Promotable &= Rewriter.visit(*SUI); 03208 ++NumUses; 03209 } 03210 for (AllocaSlices::iterator I = B; I != E; ++I) { 03211 DEBUG(dbgs() << " rewriting "); 03212 DEBUG(S.printSlice(dbgs(), I, "")); 03213 Promotable &= Rewriter.visit(I); 03214 ++NumUses; 03215 } 03216 03217 NumAllocaPartitionUses += NumUses; 03218 MaxUsesPerAllocaPartition = 03219 std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); 03220 03221 // Now that we've processed all the slices in the new partition, check if any 03222 // PHIs or Selects would block promotion. 03223 for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), 03224 E = PHIUsers.end(); 03225 I != E; ++I) 03226 if (!isSafePHIToSpeculate(**I, DL)) { 03227 Promotable = false; 03228 PHIUsers.clear(); 03229 SelectUsers.clear(); 03230 break; 03231 } 03232 for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), 03233 E = SelectUsers.end(); 03234 I != E; ++I) 03235 if (!isSafeSelectToSpeculate(**I, DL)) { 03236 Promotable = false; 03237 PHIUsers.clear(); 03238 SelectUsers.clear(); 03239 break; 03240 } 03241 03242 if (Promotable) { 03243 if (PHIUsers.empty() && SelectUsers.empty()) { 03244 // Promote the alloca. 03245 PromotableAllocas.push_back(NewAI); 03246 } else { 03247 // If we have either PHIs or Selects to speculate, add them to those 03248 // worklists and re-queue the new alloca so that we promote in on the 03249 // next iteration. 03250 for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), 03251 E = PHIUsers.end(); 03252 I != E; ++I) 03253 SpeculatablePHIs.insert(*I); 03254 for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), 03255 E = SelectUsers.end(); 03256 I != E; ++I) 03257 SpeculatableSelects.insert(*I); 03258 Worklist.insert(NewAI); 03259 } 03260 } else { 03261 // If we can't promote the alloca, iterate on it to check for new 03262 // refinements exposed by splitting the current alloca. Don't iterate on an 03263 // alloca which didn't actually change and didn't get promoted. 03264 if (NewAI != &AI) 03265 Worklist.insert(NewAI); 03266 03267 // Drop any post-promotion work items if promotion didn't happen. 03268 while (PostPromotionWorklist.size() > PPWOldSize) 03269 PostPromotionWorklist.pop_back(); 03270 } 03271 03272 return true; 03273 } 03274 03275 static void 03276 removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses, 03277 uint64_t &MaxSplitUseEndOffset, uint64_t Offset) { 03278 if (Offset >= MaxSplitUseEndOffset) { 03279 SplitUses.clear(); 03280 MaxSplitUseEndOffset = 0; 03281 return; 03282 } 03283 03284 size_t SplitUsesOldSize = SplitUses.size(); 03285 SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(), 03286 [Offset](const AllocaSlices::iterator &I) { 03287 return I->endOffset() <= Offset; 03288 }), 03289 SplitUses.end()); 03290 if (SplitUsesOldSize == SplitUses.size()) 03291 return; 03292 03293 // Recompute the max. While this is linear, so is remove_if. 
03294   MaxSplitUseEndOffset = 0;
03295   for (SmallVectorImpl<AllocaSlices::iterator>::iterator
03296            SUI = SplitUses.begin(),
03297            SUE = SplitUses.end();
03298        SUI != SUE; ++SUI)
03299     MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset);
03300 }
03301 
03302 /// \brief Walks the slices of an alloca and forms partitions based on them,
03303 /// rewriting each of their uses.
03304 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
03305   if (S.begin() == S.end())
03306     return false;
03307 
03308   unsigned NumPartitions = 0;
03309   bool Changed = false;
03310   SmallVector<AllocaSlices::iterator, 4> SplitUses;
03311   uint64_t MaxSplitUseEndOffset = 0;
03312 
03313   uint64_t BeginOffset = S.begin()->beginOffset();
03314 
03315   for (AllocaSlices::iterator SI = S.begin(), SJ = std::next(SI), SE = S.end();
03316        SI != SE; SI = SJ) {
03317     uint64_t MaxEndOffset = SI->endOffset();
03318 
03319     if (!SI->isSplittable()) {
03320       // When we're forming an unsplittable region, it must always start at the
03321       // first slice and will extend through its end.
03322       assert(BeginOffset == SI->beginOffset());
03323 
03324       // Form a partition including all of the overlapping slices with this
03325       // unsplittable slice.
03326       while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
03327         if (!SJ->isSplittable())
03328           MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
03329         ++SJ;
03330       }
03331     } else {
03332       assert(SI->isSplittable()); // Established above.
03333 
03334       // Collect all of the overlapping splittable slices.
03335       while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
03336              SJ->isSplittable()) {
03337         MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
03338         ++SJ;
03339       }
03340 
03341       // Back up MaxEndOffset and SJ if we ended the span early when
03342       // encountering an unsplittable slice.
03343       if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
03344         assert(!SJ->isSplittable());
03345         MaxEndOffset = SJ->beginOffset();
03346       }
03347     }
03348 
03349     // Check if we have managed to move the end offset forward yet. If so,
03350     // we'll have to rewrite uses and erase old split uses.
03351     if (BeginOffset < MaxEndOffset) {
03352       // Rewrite a sequence of overlapping slices.
03353       Changed |=
03354           rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses);
03355       ++NumPartitions;
03356 
03357       removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
03358     }
03359 
03360     // Accumulate all the splittable slices from the [SI,SJ) region which
03361     // overlap going forward.
03362     for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
03363       if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
03364         SplitUses.push_back(SK);
03365         MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
03366       }
03367 
03368     // If we're already at the end and we have no split uses, we're done.
03369     if (SJ == SE && SplitUses.empty())
03370       break;
03371 
03372     // If we have no split uses or no gap in offsets, we're ready to move to
03373     // the next slice.
03374     if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
03375       BeginOffset = SJ->beginOffset();
03376       continue;
03377     }
03378 
03379     // Even if we have split slices, if the next slice is splittable and the
03380     // split slices reach it, we can simply set up the beginning offset of the
03381     // next iteration to bridge between them.
03382 if (SJ != SE && SJ->isSplittable() && 03383 MaxSplitUseEndOffset > SJ->beginOffset()) { 03384 BeginOffset = MaxEndOffset; 03385 continue; 03386 } 03387 03388 // Otherwise, we have a tail of split slices. Rewrite them with an empty 03389 // range of slices. 03390 uint64_t PostSplitEndOffset = 03391 SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset(); 03392 03393 Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset, 03394 SplitUses); 03395 ++NumPartitions; 03396 03397 if (SJ == SE) 03398 break; // Skip the rest, we don't need to do any cleanup. 03399 03400 removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, 03401 PostSplitEndOffset); 03402 03403 // Now just reset the begin offset for the next iteration. 03404 BeginOffset = SJ->beginOffset(); 03405 } 03406 03407 NumAllocaPartitions += NumPartitions; 03408 MaxPartitionsPerAlloca = 03409 std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); 03410 03411 return Changed; 03412 } 03413 03414 /// \brief Clobber a use with undef, deleting the used value if it becomes dead. 03415 void SROA::clobberUse(Use &U) { 03416 Value *OldV = U; 03417 // Replace the use with an undef value. 03418 U = UndefValue::get(OldV->getType()); 03419 03420 // Check for this making an instruction dead. We have to garbage collect 03421 // all the dead instructions to ensure the uses of any alloca end up being 03422 // minimal. 03423 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 03424 if (isInstructionTriviallyDead(OldI)) { 03425 DeadInsts.insert(OldI); 03426 } 03427 } 03428 03429 /// \brief Analyze an alloca for SROA. 03430 /// 03431 /// This analyzes the alloca to ensure we can reason about it, builds 03432 /// the slices of the alloca, and then hands it off to be split and 03433 /// rewritten as needed. 03434 bool SROA::runOnAlloca(AllocaInst &AI) { 03435 DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 03436 ++NumAllocasAnalyzed; 03437 03438 // Special case dead allocas, as they're trivial. 03439 if (AI.use_empty()) { 03440 AI.eraseFromParent(); 03441 return true; 03442 } 03443 03444 // Skip alloca forms that this analysis can't handle. 03445 if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() || 03446 DL->getTypeAllocSize(AI.getAllocatedType()) == 0) 03447 return false; 03448 03449 bool Changed = false; 03450 03451 // First, split any FCA loads and stores touching this alloca to promote 03452 // better splitting and promotion opportunities. 03453 AggLoadStoreRewriter AggRewriter(*DL); 03454 Changed |= AggRewriter.rewrite(AI); 03455 03456 // Build the slices using a recursive instruction-visiting builder. 03457 AllocaSlices S(*DL, AI); 03458 DEBUG(S.print(dbgs())); 03459 if (S.isEscaped()) 03460 return Changed; 03461 03462 // Delete all the dead users of this alloca before splitting and rewriting it. 03463 for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(), 03464 DE = S.dead_user_end(); 03465 DI != DE; ++DI) { 03466 // Free up everything used by this instruction. 03467 for (Use &DeadOp : (*DI)->operands()) 03468 clobberUse(DeadOp); 03469 03470 // Now replace the uses of this instruction. 03471 (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType())); 03472 03473 // And mark it for deletion. 03474 DeadInsts.insert(*DI); 03475 Changed = true; 03476 } 03477 for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(), 03478 DE = S.dead_op_end(); 03479 DO != DE; ++DO) { 03480 clobberUse(**DO); 03481 Changed = true; 03482 } 03483 03484 // No slices to split. 
Leave the dead alloca for a later pass to clean up. 03485 if (S.begin() == S.end()) 03486 return Changed; 03487 03488 Changed |= splitAlloca(AI, S); 03489 03490 DEBUG(dbgs() << " Speculating PHIs\n"); 03491 while (!SpeculatablePHIs.empty()) 03492 speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val()); 03493 03494 DEBUG(dbgs() << " Speculating Selects\n"); 03495 while (!SpeculatableSelects.empty()) 03496 speculateSelectInstLoads(*SpeculatableSelects.pop_back_val()); 03497 03498 return Changed; 03499 } 03500 03501 /// \brief Delete the dead instructions accumulated in this run. 03502 /// 03503 /// Recursively deletes the dead instructions we've accumulated. This is done 03504 /// at the very end to maximize locality of the recursive delete and to 03505 /// minimize the problems of invalidated instruction pointers as such pointers 03506 /// are used heavily in the intermediate stages of the algorithm. 03507 /// 03508 /// We also record the alloca instructions deleted here so that they aren't 03509 /// subsequently handed to mem2reg to promote. 03510 void SROA::deleteDeadInstructions(SmallPtrSetImpl<AllocaInst*> &DeletedAllocas) { 03511 while (!DeadInsts.empty()) { 03512 Instruction *I = DeadInsts.pop_back_val(); 03513 DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 03514 03515 I->replaceAllUsesWith(UndefValue::get(I->getType())); 03516 03517 for (Use &Operand : I->operands()) 03518 if (Instruction *U = dyn_cast<Instruction>(Operand)) { 03519 // Zero out the operand and see if it becomes trivially dead. 03520 Operand = nullptr; 03521 if (isInstructionTriviallyDead(U)) 03522 DeadInsts.insert(U); 03523 } 03524 03525 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 03526 DeletedAllocas.insert(AI); 03527 03528 ++NumDeleted; 03529 I->eraseFromParent(); 03530 } 03531 } 03532 03533 static void enqueueUsersInWorklist(Instruction &I, 03534 SmallVectorImpl<Instruction *> &Worklist, 03535 SmallPtrSetImpl<Instruction *> &Visited) { 03536 for (User *U : I.users()) 03537 if (Visited.insert(cast<Instruction>(U))) 03538 Worklist.push_back(cast<Instruction>(U)); 03539 } 03540 03541 /// \brief Promote the allocas, using the best available technique. 03542 /// 03543 /// This attempts to promote whatever allocas have been identified as viable in 03544 /// the PromotableAllocas list. If that list is empty, there is nothing to do. 03545 /// If there is a domtree available, we attempt to promote using the full power 03546 /// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is 03547 /// based on the SSAUpdater utilities. This function returns whether any 03548 /// promotion occurred. 03549 bool SROA::promoteAllocas(Function &F) { 03550 if (PromotableAllocas.empty()) 03551 return false; 03552 03553 NumPromoted += PromotableAllocas.size(); 03554 03555 if (DT && !ForceSSAUpdater) { 03556 DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 03557 PromoteMemToReg(PromotableAllocas, *DT, nullptr, AT); 03558 PromotableAllocas.clear(); 03559 return true; 03560 } 03561 03562 DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n"); 03563 SSAUpdater SSA; 03564 DIBuilder DIB(*F.getParent()); 03565 SmallVector<Instruction *, 64> Insts; 03566 03567 // We need a worklist to walk the uses of each alloca. 
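// -------------------------------------------------------------------------
// Editor's aside (not part of the original SROA.cpp): the loop below is the
// classic worklist-plus-visited-set traversal, the same shape used by
// enqueueUsersInWorklist above. Sketched standalone (kept inside a comment
// because we are in the middle of a function; ToyNode and walkToyUsers are
// invented names):
//
//   struct ToyNode { SmallVector<ToyNode *, 4> Users; };
//
//   void walkToyUsers(ToyNode &Root) {
//     SmallVector<ToyNode *, 8> Worklist;
//     SmallPtrSet<ToyNode *, 8> Visited;
//     for (ToyNode *U : Root.Users)
//       if (Visited.insert(U))
//         Worklist.push_back(U);
//     while (!Worklist.empty()) {
//       ToyNode *N = Worklist.pop_back_val();
//       // ... inspect N here, deciding whether it is a load, a store, or
//       // something to look through ...
//       for (ToyNode *U : N->Users)
//         if (Visited.insert(U))
//           Worklist.push_back(U);
//     }
//   }
// -------------------------------------------------------------------------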
03568 SmallVector<Instruction *, 8> Worklist; 03569 SmallPtrSet<Instruction *, 8> Visited; 03570 SmallVector<Instruction *, 32> DeadInsts; 03571 03572 for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) { 03573 AllocaInst *AI = PromotableAllocas[Idx]; 03574 Insts.clear(); 03575 Worklist.clear(); 03576 Visited.clear(); 03577 03578 enqueueUsersInWorklist(*AI, Worklist, Visited); 03579 03580 while (!Worklist.empty()) { 03581 Instruction *I = Worklist.pop_back_val(); 03582 03583 // FIXME: Currently the SSAUpdater infrastructure doesn't reason about 03584 // lifetime intrinsics and so we strip them (and the bitcasts+GEPs 03585 // leading to them) here. Eventually it should use them to optimize the 03586 // scalar values produced. 03587 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 03588 assert(II->getIntrinsicID() == Intrinsic::lifetime_start || 03589 II->getIntrinsicID() == Intrinsic::lifetime_end); 03590 II->eraseFromParent(); 03591 continue; 03592 } 03593 03594 // Push the loads and stores we find onto the list. SROA will already 03595 // have validated that all loads and stores are viable candidates for 03596 // promotion. 03597 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 03598 assert(LI->getType() == AI->getAllocatedType()); 03599 Insts.push_back(LI); 03600 continue; 03601 } 03602 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 03603 assert(SI->getValueOperand()->getType() == AI->getAllocatedType()); 03604 Insts.push_back(SI); 03605 continue; 03606 } 03607 03608 // For everything else, we know that only no-op bitcasts and GEPs will 03609 // make it this far, just recurse through them and recall them for later 03610 // removal. 03611 DeadInsts.push_back(I); 03612 enqueueUsersInWorklist(*I, Worklist, Visited); 03613 } 03614 AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts); 03615 while (!DeadInsts.empty()) 03616 DeadInsts.pop_back_val()->eraseFromParent(); 03617 AI->eraseFromParent(); 03618 } 03619 03620 PromotableAllocas.clear(); 03621 return true; 03622 } 03623 03624 bool SROA::runOnFunction(Function &F) { 03625 if (skipOptnoneFunction(F)) 03626 return false; 03627 03628 DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 03629 C = &F.getContext(); 03630 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 03631 if (!DLP) { 03632 DEBUG(dbgs() << " Skipping SROA -- no target data!\n"); 03633 return false; 03634 } 03635 DL = &DLP->getDataLayout(); 03636 DominatorTreeWrapperPass *DTWP = 03637 getAnalysisIfAvailable<DominatorTreeWrapperPass>(); 03638 DT = DTWP ? &DTWP->getDomTree() : nullptr; 03639 AT = &getAnalysis<AssumptionTracker>(); 03640 03641 BasicBlock &EntryBB = F.getEntryBlock(); 03642 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); 03643 I != E; ++I) 03644 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) 03645 Worklist.insert(AI); 03646 03647 bool Changed = false; 03648 // A set of deleted alloca instruction pointers which should be removed from 03649 // the list of promotable allocas. 03650 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 03651 03652 do { 03653 while (!Worklist.empty()) { 03654 Changed |= runOnAlloca(*Worklist.pop_back_val()); 03655 deleteDeadInstructions(DeletedAllocas); 03656 03657 // Remove the deleted allocas from various lists so that we don't try to 03658 // continue processing them. 
03659       if (!DeletedAllocas.empty()) {
03660         auto IsInSet = [&](AllocaInst *AI) {
03661           return DeletedAllocas.count(AI);
03662         };
03663         Worklist.remove_if(IsInSet);
03664         PostPromotionWorklist.remove_if(IsInSet);
03665         PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
03666                                                PromotableAllocas.end(),
03667                                                IsInSet),
03668                                 PromotableAllocas.end());
03669         DeletedAllocas.clear();
03670       }
03671     }
03672 
03673     Changed |= promoteAllocas(F);
03674 
03675     Worklist = PostPromotionWorklist;
03676     PostPromotionWorklist.clear();
03677   } while (!Worklist.empty());
03678 
03679   return Changed;
03680 }
03681 
03682 void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
03683   AU.addRequired<AssumptionTracker>();
03684   if (RequiresDomTree)
03685     AU.addRequired<DominatorTreeWrapperPass>();
03686   AU.setPreservesCFG();
03687 }
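// ---------------------------------------------------------------------------
// Editor's aside (not part of the original SROA.cpp): a minimal,
// self-contained sketch of the fixed-point driver structure that
// SROA::runOnFunction above uses. ToyDriver, processOne, and promoteAll are
// invented stand-ins for the pass's worklist, runOnAlloca, and
// promoteAllocas; the point is only the shape of the iteration: drain the
// worklist, attempt promotion, then adopt whatever got queued for after
// promotion as the next round's work, until nothing is left.
// ---------------------------------------------------------------------------
namespace {
struct ToyDriver {
  SmallVector<int, 8> Worklist;
  SmallVector<int, 8> PostPromotionWorklist;

  // Stand-in for runOnAlloca: pretend each positive item exposes one smaller
  // follow-up item that can only be handled after promotion.
  bool processOne(int Item) {
    if (Item > 0)
      PostPromotionWorklist.push_back(Item - 1);
    return true;
  }

  // Stand-in for promoteAllocas.
  bool promoteAll() { return !PostPromotionWorklist.empty(); }

  bool run() {
    bool Changed = false;
    do {
      while (!Worklist.empty())
        Changed |= processOne(Worklist.pop_back_val());

      Changed |= promoteAll();

      // Anything queued during this round becomes the next round's work.
      Worklist = PostPromotionWorklist;
      PostPromotionWorklist.clear();
    } while (!Worklist.empty());
    return Changed;
  }
};
} // end anonymous namespace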