LLVM API Documentation
//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs a strength reduction on array references inside loops that
// have as one or more of their components the loop induction variable; it
// rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated and filtered.
//
// TODO: Handle multiple loops at a time.
//
// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
//       of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
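//
// (Editorial illustration, not part of the original source: instead of
// materializing both %r and %neg = mul %r, -1 so that %a = add %b, %neg can
// be formed, the same value can be computed as %a = sub %b, %r, which avoids
// keeping the negated copy live in a register.)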
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
using namespace llvm;

#define DEBUG_TYPE "loop-reduce"

/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
/// bail out. This threshold is far beyond the number of users that LSR can
/// conceivably solve, so it should not affect generated code, but catches the
/// worst cases before LSR burns too much compile time and stack space.
static const unsigned MaxIVUsers = 200;

// Temporary flag to clean up congruent phis after LSR phi expansion.
// It's currently disabled until we can determine whether it's truly useful or
// not. The flag should be removed after the v3.0 release.
// This is now needed for ivchains.
static cl::opt<bool> EnablePhiElim(
  "enable-lsr-phielim", cl::Hidden, cl::init(true),
  cl::desc("Enable LSR phi elimination"));

#ifndef NDEBUG
// Stress test IV chain generation.
static cl::opt<bool> StressIVChain(
  "stress-ivchain", cl::Hidden, cl::init(false),
  cl::desc("Stress test LSR IV chains"));
#else
static bool StressIVChain = false;
#endif

namespace {

/// RegSortData - This class holds data which is used to order reuse candidates.
class RegSortData {
public:
  /// UsedByIndices - This represents the set of LSRUse indices which reference
  /// a particular register.
  SmallBitVector UsedByIndices;

  RegSortData() {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << UsedByIndices.count() << ']';
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

/// RegUseTracker - Map register candidates to information about how they are
/// used.
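/// (Editorial sketch, not part of the original source: typical usage is
///   RegUses.CountRegister(Reg, LUIdx);                  // use #LUIdx needs Reg
///   if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
///     ...                                               // Reg is shared
/// where LUIdx indexes the LSRUse list built later in this file.)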
class RegUseTracker {
  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;

  RegUsesTy RegUsesMap;
  SmallVector<const SCEV *, 16> RegSequence;

public:
  void CountRegister(const SCEV *Reg, size_t LUIdx);
  void DropRegister(const SCEV *Reg, size_t LUIdx);
  void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx);

  bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;

  const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;

  void clear();

  typedef SmallVectorImpl<const SCEV *>::iterator iterator;
  typedef SmallVectorImpl<const SCEV *>::const_iterator const_iterator;
  iterator begin() { return RegSequence.begin(); }
  iterator end()   { return RegSequence.end(); }
  const_iterator begin() const { return RegSequence.begin(); }
  const_iterator end() const   { return RegSequence.end(); }
};

}

void
RegUseTracker::CountRegister(const SCEV *Reg, size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &RSD = Pair.first->second;
  if (Pair.second)
    RegSequence.push_back(Reg);
  RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
  RSD.UsedByIndices.set(LUIdx);
}

void
RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) {
  RegUsesTy::iterator It = RegUsesMap.find(Reg);
  assert(It != RegUsesMap.end());
  RegSortData &RSD = It->second;
  assert(RSD.UsedByIndices.size() > LUIdx);
  RSD.UsedByIndices.reset(LUIdx);
}

void
RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
  assert(LUIdx <= LastLUIdx);

  // Update RegUses. The data structure is not optimized for this purpose;
  // we must iterate through it and update each of the bit vectors.
  for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end();
       I != E; ++I) {
    SmallBitVector &UsedByIndices = I->second.UsedByIndices;
    if (LUIdx < UsedByIndices.size())
      UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0;
    UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
  }
}

bool
RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  if (I == RegUsesMap.end())
    return false;
  const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
  int i = UsedByIndices.find_first();
  if (i == -1) return false;
  if ((size_t)i != LUIdx) return true;
  return UsedByIndices.find_next(i) != -1;
}

const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
  RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
  assert(I != RegUsesMap.end() && "Unknown register!");
  return I->second.UsedByIndices;
}

void RegUseTracker::clear() {
  RegUsesMap.clear();
  RegSequence.clear();
}

namespace {

/// Formula - This class holds information that describes a formula for
/// computing a value satisfying a use. It may include broken-out immediates
/// and scaled registers.
struct Formula {
  /// Global base address used for complex addressing.
  GlobalValue *BaseGV;

  /// Base offset for complex addressing.
  int64_t BaseOffset;

  /// Whether any complex addressing has a base register.
  bool HasBaseReg;

  /// The scale of any complex addressing.
  int64_t Scale;

  /// BaseRegs - The list of "base" registers for this use. When this is
  /// non-empty, the canonical representation of a formula is
  /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
  /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
  /// #1 enforces that the scaled register is always used when at least two
  /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
  /// #2 enforces that 1 * reg is reg.
  /// This invariant can be temporarily broken while building a formula.
  /// However, every formula inserted into the LSRInstance must be in canonical
  /// form.
  SmallVector<const SCEV *, 4> BaseRegs;

  /// ScaledReg - The 'scaled' register for this use. This should be non-null
  /// when Scale is not zero.
  const SCEV *ScaledReg;

  /// UnfoldedOffset - An additional constant offset which is added near the
  /// use. This requires a temporary register, but the offset itself can
  /// live in an add immediate field rather than a register.
  int64_t UnfoldedOffset;

  Formula()
      : BaseGV(nullptr), BaseOffset(0), HasBaseReg(false), Scale(0),
        ScaledReg(nullptr), UnfoldedOffset(0) {}

  void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);

  bool isCanonical() const;

  void Canonicalize();

  bool Unscale();

  size_t getNumRegs() const;
  Type *getType() const;

  void DeleteBaseReg(const SCEV *&S);

  bool referencesReg(const SCEV *S) const;
  bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                  const RegUseTracker &RegUses) const;

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// DoInitialMatch - Recursion helper for InitialMatch.
static void DoInitialMatch(const SCEV *S, Loop *L,
                           SmallVectorImpl<const SCEV *> &Good,
                           SmallVectorImpl<const SCEV *> &Bad,
                           ScalarEvolution &SE) {
  // Collect expressions which properly dominate the loop header.
  if (SE.properlyDominates(S, L->getHeader())) {
    Good.push_back(S);
    return;
  }

  // Look at add operands.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I)
      DoInitialMatch(*I, L, Good, Bad, SE);
    return;
  }

  // Look at addrec operands.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
    if (!AR->getStart()->isZero()) {
      DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
      DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
                                      AR->getStepRecurrence(SE),
                                      // FIXME: AR->getNoWrapFlags()
                                      AR->getLoop(), SCEV::FlagAnyWrap),
                     L, Good, Bad, SE);
      return;
    }

  // Handle a multiplication by -1 (negation) if it didn't fold.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
    if (Mul->getOperand(0)->isAllOnesValue()) {
      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
      const SCEV *NewMul = SE.getMulExpr(Ops);

      SmallVector<const SCEV *, 4> MyGood;
      SmallVector<const SCEV *, 4> MyBad;
      DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
        SE.getEffectiveSCEVType(NewMul->getType())));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
           E = MyGood.end(); I != E; ++I)
        Good.push_back(SE.getMulExpr(NegOne, *I));
      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
           E = MyBad.end(); I != E; ++I)
        Bad.push_back(SE.getMulExpr(NegOne, *I));
      return;
    }

  // Ok, we can't do anything interesting. Just stuff the whole thing into a
  // register and hope for the best.
  Bad.push_back(S);
}

/// InitialMatch - Incorporate loop-variant parts of S into this Formula,
/// attempting to keep all loop-invariant and loop-computable values in a
/// single base register.
void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
  SmallVector<const SCEV *, 4> Good;
  SmallVector<const SCEV *, 4> Bad;
  DoInitialMatch(S, L, Good, Bad, SE);
  if (!Good.empty()) {
    const SCEV *Sum = SE.getAddExpr(Good);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  if (!Bad.empty()) {
    const SCEV *Sum = SE.getAddExpr(Bad);
    if (!Sum->isZero())
      BaseRegs.push_back(Sum);
    HasBaseReg = true;
  }
  Canonicalize();
}

/// \brief Check whether or not this formula satisfies the canonical
/// representation.
/// \see Formula::BaseRegs.
bool Formula::isCanonical() const {
  if (ScaledReg)
    return Scale != 1 || !BaseRegs.empty();
  return BaseRegs.size() <= 1;
}

/// \brief Helper method to morph a formula into its canonical representation.
/// \see Formula::BaseRegs.
/// Every formula having more than one base register must use the ScaledReg
/// field. Otherwise, we would have to do special cases everywhere in LSR
/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
/// On the other hand, 1*reg should be canonicalized into reg.
void Formula::Canonicalize() {
  if (isCanonical())
    return;
  // So far we did not need this case. This is easy to implement but it is
  // useless to maintain dead code. Besides, it could hurt compile time.
  assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
  // Keep the invariant sum in BaseRegs and one of the variant sums in ScaledReg.
  ScaledReg = BaseRegs.back();
  BaseRegs.pop_back();
  Scale = 1;
  size_t BaseRegsSize = BaseRegs.size();
  size_t Try = 0;
  // If ScaledReg is an invariant, try to find a variant expression.
  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
    std::swap(ScaledReg, BaseRegs[Try++]);
}

/// \brief Get rid of the scale in the formula.
/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
/// \return true if it was possible to get rid of the scale, false otherwise.
/// \note After this operation the formula may not be in the canonical form.
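/// (Editorial example, not part of the original source: a formula with
/// BaseRegs = {reg1}, ScaledReg = reg2, Scale = 1 becomes
/// BaseRegs = {reg1, reg2}, ScaledReg = nullptr, Scale = 0.)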
bool Formula::Unscale() {
  if (Scale != 1)
    return false;
  Scale = 0;
  BaseRegs.push_back(ScaledReg);
  ScaledReg = nullptr;
  return true;
}

/// getNumRegs - Return the total number of register operands used by this
/// formula. This does not include register uses implied by non-constant
/// addrec strides.
size_t Formula::getNumRegs() const {
  return !!ScaledReg + BaseRegs.size();
}

/// getType - Return the type of this formula, if it has one, or null
/// otherwise. This type is meaningless except for the bit size.
Type *Formula::getType() const {
  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
         ScaledReg ? ScaledReg->getType() :
         BaseGV ? BaseGV->getType() :
         nullptr;
}

/// DeleteBaseReg - Delete the given base reg from the BaseRegs list.
void Formula::DeleteBaseReg(const SCEV *&S) {
  if (&S != &BaseRegs.back())
    std::swap(S, BaseRegs.back());
  BaseRegs.pop_back();
}

/// referencesReg - Test if this formula references the given register.
bool Formula::referencesReg(const SCEV *S) const {
  return S == ScaledReg ||
         std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
}

/// hasRegsUsedByUsesOtherThan - Test whether this formula uses registers
/// which are used by uses other than the use with the given index.
bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
                                         const RegUseTracker &RegUses) const {
  if (ScaledReg)
    if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
      return true;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I)
    if (RegUses.isRegUsedByUsesOtherThan(*I, LUIdx))
      return true;
  return false;
}

void Formula::print(raw_ostream &OS) const {
  bool First = true;
  if (BaseGV) {
    if (!First) OS << " + "; else First = false;
    BaseGV->printAsOperand(OS, /*PrintType=*/false);
  }
  if (BaseOffset != 0) {
    if (!First) OS << " + "; else First = false;
    OS << BaseOffset;
  }
  for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(),
       E = BaseRegs.end(); I != E; ++I) {
    if (!First) OS << " + "; else First = false;
    OS << "reg(" << **I << ')';
  }
  if (HasBaseReg && BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: HasBaseReg**";
  } else if (!HasBaseReg && !BaseRegs.empty()) {
    if (!First) OS << " + "; else First = false;
    OS << "**error: !HasBaseReg**";
  }
  if (Scale != 0) {
    if (!First) OS << " + "; else First = false;
    OS << Scale << "*reg(";
    if (ScaledReg)
      OS << *ScaledReg;
    else
      OS << "<unknown>";
    OS << ')';
  }
  if (UnfoldedOffset != 0) {
    if (!First) OS << " + ";
    OS << "imm(" << UnfoldedOffset << ')';
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::dump() const {
  print(errs()); errs() << '\n';
}
#endif

/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
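/// (Editorial note, not part of the original source: the check below widens
/// the addrec by one bit and asks ScalarEvolution to sign-extend it; if the
/// result is still an addrec, the extension folded and the value is
/// unchanged.)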
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
  return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
}

/// isAddSExtable - Return true if the given add can be sign-extended
/// without changing its value.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
  return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
}

/// isMulSExtable - Return true if the given mul can be sign-extended
/// without changing its value.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
  Type *WideTy =
    IntegerType::get(SE.getContext(),
                     SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
  return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
}

/// getExactSDiv - Return an expression for LHS /s RHS, if it can be determined
/// and if the remainder is known to be zero, or null otherwise. If
/// IgnoreSignificantBits is true, expressions like (X * Y) /s Y are simplified
/// to X, ignoring that the multiplication may overflow, which is useful when
/// the result will be used in a context where the most significant bits are
/// ignored.
static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
                                ScalarEvolution &SE,
                                bool IgnoreSignificantBits = false) {
  // Handle the trivial case, which works for any SCEV type.
  if (LHS == RHS)
    return SE.getConstant(LHS->getType(), 1);

  // Handle a few RHS special cases.
  const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
  if (RC) {
    const APInt &RA = RC->getValue()->getValue();
    // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
    // some folding.
    if (RA.isAllOnesValue())
      return SE.getMulExpr(LHS, RC);
    // Handle x /s 1 as x.
    if (RA == 1)
      return LHS;
  }

  // Check for a division of a constant by a constant.
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
    if (!RC)
      return nullptr;
    const APInt &LA = C->getValue()->getValue();
    const APInt &RA = RC->getValue()->getValue();
    if (LA.srem(RA) != 0)
      return nullptr;
    return SE.getConstant(LA.sdiv(RA));
  }

  // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddRecSExtable(AR, SE)) {
      const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
                                      IgnoreSignificantBits);
      if (!Step) return nullptr;
      const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
                                       IgnoreSignificantBits);
      if (!Start) return nullptr;
      // FlagNW is independent of the start value, step direction, and is
      // preserved with smaller magnitude steps.
      // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
      return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
    }
    return nullptr;
  }

  // Distribute the sdiv over add operands, if the add doesn't overflow.
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
    if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
      SmallVector<const SCEV *, 8> Ops;
      for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
           I != E; ++I) {
        const SCEV *Op = getExactSDiv(*I, RHS, SE,
                                      IgnoreSignificantBits);
        if (!Op) return nullptr;
        Ops.push_back(Op);
      }
      return SE.getAddExpr(Ops);
    }
    return nullptr;
  }

  // Check for a multiply operand that we can pull RHS out of.
  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
    if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
      SmallVector<const SCEV *, 4> Ops;
      bool Found = false;
      for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end();
           I != E; ++I) {
        const SCEV *S = *I;
        if (!Found)
          if (const SCEV *Q = getExactSDiv(S, RHS, SE,
                                           IgnoreSignificantBits)) {
            S = Q;
            Found = true;
          }
        Ops.push_back(S);
      }
      return Found ? SE.getMulExpr(Ops) : nullptr;
    }
    return nullptr;
  }

  // Otherwise we don't know.
  return nullptr;
}

/// ExtractImmediate - If S involves the addition of a constant integer value,
/// return that integer value, and mutate S to point to a new SCEV with that
/// value excluded.
static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
    if (C->getValue()->getValue().getMinSignedBits() <= 64) {
      S = SE.getConstant(C->getType(), 0);
      return C->getValue()->getSExtValue();
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    int64_t Result = ExtractImmediate(NewOps.front(), SE);
    if (Result != 0)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  }
  return 0;
}

/// ExtractSymbol - If S involves the addition of a GlobalValue address,
/// return that symbol, and mutate S to point to a new SCEV with that
/// value excluded.
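/// (Editorial example, not part of the original source: for S = (@g + %x + 4),
/// ExtractSymbol returns @g and rewrites S to (%x + 4), just as
/// ExtractImmediate above would return 4 and leave (@g + %x).)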
static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
  if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
    if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
      S = SE.getConstant(GV->getType(), 0);
      return GV;
    }
  } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end());
    GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
    if (Result)
      S = SE.getAddExpr(NewOps);
    return Result;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end());
    GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
    if (Result)
      S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                           // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                           SCEV::FlagAnyWrap);
    return Result;
  }
  return nullptr;
}

/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    if (SI->getOperand(1) == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::prefetch:
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      if (II->getArgOperand(0) == OperandVal)
        isAddress = true;
      break;
    }
  }
  return isAddress;
}

/// getAccessType - Return the type of the memory being accessed.
static Type *getAccessType(const Instruction *Inst) {
  Type *AccessTy = Inst->getType();
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    AccessTy = SI->getOperand(0)->getType();
  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      AccessTy = II->getArgOperand(0)->getType();
      break;
    }
  }

  // All pointers have the same requirements, so canonicalize them to an
  // arbitrary pointer type to minimize variation.
  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
                                PTy->getAddressSpace());

  return AccessTy;
}

/// isExistingPhi - Return true if this AddRec is already a phi in its loop.
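/// (Editorial note, not part of the original source: this scans the phi nodes
/// in the addrec's loop header and reports a match when one of them has the
/// same SCEV and effective type as AR.)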
static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
    if (SE.isSCEVable(PN->getType()) &&
        (SE.getEffectiveSCEVType(PN->getType()) ==
         SE.getEffectiveSCEVType(AR->getType())) &&
        SE.getSCEV(PN) == AR)
      return true;
  }
  return false;
}

/// Check if expanding this expression is likely to incur significant cost. This
/// is tricky because SCEV doesn't track which expressions are actually computed
/// by the current IR.
///
/// We currently allow expansion of IV increments that involve adds,
/// multiplication by constants, and AddRecs from existing phis.
///
/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
/// obvious multiple of the UDivExpr.
static bool isHighCostExpansion(const SCEV *S,
                                SmallPtrSetImpl<const SCEV*> &Processed,
                                ScalarEvolution &SE) {
  // Zero/One operand expressions
  switch (S->getSCEVType()) {
  case scUnknown:
  case scConstant:
    return false;
  case scTruncate:
    return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
                               Processed, SE);
  case scZeroExtend:
    return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
                               Processed, SE);
  case scSignExtend:
    return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
                               Processed, SE);
  }

  if (!Processed.insert(S))
    return false;

  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I) {
      if (isHighCostExpansion(*I, Processed, SE))
        return true;
    }
    return false;
  }

  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
    if (Mul->getNumOperands() == 2) {
      // Multiplication by a constant is ok
      if (isa<SCEVConstant>(Mul->getOperand(0)))
        return isHighCostExpansion(Mul->getOperand(1), Processed, SE);

      // If we have the value of one operand, check if an existing
      // multiplication already generates this expression.
      if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
        Value *UVal = U->getValue();
        for (User *UR : UVal->users()) {
          // If U is a constant, it may be used by a ConstantExpr.
          Instruction *UI = dyn_cast<Instruction>(UR);
          if (UI && UI->getOpcode() == Instruction::Mul &&
              SE.isSCEVable(UI->getType())) {
            return SE.getSCEV(UI) == Mul;
          }
        }
      }
    }
  }

  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    if (isExistingPhi(AR, SE))
      return false;
  }

  // For now, consider any other type of expression (div/mul/min/max) high cost.
  return true;
}

/// DeleteTriviallyDeadInstructions - If any of the instructions in the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
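/// (Editorial note, not part of the original source: DeadInsts holds WeakVH
/// handles, so entries may already have been nulled out by earlier deletions;
/// the dyn_cast_or_null below skips those.)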
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
  bool Changed = false;

  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    Instruction *I = dyn_cast_or_null<Instruction>(V);

    if (!I || !isInstructionTriviallyDead(I))
      continue;

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        *OI = nullptr;
        if (U->use_empty())
          DeadInsts.push_back(U);
      }

    I->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

namespace {
class LSRUse;
}

/// \brief Check if the addressing mode defined by \p F is completely
/// folded in \p LU at isel time.
/// This includes address-mode folding and special icmp tricks.
/// This function returns true if \p LU can accommodate what \p F
/// defines and up to 1 base + 1 scaled + offset.
/// In other words, if \p F has several base registers, this function may
/// still return true. Therefore, users still need to account for
/// additional base registers and/or unfolded offsets to derive an
/// accurate cost model.
static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 const LSRUse &LU, const Formula &F);
// Get the cost of the scaling factor used in F for LU.
static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
                                     const LSRUse &LU, const Formula &F);

namespace {

/// Cost - This class is used to measure and compare candidate formulae.
class Cost {
  /// TODO: Some of these could be merged. Also, a lexical ordering
  /// isn't always optimal.
  unsigned NumRegs;
  unsigned AddRecCost;
  unsigned NumIVMuls;
  unsigned NumBaseAdds;
  unsigned ImmCost;
  unsigned SetupCost;
  unsigned ScaleCost;

public:
  Cost()
    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
      SetupCost(0), ScaleCost(0) {}

  bool operator<(const Cost &Other) const;

  void Lose();

#ifndef NDEBUG
  // Once any of the metrics loses, they must all remain losers.
  bool isValid() {
    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
             | ImmCost | SetupCost | ScaleCost) != ~0u)
        || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
             & ImmCost & SetupCost & ScaleCost) == ~0u);
  }
#endif

  bool isLoser() {
    assert(isValid() && "invalid cost");
    return NumRegs == ~0u;
  }

  void RateFormula(const TargetTransformInfo &TTI,
                   const Formula &F,
                   SmallPtrSetImpl<const SCEV *> &Regs,
                   const DenseSet<const SCEV *> &VisitedRegs,
                   const Loop *L,
                   const SmallVectorImpl<int64_t> &Offsets,
                   ScalarEvolution &SE, DominatorTree &DT,
                   const LSRUse &LU,
                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  void RateRegister(const SCEV *Reg,
                    SmallPtrSetImpl<const SCEV *> &Regs,
                    const Loop *L,
                    ScalarEvolution &SE, DominatorTree &DT);
  void RatePrimaryRegister(const SCEV *Reg,
                           SmallPtrSetImpl<const SCEV *> &Regs,
                           const Loop *L,
                           ScalarEvolution &SE, DominatorTree &DT,
                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
};

}

/// RateRegister - Tally up interesting quantities from the given register.
void Cost::RateRegister(const SCEV *Reg,
                        SmallPtrSetImpl<const SCEV *> &Regs,
                        const Loop *L,
                        ScalarEvolution &SE, DominatorTree &DT) {
  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
    // If this is an addrec for another loop, don't second-guess its addrec phi
    // nodes. LSR isn't currently smart enough to reason about more than one
    // loop at a time. LSR has already run on inner loops, will not run on outer
    // loops, and cannot be expected to change sibling loops.
    if (AR->getLoop() != L) {
      // If the AddRec exists, consider its register free and leave it alone.
      if (isExistingPhi(AR, SE))
        return;

      // Otherwise, do not consider this formula at all.
      Lose();
      return;
    }
    AddRecCost += 1; /// TODO: This should be a function of the stride.

    // Add the step value register, if it needs one.
    // TODO: The non-affine case isn't precisely modeled here.
    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
      if (!Regs.count(AR->getOperand(1))) {
        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
        if (isLoser())
          return;
      }
    }
  }
  ++NumRegs;

  // Rough heuristic; favor registers which don't require extra setup
  // instructions in the preheader.
  if (!isa<SCEVUnknown>(Reg) &&
      !isa<SCEVConstant>(Reg) &&
      !(isa<SCEVAddRecExpr>(Reg) &&
        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
    ++SetupCost;

  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
               SE.hasComputableLoopEvolution(Reg, L);
}

/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
/// before, rate it. Optional LoserRegs provides a way to declare any formula
/// that refers to one of those regs an instant loser.
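/// (Editorial note, not part of the original source: a register already in
/// LoserRegs short-circuits the whole formula via Lose(), so it is never
/// re-rated.)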
void Cost::RatePrimaryRegister(const SCEV *Reg,
                               SmallPtrSetImpl<const SCEV *> &Regs,
                               const Loop *L,
                               ScalarEvolution &SE, DominatorTree &DT,
                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  if (LoserRegs && LoserRegs->count(Reg)) {
    Lose();
    return;
  }
  if (Regs.insert(Reg)) {
    RateRegister(Reg, Regs, L, SE, DT);
    if (LoserRegs && isLoser())
      LoserRegs->insert(Reg);
  }
}

void Cost::RateFormula(const TargetTransformInfo &TTI,
                       const Formula &F,
                       SmallPtrSetImpl<const SCEV *> &Regs,
                       const DenseSet<const SCEV *> &VisitedRegs,
                       const Loop *L,
                       const SmallVectorImpl<int64_t> &Offsets,
                       ScalarEvolution &SE, DominatorTree &DT,
                       const LSRUse &LU,
                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
  // Tally up the registers.
  if (const SCEV *ScaledReg = F.ScaledReg) {
    if (VisitedRegs.count(ScaledReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
    if (isLoser())
      return;
  }
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *BaseReg = *I;
    if (VisitedRegs.count(BaseReg)) {
      Lose();
      return;
    }
    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
    if (isLoser())
      return;
  }

  // Determine how many (unfolded) adds we'll need inside the loop.
  size_t NumBaseParts = F.getNumRegs();
  if (NumBaseParts > 1)
    // Do not count the base and a possible second register if the target
    // allows folding 2 registers.
    NumBaseAdds +=
      NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
  NumBaseAdds += (F.UnfoldedOffset != 0);

  // Accumulate non-free scaling amounts.
  ScaleCost += getScalingFactorCost(TTI, LU, F);

  // Tally up the non-zero immediates.
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    int64_t Offset = (uint64_t)*I + F.BaseOffset;
    if (F.BaseGV)
      ImmCost += 64; // Handle symbolic values conservatively.
                     // TODO: This should probably be the pointer size.
    else if (Offset != 0)
      ImmCost += APInt(64, Offset, true).getMinSignedBits();
  }
  assert(isValid() && "invalid cost");
}

/// Lose - Set this cost to a losing value.
void Cost::Lose() {
  NumRegs = ~0u;
  AddRecCost = ~0u;
  NumIVMuls = ~0u;
  NumBaseAdds = ~0u;
  ImmCost = ~0u;
  SetupCost = ~0u;
  ScaleCost = ~0u;
}

/// operator< - Choose the lower cost.
bool Cost::operator<(const Cost &Other) const {
  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                  ImmCost, SetupCost) <
         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
                  Other.SetupCost);
}

void Cost::print(raw_ostream &OS) const {
  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
  if (AddRecCost != 0)
    OS << ", with addrec cost " << AddRecCost;
  if (NumIVMuls != 0)
    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
  if (NumBaseAdds != 0)
    OS << ", plus " << NumBaseAdds << " base add"
       << (NumBaseAdds == 1 ?
"" : "s"); 01071 if (ScaleCost != 0) 01072 OS << ", plus " << ScaleCost << " scale cost"; 01073 if (ImmCost != 0) 01074 OS << ", plus " << ImmCost << " imm cost"; 01075 if (SetupCost != 0) 01076 OS << ", plus " << SetupCost << " setup cost"; 01077 } 01078 01079 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 01080 void Cost::dump() const { 01081 print(errs()); errs() << '\n'; 01082 } 01083 #endif 01084 01085 namespace { 01086 01087 /// LSRFixup - An operand value in an instruction which is to be replaced 01088 /// with some equivalent, possibly strength-reduced, replacement. 01089 struct LSRFixup { 01090 /// UserInst - The instruction which will be updated. 01091 Instruction *UserInst; 01092 01093 /// OperandValToReplace - The operand of the instruction which will 01094 /// be replaced. The operand may be used more than once; every instance 01095 /// will be replaced. 01096 Value *OperandValToReplace; 01097 01098 /// PostIncLoops - If this user is to use the post-incremented value of an 01099 /// induction variable, this variable is non-null and holds the loop 01100 /// associated with the induction variable. 01101 PostIncLoopSet PostIncLoops; 01102 01103 /// LUIdx - The index of the LSRUse describing the expression which 01104 /// this fixup needs, minus an offset (below). 01105 size_t LUIdx; 01106 01107 /// Offset - A constant offset to be added to the LSRUse expression. 01108 /// This allows multiple fixups to share the same LSRUse with different 01109 /// offsets, for example in an unrolled loop. 01110 int64_t Offset; 01111 01112 bool isUseFullyOutsideLoop(const Loop *L) const; 01113 01114 LSRFixup(); 01115 01116 void print(raw_ostream &OS) const; 01117 void dump() const; 01118 }; 01119 01120 } 01121 01122 LSRFixup::LSRFixup() 01123 : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)), 01124 Offset(0) {} 01125 01126 /// isUseFullyOutsideLoop - Test whether this fixup always uses its 01127 /// value outside of the given loop. 01128 bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const { 01129 // PHI nodes use their value in their incoming blocks. 01130 if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) { 01131 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 01132 if (PN->getIncomingValue(i) == OperandValToReplace && 01133 L->contains(PN->getIncomingBlock(i))) 01134 return false; 01135 return true; 01136 } 01137 01138 return !L->contains(UserInst); 01139 } 01140 01141 void LSRFixup::print(raw_ostream &OS) const { 01142 OS << "UserInst="; 01143 // Store is common and interesting enough to be worth special-casing. 
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    UserInst->printAsOperand(OS, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);

  for (PostIncLoopSet::const_iterator I = PostIncLoops.begin(),
       E = PostIncLoops.end(); I != E; ++I) {
    OS << ", PostIncLoop=";
    (*I)->getHeader()->printAsOperand(OS, /*PrintType=*/false);
  }

  if (LUIdx != ~size_t(0))
    OS << ", LUIdx=" << LUIdx;

  if (Offset != 0)
    OS << ", Offset=" << Offset;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::dump() const {
  print(errs()); errs() << '\n';
}
#endif

namespace {

/// UniquifierDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of sorted SmallVectors of const SCEV*.
struct UniquifierDenseMapInfo {
  static SmallVector<const SCEV *, 4> getEmptyKey() {
    SmallVector<const SCEV *, 4> V;
    V.push_back(reinterpret_cast<const SCEV *>(-1));
    return V;
  }

  static SmallVector<const SCEV *, 4> getTombstoneKey() {
    SmallVector<const SCEV *, 4> V;
    V.push_back(reinterpret_cast<const SCEV *>(-2));
    return V;
  }

  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
  }

  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
                      const SmallVector<const SCEV *, 4> &RHS) {
    return LHS == RHS;
  }
};

/// LSRUse - This class holds the state that LSR keeps for each use in
/// IVUsers, as well as uses invented by LSR itself. It includes information
/// about what kinds of things can be folded into the user, information about
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;

public:
  /// KindType - An enum for a kind of use, indicating what types of
  /// scaled and immediate operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;

  KindType Kind;
  Type *AccessTy;

  SmallVector<int64_t, 8> Offsets;
  int64_t MinOffset;
  int64_t MaxOffset;

  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
  /// LSRUse are outside of the loop, in which case some special-case heuristics
  /// may be used.
  bool AllFixupsOutsideLoop;

  /// RigidFormula is set to true to guarantee that this use will be associated
  /// with a single formula--the one that initially matched. Some SCEV
  /// expressions cannot be expanded.
  /// This allows LSR to consider the registers
  /// used by those expressions without the need to expand them later after
  /// changing the formula.
  bool RigidFormula;

  /// WidestFixupType - This records the widest use type for any fixup using
  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
  /// max fixup widths to be equivalent, because the narrower one may be relying
  /// on the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType;

  /// Formulae - A list of ways to build a value that can satisfy this user.
  /// After the list is populated, one of these is selected heuristically and
  /// used to formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// Regs - The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
                                MinOffset(INT64_MAX),
                                MaxOffset(INT64_MIN),
                                AllFixupsOutsideLoop(true),
                                RigidFormula(false),
                                WidestFixupType(nullptr) {}

  bool HasFormulaWithSameRegs(const Formula &F) const;
  bool InsertFormula(const Formula &F);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// HasFormulaWithSameRegs - Test whether this use has a formula with the same
/// registers as the given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());
  return Uniquifier.count(Key);
}

/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
/// The formula must be in canonical form.
bool LSRUse::InsertFormula(const Formula &F) {
  assert(F.isCanonical() && "Invalid canonical representation");

  if (!Formulae.empty() && RigidFormula)
    return false;

  SmallVector<const SCEV *, 4> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());

  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (SmallVectorImpl<const SCEV *>::const_iterator I =
       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
    assert(!(*I)->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
  if (F.ScaledReg)
    Regs.insert(F.ScaledReg);

  return true;
}

/// DeleteFormula - Remove the given formula from this use's list.
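/// (Editorial note, not part of the original source: the formula is swapped
/// with the last element and popped, so removal is O(1) but the order of
/// Formulae is not preserved.)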
void LSRUse::DeleteFormula(Formula &F) {
  if (&F != &Formulae.back())
    std::swap(F, Formulae.back());
  Formulae.pop_back();
}

/// RecomputeRegs - Recompute the Regs field, and update RegUses.
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
  // Now that we've filtered out some formulae, recompute the Regs set.
  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
  Regs.clear();
  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
       E = Formulae.end(); I != E; ++I) {
    const Formula &F = *I;
    if (F.ScaledReg) Regs.insert(F.ScaledReg);
    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
  }

  // Update the RegTracker.
  for (const SCEV *S : OldRegs)
    if (!Regs.count(S))
      RegUses.DropRegister(S, LUIdx);
}

void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", Offsets={";
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    OS << *I;
    if (std::next(I) != E)
      OS << ',';
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (WidestFixupType)
    OS << ", widest fixup type: " << *WidestFixupType;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}
#endif

static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 LSRUse::KindType Kind, Type *AccessTy,
                                 GlobalValue *BaseGV, int64_t BaseOffset,
                                 bool HasBaseReg, int64_t Scale) {
  switch (Kind) {
  case LSRUse::Address:
    return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);

    // Otherwise, just guess that reg+reg addressing is legal.
    //return ;

  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (Scale != 0 && Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (BaseOffset != 0) {
      // We have one of:
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
      // Offs is the ICmp immediate.
      if (Scale == 0)
        // The cast does the right thing with INT64_MIN.
        BaseOffset = -(uint64_t)BaseOffset;
      return TTI.isLegalICmpImmediate(BaseOffset);
    }

    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !BaseGV && Scale == 0 && BaseOffset == 0;

  case LSRUse::Special:
    // Special case Basic to handle -1 scales.
    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}

static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 int64_t MinOffset, int64_t MaxOffset,
                                 LSRUse::KindType Kind, Type *AccessTy,
                                 GlobalValue *BaseGV, int64_t BaseOffset,
                                 bool HasBaseReg, int64_t Scale) {
  // Check for overflow.
  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
      (MinOffset > 0))
    return false;
  MinOffset = (uint64_t)BaseOffset + MinOffset;
  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
      (MaxOffset > 0))
    return false;
  MaxOffset = (uint64_t)BaseOffset + MaxOffset;

  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                              HasBaseReg, Scale) &&
         isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
                              HasBaseReg, Scale);
}

static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 int64_t MinOffset, int64_t MaxOffset,
                                 LSRUse::KindType Kind, Type *AccessTy,
                                 const Formula &F) {
  // For the purpose of isAMCompletelyFolded either having a canonical formula
  // or a scale not equal to zero is correct.
  // Problems may arise from non-canonical formulae having a scale == 0.
  // Strictly speaking it would be best to just rely on canonical formulae.
  // However, when we generate the scaled formulae, we first check that the
  // scaling factor is profitable before computing the actual ScaledReg for
  // compile time's sake.
  assert((F.isCanonical() || F.Scale != 0));
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                              F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
}

/// isLegalUse - Test whether we know how to expand the current formula.
static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
                       GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
                       int64_t Scale) {
  // We know how to expand completely foldable formulae.
  return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
                              BaseOffset, HasBaseReg, Scale) ||
         // Or formulae that use a base register produced by a sum of base
         // registers.
         (Scale == 1 &&
          isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                               BaseGV, BaseOffset, true, 0));
}

static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
                       const Formula &F) {
  return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
                    F.BaseOffset, F.HasBaseReg, F.Scale);
}

static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                 const LSRUse &LU, const Formula &F) {
  return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
                              LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
                              F.Scale);
}

static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
                                     const LSRUse &LU, const Formula &F) {
  if (!F.Scale)
    return 0;

  // If the use is not completely folded in that instruction, we will have to
  // pay an extra cost only for scale != 1.
  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
                            LU.AccessTy, F))
    return F.Scale != 1;

  switch (LU.Kind) {
  case LSRUse::Address: {
    // Check the scaling factor cost with both the min and max offsets.
    int ScaleCostMinOffset =
      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
                               F.BaseOffset + LU.MinOffset,
                               F.HasBaseReg, F.Scale);
    int ScaleCostMaxOffset =
      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
                               F.BaseOffset + LU.MaxOffset,
                               F.HasBaseReg, F.Scale);

    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
           "Legal addressing mode has an illegal cost!");
    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
  }
  case LSRUse::ICmpZero:
  case LSRUse::Basic:
  case LSRUse::Special:
    // The use is completely folded, i.e., everything is folded into the
    // instruction.
    return 0;
  }

  llvm_unreachable("Invalid LSRUse Kind!");
}

static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                             LSRUse::KindType Kind, Type *AccessTy,
                             GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (BaseOffset == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!HasBaseReg && Scale == 1) {
    Scale = 0;
    HasBaseReg = true;
  }

  return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
                              HasBaseReg, Scale);
}

static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                             ScalarEvolution &SE, int64_t MinOffset,
                             int64_t MaxOffset, LSRUse::KindType Kind,
                             Type *AccessTy, const SCEV *S, bool HasBaseReg) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t BaseOffset = ExtractImmediate(S, SE);
  GlobalValue *BaseGV = ExtractSymbol(S, SE);

  // If there's anything else involved, it's not foldable.
  if (!S->isZero()) return false;

  // Fast-path: zero is always foldable.
01577 if (BaseOffset == 0 && !BaseGV) return true; 01578 01579 // Conservatively, create an address with an immediate and a 01580 // base and a scale. 01581 int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1; 01582 01583 return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, 01584 BaseOffset, HasBaseReg, Scale); 01585 } 01586 01587 namespace { 01588 01589 /// IVInc - An individual increment in a Chain of IV increments. 01590 /// Relate an IV user to an expression that computes the IV it uses from the IV 01591 /// used by the previous link in the Chain. 01592 /// 01593 /// For the head of a chain, IncExpr holds the absolute SCEV expression for the 01594 /// original IVOperand. The head of the chain's IVOperand is only valid during 01595 /// chain collection, before LSR replaces IV users. During chain generation, 01596 /// IncExpr can be used to find the new IVOperand that computes the same 01597 /// expression. 01598 struct IVInc { 01599 Instruction *UserInst; 01600 Value* IVOperand; 01601 const SCEV *IncExpr; 01602 01603 IVInc(Instruction *U, Value *O, const SCEV *E): 01604 UserInst(U), IVOperand(O), IncExpr(E) {} 01605 }; 01606 01607 // IVChain - The list of IV increments in program order. 01608 // We typically add the head of a chain without finding subsequent links. 01609 struct IVChain { 01610 SmallVector<IVInc,1> Incs; 01611 const SCEV *ExprBase; 01612 01613 IVChain() : ExprBase(nullptr) {} 01614 01615 IVChain(const IVInc &Head, const SCEV *Base) 01616 : Incs(1, Head), ExprBase(Base) {} 01617 01618 typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; 01619 01620 // begin - return the first increment in the chain. 01621 const_iterator begin() const { 01622 assert(!Incs.empty()); 01623 return std::next(Incs.begin()); 01624 } 01625 const_iterator end() const { 01626 return Incs.end(); 01627 } 01628 01629 // hasIncs - Returns true if this chain contains any increments. 01630 bool hasIncs() const { return Incs.size() >= 2; } 01631 01632 // add - Add an IVInc to the end of this chain. 01633 void add(const IVInc &X) { Incs.push_back(X); } 01634 01635 // tailUserInst - Returns the last UserInst in the chain. 01636 Instruction *tailUserInst() const { return Incs.back().UserInst; } 01637 01638 // isProfitableIncrement - Returns true if IncExpr can be profitably added to 01639 // this chain. 01640 bool isProfitableIncrement(const SCEV *OperExpr, 01641 const SCEV *IncExpr, 01642 ScalarEvolution&); 01643 }; 01644 01645 /// ChainUsers - Helper for CollectChains to track multiple IV increment uses. 01646 /// Distinguish between FarUsers that definitely cross IV increments and 01647 /// NearUsers that may be used between IV increments. 01648 struct ChainUsers { 01649 SmallPtrSet<Instruction*, 4> FarUsers; 01650 SmallPtrSet<Instruction*, 4> NearUsers; 01651 }; 01652 01653 /// LSRInstance - This class holds state for the main loop strength reduction 01654 /// logic. 01655 class LSRInstance { 01656 IVUsers &IU; 01657 ScalarEvolution &SE; 01658 DominatorTree &DT; 01659 LoopInfo &LI; 01660 const TargetTransformInfo &TTI; 01661 Loop *const L; 01662 bool Changed; 01663 01664 /// IVIncInsertPos - This is the insert position that the current loop's 01665 /// induction variable increment should be placed. In simple loops, this is 01666 /// the latch block's terminator. But in more complicated cases, this is a 01667 /// position which will dominate all the in-loop post-increment users. 
01668 Instruction *IVIncInsertPos; 01669 01670 /// Factors - Interesting factors between use strides. 01671 SmallSetVector<int64_t, 8> Factors; 01672 01673 /// Types - Interesting use types, to facilitate truncation reuse. 01674 SmallSetVector<Type *, 4> Types; 01675 01676 /// Fixups - The list of operands which are to be replaced. 01677 SmallVector<LSRFixup, 16> Fixups; 01678 01679 /// Uses - The list of interesting uses. 01680 SmallVector<LSRUse, 16> Uses; 01681 01682 /// RegUses - Track which uses use which register candidates. 01683 RegUseTracker RegUses; 01684 01685 // Limit the number of chains to avoid quadratic behavior. We don't expect to 01686 // have more than a few IV increment chains in a loop. Missing a Chain falls 01687 // back to normal LSR behavior for those uses. 01688 static const unsigned MaxChains = 8; 01689 01690 /// IVChainVec - IV users can form a chain of IV increments. 01691 SmallVector<IVChain, MaxChains> IVChainVec; 01692 01693 /// IVIncSet - IV users that belong to profitable IVChains. 01694 SmallPtrSet<Use*, MaxChains> IVIncSet; 01695 01696 void OptimizeShadowIV(); 01697 bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse); 01698 ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); 01699 void OptimizeLoopTermCond(); 01700 01701 void ChainInstruction(Instruction *UserInst, Instruction *IVOper, 01702 SmallVectorImpl<ChainUsers> &ChainUsersVec); 01703 void FinalizeChain(IVChain &Chain); 01704 void CollectChains(); 01705 void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 01706 SmallVectorImpl<WeakVH> &DeadInsts); 01707 01708 void CollectInterestingTypesAndFactors(); 01709 void CollectFixupsAndInitialFormulae(); 01710 01711 LSRFixup &getNewFixup() { 01712 Fixups.push_back(LSRFixup()); 01713 return Fixups.back(); 01714 } 01715 01716 // Support for sharing of LSRUses between LSRFixups. 
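  // UseMap memoizes (stripped expression, use kind) pairs so that fixups with
  // the same base expression can share a single LSRUse; see getUse() below.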
01717 typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy; 01718 UseMapTy UseMap; 01719 01720 bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 01721 LSRUse::KindType Kind, Type *AccessTy); 01722 01723 std::pair<size_t, int64_t> getUse(const SCEV *&Expr, 01724 LSRUse::KindType Kind, 01725 Type *AccessTy); 01726 01727 void DeleteUse(LSRUse &LU, size_t LUIdx); 01728 01729 LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); 01730 01731 void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 01732 void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); 01733 void CountRegisters(const Formula &F, size_t LUIdx); 01734 bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F); 01735 01736 void CollectLoopInvariantFixupsAndFormulae(); 01737 01738 void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, 01739 unsigned Depth = 0); 01740 01741 void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, 01742 const Formula &Base, unsigned Depth, 01743 size_t Idx, bool IsScaledReg = false); 01744 void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base); 01745 void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, 01746 const Formula &Base, size_t Idx, 01747 bool IsScaledReg = false); 01748 void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 01749 void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, 01750 const Formula &Base, 01751 const SmallVectorImpl<int64_t> &Worklist, 01752 size_t Idx, bool IsScaledReg = false); 01753 void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); 01754 void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); 01755 void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base); 01756 void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base); 01757 void GenerateCrossUseConstantOffsets(); 01758 void GenerateAllReuseFormulae(); 01759 01760 void FilterOutUndesirableDedicatedRegisters(); 01761 01762 size_t EstimateSearchSpaceComplexity() const; 01763 void NarrowSearchSpaceByDetectingSupersets(); 01764 void NarrowSearchSpaceByCollapsingUnrolledCode(); 01765 void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 01766 void NarrowSearchSpaceByPickingWinnerRegs(); 01767 void NarrowSearchSpaceUsingHeuristics(); 01768 01769 void SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 01770 Cost &SolutionCost, 01771 SmallVectorImpl<const Formula *> &Workspace, 01772 const Cost &CurCost, 01773 const SmallPtrSet<const SCEV *, 16> &CurRegs, 01774 DenseSet<const SCEV *> &VisitedRegs) const; 01775 void Solve(SmallVectorImpl<const Formula *> &Solution) const; 01776 01777 BasicBlock::iterator 01778 HoistInsertPosition(BasicBlock::iterator IP, 01779 const SmallVectorImpl<Instruction *> &Inputs) const; 01780 BasicBlock::iterator 01781 AdjustInsertPositionForExpand(BasicBlock::iterator IP, 01782 const LSRFixup &LF, 01783 const LSRUse &LU, 01784 SCEVExpander &Rewriter) const; 01785 01786 Value *Expand(const LSRFixup &LF, 01787 const Formula &F, 01788 BasicBlock::iterator IP, 01789 SCEVExpander &Rewriter, 01790 SmallVectorImpl<WeakVH> &DeadInsts) const; 01791 void RewriteForPHI(PHINode *PN, const LSRFixup &LF, 01792 const Formula &F, 01793 SCEVExpander &Rewriter, 01794 SmallVectorImpl<WeakVH> &DeadInsts, 01795 Pass *P) const; 01796 void Rewrite(const LSRFixup &LF, 01797 const Formula &F, 01798 SCEVExpander &Rewriter, 01799 SmallVectorImpl<WeakVH> &DeadInsts, 01800 Pass *P) const; 01801 void 
ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, 01802 Pass *P); 01803 01804 public: 01805 LSRInstance(Loop *L, Pass *P); 01806 01807 bool getChanged() const { return Changed; } 01808 01809 void print_factors_and_types(raw_ostream &OS) const; 01810 void print_fixups(raw_ostream &OS) const; 01811 void print_uses(raw_ostream &OS) const; 01812 void print(raw_ostream &OS) const; 01813 void dump() const; 01814 }; 01815 01816 } 01817 01818 /// OptimizeShadowIV - If IV is used in a int-to-float cast 01819 /// inside the loop then try to eliminate the cast operation. 01820 void LSRInstance::OptimizeShadowIV() { 01821 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 01822 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 01823 return; 01824 01825 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); 01826 UI != E; /* empty */) { 01827 IVUsers::const_iterator CandidateUI = UI; 01828 ++UI; 01829 Instruction *ShadowUse = CandidateUI->getUser(); 01830 Type *DestTy = nullptr; 01831 bool IsSigned = false; 01832 01833 /* If shadow use is a int->float cast then insert a second IV 01834 to eliminate this cast. 01835 01836 for (unsigned i = 0; i < n; ++i) 01837 foo((double)i); 01838 01839 is transformed into 01840 01841 double d = 0.0; 01842 for (unsigned i = 0; i < n; ++i, ++d) 01843 foo(d); 01844 */ 01845 if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) { 01846 IsSigned = false; 01847 DestTy = UCast->getDestTy(); 01848 } 01849 else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) { 01850 IsSigned = true; 01851 DestTy = SCast->getDestTy(); 01852 } 01853 if (!DestTy) continue; 01854 01855 // If target does not support DestTy natively then do not apply 01856 // this transformation. 01857 if (!TTI.isTypeLegal(DestTy)) continue; 01858 01859 PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); 01860 if (!PH) continue; 01861 if (PH->getNumIncomingValues() != 2) continue; 01862 01863 Type *SrcTy = PH->getType(); 01864 int Mantissa = DestTy->getFPMantissaWidth(); 01865 if (Mantissa == -1) continue; 01866 if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) 01867 continue; 01868 01869 unsigned Entry, Latch; 01870 if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { 01871 Entry = 0; 01872 Latch = 1; 01873 } else { 01874 Entry = 1; 01875 Latch = 0; 01876 } 01877 01878 ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); 01879 if (!Init) continue; 01880 Constant *NewInit = ConstantFP::get(DestTy, IsSigned ? 01881 (double)Init->getSExtValue() : 01882 (double)Init->getZExtValue()); 01883 01884 BinaryOperator *Incr = 01885 dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); 01886 if (!Incr) continue; 01887 if (Incr->getOpcode() != Instruction::Add 01888 && Incr->getOpcode() != Instruction::Sub) 01889 continue; 01890 01891 /* Initialize new IV, double d = 0.0 in above example. */ 01892 ConstantInt *C = nullptr; 01893 if (Incr->getOperand(0) == PH) 01894 C = dyn_cast<ConstantInt>(Incr->getOperand(1)); 01895 else if (Incr->getOperand(1) == PH) 01896 C = dyn_cast<ConstantInt>(Incr->getOperand(0)); 01897 else 01898 continue; 01899 01900 if (!C) continue; 01901 01902 // Ignore negative constants, as the code below doesn't handle them 01903 // correctly. TODO: Remove this restriction. 01904 if (!C->getValue().isStrictlyPositive()) continue; 01905 01906 /* Add new PHINode. */ 01907 PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH); 01908 01909 /* create new increment. '++d' in above example. 
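     The new floating-point increment mirrors the integer one: an fadd (or
     fsub) of the constant step, inserted immediately before the original
     integer increment.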
*/ 01910 Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); 01911 BinaryOperator *NewIncr = 01912 BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? 01913 Instruction::FAdd : Instruction::FSub, 01914 NewPH, CFP, "IV.S.next.", Incr); 01915 01916 NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); 01917 NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); 01918 01919 /* Remove cast operation */ 01920 ShadowUse->replaceAllUsesWith(NewPH); 01921 ShadowUse->eraseFromParent(); 01922 Changed = true; 01923 break; 01924 } 01925 } 01926 01927 /// FindIVUserForCond - If Cond has an operand that is an expression of an IV, 01928 /// set the IV user and stride information and return true, otherwise return 01929 /// false. 01930 bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) { 01931 for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 01932 if (UI->getUser() == Cond) { 01933 // NOTE: we could handle setcc instructions with multiple uses here, but 01934 // InstCombine does it as well for simple uses, it's not clear that it 01935 // occurs enough in real life to handle. 01936 CondUse = UI; 01937 return true; 01938 } 01939 return false; 01940 } 01941 01942 /// OptimizeMax - Rewrite the loop's terminating condition if it uses 01943 /// a max computation. 01944 /// 01945 /// This is a narrow solution to a specific, but acute, problem. For loops 01946 /// like this: 01947 /// 01948 /// i = 0; 01949 /// do { 01950 /// p[i] = 0.0; 01951 /// } while (++i < n); 01952 /// 01953 /// the trip count isn't just 'n', because 'n' might not be positive. And 01954 /// unfortunately this can come up even for loops where the user didn't use 01955 /// a C do-while loop. For example, seemingly well-behaved top-test loops 01956 /// will commonly be lowered like this: 01957 // 01958 /// if (n > 0) { 01959 /// i = 0; 01960 /// do { 01961 /// p[i] = 0.0; 01962 /// } while (++i < n); 01963 /// } 01964 /// 01965 /// and then it's possible for subsequent optimization to obscure the if 01966 /// test in such a way that indvars can't find it. 01967 /// 01968 /// When indvars can't find the if test in loops like this, it creates a 01969 /// max expression, which allows it to give the loop a canonical 01970 /// induction variable: 01971 /// 01972 /// i = 0; 01973 /// max = n < 1 ? 1 : n; 01974 /// do { 01975 /// p[i] = 0.0; 01976 /// } while (++i != max); 01977 /// 01978 /// Canonical induction variables are necessary because the loop passes 01979 /// are designed around them. The most obvious example of this is the 01980 /// LoopInfo analysis, which doesn't remember trip count values. It 01981 /// expects to be able to rediscover the trip count each time it is 01982 /// needed, and it does this using a simple analysis that only succeeds if 01983 /// the loop has a canonical induction variable. 01984 /// 01985 /// However, when it comes time to generate code, the maximum operation 01986 /// can be quite costly, especially if it's inside of an outer loop. 01987 /// 01988 /// This function solves this problem by detecting this type of loop and 01989 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting 01990 /// the instructions for the maximum computation. 01991 /// 01992 ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { 01993 // Check that the loop matches the pattern we're looking for. 
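  // Specifically: an eq/ne exit compare whose right operand is a single-use
  // select implementing the max that was introduced for the trip count; the
  // checks below verify each piece of that pattern in turn.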
01994 if (Cond->getPredicate() != CmpInst::ICMP_EQ && 01995 Cond->getPredicate() != CmpInst::ICMP_NE) 01996 return Cond; 01997 01998 SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); 01999 if (!Sel || !Sel->hasOneUse()) return Cond; 02000 02001 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); 02002 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) 02003 return Cond; 02004 const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1); 02005 02006 // Add one to the backedge-taken count to get the trip count. 02007 const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount); 02008 if (IterationCount != SE.getSCEV(Sel)) return Cond; 02009 02010 // Check for a max calculation that matches the pattern. There's no check 02011 // for ICMP_ULE here because the comparison would be with zero, which 02012 // isn't interesting. 02013 CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE; 02014 const SCEVNAryExpr *Max = nullptr; 02015 if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) { 02016 Pred = ICmpInst::ICMP_SLE; 02017 Max = S; 02018 } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) { 02019 Pred = ICmpInst::ICMP_SLT; 02020 Max = S; 02021 } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) { 02022 Pred = ICmpInst::ICMP_ULT; 02023 Max = U; 02024 } else { 02025 // No match; bail. 02026 return Cond; 02027 } 02028 02029 // To handle a max with more than two operands, this optimization would 02030 // require additional checking and setup. 02031 if (Max->getNumOperands() != 2) 02032 return Cond; 02033 02034 const SCEV *MaxLHS = Max->getOperand(0); 02035 const SCEV *MaxRHS = Max->getOperand(1); 02036 02037 // ScalarEvolution canonicalizes constants to the left. For < and >, look 02038 // for a comparison with 1. For <= and >=, a comparison with zero. 02039 if (!MaxLHS || 02040 (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One))) 02041 return Cond; 02042 02043 // Check the relevant induction variable for conformance to 02044 // the pattern. 02045 const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); 02046 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); 02047 if (!AR || !AR->isAffine() || 02048 AR->getStart() != One || 02049 AR->getStepRecurrence(SE) != One) 02050 return Cond; 02051 02052 assert(AR->getLoop() == L && 02053 "Loop condition operand is an addrec in a different loop!"); 02054 02055 // Check the right operand of the select, and remember it, as it will 02056 // be used in the new comparison instruction. 02057 Value *NewRHS = nullptr; 02058 if (ICmpInst::isTrueWhenEqual(Pred)) { 02059 // Look for n+1, and grab n. 02060 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) 02061 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 02062 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 02063 NewRHS = BO->getOperand(0); 02064 if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) 02065 if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) 02066 if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) 02067 NewRHS = BO->getOperand(0); 02068 if (!NewRHS) 02069 return Cond; 02070 } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) 02071 NewRHS = Sel->getOperand(1); 02072 else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) 02073 NewRHS = Sel->getOperand(2); 02074 else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS)) 02075 NewRHS = SU->getValue(); 02076 else 02077 // Max doesn't match expected pattern. 
02078 return Cond; 02079 02080 // Determine the new comparison opcode. It may be signed or unsigned, 02081 // and the original comparison may be either equality or inequality. 02082 if (Cond->getPredicate() == CmpInst::ICMP_EQ) 02083 Pred = CmpInst::getInversePredicate(Pred); 02084 02085 // Ok, everything looks ok to change the condition into an SLT or SGE and 02086 // delete the max calculation. 02087 ICmpInst *NewCond = 02088 new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); 02089 02090 // Delete the max calculation instructions. 02091 Cond->replaceAllUsesWith(NewCond); 02092 CondUse->setUser(NewCond); 02093 Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); 02094 Cond->eraseFromParent(); 02095 Sel->eraseFromParent(); 02096 if (Cmp->use_empty()) 02097 Cmp->eraseFromParent(); 02098 return NewCond; 02099 } 02100 02101 /// OptimizeLoopTermCond - Change loop terminating condition to use the 02102 /// postinc iv when possible. 02103 void 02104 LSRInstance::OptimizeLoopTermCond() { 02105 SmallPtrSet<Instruction *, 4> PostIncs; 02106 02107 BasicBlock *LatchBlock = L->getLoopLatch(); 02108 SmallVector<BasicBlock*, 8> ExitingBlocks; 02109 L->getExitingBlocks(ExitingBlocks); 02110 02111 for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { 02112 BasicBlock *ExitingBlock = ExitingBlocks[i]; 02113 02114 // Get the terminating condition for the loop if possible. If we 02115 // can, we want to change it to use a post-incremented version of its 02116 // induction variable, to allow coalescing the live ranges for the IV into 02117 // one register value. 02118 02119 BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); 02120 if (!TermBr) 02121 continue; 02122 // FIXME: Overly conservative, termination condition could be an 'or' etc.. 02123 if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) 02124 continue; 02125 02126 // Search IVUsesByStride to find Cond's IVUse if there is one. 02127 IVStrideUse *CondUse = nullptr; 02128 ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); 02129 if (!FindIVUserForCond(Cond, CondUse)) 02130 continue; 02131 02132 // If the trip count is computed in terms of a max (due to ScalarEvolution 02133 // being unable to find a sufficient guard, for example), change the loop 02134 // comparison to use SLT or ULT instead of NE. 02135 // One consequence of doing this now is that it disrupts the count-down 02136 // optimization. That's not always a bad thing though, because in such 02137 // cases it may still be worthwhile to avoid a max. 02138 Cond = OptimizeMax(Cond, CondUse); 02139 02140 // If this exiting block dominates the latch block, it may also use 02141 // the post-inc value if it won't be shared with other uses. 02142 // Check for dominance. 02143 if (!DT.dominates(ExitingBlock, LatchBlock)) 02144 continue; 02145 02146 // Conservatively avoid trying to use the post-inc value in non-latch 02147 // exits if there may be pre-inc users in intervening blocks. 02148 if (LatchBlock != ExitingBlock) 02149 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) 02150 // Test if the use is reachable from the exiting block. This dominator 02151 // query is a conservative approximation of reachability. 02152 if (&*UI != CondUse && 02153 !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) { 02154 // Conservatively assume there may be reuse if the quotient of their 02155 // strides could be a legal scale. 
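          // That is, if the other use's stride is a constant multiple C of the
          // condition's stride and C (or -C) would be a legal address scale for
          // that use, keep the comparison on the pre-incremented value.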
02156 const SCEV *A = IU.getStride(*CondUse, L); 02157 const SCEV *B = IU.getStride(*UI, L); 02158 if (!A || !B) continue; 02159 if (SE.getTypeSizeInBits(A->getType()) != 02160 SE.getTypeSizeInBits(B->getType())) { 02161 if (SE.getTypeSizeInBits(A->getType()) > 02162 SE.getTypeSizeInBits(B->getType())) 02163 B = SE.getSignExtendExpr(B, A->getType()); 02164 else 02165 A = SE.getSignExtendExpr(A, B->getType()); 02166 } 02167 if (const SCEVConstant *D = 02168 dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { 02169 const ConstantInt *C = D->getValue(); 02170 // Stride of one or negative one can have reuse with non-addresses. 02171 if (C->isOne() || C->isAllOnesValue()) 02172 goto decline_post_inc; 02173 // Avoid weird situations. 02174 if (C->getValue().getMinSignedBits() >= 64 || 02175 C->getValue().isMinSignedValue()) 02176 goto decline_post_inc; 02177 // Check for possible scaled-address reuse. 02178 Type *AccessTy = getAccessType(UI->getUser()); 02179 int64_t Scale = C->getSExtValue(); 02180 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, 02181 /*BaseOffset=*/ 0, 02182 /*HasBaseReg=*/ false, Scale)) 02183 goto decline_post_inc; 02184 Scale = -Scale; 02185 if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr, 02186 /*BaseOffset=*/ 0, 02187 /*HasBaseReg=*/ false, Scale)) 02188 goto decline_post_inc; 02189 } 02190 } 02191 02192 DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " 02193 << *Cond << '\n'); 02194 02195 // It's possible for the setcc instruction to be anywhere in the loop, and 02196 // possible for it to have multiple users. If it is not immediately before 02197 // the exiting block branch, move it. 02198 if (&*++BasicBlock::iterator(Cond) != TermBr) { 02199 if (Cond->hasOneUse()) { 02200 Cond->moveBefore(TermBr); 02201 } else { 02202 // Clone the terminating condition and insert into the loopend. 02203 ICmpInst *OldCond = Cond; 02204 Cond = cast<ICmpInst>(Cond->clone()); 02205 Cond->setName(L->getHeader()->getName() + ".termcond"); 02206 ExitingBlock->getInstList().insert(TermBr, Cond); 02207 02208 // Clone the IVUse, as the old use still exists! 02209 CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace()); 02210 TermBr->replaceUsesOfWith(OldCond, Cond); 02211 } 02212 } 02213 02214 // If we get to here, we know that we can transform the setcc instruction to 02215 // use the post-incremented version of the IV, allowing us to coalesce the 02216 // live ranges for the IV correctly. 02217 CondUse->transformToPostInc(L); 02218 Changed = true; 02219 02220 PostIncs.insert(Cond); 02221 decline_post_inc:; 02222 } 02223 02224 // Determine an insertion point for the loop induction variable increment. It 02225 // must dominate all the post-inc comparisons we just set up, and it must 02226 // dominate the loop latch edge. 02227 IVIncInsertPos = L->getLoopLatch()->getTerminator(); 02228 for (Instruction *Inst : PostIncs) { 02229 BasicBlock *BB = 02230 DT.findNearestCommonDominator(IVIncInsertPos->getParent(), 02231 Inst->getParent()); 02232 if (BB == Inst->getParent()) 02233 IVIncInsertPos = Inst; 02234 else if (BB != IVIncInsertPos->getParent()) 02235 IVIncInsertPos = BB->getTerminator(); 02236 } 02237 } 02238 02239 /// reconcileNewOffset - Determine if the given use can accommodate a fixup 02240 /// at the given offset and other details. If so, update the use and 02241 /// return true. 
02242 bool 02243 LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, 02244 LSRUse::KindType Kind, Type *AccessTy) { 02245 int64_t NewMinOffset = LU.MinOffset; 02246 int64_t NewMaxOffset = LU.MaxOffset; 02247 Type *NewAccessTy = AccessTy; 02248 02249 // Check for a mismatched kind. It's tempting to collapse mismatched kinds to 02250 // something conservative, however this can pessimize in the case that one of 02251 // the uses will have all its uses outside the loop, for example. 02252 if (LU.Kind != Kind) 02253 return false; 02254 02255 // Check for a mismatched access type, and fall back conservatively as needed. 02256 // TODO: Be less conservative when the type is similar and can use the same 02257 // addressing modes. 02258 if (Kind == LSRUse::Address && AccessTy != LU.AccessTy) 02259 NewAccessTy = Type::getVoidTy(AccessTy->getContext()); 02260 02261 // Conservatively assume HasBaseReg is true for now. 02262 if (NewOffset < LU.MinOffset) { 02263 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, 02264 LU.MaxOffset - NewOffset, HasBaseReg)) 02265 return false; 02266 NewMinOffset = NewOffset; 02267 } else if (NewOffset > LU.MaxOffset) { 02268 if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, 02269 NewOffset - LU.MinOffset, HasBaseReg)) 02270 return false; 02271 NewMaxOffset = NewOffset; 02272 } 02273 02274 // Update the use. 02275 LU.MinOffset = NewMinOffset; 02276 LU.MaxOffset = NewMaxOffset; 02277 LU.AccessTy = NewAccessTy; 02278 if (NewOffset != LU.Offsets.back()) 02279 LU.Offsets.push_back(NewOffset); 02280 return true; 02281 } 02282 02283 /// getUse - Return an LSRUse index and an offset value for a fixup which 02284 /// needs the given expression, with the given kind and optional access type. 02285 /// Either reuse an existing use or create a new one, as needed. 02286 std::pair<size_t, int64_t> 02287 LSRInstance::getUse(const SCEV *&Expr, 02288 LSRUse::KindType Kind, Type *AccessTy) { 02289 const SCEV *Copy = Expr; 02290 int64_t Offset = ExtractImmediate(Expr, SE); 02291 02292 // Basic uses can't accept any offset, for example. 02293 if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, 02294 Offset, /*HasBaseReg=*/ true)) { 02295 Expr = Copy; 02296 Offset = 0; 02297 } 02298 02299 std::pair<UseMapTy::iterator, bool> P = 02300 UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0)); 02301 if (!P.second) { 02302 // A use already existed with this base. 02303 size_t LUIdx = P.first->second; 02304 LSRUse &LU = Uses[LUIdx]; 02305 if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) 02306 // Reuse this use. 02307 return std::make_pair(LUIdx, Offset); 02308 } 02309 02310 // Create a new use. 02311 size_t LUIdx = Uses.size(); 02312 P.first->second = LUIdx; 02313 Uses.push_back(LSRUse(Kind, AccessTy)); 02314 LSRUse &LU = Uses[LUIdx]; 02315 02316 // We don't need to track redundant offsets, but we don't need to go out 02317 // of our way here to avoid them. 02318 if (LU.Offsets.empty() || Offset != LU.Offsets.back()) 02319 LU.Offsets.push_back(Offset); 02320 02321 LU.MinOffset = Offset; 02322 LU.MaxOffset = Offset; 02323 return std::make_pair(LUIdx, Offset); 02324 } 02325 02326 /// DeleteUse - Delete the given use from the Uses list. 02327 void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { 02328 if (&LU != &Uses.back()) 02329 std::swap(LU, Uses.back()); 02330 Uses.pop_back(); 02331 02332 // Update RegUses. 
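  // The deleted use was swapped with the last one above, so the per-register
  // use-index bitvectors are updated to mirror that swap before the last slot
  // is dropped.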
02333   RegUses.SwapAndDropUse(LUIdx, Uses.size());
02334 }
02335 
02336 /// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has
02337 /// a formula that has the same registers as the given formula.
02338 LSRUse *
02339 LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
02340                                        const LSRUse &OrigLU) {
02341   // Search all uses for the formula. This could be more clever.
02342   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
02343     LSRUse &LU = Uses[LUIdx];
02344     // Check whether this use is close enough to OrigLU, to see whether it's
02345     // worthwhile looking through its formulae.
02346     // Ignore ICmpZero uses because they may contain formulae generated by
02347     // GenerateICmpZeroScales, in which case adding fixup offsets may
02348     // be invalid.
02349     if (&LU != &OrigLU &&
02350         LU.Kind != LSRUse::ICmpZero &&
02351         LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
02352         LU.WidestFixupType == OrigLU.WidestFixupType &&
02353         LU.HasFormulaWithSameRegs(OrigF)) {
02354       // Scan through this use's formulae.
02355       for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
02356            E = LU.Formulae.end(); I != E; ++I) {
02357         const Formula &F = *I;
02358         // Check to see if this formula has the same registers and symbols
02359         // as OrigF.
02360         if (F.BaseRegs == OrigF.BaseRegs &&
02361             F.ScaledReg == OrigF.ScaledReg &&
02362             F.BaseGV == OrigF.BaseGV &&
02363             F.Scale == OrigF.Scale &&
02364             F.UnfoldedOffset == OrigF.UnfoldedOffset) {
02365           if (F.BaseOffset == 0)
02366             return &LU;
02367           // This is the formula where all the registers and symbols matched;
02368           // there aren't going to be any others. Since we declined it, we
02369           // can skip the rest of the formulae and proceed to the next LSRUse.
02370           break;
02371         }
02372       }
02373     }
02374   }
02375 
02376   // Nothing looked good.
02377   return nullptr;
02378 }
02379 
02380 void LSRInstance::CollectInterestingTypesAndFactors() {
02381   SmallSetVector<const SCEV *, 4> Strides;
02382 
02383   // Collect interesting types and strides.
02384   SmallVector<const SCEV *, 4> Worklist;
02385   for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
02386     const SCEV *Expr = IU.getExpr(*UI);
02387 
02388     // Collect interesting types.
02389     Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
02390 
02391     // Add strides for mentioned loops.
02392     Worklist.push_back(Expr);
02393     do {
02394       const SCEV *S = Worklist.pop_back_val();
02395       if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
02396         if (AR->getLoop() == L)
02397           Strides.insert(AR->getStepRecurrence(SE));
02398         Worklist.push_back(AR->getStart());
02399       } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
02400         Worklist.append(Add->op_begin(), Add->op_end());
02401       }
02402     } while (!Worklist.empty());
02403   }
02404 
02405   // Compute interesting factors from the set of interesting strides.
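  // For each pair of strides, if one evenly divides the other, record the
  // (64-bit) quotient as an interesting factor; these feed the later
  // GenerateScales and GenerateICmpZeroScales steps.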
02406 for (SmallSetVector<const SCEV *, 4>::const_iterator 02407 I = Strides.begin(), E = Strides.end(); I != E; ++I) 02408 for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter = 02409 std::next(I); NewStrideIter != E; ++NewStrideIter) { 02410 const SCEV *OldStride = *I; 02411 const SCEV *NewStride = *NewStrideIter; 02412 02413 if (SE.getTypeSizeInBits(OldStride->getType()) != 02414 SE.getTypeSizeInBits(NewStride->getType())) { 02415 if (SE.getTypeSizeInBits(OldStride->getType()) > 02416 SE.getTypeSizeInBits(NewStride->getType())) 02417 NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); 02418 else 02419 OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); 02420 } 02421 if (const SCEVConstant *Factor = 02422 dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride, 02423 SE, true))) { 02424 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 02425 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 02426 } else if (const SCEVConstant *Factor = 02427 dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride, 02428 NewStride, 02429 SE, true))) { 02430 if (Factor->getValue()->getValue().getMinSignedBits() <= 64) 02431 Factors.insert(Factor->getValue()->getValue().getSExtValue()); 02432 } 02433 } 02434 02435 // If all uses use the same type, don't bother looking for truncation-based 02436 // reuse. 02437 if (Types.size() == 1) 02438 Types.clear(); 02439 02440 DEBUG(print_factors_and_types(dbgs())); 02441 } 02442 02443 /// findIVOperand - Helper for CollectChains that finds an IV operand (computed 02444 /// by an AddRec in this loop) within [OI,OE) or returns OE. If IVUsers mapped 02445 /// Instructions to IVStrideUses, we could partially skip this. 02446 static User::op_iterator 02447 findIVOperand(User::op_iterator OI, User::op_iterator OE, 02448 Loop *L, ScalarEvolution &SE) { 02449 for(; OI != OE; ++OI) { 02450 if (Instruction *Oper = dyn_cast<Instruction>(*OI)) { 02451 if (!SE.isSCEVable(Oper->getType())) 02452 continue; 02453 02454 if (const SCEVAddRecExpr *AR = 02455 dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) { 02456 if (AR->getLoop() == L) 02457 break; 02458 } 02459 } 02460 } 02461 return OI; 02462 } 02463 02464 /// getWideOperand - IVChain logic must consistenctly peek base TruncInst 02465 /// operands, so wrap it in a convenient helper. 02466 static Value *getWideOperand(Value *Oper) { 02467 if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper)) 02468 return Trunc->getOperand(0); 02469 return Oper; 02470 } 02471 02472 /// isCompatibleIVType - Return true if we allow an IV chain to include both 02473 /// types. 02474 static bool isCompatibleIVType(Value *LVal, Value *RVal) { 02475 Type *LType = LVal->getType(); 02476 Type *RType = RVal->getType(); 02477 return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); 02478 } 02479 02480 /// getExprBase - Return an approximation of this SCEV expression's "base", or 02481 /// NULL for any constant. Returning the expression itself is 02482 /// conservative. Returning a deeper subexpression is more precise and valid as 02483 /// long as it isn't less complex than another subexpression. For expressions 02484 /// involving multiple unscaled values, we need to return the pointer-type 02485 /// SCEVUnknown. This avoids forming chains across objects, such as: 02486 /// PrevOper==a[i], IVOper==b[i], IVInc==b-a. 02487 /// 02488 /// Since SCEVUnknown is the rightmost type, and pointers are the rightmost 02489 /// SCEVUnknown, we simply return the rightmost SCEV operand. 
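/// For example, for the add-recurrence {(4 + %a),+,8}<%L> the base is the
/// SCEVUnknown %a, while for a lone constant the result is null.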
02490 static const SCEV *getExprBase(const SCEV *S) {
02491   switch (S->getSCEVType()) {
02492   default: // including scUnknown.
02493     return S;
02494   case scConstant:
02495     return nullptr;
02496   case scTruncate:
02497     return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
02498   case scZeroExtend:
02499     return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
02500   case scSignExtend:
02501     return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
02502   case scAddExpr: {
02503     // Skip over scaled operands (scMulExpr) to follow add operands as long as
02504     // there's nothing more complex.
02505     // FIXME: not sure if we want to recognize negation.
02506     const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
02507     for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
02508            E(Add->op_begin()); I != E; ++I) {
02509       const SCEV *SubExpr = *I;
02510       if (SubExpr->getSCEVType() == scAddExpr)
02511         return getExprBase(SubExpr);
02512 
02513       if (SubExpr->getSCEVType() != scMulExpr)
02514         return SubExpr;
02515     }
02516     return S; // all operands are scaled, be conservative.
02517   }
02518   case scAddRecExpr:
02519     return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
02520   }
02521 }
02522 
02523 /// Return true if the chain increment is profitable to expand into a loop
02524 /// invariant value, which may require its own register. A profitable chain
02525 /// increment will be an offset relative to the same base. We allow such offsets
02526 /// to potentially be used as a chain increment as long as it's not obviously
02527 /// expensive to expand using real instructions.
02528 bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
02529                                     const SCEV *IncExpr,
02530                                     ScalarEvolution &SE) {
02531   // Aggressively form chains when -stress-ivchain.
02532   if (StressIVChain)
02533     return true;
02534 
02535   // Do not replace a constant offset from IV head with a nonconstant IV
02536   // increment.
02537   if (!isa<SCEVConstant>(IncExpr)) {
02538     const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
02539     if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
02540       return 0;
02541   }
02542 
02543   SmallPtrSet<const SCEV*, 8> Processed;
02544   return !isHighCostExpansion(IncExpr, Processed, SE);
02545 }
02546 
02547 /// Return true if the number of registers needed for the chain is estimated to
02548 /// be less than the number required for the individual IV users. First prohibit
02549 /// any IV users that keep the IV live across increments (the Users set should
02550 /// be empty). Next count the number and type of increments in the chain.
02551 ///
02552 /// Chaining IVs can lead to considerable code bloat if ISEL doesn't
02553 /// effectively use postinc addressing modes. Only consider it profitable if the
02554 /// increments can be computed in fewer registers when chained.
02555 ///
02556 /// TODO: Consider IVInc free if it's already used in other chains.
02557 static bool
02558 isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
02559                   ScalarEvolution &SE, const TargetTransformInfo &TTI) {
02560   if (StressIVChain)
02561     return true;
02562 
02563   if (!Chain.hasIncs())
02564     return false;
02565 
02566   if (!Users.empty()) {
02567     DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
02568           for (Instruction *Inst : Users) {
02569             dbgs() << " " << *Inst << "\n";
02570           });
02571     return false;
02572   }
02573   assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
02574 
02575   // The chain itself may require a register, so initialize cost to 1.
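  // Below, a complete chain (one ending in the loop-header phi) credits a
  // register back, as does folding more than one constant increment; each
  // distinct variable increment costs a register, and each reused increment
  // expression saves one. For example, a complete chain with two constant
  // increments nets 1 - 1 - 1 = -1 and is kept, while the same chain without
  // the closing phi stays at 0 and is rejected (only a net cost < 0 counts as
  // profitable).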
02576 int cost = 1; 02577 02578 // A complete chain likely eliminates the need for keeping the original IV in 02579 // a register. LSR does not currently know how to form a complete chain unless 02580 // the header phi already exists. 02581 if (isa<PHINode>(Chain.tailUserInst()) 02582 && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { 02583 --cost; 02584 } 02585 const SCEV *LastIncExpr = nullptr; 02586 unsigned NumConstIncrements = 0; 02587 unsigned NumVarIncrements = 0; 02588 unsigned NumReusedIncrements = 0; 02589 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 02590 I != E; ++I) { 02591 02592 if (I->IncExpr->isZero()) 02593 continue; 02594 02595 // Incrementing by zero or some constant is neutral. We assume constants can 02596 // be folded into an addressing mode or an add's immediate operand. 02597 if (isa<SCEVConstant>(I->IncExpr)) { 02598 ++NumConstIncrements; 02599 continue; 02600 } 02601 02602 if (I->IncExpr == LastIncExpr) 02603 ++NumReusedIncrements; 02604 else 02605 ++NumVarIncrements; 02606 02607 LastIncExpr = I->IncExpr; 02608 } 02609 // An IV chain with a single increment is handled by LSR's postinc 02610 // uses. However, a chain with multiple increments requires keeping the IV's 02611 // value live longer than it needs to be if chained. 02612 if (NumConstIncrements > 1) 02613 --cost; 02614 02615 // Materializing increment expressions in the preheader that didn't exist in 02616 // the original code may cost a register. For example, sign-extended array 02617 // indices can produce ridiculous increments like this: 02618 // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64))) 02619 cost += NumVarIncrements; 02620 02621 // Reusing variable increments likely saves a register to hold the multiple of 02622 // the stride. 02623 cost -= NumReusedIncrements; 02624 02625 DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost 02626 << "\n"); 02627 02628 return cost < 0; 02629 } 02630 02631 /// ChainInstruction - Add this IV user to an existing chain or make it the head 02632 /// of a new chain. 02633 void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, 02634 SmallVectorImpl<ChainUsers> &ChainUsersVec) { 02635 // When IVs are used as types of varying widths, they are generally converted 02636 // to a wider type with some uses remaining narrow under a (free) trunc. 02637 Value *const NextIV = getWideOperand(IVOper); 02638 const SCEV *const OperExpr = SE.getSCEV(NextIV); 02639 const SCEV *const OperExprBase = getExprBase(OperExpr); 02640 02641 // Visit all existing chains. Check if its IVOper can be computed as a 02642 // profitable loop invariant increment from the last link in the Chain. 02643 unsigned ChainIdx = 0, NChains = IVChainVec.size(); 02644 const SCEV *LastIncExpr = nullptr; 02645 for (; ChainIdx < NChains; ++ChainIdx) { 02646 IVChain &Chain = IVChainVec[ChainIdx]; 02647 02648 // Prune the solution space aggressively by checking that both IV operands 02649 // are expressions that operate on the same unscaled SCEVUnknown. This 02650 // "base" will be canceled by the subsequent getMinusSCEV call. Checking 02651 // first avoids creating extra SCEV expressions. 02652 if (!StressIVChain && Chain.ExprBase != OperExprBase) 02653 continue; 02654 02655 Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); 02656 if (!isCompatibleIVType(PrevIV, NextIV)) 02657 continue; 02658 02659 // A phi node terminates a chain. 
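    // Phis can only ever be the last link, so a chain already ended by a phi
    // cannot accept this user if it is itself a phi.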
02660 if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) 02661 continue; 02662 02663 // The increment must be loop-invariant so it can be kept in a register. 02664 const SCEV *PrevExpr = SE.getSCEV(PrevIV); 02665 const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); 02666 if (!SE.isLoopInvariant(IncExpr, L)) 02667 continue; 02668 02669 if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { 02670 LastIncExpr = IncExpr; 02671 break; 02672 } 02673 } 02674 // If we haven't found a chain, create a new one, unless we hit the max. Don't 02675 // bother for phi nodes, because they must be last in the chain. 02676 if (ChainIdx == NChains) { 02677 if (isa<PHINode>(UserInst)) 02678 return; 02679 if (NChains >= MaxChains && !StressIVChain) { 02680 DEBUG(dbgs() << "IV Chain Limit\n"); 02681 return; 02682 } 02683 LastIncExpr = OperExpr; 02684 // IVUsers may have skipped over sign/zero extensions. We don't currently 02685 // attempt to form chains involving extensions unless they can be hoisted 02686 // into this loop's AddRec. 02687 if (!isa<SCEVAddRecExpr>(LastIncExpr)) 02688 return; 02689 ++NChains; 02690 IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), 02691 OperExprBase)); 02692 ChainUsersVec.resize(NChains); 02693 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst 02694 << ") IV=" << *LastIncExpr << "\n"); 02695 } else { 02696 DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst 02697 << ") IV+" << *LastIncExpr << "\n"); 02698 // Add this IV user to the end of the chain. 02699 IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); 02700 } 02701 IVChain &Chain = IVChainVec[ChainIdx]; 02702 02703 SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; 02704 // This chain's NearUsers become FarUsers. 02705 if (!LastIncExpr->isZero()) { 02706 ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(), 02707 NearUsers.end()); 02708 NearUsers.clear(); 02709 } 02710 02711 // All other uses of IVOperand become near uses of the chain. 02712 // We currently ignore intermediate values within SCEV expressions, assuming 02713 // they will eventually be used be the current chain, or can be computed 02714 // from one of the chain increments. To be more precise we could 02715 // transitively follow its user and only add leaf IV users to the set. 02716 for (User *U : IVOper->users()) { 02717 Instruction *OtherUse = dyn_cast<Instruction>(U); 02718 if (!OtherUse) 02719 continue; 02720 // Uses in the chain will no longer be uses if the chain is formed. 02721 // Include the head of the chain in this iteration (not Chain.begin()). 02722 IVChain::const_iterator IncIter = Chain.Incs.begin(); 02723 IVChain::const_iterator IncEnd = Chain.Incs.end(); 02724 for( ; IncIter != IncEnd; ++IncIter) { 02725 if (IncIter->UserInst == OtherUse) 02726 break; 02727 } 02728 if (IncIter != IncEnd) 02729 continue; 02730 02731 if (SE.isSCEVable(OtherUse->getType()) 02732 && !isa<SCEVUnknown>(SE.getSCEV(OtherUse)) 02733 && IU.isIVUserOrOperand(OtherUse)) { 02734 continue; 02735 } 02736 NearUsers.insert(OtherUse); 02737 } 02738 02739 // Since this user is part of the chain, it's no longer considered a use 02740 // of the chain. 02741 ChainUsersVec[ChainIdx].FarUsers.erase(UserInst); 02742 } 02743 02744 /// CollectChains - Populate the vector of Chains. 02745 /// 02746 /// This decreases ILP at the architecture level. Targets with ample registers, 02747 /// multiple memory ports, and no register renaming probably don't want 02748 /// this. 
However, such targets should probably disable LSR altogether. 02749 /// 02750 /// The job of LSR is to make a reasonable choice of induction variables across 02751 /// the loop. Subsequent passes can easily "unchain" computation exposing more 02752 /// ILP *within the loop* if the target wants it. 02753 /// 02754 /// Finding the best IV chain is potentially a scheduling problem. Since LSR 02755 /// will not reorder memory operations, it will recognize this as a chain, but 02756 /// will generate redundant IV increments. Ideally this would be corrected later 02757 /// by a smart scheduler: 02758 /// = A[i] 02759 /// = A[i+x] 02760 /// A[i] = 02761 /// A[i+x] = 02762 /// 02763 /// TODO: Walk the entire domtree within this loop, not just the path to the 02764 /// loop latch. This will discover chains on side paths, but requires 02765 /// maintaining multiple copies of the Chains state. 02766 void LSRInstance::CollectChains() { 02767 DEBUG(dbgs() << "Collecting IV Chains.\n"); 02768 SmallVector<ChainUsers, 8> ChainUsersVec; 02769 02770 SmallVector<BasicBlock *,8> LatchPath; 02771 BasicBlock *LoopHeader = L->getHeader(); 02772 for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch()); 02773 Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) { 02774 LatchPath.push_back(Rung->getBlock()); 02775 } 02776 LatchPath.push_back(LoopHeader); 02777 02778 // Walk the instruction stream from the loop header to the loop latch. 02779 for (SmallVectorImpl<BasicBlock *>::reverse_iterator 02780 BBIter = LatchPath.rbegin(), BBEnd = LatchPath.rend(); 02781 BBIter != BBEnd; ++BBIter) { 02782 for (BasicBlock::iterator I = (*BBIter)->begin(), E = (*BBIter)->end(); 02783 I != E; ++I) { 02784 // Skip instructions that weren't seen by IVUsers analysis. 02785 if (isa<PHINode>(I) || !IU.isIVUserOrOperand(I)) 02786 continue; 02787 02788 // Ignore users that are part of a SCEV expression. This way we only 02789 // consider leaf IV Users. This effectively rediscovers a portion of 02790 // IVUsers analysis but in program order this time. 02791 if (SE.isSCEVable(I->getType()) && !isa<SCEVUnknown>(SE.getSCEV(I))) 02792 continue; 02793 02794 // Remove this instruction from any NearUsers set it may be in. 02795 for (unsigned ChainIdx = 0, NChains = IVChainVec.size(); 02796 ChainIdx < NChains; ++ChainIdx) { 02797 ChainUsersVec[ChainIdx].NearUsers.erase(I); 02798 } 02799 // Search for operands that can be chained. 02800 SmallPtrSet<Instruction*, 4> UniqueOperands; 02801 User::op_iterator IVOpEnd = I->op_end(); 02802 User::op_iterator IVOpIter = findIVOperand(I->op_begin(), IVOpEnd, L, SE); 02803 while (IVOpIter != IVOpEnd) { 02804 Instruction *IVOpInst = cast<Instruction>(*IVOpIter); 02805 if (UniqueOperands.insert(IVOpInst)) 02806 ChainInstruction(I, IVOpInst, ChainUsersVec); 02807 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); 02808 } 02809 } // Continue walking down the instructions. 02810 } // Continue walking down the domtree. 02811 // Visit phi backedges to determine if the chain can generate the IV postinc. 02812 for (BasicBlock::iterator I = L->getHeader()->begin(); 02813 PHINode *PN = dyn_cast<PHINode>(I); ++I) { 02814 if (!SE.isSCEVable(PN->getType())) 02815 continue; 02816 02817 Instruction *IncV = 02818 dyn_cast<Instruction>(PN->getIncomingValueForBlock(L->getLoopLatch())); 02819 if (IncV) 02820 ChainInstruction(PN, IncV, ChainUsersVec); 02821 } 02822 // Remove any unprofitable chains. 
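  // Compact IVChainVec in place: profitable chains are finalized and slid
  // down over the rejected ones, then the vector is trimmed to the survivors.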
02823 unsigned ChainIdx = 0; 02824 for (unsigned UsersIdx = 0, NChains = IVChainVec.size(); 02825 UsersIdx < NChains; ++UsersIdx) { 02826 if (!isProfitableChain(IVChainVec[UsersIdx], 02827 ChainUsersVec[UsersIdx].FarUsers, SE, TTI)) 02828 continue; 02829 // Preserve the chain at UsesIdx. 02830 if (ChainIdx != UsersIdx) 02831 IVChainVec[ChainIdx] = IVChainVec[UsersIdx]; 02832 FinalizeChain(IVChainVec[ChainIdx]); 02833 ++ChainIdx; 02834 } 02835 IVChainVec.resize(ChainIdx); 02836 } 02837 02838 void LSRInstance::FinalizeChain(IVChain &Chain) { 02839 assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); 02840 DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); 02841 02842 for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); 02843 I != E; ++I) { 02844 DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); 02845 User::op_iterator UseI = 02846 std::find(I->UserInst->op_begin(), I->UserInst->op_end(), I->IVOperand); 02847 assert(UseI != I->UserInst->op_end() && "cannot find IV operand"); 02848 IVIncSet.insert(UseI); 02849 } 02850 } 02851 02852 /// Return true if the IVInc can be folded into an addressing mode. 02853 static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, 02854 Value *Operand, const TargetTransformInfo &TTI) { 02855 const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr); 02856 if (!IncConst || !isAddressUse(UserInst, Operand)) 02857 return false; 02858 02859 if (IncConst->getValue()->getValue().getMinSignedBits() > 64) 02860 return false; 02861 02862 int64_t IncOffset = IncConst->getValue()->getSExtValue(); 02863 if (!isAlwaysFoldable(TTI, LSRUse::Address, 02864 getAccessType(UserInst), /*BaseGV=*/ nullptr, 02865 IncOffset, /*HaseBaseReg=*/ false)) 02866 return false; 02867 02868 return true; 02869 } 02870 02871 /// GenerateIVChains - Generate an add or subtract for each IVInc in a chain to 02872 /// materialize the IV user's operand from the previous IV user's operand. 02873 void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, 02874 SmallVectorImpl<WeakVH> &DeadInsts) { 02875 // Find the new IVOperand for the head of the chain. It may have been replaced 02876 // by LSR. 02877 const IVInc &Head = Chain.Incs[0]; 02878 User::op_iterator IVOpEnd = Head.UserInst->op_end(); 02879 // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. 02880 User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), 02881 IVOpEnd, L, SE); 02882 Value *IVSrc = nullptr; 02883 while (IVOpIter != IVOpEnd) { 02884 IVSrc = getWideOperand(*IVOpIter); 02885 02886 // If this operand computes the expression that the chain needs, we may use 02887 // it. (Check this after setting IVSrc which is used below.) 02888 // 02889 // Note that if Head.IncExpr is wider than IVSrc, then this phi is too 02890 // narrow for the chain, so we can no longer use it. We do allow using a 02891 // wider phi, assuming the LSR checked for free truncation. In that case we 02892 // should already have a truncate on this operand such that 02893 // getSCEV(IVSrc) == IncExpr. 02894 if (SE.getSCEV(*IVOpIter) == Head.IncExpr 02895 || SE.getSCEV(IVSrc) == Head.IncExpr) { 02896 break; 02897 } 02898 IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE); 02899 } 02900 if (IVOpIter == IVOpEnd) { 02901 // Gracefully give up on this chain. 
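    // None of the head's operands computes Head.IncExpr any more (LSR's
    // rewrites may have replaced it), so the chain cannot be regenerated.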
02902 DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); 02903 return; 02904 } 02905 02906 DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); 02907 Type *IVTy = IVSrc->getType(); 02908 Type *IntTy = SE.getEffectiveSCEVType(IVTy); 02909 const SCEV *LeftOverExpr = nullptr; 02910 for (IVChain::const_iterator IncI = Chain.begin(), 02911 IncE = Chain.end(); IncI != IncE; ++IncI) { 02912 02913 Instruction *InsertPt = IncI->UserInst; 02914 if (isa<PHINode>(InsertPt)) 02915 InsertPt = L->getLoopLatch()->getTerminator(); 02916 02917 // IVOper will replace the current IV User's operand. IVSrc is the IV 02918 // value currently held in a register. 02919 Value *IVOper = IVSrc; 02920 if (!IncI->IncExpr->isZero()) { 02921 // IncExpr was the result of subtraction of two narrow values, so must 02922 // be signed. 02923 const SCEV *IncExpr = SE.getNoopOrSignExtend(IncI->IncExpr, IntTy); 02924 LeftOverExpr = LeftOverExpr ? 02925 SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr; 02926 } 02927 if (LeftOverExpr && !LeftOverExpr->isZero()) { 02928 // Expand the IV increment. 02929 Rewriter.clearPostInc(); 02930 Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt); 02931 const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc), 02932 SE.getUnknown(IncV)); 02933 IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt); 02934 02935 // If an IV increment can't be folded, use it as the next IV value. 02936 if (!canFoldIVIncExpr(LeftOverExpr, IncI->UserInst, IncI->IVOperand, 02937 TTI)) { 02938 assert(IVTy == IVOper->getType() && "inconsistent IV increment type"); 02939 IVSrc = IVOper; 02940 LeftOverExpr = nullptr; 02941 } 02942 } 02943 Type *OperTy = IncI->IVOperand->getType(); 02944 if (IVTy != OperTy) { 02945 assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) && 02946 "cannot extend a chained IV"); 02947 IRBuilder<> Builder(InsertPt); 02948 IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain"); 02949 } 02950 IncI->UserInst->replaceUsesOfWith(IncI->IVOperand, IVOper); 02951 DeadInsts.push_back(IncI->IVOperand); 02952 } 02953 // If LSR created a new, wider phi, we may also replace its postinc. We only 02954 // do this if we also found a wide value for the head of the chain. 02955 if (isa<PHINode>(Chain.tailUserInst())) { 02956 for (BasicBlock::iterator I = L->getHeader()->begin(); 02957 PHINode *Phi = dyn_cast<PHINode>(I); ++I) { 02958 if (!isCompatibleIVType(Phi, IVSrc)) 02959 continue; 02960 Instruction *PostIncV = dyn_cast<Instruction>( 02961 Phi->getIncomingValueForBlock(L->getLoopLatch())); 02962 if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc))) 02963 continue; 02964 Value *IVOper = IVSrc; 02965 Type *PostIncTy = PostIncV->getType(); 02966 if (IVTy != PostIncTy) { 02967 assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types"); 02968 IRBuilder<> Builder(L->getLoopLatch()->getTerminator()); 02969 Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc()); 02970 IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain"); 02971 } 02972 Phi->replaceUsesOfWith(PostIncV, IVOper); 02973 DeadInsts.push_back(PostIncV); 02974 } 02975 } 02976 } 02977 02978 void LSRInstance::CollectFixupsAndInitialFormulae() { 02979 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 02980 Instruction *UserInst = UI->getUser(); 02981 // Skip IV users that are part of profitable IV Chains. 
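    // FinalizeChain recorded the exact operand uses that belong to a chain in
    // IVIncSet; those operands are rewritten later by GenerateIVChain rather
    // than through the normal fixup/formula machinery.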
02982 User::op_iterator UseI = std::find(UserInst->op_begin(), UserInst->op_end(), 02983 UI->getOperandValToReplace()); 02984 assert(UseI != UserInst->op_end() && "cannot find IV operand"); 02985 if (IVIncSet.count(UseI)) 02986 continue; 02987 02988 // Record the uses. 02989 LSRFixup &LF = getNewFixup(); 02990 LF.UserInst = UserInst; 02991 LF.OperandValToReplace = UI->getOperandValToReplace(); 02992 LF.PostIncLoops = UI->getPostIncLoops(); 02993 02994 LSRUse::KindType Kind = LSRUse::Basic; 02995 Type *AccessTy = nullptr; 02996 if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) { 02997 Kind = LSRUse::Address; 02998 AccessTy = getAccessType(LF.UserInst); 02999 } 03000 03001 const SCEV *S = IU.getExpr(*UI); 03002 03003 // Equality (== and !=) ICmps are special. We can rewrite (i == N) as 03004 // (N - i == 0), and this allows (N - i) to be the expression that we work 03005 // with rather than just N or i, so we can consider the register 03006 // requirements for both N and i at the same time. Limiting this code to 03007 // equality icmps is not a problem because all interesting loops use 03008 // equality icmps, thanks to IndVarSimplify. 03009 if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst)) 03010 if (CI->isEquality()) { 03011 // Swap the operands if needed to put the OperandValToReplace on the 03012 // left, for consistency. 03013 Value *NV = CI->getOperand(1); 03014 if (NV == LF.OperandValToReplace) { 03015 CI->setOperand(1, CI->getOperand(0)); 03016 CI->setOperand(0, NV); 03017 NV = CI->getOperand(1); 03018 Changed = true; 03019 } 03020 03021 // x == y --> x - y == 0 03022 const SCEV *N = SE.getSCEV(NV); 03023 if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { 03024 // S is normalized, so normalize N before folding it into S 03025 // to keep the result normalized. 03026 N = TransformForPostIncUse(Normalize, N, CI, nullptr, 03027 LF.PostIncLoops, SE, DT); 03028 Kind = LSRUse::ICmpZero; 03029 S = SE.getMinusSCEV(N, S); 03030 } 03031 03032 // -1 and the negations of all interesting strides (except the negation 03033 // of -1) are now also interesting. 03034 for (size_t i = 0, e = Factors.size(); i != e; ++i) 03035 if (Factors[i] != -1) 03036 Factors.insert(-(uint64_t)Factors[i]); 03037 Factors.insert(-1); 03038 } 03039 03040 // Set up the initial formula for this use. 03041 std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); 03042 LF.LUIdx = P.first; 03043 LF.Offset = P.second; 03044 LSRUse &LU = Uses[LF.LUIdx]; 03045 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 03046 if (!LU.WidestFixupType || 03047 SE.getTypeSizeInBits(LU.WidestFixupType) < 03048 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 03049 LU.WidestFixupType = LF.OperandValToReplace->getType(); 03050 03051 // If this is the first use of this LSRUse, give it a formula. 03052 if (LU.Formulae.empty()) { 03053 InsertInitialFormula(S, LU, LF.LUIdx); 03054 CountRegisters(LU.Formulae.back(), LF.LUIdx); 03055 } 03056 } 03057 03058 DEBUG(print_fixups(dbgs())); 03059 } 03060 03061 /// InsertInitialFormula - Insert a formula for the given expression into 03062 /// the given use, separating out loop-variant portions from loop-invariant 03063 /// and loop-computable portions. 03064 void 03065 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { 03066 // Mark uses whose expressions cannot be expanded. 
03067 if (!isSafeToExpand(S, SE)) 03068 LU.RigidFormula = true; 03069 03070 Formula F; 03071 F.InitialMatch(S, L, SE); 03072 bool Inserted = InsertFormula(LU, LUIdx, F); 03073 assert(Inserted && "Initial formula already exists!"); (void)Inserted; 03074 } 03075 03076 /// InsertSupplementalFormula - Insert a simple single-register formula for 03077 /// the given expression into the given use. 03078 void 03079 LSRInstance::InsertSupplementalFormula(const SCEV *S, 03080 LSRUse &LU, size_t LUIdx) { 03081 Formula F; 03082 F.BaseRegs.push_back(S); 03083 F.HasBaseReg = true; 03084 bool Inserted = InsertFormula(LU, LUIdx, F); 03085 assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; 03086 } 03087 03088 /// CountRegisters - Note which registers are used by the given formula, 03089 /// updating RegUses. 03090 void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { 03091 if (F.ScaledReg) 03092 RegUses.CountRegister(F.ScaledReg, LUIdx); 03093 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 03094 E = F.BaseRegs.end(); I != E; ++I) 03095 RegUses.CountRegister(*I, LUIdx); 03096 } 03097 03098 /// InsertFormula - If the given formula has not yet been inserted, add it to 03099 /// the list, and return true. Return false otherwise. 03100 bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { 03101 // Do not insert formula that we will not be able to expand. 03102 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && 03103 "Formula is illegal"); 03104 if (!LU.InsertFormula(F)) 03105 return false; 03106 03107 CountRegisters(F, LUIdx); 03108 return true; 03109 } 03110 03111 /// CollectLoopInvariantFixupsAndFormulae - Check for other uses of 03112 /// loop-invariant values which we're tracking. These other uses will pin these 03113 /// values in registers, making them less profitable for elimination. 03114 /// TODO: This currently misses non-constant addrec step registers. 03115 /// TODO: Should this give more weight to users inside the loop? 03116 void 03117 LSRInstance::CollectLoopInvariantFixupsAndFormulae() { 03118 SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end()); 03119 SmallPtrSet<const SCEV *, 8> Inserted; 03120 03121 while (!Worklist.empty()) { 03122 const SCEV *S = Worklist.pop_back_val(); 03123 03124 if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) 03125 Worklist.append(N->op_begin(), N->op_end()); 03126 else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) 03127 Worklist.push_back(C->getOperand()); 03128 else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) { 03129 Worklist.push_back(D->getLHS()); 03130 Worklist.push_back(D->getRHS()); 03131 } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) { 03132 if (!Inserted.insert(US)) continue; 03133 const Value *V = US->getValue(); 03134 if (const Instruction *Inst = dyn_cast<Instruction>(V)) { 03135 // Look for instructions defined outside the loop. 03136 if (L->contains(Inst)) continue; 03137 } else if (isa<UndefValue>(V)) 03138 // Undef doesn't have a live range, so it doesn't matter. 03139 continue; 03140 for (const Use &U : V->uses()) { 03141 const Instruction *UserInst = dyn_cast<Instruction>(U.getUser()); 03142 // Ignore non-instructions. 03143 if (!UserInst) 03144 continue; 03145 // Ignore instructions in other functions (as can happen with 03146 // Constants). 
03147 if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) 03148 continue; 03149 // Ignore instructions not dominated by the loop. 03150 const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? 03151 UserInst->getParent() : 03152 cast<PHINode>(UserInst)->getIncomingBlock( 03153 PHINode::getIncomingValueNumForOperand(U.getOperandNo())); 03154 if (!DT.dominates(L->getHeader(), UseBB)) 03155 continue; 03156 // Ignore uses which are part of other SCEV expressions, to avoid 03157 // analyzing them multiple times. 03158 if (SE.isSCEVable(UserInst->getType())) { 03159 const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst)); 03160 // If the user is a no-op, look through to its uses. 03161 if (!isa<SCEVUnknown>(UserS)) 03162 continue; 03163 if (UserS == US) { 03164 Worklist.push_back( 03165 SE.getUnknown(const_cast<Instruction *>(UserInst))); 03166 continue; 03167 } 03168 } 03169 // Ignore icmp instructions which are already being analyzed. 03170 if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { 03171 unsigned OtherIdx = !U.getOperandNo(); 03172 Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); 03173 if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) 03174 continue; 03175 } 03176 03177 LSRFixup &LF = getNewFixup(); 03178 LF.UserInst = const_cast<Instruction *>(UserInst); 03179 LF.OperandValToReplace = U; 03180 std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr); 03181 LF.LUIdx = P.first; 03182 LF.Offset = P.second; 03183 LSRUse &LU = Uses[LF.LUIdx]; 03184 LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L); 03185 if (!LU.WidestFixupType || 03186 SE.getTypeSizeInBits(LU.WidestFixupType) < 03187 SE.getTypeSizeInBits(LF.OperandValToReplace->getType())) 03188 LU.WidestFixupType = LF.OperandValToReplace->getType(); 03189 InsertSupplementalFormula(US, LU, LF.LUIdx); 03190 CountRegisters(LU.Formulae.back(), Uses.size() - 1); 03191 break; 03192 } 03193 } 03194 } 03195 } 03196 03197 /// CollectSubexprs - Split S into subexpressions which can be pulled out into 03198 /// separate registers. If C is non-null, multiply each subexpression by C. 03199 /// 03200 /// Return remainder expression after factoring the subexpressions captured by 03201 /// Ops. If Ops is complete, return NULL. 03202 static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, 03203 SmallVectorImpl<const SCEV *> &Ops, 03204 const Loop *L, 03205 ScalarEvolution &SE, 03206 unsigned Depth = 0) { 03207 // Arbitrarily cap recursion to protect compile time. 03208 if (Depth >= 3) 03209 return S; 03210 03211 if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { 03212 // Break out add operands. 03213 for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); 03214 I != E; ++I) { 03215 const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); 03216 if (Remainder) 03217 Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); 03218 } 03219 return nullptr; 03220 } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { 03221 // Split a non-zero base out of an addrec. 03222 if (AR->getStart()->isZero()) 03223 return S; 03224 03225 const SCEV *Remainder = CollectSubexprs(AR->getStart(), 03226 C, Ops, L, SE, Depth+1); 03227 // Split the non-zero AddRec unless it is part of a nested recurrence that 03228 // does not pertain to this loop. 03229 if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { 03230 Ops.push_back(C ? 
SE.getMulExpr(C, Remainder) : Remainder); 03231 Remainder = nullptr; 03232 } 03233 if (Remainder != AR->getStart()) { 03234 if (!Remainder) 03235 Remainder = SE.getConstant(AR->getType(), 0); 03236 return SE.getAddRecExpr(Remainder, 03237 AR->getStepRecurrence(SE), 03238 AR->getLoop(), 03239 //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) 03240 SCEV::FlagAnyWrap); 03241 } 03242 } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { 03243 // Break (C * (a + b + c)) into C*a + C*b + C*c. 03244 if (Mul->getNumOperands() != 2) 03245 return S; 03246 if (const SCEVConstant *Op0 = 03247 dyn_cast<SCEVConstant>(Mul->getOperand(0))) { 03248 C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; 03249 const SCEV *Remainder = 03250 CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); 03251 if (Remainder) 03252 Ops.push_back(SE.getMulExpr(C, Remainder)); 03253 return nullptr; 03254 } 03255 } 03256 return S; 03257 } 03258 03259 /// \brief Helper function for LSRInstance::GenerateReassociations. 03260 void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, 03261 const Formula &Base, 03262 unsigned Depth, size_t Idx, 03263 bool IsScaledReg) { 03264 const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 03265 SmallVector<const SCEV *, 8> AddOps; 03266 const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE); 03267 if (Remainder) 03268 AddOps.push_back(Remainder); 03269 03270 if (AddOps.size() == 1) 03271 return; 03272 03273 for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(), 03274 JE = AddOps.end(); 03275 J != JE; ++J) { 03276 03277 // Loop-variant "unknown" values are uninteresting; we won't be able to 03278 // do anything meaningful with them. 03279 if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) 03280 continue; 03281 03282 // Don't pull a constant into a register if the constant could be folded 03283 // into an immediate field. 03284 if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 03285 LU.AccessTy, *J, Base.getNumRegs() > 1)) 03286 continue; 03287 03288 // Collect all operands except *J. 03289 SmallVector<const SCEV *, 8> InnerAddOps( 03290 ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J); 03291 InnerAddOps.append(std::next(J), 03292 ((const SmallVector<const SCEV *, 8> &)AddOps).end()); 03293 03294 // Don't leave just a constant behind in a register if the constant could 03295 // be folded into an immediate field. 03296 if (InnerAddOps.size() == 1 && 03297 isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind, 03298 LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1)) 03299 continue; 03300 03301 const SCEV *InnerSum = SE.getAddExpr(InnerAddOps); 03302 if (InnerSum->isZero()) 03303 continue; 03304 Formula F = Base; 03305 03306 // Add the remaining pieces of the add back into the new formula. 03307 const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum); 03308 if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && 03309 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + 03310 InnerSumSC->getValue()->getZExtValue())) { 03311 F.UnfoldedOffset = 03312 (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); 03313 if (IsScaledReg) 03314 F.ScaledReg = nullptr; 03315 else 03316 F.BaseRegs.erase(F.BaseRegs.begin() + Idx); 03317 } else if (IsScaledReg) 03318 F.ScaledReg = InnerSum; 03319 else 03320 F.BaseRegs[Idx] = InnerSum; 03321 03322 // Add J as its own register, or an unfolded immediate. 
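// For illustration (hypothetical values, not from any particular loop): if the
// original base register was the SCEV (A + B + 4) and *J is the constant 4,
// then InnerSum is (A + B), which replaced the original register above, and
// the 4 either folds into UnfoldedOffset (when TTI.isLegalAddImmediate allows
// it) or becomes its own base register below.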
03323 const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J); 03324 if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && 03325 TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + 03326 SC->getValue()->getZExtValue())) 03327 F.UnfoldedOffset = 03328 (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); 03329 else 03330 F.BaseRegs.push_back(*J); 03331 // We may have changed the number of register in base regs, adjust the 03332 // formula accordingly. 03333 F.Canonicalize(); 03334 03335 if (InsertFormula(LU, LUIdx, F)) 03336 // If that formula hadn't been seen before, recurse to find more like 03337 // it. 03338 GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth + 1); 03339 } 03340 } 03341 03342 /// GenerateReassociations - Split out subexpressions from adds and the bases of 03343 /// addrecs. 03344 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, 03345 Formula Base, unsigned Depth) { 03346 assert(Base.isCanonical() && "Input must be in the canonical form"); 03347 // Arbitrarily cap recursion to protect compile time. 03348 if (Depth >= 3) 03349 return; 03350 03351 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 03352 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i); 03353 03354 if (Base.Scale == 1) 03355 GenerateReassociationsImpl(LU, LUIdx, Base, Depth, 03356 /* Idx */ -1, /* IsScaledReg */ true); 03357 } 03358 03359 /// GenerateCombinations - Generate a formula consisting of all of the 03360 /// loop-dominating registers added into a single register. 03361 void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, 03362 Formula Base) { 03363 // This method is only interesting on a plurality of registers. 03364 if (Base.BaseRegs.size() + (Base.Scale == 1) <= 1) 03365 return; 03366 03367 // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before 03368 // processing the formula. 03369 Base.Unscale(); 03370 Formula F = Base; 03371 F.BaseRegs.clear(); 03372 SmallVector<const SCEV *, 4> Ops; 03373 for (SmallVectorImpl<const SCEV *>::const_iterator 03374 I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { 03375 const SCEV *BaseReg = *I; 03376 if (SE.properlyDominates(BaseReg, L->getHeader()) && 03377 !SE.hasComputableLoopEvolution(BaseReg, L)) 03378 Ops.push_back(BaseReg); 03379 else 03380 F.BaseRegs.push_back(BaseReg); 03381 } 03382 if (Ops.size() > 1) { 03383 const SCEV *Sum = SE.getAddExpr(Ops); 03384 // TODO: If Sum is zero, it probably means ScalarEvolution missed an 03385 // opportunity to fold something. For now, just ignore such cases 03386 // rather than proceed with zero in a register. 03387 if (!Sum->isZero()) { 03388 F.BaseRegs.push_back(Sum); 03389 F.Canonicalize(); 03390 (void)InsertFormula(LU, LUIdx, F); 03391 } 03392 } 03393 } 03394 03395 /// \brief Helper function for LSRInstance::GenerateSymbolicOffsets. 03396 void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx, 03397 const Formula &Base, size_t Idx, 03398 bool IsScaledReg) { 03399 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 03400 GlobalValue *GV = ExtractSymbol(G, SE); 03401 if (G->isZero() || !GV) 03402 return; 03403 Formula F = Base; 03404 F.BaseGV = GV; 03405 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 03406 return; 03407 if (IsScaledReg) 03408 F.ScaledReg = G; 03409 else 03410 F.BaseRegs[Idx] = G; 03411 (void)InsertFormula(LU, LUIdx, F); 03412 } 03413 03414 /// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets. 
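/// For example (illustrative), a base register such as {@g,+,4}, a global plus
/// a strided index, can be split so that @g moves into the formula's BaseGV
/// field while {0,+,4} remains in the register, letting targets with
/// register+symbol addressing fold the global into the address.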
03415 void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, 03416 Formula Base) { 03417 // We can't add a symbolic offset if the address already contains one. 03418 if (Base.BaseGV) return; 03419 03420 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 03421 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i); 03422 if (Base.Scale == 1) 03423 GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1, 03424 /* IsScaledReg */ true); 03425 } 03426 03427 /// \brief Helper function for LSRInstance::GenerateConstantOffsets. 03428 void LSRInstance::GenerateConstantOffsetsImpl( 03429 LSRUse &LU, unsigned LUIdx, const Formula &Base, 03430 const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) { 03431 const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx]; 03432 for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(), 03433 E = Worklist.end(); 03434 I != E; ++I) { 03435 Formula F = Base; 03436 F.BaseOffset = (uint64_t)Base.BaseOffset - *I; 03437 if (isLegalUse(TTI, LU.MinOffset - *I, LU.MaxOffset - *I, LU.Kind, 03438 LU.AccessTy, F)) { 03439 // Add the offset to the base register. 03440 const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), *I), G); 03441 // If it cancelled out, drop the base register, otherwise update it. 03442 if (NewG->isZero()) { 03443 if (IsScaledReg) { 03444 F.Scale = 0; 03445 F.ScaledReg = nullptr; 03446 } else 03447 F.DeleteBaseReg(F.BaseRegs[Idx]); 03448 F.Canonicalize(); 03449 } else if (IsScaledReg) 03450 F.ScaledReg = NewG; 03451 else 03452 F.BaseRegs[Idx] = NewG; 03453 03454 (void)InsertFormula(LU, LUIdx, F); 03455 } 03456 } 03457 03458 int64_t Imm = ExtractImmediate(G, SE); 03459 if (G->isZero() || Imm == 0) 03460 return; 03461 Formula F = Base; 03462 F.BaseOffset = (uint64_t)F.BaseOffset + Imm; 03463 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) 03464 return; 03465 if (IsScaledReg) 03466 F.ScaledReg = G; 03467 else 03468 F.BaseRegs[Idx] = G; 03469 (void)InsertFormula(LU, LUIdx, F); 03470 } 03471 03472 /// GenerateConstantOffsets - Generate reuse formulae using constant offsets. 03473 void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, 03474 Formula Base) { 03475 // TODO: For now, just add the min and max offset, because it usually isn't 03476 // worthwhile looking at everything in between. 03477 SmallVector<int64_t, 2> Worklist; 03478 Worklist.push_back(LU.MinOffset); 03479 if (LU.MaxOffset != LU.MinOffset) 03480 Worklist.push_back(LU.MaxOffset); 03481 03482 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 03483 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i); 03484 if (Base.Scale == 1) 03485 GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1, 03486 /* IsScaledReg */ true); 03487 } 03488 03489 /// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up 03490 /// the comparison. For example, x == y -> x*c == y*c. 03491 void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, 03492 Formula Base) { 03493 if (LU.Kind != LSRUse::ICmpZero) return; 03494 03495 // Determine the integer type for the base formula. 03496 Type *IntTy = Base.getType(); 03497 if (!IntTy) return; 03498 if (SE.getTypeSizeInBits(IntTy) > 64) return; 03499 03500 // Don't do this if there is more than one offset. 03501 if (LU.MinOffset != LU.MaxOffset) return; 03502 03503 assert(!Base.BaseGV && "ICmpZero use is not legal!"); 03504 03505 // Check each interesting stride.
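// Worked example (hypothetical): for a use representing (i + 1 == 0) and an
// interesting stride of 4, the loop below tries the formula for
// (4*i + 4 == 0), which can be profitable when 4*i is already available in a
// register for an address computation elsewhere in the loop.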
03506 for (SmallSetVector<int64_t, 8>::const_iterator 03507 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 03508 int64_t Factor = *I; 03509 03510 // Check that the multiplication doesn't overflow. 03511 if (Base.BaseOffset == INT64_MIN && Factor == -1) 03512 continue; 03513 int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; 03514 if (NewBaseOffset / Factor != Base.BaseOffset) 03515 continue; 03516 // If the offset will be truncated at this use, check that it is in bounds. 03517 if (!IntTy->isPointerTy() && 03518 !ConstantInt::isValueValidForType(IntTy, NewBaseOffset)) 03519 continue; 03520 03521 // Check that multiplying with the use offset doesn't overflow. 03522 int64_t Offset = LU.MinOffset; 03523 if (Offset == INT64_MIN && Factor == -1) 03524 continue; 03525 Offset = (uint64_t)Offset * Factor; 03526 if (Offset / Factor != LU.MinOffset) 03527 continue; 03528 // If the offset will be truncated at this use, check that it is in bounds. 03529 if (!IntTy->isPointerTy() && 03530 !ConstantInt::isValueValidForType(IntTy, Offset)) 03531 continue; 03532 03533 Formula F = Base; 03534 F.BaseOffset = NewBaseOffset; 03535 03536 // Check that this scale is legal. 03537 if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F)) 03538 continue; 03539 03540 // Compensate for the use having MinOffset built into it. 03541 F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; 03542 03543 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 03544 03545 // Check that multiplying with each base register doesn't overflow. 03546 for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { 03547 F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); 03548 if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) 03549 goto next; 03550 } 03551 03552 // Check that multiplying with the scaled register doesn't overflow. 03553 if (F.ScaledReg) { 03554 F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); 03555 if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) 03556 continue; 03557 } 03558 03559 // Check that multiplying with the unfolded offset doesn't overflow. 03560 if (F.UnfoldedOffset != 0) { 03561 if (F.UnfoldedOffset == INT64_MIN && Factor == -1) 03562 continue; 03563 F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; 03564 if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) 03565 continue; 03566 // If the offset will be truncated, check that it is in bounds. 03567 if (!IntTy->isPointerTy() && 03568 !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset)) 03569 continue; 03570 } 03571 03572 // If we make it here and it's legal, add it. 03573 (void)InsertFormula(LU, LUIdx, F); 03574 next:; 03575 } 03576 } 03577 03578 /// GenerateScales - Generate stride factor reuse formulae by making use of 03579 /// scaled-offset address modes, for example. 03580 void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { 03581 // Determine the integer type for the base formula. 03582 Type *IntTy = Base.getType(); 03583 if (!IntTy) return; 03584 03585 // If this Formula already has a scaled register, we can't add another one. 03586 // Try to unscale the formula to generate a better scale. 03587 if (Base.Scale != 0 && !Base.Unscale()) 03588 return; 03589 03590 assert(Base.Scale == 0 && "Unscale did not did its job!"); 03591 03592 // Check each interesting stride. 
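// For example (illustrative): with Factor = 4, a base register whose SCEV is
// {0,+,4} divides evenly to {0,+,1}, producing a formula with
// ScaledReg = {0,+,1} and Scale = 4, i.e. the scaled-index form this function
// is trying to expose.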
03593 for (SmallSetVector<int64_t, 8>::const_iterator 03594 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 03595 int64_t Factor = *I; 03596 03597 Base.Scale = Factor; 03598 Base.HasBaseReg = Base.BaseRegs.size() > 1; 03599 // Check whether this scale is going to be legal. 03600 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 03601 Base)) { 03602 // As a special-case, handle special out-of-loop Basic users specially. 03603 // TODO: Reconsider this special case. 03604 if (LU.Kind == LSRUse::Basic && 03605 isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special, 03606 LU.AccessTy, Base) && 03607 LU.AllFixupsOutsideLoop) 03608 LU.Kind = LSRUse::Special; 03609 else 03610 continue; 03611 } 03612 // For an ICmpZero, negating a solitary base register won't lead to 03613 // new solutions. 03614 if (LU.Kind == LSRUse::ICmpZero && 03615 !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) 03616 continue; 03617 // For each addrec base reg, apply the scale, if possible. 03618 for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) 03619 if (const SCEVAddRecExpr *AR = 03620 dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { 03621 const SCEV *FactorS = SE.getConstant(IntTy, Factor); 03622 if (FactorS->isZero()) 03623 continue; 03624 // Divide out the factor, ignoring high bits, since we'll be 03625 // scaling the value back up in the end. 03626 if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) { 03627 // TODO: This could be optimized to avoid all the copying. 03628 Formula F = Base; 03629 F.ScaledReg = Quotient; 03630 F.DeleteBaseReg(F.BaseRegs[i]); 03631 // The canonical representation of 1*reg is reg, which is already in 03632 // Base. In that case, do not try to insert the formula, it will be 03633 // rejected anyway. 03634 if (F.Scale == 1 && F.BaseRegs.empty()) 03635 continue; 03636 (void)InsertFormula(LU, LUIdx, F); 03637 } 03638 } 03639 } 03640 } 03641 03642 /// GenerateTruncates - Generate reuse formulae from different IV types. 03643 void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { 03644 // Don't bother truncating symbolic values. 03645 if (Base.BaseGV) return; 03646 03647 // Determine the integer type for the base formula. 03648 Type *DstTy = Base.getType(); 03649 if (!DstTy) return; 03650 DstTy = SE.getEffectiveSCEVType(DstTy); 03651 03652 for (SmallSetVector<Type *, 4>::const_iterator 03653 I = Types.begin(), E = Types.end(); I != E; ++I) { 03654 Type *SrcTy = *I; 03655 if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) { 03656 Formula F = Base; 03657 03658 if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I); 03659 for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), 03660 JE = F.BaseRegs.end(); J != JE; ++J) 03661 *J = SE.getAnyExtendExpr(*J, SrcTy); 03662 03663 // TODO: This assumes we've done basic processing on all uses and 03664 // have an idea what the register usage is. 03665 if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) 03666 continue; 03667 03668 (void)InsertFormula(LU, LUIdx, F); 03669 } 03670 } 03671 } 03672 03673 namespace { 03674 03675 /// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to 03676 /// defer modifications so that the search phase doesn't have to worry about 03677 /// the data structures moving underneath it. 
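/// For example (hypothetical), WorkItem(2, 8, A+8) records that formulae in
/// use 2 which reference the register (A+8) can instead reference A with an
/// extra immediate offset of 8, reusing a register that other uses already
/// need.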
03678 struct WorkItem { 03679 size_t LUIdx; 03680 int64_t Imm; 03681 const SCEV *OrigReg; 03682 03683 WorkItem(size_t LI, int64_t I, const SCEV *R) 03684 : LUIdx(LI), Imm(I), OrigReg(R) {} 03685 03686 void print(raw_ostream &OS) const; 03687 void dump() const; 03688 }; 03689 03690 } 03691 03692 void WorkItem::print(raw_ostream &OS) const { 03693 OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx 03694 << " , add offset " << Imm; 03695 } 03696 03697 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 03698 void WorkItem::dump() const { 03699 print(errs()); errs() << '\n'; 03700 } 03701 #endif 03702 03703 /// GenerateCrossUseConstantOffsets - Look for registers which are a constant 03704 /// distance apart and try to form reuse opportunities between them. 03705 void LSRInstance::GenerateCrossUseConstantOffsets() { 03706 // Group the registers by their value without any added constant offset. 03707 typedef std::map<int64_t, const SCEV *> ImmMapTy; 03708 typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; 03709 RegMapTy Map; 03710 DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap; 03711 SmallVector<const SCEV *, 8> Sequence; 03712 for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); 03713 I != E; ++I) { 03714 const SCEV *Reg = *I; 03715 int64_t Imm = ExtractImmediate(Reg, SE); 03716 std::pair<RegMapTy::iterator, bool> Pair = 03717 Map.insert(std::make_pair(Reg, ImmMapTy())); 03718 if (Pair.second) 03719 Sequence.push_back(Reg); 03720 Pair.first->second.insert(std::make_pair(Imm, *I)); 03721 UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I); 03722 } 03723 03724 // Now examine each set of registers with the same base value. Build up 03725 // a list of work to do and do the work in a separate step so that we're 03726 // not adding formulae and register counts while we're searching. 03727 SmallVector<WorkItem, 32> WorkItems; 03728 SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems; 03729 for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), 03730 E = Sequence.end(); I != E; ++I) { 03731 const SCEV *Reg = *I; 03732 const ImmMapTy &Imms = Map.find(Reg)->second; 03733 03734 // It's not worthwhile looking for reuse if there's only one offset. 03735 if (Imms.size() == 1) 03736 continue; 03737 03738 DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':'; 03739 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); 03740 J != JE; ++J) 03741 dbgs() << ' ' << J->first; 03742 dbgs() << '\n'); 03743 03744 // Examine each offset. 03745 for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); 03746 J != JE; ++J) { 03747 const SCEV *OrigReg = J->second; 03748 03749 int64_t JImm = J->first; 03750 const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); 03751 03752 if (!isa<SCEVConstant>(OrigReg) && 03753 UsedByIndicesMap[Reg].count() == 1) { 03754 DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n'); 03755 continue; 03756 } 03757 03758 // Conservatively examine offsets between this orig reg a few selected 03759 // other orig regs. 03760 ImmMapTy::const_iterator OtherImms[] = { 03761 Imms.begin(), std::prev(Imms.end()), 03762 Imms.lower_bound((Imms.begin()->first + std::prev(Imms.end())->first) / 03763 2) 03764 }; 03765 for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) { 03766 ImmMapTy::const_iterator M = OtherImms[i]; 03767 if (M == J || M == JE) continue; 03768 03769 // Compute the difference between the two. 
03770 int64_t Imm = (uint64_t)JImm - M->first; 03771 for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; 03772 LUIdx = UsedByIndices.find_next(LUIdx)) 03773 // Make a memo of this use, offset, and register tuple. 03774 if (UniqueItems.insert(std::make_pair(LUIdx, Imm))) 03775 WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); 03776 } 03777 } 03778 } 03779 03780 Map.clear(); 03781 Sequence.clear(); 03782 UsedByIndicesMap.clear(); 03783 UniqueItems.clear(); 03784 03785 // Now iterate through the worklist and add new formulae. 03786 for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), 03787 E = WorkItems.end(); I != E; ++I) { 03788 const WorkItem &WI = *I; 03789 size_t LUIdx = WI.LUIdx; 03790 LSRUse &LU = Uses[LUIdx]; 03791 int64_t Imm = WI.Imm; 03792 const SCEV *OrigReg = WI.OrigReg; 03793 03794 Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); 03795 const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); 03796 unsigned BitWidth = SE.getTypeSizeInBits(IntTy); 03797 03798 // TODO: Use a more targeted data structure. 03799 for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { 03800 Formula F = LU.Formulae[L]; 03801 // FIXME: The code for the scaled and unscaled registers looks 03802 // very similar but slightly different. Investigate if they 03803 // could be merged. That way, we would not have to unscale the 03804 // Formula. 03805 F.Unscale(); 03806 // Use the immediate in the scaled register. 03807 if (F.ScaledReg == OrigReg) { 03808 int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; 03809 // Don't create 50 + reg(-50). 03810 if (F.referencesReg(SE.getSCEV( 03811 ConstantInt::get(IntTy, -(uint64_t)Offset)))) 03812 continue; 03813 Formula NewF = F; 03814 NewF.BaseOffset = Offset; 03815 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 03816 NewF)) 03817 continue; 03818 NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg); 03819 03820 // If the new scale is a constant in a register, and adding the constant 03821 // value to the immediate would produce a value closer to zero than the 03822 // immediate itself, then the formula isn't worthwhile. 03823 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg)) 03824 if (C->getValue()->isNegative() != 03825 (NewF.BaseOffset < 0) && 03826 (C->getValue()->getValue().abs() * APInt(BitWidth, F.Scale)) 03827 .ule(abs64(NewF.BaseOffset))) 03828 continue; 03829 03830 // OK, looks good. 03831 NewF.Canonicalize(); 03832 (void)InsertFormula(LU, LUIdx, NewF); 03833 } else { 03834 // Use the immediate in a base register. 03835 for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { 03836 const SCEV *BaseReg = F.BaseRegs[N]; 03837 if (BaseReg != OrigReg) 03838 continue; 03839 Formula NewF = F; 03840 NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; 03841 if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, 03842 LU.Kind, LU.AccessTy, NewF)) { 03843 if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) 03844 continue; 03845 NewF = F; 03846 NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; 03847 } 03848 NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); 03849 03850 // If the new formula has a constant in a register, and adding the 03851 // constant value to the immediate would produce a value closer to 03852 // zero than the immediate itself, then the formula isn't worthwhile. 
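// E.g. (illustrative numbers): a BaseOffset of 16 together with a base
// register holding the constant -16 sums to 0, which is closer to zero than
// 16 itself, so nothing was gained by moving the value out of the immediate;
// such formulae are skipped below.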
03853 for (SmallVectorImpl<const SCEV *>::const_iterator 03854 J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end(); 03855 J != JE; ++J) 03856 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J)) 03857 if ((C->getValue()->getValue() + NewF.BaseOffset).abs().slt( 03858 abs64(NewF.BaseOffset)) && 03859 (C->getValue()->getValue() + 03860 NewF.BaseOffset).countTrailingZeros() >= 03861 countTrailingZeros<uint64_t>(NewF.BaseOffset)) 03862 goto skip_formula; 03863 03864 // Ok, looks good. 03865 NewF.Canonicalize(); 03866 (void)InsertFormula(LU, LUIdx, NewF); 03867 break; 03868 skip_formula:; 03869 } 03870 } 03871 } 03872 } 03873 } 03874 03875 /// GenerateAllReuseFormulae - Generate formulae for each use. 03876 void 03877 LSRInstance::GenerateAllReuseFormulae() { 03878 // This is split into multiple loops so that hasRegsUsedByUsesOtherThan 03879 // queries are more precise. 03880 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03881 LSRUse &LU = Uses[LUIdx]; 03882 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03883 GenerateReassociations(LU, LUIdx, LU.Formulae[i]); 03884 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03885 GenerateCombinations(LU, LUIdx, LU.Formulae[i]); 03886 } 03887 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03888 LSRUse &LU = Uses[LUIdx]; 03889 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03890 GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]); 03891 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03892 GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]); 03893 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03894 GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]); 03895 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03896 GenerateScales(LU, LUIdx, LU.Formulae[i]); 03897 } 03898 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03899 LSRUse &LU = Uses[LUIdx]; 03900 for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) 03901 GenerateTruncates(LU, LUIdx, LU.Formulae[i]); 03902 } 03903 03904 GenerateCrossUseConstantOffsets(); 03905 03906 DEBUG(dbgs() << "\n" 03907 "After generating reuse formulae:\n"; 03908 print_uses(dbgs())); 03909 } 03910 03911 /// If there are multiple formulae with the same set of registers used 03912 /// by other uses, pick the best one and delete the others. 03913 void LSRInstance::FilterOutUndesirableDedicatedRegisters() { 03914 DenseSet<const SCEV *> VisitedRegs; 03915 SmallPtrSet<const SCEV *, 16> Regs; 03916 SmallPtrSet<const SCEV *, 16> LoserRegs; 03917 #ifndef NDEBUG 03918 bool ChangedFormulae = false; 03919 #endif 03920 03921 // Collect the best formula for each unique set of shared registers. This 03922 // is reset for each use. 03923 typedef DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo> 03924 BestFormulaeTy; 03925 BestFormulaeTy BestFormulae; 03926 03927 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 03928 LSRUse &LU = Uses[LUIdx]; 03929 DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); 03930 03931 bool Any = false; 03932 for (size_t FIdx = 0, NumForms = LU.Formulae.size(); 03933 FIdx != NumForms; ++FIdx) { 03934 Formula &F = LU.Formulae[FIdx]; 03935 03936 // Some formulas are instant losers. For example, they may depend on 03937 // nonexistent AddRecs from other loops. These need to be filtered 03938 // immediately, otherwise heuristics could choose them over others leading 03939 // to an unsatisfactory solution. 
Passing LoserRegs into RateFormula here 03940 // avoids the need to recompute this information across formulae using the 03941 // same bad AddRec. Passing LoserRegs is also essential unless we remove 03942 // the corresponding bad register from the Regs set. 03943 Cost CostF; 03944 Regs.clear(); 03945 CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, 03946 &LoserRegs); 03947 if (CostF.isLoser()) { 03948 // During initial formula generation, undesirable formulae are generated 03949 // by uses within other loops that have some non-trivial address mode or 03950 // use the postinc form of the IV. LSR needs to provide these formulae 03951 // as the basis of rediscovering the desired formula that uses an AddRec 03952 // corresponding to the existing phi. Once all formulae have been 03953 // generated, these initial losers may be pruned. 03954 DEBUG(dbgs() << " Filtering loser "; F.print(dbgs()); 03955 dbgs() << "\n"); 03956 } 03957 else { 03958 SmallVector<const SCEV *, 4> Key; 03959 for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(), 03960 JE = F.BaseRegs.end(); J != JE; ++J) { 03961 const SCEV *Reg = *J; 03962 if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx)) 03963 Key.push_back(Reg); 03964 } 03965 if (F.ScaledReg && 03966 RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx)) 03967 Key.push_back(F.ScaledReg); 03968 // Unstable sort by host order ok, because this is only used for 03969 // uniquifying. 03970 std::sort(Key.begin(), Key.end()); 03971 03972 std::pair<BestFormulaeTy::const_iterator, bool> P = 03973 BestFormulae.insert(std::make_pair(Key, FIdx)); 03974 if (P.second) 03975 continue; 03976 03977 Formula &Best = LU.Formulae[P.first->second]; 03978 03979 Cost CostBest; 03980 Regs.clear(); 03981 CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, 03982 DT, LU); 03983 if (CostF < CostBest) 03984 std::swap(F, Best); 03985 DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); 03986 dbgs() << "\n" 03987 " in favor of formula "; Best.print(dbgs()); 03988 dbgs() << '\n'); 03989 } 03990 #ifndef NDEBUG 03991 ChangedFormulae = true; 03992 #endif 03993 LU.DeleteFormula(F); 03994 --FIdx; 03995 --NumForms; 03996 Any = true; 03997 } 03998 03999 // Now that we've filtered out some formulae, recompute the Regs set. 04000 if (Any) 04001 LU.RecomputeRegs(LUIdx, RegUses); 04002 04003 // Reset this to prepare for the next use. 04004 BestFormulae.clear(); 04005 } 04006 04007 DEBUG(if (ChangedFormulae) { 04008 dbgs() << "\n" 04009 "After filtering out undesirable candidates:\n"; 04010 print_uses(dbgs()); 04011 }); 04012 } 04013 04014 // This is a rough guess that seems to work fairly well. 04015 static const size_t ComplexityLimit = UINT16_MAX; 04016 04017 /// EstimateSearchSpaceComplexity - Estimate the worst-case number of 04018 /// solutions the solver might have to consider. It almost never considers 04019 /// this many solutions because it prune the search space, but the pruning 04020 /// isn't always sufficient. 
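/// For instance (illustrative numbers), three uses with 4, 5 and 6 candidate
/// formulae give a worst-case product of 4 * 5 * 6 = 120 combinations; the
/// running product below is capped at ComplexityLimit so it never needs to be
/// computed exactly.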
04021 size_t LSRInstance::EstimateSearchSpaceComplexity() const { 04022 size_t Power = 1; 04023 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), 04024 E = Uses.end(); I != E; ++I) { 04025 size_t FSize = I->Formulae.size(); 04026 if (FSize >= ComplexityLimit) { 04027 Power = ComplexityLimit; 04028 break; 04029 } 04030 Power *= FSize; 04031 if (Power >= ComplexityLimit) 04032 break; 04033 } 04034 return Power; 04035 } 04036 04037 /// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset 04038 /// of the registers of another formula, it won't help reduce register 04039 /// pressure (though it may not necessarily hurt register pressure); remove 04040 /// it to simplify the system. 04041 void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { 04042 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 04043 DEBUG(dbgs() << "The search space is too complex.\n"); 04044 04045 DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " 04046 "which use a superset of registers used by other " 04047 "formulae.\n"); 04048 04049 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 04050 LSRUse &LU = Uses[LUIdx]; 04051 bool Any = false; 04052 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 04053 Formula &F = LU.Formulae[i]; 04054 // Look for a formula with a constant or GV in a register. If the use 04055 // also has a formula with that same value in an immediate field, 04056 // delete the one that uses a register. 04057 for (SmallVectorImpl<const SCEV *>::const_iterator 04058 I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { 04059 if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { 04060 Formula NewF = F; 04061 NewF.BaseOffset += C->getValue()->getSExtValue(); 04062 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 04063 (I - F.BaseRegs.begin())); 04064 if (LU.HasFormulaWithSameRegs(NewF)) { 04065 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 04066 LU.DeleteFormula(F); 04067 --i; 04068 --e; 04069 Any = true; 04070 break; 04071 } 04072 } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) { 04073 if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) 04074 if (!F.BaseGV) { 04075 Formula NewF = F; 04076 NewF.BaseGV = GV; 04077 NewF.BaseRegs.erase(NewF.BaseRegs.begin() + 04078 (I - F.BaseRegs.begin())); 04079 if (LU.HasFormulaWithSameRegs(NewF)) { 04080 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 04081 dbgs() << '\n'); 04082 LU.DeleteFormula(F); 04083 --i; 04084 --e; 04085 Any = true; 04086 break; 04087 } 04088 } 04089 } 04090 } 04091 } 04092 if (Any) 04093 LU.RecomputeRegs(LUIdx, RegUses); 04094 } 04095 04096 DEBUG(dbgs() << "After pre-selection:\n"; 04097 print_uses(dbgs())); 04098 } 04099 } 04100 04101 /// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers 04102 /// for expressions like A, A+1, A+2, etc., allocate a single register for 04103 /// them. 04104 void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { 04105 if (EstimateSearchSpaceComplexity() < ComplexityLimit) 04106 return; 04107 04108 DEBUG(dbgs() << "The search space is too complex.\n" 04109 "Narrowing the search space by assuming that uses separated " 04110 "by a constant offset will use the same registers.\n"); 04111 04112 // This is especially useful for unrolled loops. 
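// For example (hypothetical), a loop unrolled by four may leave uses whose
// formulae differ only by the constant offsets 0, 1, 2 and 3 on top of the
// same register A; the loop below merges such a use into the one that already
// has A, turning the offset into a per-fixup adjustment instead of a separate
// register.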
04113 04114 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 04115 LSRUse &LU = Uses[LUIdx]; 04116 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 04117 E = LU.Formulae.end(); I != E; ++I) { 04118 const Formula &F = *I; 04119 if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) 04120 continue; 04121 04122 LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); 04123 if (!LUThatHas) 04124 continue; 04125 04126 if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false, 04127 LU.Kind, LU.AccessTy)) 04128 continue; 04129 04130 DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n'); 04131 04132 LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; 04133 04134 // Update the relocs to reference the new use. 04135 for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), 04136 E = Fixups.end(); I != E; ++I) { 04137 LSRFixup &Fixup = *I; 04138 if (Fixup.LUIdx == LUIdx) { 04139 Fixup.LUIdx = LUThatHas - &Uses.front(); 04140 Fixup.Offset += F.BaseOffset; 04141 // Add the new offset to LUThatHas' offset list. 04142 if (LUThatHas->Offsets.back() != Fixup.Offset) { 04143 LUThatHas->Offsets.push_back(Fixup.Offset); 04144 if (Fixup.Offset > LUThatHas->MaxOffset) 04145 LUThatHas->MaxOffset = Fixup.Offset; 04146 if (Fixup.Offset < LUThatHas->MinOffset) 04147 LUThatHas->MinOffset = Fixup.Offset; 04148 } 04149 DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n'); 04150 } 04151 if (Fixup.LUIdx == NumUses-1) 04152 Fixup.LUIdx = LUIdx; 04153 } 04154 04155 // Delete formulae from the new use which are no longer legal. 04156 bool Any = false; 04157 for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { 04158 Formula &F = LUThatHas->Formulae[i]; 04159 if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset, 04160 LUThatHas->Kind, LUThatHas->AccessTy, F)) { 04161 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); 04162 dbgs() << '\n'); 04163 LUThatHas->DeleteFormula(F); 04164 --i; 04165 --e; 04166 Any = true; 04167 } 04168 } 04169 04170 if (Any) 04171 LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); 04172 04173 // Delete the old use. 04174 DeleteUse(LU, LUIdx); 04175 --LUIdx; 04176 --NumUses; 04177 break; 04178 } 04179 } 04180 04181 DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs())); 04182 } 04183 04184 /// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call 04185 /// FilterOutUndesirableDedicatedRegisters again, if necessary, now that 04186 /// we've done more filtering, as it may be able to find more formulae to 04187 /// eliminate. 04188 void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ 04189 if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 04190 DEBUG(dbgs() << "The search space is too complex.\n"); 04191 04192 DEBUG(dbgs() << "Narrowing the search space by re-filtering out " 04193 "undesirable dedicated registers.\n"); 04194 04195 FilterOutUndesirableDedicatedRegisters(); 04196 04197 DEBUG(dbgs() << "After pre-selection:\n"; 04198 print_uses(dbgs())); 04199 } 04200 } 04201 04202 /// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely 04203 /// to be profitable, and then in any use which has any reference to that 04204 /// register, delete all formulae which do not reference that register. 04205 void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { 04206 // With all other options exhausted, loop until the system is simple 04207 // enough to handle. 
04208 SmallPtrSet<const SCEV *, 4> Taken; 04209 while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { 04210 // Ok, we have too many of formulae on our hands to conveniently handle. 04211 // Use a rough heuristic to thin out the list. 04212 DEBUG(dbgs() << "The search space is too complex.\n"); 04213 04214 // Pick the register which is used by the most LSRUses, which is likely 04215 // to be a good reuse register candidate. 04216 const SCEV *Best = nullptr; 04217 unsigned BestNum = 0; 04218 for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end(); 04219 I != E; ++I) { 04220 const SCEV *Reg = *I; 04221 if (Taken.count(Reg)) 04222 continue; 04223 if (!Best) 04224 Best = Reg; 04225 else { 04226 unsigned Count = RegUses.getUsedByIndices(Reg).count(); 04227 if (Count > BestNum) { 04228 Best = Reg; 04229 BestNum = Count; 04230 } 04231 } 04232 } 04233 04234 DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best 04235 << " will yield profitable reuse.\n"); 04236 Taken.insert(Best); 04237 04238 // In any use with formulae which references this register, delete formulae 04239 // which don't reference it. 04240 for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { 04241 LSRUse &LU = Uses[LUIdx]; 04242 if (!LU.Regs.count(Best)) continue; 04243 04244 bool Any = false; 04245 for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { 04246 Formula &F = LU.Formulae[i]; 04247 if (!F.referencesReg(Best)) { 04248 DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); 04249 LU.DeleteFormula(F); 04250 --e; 04251 --i; 04252 Any = true; 04253 assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?"); 04254 continue; 04255 } 04256 } 04257 04258 if (Any) 04259 LU.RecomputeRegs(LUIdx, RegUses); 04260 } 04261 04262 DEBUG(dbgs() << "After pre-selection:\n"; 04263 print_uses(dbgs())); 04264 } 04265 } 04266 04267 /// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of 04268 /// formulae to choose from, use some rough heuristics to prune down the number 04269 /// of formulae. This keeps the main solver from taking an extraordinary amount 04270 /// of time in some worst-case scenarios. 04271 void LSRInstance::NarrowSearchSpaceUsingHeuristics() { 04272 NarrowSearchSpaceByDetectingSupersets(); 04273 NarrowSearchSpaceByCollapsingUnrolledCode(); 04274 NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); 04275 NarrowSearchSpaceByPickingWinnerRegs(); 04276 } 04277 04278 /// SolveRecurse - This is the recursive solver. 04279 void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, 04280 Cost &SolutionCost, 04281 SmallVectorImpl<const Formula *> &Workspace, 04282 const Cost &CurCost, 04283 const SmallPtrSet<const SCEV *, 16> &CurRegs, 04284 DenseSet<const SCEV *> &VisitedRegs) const { 04285 // Some ideas: 04286 // - prune more: 04287 // - use more aggressive filtering 04288 // - sort the formula so that the most profitable solutions are found first 04289 // - sort the uses too 04290 // - search faster: 04291 // - don't compute a cost, and then compare. compare while computing a cost 04292 // and bail early. 04293 // - track register sets with SmallBitVector 04294 04295 const LSRUse &LU = Uses[Workspace.size()]; 04296 04297 // If this use references any register that's already a part of the 04298 // in-progress solution, consider it a requirement that a formula must 04299 // reference that register in order to be considered. This prunes out 04300 // unprofitable searching. 
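// Sketch of the effect (hypothetical): if some formula of this use could
// reuse a register, say {0,+,4}, that the partial solution already pays for,
// then only formulae mentioning {0,+,4} are explored for this use; formulae
// built purely from other registers would add register pressure without
// enabling reuse, so they are filtered below.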
04301 SmallSetVector<const SCEV *, 4> ReqRegs; 04302 for (const SCEV *S : CurRegs) 04303 if (LU.Regs.count(S)) 04304 ReqRegs.insert(S); 04305 04306 SmallPtrSet<const SCEV *, 16> NewRegs; 04307 Cost NewCost; 04308 for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), 04309 E = LU.Formulae.end(); I != E; ++I) { 04310 const Formula &F = *I; 04311 04312 // Ignore formulae which may not be ideal in terms of register reuse of 04313 // ReqRegs. The formula should use all required registers before 04314 // introducing new ones. 04315 int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size()); 04316 for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(), 04317 JE = ReqRegs.end(); J != JE; ++J) { 04318 const SCEV *Reg = *J; 04319 if ((F.ScaledReg && F.ScaledReg == Reg) || 04320 std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) != 04321 F.BaseRegs.end()) { 04322 --NumReqRegsToFind; 04323 if (NumReqRegsToFind == 0) 04324 break; 04325 } 04326 } 04327 if (NumReqRegsToFind != 0) { 04328 // If none of the formulae satisfied the required registers, then we could 04329 // clear ReqRegs and try again. Currently, we simply give up in this case. 04330 continue; 04331 } 04332 04333 // Evaluate the cost of the current formula. If it's already worse than 04334 // the current best, prune the search at that point. 04335 NewCost = CurCost; 04336 NewRegs = CurRegs; 04337 NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, 04338 LU); 04339 if (NewCost < SolutionCost) { 04340 Workspace.push_back(&F); 04341 if (Workspace.size() != Uses.size()) { 04342 SolveRecurse(Solution, SolutionCost, Workspace, NewCost, 04343 NewRegs, VisitedRegs); 04344 if (F.getNumRegs() == 1 && Workspace.size() == 1) 04345 VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]); 04346 } else { 04347 DEBUG(dbgs() << "New best at "; NewCost.print(dbgs()); 04348 dbgs() << ".\n Regs:"; 04349 for (const SCEV *S : NewRegs) 04350 dbgs() << ' ' << *S; 04351 dbgs() << '\n'); 04352 04353 SolutionCost = NewCost; 04354 Solution = Workspace; 04355 } 04356 Workspace.pop_back(); 04357 } 04358 } 04359 } 04360 04361 /// Solve - Choose one formula from each use. Return the results in the given 04362 /// Solution vector. 04363 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const { 04364 SmallVector<const Formula *, 8> Workspace; 04365 Cost SolutionCost; 04366 SolutionCost.Lose(); 04367 Cost CurCost; 04368 SmallPtrSet<const SCEV *, 16> CurRegs; 04369 DenseSet<const SCEV *> VisitedRegs; 04370 Workspace.reserve(Uses.size()); 04371 04372 // SolveRecurse does all the work. 04373 SolveRecurse(Solution, SolutionCost, Workspace, CurCost, 04374 CurRegs, VisitedRegs); 04375 if (Solution.empty()) { 04376 DEBUG(dbgs() << "\nNo Satisfactory Solution\n"); 04377 return; 04378 } 04379 04380 // Ok, we've now made all our decisions. 04381 DEBUG(dbgs() << "\n" 04382 "The chosen solution requires "; SolutionCost.print(dbgs()); 04383 dbgs() << ":\n"; 04384 for (size_t i = 0, e = Uses.size(); i != e; ++i) { 04385 dbgs() << " "; 04386 Uses[i].print(dbgs()); 04387 dbgs() << "\n" 04388 " "; 04389 Solution[i]->print(dbgs()); 04390 dbgs() << '\n'; 04391 }); 04392 04393 assert(Solution.size() == Uses.size() && "Malformed solution!"); 04394 } 04395 04396 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up 04397 /// the dominator tree far as we can go while still being dominated by the 04398 /// input positions. 
This helps canonicalize the insert position, which 04399 /// encourages sharing. 04400 BasicBlock::iterator 04401 LSRInstance::HoistInsertPosition(BasicBlock::iterator IP, 04402 const SmallVectorImpl<Instruction *> &Inputs) 04403 const { 04404 for (;;) { 04405 const Loop *IPLoop = LI.getLoopFor(IP->getParent()); 04406 unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0; 04407 04408 BasicBlock *IDom; 04409 for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) { 04410 if (!Rung) return IP; 04411 Rung = Rung->getIDom(); 04412 if (!Rung) return IP; 04413 IDom = Rung->getBlock(); 04414 04415 // Don't climb into a loop though. 04416 const Loop *IDomLoop = LI.getLoopFor(IDom); 04417 unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0; 04418 if (IDomDepth <= IPLoopDepth && 04419 (IDomDepth != IPLoopDepth || IDomLoop == IPLoop)) 04420 break; 04421 } 04422 04423 bool AllDominate = true; 04424 Instruction *BetterPos = nullptr; 04425 Instruction *Tentative = IDom->getTerminator(); 04426 for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), 04427 E = Inputs.end(); I != E; ++I) { 04428 Instruction *Inst = *I; 04429 if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { 04430 AllDominate = false; 04431 break; 04432 } 04433 // Attempt to find an insert position in the middle of the block, 04434 // instead of at the end, so that it can be used for other expansions. 04435 if (IDom == Inst->getParent() && 04436 (!BetterPos || !DT.dominates(Inst, BetterPos))) 04437 BetterPos = std::next(BasicBlock::iterator(Inst)); 04438 } 04439 if (!AllDominate) 04440 break; 04441 if (BetterPos) 04442 IP = BetterPos; 04443 else 04444 IP = Tentative; 04445 } 04446 04447 return IP; 04448 } 04449 04450 /// AdjustInsertPositionForExpand - Determine an input position which will be 04451 /// dominated by the operands and which will dominate the result. 04452 BasicBlock::iterator 04453 LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, 04454 const LSRFixup &LF, 04455 const LSRUse &LU, 04456 SCEVExpander &Rewriter) const { 04457 // Collect some instructions which must be dominated by the 04458 // expanding replacement. These must be dominated by any operands that 04459 // will be required in the expansion. 04460 SmallVector<Instruction *, 4> Inputs; 04461 if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace)) 04462 Inputs.push_back(I); 04463 if (LU.Kind == LSRUse::ICmpZero) 04464 if (Instruction *I = 04465 dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1))) 04466 Inputs.push_back(I); 04467 if (LF.PostIncLoops.count(L)) { 04468 if (LF.isUseFullyOutsideLoop(L)) 04469 Inputs.push_back(L->getLoopLatch()->getTerminator()); 04470 else 04471 Inputs.push_back(IVIncInsertPos); 04472 } 04473 // The expansion must also be dominated by the increment positions of any 04474 // loops it for which it is using post-inc mode. 04475 for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(), 04476 E = LF.PostIncLoops.end(); I != E; ++I) { 04477 const Loop *PIL = *I; 04478 if (PIL == L) continue; 04479 04480 // Be dominated by the loop exit. 
04481 SmallVector<BasicBlock *, 4> ExitingBlocks; 04482 PIL->getExitingBlocks(ExitingBlocks); 04483 if (!ExitingBlocks.empty()) { 04484 BasicBlock *BB = ExitingBlocks[0]; 04485 for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i) 04486 BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]); 04487 Inputs.push_back(BB->getTerminator()); 04488 } 04489 } 04490 04491 assert(!isa<PHINode>(LowestIP) && !isa<LandingPadInst>(LowestIP) 04492 && !isa<DbgInfoIntrinsic>(LowestIP) && 04493 "Insertion point must be a normal instruction"); 04494 04495 // Then, climb up the immediate dominator tree as far as we can go while 04496 // still being dominated by the input positions. 04497 BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs); 04498 04499 // Don't insert instructions before PHI nodes. 04500 while (isa<PHINode>(IP)) ++IP; 04501 04502 // Ignore landingpad instructions. 04503 while (isa<LandingPadInst>(IP)) ++IP; 04504 04505 // Ignore debug intrinsics. 04506 while (isa<DbgInfoIntrinsic>(IP)) ++IP; 04507 04508 // Set IP below instructions recently inserted by SCEVExpander. This keeps the 04509 // IP consistent across expansions and allows the previously inserted 04510 // instructions to be reused by subsequent expansion. 04511 while (Rewriter.isInsertedInstruction(IP) && IP != LowestIP) ++IP; 04512 04513 return IP; 04514 } 04515 04516 /// Expand - Emit instructions for the leading candidate expression for this 04517 /// LSRUse (this is called "expanding"). 04518 Value *LSRInstance::Expand(const LSRFixup &LF, 04519 const Formula &F, 04520 BasicBlock::iterator IP, 04521 SCEVExpander &Rewriter, 04522 SmallVectorImpl<WeakVH> &DeadInsts) const { 04523 const LSRUse &LU = Uses[LF.LUIdx]; 04524 if (LU.RigidFormula) 04525 return LF.OperandValToReplace; 04526 04527 // Determine an input position which will be dominated by the operands and 04528 // which will dominate the result. 04529 IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter); 04530 04531 // Inform the Rewriter if we have a post-increment use, so that it can 04532 // perform an advantageous expansion. 04533 Rewriter.setPostInc(LF.PostIncLoops); 04534 04535 // This is the type that the user actually needs. 04536 Type *OpTy = LF.OperandValToReplace->getType(); 04537 // This will be the type that we'll initially expand to. 04538 Type *Ty = F.getType(); 04539 if (!Ty) 04540 // No type known; just expand directly to the ultimate type. 04541 Ty = OpTy; 04542 else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) 04543 // Expand directly to the ultimate type if it's the right size. 04544 Ty = OpTy; 04545 // This is the type to do integer arithmetic in. 04546 Type *IntTy = SE.getEffectiveSCEVType(Ty); 04547 04548 // Build up a list of operands to add together to form the full base. 04549 SmallVector<const SCEV *, 8> Ops; 04550 04551 // Expand the BaseRegs portion. 04552 for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), 04553 E = F.BaseRegs.end(); I != E; ++I) { 04554 const SCEV *Reg = *I; 04555 assert(!Reg->isZero() && "Zero allocated in a base register!"); 04556 04557 // If we're expanding for a post-inc user, make the post-inc adjustment. 04558 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 04559 Reg = TransformForPostIncUse(Denormalize, Reg, 04560 LF.UserInst, LF.OperandValToReplace, 04561 Loops, SE, DT); 04562 04563 Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr, IP))); 04564 } 04565 04566 // Expand the ScaledReg portion. 
04567 Value *ICmpScaledV = nullptr; 04568 if (F.Scale != 0) { 04569 const SCEV *ScaledS = F.ScaledReg; 04570 04571 // If we're expanding for a post-inc user, make the post-inc adjustment. 04572 PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); 04573 ScaledS = TransformForPostIncUse(Denormalize, ScaledS, 04574 LF.UserInst, LF.OperandValToReplace, 04575 Loops, SE, DT); 04576 04577 if (LU.Kind == LSRUse::ICmpZero) { 04578 // Expand ScaleReg as if it was part of the base regs. 04579 if (F.Scale == 1) 04580 Ops.push_back( 04581 SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP))); 04582 else { 04583 // An interesting way of "folding" with an icmp is to use a negated 04584 // scale, which we'll implement by inserting it into the other operand 04585 // of the icmp. 04586 assert(F.Scale == -1 && 04587 "The only scale supported by ICmpZero uses is -1!"); 04588 ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr, IP); 04589 } 04590 } else { 04591 // Otherwise just expand the scaled register and an explicit scale, 04592 // which is expected to be matched as part of the address. 04593 04594 // Flush the operand list to suppress SCEVExpander hoisting address modes. 04595 // Unless the addressing mode will not be folded. 04596 if (!Ops.empty() && LU.Kind == LSRUse::Address && 04597 isAMCompletelyFolded(TTI, LU, F)) { 04598 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04599 Ops.clear(); 04600 Ops.push_back(SE.getUnknown(FullV)); 04601 } 04602 ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr, IP)); 04603 if (F.Scale != 1) 04604 ScaledS = 04605 SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale)); 04606 Ops.push_back(ScaledS); 04607 } 04608 } 04609 04610 // Expand the GV portion. 04611 if (F.BaseGV) { 04612 // Flush the operand list to suppress SCEVExpander hoisting. 04613 if (!Ops.empty()) { 04614 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04615 Ops.clear(); 04616 Ops.push_back(SE.getUnknown(FullV)); 04617 } 04618 Ops.push_back(SE.getUnknown(F.BaseGV)); 04619 } 04620 04621 // Flush the operand list to suppress SCEVExpander hoisting of both folded and 04622 // unfolded offsets. LSR assumes they both live next to their uses. 04623 if (!Ops.empty()) { 04624 Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); 04625 Ops.clear(); 04626 Ops.push_back(SE.getUnknown(FullV)); 04627 } 04628 04629 // Expand the immediate portion. 04630 int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; 04631 if (Offset != 0) { 04632 if (LU.Kind == LSRUse::ICmpZero) { 04633 // The other interesting way of "folding" with an ICmpZero is to use a 04634 // negated immediate. 04635 if (!ICmpScaledV) 04636 ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); 04637 else { 04638 Ops.push_back(SE.getUnknown(ICmpScaledV)); 04639 ICmpScaledV = ConstantInt::get(IntTy, Offset); 04640 } 04641 } else { 04642 // Just add the immediate values. These again are expected to be matched 04643 // as part of the address. 04644 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); 04645 } 04646 } 04647 04648 // Expand the unfolded offset portion. 04649 int64_t UnfoldedOffset = F.UnfoldedOffset; 04650 if (UnfoldedOffset != 0) { 04651 // Just add the immediate values. 04652 Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, 04653 UnfoldedOffset))); 04654 } 04655 04656 // Emit instructions summing all the operands. 04657 const SCEV *FullS = Ops.empty() ? 
04658 SE.getConstant(IntTy, 0) : 04659 SE.getAddExpr(Ops); 04660 Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); 04661 04662 // We're done expanding now, so reset the rewriter. 04663 Rewriter.clearPostInc(); 04664 04665 // An ICmpZero Formula represents an ICmp which we're handling as a 04666 // comparison against zero. Now that we've expanded an expression for that 04667 // form, update the ICmp's other operand. 04668 if (LU.Kind == LSRUse::ICmpZero) { 04669 ICmpInst *CI = cast<ICmpInst>(LF.UserInst); 04670 DeadInsts.push_back(CI->getOperand(1)); 04671 assert(!F.BaseGV && "ICmp does not support folding a global value and " 04672 "a scale at the same time!"); 04673 if (F.Scale == -1) { 04674 if (ICmpScaledV->getType() != OpTy) { 04675 Instruction *Cast = 04676 CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, 04677 OpTy, false), 04678 ICmpScaledV, OpTy, "tmp", CI); 04679 ICmpScaledV = Cast; 04680 } 04681 CI->setOperand(1, ICmpScaledV); 04682 } else { 04683 // A scale of 1 means that the scale has been expanded as part of the 04684 // base regs. 04685 assert((F.Scale == 0 || F.Scale == 1) && 04686 "ICmp does not support folding a global value and " 04687 "a scale at the same time!"); 04688 Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), 04689 -(uint64_t)Offset); 04690 if (C->getType() != OpTy) 04691 C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, 04692 OpTy, false), 04693 C, OpTy); 04694 04695 CI->setOperand(1, C); 04696 } 04697 } 04698 04699 return FullV; 04700 } 04701 04702 /// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use 04703 /// of their operands effectively happens in their predecessor blocks, so the 04704 /// expression may need to be expanded in multiple places. 04705 void LSRInstance::RewriteForPHI(PHINode *PN, 04706 const LSRFixup &LF, 04707 const Formula &F, 04708 SCEVExpander &Rewriter, 04709 SmallVectorImpl<WeakVH> &DeadInsts, 04710 Pass *P) const { 04711 DenseMap<BasicBlock *, Value *> Inserted; 04712 for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) 04713 if (PN->getIncomingValue(i) == LF.OperandValToReplace) { 04714 BasicBlock *BB = PN->getIncomingBlock(i); 04715 04716 // If this is a critical edge, split the edge so that we do not insert 04717 // the code on all predecessor/successor paths. We do this unless this 04718 // is the canonical backedge for this loop, which complicates post-inc 04719 // users. 04720 if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && 04721 !isa<IndirectBrInst>(BB->getTerminator())) { 04722 BasicBlock *Parent = PN->getParent(); 04723 Loop *PNLoop = LI.getLoopFor(Parent); 04724 if (!PNLoop || Parent != PNLoop->getHeader()) { 04725 // Split the critical edge. 04726 BasicBlock *NewBB = nullptr; 04727 if (!Parent->isLandingPad()) { 04728 NewBB = SplitCriticalEdge(BB, Parent, P, 04729 /*MergeIdenticalEdges=*/true, 04730 /*DontDeleteUselessPhis=*/true); 04731 } else { 04732 SmallVector<BasicBlock*, 2> NewBBs; 04733 SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs); 04734 NewBB = NewBBs[0]; 04735 } 04736 // If NewBB==NULL, then SplitCriticalEdge refused to split because all 04737 // phi predecessors are identical. The simple thing to do is skip 04738 // splitting in this case rather than complicate the API. 04739 if (NewBB) { 04740 // If PN is outside of the loop and BB is in the loop, we want to 04741 // move the block to be immediately before the PHI block, not 04742 // immediately after BB. 
04743 if (L->contains(BB) && !L->contains(PN)) 04744 NewBB->moveBefore(PN->getParent()); 04745 04746 // Splitting the edge can reduce the number of PHI entries we have. 04747 e = PN->getNumIncomingValues(); 04748 BB = NewBB; 04749 i = PN->getBasicBlockIndex(BB); 04750 } 04751 } 04752 } 04753 04754 std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = 04755 Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr))); 04756 if (!Pair.second) 04757 PN->setIncomingValue(i, Pair.first->second); 04758 else { 04759 Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts); 04760 04761 // If this is reuse-by-noop-cast, insert the noop cast. 04762 Type *OpTy = LF.OperandValToReplace->getType(); 04763 if (FullV->getType() != OpTy) 04764 FullV = 04765 CastInst::Create(CastInst::getCastOpcode(FullV, false, 04766 OpTy, false), 04767 FullV, LF.OperandValToReplace->getType(), 04768 "tmp", BB->getTerminator()); 04769 04770 PN->setIncomingValue(i, FullV); 04771 Pair.first->second = FullV; 04772 } 04773 } 04774 } 04775 04776 /// Rewrite - Emit instructions for the leading candidate expression for this 04777 /// LSRUse (this is called "expanding"), and update the UserInst to reference 04778 /// the newly expanded value. 04779 void LSRInstance::Rewrite(const LSRFixup &LF, 04780 const Formula &F, 04781 SCEVExpander &Rewriter, 04782 SmallVectorImpl<WeakVH> &DeadInsts, 04783 Pass *P) const { 04784 // First, find an insertion point that dominates UserInst. For PHI nodes, 04785 // find the nearest block which dominates all the relevant uses. 04786 if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { 04787 RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P); 04788 } else { 04789 Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts); 04790 04791 // If this is reuse-by-noop-cast, insert the noop cast. 04792 Type *OpTy = LF.OperandValToReplace->getType(); 04793 if (FullV->getType() != OpTy) { 04794 Instruction *Cast = 04795 CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), 04796 FullV, OpTy, "tmp", LF.UserInst); 04797 FullV = Cast; 04798 } 04799 04800 // Update the user. ICmpZero is handled specially here (for now) because 04801 // Expand may have updated one of the operands of the icmp already, and 04802 // its new value may happen to be equal to LF.OperandValToReplace, in 04803 // which case doing replaceUsesOfWith leads to replacing both operands 04804 // with the same value. TODO: Reorganize this. 04805 if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero) 04806 LF.UserInst->setOperand(0, FullV); 04807 else 04808 LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV); 04809 } 04810 04811 DeadInsts.push_back(LF.OperandValToReplace); 04812 } 04813 04814 /// ImplementSolution - Rewrite all the fixup locations with new values, 04815 /// following the chosen solution. 04816 void 04817 LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, 04818 Pass *P) { 04819 // Keep track of instructions we may have made dead, so that 04820 // we can remove them after we are done working. 04821 SmallVector<WeakVH, 16> DeadInsts; 04822 04823 SCEVExpander Rewriter(SE, "lsr"); 04824 #ifndef NDEBUG 04825 Rewriter.setDebugType(DEBUG_TYPE); 04826 #endif 04827 Rewriter.disableCanonicalMode(); 04828 Rewriter.enableLSRMode(); 04829 Rewriter.setIVIncInsertPos(L, IVIncInsertPos); 04830 04831 // Mark phi nodes that terminate chains so the expander tries to reuse them. 
04832 for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), 04833 ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { 04834 if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) 04835 Rewriter.setChainedPhi(PN); 04836 } 04837 04838 // Expand the new value definitions and update the users. 04839 for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), 04840 E = Fixups.end(); I != E; ++I) { 04841 const LSRFixup &Fixup = *I; 04842 04843 Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P); 04844 04845 Changed = true; 04846 } 04847 04848 for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), 04849 ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { 04850 GenerateIVChain(*ChainI, Rewriter, DeadInsts); 04851 Changed = true; 04852 } 04853 // Clean up after ourselves. This must be done before deleting any 04854 // instructions. 04855 Rewriter.clear(); 04856 04857 Changed |= DeleteTriviallyDeadInstructions(DeadInsts); 04858 } 04859 04860 LSRInstance::LSRInstance(Loop *L, Pass *P) 04861 : IU(P->getAnalysis<IVUsers>()), SE(P->getAnalysis<ScalarEvolution>()), 04862 DT(P->getAnalysis<DominatorTreeWrapperPass>().getDomTree()), 04863 LI(P->getAnalysis<LoopInfo>()), 04864 TTI(P->getAnalysis<TargetTransformInfo>()), L(L), Changed(false), 04865 IVIncInsertPos(nullptr) { 04866 // If LoopSimplify form is not available, stay out of trouble. 04867 if (!L->isLoopSimplifyForm()) 04868 return; 04869 04870 // If there's no interesting work to be done, bail early. 04871 if (IU.empty()) return; 04872 04873 // If there's too much analysis to be done, bail early. We won't be able to 04874 // model the problem anyway. 04875 unsigned NumUsers = 0; 04876 for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) { 04877 if (++NumUsers > MaxIVUsers) { 04878 DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << *L 04879 << "\n"); 04880 return; 04881 } 04882 } 04883 04884 #ifndef NDEBUG 04885 // All dominating loops must have preheaders, or SCEVExpander may not be able 04886 // to materialize an AddRecExpr whose Start is an outer AddRecExpr. 04887 // 04888 // IVUsers analysis should only create users that are dominated by simple loop 04889 // headers. Since this loop should dominate all of its users, its user list 04890 // should be empty if this loop itself is not within a simple loop nest. 04891 for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader()); 04892 Rung; Rung = Rung->getIDom()) { 04893 BasicBlock *BB = Rung->getBlock(); 04894 const Loop *DomLoop = LI.getLoopFor(BB); 04895 if (DomLoop && DomLoop->getHeader() == BB) { 04896 assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest"); 04897 } 04898 } 04899 #endif // DEBUG 04900 04901 DEBUG(dbgs() << "\nLSR on loop "; 04902 L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false); 04903 dbgs() << ":\n"); 04904 04905 // First, perform some low-level loop optimizations. 04906 OptimizeShadowIV(); 04907 OptimizeLoopTermCond(); 04908 04909 // If loop preparation eliminates all interesting IV users, bail. 04910 if (IU.empty()) return; 04911 04912 // Skip nested loops until we can model them better with formulae. 04913 if (!L->empty()) { 04914 DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); 04915 return; 04916 } 04917 04918 // Start collecting data and preparing for the solver. 
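// (Editorial overview, not part of the original source: the calls below are LSR's main pipeline. It first collects IV chains, interesting types and factors, and the fixups with their initial formulae; it then generates and filters reuse formulae and narrows the search space; finally Solve() picks one formula per use and ImplementSolution() rewrites the IR.)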
04919 CollectChains(); 04920 CollectInterestingTypesAndFactors(); 04921 CollectFixupsAndInitialFormulae(); 04922 CollectLoopInvariantFixupsAndFormulae(); 04923 04924 assert(!Uses.empty() && "IVUsers reported at least one use"); 04925 DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n"; 04926 print_uses(dbgs())); 04927 04928 // Now use the reuse data to generate a bunch of interesting ways 04929 // to formulate the values needed for the uses. 04930 GenerateAllReuseFormulae(); 04931 04932 FilterOutUndesirableDedicatedRegisters(); 04933 NarrowSearchSpaceUsingHeuristics(); 04934 04935 SmallVector<const Formula *, 8> Solution; 04936 Solve(Solution); 04937 04938 // Release memory that is no longer needed. 04939 Factors.clear(); 04940 Types.clear(); 04941 RegUses.clear(); 04942 04943 if (Solution.empty()) 04944 return; 04945 04946 #ifndef NDEBUG 04947 // Formulae should be legal. 04948 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); 04949 I != E; ++I) { 04950 const LSRUse &LU = *I; 04951 for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), 04952 JE = LU.Formulae.end(); 04953 J != JE; ++J) 04954 assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, 04955 *J) && "Illegal formula generated!"); 04956 }; 04957 #endif 04958 04959 // Now that we've decided what we want, make it so. 04960 ImplementSolution(Solution, P); 04961 } 04962 04963 void LSRInstance::print_factors_and_types(raw_ostream &OS) const { 04964 if (Factors.empty() && Types.empty()) return; 04965 04966 OS << "LSR has identified the following interesting factors and types: "; 04967 bool First = true; 04968 04969 for (SmallSetVector<int64_t, 8>::const_iterator 04970 I = Factors.begin(), E = Factors.end(); I != E; ++I) { 04971 if (!First) OS << ", "; 04972 First = false; 04973 OS << '*' << *I; 04974 } 04975 04976 for (SmallSetVector<Type *, 4>::const_iterator 04977 I = Types.begin(), E = Types.end(); I != E; ++I) { 04978 if (!First) OS << ", "; 04979 First = false; 04980 OS << '(' << **I << ')'; 04981 } 04982 OS << '\n'; 04983 } 04984 04985 void LSRInstance::print_fixups(raw_ostream &OS) const { 04986 OS << "LSR is examining the following fixup sites:\n"; 04987 for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(), 04988 E = Fixups.end(); I != E; ++I) { 04989 dbgs() << " "; 04990 I->print(OS); 04991 OS << '\n'; 04992 } 04993 } 04994 04995 void LSRInstance::print_uses(raw_ostream &OS) const { 04996 OS << "LSR is examining the following uses:\n"; 04997 for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), 04998 E = Uses.end(); I != E; ++I) { 04999 const LSRUse &LU = *I; 05000 dbgs() << " "; 05001 LU.print(OS); 05002 OS << '\n'; 05003 for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), 05004 JE = LU.Formulae.end(); J != JE; ++J) { 05005 OS << " "; 05006 J->print(OS); 05007 OS << '\n'; 05008 } 05009 } 05010 } 05011 05012 void LSRInstance::print(raw_ostream &OS) const { 05013 print_factors_and_types(OS); 05014 print_fixups(OS); 05015 print_uses(OS); 05016 } 05017 05018 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 05019 void LSRInstance::dump() const { 05020 print(errs()); errs() << '\n'; 05021 } 05022 #endif 05023 05024 namespace { 05025 05026 class LoopStrengthReduce : public LoopPass { 05027 public: 05028 static char ID; // Pass ID, replacement for typeid 05029 LoopStrengthReduce(); 05030 05031 private: 05032 bool runOnLoop(Loop *L, LPPassManager &LPM) override; 05033 void getAnalysisUsage(AnalysisUsage &AU) const override; 
05034 }; 05035 05036 } 05037 05038 char LoopStrengthReduce::ID = 0; 05039 INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", 05040 "Loop Strength Reduction", false, false) 05041 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 05042 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 05043 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 05044 INITIALIZE_PASS_DEPENDENCY(IVUsers) 05045 INITIALIZE_PASS_DEPENDENCY(LoopInfo) 05046 INITIALIZE_PASS_DEPENDENCY(LoopSimplify) 05047 INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", 05048 "Loop Strength Reduction", false, false) 05049 05050 05051 Pass *llvm::createLoopStrengthReducePass() { 05052 return new LoopStrengthReduce(); 05053 } 05054 05055 LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) { 05056 initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); 05057 } 05058 05059 void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { 05060 // We split critical edges, so we change the CFG. However, we do update 05061 // many analyses if they are around. 05062 AU.addPreservedID(LoopSimplifyID); 05063 05064 AU.addRequired<LoopInfo>(); 05065 AU.addPreserved<LoopInfo>(); 05066 AU.addRequiredID(LoopSimplifyID); 05067 AU.addRequired<DominatorTreeWrapperPass>(); 05068 AU.addPreserved<DominatorTreeWrapperPass>(); 05069 AU.addRequired<ScalarEvolution>(); 05070 AU.addPreserved<ScalarEvolution>(); 05071 // Requiring LoopSimplify a second time here prevents IVUsers from running 05072 // twice, since LoopSimplify was invalidated by running ScalarEvolution. 05073 AU.addRequiredID(LoopSimplifyID); 05074 AU.addRequired<IVUsers>(); 05075 AU.addPreserved<IVUsers>(); 05076 AU.addRequired<TargetTransformInfo>(); 05077 } 05078 05079 bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { 05080 if (skipOptnoneFunction(L)) 05081 return false; 05082 05083 bool Changed = false; 05084 05085 // Run the main LSR transformation. 05086 Changed |= LSRInstance(L, this).getChanged(); 05087 05088 // Remove any extra phis created by processing inner loops. 05089 Changed |= DeleteDeadPHIs(L->getHeader()); 05090 if (EnablePhiElim && L->isLoopSimplifyForm()) { 05091 SmallVector<WeakVH, 16> DeadInsts; 05092 SCEVExpander Rewriter(getAnalysis<ScalarEvolution>(), "lsr"); 05093 #ifndef NDEBUG 05094 Rewriter.setDebugType(DEBUG_TYPE); 05095 #endif 05096 unsigned numFolded = Rewriter.replaceCongruentIVs( 05097 L, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), DeadInsts, 05098 &getAnalysis<TargetTransformInfo>()); 05099 if (numFolded) { 05100 Changed = true; 05101 DeleteTriviallyDeadInstructions(DeadInsts); 05102 DeleteDeadPHIs(L->getHeader()); 05103 } 05104 } 05105 return Changed; 05106 }
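For readers wiring the pass up by hand, the sketch below shows one way to schedule it over a module with the legacy pass manager. It is not part of LoopStrengthReduce.cpp; the header and API names are assumed to match an LLVM release contemporary with this file, and the equivalent command-line invocation is simply opt -loop-reduce -S input.ll.

// Editorial sketch, not part of LoopStrengthReduce.cpp: run LSR from client
// code. Adding the pass lets the legacy pass manager schedule the analyses
// declared in getAnalysisUsage() above (LoopSimplify, DominatorTree,
// ScalarEvolution, IVUsers, TargetTransformInfo).
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

void runLoopStrengthReduce(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createLoopStrengthReducePass()); // factory defined above
  PM.run(M);                                    // applies LSR to every loop in M
}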