LLVM API Documentation
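The listing below is the source of LoopIdiomRecognize.cpp, the pass registered as "loop-idiom". As a rough illustration of what the pass is for (these example loops are not taken from the file; the exact conditions are spelled out in the comments of processLoopStore, processLoopMemSet, and NclPopcountRecognize below), it rewrites loops such as the following:

// Hypothetical input loops, for illustration only.
void zero(char *P, int N) {
  for (int i = 0; i != N; ++i)
    P[i] = 0;                        // strided store of a splattable value;
}                                    // typically becomes a single memset

void copy(char *D, const char *S, int N) {
  for (int i = 0; i != N; ++i)
    D[i] = S[i];                     // store of a same-strided load;
}                                    // typically becomes a single memcpy

int popcount(unsigned X) {
  int Cnt = 0;
  while (X) { Cnt++; X &= X - 1; }   // the population-count idiom;
  return Cnt;                        // typically becomes a call to llvm.ctpop
}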
//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form.  In cases that this kicks in, it can be a significant
// performance win.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, memmove, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop, ctlz, cttz
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set.  It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// We should enhance the memset/memcpy recognition to handle multiple stores in
// the loop.  This would handle things like:
//   void foo(_Complex float *P)
//     for (i) { __real__(*P) = 0;  __imag__(*P) = 0; }
//
// We should enhance this to handle negative strides through memory.
// Alternatively (and perhaps better) we could rely on an earlier pass to force
// forward iteration through memory, which is generally better for cache
// behavior.  Negative strides *do* happen for memset/memcpy loops.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");

namespace {

class LoopIdiomRecognize;

/// This class defines some utility functions for loop idiom recognition.
class LIRUtil {
public:
  /// Return true iff the block contains nothing but an unconditional branch
  /// (aka goto instruction).
  static bool isAlmostEmpty(BasicBlock *);

  static BranchInst *getBranch(BasicBlock *BB) {
    return dyn_cast<BranchInst>(BB->getTerminator());
  }

  /// Derive the precondition block (i.e. the block that guards the loop
  /// preheader) from the given preheader.
  static BasicBlock *getPrecondBb(BasicBlock *PreHead);
};

/// This class recognizes idioms of population count conducted in
/// a noncountable loop. Currently it only recognizes this pattern:
/// \code
///   while(x) {cnt++; ...; x &= x - 1; ...}
/// \endcode
class NclPopcountRecognize {
  LoopIdiomRecognize &LIR;
  Loop *CurLoop;
  BasicBlock *PreCondBB;

  typedef IRBuilder<> IRBuilderTy;

public:
  explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
  bool recognize();

private:
  /// Take a glimpse of the loop to see if we need to go ahead recognizing
  /// the idiom.
  bool preliminaryScreen();

  /// Check if the given conditional branch is based on the comparison
  /// between a variable and zero, and if the variable is non-zero, the
  /// control yields to the loop entry. If the branch matches the behavior,
  /// the variable involved in the comparison is returned. This function will
  /// be called to see if the precondition and postcondition of the loop
  /// are in desirable form.
  Value *matchCondition(BranchInst *Br, BasicBlock *NonZeroTarget) const;

  /// Return true iff the idiom is detected in the loop, and 1) \p CntInst
  /// is set to the instruction counting the population bit. 2) \p CntPhi
  /// is set to the corresponding phi node. 3) \p Var is set to the value
  /// whose population bits are being counted.
  bool detectIdiom
    (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;

  /// Insert ctpop intrinsic function and some obviously dead instructions.
  void transform(Instruction *CntInst, PHINode *CntPhi, Value *Var);

  /// Create llvm.ctpop.* intrinsic function.
  CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
};

class LoopIdiomRecognize : public LoopPass {
  Loop *CurLoop;
  const DataLayout *DL;
  DominatorTree *DT;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
public:
  static char ID;
  explicit LoopIdiomRecognize() : LoopPass(ID) {
    initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
    DL = nullptr; DT = nullptr; SE = nullptr; TLI = nullptr; TTI = nullptr;
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock*> &ExitBlocks);

  bool processLoopStore(StoreInst *SI, const SCEV *BECount);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               unsigned StoreAlignment,
                               Value *SplatValue, Instruction *TheStore,
                               const SCEVAddRecExpr *Ev,
                               const SCEV *BECount);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                                  const SCEVAddRecExpr *StoreEv,
                                  const SCEVAddRecExpr *LoadEv,
                                  const SCEV *BECount);

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  ///
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LoopInfo>();
    AU.addPreserved<LoopInfo>();
    AU.addRequiredID(LoopSimplifyID);
    AU.addPreservedID(LoopSimplifyID);
    AU.addRequiredID(LCSSAID);
    AU.addPreservedID(LCSSAID);
    AU.addRequired<AliasAnalysis>();
    AU.addPreserved<AliasAnalysis>();
    AU.addRequired<ScalarEvolution>();
    AU.addPreserved<ScalarEvolution>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetLibraryInfo>();
    AU.addRequired<TargetTransformInfo>();
  }

  const DataLayout *getDataLayout() {
    if (DL)
      return DL;
    DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
    DL = DLP ? &DLP->getDataLayout() : nullptr;
    return DL;
  }

  DominatorTree *getDominatorTree() {
    return DT ? DT
              : (DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree());
  }

  ScalarEvolution *getScalarEvolution() {
    return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
  }

  TargetLibraryInfo *getTargetLibraryInfo() {
    return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
  }

  const TargetTransformInfo *getTargetTransformInfo() {
    return TTI ? TTI : (TTI = &getAnalysis<TargetTransformInfo>());
  }

  Loop *getLoop() const { return CurLoop; }

private:
  bool runOnNoncountableLoop();
  bool runOnCountableLoop();
};
}

char LoopIdiomRecognize::ID = 0;
INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                    false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }

/// deleteDeadInstruction - Delete this instruction. Before we do, go through
/// and zero out all the operands of this instruction. If any of them become
/// dead, delete them and the computation tree that feeds them.
///
static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
                                  const TargetLibraryInfo *TLI) {
  SmallVector<Instruction*, 32> NowDeadInsts;

  NowDeadInsts.push_back(I);

  // Before we touch this instruction, remove it from SE!
  do {
    Instruction *DeadInst = NowDeadInsts.pop_back_val();

    // This instruction is dead, zap it, in stages.  Start by removing it from
    // SCEV.
    SE.forgetValue(DeadInst);

    for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
      Value *Op = DeadInst->getOperand(op);
      DeadInst->setOperand(op, nullptr);

      // If this operand just became dead, add it to the NowDeadInsts list.
      if (!Op->use_empty()) continue;

      if (Instruction *OpI = dyn_cast<Instruction>(Op))
        if (isInstructionTriviallyDead(OpI, TLI))
          NowDeadInsts.push_back(OpI);
    }

    DeadInst->eraseFromParent();

  } while (!NowDeadInsts.empty());
}

/// deleteIfDeadInstruction - If the specified value is a dead instruction,
/// delete it and any recursively used instructions.
static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
                                    const TargetLibraryInfo *TLI) {
  if (Instruction *I = dyn_cast<Instruction>(V))
    if (isInstructionTriviallyDead(I, TLI))
      deleteDeadInstruction(I, SE, TLI);
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LIRUtil
//
//===----------------------------------------------------------------------===//

// This function will return true iff the given block contains nothing but a
// goto. A typical use of this function is to check if the preheader block is
// "almost" empty such that generated intrinsic functions can be moved across
// the preheader and be placed at the end of the precondition block without
// the concern of breaking data dependence.
bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
  if (BranchInst *Br = getBranch(BB)) {
    return Br->isUnconditional() && BB->size() == 1;
  }
  return false;
}

BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
  if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
    BranchInst *Br = getBranch(BB);
    return Br && Br->isConditional() ? BB : nullptr;
  }
  return nullptr;
}

//===----------------------------------------------------------------------===//
//
//          Implementation of NclPopcountRecognize
//
//===----------------------------------------------------------------------===//

NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(nullptr) {
}

bool NclPopcountRecognize::preliminaryScreen() {
  const TargetTransformInfo *TTI = LIR.getTargetTransformInfo();
  if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
    return false;

  // Counting the population is usually done with a few arithmetic
  // instructions. Such instructions can be easily "absorbed" by vacant slots
  // in a non-compact loop. Therefore, recognizing the popcount idiom only
  // makes sense in a compact loop.

  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  BasicBlock *LoopBody = *(CurLoop->block_begin());
  if (LoopBody->size() >= 20) {
    // The loop is too big, bail out.
    return false;
  }

  // It should have a preheader containing nothing but a goto instruction.
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
    return false;

  // It should have a precondition block where the generated popcount intrinsic
  // function will be inserted.
  PreCondBB = LIRUtil::getPrecondBb(PreHead);
  if (!PreCondBB)
    return false;

  return true;
}

Value *NclPopcountRecognize::matchCondition(BranchInst *Br,
                                            BasicBlock *LoopEntry) const {
  if (!Br || !Br->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}

bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
                                       PHINode *&CntPhi,
                                       Value *&Var) const {
  // The following code tries to detect this idiom:
  //
  //    if (x0 != 0)
  //      goto loop-exit // the precondition of the loop
  //    cnt0 = init-val;
  //    do {
  //       x1 = phi (x0, x2);
  //       cnt1 = phi(cnt0, cnt2);
  //
  //       cnt2 = cnt1 + 1;
  //        ...
  //       x2 = x1 & (x1 - 1);
  //        ...
  //    } while(x != 0);
  //
  // loop-exit:
  //

  // step 1: Check to see if the loop-back branch matches this pattern:
  //    "if (a != 0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  {
    if (Value *T = matchCondition(LIRUtil::getBranch(LoopEntry), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;

    BinaryOperator *SubOneOp;

    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp)
      return false;

    Instruction *SubInst = cast<Instruction>(SubOneOp);
    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
    if (!Dec ||
        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  {
    PhiX = dyn_cast<PHINode>(VarX1);
    if (!PhiX ||
        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
      return false;
    }
  }

  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
  {
    CountInst = nullptr;
    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
           IterE = LoopEntry->end(); Iter != IterE; Iter++) {
      Instruction *Inst = Iter;
      if (Inst->getOpcode() != Instruction::Add)
        continue;

      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;

      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
      if (!Phi || Phi->getParent() != LoopEntry)
        continue;

      // Check if the result of the instruction is live out of the loop.
      bool LiveOutLoop = false;
      for (User *U : Inst->users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true; break;
        }
      }

      if (LiveOutLoop) {
        CountInst = Inst;
        CountPhi = Phi;
        break;
      }
    }

    if (!CountInst)
      return false;
  }

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  {
    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;

    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }

  return true;
}

void NclPopcountRecognize::transform(Instruction *CntInst,
                                     PHINode *CntPhi, Value *Var) {

  ScalarEvolution *SE = LIR.getScalarEvolution();
  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
  BasicBlock *PreHead = CurLoop->getLoopPreheader();
  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
  const DebugLoc DL = CntInst->getDebugLoc();

  // Assume that before the transformation the loop looks like the following:
  //   if (x) // the precondition
  //     do { cnt++; x &= x - 1; } while(x);

  // Step 1: Insert the ctpop instruction at the end of the precondition block
  IRBuilderTy Builder(PreCondBr);
  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
  {
    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
    NewCount = PopCntZext =
      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));

    if (NewCount != PopCnt)
      (cast<Instruction>(NewCount))->setDebugLoc(DL);

    // TripCnt is exactly the number of iterations the loop has
    TripCnt = NewCount;

    // If the population counter's initial value is not zero, insert an Add
    // instruction.
    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
    if (!InitConst || !InitConst->isZero()) {
      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
      (cast<Instruction>(NewCount))->setDebugLoc(DL);
    }
  }

  // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
  //   "if (NewCount == 0) loop-exit". Without this change, the intrinsic
  //   function would be partial dead code, and downstream passes will drag
  //   it back from the precondition block to the preheader.
  {
    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());

    Value *Opnd0 = PopCntZext;
    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
    if (PreCond->getOperand(0) != Var)
      std::swap(Opnd0, Opnd1);

    ICmpInst *NewPreCond =
      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
    PreCond->replaceAllUsesWith(NewPreCond);

    deleteDeadInstruction(PreCond, *SE, TLI);
  }

  // Step 3: Note that the population count is exactly the trip count of the
  // loop in question, which enables us to convert the loop from a noncountable
  // loop into a countable one. The benefit is twofold:
  //
  //  - If the loop only counts population, the entire loop becomes dead after
  //    the transformation. It is much easier to prove a countable loop dead
  //    than to prove a noncountable one.
  //    (In some C dialects, an infinite loop isn't dead even if it computes
  //    nothing useful. In general, DCE needs to prove a noncountable loop
  //    finite before safely deleting it.)
  //
  //  - If the loop also performs something else, it remains alive.
  //    Since it is transformed to countable form, it can be aggressively
  //    optimized by some optimizations which are in general not applicable
  //    to a noncountable loop.
  //
  // After this step, this loop (conceptually) would look like the following:
  //   newcnt = __builtin_ctpop(x);
  //   t = newcnt;
  //   if (x)
  //     do { cnt++; x &= x-1; t--; } while (t > 0);
  BasicBlock *Body = *(CurLoop->block_begin());
  {
    BranchInst *LbBr = LIRUtil::getBranch(Body);
    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
    Type *Ty = TripCnt->getType();

    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());

    Builder.SetInsertPoint(LbCond);
    Value *Opnd1 = cast<Value>(TcPhi);
    Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
    Instruction *TcDec =
      cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));

    TcPhi->addIncoming(TripCnt, PreHead);
    TcPhi->addIncoming(TcDec, Body);

    CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
      CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
    LbCond->setPredicate(Pred);
    LbCond->setOperand(0, TcDec);
    LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
  }

  // Step 4: All the references to the original population counter outside
  //  the loop are replaced with the NewCount -- the value returned from
  //  __builtin_ctpop().
  {
    SmallVector<Value *, 4> CntUses;
    for (User *U : CntInst->users())
      if (cast<Instruction>(U)->getParent() != Body)
        CntUses.push_back(U);
    for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
      (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
    }
  }

  // step 5: Forget the "non-computable" trip-count SCEV associated with the
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
}

CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder,
                                                      Value *Val, DebugLoc DL) {
  Value *Ops[] = { Val };
  Type *Tys[] = { Val->getType() };

  Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
  CI->setDebugLoc(DL);

  return CI;
}

/// recognize - detect the population count idiom in a non-countable loop. If
/// detected, transform the relevant code into a call to the popcount intrinsic
/// function and return true; otherwise, return false.
bool NclPopcountRecognize::recognize() {

  if (!LIR.getTargetTransformInfo())
    return false;

  LIR.getScalarEvolution();

  if (!preliminaryScreen())
    return false;

  Instruction *CntInst;
  PHINode *CntPhi;
  Value *Val;
  if (!detectIdiom(CntInst, CntPhi, Val))
    return false;

  transform(CntInst, CntPhi, Val);
  return true;
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  if (isa<SCEVCouldNotCompute>(BECount)) return false;

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getValue()->getValue() == 0)
      return false;

  // We require target data for now.
  if (!getDataLayout())
    return false;

  // set DT
  (void)getDominatorTree();

  LoopInfo &LI = getAnalysis<LoopInfo>();
  TLI = &getAnalysis<TargetLibraryInfo>();

  // set TLI
  (void)getTargetLibraryInfo();

  SmallVector<BasicBlock*, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  DEBUG(dbgs() << "loop-idiom Scanning: F["
               << CurLoop->getHeader()->getParent()->getName()
               << "] Loop %" << CurLoop->getHeader()->getName() << "\n");

  bool MadeChange = false;
  // Scan all the blocks in the loop that are not in subloops.
  for (Loop::block_iterator BI = CurLoop->block_begin(),
         E = CurLoop->block_end(); BI != E; ++BI) {
    // Ignore blocks in subloops.
    if (LI.getLoopFor(*BI) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks);
  }
  return MadeChange;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  NclPopcountRecognize Popcount(*this);
  if (Popcount.recognize())
    return true;

  return false;
}

bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
  if (skipOptnoneFunction(L))
    return false;

  CurLoop = L;

  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it, just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy")
    return false;

  SE = &getAnalysis<ScalarEvolution>();
  if (SE->hasLoopInvariantBackedgeTakenCount(L))
    return runOnCountableLoop();
  return runOnNoncountableLoop();
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count. This block is known to be in the current
/// loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                                        SmallVectorImpl<BasicBlock*> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop.
  // For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop. Verify this now.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
    if (!DT->dominates(BB, ExitBlocks[i]))
      return false;

  bool MadeChange = false;
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
    Instruction *Inst = I++;
    // Look for store instructions, which may be optimized to memset/memcpy.
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      WeakVH InstPtr(I);
      if (!processLoopStore(SI, BECount)) continue;
      MadeChange = true;

      // If processing the store invalidated our iterator, start over from the
      // top of the block.
      if (!InstPtr)
        I = BB->begin();
      continue;
    }

    // Look for memset instructions, which may be optimized to a larger memset.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
      WeakVH InstPtr(I);
      if (!processLoopMemSet(MSI, BECount)) continue;
      MadeChange = true;

      // If processing the memset invalidated our iterator, start over from the
      // top of the block.
      if (!InstPtr)
        I = BB->begin();
      continue;
    }
  }

  return MadeChange;
}


/// processLoopStore - See if this store can be promoted to a memset or memcpy.
bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
  if (!SI->isSimple()) return false;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Reject stores that are so large that they overflow an unsigned.
  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
    return false;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
    dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return false;

  // Check to see if the stride matches the size of the store. If so, then we
  // know that every byte is touched in the loop.
  unsigned StoreSize = (unsigned)SizeInBits >> 3;
  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1));

  if (!Stride || StoreSize != Stride->getValue()->getValue()) {
    // TODO: Could also handle negative stride here someday, that will require
    // the validity check in mayLoopAccessLocation to be updated though.
    // Enable this to print exact negative strides.
    if (0 && Stride && StoreSize == -Stride->getValue()->getValue()) {
      dbgs() << "NEGATIVE STRIDE: " << *SI << "\n";
      dbgs() << "BB: " << *SI->getParent();
    }

    return false;
  }

  // See if we can optimize just this store in isolation.
  if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(),
                              StoredVal, SI, StoreEv, BECount))
    return true;

  // If the stored value is a strided load in the same loop with the same
  // stride, this may be transformable into a memcpy.
  // This kicks in for stuff like
  //   for (i) A[i] = B[i];
  if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
    const SCEVAddRecExpr *LoadEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
    if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
        StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple())
      if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
        return true;
  }
  //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n";

  return false;
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::
processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false;

  // If we're not allowed to hack on memset, we fail.
  if (!TLI->has(LibFunc::memset))
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store. If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
    return false;

  // Reject memsets that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check to see if the stride matches the size of the memset. If so, then we
  // know that every byte is touched in the loop.
  const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1));

  // TODO: Could also handle negative stride here someday, that will require
  // the validity check in mayLoopAccessLocation to be updated though.
  if (!Stride || MSI->getLength() != Stride->getValue())
    return false;

  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
                                 MSI->getAlignment(), MSI->getValue(),
                                 MSI, Ev, BECount);
}


/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access. The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool mayLoopAccessLocation(Value *Ptr, AliasAnalysis::ModRefResult Access,
                                  Loop *L, const SCEV *BECount,
                                  unsigned StoreSize, AliasAnalysis &AA,
                                  Instruction *IgnoredStore) {
  // Get the location that may be stored across the loop. Since the access is
  // strided positively through memory, we say that the modified location starts
  // at the pointer and has infinite size.
  uint64_t AccessSize = AliasAnalysis::UnknownSize;

  // If the loop iterates a fixed number of times, we can refine the access size
  // to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize;

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store.
  // A store to &A[i] of 100 will always return MayAlias with a store to
  // &A[100]; we need StoreLoc to be "A" with a size of 100, which will then
  // no-alias a store to &A[100].
  AliasAnalysis::Location StoreLoc(Ptr, AccessSize);

  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
      if (&*I != IgnoredStore &&
          (AA.getModRefInfo(I, StoreLoc) & Access))
        return true;

  return false;
}

/// getMemSetPatternValue - If a strided store of the specified value is safe to
/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
/// be passed in. Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout &DL) {
  // If the value isn't a constant, we can't promote it to being in a constant
  // array. We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C) return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL.getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size-1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL.isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16 bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
  if (Size > 16) return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16) return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16/Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C));
}


/// processLoopStridedStore - We see a strided store of some value. If we can
/// transform this into a memset or memset_pattern in the loop preheader, do so.
bool LoopIdiomRecognize::
processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                        unsigned StoreAlignment, Value *StoredVal,
                        Instruction *TheStore, const SCEVAddRecExpr *Ev,
                        const SCEV *BECount) {

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored. A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal);
  Constant *PatternValue = nullptr;

  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();

  // If we're allowed to form a memset, and the stored value would be acceptable
  // for memset, use it.
  if (SplatValue && TLI->has(LibFunc::memset) &&
      // Verify that the stored value is loop invariant. If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // Keep and use SplatValue.
    PatternValue = nullptr;
  } else if (DestAS == 0 &&
             TLI->has(LibFunc::memset_pattern16) &&
             (PatternValue = getMemSetPatternValue(StoredVal, *DL))) {
    // Don't create memset_pattern16s with address spaces.
    // It looks like we can use PatternValue!
    SplatValue = nullptr;
  } else {
    // Otherwise, this isn't an idiom we can transform. For example, we can't
    // do anything with a 3-byte store.
    return false;
  }

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, "loop-idiom");

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);

  // Okay, we have a strided store "p[i]" of a splattable value. We can turn
  // this into a memset in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location. Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
    Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
                           Preheader->getTerminator());

  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
                            CurLoop, BECount,
                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    deleteIfDeadInstruction(BasePtr, *SE, TLI);
    return false;
  }

  // Okay, everything looks good, insert the memset.

  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
  // pointer size if it isn't already.
  Type *IntPtr = Builder.getIntPtrTy(DL, DestAS);
  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);

  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
                                         SCEV::FlagNUW);
  if (StoreSize != 1) {
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                               SCEV::FlagNUW);
  }

  Value *NumBytes =
    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall = Builder.CreateMemSet(BasePtr,
                                   SplatValue,
                                   NumBytes,
                                   StoreAlignment);
  } else {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getParent()->getParent()->getParent();
    Value *MSP = M->getOrInsertFunction("memset_pattern16",
                                        Builder.getVoidTy(),
                                        Int8PtrTy,
                                        Int8PtrTy,
                                        IntPtr,
                                        (void*)nullptr);

    // Otherwise we should form a memset_pattern16. PatternValue is known to be
    // a constant array of 16 bytes. Plop the value into a mergeable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::InternalLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(true); // Ok to merge these.
    GV->setAlignment(16);
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
  }

  DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
               << "    from store to: " << *Ev << " at: " << *TheStore << "\n");
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  // Okay, the memset has been formed. Zap the original store and anything that
  // feeds into it.
  deleteDeadInstruction(TheStore, *SE, TLI);
  ++NumMemSet;
  return true;
}

/// processLoopStoreOfLoopLoad - We see a strided store whose value is a
/// same-strided load.
bool LoopIdiomRecognize::
processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
                           const SCEVAddRecExpr *StoreEv,
                           const SCEVAddRecExpr *LoadEv,
                           const SCEV *BECount) {
  // If we're not allowed to form memcpy, we fail.
  if (!TLI->has(LibFunc::memcpy))
    return false;

  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());

  // The trip count of the loop and the base pointer of the addrec SCEV is
  // guaranteed to be loop invariant, which means that it should dominate the
  // header. This allows us to insert code for it in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, "loop-idiom");

  // Okay, we have a strided store "p[i]" of a loaded value. We can turn
  // this into a memcpy in the loop preheader now if we want. However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to. This includes the load that
  // feeds the stores. Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr =
    Expander.expandCodeFor(StoreEv->getStart(),
                           Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
                           Preheader->getTerminator());

  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
                            CurLoop, BECount, StoreSize,
                            getAnalysis<AliasAnalysis>(), SI)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
    return false;
  }

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr =
    Expander.expandCodeFor(LoadEv->getStart(),
                           Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
                           Preheader->getTerminator());

  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
    return false;
  }

  // Okay, everything is safe, we can transform this!


  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
  // pointer size if it isn't already.
  Type *IntPtrTy = Builder.getIntPtrTy(DL, SI->getPointerAddressSpace());
  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);

  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
                                         SCEV::FlagNUW);
  if (StoreSize != 1)
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                               SCEV::FlagNUW);

  Value *NumBytes =
    Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());

  CallInst *NewCall =
    Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
                         std::min(SI->getAlignment(), LI->getAlignment()));
  NewCall->setDebugLoc(SI->getDebugLoc());

  DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
               << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
               << "    from store ptr=" << *StoreEv << " at: " << *SI << "\n");


  // Okay, the memcpy has been formed. Zap the original store and anything that
  // feeds into it.
  deleteDeadInstruction(SI, *SE, TLI);
  ++NumMemCpy;
  return true;
}
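A usage note, not part of the file above: with the legacy pass manager of this LLVM release, the pass can be scheduled on its own, for example

  opt -loop-idiom -S input.ll -o output.ll

and, in an assertions-enabled build, passing -debug-only=loop-idiom makes the DEBUG statements above print the "Formed memset"/"Formed memcpy" lines. The required canonicalizations (loop-simplify, LCSSA) and analyses are pulled in automatically through the dependencies declared in getAnalysisUsage().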