LLVM API Documentation
00001 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements a basic-block vectorization pass. The algorithm was 00011 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, 00012 // et al. It works by looking for chains of pairable operations and then 00013 // pairing them. 00014 // 00015 //===----------------------------------------------------------------------===// 00016 00017 #define BBV_NAME "bb-vectorize" 00018 #include "llvm/Transforms/Vectorize.h" 00019 #include "llvm/ADT/DenseMap.h" 00020 #include "llvm/ADT/DenseSet.h" 00021 #include "llvm/ADT/STLExtras.h" 00022 #include "llvm/ADT/SmallSet.h" 00023 #include "llvm/ADT/SmallVector.h" 00024 #include "llvm/ADT/Statistic.h" 00025 #include "llvm/ADT/StringExtras.h" 00026 #include "llvm/Analysis/AliasAnalysis.h" 00027 #include "llvm/Analysis/AliasSetTracker.h" 00028 #include "llvm/Analysis/ScalarEvolution.h" 00029 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 00030 #include "llvm/Analysis/TargetTransformInfo.h" 00031 #include "llvm/Analysis/ValueTracking.h" 00032 #include "llvm/IR/Constants.h" 00033 #include "llvm/IR/DataLayout.h" 00034 #include "llvm/IR/DerivedTypes.h" 00035 #include "llvm/IR/Dominators.h" 00036 #include "llvm/IR/Function.h" 00037 #include "llvm/IR/Instructions.h" 00038 #include "llvm/IR/IntrinsicInst.h" 00039 #include "llvm/IR/Intrinsics.h" 00040 #include "llvm/IR/LLVMContext.h" 00041 #include "llvm/IR/Metadata.h" 00042 #include "llvm/IR/Type.h" 00043 #include "llvm/IR/ValueHandle.h" 00044 #include "llvm/Pass.h" 00045 #include "llvm/Support/CommandLine.h" 00046 #include "llvm/Support/Debug.h" 00047 #include "llvm/Support/raw_ostream.h" 00048 #include "llvm/Transforms/Utils/Local.h" 00049 #include <algorithm> 00050 using namespace llvm; 00051 00052 #define DEBUG_TYPE BBV_NAME 00053 00054 static cl::opt<bool> 00055 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), 00056 cl::Hidden, cl::desc("Ignore target information")); 00057 00058 static cl::opt<unsigned> 00059 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, 00060 cl::desc("The required chain depth for vectorization")); 00061 00062 static cl::opt<bool> 00063 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), 00064 cl::Hidden, cl::desc("Use the chain depth requirement with" 00065 " target information")); 00066 00067 static cl::opt<unsigned> 00068 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, 00069 cl::desc("The maximum search distance for instruction pairs")); 00070 00071 static cl::opt<bool> 00072 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, 00073 cl::desc("Replicating one element to a pair breaks the chain")); 00074 00075 static cl::opt<unsigned> 00076 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, 00077 cl::desc("The size of the native vector registers")); 00078 00079 static cl::opt<unsigned> 00080 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, 00081 cl::desc("The maximum number of pairing iterations")); 00082 00083 static cl::opt<bool> 00084 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, 00085 cl::desc("Don't try to form non-2^n-length vectors")); 00086 00087 static cl::opt<unsigned> 00088 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, 00089 cl::desc("The maximum number of pairable instructions per group")); 00090 00091 static cl::opt<unsigned> 00092 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, 00093 cl::desc("The maximum number of candidate instruction pairs per group")); 00094 00095 static cl::opt<unsigned> 00096 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), 00097 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" 00098 " a full cycle check")); 00099 00100 static cl::opt<bool> 00101 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, 00102 cl::desc("Don't try to vectorize boolean (i1) values")); 00103 00104 static cl::opt<bool> 00105 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, 00106 cl::desc("Don't try to vectorize integer values")); 00107 00108 static cl::opt<bool> 00109 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, 00110 cl::desc("Don't try to vectorize floating-point values")); 00111 00112 // FIXME: This should default to false once pointer vector support works. 00113 static cl::opt<bool> 00114 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, 00115 cl::desc("Don't try to vectorize pointer values")); 00116 00117 static cl::opt<bool> 00118 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, 00119 cl::desc("Don't try to vectorize casting (conversion) operations")); 00120 00121 static cl::opt<bool> 00122 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, 00123 cl::desc("Don't try to vectorize floating-point math intrinsics")); 00124 00125 static cl::opt<bool> 00126 NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden, 00127 cl::desc("Don't try to vectorize BitManipulation intrinsics")); 00128 00129 static cl::opt<bool> 00130 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, 00131 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); 00132 00133 static cl::opt<bool> 00134 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, 00135 cl::desc("Don't try to vectorize select instructions")); 00136 00137 static cl::opt<bool> 00138 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, 00139 cl::desc("Don't try to vectorize comparison instructions")); 00140 00141 static cl::opt<bool> 00142 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, 00143 cl::desc("Don't try to vectorize getelementptr instructions")); 00144 00145 static cl::opt<bool> 00146 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, 00147 cl::desc("Don't try to vectorize loads and stores")); 00148 00149 static cl::opt<bool> 00150 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, 00151 cl::desc("Only generate aligned loads and stores")); 00152 00153 static cl::opt<bool> 00154 NoMemOpBoost("bb-vectorize-no-mem-op-boost", 00155 cl::init(false), cl::Hidden, 00156 cl::desc("Don't boost the chain-depth contribution of loads and stores")); 00157 00158 static cl::opt<bool> 00159 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, 00160 cl::desc("Use a fast instruction dependency analysis")); 00161 00162 #ifndef NDEBUG 00163 static cl::opt<bool> 00164 DebugInstructionExamination("bb-vectorize-debug-instruction-examination", 00165 cl::init(false), cl::Hidden, 00166 cl::desc("When debugging is enabled, output information on the" 00167 " instruction-examination process")); 00168 static cl::opt<bool> 00169 DebugCandidateSelection("bb-vectorize-debug-candidate-selection", 00170 cl::init(false), cl::Hidden, 00171 cl::desc("When debugging is enabled, output information on the" 00172 " candidate-selection process")); 00173 static cl::opt<bool> 00174 DebugPairSelection("bb-vectorize-debug-pair-selection", 00175 cl::init(false), cl::Hidden, 00176 cl::desc("When debugging is enabled, output information on the" 00177 " pair-selection process")); 00178 static cl::opt<bool> 00179 DebugCycleCheck("bb-vectorize-debug-cycle-check", 00180 cl::init(false), cl::Hidden, 00181 cl::desc("When debugging is enabled, output information on the" 00182 " cycle-checking process")); 00183 00184 static cl::opt<bool> 00185 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", 00186 cl::init(false), cl::Hidden, 00187 cl::desc("When debugging is enabled, dump the basic block after" 00188 " every pair is fused")); 00189 #endif 00190 00191 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); 00192 00193 namespace { 00194 struct BBVectorize : public BasicBlockPass { 00195 static char ID; // Pass identification, replacement for typeid 00196 00197 const VectorizeConfig Config; 00198 00199 BBVectorize(const VectorizeConfig &C = VectorizeConfig()) 00200 : BasicBlockPass(ID), Config(C) { 00201 initializeBBVectorizePass(*PassRegistry::getPassRegistry()); 00202 } 00203 00204 BBVectorize(Pass *P, const VectorizeConfig &C) 00205 : BasicBlockPass(ID), Config(C) { 00206 AA = &P->getAnalysis<AliasAnalysis>(); 00207 DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 00208 SE = &P->getAnalysis<ScalarEvolution>(); 00209 DataLayoutPass *DLP = P->getAnalysisIfAvailable<DataLayoutPass>(); 00210 DL = DLP ? &DLP->getDataLayout() : nullptr; 00211 TTI = IgnoreTargetInfo ? nullptr : &P->getAnalysis<TargetTransformInfo>(); 00212 } 00213 00214 typedef std::pair<Value *, Value *> ValuePair; 00215 typedef std::pair<ValuePair, int> ValuePairWithCost; 00216 typedef std::pair<ValuePair, size_t> ValuePairWithDepth; 00217 typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair 00218 typedef std::pair<VPPair, unsigned> VPPairWithType; 00219 00220 AliasAnalysis *AA; 00221 DominatorTree *DT; 00222 ScalarEvolution *SE; 00223 const DataLayout *DL; 00224 const TargetTransformInfo *TTI; 00225 00226 // FIXME: const correct? 00227 00228 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); 00229 00230 bool getCandidatePairs(BasicBlock &BB, 00231 BasicBlock::iterator &Start, 00232 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00233 DenseSet<ValuePair> &FixedOrderPairs, 00234 DenseMap<ValuePair, int> &CandidatePairCostSavings, 00235 std::vector<Value *> &PairableInsts, bool NonPow2Len); 00236 00237 // FIXME: The current implementation does not account for pairs that 00238 // are connected in multiple ways. For example: 00239 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) 00240 enum PairConnectionType { 00241 PairConnectionDirect, 00242 PairConnectionSwap, 00243 PairConnectionSplat 00244 }; 00245 00246 void computeConnectedPairs( 00247 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00248 DenseSet<ValuePair> &CandidatePairsSet, 00249 std::vector<Value *> &PairableInsts, 00250 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00251 DenseMap<VPPair, unsigned> &PairConnectionTypes); 00252 00253 void buildDepMap(BasicBlock &BB, 00254 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00255 std::vector<Value *> &PairableInsts, 00256 DenseSet<ValuePair> &PairableInstUsers); 00257 00258 void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00259 DenseSet<ValuePair> &CandidatePairsSet, 00260 DenseMap<ValuePair, int> &CandidatePairCostSavings, 00261 std::vector<Value *> &PairableInsts, 00262 DenseSet<ValuePair> &FixedOrderPairs, 00263 DenseMap<VPPair, unsigned> &PairConnectionTypes, 00264 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00265 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 00266 DenseSet<ValuePair> &PairableInstUsers, 00267 DenseMap<Value *, Value *>& ChosenPairs); 00268 00269 void fuseChosenPairs(BasicBlock &BB, 00270 std::vector<Value *> &PairableInsts, 00271 DenseMap<Value *, Value *>& ChosenPairs, 00272 DenseSet<ValuePair> &FixedOrderPairs, 00273 DenseMap<VPPair, unsigned> &PairConnectionTypes, 00274 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00275 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); 00276 00277 00278 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); 00279 00280 bool areInstsCompatible(Instruction *I, Instruction *J, 00281 bool IsSimpleLoadStore, bool NonPow2Len, 00282 int &CostSavings, int &FixedOrder); 00283 00284 bool trackUsesOfI(DenseSet<Value *> &Users, 00285 AliasSetTracker &WriteSet, Instruction *I, 00286 Instruction *J, bool UpdateUsers = true, 00287 DenseSet<ValuePair> *LoadMoveSetPairs = nullptr); 00288 00289 void computePairsConnectedTo( 00290 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00291 DenseSet<ValuePair> &CandidatePairsSet, 00292 std::vector<Value *> &PairableInsts, 00293 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00294 DenseMap<VPPair, unsigned> &PairConnectionTypes, 00295 ValuePair P); 00296 00297 bool pairsConflict(ValuePair P, ValuePair Q, 00298 DenseSet<ValuePair> &PairableInstUsers, 00299 DenseMap<ValuePair, std::vector<ValuePair> > 00300 *PairableInstUserMap = nullptr, 00301 DenseSet<VPPair> *PairableInstUserPairSet = nullptr); 00302 00303 bool pairWillFormCycle(ValuePair P, 00304 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, 00305 DenseSet<ValuePair> &CurrentPairs); 00306 00307 void pruneDAGFor( 00308 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00309 std::vector<Value *> &PairableInsts, 00310 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00311 DenseSet<ValuePair> &PairableInstUsers, 00312 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 00313 DenseSet<VPPair> &PairableInstUserPairSet, 00314 DenseMap<Value *, Value *> &ChosenPairs, 00315 DenseMap<ValuePair, size_t> &DAG, 00316 DenseSet<ValuePair> &PrunedDAG, ValuePair J, 00317 bool UseCycleCheck); 00318 00319 void buildInitialDAGFor( 00320 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00321 DenseSet<ValuePair> &CandidatePairsSet, 00322 std::vector<Value *> &PairableInsts, 00323 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00324 DenseSet<ValuePair> &PairableInstUsers, 00325 DenseMap<Value *, Value *> &ChosenPairs, 00326 DenseMap<ValuePair, size_t> &DAG, ValuePair J); 00327 00328 void findBestDAGFor( 00329 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 00330 DenseSet<ValuePair> &CandidatePairsSet, 00331 DenseMap<ValuePair, int> &CandidatePairCostSavings, 00332 std::vector<Value *> &PairableInsts, 00333 DenseSet<ValuePair> &FixedOrderPairs, 00334 DenseMap<VPPair, unsigned> &PairConnectionTypes, 00335 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 00336 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 00337 DenseSet<ValuePair> &PairableInstUsers, 00338 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 00339 DenseSet<VPPair> &PairableInstUserPairSet, 00340 DenseMap<Value *, Value *> &ChosenPairs, 00341 DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 00342 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 00343 bool UseCycleCheck); 00344 00345 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, 00346 Instruction *J, unsigned o); 00347 00348 void fillNewShuffleMask(LLVMContext& Context, Instruction *J, 00349 unsigned MaskOffset, unsigned NumInElem, 00350 unsigned NumInElem1, unsigned IdxOffset, 00351 std::vector<Constant*> &Mask); 00352 00353 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, 00354 Instruction *J); 00355 00356 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, 00357 unsigned o, Value *&LOp, unsigned numElemL, 00358 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, 00359 unsigned IdxOff = 0); 00360 00361 Value *getReplacementInput(LLVMContext& Context, Instruction *I, 00362 Instruction *J, unsigned o, bool IBeforeJ); 00363 00364 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, 00365 Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands, 00366 bool IBeforeJ); 00367 00368 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 00369 Instruction *J, Instruction *K, 00370 Instruction *&InsertionPt, Instruction *&K1, 00371 Instruction *&K2); 00372 00373 void collectPairLoadMoveSet(BasicBlock &BB, 00374 DenseMap<Value *, Value *> &ChosenPairs, 00375 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 00376 DenseSet<ValuePair> &LoadMoveSetPairs, 00377 Instruction *I); 00378 00379 void collectLoadMoveSet(BasicBlock &BB, 00380 std::vector<Value *> &PairableInsts, 00381 DenseMap<Value *, Value *> &ChosenPairs, 00382 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 00383 DenseSet<ValuePair> &LoadMoveSetPairs); 00384 00385 bool canMoveUsesOfIAfterJ(BasicBlock &BB, 00386 DenseSet<ValuePair> &LoadMoveSetPairs, 00387 Instruction *I, Instruction *J); 00388 00389 void moveUsesOfIAfterJ(BasicBlock &BB, 00390 DenseSet<ValuePair> &LoadMoveSetPairs, 00391 Instruction *&InsertionPt, 00392 Instruction *I, Instruction *J); 00393 00394 bool vectorizeBB(BasicBlock &BB) { 00395 if (skipOptnoneFunction(BB)) 00396 return false; 00397 if (!DT->isReachableFromEntry(&BB)) { 00398 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() << 00399 " in " << BB.getParent()->getName() << "\n"); 00400 return false; 00401 } 00402 00403 DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); 00404 00405 bool changed = false; 00406 // Iterate a sufficient number of times to merge types of size 1 bit, 00407 // then 2 bits, then 4, etc. up to half of the target vector width of the 00408 // target vector register. 00409 unsigned n = 1; 00410 for (unsigned v = 2; 00411 (TTI || v <= Config.VectorBits) && 00412 (!Config.MaxIter || n <= Config.MaxIter); 00413 v *= 2, ++n) { 00414 DEBUG(dbgs() << "BBV: fusing loop #" << n << 00415 " for " << BB.getName() << " in " << 00416 BB.getParent()->getName() << "...\n"); 00417 if (vectorizePairs(BB)) 00418 changed = true; 00419 else 00420 break; 00421 } 00422 00423 if (changed && !Pow2LenOnly) { 00424 ++n; 00425 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { 00426 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << 00427 n << " for " << BB.getName() << " in " << 00428 BB.getParent()->getName() << "...\n"); 00429 if (!vectorizePairs(BB, true)) break; 00430 } 00431 } 00432 00433 DEBUG(dbgs() << "BBV: done!\n"); 00434 return changed; 00435 } 00436 00437 bool runOnBasicBlock(BasicBlock &BB) override { 00438 // OptimizeNone check deferred to vectorizeBB(). 00439 00440 AA = &getAnalysis<AliasAnalysis>(); 00441 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 00442 SE = &getAnalysis<ScalarEvolution>(); 00443 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 00444 DL = DLP ? &DLP->getDataLayout() : nullptr; 00445 TTI = IgnoreTargetInfo ? nullptr : &getAnalysis<TargetTransformInfo>(); 00446 00447 return vectorizeBB(BB); 00448 } 00449 00450 void getAnalysisUsage(AnalysisUsage &AU) const override { 00451 BasicBlockPass::getAnalysisUsage(AU); 00452 AU.addRequired<AliasAnalysis>(); 00453 AU.addRequired<DominatorTreeWrapperPass>(); 00454 AU.addRequired<ScalarEvolution>(); 00455 AU.addRequired<TargetTransformInfo>(); 00456 AU.addPreserved<AliasAnalysis>(); 00457 AU.addPreserved<DominatorTreeWrapperPass>(); 00458 AU.addPreserved<ScalarEvolution>(); 00459 AU.setPreservesCFG(); 00460 } 00461 00462 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { 00463 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && 00464 "Cannot form vector from incompatible scalar types"); 00465 Type *STy = ElemTy->getScalarType(); 00466 00467 unsigned numElem; 00468 if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { 00469 numElem = VTy->getNumElements(); 00470 } else { 00471 numElem = 1; 00472 } 00473 00474 if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { 00475 numElem += VTy->getNumElements(); 00476 } else { 00477 numElem += 1; 00478 } 00479 00480 return VectorType::get(STy, numElem); 00481 } 00482 00483 static inline void getInstructionTypes(Instruction *I, 00484 Type *&T1, Type *&T2) { 00485 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 00486 // For stores, it is the value type, not the pointer type that matters 00487 // because the value is what will come from a vector register. 00488 00489 Value *IVal = SI->getValueOperand(); 00490 T1 = IVal->getType(); 00491 } else { 00492 T1 = I->getType(); 00493 } 00494 00495 if (CastInst *CI = dyn_cast<CastInst>(I)) 00496 T2 = CI->getSrcTy(); 00497 else 00498 T2 = T1; 00499 00500 if (SelectInst *SI = dyn_cast<SelectInst>(I)) { 00501 T2 = SI->getCondition()->getType(); 00502 } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) { 00503 T2 = SI->getOperand(0)->getType(); 00504 } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { 00505 T2 = CI->getOperand(0)->getType(); 00506 } 00507 } 00508 00509 // Returns the weight associated with the provided value. A chain of 00510 // candidate pairs has a length given by the sum of the weights of its 00511 // members (one weight per pair; the weight of each member of the pair 00512 // is assumed to be the same). This length is then compared to the 00513 // chain-length threshold to determine if a given chain is significant 00514 // enough to be vectorized. The length is also used in comparing 00515 // candidate chains where longer chains are considered to be better. 00516 // Note: when this function returns 0, the resulting instructions are 00517 // not actually fused. 00518 inline size_t getDepthFactor(Value *V) { 00519 // InsertElement and ExtractElement have a depth factor of zero. This is 00520 // for two reasons: First, they cannot be usefully fused. Second, because 00521 // the pass generates a lot of these, they can confuse the simple metric 00522 // used to compare the dags in the next iteration. Thus, giving them a 00523 // weight of zero allows the pass to essentially ignore them in 00524 // subsequent iterations when looking for vectorization opportunities 00525 // while still tracking dependency chains that flow through those 00526 // instructions. 00527 if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) 00528 return 0; 00529 00530 // Give a load or store half of the required depth so that load/store 00531 // pairs will vectorize. 00532 if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V))) 00533 return Config.ReqChainDepth/2; 00534 00535 return 1; 00536 } 00537 00538 // Returns the cost of the provided instruction using TTI. 00539 // This does not handle loads and stores. 00540 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2, 00541 TargetTransformInfo::OperandValueKind Op1VK = 00542 TargetTransformInfo::OK_AnyValue, 00543 TargetTransformInfo::OperandValueKind Op2VK = 00544 TargetTransformInfo::OK_AnyValue) { 00545 switch (Opcode) { 00546 default: break; 00547 case Instruction::GetElementPtr: 00548 // We mark this instruction as zero-cost because scalar GEPs are usually 00549 // lowered to the instruction addressing mode. At the moment we don't 00550 // generate vector GEPs. 00551 return 0; 00552 case Instruction::Br: 00553 return TTI->getCFInstrCost(Opcode); 00554 case Instruction::PHI: 00555 return 0; 00556 case Instruction::Add: 00557 case Instruction::FAdd: 00558 case Instruction::Sub: 00559 case Instruction::FSub: 00560 case Instruction::Mul: 00561 case Instruction::FMul: 00562 case Instruction::UDiv: 00563 case Instruction::SDiv: 00564 case Instruction::FDiv: 00565 case Instruction::URem: 00566 case Instruction::SRem: 00567 case Instruction::FRem: 00568 case Instruction::Shl: 00569 case Instruction::LShr: 00570 case Instruction::AShr: 00571 case Instruction::And: 00572 case Instruction::Or: 00573 case Instruction::Xor: 00574 return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK); 00575 case Instruction::Select: 00576 case Instruction::ICmp: 00577 case Instruction::FCmp: 00578 return TTI->getCmpSelInstrCost(Opcode, T1, T2); 00579 case Instruction::ZExt: 00580 case Instruction::SExt: 00581 case Instruction::FPToUI: 00582 case Instruction::FPToSI: 00583 case Instruction::FPExt: 00584 case Instruction::PtrToInt: 00585 case Instruction::IntToPtr: 00586 case Instruction::SIToFP: 00587 case Instruction::UIToFP: 00588 case Instruction::Trunc: 00589 case Instruction::FPTrunc: 00590 case Instruction::BitCast: 00591 case Instruction::ShuffleVector: 00592 return TTI->getCastInstrCost(Opcode, T1, T2); 00593 } 00594 00595 return 1; 00596 } 00597 00598 // This determines the relative offset of two loads or stores, returning 00599 // true if the offset could be determined to be some constant value. 00600 // For example, if OffsetInElmts == 1, then J accesses the memory directly 00601 // after I; if OffsetInElmts == -1 then I accesses the memory 00602 // directly after J. 00603 bool getPairPtrInfo(Instruction *I, Instruction *J, 00604 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, 00605 unsigned &IAddressSpace, unsigned &JAddressSpace, 00606 int64_t &OffsetInElmts, bool ComputeOffset = true) { 00607 OffsetInElmts = 0; 00608 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 00609 LoadInst *LJ = cast<LoadInst>(J); 00610 IPtr = LI->getPointerOperand(); 00611 JPtr = LJ->getPointerOperand(); 00612 IAlignment = LI->getAlignment(); 00613 JAlignment = LJ->getAlignment(); 00614 IAddressSpace = LI->getPointerAddressSpace(); 00615 JAddressSpace = LJ->getPointerAddressSpace(); 00616 } else { 00617 StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); 00618 IPtr = SI->getPointerOperand(); 00619 JPtr = SJ->getPointerOperand(); 00620 IAlignment = SI->getAlignment(); 00621 JAlignment = SJ->getAlignment(); 00622 IAddressSpace = SI->getPointerAddressSpace(); 00623 JAddressSpace = SJ->getPointerAddressSpace(); 00624 } 00625 00626 if (!ComputeOffset) 00627 return true; 00628 00629 const SCEV *IPtrSCEV = SE->getSCEV(IPtr); 00630 const SCEV *JPtrSCEV = SE->getSCEV(JPtr); 00631 00632 // If this is a trivial offset, then we'll get something like 00633 // 1*sizeof(type). With target data, which we need anyway, this will get 00634 // constant folded into a number. 00635 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); 00636 if (const SCEVConstant *ConstOffSCEV = 00637 dyn_cast<SCEVConstant>(OffsetSCEV)) { 00638 ConstantInt *IntOff = ConstOffSCEV->getValue(); 00639 int64_t Offset = IntOff->getSExtValue(); 00640 00641 Type *VTy = IPtr->getType()->getPointerElementType(); 00642 int64_t VTyTSS = (int64_t) DL->getTypeStoreSize(VTy); 00643 00644 Type *VTy2 = JPtr->getType()->getPointerElementType(); 00645 if (VTy != VTy2 && Offset < 0) { 00646 int64_t VTy2TSS = (int64_t) DL->getTypeStoreSize(VTy2); 00647 OffsetInElmts = Offset/VTy2TSS; 00648 return (abs64(Offset) % VTy2TSS) == 0; 00649 } 00650 00651 OffsetInElmts = Offset/VTyTSS; 00652 return (abs64(Offset) % VTyTSS) == 0; 00653 } 00654 00655 return false; 00656 } 00657 00658 // Returns true if the provided CallInst represents an intrinsic that can 00659 // be vectorized. 00660 bool isVectorizableIntrinsic(CallInst* I) { 00661 Function *F = I->getCalledFunction(); 00662 if (!F) return false; 00663 00664 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 00665 if (!IID) return false; 00666 00667 switch(IID) { 00668 default: 00669 return false; 00670 case Intrinsic::sqrt: 00671 case Intrinsic::powi: 00672 case Intrinsic::sin: 00673 case Intrinsic::cos: 00674 case Intrinsic::log: 00675 case Intrinsic::log2: 00676 case Intrinsic::log10: 00677 case Intrinsic::exp: 00678 case Intrinsic::exp2: 00679 case Intrinsic::pow: 00680 case Intrinsic::round: 00681 case Intrinsic::copysign: 00682 case Intrinsic::ceil: 00683 case Intrinsic::nearbyint: 00684 case Intrinsic::rint: 00685 case Intrinsic::trunc: 00686 case Intrinsic::floor: 00687 case Intrinsic::fabs: 00688 return Config.VectorizeMath; 00689 case Intrinsic::bswap: 00690 case Intrinsic::ctpop: 00691 case Intrinsic::ctlz: 00692 case Intrinsic::cttz: 00693 return Config.VectorizeBitManipulations; 00694 case Intrinsic::fma: 00695 case Intrinsic::fmuladd: 00696 return Config.VectorizeFMA; 00697 } 00698 } 00699 00700 bool isPureIEChain(InsertElementInst *IE) { 00701 InsertElementInst *IENext = IE; 00702 do { 00703 if (!isa<UndefValue>(IENext->getOperand(0)) && 00704 !isa<InsertElementInst>(IENext->getOperand(0))) { 00705 return false; 00706 } 00707 } while ((IENext = 00708 dyn_cast<InsertElementInst>(IENext->getOperand(0)))); 00709 00710 return true; 00711 } 00712 }; 00713 00714 // This function implements one vectorization iteration on the provided 00715 // basic block. It returns true if the block is changed. 00716 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { 00717 bool ShouldContinue; 00718 BasicBlock::iterator Start = BB.getFirstInsertionPt(); 00719 00720 std::vector<Value *> AllPairableInsts; 00721 DenseMap<Value *, Value *> AllChosenPairs; 00722 DenseSet<ValuePair> AllFixedOrderPairs; 00723 DenseMap<VPPair, unsigned> AllPairConnectionTypes; 00724 DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, 00725 AllConnectedPairDeps; 00726 00727 do { 00728 std::vector<Value *> PairableInsts; 00729 DenseMap<Value *, std::vector<Value *> > CandidatePairs; 00730 DenseSet<ValuePair> FixedOrderPairs; 00731 DenseMap<ValuePair, int> CandidatePairCostSavings; 00732 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, 00733 FixedOrderPairs, 00734 CandidatePairCostSavings, 00735 PairableInsts, NonPow2Len); 00736 if (PairableInsts.empty()) continue; 00737 00738 // Build the candidate pair set for faster lookups. 00739 DenseSet<ValuePair> CandidatePairsSet; 00740 for (DenseMap<Value *, std::vector<Value *> >::iterator I = 00741 CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) 00742 for (std::vector<Value *>::iterator J = I->second.begin(), 00743 JE = I->second.end(); J != JE; ++J) 00744 CandidatePairsSet.insert(ValuePair(I->first, *J)); 00745 00746 // Now we have a map of all of the pairable instructions and we need to 00747 // select the best possible pairing. A good pairing is one such that the 00748 // users of the pair are also paired. This defines a (directed) forest 00749 // over the pairs such that two pairs are connected iff the second pair 00750 // uses the first. 00751 00752 // Note that it only matters that both members of the second pair use some 00753 // element of the first pair (to allow for splatting). 00754 00755 DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, 00756 ConnectedPairDeps; 00757 DenseMap<VPPair, unsigned> PairConnectionTypes; 00758 computeConnectedPairs(CandidatePairs, CandidatePairsSet, 00759 PairableInsts, ConnectedPairs, PairConnectionTypes); 00760 if (ConnectedPairs.empty()) continue; 00761 00762 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 00763 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 00764 I != IE; ++I) 00765 for (std::vector<ValuePair>::iterator J = I->second.begin(), 00766 JE = I->second.end(); J != JE; ++J) 00767 ConnectedPairDeps[*J].push_back(I->first); 00768 00769 // Build the pairable-instruction dependency map 00770 DenseSet<ValuePair> PairableInstUsers; 00771 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); 00772 00773 // There is now a graph of the connected pairs. For each variable, pick 00774 // the pairing with the largest dag meeting the depth requirement on at 00775 // least one branch. Then select all pairings that are part of that dag 00776 // and remove them from the list of available pairings and pairable 00777 // variables. 00778 00779 DenseMap<Value *, Value *> ChosenPairs; 00780 choosePairs(CandidatePairs, CandidatePairsSet, 00781 CandidatePairCostSavings, 00782 PairableInsts, FixedOrderPairs, PairConnectionTypes, 00783 ConnectedPairs, ConnectedPairDeps, 00784 PairableInstUsers, ChosenPairs); 00785 00786 if (ChosenPairs.empty()) continue; 00787 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), 00788 PairableInsts.end()); 00789 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); 00790 00791 // Only for the chosen pairs, propagate information on fixed-order pairs, 00792 // pair connections, and their types to the data structures used by the 00793 // pair fusion procedures. 00794 for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), 00795 IE = ChosenPairs.end(); I != IE; ++I) { 00796 if (FixedOrderPairs.count(*I)) 00797 AllFixedOrderPairs.insert(*I); 00798 else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) 00799 AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); 00800 00801 for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); 00802 J != IE; ++J) { 00803 DenseMap<VPPair, unsigned>::iterator K = 00804 PairConnectionTypes.find(VPPair(*I, *J)); 00805 if (K != PairConnectionTypes.end()) { 00806 AllPairConnectionTypes.insert(*K); 00807 } else { 00808 K = PairConnectionTypes.find(VPPair(*J, *I)); 00809 if (K != PairConnectionTypes.end()) 00810 AllPairConnectionTypes.insert(*K); 00811 } 00812 } 00813 } 00814 00815 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator 00816 I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); 00817 I != IE; ++I) 00818 for (std::vector<ValuePair>::iterator J = I->second.begin(), 00819 JE = I->second.end(); J != JE; ++J) 00820 if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { 00821 AllConnectedPairs[I->first].push_back(*J); 00822 AllConnectedPairDeps[*J].push_back(I->first); 00823 } 00824 } while (ShouldContinue); 00825 00826 if (AllChosenPairs.empty()) return false; 00827 NumFusedOps += AllChosenPairs.size(); 00828 00829 // A set of pairs has now been selected. It is now necessary to replace the 00830 // paired instructions with vector instructions. For this procedure each 00831 // operand must be replaced with a vector operand. This vector is formed 00832 // by using build_vector on the old operands. The replaced values are then 00833 // replaced with a vector_extract on the result. Subsequent optimization 00834 // passes should coalesce the build/extract combinations. 00835 00836 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, 00837 AllPairConnectionTypes, 00838 AllConnectedPairs, AllConnectedPairDeps); 00839 00840 // It is important to cleanup here so that future iterations of this 00841 // function have less work to do. 00842 (void) SimplifyInstructionsInBlock(&BB, DL, AA->getTargetLibraryInfo()); 00843 return true; 00844 } 00845 00846 // This function returns true if the provided instruction is capable of being 00847 // fused into a vector instruction. This determination is based only on the 00848 // type and other attributes of the instruction. 00849 bool BBVectorize::isInstVectorizable(Instruction *I, 00850 bool &IsSimpleLoadStore) { 00851 IsSimpleLoadStore = false; 00852 00853 if (CallInst *C = dyn_cast<CallInst>(I)) { 00854 if (!isVectorizableIntrinsic(C)) 00855 return false; 00856 } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { 00857 // Vectorize simple loads if possbile: 00858 IsSimpleLoadStore = L->isSimple(); 00859 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 00860 return false; 00861 } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { 00862 // Vectorize simple stores if possbile: 00863 IsSimpleLoadStore = S->isSimple(); 00864 if (!IsSimpleLoadStore || !Config.VectorizeMemOps) 00865 return false; 00866 } else if (CastInst *C = dyn_cast<CastInst>(I)) { 00867 // We can vectorize casts, but not casts of pointer types, etc. 00868 if (!Config.VectorizeCasts) 00869 return false; 00870 00871 Type *SrcTy = C->getSrcTy(); 00872 if (!SrcTy->isSingleValueType()) 00873 return false; 00874 00875 Type *DestTy = C->getDestTy(); 00876 if (!DestTy->isSingleValueType()) 00877 return false; 00878 } else if (isa<SelectInst>(I)) { 00879 if (!Config.VectorizeSelect) 00880 return false; 00881 } else if (isa<CmpInst>(I)) { 00882 if (!Config.VectorizeCmp) 00883 return false; 00884 } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { 00885 if (!Config.VectorizeGEP) 00886 return false; 00887 00888 // Currently, vector GEPs exist only with one index. 00889 if (G->getNumIndices() != 1) 00890 return false; 00891 } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || 00892 isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { 00893 return false; 00894 } 00895 00896 // We can't vectorize memory operations without target data 00897 if (!DL && IsSimpleLoadStore) 00898 return false; 00899 00900 Type *T1, *T2; 00901 getInstructionTypes(I, T1, T2); 00902 00903 // Not every type can be vectorized... 00904 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || 00905 !(VectorType::isValidElementType(T2) || T2->isVectorTy())) 00906 return false; 00907 00908 if (T1->getScalarSizeInBits() == 1) { 00909 if (!Config.VectorizeBools) 00910 return false; 00911 } else { 00912 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) 00913 return false; 00914 } 00915 00916 if (T2->getScalarSizeInBits() == 1) { 00917 if (!Config.VectorizeBools) 00918 return false; 00919 } else { 00920 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) 00921 return false; 00922 } 00923 00924 if (!Config.VectorizeFloats 00925 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) 00926 return false; 00927 00928 // Don't vectorize target-specific types. 00929 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) 00930 return false; 00931 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) 00932 return false; 00933 00934 if ((!Config.VectorizePointers || !DL) && 00935 (T1->getScalarType()->isPointerTy() || 00936 T2->getScalarType()->isPointerTy())) 00937 return false; 00938 00939 if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || 00940 T2->getPrimitiveSizeInBits() >= Config.VectorBits)) 00941 return false; 00942 00943 return true; 00944 } 00945 00946 // This function returns true if the two provided instructions are compatible 00947 // (meaning that they can be fused into a vector instruction). This assumes 00948 // that I has already been determined to be vectorizable and that J is not 00949 // in the use dag of I. 00950 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, 00951 bool IsSimpleLoadStore, bool NonPow2Len, 00952 int &CostSavings, int &FixedOrder) { 00953 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << 00954 " <-> " << *J << "\n"); 00955 00956 CostSavings = 0; 00957 FixedOrder = 0; 00958 00959 // Loads and stores can be merged if they have different alignments, 00960 // but are otherwise the same. 00961 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | 00962 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) 00963 return false; 00964 00965 Type *IT1, *IT2, *JT1, *JT2; 00966 getInstructionTypes(I, IT1, IT2); 00967 getInstructionTypes(J, JT1, JT2); 00968 unsigned MaxTypeBits = std::max( 00969 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), 00970 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); 00971 if (!TTI && MaxTypeBits > Config.VectorBits) 00972 return false; 00973 00974 // FIXME: handle addsub-type operations! 00975 00976 if (IsSimpleLoadStore) { 00977 Value *IPtr, *JPtr; 00978 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 00979 int64_t OffsetInElmts = 0; 00980 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 00981 IAddressSpace, JAddressSpace, 00982 OffsetInElmts) && abs64(OffsetInElmts) == 1) { 00983 FixedOrder = (int) OffsetInElmts; 00984 unsigned BottomAlignment = IAlignment; 00985 if (OffsetInElmts < 0) BottomAlignment = JAlignment; 00986 00987 Type *aTypeI = isa<StoreInst>(I) ? 00988 cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); 00989 Type *aTypeJ = isa<StoreInst>(J) ? 00990 cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); 00991 Type *VType = getVecTypeForPair(aTypeI, aTypeJ); 00992 00993 if (Config.AlignedOnly) { 00994 // An aligned load or store is possible only if the instruction 00995 // with the lower offset has an alignment suitable for the 00996 // vector type. 00997 00998 unsigned VecAlignment = DL->getPrefTypeAlignment(VType); 00999 if (BottomAlignment < VecAlignment) 01000 return false; 01001 } 01002 01003 if (TTI) { 01004 unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, 01005 IAlignment, IAddressSpace); 01006 unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, 01007 JAlignment, JAddressSpace); 01008 unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, 01009 BottomAlignment, 01010 IAddressSpace); 01011 01012 ICost += TTI->getAddressComputationCost(aTypeI); 01013 JCost += TTI->getAddressComputationCost(aTypeJ); 01014 VCost += TTI->getAddressComputationCost(VType); 01015 01016 if (VCost > ICost + JCost) 01017 return false; 01018 01019 // We don't want to fuse to a type that will be split, even 01020 // if the two input types will also be split and there is no other 01021 // associated cost. 01022 unsigned VParts = TTI->getNumberOfParts(VType); 01023 if (VParts > 1) 01024 return false; 01025 else if (!VParts && VCost == ICost + JCost) 01026 return false; 01027 01028 CostSavings = ICost + JCost - VCost; 01029 } 01030 } else { 01031 return false; 01032 } 01033 } else if (TTI) { 01034 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); 01035 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); 01036 Type *VT1 = getVecTypeForPair(IT1, JT1), 01037 *VT2 = getVecTypeForPair(IT2, JT2); 01038 TargetTransformInfo::OperandValueKind Op1VK = 01039 TargetTransformInfo::OK_AnyValue; 01040 TargetTransformInfo::OperandValueKind Op2VK = 01041 TargetTransformInfo::OK_AnyValue; 01042 01043 // On some targets (example X86) the cost of a vector shift may vary 01044 // depending on whether the second operand is a Uniform or 01045 // NonUniform Constant. 01046 switch (I->getOpcode()) { 01047 default : break; 01048 case Instruction::Shl: 01049 case Instruction::LShr: 01050 case Instruction::AShr: 01051 01052 // If both I and J are scalar shifts by constant, then the 01053 // merged vector shift count would be either a constant splat value 01054 // or a non-uniform vector of constants. 01055 if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) { 01056 if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1))) 01057 Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue : 01058 TargetTransformInfo::OK_NonUniformConstantValue; 01059 } else { 01060 // Check for a splat of a constant or for a non uniform vector 01061 // of constants. 01062 Value *IOp = I->getOperand(1); 01063 Value *JOp = J->getOperand(1); 01064 if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) && 01065 (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) { 01066 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 01067 Constant *SplatValue = cast<Constant>(IOp)->getSplatValue(); 01068 if (SplatValue != nullptr && 01069 SplatValue == cast<Constant>(JOp)->getSplatValue()) 01070 Op2VK = TargetTransformInfo::OK_UniformConstantValue; 01071 } 01072 } 01073 } 01074 01075 // Note that this procedure is incorrect for insert and extract element 01076 // instructions (because combining these often results in a shuffle), 01077 // but this cost is ignored (because insert and extract element 01078 // instructions are assigned a zero depth factor and are not really 01079 // fused in general). 01080 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK); 01081 01082 if (VCost > ICost + JCost) 01083 return false; 01084 01085 // We don't want to fuse to a type that will be split, even 01086 // if the two input types will also be split and there is no other 01087 // associated cost. 01088 unsigned VParts1 = TTI->getNumberOfParts(VT1), 01089 VParts2 = TTI->getNumberOfParts(VT2); 01090 if (VParts1 > 1 || VParts2 > 1) 01091 return false; 01092 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) 01093 return false; 01094 01095 CostSavings = ICost + JCost - VCost; 01096 } 01097 01098 // The powi,ctlz,cttz intrinsics are special because only the first 01099 // argument is vectorized, the second arguments must be equal. 01100 CallInst *CI = dyn_cast<CallInst>(I); 01101 Function *FI; 01102 if (CI && (FI = CI->getCalledFunction())) { 01103 Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); 01104 if (IID == Intrinsic::powi || IID == Intrinsic::ctlz || 01105 IID == Intrinsic::cttz) { 01106 Value *A1I = CI->getArgOperand(1), 01107 *A1J = cast<CallInst>(J)->getArgOperand(1); 01108 const SCEV *A1ISCEV = SE->getSCEV(A1I), 01109 *A1JSCEV = SE->getSCEV(A1J); 01110 return (A1ISCEV == A1JSCEV); 01111 } 01112 01113 if (IID && TTI) { 01114 SmallVector<Type*, 4> Tys; 01115 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) 01116 Tys.push_back(CI->getArgOperand(i)->getType()); 01117 unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); 01118 01119 Tys.clear(); 01120 CallInst *CJ = cast<CallInst>(J); 01121 for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) 01122 Tys.push_back(CJ->getArgOperand(i)->getType()); 01123 unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); 01124 01125 Tys.clear(); 01126 assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && 01127 "Intrinsic argument counts differ"); 01128 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 01129 if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || 01130 IID == Intrinsic::cttz) && i == 1) 01131 Tys.push_back(CI->getArgOperand(i)->getType()); 01132 else 01133 Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), 01134 CJ->getArgOperand(i)->getType())); 01135 } 01136 01137 Type *RetTy = getVecTypeForPair(IT1, JT1); 01138 unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); 01139 01140 if (VCost > ICost + JCost) 01141 return false; 01142 01143 // We don't want to fuse to a type that will be split, even 01144 // if the two input types will also be split and there is no other 01145 // associated cost. 01146 unsigned RetParts = TTI->getNumberOfParts(RetTy); 01147 if (RetParts > 1) 01148 return false; 01149 else if (!RetParts && VCost == ICost + JCost) 01150 return false; 01151 01152 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { 01153 if (!Tys[i]->isVectorTy()) 01154 continue; 01155 01156 unsigned NumParts = TTI->getNumberOfParts(Tys[i]); 01157 if (NumParts > 1) 01158 return false; 01159 else if (!NumParts && VCost == ICost + JCost) 01160 return false; 01161 } 01162 01163 CostSavings = ICost + JCost - VCost; 01164 } 01165 } 01166 01167 return true; 01168 } 01169 01170 // Figure out whether or not J uses I and update the users and write-set 01171 // structures associated with I. Specifically, Users represents the set of 01172 // instructions that depend on I. WriteSet represents the set 01173 // of memory locations that are dependent on I. If UpdateUsers is true, 01174 // and J uses I, then Users is updated to contain J and WriteSet is updated 01175 // to contain any memory locations to which J writes. The function returns 01176 // true if J uses I. By default, alias analysis is used to determine 01177 // whether J reads from memory that overlaps with a location in WriteSet. 01178 // If LoadMoveSet is not null, then it is a previously-computed map 01179 // where the key is the memory-based user instruction and the value is 01180 // the instruction to be compared with I. So, if LoadMoveSet is provided, 01181 // then the alias analysis is not used. This is necessary because this 01182 // function is called during the process of moving instructions during 01183 // vectorization and the results of the alias analysis are not stable during 01184 // that process. 01185 bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, 01186 AliasSetTracker &WriteSet, Instruction *I, 01187 Instruction *J, bool UpdateUsers, 01188 DenseSet<ValuePair> *LoadMoveSetPairs) { 01189 bool UsesI = false; 01190 01191 // This instruction may already be marked as a user due, for example, to 01192 // being a member of a selected pair. 01193 if (Users.count(J)) 01194 UsesI = true; 01195 01196 if (!UsesI) 01197 for (User::op_iterator JU = J->op_begin(), JE = J->op_end(); 01198 JU != JE; ++JU) { 01199 Value *V = *JU; 01200 if (I == V || Users.count(V)) { 01201 UsesI = true; 01202 break; 01203 } 01204 } 01205 if (!UsesI && J->mayReadFromMemory()) { 01206 if (LoadMoveSetPairs) { 01207 UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); 01208 } else { 01209 for (AliasSetTracker::iterator W = WriteSet.begin(), 01210 WE = WriteSet.end(); W != WE; ++W) { 01211 if (W->aliasesUnknownInst(J, *AA)) { 01212 UsesI = true; 01213 break; 01214 } 01215 } 01216 } 01217 } 01218 01219 if (UsesI && UpdateUsers) { 01220 if (J->mayWriteToMemory()) WriteSet.add(J); 01221 Users.insert(J); 01222 } 01223 01224 return UsesI; 01225 } 01226 01227 // This function iterates over all instruction pairs in the provided 01228 // basic block and collects all candidate pairs for vectorization. 01229 bool BBVectorize::getCandidatePairs(BasicBlock &BB, 01230 BasicBlock::iterator &Start, 01231 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01232 DenseSet<ValuePair> &FixedOrderPairs, 01233 DenseMap<ValuePair, int> &CandidatePairCostSavings, 01234 std::vector<Value *> &PairableInsts, bool NonPow2Len) { 01235 size_t TotalPairs = 0; 01236 BasicBlock::iterator E = BB.end(); 01237 if (Start == E) return false; 01238 01239 bool ShouldContinue = false, IAfterStart = false; 01240 for (BasicBlock::iterator I = Start++; I != E; ++I) { 01241 if (I == Start) IAfterStart = true; 01242 01243 bool IsSimpleLoadStore; 01244 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; 01245 01246 // Look for an instruction with which to pair instruction *I... 01247 DenseSet<Value *> Users; 01248 AliasSetTracker WriteSet(*AA); 01249 if (I->mayWriteToMemory()) WriteSet.add(I); 01250 01251 bool JAfterStart = IAfterStart; 01252 BasicBlock::iterator J = std::next(I); 01253 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) { 01254 if (J == Start) JAfterStart = true; 01255 01256 // Determine if J uses I, if so, exit the loop. 01257 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !Config.FastDep); 01258 if (Config.FastDep) { 01259 // Note: For this heuristic to be effective, independent operations 01260 // must tend to be intermixed. This is likely to be true from some 01261 // kinds of grouped loop unrolling (but not the generic LLVM pass), 01262 // but otherwise may require some kind of reordering pass. 01263 01264 // When using fast dependency analysis, 01265 // stop searching after first use: 01266 if (UsesI) break; 01267 } else { 01268 if (UsesI) continue; 01269 } 01270 01271 // J does not use I, and comes before the first use of I, so it can be 01272 // merged with I if the instructions are compatible. 01273 int CostSavings, FixedOrder; 01274 if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, 01275 CostSavings, FixedOrder)) continue; 01276 01277 // J is a candidate for merging with I. 01278 if (!PairableInsts.size() || 01279 PairableInsts[PairableInsts.size()-1] != I) { 01280 PairableInsts.push_back(I); 01281 } 01282 01283 CandidatePairs[I].push_back(J); 01284 ++TotalPairs; 01285 if (TTI) 01286 CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), 01287 CostSavings)); 01288 01289 if (FixedOrder == 1) 01290 FixedOrderPairs.insert(ValuePair(I, J)); 01291 else if (FixedOrder == -1) 01292 FixedOrderPairs.insert(ValuePair(J, I)); 01293 01294 // The next call to this function must start after the last instruction 01295 // selected during this invocation. 01296 if (JAfterStart) { 01297 Start = std::next(J); 01298 IAfterStart = JAfterStart = false; 01299 } 01300 01301 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " 01302 << *I << " <-> " << *J << " (cost savings: " << 01303 CostSavings << ")\n"); 01304 01305 // If we have already found too many pairs, break here and this function 01306 // will be called again starting after the last instruction selected 01307 // during this invocation. 01308 if (PairableInsts.size() >= Config.MaxInsts || 01309 TotalPairs >= Config.MaxPairs) { 01310 ShouldContinue = true; 01311 break; 01312 } 01313 } 01314 01315 if (ShouldContinue) 01316 break; 01317 } 01318 01319 DEBUG(dbgs() << "BBV: found " << PairableInsts.size() 01320 << " instructions with candidate pairs\n"); 01321 01322 return ShouldContinue; 01323 } 01324 01325 // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that 01326 // it looks for pairs such that both members have an input which is an 01327 // output of PI or PJ. 01328 void BBVectorize::computePairsConnectedTo( 01329 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01330 DenseSet<ValuePair> &CandidatePairsSet, 01331 std::vector<Value *> &PairableInsts, 01332 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 01333 DenseMap<VPPair, unsigned> &PairConnectionTypes, 01334 ValuePair P) { 01335 StoreInst *SI, *SJ; 01336 01337 // For each possible pairing for this variable, look at the uses of 01338 // the first value... 01339 for (Value::user_iterator I = P.first->user_begin(), 01340 E = P.first->user_end(); 01341 I != E; ++I) { 01342 User *UI = *I; 01343 if (isa<LoadInst>(UI)) { 01344 // A pair cannot be connected to a load because the load only takes one 01345 // operand (the address) and it is a scalar even after vectorization. 01346 continue; 01347 } else if ((SI = dyn_cast<StoreInst>(UI)) && 01348 P.first == SI->getPointerOperand()) { 01349 // Similarly, a pair cannot be connected to a store through its 01350 // pointer operand. 01351 continue; 01352 } 01353 01354 // For each use of the first variable, look for uses of the second 01355 // variable... 01356 for (User *UJ : P.second->users()) { 01357 if ((SJ = dyn_cast<StoreInst>(UJ)) && 01358 P.second == SJ->getPointerOperand()) 01359 continue; 01360 01361 // Look for <I, J>: 01362 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 01363 VPPair VP(P, ValuePair(UI, UJ)); 01364 ConnectedPairs[VP.first].push_back(VP.second); 01365 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); 01366 } 01367 01368 // Look for <J, I>: 01369 if (CandidatePairsSet.count(ValuePair(UJ, UI))) { 01370 VPPair VP(P, ValuePair(UJ, UI)); 01371 ConnectedPairs[VP.first].push_back(VP.second); 01372 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); 01373 } 01374 } 01375 01376 if (Config.SplatBreaksChain) continue; 01377 // Look for cases where just the first value in the pair is used by 01378 // both members of another pair (splatting). 01379 for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) { 01380 User *UJ = *J; 01381 if ((SJ = dyn_cast<StoreInst>(UJ)) && 01382 P.first == SJ->getPointerOperand()) 01383 continue; 01384 01385 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 01386 VPPair VP(P, ValuePair(UI, UJ)); 01387 ConnectedPairs[VP.first].push_back(VP.second); 01388 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 01389 } 01390 } 01391 } 01392 01393 if (Config.SplatBreaksChain) return; 01394 // Look for cases where just the second value in the pair is used by 01395 // both members of another pair (splatting). 01396 for (Value::user_iterator I = P.second->user_begin(), 01397 E = P.second->user_end(); 01398 I != E; ++I) { 01399 User *UI = *I; 01400 if (isa<LoadInst>(UI)) 01401 continue; 01402 else if ((SI = dyn_cast<StoreInst>(UI)) && 01403 P.second == SI->getPointerOperand()) 01404 continue; 01405 01406 for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) { 01407 User *UJ = *J; 01408 if ((SJ = dyn_cast<StoreInst>(UJ)) && 01409 P.second == SJ->getPointerOperand()) 01410 continue; 01411 01412 if (CandidatePairsSet.count(ValuePair(UI, UJ))) { 01413 VPPair VP(P, ValuePair(UI, UJ)); 01414 ConnectedPairs[VP.first].push_back(VP.second); 01415 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); 01416 } 01417 } 01418 } 01419 } 01420 01421 // This function figures out which pairs are connected. Two pairs are 01422 // connected if some output of the first pair forms an input to both members 01423 // of the second pair. 01424 void BBVectorize::computeConnectedPairs( 01425 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01426 DenseSet<ValuePair> &CandidatePairsSet, 01427 std::vector<Value *> &PairableInsts, 01428 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 01429 DenseMap<VPPair, unsigned> &PairConnectionTypes) { 01430 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 01431 PE = PairableInsts.end(); PI != PE; ++PI) { 01432 DenseMap<Value *, std::vector<Value *> >::iterator PP = 01433 CandidatePairs.find(*PI); 01434 if (PP == CandidatePairs.end()) 01435 continue; 01436 01437 for (std::vector<Value *>::iterator P = PP->second.begin(), 01438 E = PP->second.end(); P != E; ++P) 01439 computePairsConnectedTo(CandidatePairs, CandidatePairsSet, 01440 PairableInsts, ConnectedPairs, 01441 PairConnectionTypes, ValuePair(*PI, *P)); 01442 } 01443 01444 DEBUG(size_t TotalPairs = 0; 01445 for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = 01446 ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I) 01447 TotalPairs += I->second.size(); 01448 dbgs() << "BBV: found " << TotalPairs 01449 << " pair connections.\n"); 01450 } 01451 01452 // This function builds a set of use tuples such that <A, B> is in the set 01453 // if B is in the use dag of A. If B is in the use dag of A, then B 01454 // depends on the output of A. 01455 void BBVectorize::buildDepMap( 01456 BasicBlock &BB, 01457 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01458 std::vector<Value *> &PairableInsts, 01459 DenseSet<ValuePair> &PairableInstUsers) { 01460 DenseSet<Value *> IsInPair; 01461 for (DenseMap<Value *, std::vector<Value *> >::iterator C = 01462 CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) { 01463 IsInPair.insert(C->first); 01464 IsInPair.insert(C->second.begin(), C->second.end()); 01465 } 01466 01467 // Iterate through the basic block, recording all users of each 01468 // pairable instruction. 01469 01470 BasicBlock::iterator E = BB.end(), EL = 01471 BasicBlock::iterator(cast<Instruction>(PairableInsts.back())); 01472 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { 01473 if (IsInPair.find(I) == IsInPair.end()) continue; 01474 01475 DenseSet<Value *> Users; 01476 AliasSetTracker WriteSet(*AA); 01477 if (I->mayWriteToMemory()) WriteSet.add(I); 01478 01479 for (BasicBlock::iterator J = std::next(I); J != E; ++J) { 01480 (void) trackUsesOfI(Users, WriteSet, I, J); 01481 01482 if (J == EL) 01483 break; 01484 } 01485 01486 for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); 01487 U != E; ++U) { 01488 if (IsInPair.find(*U) == IsInPair.end()) continue; 01489 PairableInstUsers.insert(ValuePair(I, *U)); 01490 } 01491 01492 if (I == EL) 01493 break; 01494 } 01495 } 01496 01497 // Returns true if an input to pair P is an output of pair Q and also an 01498 // input of pair Q is an output of pair P. If this is the case, then these 01499 // two pairs cannot be simultaneously fused. 01500 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, 01501 DenseSet<ValuePair> &PairableInstUsers, 01502 DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap, 01503 DenseSet<VPPair> *PairableInstUserPairSet) { 01504 // Two pairs are in conflict if they are mutual Users of eachother. 01505 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || 01506 PairableInstUsers.count(ValuePair(P.first, Q.second)) || 01507 PairableInstUsers.count(ValuePair(P.second, Q.first)) || 01508 PairableInstUsers.count(ValuePair(P.second, Q.second)); 01509 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || 01510 PairableInstUsers.count(ValuePair(Q.first, P.second)) || 01511 PairableInstUsers.count(ValuePair(Q.second, P.first)) || 01512 PairableInstUsers.count(ValuePair(Q.second, P.second)); 01513 if (PairableInstUserMap) { 01514 // FIXME: The expensive part of the cycle check is not so much the cycle 01515 // check itself but this edge insertion procedure. This needs some 01516 // profiling and probably a different data structure. 01517 if (PUsesQ) { 01518 if (PairableInstUserPairSet->insert(VPPair(Q, P)).second) 01519 (*PairableInstUserMap)[Q].push_back(P); 01520 } 01521 if (QUsesP) { 01522 if (PairableInstUserPairSet->insert(VPPair(P, Q)).second) 01523 (*PairableInstUserMap)[P].push_back(Q); 01524 } 01525 } 01526 01527 return (QUsesP && PUsesQ); 01528 } 01529 01530 // This function walks the use graph of current pairs to see if, starting 01531 // from P, the walk returns to P. 01532 bool BBVectorize::pairWillFormCycle(ValuePair P, 01533 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 01534 DenseSet<ValuePair> &CurrentPairs) { 01535 DEBUG(if (DebugCycleCheck) 01536 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " 01537 << *P.second << "\n"); 01538 // A lookup table of visisted pairs is kept because the PairableInstUserMap 01539 // contains non-direct associations. 01540 DenseSet<ValuePair> Visited; 01541 SmallVector<ValuePair, 32> Q; 01542 // General depth-first post-order traversal: 01543 Q.push_back(P); 01544 do { 01545 ValuePair QTop = Q.pop_back_val(); 01546 Visited.insert(QTop); 01547 01548 DEBUG(if (DebugCycleCheck) 01549 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " 01550 << *QTop.second << "\n"); 01551 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 01552 PairableInstUserMap.find(QTop); 01553 if (QQ == PairableInstUserMap.end()) 01554 continue; 01555 01556 for (std::vector<ValuePair>::iterator C = QQ->second.begin(), 01557 CE = QQ->second.end(); C != CE; ++C) { 01558 if (*C == P) { 01559 DEBUG(dbgs() 01560 << "BBV: rejected to prevent non-trivial cycle formation: " 01561 << QTop.first << " <-> " << C->second << "\n"); 01562 return true; 01563 } 01564 01565 if (CurrentPairs.count(*C) && !Visited.count(*C)) 01566 Q.push_back(*C); 01567 } 01568 } while (!Q.empty()); 01569 01570 return false; 01571 } 01572 01573 // This function builds the initial dag of connected pairs with the 01574 // pair J at the root. 01575 void BBVectorize::buildInitialDAGFor( 01576 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01577 DenseSet<ValuePair> &CandidatePairsSet, 01578 std::vector<Value *> &PairableInsts, 01579 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 01580 DenseSet<ValuePair> &PairableInstUsers, 01581 DenseMap<Value *, Value *> &ChosenPairs, 01582 DenseMap<ValuePair, size_t> &DAG, ValuePair J) { 01583 // Each of these pairs is viewed as the root node of a DAG. The DAG 01584 // is then walked (depth-first). As this happens, we keep track of 01585 // the pairs that compose the DAG and the maximum depth of the DAG. 01586 SmallVector<ValuePairWithDepth, 32> Q; 01587 // General depth-first post-order traversal: 01588 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 01589 do { 01590 ValuePairWithDepth QTop = Q.back(); 01591 01592 // Push each child onto the queue: 01593 bool MoreChildren = false; 01594 size_t MaxChildDepth = QTop.second; 01595 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 01596 ConnectedPairs.find(QTop.first); 01597 if (QQ != ConnectedPairs.end()) 01598 for (std::vector<ValuePair>::iterator k = QQ->second.begin(), 01599 ke = QQ->second.end(); k != ke; ++k) { 01600 // Make sure that this child pair is still a candidate: 01601 if (CandidatePairsSet.count(*k)) { 01602 DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k); 01603 if (C == DAG.end()) { 01604 size_t d = getDepthFactor(k->first); 01605 Q.push_back(ValuePairWithDepth(*k, QTop.second+d)); 01606 MoreChildren = true; 01607 } else { 01608 MaxChildDepth = std::max(MaxChildDepth, C->second); 01609 } 01610 } 01611 } 01612 01613 if (!MoreChildren) { 01614 // Record the current pair as part of the DAG: 01615 DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); 01616 Q.pop_back(); 01617 } 01618 } while (!Q.empty()); 01619 } 01620 01621 // Given some initial dag, prune it by removing conflicting pairs (pairs 01622 // that cannot be simultaneously chosen for vectorization). 01623 void BBVectorize::pruneDAGFor( 01624 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01625 std::vector<Value *> &PairableInsts, 01626 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 01627 DenseSet<ValuePair> &PairableInstUsers, 01628 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 01629 DenseSet<VPPair> &PairableInstUserPairSet, 01630 DenseMap<Value *, Value *> &ChosenPairs, 01631 DenseMap<ValuePair, size_t> &DAG, 01632 DenseSet<ValuePair> &PrunedDAG, ValuePair J, 01633 bool UseCycleCheck) { 01634 SmallVector<ValuePairWithDepth, 32> Q; 01635 // General depth-first post-order traversal: 01636 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); 01637 do { 01638 ValuePairWithDepth QTop = Q.pop_back_val(); 01639 PrunedDAG.insert(QTop.first); 01640 01641 // Visit each child, pruning as necessary... 01642 SmallVector<ValuePairWithDepth, 8> BestChildren; 01643 DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ = 01644 ConnectedPairs.find(QTop.first); 01645 if (QQ == ConnectedPairs.end()) 01646 continue; 01647 01648 for (std::vector<ValuePair>::iterator K = QQ->second.begin(), 01649 KE = QQ->second.end(); K != KE; ++K) { 01650 DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K); 01651 if (C == DAG.end()) continue; 01652 01653 // This child is in the DAG, now we need to make sure it is the 01654 // best of any conflicting children. There could be multiple 01655 // conflicting children, so first, determine if we're keeping 01656 // this child, then delete conflicting children as necessary. 01657 01658 // It is also necessary to guard against pairing-induced 01659 // dependencies. Consider instructions a .. x .. y .. b 01660 // such that (a,b) are to be fused and (x,y) are to be fused 01661 // but a is an input to x and b is an output from y. This 01662 // means that y cannot be moved after b but x must be moved 01663 // after b for (a,b) to be fused. In other words, after 01664 // fusing (a,b) we have y .. a/b .. x where y is an input 01665 // to a/b and x is an output to a/b: x and y can no longer 01666 // be legally fused. To prevent this condition, we must 01667 // make sure that a child pair added to the DAG is not 01668 // both an input and output of an already-selected pair. 01669 01670 // Pairing-induced dependencies can also form from more complicated 01671 // cycles. The pair vs. pair conflicts are easy to check, and so 01672 // that is done explicitly for "fast rejection", and because for 01673 // child vs. child conflicts, we may prefer to keep the current 01674 // pair in preference to the already-selected child. 01675 DenseSet<ValuePair> CurrentPairs; 01676 01677 bool CanAdd = true; 01678 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 01679 = BestChildren.begin(), E2 = BestChildren.end(); 01680 C2 != E2; ++C2) { 01681 if (C2->first.first == C->first.first || 01682 C2->first.first == C->first.second || 01683 C2->first.second == C->first.first || 01684 C2->first.second == C->first.second || 01685 pairsConflict(C2->first, C->first, PairableInstUsers, 01686 UseCycleCheck ? &PairableInstUserMap : nullptr, 01687 UseCycleCheck ? &PairableInstUserPairSet 01688 : nullptr)) { 01689 if (C2->second >= C->second) { 01690 CanAdd = false; 01691 break; 01692 } 01693 01694 CurrentPairs.insert(C2->first); 01695 } 01696 } 01697 if (!CanAdd) continue; 01698 01699 // Even worse, this child could conflict with another node already 01700 // selected for the DAG. If that is the case, ignore this child. 01701 for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(), 01702 E2 = PrunedDAG.end(); T != E2; ++T) { 01703 if (T->first == C->first.first || 01704 T->first == C->first.second || 01705 T->second == C->first.first || 01706 T->second == C->first.second || 01707 pairsConflict(*T, C->first, PairableInstUsers, 01708 UseCycleCheck ? &PairableInstUserMap : nullptr, 01709 UseCycleCheck ? &PairableInstUserPairSet 01710 : nullptr)) { 01711 CanAdd = false; 01712 break; 01713 } 01714 01715 CurrentPairs.insert(*T); 01716 } 01717 if (!CanAdd) continue; 01718 01719 // And check the queue too... 01720 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(), 01721 E2 = Q.end(); C2 != E2; ++C2) { 01722 if (C2->first.first == C->first.first || 01723 C2->first.first == C->first.second || 01724 C2->first.second == C->first.first || 01725 C2->first.second == C->first.second || 01726 pairsConflict(C2->first, C->first, PairableInstUsers, 01727 UseCycleCheck ? &PairableInstUserMap : nullptr, 01728 UseCycleCheck ? &PairableInstUserPairSet 01729 : nullptr)) { 01730 CanAdd = false; 01731 break; 01732 } 01733 01734 CurrentPairs.insert(C2->first); 01735 } 01736 if (!CanAdd) continue; 01737 01738 // Last but not least, check for a conflict with any of the 01739 // already-chosen pairs. 01740 for (DenseMap<Value *, Value *>::iterator C2 = 01741 ChosenPairs.begin(), E2 = ChosenPairs.end(); 01742 C2 != E2; ++C2) { 01743 if (pairsConflict(*C2, C->first, PairableInstUsers, 01744 UseCycleCheck ? &PairableInstUserMap : nullptr, 01745 UseCycleCheck ? &PairableInstUserPairSet 01746 : nullptr)) { 01747 CanAdd = false; 01748 break; 01749 } 01750 01751 CurrentPairs.insert(*C2); 01752 } 01753 if (!CanAdd) continue; 01754 01755 // To check for non-trivial cycles formed by the addition of the 01756 // current pair we've formed a list of all relevant pairs, now use a 01757 // graph walk to check for a cycle. We start from the current pair and 01758 // walk the use dag to see if we again reach the current pair. If we 01759 // do, then the current pair is rejected. 01760 01761 // FIXME: It may be more efficient to use a topological-ordering 01762 // algorithm to improve the cycle check. This should be investigated. 01763 if (UseCycleCheck && 01764 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) 01765 continue; 01766 01767 // This child can be added, but we may have chosen it in preference 01768 // to an already-selected child. Check for this here, and if a 01769 // conflict is found, then remove the previously-selected child 01770 // before adding this one in its place. 01771 for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 01772 = BestChildren.begin(); C2 != BestChildren.end();) { 01773 if (C2->first.first == C->first.first || 01774 C2->first.first == C->first.second || 01775 C2->first.second == C->first.first || 01776 C2->first.second == C->first.second || 01777 pairsConflict(C2->first, C->first, PairableInstUsers)) 01778 C2 = BestChildren.erase(C2); 01779 else 01780 ++C2; 01781 } 01782 01783 BestChildren.push_back(ValuePairWithDepth(C->first, C->second)); 01784 } 01785 01786 for (SmallVectorImpl<ValuePairWithDepth>::iterator C 01787 = BestChildren.begin(), E2 = BestChildren.end(); 01788 C != E2; ++C) { 01789 size_t DepthF = getDepthFactor(C->first.first); 01790 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); 01791 } 01792 } while (!Q.empty()); 01793 } 01794 01795 // This function finds the best dag of mututally-compatible connected 01796 // pairs, given the choice of root pairs as an iterator range. 01797 void BBVectorize::findBestDAGFor( 01798 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 01799 DenseSet<ValuePair> &CandidatePairsSet, 01800 DenseMap<ValuePair, int> &CandidatePairCostSavings, 01801 std::vector<Value *> &PairableInsts, 01802 DenseSet<ValuePair> &FixedOrderPairs, 01803 DenseMap<VPPair, unsigned> &PairConnectionTypes, 01804 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 01805 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 01806 DenseSet<ValuePair> &PairableInstUsers, 01807 DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, 01808 DenseSet<VPPair> &PairableInstUserPairSet, 01809 DenseMap<Value *, Value *> &ChosenPairs, 01810 DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, 01811 int &BestEffSize, Value *II, std::vector<Value *>&JJ, 01812 bool UseCycleCheck) { 01813 for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end(); 01814 J != JE; ++J) { 01815 ValuePair IJ(II, *J); 01816 if (!CandidatePairsSet.count(IJ)) 01817 continue; 01818 01819 // Before going any further, make sure that this pair does not 01820 // conflict with any already-selected pairs (see comment below 01821 // near the DAG pruning for more details). 01822 DenseSet<ValuePair> ChosenPairSet; 01823 bool DoesConflict = false; 01824 for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(), 01825 E = ChosenPairs.end(); C != E; ++C) { 01826 if (pairsConflict(*C, IJ, PairableInstUsers, 01827 UseCycleCheck ? &PairableInstUserMap : nullptr, 01828 UseCycleCheck ? &PairableInstUserPairSet : nullptr)) { 01829 DoesConflict = true; 01830 break; 01831 } 01832 01833 ChosenPairSet.insert(*C); 01834 } 01835 if (DoesConflict) continue; 01836 01837 if (UseCycleCheck && 01838 pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet)) 01839 continue; 01840 01841 DenseMap<ValuePair, size_t> DAG; 01842 buildInitialDAGFor(CandidatePairs, CandidatePairsSet, 01843 PairableInsts, ConnectedPairs, 01844 PairableInstUsers, ChosenPairs, DAG, IJ); 01845 01846 // Because we'll keep the child with the largest depth, the largest 01847 // depth is still the same in the unpruned DAG. 01848 size_t MaxDepth = DAG.lookup(IJ); 01849 01850 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" 01851 << *IJ.first << " <-> " << *IJ.second << "} of depth " << 01852 MaxDepth << " and size " << DAG.size() << "\n"); 01853 01854 // At this point the DAG has been constructed, but, may contain 01855 // contradictory children (meaning that different children of 01856 // some dag node may be attempting to fuse the same instruction). 01857 // So now we walk the dag again, in the case of a conflict, 01858 // keep only the child with the largest depth. To break a tie, 01859 // favor the first child. 01860 01861 DenseSet<ValuePair> PrunedDAG; 01862 pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs, 01863 PairableInstUsers, PairableInstUserMap, 01864 PairableInstUserPairSet, 01865 ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck); 01866 01867 int EffSize = 0; 01868 if (TTI) { 01869 DenseSet<Value *> PrunedDAGInstrs; 01870 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 01871 E = PrunedDAG.end(); S != E; ++S) { 01872 PrunedDAGInstrs.insert(S->first); 01873 PrunedDAGInstrs.insert(S->second); 01874 } 01875 01876 // The set of pairs that have already contributed to the total cost. 01877 DenseSet<ValuePair> IncomingPairs; 01878 01879 // If the cost model were perfect, this might not be necessary; but we 01880 // need to make sure that we don't get stuck vectorizing our own 01881 // shuffle chains. 01882 bool HasNontrivialInsts = false; 01883 01884 // The node weights represent the cost savings associated with 01885 // fusing the pair of instructions. 01886 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 01887 E = PrunedDAG.end(); S != E; ++S) { 01888 if (!isa<ShuffleVectorInst>(S->first) && 01889 !isa<InsertElementInst>(S->first) && 01890 !isa<ExtractElementInst>(S->first)) 01891 HasNontrivialInsts = true; 01892 01893 bool FlipOrder = false; 01894 01895 if (getDepthFactor(S->first)) { 01896 int ESContrib = CandidatePairCostSavings.find(*S)->second; 01897 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" 01898 << *S->first << " <-> " << *S->second << "} = " << 01899 ESContrib << "\n"); 01900 EffSize += ESContrib; 01901 } 01902 01903 // The edge weights contribute in a negative sense: they represent 01904 // the cost of shuffles. 01905 DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS = 01906 ConnectedPairDeps.find(*S); 01907 if (SS != ConnectedPairDeps.end()) { 01908 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 01909 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 01910 TE = SS->second.end(); T != TE; ++T) { 01911 VPPair Q(*S, *T); 01912 if (!PrunedDAG.count(Q.second)) 01913 continue; 01914 DenseMap<VPPair, unsigned>::iterator R = 01915 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 01916 assert(R != PairConnectionTypes.end() && 01917 "Cannot find pair connection type"); 01918 if (R->second == PairConnectionDirect) 01919 ++NumDepsDirect; 01920 else if (R->second == PairConnectionSwap) 01921 ++NumDepsSwap; 01922 } 01923 01924 // If there are more swaps than direct connections, then 01925 // the pair order will be flipped during fusion. So the real 01926 // number of swaps is the minimum number. 01927 FlipOrder = !FixedOrderPairs.count(*S) && 01928 ((NumDepsSwap > NumDepsDirect) || 01929 FixedOrderPairs.count(ValuePair(S->second, S->first))); 01930 01931 for (std::vector<ValuePair>::iterator T = SS->second.begin(), 01932 TE = SS->second.end(); T != TE; ++T) { 01933 VPPair Q(*S, *T); 01934 if (!PrunedDAG.count(Q.second)) 01935 continue; 01936 DenseMap<VPPair, unsigned>::iterator R = 01937 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 01938 assert(R != PairConnectionTypes.end() && 01939 "Cannot find pair connection type"); 01940 Type *Ty1 = Q.second.first->getType(), 01941 *Ty2 = Q.second.second->getType(); 01942 Type *VTy = getVecTypeForPair(Ty1, Ty2); 01943 if ((R->second == PairConnectionDirect && FlipOrder) || 01944 (R->second == PairConnectionSwap && !FlipOrder) || 01945 R->second == PairConnectionSplat) { 01946 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 01947 VTy, VTy); 01948 01949 if (VTy->getVectorNumElements() == 2) { 01950 if (R->second == PairConnectionSplat) 01951 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 01952 TargetTransformInfo::SK_Broadcast, VTy)); 01953 else 01954 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 01955 TargetTransformInfo::SK_Reverse, VTy)); 01956 } 01957 01958 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 01959 *Q.second.first << " <-> " << *Q.second.second << 01960 "} -> {" << 01961 *S->first << " <-> " << *S->second << "} = " << 01962 ESContrib << "\n"); 01963 EffSize -= ESContrib; 01964 } 01965 } 01966 } 01967 01968 // Compute the cost of outgoing edges. We assume that edges outgoing 01969 // to shuffles, inserts or extracts can be merged, and so contribute 01970 // no additional cost. 01971 if (!S->first->getType()->isVoidTy()) { 01972 Type *Ty1 = S->first->getType(), 01973 *Ty2 = S->second->getType(); 01974 Type *VTy = getVecTypeForPair(Ty1, Ty2); 01975 01976 bool NeedsExtraction = false; 01977 for (User *U : S->first->users()) { 01978 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { 01979 // Shuffle can be folded if it has no other input 01980 if (isa<UndefValue>(SI->getOperand(1))) 01981 continue; 01982 } 01983 if (isa<ExtractElementInst>(U)) 01984 continue; 01985 if (PrunedDAGInstrs.count(U)) 01986 continue; 01987 NeedsExtraction = true; 01988 break; 01989 } 01990 01991 if (NeedsExtraction) { 01992 int ESContrib; 01993 if (Ty1->isVectorTy()) { 01994 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 01995 Ty1, VTy); 01996 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 01997 TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1)); 01998 } else 01999 ESContrib = (int) TTI->getVectorInstrCost( 02000 Instruction::ExtractElement, VTy, 0); 02001 02002 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 02003 *S->first << "} = " << ESContrib << "\n"); 02004 EffSize -= ESContrib; 02005 } 02006 02007 NeedsExtraction = false; 02008 for (User *U : S->second->users()) { 02009 if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) { 02010 // Shuffle can be folded if it has no other input 02011 if (isa<UndefValue>(SI->getOperand(1))) 02012 continue; 02013 } 02014 if (isa<ExtractElementInst>(U)) 02015 continue; 02016 if (PrunedDAGInstrs.count(U)) 02017 continue; 02018 NeedsExtraction = true; 02019 break; 02020 } 02021 02022 if (NeedsExtraction) { 02023 int ESContrib; 02024 if (Ty2->isVectorTy()) { 02025 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 02026 Ty2, VTy); 02027 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 02028 TargetTransformInfo::SK_ExtractSubvector, VTy, 02029 Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2)); 02030 } else 02031 ESContrib = (int) TTI->getVectorInstrCost( 02032 Instruction::ExtractElement, VTy, 1); 02033 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << 02034 *S->second << "} = " << ESContrib << "\n"); 02035 EffSize -= ESContrib; 02036 } 02037 } 02038 02039 // Compute the cost of incoming edges. 02040 if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { 02041 Instruction *S1 = cast<Instruction>(S->first), 02042 *S2 = cast<Instruction>(S->second); 02043 for (unsigned o = 0; o < S1->getNumOperands(); ++o) { 02044 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); 02045 02046 // Combining constants into vector constants (or small vector 02047 // constants into larger ones are assumed free). 02048 if (isa<Constant>(O1) && isa<Constant>(O2)) 02049 continue; 02050 02051 if (FlipOrder) 02052 std::swap(O1, O2); 02053 02054 ValuePair VP = ValuePair(O1, O2); 02055 ValuePair VPR = ValuePair(O2, O1); 02056 02057 // Internal edges are not handled here. 02058 if (PrunedDAG.count(VP) || PrunedDAG.count(VPR)) 02059 continue; 02060 02061 Type *Ty1 = O1->getType(), 02062 *Ty2 = O2->getType(); 02063 Type *VTy = getVecTypeForPair(Ty1, Ty2); 02064 02065 // Combining vector operations of the same type is also assumed 02066 // folded with other operations. 02067 if (Ty1 == Ty2) { 02068 // If both are insert elements, then both can be widened. 02069 InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1), 02070 *IEO2 = dyn_cast<InsertElementInst>(O2); 02071 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2)) 02072 continue; 02073 // If both are extract elements, and both have the same input 02074 // type, then they can be replaced with a shuffle 02075 ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1), 02076 *EIO2 = dyn_cast<ExtractElementInst>(O2); 02077 if (EIO1 && EIO2 && 02078 EIO1->getOperand(0)->getType() == 02079 EIO2->getOperand(0)->getType()) 02080 continue; 02081 // If both are a shuffle with equal operand types and only two 02082 // unqiue operands, then they can be replaced with a single 02083 // shuffle 02084 ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1), 02085 *SIO2 = dyn_cast<ShuffleVectorInst>(O2); 02086 if (SIO1 && SIO2 && 02087 SIO1->getOperand(0)->getType() == 02088 SIO2->getOperand(0)->getType()) { 02089 SmallSet<Value *, 4> SIOps; 02090 SIOps.insert(SIO1->getOperand(0)); 02091 SIOps.insert(SIO1->getOperand(1)); 02092 SIOps.insert(SIO2->getOperand(0)); 02093 SIOps.insert(SIO2->getOperand(1)); 02094 if (SIOps.size() <= 2) 02095 continue; 02096 } 02097 } 02098 02099 int ESContrib; 02100 // This pair has already been formed. 02101 if (IncomingPairs.count(VP)) { 02102 continue; 02103 } else if (IncomingPairs.count(VPR)) { 02104 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 02105 VTy, VTy); 02106 02107 if (VTy->getVectorNumElements() == 2) 02108 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost( 02109 TargetTransformInfo::SK_Reverse, VTy)); 02110 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { 02111 ESContrib = (int) TTI->getVectorInstrCost( 02112 Instruction::InsertElement, VTy, 0); 02113 ESContrib += (int) TTI->getVectorInstrCost( 02114 Instruction::InsertElement, VTy, 1); 02115 } else if (!Ty1->isVectorTy()) { 02116 // O1 needs to be inserted into a vector of size O2, and then 02117 // both need to be shuffled together. 02118 ESContrib = (int) TTI->getVectorInstrCost( 02119 Instruction::InsertElement, Ty2, 0); 02120 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 02121 VTy, Ty2); 02122 } else if (!Ty2->isVectorTy()) { 02123 // O2 needs to be inserted into a vector of size O1, and then 02124 // both need to be shuffled together. 02125 ESContrib = (int) TTI->getVectorInstrCost( 02126 Instruction::InsertElement, Ty1, 0); 02127 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 02128 VTy, Ty1); 02129 } else { 02130 Type *TyBig = Ty1, *TySmall = Ty2; 02131 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) 02132 std::swap(TyBig, TySmall); 02133 02134 ESContrib = (int) getInstrCost(Instruction::ShuffleVector, 02135 VTy, TyBig); 02136 if (TyBig != TySmall) 02137 ESContrib += (int) getInstrCost(Instruction::ShuffleVector, 02138 TyBig, TySmall); 02139 } 02140 02141 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" 02142 << *O1 << " <-> " << *O2 << "} = " << 02143 ESContrib << "\n"); 02144 EffSize -= ESContrib; 02145 IncomingPairs.insert(VP); 02146 } 02147 } 02148 } 02149 02150 if (!HasNontrivialInsts) { 02151 DEBUG(if (DebugPairSelection) dbgs() << 02152 "\tNo non-trivial instructions in DAG;" 02153 " override to zero effective size\n"); 02154 EffSize = 0; 02155 } 02156 } else { 02157 for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(), 02158 E = PrunedDAG.end(); S != E; ++S) 02159 EffSize += (int) getDepthFactor(S->first); 02160 } 02161 02162 DEBUG(if (DebugPairSelection) 02163 dbgs() << "BBV: found pruned DAG for pair {" 02164 << *IJ.first << " <-> " << *IJ.second << "} of depth " << 02165 MaxDepth << " and size " << PrunedDAG.size() << 02166 " (effective size: " << EffSize << ")\n"); 02167 if (((TTI && !UseChainDepthWithTI) || 02168 MaxDepth >= Config.ReqChainDepth) && 02169 EffSize > 0 && EffSize > BestEffSize) { 02170 BestMaxDepth = MaxDepth; 02171 BestEffSize = EffSize; 02172 BestDAG = PrunedDAG; 02173 } 02174 } 02175 } 02176 02177 // Given the list of candidate pairs, this function selects those 02178 // that will be fused into vector instructions. 02179 void BBVectorize::choosePairs( 02180 DenseMap<Value *, std::vector<Value *> > &CandidatePairs, 02181 DenseSet<ValuePair> &CandidatePairsSet, 02182 DenseMap<ValuePair, int> &CandidatePairCostSavings, 02183 std::vector<Value *> &PairableInsts, 02184 DenseSet<ValuePair> &FixedOrderPairs, 02185 DenseMap<VPPair, unsigned> &PairConnectionTypes, 02186 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 02187 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, 02188 DenseSet<ValuePair> &PairableInstUsers, 02189 DenseMap<Value *, Value *>& ChosenPairs) { 02190 bool UseCycleCheck = 02191 CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck; 02192 02193 DenseMap<Value *, std::vector<Value *> > CandidatePairs2; 02194 for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(), 02195 E = CandidatePairsSet.end(); I != E; ++I) { 02196 std::vector<Value *> &JJ = CandidatePairs2[I->second]; 02197 if (JJ.empty()) JJ.reserve(32); 02198 JJ.push_back(I->first); 02199 } 02200 02201 DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap; 02202 DenseSet<VPPair> PairableInstUserPairSet; 02203 for (std::vector<Value *>::iterator I = PairableInsts.begin(), 02204 E = PairableInsts.end(); I != E; ++I) { 02205 // The number of possible pairings for this variable: 02206 size_t NumChoices = CandidatePairs.lookup(*I).size(); 02207 if (!NumChoices) continue; 02208 02209 std::vector<Value *> &JJ = CandidatePairs[*I]; 02210 02211 // The best pair to choose and its dag: 02212 size_t BestMaxDepth = 0; 02213 int BestEffSize = 0; 02214 DenseSet<ValuePair> BestDAG; 02215 findBestDAGFor(CandidatePairs, CandidatePairsSet, 02216 CandidatePairCostSavings, 02217 PairableInsts, FixedOrderPairs, PairConnectionTypes, 02218 ConnectedPairs, ConnectedPairDeps, 02219 PairableInstUsers, PairableInstUserMap, 02220 PairableInstUserPairSet, ChosenPairs, 02221 BestDAG, BestMaxDepth, BestEffSize, *I, JJ, 02222 UseCycleCheck); 02223 02224 if (BestDAG.empty()) 02225 continue; 02226 02227 // A dag has been chosen (or not) at this point. If no dag was 02228 // chosen, then this instruction, I, cannot be paired (and is no longer 02229 // considered). 02230 02231 DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: " 02232 << *cast<Instruction>(*I) << "\n"); 02233 02234 for (DenseSet<ValuePair>::iterator S = BestDAG.begin(), 02235 SE2 = BestDAG.end(); S != SE2; ++S) { 02236 // Insert the members of this dag into the list of chosen pairs. 02237 ChosenPairs.insert(ValuePair(S->first, S->second)); 02238 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << 02239 *S->second << "\n"); 02240 02241 // Remove all candidate pairs that have values in the chosen dag. 02242 std::vector<Value *> &KK = CandidatePairs[S->first]; 02243 for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end(); 02244 K != KE; ++K) { 02245 if (*K == S->second) 02246 continue; 02247 02248 CandidatePairsSet.erase(ValuePair(S->first, *K)); 02249 } 02250 02251 std::vector<Value *> &LL = CandidatePairs2[S->second]; 02252 for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end(); 02253 L != LE; ++L) { 02254 if (*L == S->first) 02255 continue; 02256 02257 CandidatePairsSet.erase(ValuePair(*L, S->second)); 02258 } 02259 02260 std::vector<Value *> &MM = CandidatePairs[S->second]; 02261 for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end(); 02262 M != ME; ++M) { 02263 assert(*M != S->first && "Flipped pair in candidate list?"); 02264 CandidatePairsSet.erase(ValuePair(S->second, *M)); 02265 } 02266 02267 std::vector<Value *> &NN = CandidatePairs2[S->first]; 02268 for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end(); 02269 N != NE; ++N) { 02270 assert(*N != S->second && "Flipped pair in candidate list?"); 02271 CandidatePairsSet.erase(ValuePair(*N, S->first)); 02272 } 02273 } 02274 } 02275 02276 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); 02277 } 02278 02279 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, 02280 unsigned n = 0) { 02281 if (!I->hasName()) 02282 return ""; 02283 02284 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + 02285 (n > 0 ? "." + utostr(n) : "")).str(); 02286 } 02287 02288 // Returns the value that is to be used as the pointer input to the vector 02289 // instruction that fuses I with J. 02290 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, 02291 Instruction *I, Instruction *J, unsigned o) { 02292 Value *IPtr, *JPtr; 02293 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; 02294 int64_t OffsetInElmts; 02295 02296 // Note: the analysis might fail here, that is why the pair order has 02297 // been precomputed (OffsetInElmts must be unused here). 02298 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, 02299 IAddressSpace, JAddressSpace, 02300 OffsetInElmts, false); 02301 02302 // The pointer value is taken to be the one with the lowest offset. 02303 Value *VPtr = IPtr; 02304 02305 Type *ArgTypeI = IPtr->getType()->getPointerElementType(); 02306 Type *ArgTypeJ = JPtr->getType()->getPointerElementType(); 02307 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 02308 Type *VArgPtrType 02309 = PointerType::get(VArgType, 02310 IPtr->getType()->getPointerAddressSpace()); 02311 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), 02312 /* insert before */ I); 02313 } 02314 02315 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, 02316 unsigned MaskOffset, unsigned NumInElem, 02317 unsigned NumInElem1, unsigned IdxOffset, 02318 std::vector<Constant*> &Mask) { 02319 unsigned NumElem1 = J->getType()->getVectorNumElements(); 02320 for (unsigned v = 0; v < NumElem1; ++v) { 02321 int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); 02322 if (m < 0) { 02323 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); 02324 } else { 02325 unsigned mm = m + (int) IdxOffset; 02326 if (m >= (int) NumInElem1) 02327 mm += (int) NumInElem; 02328 02329 Mask[v+MaskOffset] = 02330 ConstantInt::get(Type::getInt32Ty(Context), mm); 02331 } 02332 } 02333 } 02334 02335 // Returns the value that is to be used as the vector-shuffle mask to the 02336 // vector instruction that fuses I with J. 02337 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, 02338 Instruction *I, Instruction *J) { 02339 // This is the shuffle mask. We need to append the second 02340 // mask to the first, and the numbers need to be adjusted. 02341 02342 Type *ArgTypeI = I->getType(); 02343 Type *ArgTypeJ = J->getType(); 02344 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 02345 02346 unsigned NumElemI = ArgTypeI->getVectorNumElements(); 02347 02348 // Get the total number of elements in the fused vector type. 02349 // By definition, this must equal the number of elements in 02350 // the final mask. 02351 unsigned NumElem = VArgType->getVectorNumElements(); 02352 std::vector<Constant*> Mask(NumElem); 02353 02354 Type *OpTypeI = I->getOperand(0)->getType(); 02355 unsigned NumInElemI = OpTypeI->getVectorNumElements(); 02356 Type *OpTypeJ = J->getOperand(0)->getType(); 02357 unsigned NumInElemJ = OpTypeJ->getVectorNumElements(); 02358 02359 // The fused vector will be: 02360 // ----------------------------------------------------- 02361 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | 02362 // ----------------------------------------------------- 02363 // from which we'll extract NumElem total elements (where the first NumElemI 02364 // of them come from the mask in I and the remainder come from the mask 02365 // in J. 02366 02367 // For the mask from the first pair... 02368 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, 02369 0, Mask); 02370 02371 // For the mask from the second pair... 02372 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, 02373 NumInElemI, Mask); 02374 02375 return ConstantVector::get(Mask); 02376 } 02377 02378 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, 02379 Instruction *J, unsigned o, Value *&LOp, 02380 unsigned numElemL, 02381 Type *ArgTypeL, Type *ArgTypeH, 02382 bool IBeforeJ, unsigned IdxOff) { 02383 bool ExpandedIEChain = false; 02384 if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { 02385 // If we have a pure insertelement chain, then this can be rewritten 02386 // into a chain that directly builds the larger type. 02387 if (isPureIEChain(LIE)) { 02388 SmallVector<Value *, 8> VectElemts(numElemL, 02389 UndefValue::get(ArgTypeL->getScalarType())); 02390 InsertElementInst *LIENext = LIE; 02391 do { 02392 unsigned Idx = 02393 cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); 02394 VectElemts[Idx] = LIENext->getOperand(1); 02395 } while ((LIENext = 02396 dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); 02397 02398 LIENext = nullptr; 02399 Value *LIEPrev = UndefValue::get(ArgTypeH); 02400 for (unsigned i = 0; i < numElemL; ++i) { 02401 if (isa<UndefValue>(VectElemts[i])) continue; 02402 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], 02403 ConstantInt::get(Type::getInt32Ty(Context), 02404 i + IdxOff), 02405 getReplacementName(IBeforeJ ? I : J, 02406 true, o, i+1)); 02407 LIENext->insertBefore(IBeforeJ ? J : I); 02408 LIEPrev = LIENext; 02409 } 02410 02411 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); 02412 ExpandedIEChain = true; 02413 } 02414 } 02415 02416 return ExpandedIEChain; 02417 } 02418 02419 static unsigned getNumScalarElements(Type *Ty) { 02420 if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) 02421 return VecTy->getNumElements(); 02422 return 1; 02423 } 02424 02425 // Returns the value to be used as the specified operand of the vector 02426 // instruction that fuses I with J. 02427 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, 02428 Instruction *J, unsigned o, bool IBeforeJ) { 02429 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 02430 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); 02431 02432 // Compute the fused vector type for this operand 02433 Type *ArgTypeI = I->getOperand(o)->getType(); 02434 Type *ArgTypeJ = J->getOperand(o)->getType(); 02435 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 02436 02437 Instruction *L = I, *H = J; 02438 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; 02439 02440 unsigned numElemL = getNumScalarElements(ArgTypeL); 02441 unsigned numElemH = getNumScalarElements(ArgTypeH); 02442 02443 Value *LOp = L->getOperand(o); 02444 Value *HOp = H->getOperand(o); 02445 unsigned numElem = VArgType->getNumElements(); 02446 02447 // First, we check if we can reuse the "original" vector outputs (if these 02448 // exist). We might need a shuffle. 02449 ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); 02450 ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); 02451 ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); 02452 ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); 02453 02454 // FIXME: If we're fusing shuffle instructions, then we can't apply this 02455 // optimization. The input vectors to the shuffle might be a different 02456 // length from the shuffle outputs. Unfortunately, the replacement 02457 // shuffle mask has already been formed, and the mask entries are sensitive 02458 // to the sizes of the inputs. 02459 bool IsSizeChangeShuffle = 02460 isa<ShuffleVectorInst>(L) && 02461 (LOp->getType() != L->getType() || HOp->getType() != H->getType()); 02462 02463 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { 02464 // We can have at most two unique vector inputs. 02465 bool CanUseInputs = true; 02466 Value *I1, *I2 = nullptr; 02467 if (LEE) { 02468 I1 = LEE->getOperand(0); 02469 } else { 02470 I1 = LSV->getOperand(0); 02471 I2 = LSV->getOperand(1); 02472 if (I2 == I1 || isa<UndefValue>(I2)) 02473 I2 = nullptr; 02474 } 02475 02476 if (HEE) { 02477 Value *I3 = HEE->getOperand(0); 02478 if (!I2 && I3 != I1) 02479 I2 = I3; 02480 else if (I3 != I1 && I3 != I2) 02481 CanUseInputs = false; 02482 } else { 02483 Value *I3 = HSV->getOperand(0); 02484 if (!I2 && I3 != I1) 02485 I2 = I3; 02486 else if (I3 != I1 && I3 != I2) 02487 CanUseInputs = false; 02488 02489 if (CanUseInputs) { 02490 Value *I4 = HSV->getOperand(1); 02491 if (!isa<UndefValue>(I4)) { 02492 if (!I2 && I4 != I1) 02493 I2 = I4; 02494 else if (I4 != I1 && I4 != I2) 02495 CanUseInputs = false; 02496 } 02497 } 02498 } 02499 02500 if (CanUseInputs) { 02501 unsigned LOpElem = 02502 cast<Instruction>(LOp)->getOperand(0)->getType() 02503 ->getVectorNumElements(); 02504 02505 unsigned HOpElem = 02506 cast<Instruction>(HOp)->getOperand(0)->getType() 02507 ->getVectorNumElements(); 02508 02509 // We have one or two input vectors. We need to map each index of the 02510 // operands to the index of the original vector. 02511 SmallVector<std::pair<int, int>, 8> II(numElem); 02512 for (unsigned i = 0; i < numElemL; ++i) { 02513 int Idx, INum; 02514 if (LEE) { 02515 Idx = 02516 cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); 02517 INum = LEE->getOperand(0) == I1 ? 0 : 1; 02518 } else { 02519 Idx = LSV->getMaskValue(i); 02520 if (Idx < (int) LOpElem) { 02521 INum = LSV->getOperand(0) == I1 ? 0 : 1; 02522 } else { 02523 Idx -= LOpElem; 02524 INum = LSV->getOperand(1) == I1 ? 0 : 1; 02525 } 02526 } 02527 02528 II[i] = std::pair<int, int>(Idx, INum); 02529 } 02530 for (unsigned i = 0; i < numElemH; ++i) { 02531 int Idx, INum; 02532 if (HEE) { 02533 Idx = 02534 cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); 02535 INum = HEE->getOperand(0) == I1 ? 0 : 1; 02536 } else { 02537 Idx = HSV->getMaskValue(i); 02538 if (Idx < (int) HOpElem) { 02539 INum = HSV->getOperand(0) == I1 ? 0 : 1; 02540 } else { 02541 Idx -= HOpElem; 02542 INum = HSV->getOperand(1) == I1 ? 0 : 1; 02543 } 02544 } 02545 02546 II[i + numElemL] = std::pair<int, int>(Idx, INum); 02547 } 02548 02549 // We now have an array which tells us from which index of which 02550 // input vector each element of the operand comes. 02551 VectorType *I1T = cast<VectorType>(I1->getType()); 02552 unsigned I1Elem = I1T->getNumElements(); 02553 02554 if (!I2) { 02555 // In this case there is only one underlying vector input. Check for 02556 // the trivial case where we can use the input directly. 02557 if (I1Elem == numElem) { 02558 bool ElemInOrder = true; 02559 for (unsigned i = 0; i < numElem; ++i) { 02560 if (II[i].first != (int) i && II[i].first != -1) { 02561 ElemInOrder = false; 02562 break; 02563 } 02564 } 02565 02566 if (ElemInOrder) 02567 return I1; 02568 } 02569 02570 // A shuffle is needed. 02571 std::vector<Constant *> Mask(numElem); 02572 for (unsigned i = 0; i < numElem; ++i) { 02573 int Idx = II[i].first; 02574 if (Idx == -1) 02575 Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); 02576 else 02577 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 02578 } 02579 02580 Instruction *S = 02581 new ShuffleVectorInst(I1, UndefValue::get(I1T), 02582 ConstantVector::get(Mask), 02583 getReplacementName(IBeforeJ ? I : J, 02584 true, o)); 02585 S->insertBefore(IBeforeJ ? J : I); 02586 return S; 02587 } 02588 02589 VectorType *I2T = cast<VectorType>(I2->getType()); 02590 unsigned I2Elem = I2T->getNumElements(); 02591 02592 // This input comes from two distinct vectors. The first step is to 02593 // make sure that both vectors are the same length. If not, the 02594 // smaller one will need to grow before they can be shuffled together. 02595 if (I1Elem < I2Elem) { 02596 std::vector<Constant *> Mask(I2Elem); 02597 unsigned v = 0; 02598 for (; v < I1Elem; ++v) 02599 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02600 for (; v < I2Elem; ++v) 02601 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 02602 02603 Instruction *NewI1 = 02604 new ShuffleVectorInst(I1, UndefValue::get(I1T), 02605 ConstantVector::get(Mask), 02606 getReplacementName(IBeforeJ ? I : J, 02607 true, o, 1)); 02608 NewI1->insertBefore(IBeforeJ ? J : I); 02609 I1 = NewI1; 02610 I1T = I2T; 02611 I1Elem = I2Elem; 02612 } else if (I1Elem > I2Elem) { 02613 std::vector<Constant *> Mask(I1Elem); 02614 unsigned v = 0; 02615 for (; v < I2Elem; ++v) 02616 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02617 for (; v < I1Elem; ++v) 02618 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 02619 02620 Instruction *NewI2 = 02621 new ShuffleVectorInst(I2, UndefValue::get(I2T), 02622 ConstantVector::get(Mask), 02623 getReplacementName(IBeforeJ ? I : J, 02624 true, o, 1)); 02625 NewI2->insertBefore(IBeforeJ ? J : I); 02626 I2 = NewI2; 02627 I2T = I1T; 02628 I2Elem = I1Elem; 02629 } 02630 02631 // Now that both I1 and I2 are the same length we can shuffle them 02632 // together (and use the result). 02633 std::vector<Constant *> Mask(numElem); 02634 for (unsigned v = 0; v < numElem; ++v) { 02635 if (II[v].first == -1) { 02636 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 02637 } else { 02638 int Idx = II[v].first + II[v].second * I1Elem; 02639 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 02640 } 02641 } 02642 02643 Instruction *NewOp = 02644 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), 02645 getReplacementName(IBeforeJ ? I : J, true, o)); 02646 NewOp->insertBefore(IBeforeJ ? J : I); 02647 return NewOp; 02648 } 02649 } 02650 02651 Type *ArgType = ArgTypeL; 02652 if (numElemL < numElemH) { 02653 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, 02654 ArgTypeL, VArgType, IBeforeJ, 1)) { 02655 // This is another short-circuit case: we're combining a scalar into 02656 // a vector that is formed by an IE chain. We've just expanded the IE 02657 // chain, now insert the scalar and we're done. 02658 02659 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, 02660 getReplacementName(IBeforeJ ? I : J, true, o)); 02661 S->insertBefore(IBeforeJ ? J : I); 02662 return S; 02663 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, 02664 ArgTypeH, IBeforeJ)) { 02665 // The two vector inputs to the shuffle must be the same length, 02666 // so extend the smaller vector to be the same length as the larger one. 02667 Instruction *NLOp; 02668 if (numElemL > 1) { 02669 02670 std::vector<Constant *> Mask(numElemH); 02671 unsigned v = 0; 02672 for (; v < numElemL; ++v) 02673 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02674 for (; v < numElemH; ++v) 02675 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 02676 02677 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), 02678 ConstantVector::get(Mask), 02679 getReplacementName(IBeforeJ ? I : J, 02680 true, o, 1)); 02681 } else { 02682 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, 02683 getReplacementName(IBeforeJ ? I : J, 02684 true, o, 1)); 02685 } 02686 02687 NLOp->insertBefore(IBeforeJ ? J : I); 02688 LOp = NLOp; 02689 } 02690 02691 ArgType = ArgTypeH; 02692 } else if (numElemL > numElemH) { 02693 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, 02694 ArgTypeH, VArgType, IBeforeJ)) { 02695 Instruction *S = 02696 InsertElementInst::Create(LOp, HOp, 02697 ConstantInt::get(Type::getInt32Ty(Context), 02698 numElemL), 02699 getReplacementName(IBeforeJ ? I : J, 02700 true, o)); 02701 S->insertBefore(IBeforeJ ? J : I); 02702 return S; 02703 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, 02704 ArgTypeL, IBeforeJ)) { 02705 Instruction *NHOp; 02706 if (numElemH > 1) { 02707 std::vector<Constant *> Mask(numElemL); 02708 unsigned v = 0; 02709 for (; v < numElemH; ++v) 02710 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02711 for (; v < numElemL; ++v) 02712 Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); 02713 02714 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), 02715 ConstantVector::get(Mask), 02716 getReplacementName(IBeforeJ ? I : J, 02717 true, o, 1)); 02718 } else { 02719 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, 02720 getReplacementName(IBeforeJ ? I : J, 02721 true, o, 1)); 02722 } 02723 02724 NHOp->insertBefore(IBeforeJ ? J : I); 02725 HOp = NHOp; 02726 } 02727 } 02728 02729 if (ArgType->isVectorTy()) { 02730 unsigned numElem = VArgType->getVectorNumElements(); 02731 std::vector<Constant*> Mask(numElem); 02732 for (unsigned v = 0; v < numElem; ++v) { 02733 unsigned Idx = v; 02734 // If the low vector was expanded, we need to skip the extra 02735 // undefined entries. 02736 if (v >= numElemL && numElemH > numElemL) 02737 Idx += (numElemH - numElemL); 02738 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); 02739 } 02740 02741 Instruction *BV = new ShuffleVectorInst(LOp, HOp, 02742 ConstantVector::get(Mask), 02743 getReplacementName(IBeforeJ ? I : J, true, o)); 02744 BV->insertBefore(IBeforeJ ? J : I); 02745 return BV; 02746 } 02747 02748 Instruction *BV1 = InsertElementInst::Create( 02749 UndefValue::get(VArgType), LOp, CV0, 02750 getReplacementName(IBeforeJ ? I : J, 02751 true, o, 1)); 02752 BV1->insertBefore(IBeforeJ ? J : I); 02753 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, 02754 getReplacementName(IBeforeJ ? I : J, 02755 true, o, 2)); 02756 BV2->insertBefore(IBeforeJ ? J : I); 02757 return BV2; 02758 } 02759 02760 // This function creates an array of values that will be used as the inputs 02761 // to the vector instruction that fuses I with J. 02762 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, 02763 Instruction *I, Instruction *J, 02764 SmallVectorImpl<Value *> &ReplacedOperands, 02765 bool IBeforeJ) { 02766 unsigned NumOperands = I->getNumOperands(); 02767 02768 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { 02769 // Iterate backward so that we look at the store pointer 02770 // first and know whether or not we need to flip the inputs. 02771 02772 if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { 02773 // This is the pointer for a load/store instruction. 02774 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); 02775 continue; 02776 } else if (isa<CallInst>(I)) { 02777 Function *F = cast<CallInst>(I)->getCalledFunction(); 02778 Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); 02779 if (o == NumOperands-1) { 02780 BasicBlock &BB = *I->getParent(); 02781 02782 Module *M = BB.getParent()->getParent(); 02783 Type *ArgTypeI = I->getType(); 02784 Type *ArgTypeJ = J->getType(); 02785 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); 02786 02787 ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); 02788 continue; 02789 } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz || 02790 IID == Intrinsic::cttz) && o == 1) { 02791 // The second argument of powi/ctlz/cttz is a single integer/constant 02792 // and we've already checked that both arguments are equal. 02793 // As a result, we just keep I's second argument. 02794 ReplacedOperands[o] = I->getOperand(o); 02795 continue; 02796 } 02797 } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) { 02798 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); 02799 continue; 02800 } 02801 02802 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); 02803 } 02804 } 02805 02806 // This function creates two values that represent the outputs of the 02807 // original I and J instructions. These are generally vector shuffles 02808 // or extracts. In many cases, these will end up being unused and, thus, 02809 // eliminated by later passes. 02810 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, 02811 Instruction *J, Instruction *K, 02812 Instruction *&InsertionPt, 02813 Instruction *&K1, Instruction *&K2) { 02814 if (isa<StoreInst>(I)) { 02815 AA->replaceWithNewValue(I, K); 02816 AA->replaceWithNewValue(J, K); 02817 } else { 02818 Type *IType = I->getType(); 02819 Type *JType = J->getType(); 02820 02821 VectorType *VType = getVecTypeForPair(IType, JType); 02822 unsigned numElem = VType->getNumElements(); 02823 02824 unsigned numElemI = getNumScalarElements(IType); 02825 unsigned numElemJ = getNumScalarElements(JType); 02826 02827 if (IType->isVectorTy()) { 02828 std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); 02829 for (unsigned v = 0; v < numElemI; ++v) { 02830 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02831 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); 02832 } 02833 02834 K1 = new ShuffleVectorInst(K, UndefValue::get(VType), 02835 ConstantVector::get( Mask1), 02836 getReplacementName(K, false, 1)); 02837 } else { 02838 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); 02839 K1 = ExtractElementInst::Create(K, CV0, 02840 getReplacementName(K, false, 1)); 02841 } 02842 02843 if (JType->isVectorTy()) { 02844 std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); 02845 for (unsigned v = 0; v < numElemJ; ++v) { 02846 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); 02847 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); 02848 } 02849 02850 K2 = new ShuffleVectorInst(K, UndefValue::get(VType), 02851 ConstantVector::get( Mask2), 02852 getReplacementName(K, false, 2)); 02853 } else { 02854 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); 02855 K2 = ExtractElementInst::Create(K, CV1, 02856 getReplacementName(K, false, 2)); 02857 } 02858 02859 K1->insertAfter(K); 02860 K2->insertAfter(K1); 02861 InsertionPt = K2; 02862 } 02863 } 02864 02865 // Move all uses of the function I (including pairing-induced uses) after J. 02866 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, 02867 DenseSet<ValuePair> &LoadMoveSetPairs, 02868 Instruction *I, Instruction *J) { 02869 // Skip to the first instruction past I. 02870 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 02871 02872 DenseSet<Value *> Users; 02873 AliasSetTracker WriteSet(*AA); 02874 if (I->mayWriteToMemory()) WriteSet.add(I); 02875 02876 for (; cast<Instruction>(L) != J; ++L) 02877 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs); 02878 02879 assert(cast<Instruction>(L) == J && 02880 "Tracking has not proceeded far enough to check for dependencies"); 02881 // If J is now in the use set of I, then trackUsesOfI will return true 02882 // and we have a dependency cycle (and the fusing operation must abort). 02883 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs); 02884 } 02885 02886 // Move all uses of the function I (including pairing-induced uses) after J. 02887 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, 02888 DenseSet<ValuePair> &LoadMoveSetPairs, 02889 Instruction *&InsertionPt, 02890 Instruction *I, Instruction *J) { 02891 // Skip to the first instruction past I. 02892 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 02893 02894 DenseSet<Value *> Users; 02895 AliasSetTracker WriteSet(*AA); 02896 if (I->mayWriteToMemory()) WriteSet.add(I); 02897 02898 for (; cast<Instruction>(L) != J;) { 02899 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) { 02900 // Move this instruction 02901 Instruction *InstToMove = L; ++L; 02902 02903 DEBUG(dbgs() << "BBV: moving: " << *InstToMove << 02904 " to after " << *InsertionPt << "\n"); 02905 InstToMove->removeFromParent(); 02906 InstToMove->insertAfter(InsertionPt); 02907 InsertionPt = InstToMove; 02908 } else { 02909 ++L; 02910 } 02911 } 02912 } 02913 02914 // Collect all load instruction that are in the move set of a given first 02915 // pair member. These loads depend on the first instruction, I, and so need 02916 // to be moved after J (the second instruction) when the pair is fused. 02917 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, 02918 DenseMap<Value *, Value *> &ChosenPairs, 02919 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 02920 DenseSet<ValuePair> &LoadMoveSetPairs, 02921 Instruction *I) { 02922 // Skip to the first instruction past I. 02923 BasicBlock::iterator L = std::next(BasicBlock::iterator(I)); 02924 02925 DenseSet<Value *> Users; 02926 AliasSetTracker WriteSet(*AA); 02927 if (I->mayWriteToMemory()) WriteSet.add(I); 02928 02929 // Note: We cannot end the loop when we reach J because J could be moved 02930 // farther down the use chain by another instruction pairing. Also, J 02931 // could be before I if this is an inverted input. 02932 for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) { 02933 if (trackUsesOfI(Users, WriteSet, I, L)) { 02934 if (L->mayReadFromMemory()) { 02935 LoadMoveSet[L].push_back(I); 02936 LoadMoveSetPairs.insert(ValuePair(L, I)); 02937 } 02938 } 02939 } 02940 } 02941 02942 // In cases where both load/stores and the computation of their pointers 02943 // are chosen for vectorization, we can end up in a situation where the 02944 // aliasing analysis starts returning different query results as the 02945 // process of fusing instruction pairs continues. Because the algorithm 02946 // relies on finding the same use dags here as were found earlier, we'll 02947 // need to precompute the necessary aliasing information here and then 02948 // manually update it during the fusion process. 02949 void BBVectorize::collectLoadMoveSet(BasicBlock &BB, 02950 std::vector<Value *> &PairableInsts, 02951 DenseMap<Value *, Value *> &ChosenPairs, 02952 DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, 02953 DenseSet<ValuePair> &LoadMoveSetPairs) { 02954 for (std::vector<Value *>::iterator PI = PairableInsts.begin(), 02955 PIE = PairableInsts.end(); PI != PIE; ++PI) { 02956 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); 02957 if (P == ChosenPairs.end()) continue; 02958 02959 Instruction *I = cast<Instruction>(P->first); 02960 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, 02961 LoadMoveSetPairs, I); 02962 } 02963 } 02964 02965 // This function fuses the chosen instruction pairs into vector instructions, 02966 // taking care preserve any needed scalar outputs and, then, it reorders the 02967 // remaining instructions as needed (users of the first member of the pair 02968 // need to be moved to after the location of the second member of the pair 02969 // because the vector instruction is inserted in the location of the pair's 02970 // second member). 02971 void BBVectorize::fuseChosenPairs(BasicBlock &BB, 02972 std::vector<Value *> &PairableInsts, 02973 DenseMap<Value *, Value *> &ChosenPairs, 02974 DenseSet<ValuePair> &FixedOrderPairs, 02975 DenseMap<VPPair, unsigned> &PairConnectionTypes, 02976 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, 02977 DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) { 02978 LLVMContext& Context = BB.getContext(); 02979 02980 // During the vectorization process, the order of the pairs to be fused 02981 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs 02982 // list. After a pair is fused, the flipped pair is removed from the list. 02983 DenseSet<ValuePair> FlippedPairs; 02984 for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), 02985 E = ChosenPairs.end(); P != E; ++P) 02986 FlippedPairs.insert(ValuePair(P->second, P->first)); 02987 for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), 02988 E = FlippedPairs.end(); P != E; ++P) 02989 ChosenPairs.insert(*P); 02990 02991 DenseMap<Value *, std::vector<Value *> > LoadMoveSet; 02992 DenseSet<ValuePair> LoadMoveSetPairs; 02993 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, 02994 LoadMoveSet, LoadMoveSetPairs); 02995 02996 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); 02997 02998 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { 02999 DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI); 03000 if (P == ChosenPairs.end()) { 03001 ++PI; 03002 continue; 03003 } 03004 03005 if (getDepthFactor(P->first) == 0) { 03006 // These instructions are not really fused, but are tracked as though 03007 // they are. Any case in which it would be interesting to fuse them 03008 // will be taken care of by InstCombine. 03009 --NumFusedOps; 03010 ++PI; 03011 continue; 03012 } 03013 03014 Instruction *I = cast<Instruction>(P->first), 03015 *J = cast<Instruction>(P->second); 03016 03017 DEBUG(dbgs() << "BBV: fusing: " << *I << 03018 " <-> " << *J << "\n"); 03019 03020 // Remove the pair and flipped pair from the list. 03021 DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second); 03022 assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); 03023 ChosenPairs.erase(FP); 03024 ChosenPairs.erase(P); 03025 03026 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) { 03027 DEBUG(dbgs() << "BBV: fusion of: " << *I << 03028 " <-> " << *J << 03029 " aborted because of non-trivial dependency cycle\n"); 03030 --NumFusedOps; 03031 ++PI; 03032 continue; 03033 } 03034 03035 // If the pair must have the other order, then flip it. 03036 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); 03037 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { 03038 // This pair does not have a fixed order, and so we might want to 03039 // flip it if that will yield fewer shuffles. We count the number 03040 // of dependencies connected via swaps, and those directly connected, 03041 // and flip the order if the number of swaps is greater. 03042 bool OrigOrder = true; 03043 DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ = 03044 ConnectedPairDeps.find(ValuePair(I, J)); 03045 if (IJ == ConnectedPairDeps.end()) { 03046 IJ = ConnectedPairDeps.find(ValuePair(J, I)); 03047 OrigOrder = false; 03048 } 03049 03050 if (IJ != ConnectedPairDeps.end()) { 03051 unsigned NumDepsDirect = 0, NumDepsSwap = 0; 03052 for (std::vector<ValuePair>::iterator T = IJ->second.begin(), 03053 TE = IJ->second.end(); T != TE; ++T) { 03054 VPPair Q(IJ->first, *T); 03055 DenseMap<VPPair, unsigned>::iterator R = 03056 PairConnectionTypes.find(VPPair(Q.second, Q.first)); 03057 assert(R != PairConnectionTypes.end() && 03058 "Cannot find pair connection type"); 03059 if (R->second == PairConnectionDirect) 03060 ++NumDepsDirect; 03061 else if (R->second == PairConnectionSwap) 03062 ++NumDepsSwap; 03063 } 03064 03065 if (!OrigOrder) 03066 std::swap(NumDepsDirect, NumDepsSwap); 03067 03068 if (NumDepsSwap > NumDepsDirect) { 03069 FlipPairOrder = true; 03070 DEBUG(dbgs() << "BBV: reordering pair: " << *I << 03071 " <-> " << *J << "\n"); 03072 } 03073 } 03074 } 03075 03076 Instruction *L = I, *H = J; 03077 if (FlipPairOrder) 03078 std::swap(H, L); 03079 03080 // If the pair being fused uses the opposite order from that in the pair 03081 // connection map, then we need to flip the types. 03082 DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL = 03083 ConnectedPairs.find(ValuePair(H, L)); 03084 if (HL != ConnectedPairs.end()) 03085 for (std::vector<ValuePair>::iterator T = HL->second.begin(), 03086 TE = HL->second.end(); T != TE; ++T) { 03087 VPPair Q(HL->first, *T); 03088 DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q); 03089 assert(R != PairConnectionTypes.end() && 03090 "Cannot find pair connection type"); 03091 if (R->second == PairConnectionDirect) 03092 R->second = PairConnectionSwap; 03093 else if (R->second == PairConnectionSwap) 03094 R->second = PairConnectionDirect; 03095 } 03096 03097 bool LBeforeH = !FlipPairOrder; 03098 unsigned NumOperands = I->getNumOperands(); 03099 SmallVector<Value *, 3> ReplacedOperands(NumOperands); 03100 getReplacementInputsForPair(Context, L, H, ReplacedOperands, 03101 LBeforeH); 03102 03103 // Make a copy of the original operation, change its type to the vector 03104 // type and replace its operands with the vector operands. 03105 Instruction *K = L->clone(); 03106 if (L->hasName()) 03107 K->takeName(L); 03108 else if (H->hasName()) 03109 K->takeName(H); 03110 03111 if (!isa<StoreInst>(K)) 03112 K->mutateType(getVecTypeForPair(L->getType(), H->getType())); 03113 03114 unsigned KnownIDs[] = { 03115 LLVMContext::MD_tbaa, 03116 LLVMContext::MD_alias_scope, 03117 LLVMContext::MD_noalias, 03118 LLVMContext::MD_fpmath 03119 }; 03120 combineMetadata(K, H, KnownIDs); 03121 K->intersectOptionalDataWith(H); 03122 03123 for (unsigned o = 0; o < NumOperands; ++o) 03124 K->setOperand(o, ReplacedOperands[o]); 03125 03126 K->insertAfter(J); 03127 03128 // Instruction insertion point: 03129 Instruction *InsertionPt = K; 03130 Instruction *K1 = nullptr, *K2 = nullptr; 03131 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); 03132 03133 // The use dag of the first original instruction must be moved to after 03134 // the location of the second instruction. The entire use dag of the 03135 // first instruction is disjoint from the input dag of the second 03136 // (by definition), and so commutes with it. 03137 03138 moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J); 03139 03140 if (!isa<StoreInst>(I)) { 03141 L->replaceAllUsesWith(K1); 03142 H->replaceAllUsesWith(K2); 03143 AA->replaceWithNewValue(L, K1); 03144 AA->replaceWithNewValue(H, K2); 03145 } 03146 03147 // Instructions that may read from memory may be in the load move set. 03148 // Once an instruction is fused, we no longer need its move set, and so 03149 // the values of the map never need to be updated. However, when a load 03150 // is fused, we need to merge the entries from both instructions in the 03151 // pair in case those instructions were in the move set of some other 03152 // yet-to-be-fused pair. The loads in question are the keys of the map. 03153 if (I->mayReadFromMemory()) { 03154 std::vector<ValuePair> NewSetMembers; 03155 DenseMap<Value *, std::vector<Value *> >::iterator II = 03156 LoadMoveSet.find(I); 03157 if (II != LoadMoveSet.end()) 03158 for (std::vector<Value *>::iterator N = II->second.begin(), 03159 NE = II->second.end(); N != NE; ++N) 03160 NewSetMembers.push_back(ValuePair(K, *N)); 03161 DenseMap<Value *, std::vector<Value *> >::iterator JJ = 03162 LoadMoveSet.find(J); 03163 if (JJ != LoadMoveSet.end()) 03164 for (std::vector<Value *>::iterator N = JJ->second.begin(), 03165 NE = JJ->second.end(); N != NE; ++N) 03166 NewSetMembers.push_back(ValuePair(K, *N)); 03167 for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(), 03168 AE = NewSetMembers.end(); A != AE; ++A) { 03169 LoadMoveSet[A->first].push_back(A->second); 03170 LoadMoveSetPairs.insert(*A); 03171 } 03172 } 03173 03174 // Before removing I, set the iterator to the next instruction. 03175 PI = std::next(BasicBlock::iterator(I)); 03176 if (cast<Instruction>(PI) == J) 03177 ++PI; 03178 03179 SE->forgetValue(I); 03180 SE->forgetValue(J); 03181 I->eraseFromParent(); 03182 J->eraseFromParent(); 03183 03184 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << 03185 BB << "\n"); 03186 } 03187 03188 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); 03189 } 03190 } 03191 03192 char BBVectorize::ID = 0; 03193 static const char bb_vectorize_name[] = "Basic-Block Vectorization"; 03194 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 03195 INITIALIZE_AG_DEPENDENCY(AliasAnalysis) 03196 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) 03197 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 03198 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) 03199 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) 03200 03201 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) { 03202 return new BBVectorize(C); 03203 } 03204 03205 bool 03206 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { 03207 BBVectorize BBVectorizer(P, C); 03208 return BBVectorizer.vectorizeBB(BB); 03209 } 03210 03211 //===----------------------------------------------------------------------===// 03212 VectorizeConfig::VectorizeConfig() { 03213 VectorBits = ::VectorBits; 03214 VectorizeBools = !::NoBools; 03215 VectorizeInts = !::NoInts; 03216 VectorizeFloats = !::NoFloats; 03217 VectorizePointers = !::NoPointers; 03218 VectorizeCasts = !::NoCasts; 03219 VectorizeMath = !::NoMath; 03220 VectorizeBitManipulations = !::NoBitManipulation; 03221 VectorizeFMA = !::NoFMA; 03222 VectorizeSelect = !::NoSelect; 03223 VectorizeCmp = !::NoCmp; 03224 VectorizeGEP = !::NoGEP; 03225 VectorizeMemOps = !::NoMemOps; 03226 AlignedOnly = ::AlignedOnly; 03227 ReqChainDepth= ::ReqChainDepth; 03228 SearchLimit = ::SearchLimit; 03229 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck; 03230 SplatBreaksChain = ::SplatBreaksChain; 03231 MaxInsts = ::MaxInsts; 03232 MaxPairs = ::MaxPairs; 03233 MaxIter = ::MaxIter; 03234 Pow2LenOnly = ::Pow2LenOnly; 03235 NoMemOpBoost = ::NoMemOpBoost; 03236 FastDep = ::FastDep; 03237 }