LLVM API Documentation
00001 //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // This pass implements the Bottom Up SLP vectorizer. It detects consecutive 00010 // stores that can be put together into vector-stores. Next, it attempts to 00011 // construct vectorizable tree using the use-def chains. If a profitable tree 00012 // was found, the SLP vectorizer performs vectorization on the tree. 00013 // 00014 // The pass is inspired by the work described in the paper: 00015 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 00016 // 00017 //===----------------------------------------------------------------------===// 00018 #include "llvm/Transforms/Vectorize.h" 00019 #include "llvm/ADT/MapVector.h" 00020 #include "llvm/ADT/PostOrderIterator.h" 00021 #include "llvm/ADT/SetVector.h" 00022 #include "llvm/ADT/Statistic.h" 00023 #include "llvm/Analysis/AliasAnalysis.h" 00024 #include "llvm/Analysis/LoopInfo.h" 00025 #include "llvm/Analysis/ScalarEvolution.h" 00026 #include "llvm/Analysis/ScalarEvolutionExpressions.h" 00027 #include "llvm/Analysis/TargetTransformInfo.h" 00028 #include "llvm/Analysis/ValueTracking.h" 00029 #include "llvm/IR/DataLayout.h" 00030 #include "llvm/IR/Dominators.h" 00031 #include "llvm/IR/IRBuilder.h" 00032 #include "llvm/IR/Instructions.h" 00033 #include "llvm/IR/IntrinsicInst.h" 00034 #include "llvm/IR/Module.h" 00035 #include "llvm/IR/NoFolder.h" 00036 #include "llvm/IR/Type.h" 00037 #include "llvm/IR/Value.h" 00038 #include "llvm/IR/Verifier.h" 00039 #include "llvm/Pass.h" 00040 #include "llvm/Support/CommandLine.h" 00041 #include "llvm/Support/Debug.h" 00042 #include "llvm/Support/raw_ostream.h" 00043 #include "llvm/Transforms/Utils/VectorUtils.h" 00044 #include <algorithm> 00045 #include <map> 00046 #include <memory> 00047 00048 using namespace llvm; 00049 00050 #define SV_NAME "slp-vectorizer" 00051 #define DEBUG_TYPE "SLP" 00052 00053 STATISTIC(NumVectorInstructions, "Number of vector instructions generated"); 00054 00055 static cl::opt<int> 00056 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, 00057 cl::desc("Only vectorize if you gain more than this " 00058 "number ")); 00059 00060 static cl::opt<bool> 00061 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, 00062 cl::desc("Attempt to vectorize horizontal reductions")); 00063 00064 static cl::opt<bool> ShouldStartVectorizeHorAtStore( 00065 "slp-vectorize-hor-store", cl::init(false), cl::Hidden, 00066 cl::desc( 00067 "Attempt to vectorize horizontal reductions feeding into a store")); 00068 00069 namespace { 00070 00071 static const unsigned MinVecRegSize = 128; 00072 00073 static const unsigned RecursionMaxDepth = 12; 00074 00075 /// \returns the parent basic block if all of the instructions in \p VL 00076 /// are in the same block or null otherwise. 
00077 static BasicBlock *getSameBlock(ArrayRef<Value *> VL) { 00078 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 00079 if (!I0) 00080 return nullptr; 00081 BasicBlock *BB = I0->getParent(); 00082 for (int i = 1, e = VL.size(); i < e; i++) { 00083 Instruction *I = dyn_cast<Instruction>(VL[i]); 00084 if (!I) 00085 return nullptr; 00086 00087 if (BB != I->getParent()) 00088 return nullptr; 00089 } 00090 return BB; 00091 } 00092 00093 /// \returns True if all of the values in \p VL are constants. 00094 static bool allConstant(ArrayRef<Value *> VL) { 00095 for (unsigned i = 0, e = VL.size(); i < e; ++i) 00096 if (!isa<Constant>(VL[i])) 00097 return false; 00098 return true; 00099 } 00100 00101 /// \returns True if all of the values in \p VL are identical. 00102 static bool isSplat(ArrayRef<Value *> VL) { 00103 for (unsigned i = 1, e = VL.size(); i < e; ++i) 00104 if (VL[i] != VL[0]) 00105 return false; 00106 return true; 00107 } 00108 00109 ///\returns Opcode that can be clubbed with \p Op to create an alternate 00110 /// sequence which can later be merged as a ShuffleVector instruction. 00111 static unsigned getAltOpcode(unsigned Op) { 00112 switch (Op) { 00113 case Instruction::FAdd: 00114 return Instruction::FSub; 00115 case Instruction::FSub: 00116 return Instruction::FAdd; 00117 case Instruction::Add: 00118 return Instruction::Sub; 00119 case Instruction::Sub: 00120 return Instruction::Add; 00121 default: 00122 return 0; 00123 } 00124 } 00125 00126 ///\returns bool representing if Opcode \p Op can be part 00127 /// of an alternate sequence which can later be merged as 00128 /// a ShuffleVector instruction. 00129 static bool canCombineAsAltInst(unsigned Op) { 00130 if (Op == Instruction::FAdd || Op == Instruction::FSub || 00131 Op == Instruction::Sub || Op == Instruction::Add) 00132 return true; 00133 return false; 00134 } 00135 00136 /// \returns ShuffleVector instruction if intructions in \p VL have 00137 /// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. 00138 /// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) 00139 static unsigned isAltInst(ArrayRef<Value *> VL) { 00140 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 00141 unsigned Opcode = I0->getOpcode(); 00142 unsigned AltOpcode = getAltOpcode(Opcode); 00143 for (int i = 1, e = VL.size(); i < e; i++) { 00144 Instruction *I = dyn_cast<Instruction>(VL[i]); 00145 if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) 00146 return 0; 00147 } 00148 return Instruction::ShuffleVector; 00149 } 00150 00151 /// \returns The opcode if all of the Instructions in \p VL have the same 00152 /// opcode, or zero. 00153 static unsigned getSameOpcode(ArrayRef<Value *> VL) { 00154 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 00155 if (!I0) 00156 return 0; 00157 unsigned Opcode = I0->getOpcode(); 00158 for (int i = 1, e = VL.size(); i < e; i++) { 00159 Instruction *I = dyn_cast<Instruction>(VL[i]); 00160 if (!I || Opcode != I->getOpcode()) { 00161 if (canCombineAsAltInst(Opcode) && i == 1) 00162 return isAltInst(VL); 00163 return 0; 00164 } 00165 } 00166 return Opcode; 00167 } 00168 00169 /// Get the intersection (logical and) of all of the potential IR flags 00170 /// of each scalar operation (VL) that will be converted into a vector (I). 00171 /// Flag set: NSW, NUW, exact, and all of fast-math. 
00172 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) { 00173 if (auto *VecOp = dyn_cast<BinaryOperator>(I)) { 00174 if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) { 00175 // Intersection is initialized to the 0th scalar, 00176 // so start counting from index '1'. 00177 for (int i = 1, e = VL.size(); i < e; ++i) { 00178 if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i])) 00179 Intersection->andIRFlags(Scalar); 00180 } 00181 VecOp->copyIRFlags(Intersection); 00182 } 00183 } 00184 } 00185 00186 /// \returns \p I after propagating metadata from \p VL. 00187 static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) { 00188 Instruction *I0 = cast<Instruction>(VL[0]); 00189 SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata; 00190 I0->getAllMetadataOtherThanDebugLoc(Metadata); 00191 00192 for (unsigned i = 0, n = Metadata.size(); i != n; ++i) { 00193 unsigned Kind = Metadata[i].first; 00194 MDNode *MD = Metadata[i].second; 00195 00196 for (int i = 1, e = VL.size(); MD && i != e; i++) { 00197 Instruction *I = cast<Instruction>(VL[i]); 00198 MDNode *IMD = I->getMetadata(Kind); 00199 00200 switch (Kind) { 00201 default: 00202 MD = nullptr; // Remove unknown metadata 00203 break; 00204 case LLVMContext::MD_tbaa: 00205 MD = MDNode::getMostGenericTBAA(MD, IMD); 00206 break; 00207 case LLVMContext::MD_alias_scope: 00208 case LLVMContext::MD_noalias: 00209 MD = MDNode::intersect(MD, IMD); 00210 break; 00211 case LLVMContext::MD_fpmath: 00212 MD = MDNode::getMostGenericFPMath(MD, IMD); 00213 break; 00214 } 00215 } 00216 I->setMetadata(Kind, MD); 00217 } 00218 return I; 00219 } 00220 00221 /// \returns The type that all of the values in \p VL have or null if there 00222 /// are different types. 00223 static Type* getSameType(ArrayRef<Value *> VL) { 00224 Type *Ty = VL[0]->getType(); 00225 for (int i = 1, e = VL.size(); i < e; i++) 00226 if (VL[i]->getType() != Ty) 00227 return nullptr; 00228 00229 return Ty; 00230 } 00231 00232 /// \returns True if the ExtractElement instructions in VL can be vectorized 00233 /// to use the original vector. 00234 static bool CanReuseExtract(ArrayRef<Value *> VL) { 00235 assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); 00236 // Check if all of the extracts come from the same vector and from the 00237 // correct offset. 00238 Value *VL0 = VL[0]; 00239 ExtractElementInst *E0 = cast<ExtractElementInst>(VL0); 00240 Value *Vec = E0->getOperand(0); 00241 00242 // We have to extract from the same vector type. 00243 unsigned NElts = Vec->getType()->getVectorNumElements(); 00244 00245 if (NElts != VL.size()) 00246 return false; 00247 00248 // Check that all of the indices extract from the correct offset. 
00249 ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1)); 00250 if (!CI || CI->getZExtValue()) 00251 return false; 00252 00253 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 00254 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 00255 ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1)); 00256 00257 if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) 00258 return false; 00259 } 00260 00261 return true; 00262 } 00263 00264 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, 00265 SmallVectorImpl<Value *> &Left, 00266 SmallVectorImpl<Value *> &Right) { 00267 00268 SmallVector<Value *, 16> OrigLeft, OrigRight; 00269 00270 bool AllSameOpcodeLeft = true; 00271 bool AllSameOpcodeRight = true; 00272 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 00273 Instruction *I = cast<Instruction>(VL[i]); 00274 Value *V0 = I->getOperand(0); 00275 Value *V1 = I->getOperand(1); 00276 00277 OrigLeft.push_back(V0); 00278 OrigRight.push_back(V1); 00279 00280 Instruction *I0 = dyn_cast<Instruction>(V0); 00281 Instruction *I1 = dyn_cast<Instruction>(V1); 00282 00283 // Check whether all operands on one side have the same opcode. In this case 00284 // we want to preserve the original order and not make things worse by 00285 // reordering. 00286 AllSameOpcodeLeft = I0; 00287 AllSameOpcodeRight = I1; 00288 00289 if (i && AllSameOpcodeLeft) { 00290 if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) { 00291 if(P0->getOpcode() != I0->getOpcode()) 00292 AllSameOpcodeLeft = false; 00293 } else 00294 AllSameOpcodeLeft = false; 00295 } 00296 if (i && AllSameOpcodeRight) { 00297 if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) { 00298 if(P1->getOpcode() != I1->getOpcode()) 00299 AllSameOpcodeRight = false; 00300 } else 00301 AllSameOpcodeRight = false; 00302 } 00303 00304 // Sort two opcodes. In the code below we try to preserve the ability to use 00305 // broadcast of values instead of individual inserts. 00306 // vl1 = load 00307 // vl2 = phi 00308 // vr1 = load 00309 // vr2 = vr2 00310 // = vl1 x vr1 00311 // = vl2 x vr2 00312 // If we just sorted according to opcode we would leave the first line in 00313 // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load). 00314 // = vl1 x vr1 00315 // = vr2 x vl2 00316 // Because vr2 and vr1 are from the same load we loose the opportunity of a 00317 // broadcast for the packed right side in the backend: we have [vr1, vl2] 00318 // instead of [vr1, vr2=vr1]. 00319 if (I0 && I1) { 00320 if(!i && I0->getOpcode() > I1->getOpcode()) { 00321 Left.push_back(I1); 00322 Right.push_back(I0); 00323 } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) { 00324 // Try not to destroy a broad cast for no apparent benefit. 00325 Left.push_back(I1); 00326 Right.push_back(I0); 00327 } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) { 00328 // Try preserve broadcasts. 00329 Left.push_back(I1); 00330 Right.push_back(I0); 00331 } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) { 00332 // Try preserve broadcasts. 00333 Left.push_back(I1); 00334 Right.push_back(I0); 00335 } else { 00336 Left.push_back(I0); 00337 Right.push_back(I1); 00338 } 00339 continue; 00340 } 00341 // One opcode, put the instruction on the right. 
00342 if (I0) { 00343 Left.push_back(V1); 00344 Right.push_back(I0); 00345 continue; 00346 } 00347 Left.push_back(V0); 00348 Right.push_back(V1); 00349 } 00350 00351 bool LeftBroadcast = isSplat(Left); 00352 bool RightBroadcast = isSplat(Right); 00353 00354 // Don't reorder if the operands where good to begin with. 00355 if (!(LeftBroadcast || RightBroadcast) && 00356 (AllSameOpcodeRight || AllSameOpcodeLeft)) { 00357 Left = OrigLeft; 00358 Right = OrigRight; 00359 } 00360 } 00361 00362 /// \returns True if in-tree use also needs extract. This refers to 00363 /// possible scalar operand in vectorized instruction. 00364 static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, 00365 TargetLibraryInfo *TLI) { 00366 00367 unsigned Opcode = UserInst->getOpcode(); 00368 switch (Opcode) { 00369 case Instruction::Load: { 00370 LoadInst *LI = cast<LoadInst>(UserInst); 00371 return (LI->getPointerOperand() == Scalar); 00372 } 00373 case Instruction::Store: { 00374 StoreInst *SI = cast<StoreInst>(UserInst); 00375 return (SI->getPointerOperand() == Scalar); 00376 } 00377 case Instruction::Call: { 00378 CallInst *CI = cast<CallInst>(UserInst); 00379 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 00380 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 00381 return (CI->getArgOperand(1) == Scalar); 00382 } 00383 } 00384 default: 00385 return false; 00386 } 00387 } 00388 00389 /// Bottom Up SLP Vectorizer. 00390 class BoUpSLP { 00391 public: 00392 typedef SmallVector<Value *, 8> ValueList; 00393 typedef SmallVector<Instruction *, 16> InstrList; 00394 typedef SmallPtrSet<Value *, 16> ValueSet; 00395 typedef SmallVector<StoreInst *, 8> StoreList; 00396 00397 BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl, 00398 TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, 00399 LoopInfo *Li, DominatorTree *Dt) 00400 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), 00401 F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), 00402 Builder(Se->getContext()) {} 00403 00404 /// \brief Vectorize the tree that starts with the elements in \p VL. 00405 /// Returns the vectorized root. 00406 Value *vectorizeTree(); 00407 00408 /// \returns the cost incurred by unwanted spills and fills, caused by 00409 /// holding live values over call sites. 00410 int getSpillCost(); 00411 00412 /// \returns the vectorization cost of the subtree that starts at \p VL. 00413 /// A negative number means that this is profitable. 00414 int getTreeCost(); 00415 00416 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 00417 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 00418 void buildTree(ArrayRef<Value *> Roots, 00419 ArrayRef<Value *> UserIgnoreLst = None); 00420 00421 /// Clear the internal data structures that are created by 'buildTree'. 00422 void deleteTree() { 00423 VectorizableTree.clear(); 00424 ScalarToTreeEntry.clear(); 00425 MustGather.clear(); 00426 ExternalUses.clear(); 00427 NumLoadsWantToKeepOrder = 0; 00428 NumLoadsWantToChangeOrder = 0; 00429 for (auto &Iter : BlocksSchedules) { 00430 BlockScheduling *BS = Iter.second.get(); 00431 BS->clear(); 00432 } 00433 } 00434 00435 /// \returns true if the memory operations A and B are consecutive. 00436 bool isConsecutiveAccess(Value *A, Value *B); 00437 00438 /// \brief Perform LICM and CSE on the newly generated gather sequences. 00439 void optimizeGatherSequence(); 00440 00441 /// \returns true if it is benefitial to reverse the vector order. 
00442 bool shouldReorder() const { 00443 return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; 00444 } 00445 00446 private: 00447 struct TreeEntry; 00448 00449 /// \returns the cost of the vectorizable entry. 00450 int getEntryCost(TreeEntry *E); 00451 00452 /// This is the recursive part of buildTree. 00453 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth); 00454 00455 /// Vectorize a single entry in the tree. 00456 Value *vectorizeTree(TreeEntry *E); 00457 00458 /// Vectorize a single entry in the tree, starting in \p VL. 00459 Value *vectorizeTree(ArrayRef<Value *> VL); 00460 00461 /// \returns the pointer to the vectorized value if \p VL is already 00462 /// vectorized, or NULL. They may happen in cycles. 00463 Value *alreadyVectorized(ArrayRef<Value *> VL) const; 00464 00465 /// \brief Take the pointer operand from the Load/Store instruction. 00466 /// \returns NULL if this is not a valid Load/Store instruction. 00467 static Value *getPointerOperand(Value *I); 00468 00469 /// \brief Take the address space operand from the Load/Store instruction. 00470 /// \returns -1 if this is not a valid Load/Store instruction. 00471 static unsigned getAddressSpaceOperand(Value *I); 00472 00473 /// \returns the scalarization cost for this type. Scalarization in this 00474 /// context means the creation of vectors from a group of scalars. 00475 int getGatherCost(Type *Ty); 00476 00477 /// \returns the scalarization cost for this list of values. Assuming that 00478 /// this subtree gets vectorized, we may need to extract the values from the 00479 /// roots. This method calculates the cost of extracting the values. 00480 int getGatherCost(ArrayRef<Value *> VL); 00481 00482 /// \brief Set the Builder insert point to one after the last instruction in 00483 /// the bundle 00484 void setInsertPointAfterBundle(ArrayRef<Value *> VL); 00485 00486 /// \returns a vector from a collection of scalars in \p VL. 00487 Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); 00488 00489 /// \returns whether the VectorizableTree is fully vectoriable and will 00490 /// be beneficial even the tree height is tiny. 00491 bool isFullyVectorizableTinyTree(); 00492 00493 struct TreeEntry { 00494 TreeEntry() : Scalars(), VectorizedValue(nullptr), 00495 NeedToGather(0) {} 00496 00497 /// \returns true if the scalars in VL are equal to this entry. 00498 bool isSame(ArrayRef<Value *> VL) const { 00499 assert(VL.size() == Scalars.size() && "Invalid size"); 00500 return std::equal(VL.begin(), VL.end(), Scalars.begin()); 00501 } 00502 00503 /// A vector of scalars. 00504 ValueList Scalars; 00505 00506 /// The Scalars are vectorized into this value. It is initialized to Null. 00507 Value *VectorizedValue; 00508 00509 /// Do we need to gather this sequence ? 00510 bool NeedToGather; 00511 }; 00512 00513 /// Create a new VectorizableTree entry. 
00514 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) { 00515 VectorizableTree.push_back(TreeEntry()); 00516 int idx = VectorizableTree.size() - 1; 00517 TreeEntry *Last = &VectorizableTree[idx]; 00518 Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); 00519 Last->NeedToGather = !Vectorized; 00520 if (Vectorized) { 00521 for (int i = 0, e = VL.size(); i != e; ++i) { 00522 assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); 00523 ScalarToTreeEntry[VL[i]] = idx; 00524 } 00525 } else { 00526 MustGather.insert(VL.begin(), VL.end()); 00527 } 00528 return Last; 00529 } 00530 00531 /// -- Vectorization State -- 00532 /// Holds all of the tree entries. 00533 std::vector<TreeEntry> VectorizableTree; 00534 00535 /// Maps a specific scalar to its tree entry. 00536 SmallDenseMap<Value*, int> ScalarToTreeEntry; 00537 00538 /// A list of scalars that we found that we need to keep as scalars. 00539 ValueSet MustGather; 00540 00541 /// This POD struct describes one external user in the vectorized tree. 00542 struct ExternalUser { 00543 ExternalUser (Value *S, llvm::User *U, int L) : 00544 Scalar(S), User(U), Lane(L){}; 00545 // Which scalar in our function. 00546 Value *Scalar; 00547 // Which user that uses the scalar. 00548 llvm::User *User; 00549 // Which lane does the scalar belong to. 00550 int Lane; 00551 }; 00552 typedef SmallVector<ExternalUser, 16> UserList; 00553 00554 /// A list of values that need to extracted out of the tree. 00555 /// This list holds pairs of (Internal Scalar : External User). 00556 UserList ExternalUses; 00557 00558 /// Holds all of the instructions that we gathered. 00559 SetVector<Instruction *> GatherSeq; 00560 /// A list of blocks that we are going to CSE. 00561 SetVector<BasicBlock *> CSEBlocks; 00562 00563 /// Contains all scheduling relevant data for an instruction. 00564 /// A ScheduleData either represents a single instruction or a member of an 00565 /// instruction bundle (= a group of instructions which is combined into a 00566 /// vector instruction). 00567 struct ScheduleData { 00568 00569 // The initial value for the dependency counters. It means that the 00570 // dependencies are not calculated yet. 00571 enum { InvalidDeps = -1 }; 00572 00573 ScheduleData() 00574 : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), 00575 NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), 00576 Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), 00577 UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} 00578 00579 void init(int BlockSchedulingRegionID) { 00580 FirstInBundle = this; 00581 NextInBundle = nullptr; 00582 NextLoadStore = nullptr; 00583 IsScheduled = false; 00584 SchedulingRegionID = BlockSchedulingRegionID; 00585 UnscheduledDepsInBundle = UnscheduledDeps; 00586 clearDependencies(); 00587 } 00588 00589 /// Returns true if the dependency information has been calculated. 00590 bool hasValidDependencies() const { return Dependencies != InvalidDeps; } 00591 00592 /// Returns true for single instructions and for bundle representatives 00593 /// (= the head of a bundle). 00594 bool isSchedulingEntity() const { return FirstInBundle == this; } 00595 00596 /// Returns true if it represents an instruction bundle and not only a 00597 /// single instruction. 00598 bool isPartOfBundle() const { 00599 return NextInBundle != nullptr || FirstInBundle != this; 00600 } 00601 00602 /// Returns true if it is ready for scheduling, i.e. 
it has no more 00603 /// unscheduled depending instructions/bundles. 00604 bool isReady() const { 00605 assert(isSchedulingEntity() && 00606 "can't consider non-scheduling entity for ready list"); 00607 return UnscheduledDepsInBundle == 0 && !IsScheduled; 00608 } 00609 00610 /// Modifies the number of unscheduled dependencies, also updating it for 00611 /// the whole bundle. 00612 int incrementUnscheduledDeps(int Incr) { 00613 UnscheduledDeps += Incr; 00614 return FirstInBundle->UnscheduledDepsInBundle += Incr; 00615 } 00616 00617 /// Sets the number of unscheduled dependencies to the number of 00618 /// dependencies. 00619 void resetUnscheduledDeps() { 00620 incrementUnscheduledDeps(Dependencies - UnscheduledDeps); 00621 } 00622 00623 /// Clears all dependency information. 00624 void clearDependencies() { 00625 Dependencies = InvalidDeps; 00626 resetUnscheduledDeps(); 00627 MemoryDependencies.clear(); 00628 } 00629 00630 void dump(raw_ostream &os) const { 00631 if (!isSchedulingEntity()) { 00632 os << "/ " << *Inst; 00633 } else if (NextInBundle) { 00634 os << '[' << *Inst; 00635 ScheduleData *SD = NextInBundle; 00636 while (SD) { 00637 os << ';' << *SD->Inst; 00638 SD = SD->NextInBundle; 00639 } 00640 os << ']'; 00641 } else { 00642 os << *Inst; 00643 } 00644 } 00645 00646 Instruction *Inst; 00647 00648 /// Points to the head in an instruction bundle (and always to this for 00649 /// single instructions). 00650 ScheduleData *FirstInBundle; 00651 00652 /// Single linked list of all instructions in a bundle. Null if it is a 00653 /// single instruction. 00654 ScheduleData *NextInBundle; 00655 00656 /// Single linked list of all memory instructions (e.g. load, store, call) 00657 /// in the block - until the end of the scheduling region. 00658 ScheduleData *NextLoadStore; 00659 00660 /// The dependent memory instructions. 00661 /// This list is derived on demand in calculateDependencies(). 00662 SmallVector<ScheduleData *, 4> MemoryDependencies; 00663 00664 /// This ScheduleData is in the current scheduling region if this matches 00665 /// the current SchedulingRegionID of BlockScheduling. 00666 int SchedulingRegionID; 00667 00668 /// Used for getting a "good" final ordering of instructions. 00669 int SchedulingPriority; 00670 00671 /// The number of dependencies. Constitutes of the number of users of the 00672 /// instruction plus the number of dependent memory instructions (if any). 00673 /// This value is calculated on demand. 00674 /// If InvalidDeps, the number of dependencies is not calculated yet. 00675 /// 00676 int Dependencies; 00677 00678 /// The number of dependencies minus the number of dependencies of scheduled 00679 /// instructions. As soon as this is zero, the instruction/bundle gets ready 00680 /// for scheduling. 00681 /// Note that this is negative as long as Dependencies is not calculated. 00682 int UnscheduledDeps; 00683 00684 /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for 00685 /// single instructions. 00686 int UnscheduledDepsInBundle; 00687 00688 /// True if this instruction is scheduled (or considered as scheduled in the 00689 /// dry-run). 00690 bool IsScheduled; 00691 }; 00692 00693 #ifndef NDEBUG 00694 friend raw_ostream &operator<<(raw_ostream &os, 00695 const BoUpSLP::ScheduleData &SD); 00696 #endif 00697 00698 /// Contains all scheduling data for a basic block. 
00699 /// 00700 struct BlockScheduling { 00701 00702 BlockScheduling(BasicBlock *BB) 00703 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), 00704 ScheduleStart(nullptr), ScheduleEnd(nullptr), 00705 FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr), 00706 // Make sure that the initial SchedulingRegionID is greater than the 00707 // initial SchedulingRegionID in ScheduleData (which is 0). 00708 SchedulingRegionID(1) {} 00709 00710 void clear() { 00711 ReadyInsts.clear(); 00712 ScheduleStart = nullptr; 00713 ScheduleEnd = nullptr; 00714 FirstLoadStoreInRegion = nullptr; 00715 LastLoadStoreInRegion = nullptr; 00716 00717 // Make a new scheduling region, i.e. all existing ScheduleData is not 00718 // in the new region yet. 00719 ++SchedulingRegionID; 00720 } 00721 00722 ScheduleData *getScheduleData(Value *V) { 00723 ScheduleData *SD = ScheduleDataMap[V]; 00724 if (SD && SD->SchedulingRegionID == SchedulingRegionID) 00725 return SD; 00726 return nullptr; 00727 } 00728 00729 bool isInSchedulingRegion(ScheduleData *SD) { 00730 return SD->SchedulingRegionID == SchedulingRegionID; 00731 } 00732 00733 /// Marks an instruction as scheduled and puts all dependent ready 00734 /// instructions into the ready-list. 00735 template <typename ReadyListType> 00736 void schedule(ScheduleData *SD, ReadyListType &ReadyList) { 00737 SD->IsScheduled = true; 00738 DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); 00739 00740 ScheduleData *BundleMember = SD; 00741 while (BundleMember) { 00742 // Handle the def-use chain dependencies. 00743 for (Use &U : BundleMember->Inst->operands()) { 00744 ScheduleData *OpDef = getScheduleData(U.get()); 00745 if (OpDef && OpDef->hasValidDependencies() && 00746 OpDef->incrementUnscheduledDeps(-1) == 0) { 00747 // There are no more unscheduled dependencies after decrementing, 00748 // so we can put the dependent instruction into the ready list. 00749 ScheduleData *DepBundle = OpDef->FirstInBundle; 00750 assert(!DepBundle->IsScheduled && 00751 "already scheduled bundle gets ready"); 00752 ReadyList.insert(DepBundle); 00753 DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); 00754 } 00755 } 00756 // Handle the memory dependencies. 00757 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { 00758 if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { 00759 // There are no more unscheduled dependencies after decrementing, 00760 // so we can put the dependent instruction into the ready list. 00761 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; 00762 assert(!DepBundle->IsScheduled && 00763 "already scheduled bundle gets ready"); 00764 ReadyList.insert(DepBundle); 00765 DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); 00766 } 00767 } 00768 BundleMember = BundleMember->NextInBundle; 00769 } 00770 } 00771 00772 /// Put all instructions into the ReadyList which are ready for scheduling. 00773 template <typename ReadyListType> 00774 void initialFillReadyList(ReadyListType &ReadyList) { 00775 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 00776 ScheduleData *SD = getScheduleData(I); 00777 if (SD->isSchedulingEntity() && SD->isReady()) { 00778 ReadyList.insert(SD); 00779 DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); 00780 } 00781 } 00782 } 00783 00784 /// Checks if a bundle of instructions can be scheduled, i.e. has no 00785 /// cyclic dependencies. This is only a dry-run, no instructions are 00786 /// actually moved at this stage. 
00787 bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA); 00788 00789 /// Un-bundles a group of instructions. 00790 void cancelScheduling(ArrayRef<Value *> VL); 00791 00792 /// Extends the scheduling region so that V is inside the region. 00793 void extendSchedulingRegion(Value *V); 00794 00795 /// Initialize the ScheduleData structures for new instructions in the 00796 /// scheduling region. 00797 void initScheduleData(Instruction *FromI, Instruction *ToI, 00798 ScheduleData *PrevLoadStore, 00799 ScheduleData *NextLoadStore); 00800 00801 /// Updates the dependency information of a bundle and of all instructions/ 00802 /// bundles which depend on the original bundle. 00803 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList, 00804 AliasAnalysis *AA); 00805 00806 /// Sets all instruction in the scheduling region to un-scheduled. 00807 void resetSchedule(); 00808 00809 BasicBlock *BB; 00810 00811 /// Simple memory allocation for ScheduleData. 00812 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; 00813 00814 /// The size of a ScheduleData array in ScheduleDataChunks. 00815 int ChunkSize; 00816 00817 /// The allocator position in the current chunk, which is the last entry 00818 /// of ScheduleDataChunks. 00819 int ChunkPos; 00820 00821 /// Attaches ScheduleData to Instruction. 00822 /// Note that the mapping survives during all vectorization iterations, i.e. 00823 /// ScheduleData structures are recycled. 00824 DenseMap<Value *, ScheduleData *> ScheduleDataMap; 00825 00826 struct ReadyList : SmallVector<ScheduleData *, 8> { 00827 void insert(ScheduleData *SD) { push_back(SD); } 00828 }; 00829 00830 /// The ready-list for scheduling (only used for the dry-run). 00831 ReadyList ReadyInsts; 00832 00833 /// The first instruction of the scheduling region. 00834 Instruction *ScheduleStart; 00835 00836 /// The first instruction _after_ the scheduling region. 00837 Instruction *ScheduleEnd; 00838 00839 /// The first memory accessing instruction in the scheduling region 00840 /// (can be null). 00841 ScheduleData *FirstLoadStoreInRegion; 00842 00843 /// The last memory accessing instruction in the scheduling region 00844 /// (can be null). 00845 ScheduleData *LastLoadStoreInRegion; 00846 00847 /// The ID of the scheduling region. For a new vectorization iteration this 00848 /// is incremented which "removes" all ScheduleData from the region. 00849 int SchedulingRegionID; 00850 }; 00851 00852 /// Attaches the BlockScheduling structures to basic blocks. 00853 DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; 00854 00855 /// Performs the "real" scheduling. Done before vectorization is actually 00856 /// performed in a basic block. 00857 void scheduleBlock(BlockScheduling *BS); 00858 00859 /// List of users to ignore during scheduling and that don't need extracting. 00860 ArrayRef<Value *> UserIgnoreList; 00861 00862 // Number of load-bundles, which contain consecutive loads. 00863 int NumLoadsWantToKeepOrder; 00864 00865 // Number of load-bundles of size 2, which are consecutive loads if reversed. 00866 int NumLoadsWantToChangeOrder; 00867 00868 // Analysis and block reference. 00869 Function *F; 00870 ScalarEvolution *SE; 00871 const DataLayout *DL; 00872 TargetTransformInfo *TTI; 00873 TargetLibraryInfo *TLI; 00874 AliasAnalysis *AA; 00875 LoopInfo *LI; 00876 DominatorTree *DT; 00877 /// Instruction builder to construct the vectorized tree. 
00878 IRBuilder<> Builder; 00879 }; 00880 00881 #ifndef NDEBUG 00882 raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) { 00883 SD.dump(os); 00884 return os; 00885 } 00886 #endif 00887 00888 void BoUpSLP::buildTree(ArrayRef<Value *> Roots, 00889 ArrayRef<Value *> UserIgnoreLst) { 00890 deleteTree(); 00891 UserIgnoreList = UserIgnoreLst; 00892 if (!getSameType(Roots)) 00893 return; 00894 buildTree_rec(Roots, 0); 00895 00896 // Collect the values that we need to extract from the tree. 00897 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 00898 TreeEntry *Entry = &VectorizableTree[EIdx]; 00899 00900 // For each lane: 00901 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 00902 Value *Scalar = Entry->Scalars[Lane]; 00903 00904 // No need to handle users of gathered values. 00905 if (Entry->NeedToGather) 00906 continue; 00907 00908 for (User *U : Scalar->users()) { 00909 DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); 00910 00911 Instruction *UserInst = dyn_cast<Instruction>(U); 00912 if (!UserInst) 00913 continue; 00914 00915 // Skip in-tree scalars that become vectors 00916 if (ScalarToTreeEntry.count(U)) { 00917 int Idx = ScalarToTreeEntry[U]; 00918 TreeEntry *UseEntry = &VectorizableTree[Idx]; 00919 Value *UseScalar = UseEntry->Scalars[0]; 00920 // Some in-tree scalars will remain as scalar in vectorized 00921 // instructions. If that is the case, the one in Lane 0 will 00922 // be used. 00923 if (UseScalar != U || 00924 !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { 00925 DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U 00926 << ".\n"); 00927 assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); 00928 continue; 00929 } 00930 } 00931 00932 // Ignore users in the user ignore list. 00933 if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != 00934 UserIgnoreList.end()) 00935 continue; 00936 00937 DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << 00938 Lane << " from " << *Scalar << ".\n"); 00939 ExternalUses.push_back(ExternalUser(Scalar, U, Lane)); 00940 } 00941 } 00942 } 00943 } 00944 00945 00946 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { 00947 bool SameTy = getSameType(VL); (void)SameTy; 00948 bool isAltShuffle = false; 00949 assert(SameTy && "Invalid types!"); 00950 00951 if (Depth == RecursionMaxDepth) { 00952 DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); 00953 newTreeEntry(VL, false); 00954 return; 00955 } 00956 00957 // Don't handle vectors. 00958 if (VL[0]->getType()->isVectorTy()) { 00959 DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); 00960 newTreeEntry(VL, false); 00961 return; 00962 } 00963 00964 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 00965 if (SI->getValueOperand()->getType()->isVectorTy()) { 00966 DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); 00967 newTreeEntry(VL, false); 00968 return; 00969 } 00970 unsigned Opcode = getSameOpcode(VL); 00971 00972 // Check that this shuffle vector refers to the alternate 00973 // sequence of opcodes. 00974 if (Opcode == Instruction::ShuffleVector) { 00975 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 00976 unsigned Op = I0->getOpcode(); 00977 if (Op != Instruction::ShuffleVector) 00978 isAltShuffle = true; 00979 } 00980 00981 // If all of the operands are identical or constant we have a simple solution. 00982 if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { 00983 DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); 00984 newTreeEntry(VL, false); 00985 return; 00986 } 00987 00988 // We now know that this is a vector of instructions of the same type from 00989 // the same block. 00990 00991 // Check if this is a duplicate of another entry. 00992 if (ScalarToTreeEntry.count(VL[0])) { 00993 int Idx = ScalarToTreeEntry[VL[0]]; 00994 TreeEntry *E = &VectorizableTree[Idx]; 00995 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 00996 DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); 00997 if (E->Scalars[i] != VL[i]) { 00998 DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); 00999 newTreeEntry(VL, false); 01000 return; 01001 } 01002 } 01003 DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); 01004 return; 01005 } 01006 01007 // Check that none of the instructions in the bundle are already in the tree. 01008 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 01009 if (ScalarToTreeEntry.count(VL[i])) { 01010 DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << 01011 ") is already in tree.\n"); 01012 newTreeEntry(VL, false); 01013 return; 01014 } 01015 } 01016 01017 // If any of the scalars appears in the table OR it is marked as a value that 01018 // needs to stat scalar then we need to gather the scalars. 01019 for (unsigned i = 0, e = VL.size(); i != e; ++i) { 01020 if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) { 01021 DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n"); 01022 newTreeEntry(VL, false); 01023 return; 01024 } 01025 } 01026 01027 // Check that all of the users of the scalars that we want to vectorize are 01028 // schedulable. 01029 Instruction *VL0 = cast<Instruction>(VL[0]); 01030 BasicBlock *BB = cast<Instruction>(VL0)->getParent(); 01031 01032 if (!DT->isReachableFromEntry(BB)) { 01033 // Don't go into unreachable blocks. They may contain instructions with 01034 // dependency cycles which confuse the final scheduling. 01035 DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); 01036 newTreeEntry(VL, false); 01037 return; 01038 } 01039 01040 // Check that every instructions appears once in this bundle. 01041 for (unsigned i = 0, e = VL.size(); i < e; ++i) 01042 for (unsigned j = i+1; j < e; ++j) 01043 if (VL[i] == VL[j]) { 01044 DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); 01045 newTreeEntry(VL, false); 01046 return; 01047 } 01048 01049 auto &BSRef = BlocksSchedules[BB]; 01050 if (!BSRef) { 01051 BSRef = llvm::make_unique<BlockScheduling>(BB); 01052 } 01053 BlockScheduling &BS = *BSRef.get(); 01054 01055 if (!BS.tryScheduleBundle(VL, AA)) { 01056 DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); 01057 BS.cancelScheduling(VL); 01058 newTreeEntry(VL, false); 01059 return; 01060 } 01061 DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); 01062 01063 switch (Opcode) { 01064 case Instruction::PHI: { 01065 PHINode *PH = dyn_cast<PHINode>(VL0); 01066 01067 // Check for terminator values (e.g. invoke). 
01068 for (unsigned j = 0; j < VL.size(); ++j) 01069 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 01070 TerminatorInst *Term = dyn_cast<TerminatorInst>( 01071 cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); 01072 if (Term) { 01073 DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); 01074 BS.cancelScheduling(VL); 01075 newTreeEntry(VL, false); 01076 return; 01077 } 01078 } 01079 01080 newTreeEntry(VL, true); 01081 DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); 01082 01083 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 01084 ValueList Operands; 01085 // Prepare the operand vector. 01086 for (unsigned j = 0; j < VL.size(); ++j) 01087 Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock( 01088 PH->getIncomingBlock(i))); 01089 01090 buildTree_rec(Operands, Depth + 1); 01091 } 01092 return; 01093 } 01094 case Instruction::ExtractElement: { 01095 bool Reuse = CanReuseExtract(VL); 01096 if (Reuse) { 01097 DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); 01098 } else { 01099 BS.cancelScheduling(VL); 01100 } 01101 newTreeEntry(VL, Reuse); 01102 return; 01103 } 01104 case Instruction::Load: { 01105 // Check if the loads are consecutive or of we need to swizzle them. 01106 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { 01107 LoadInst *L = cast<LoadInst>(VL[i]); 01108 if (!L->isSimple()) { 01109 BS.cancelScheduling(VL); 01110 newTreeEntry(VL, false); 01111 DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); 01112 return; 01113 } 01114 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 01115 if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) { 01116 ++NumLoadsWantToChangeOrder; 01117 } 01118 BS.cancelScheduling(VL); 01119 newTreeEntry(VL, false); 01120 DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); 01121 return; 01122 } 01123 } 01124 ++NumLoadsWantToKeepOrder; 01125 newTreeEntry(VL, true); 01126 DEBUG(dbgs() << "SLP: added a vector of loads.\n"); 01127 return; 01128 } 01129 case Instruction::ZExt: 01130 case Instruction::SExt: 01131 case Instruction::FPToUI: 01132 case Instruction::FPToSI: 01133 case Instruction::FPExt: 01134 case Instruction::PtrToInt: 01135 case Instruction::IntToPtr: 01136 case Instruction::SIToFP: 01137 case Instruction::UIToFP: 01138 case Instruction::Trunc: 01139 case Instruction::FPTrunc: 01140 case Instruction::BitCast: { 01141 Type *SrcTy = VL0->getOperand(0)->getType(); 01142 for (unsigned i = 0; i < VL.size(); ++i) { 01143 Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); 01144 if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) { 01145 BS.cancelScheduling(VL); 01146 newTreeEntry(VL, false); 01147 DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); 01148 return; 01149 } 01150 } 01151 newTreeEntry(VL, true); 01152 DEBUG(dbgs() << "SLP: added a vector of casts.\n"); 01153 01154 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 01155 ValueList Operands; 01156 // Prepare the operand vector. 01157 for (unsigned j = 0; j < VL.size(); ++j) 01158 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 01159 01160 buildTree_rec(Operands, Depth+1); 01161 } 01162 return; 01163 } 01164 case Instruction::ICmp: 01165 case Instruction::FCmp: { 01166 // Check that all of the compares have the same predicate. 
01167 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 01168 Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType(); 01169 for (unsigned i = 1, e = VL.size(); i < e; ++i) { 01170 CmpInst *Cmp = cast<CmpInst>(VL[i]); 01171 if (Cmp->getPredicate() != P0 || 01172 Cmp->getOperand(0)->getType() != ComparedTy) { 01173 BS.cancelScheduling(VL); 01174 newTreeEntry(VL, false); 01175 DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); 01176 return; 01177 } 01178 } 01179 01180 newTreeEntry(VL, true); 01181 DEBUG(dbgs() << "SLP: added a vector of compares.\n"); 01182 01183 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 01184 ValueList Operands; 01185 // Prepare the operand vector. 01186 for (unsigned j = 0; j < VL.size(); ++j) 01187 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 01188 01189 buildTree_rec(Operands, Depth+1); 01190 } 01191 return; 01192 } 01193 case Instruction::Select: 01194 case Instruction::Add: 01195 case Instruction::FAdd: 01196 case Instruction::Sub: 01197 case Instruction::FSub: 01198 case Instruction::Mul: 01199 case Instruction::FMul: 01200 case Instruction::UDiv: 01201 case Instruction::SDiv: 01202 case Instruction::FDiv: 01203 case Instruction::URem: 01204 case Instruction::SRem: 01205 case Instruction::FRem: 01206 case Instruction::Shl: 01207 case Instruction::LShr: 01208 case Instruction::AShr: 01209 case Instruction::And: 01210 case Instruction::Or: 01211 case Instruction::Xor: { 01212 newTreeEntry(VL, true); 01213 DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); 01214 01215 // Sort operands of the instructions so that each side is more likely to 01216 // have the same opcode. 01217 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { 01218 ValueList Left, Right; 01219 reorderInputsAccordingToOpcode(VL, Left, Right); 01220 buildTree_rec(Left, Depth + 1); 01221 buildTree_rec(Right, Depth + 1); 01222 return; 01223 } 01224 01225 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 01226 ValueList Operands; 01227 // Prepare the operand vector. 01228 for (unsigned j = 0; j < VL.size(); ++j) 01229 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 01230 01231 buildTree_rec(Operands, Depth+1); 01232 } 01233 return; 01234 } 01235 case Instruction::GetElementPtr: { 01236 // We don't combine GEPs with complicated (nested) indexing. 01237 for (unsigned j = 0; j < VL.size(); ++j) { 01238 if (cast<Instruction>(VL[j])->getNumOperands() != 2) { 01239 DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); 01240 BS.cancelScheduling(VL); 01241 newTreeEntry(VL, false); 01242 return; 01243 } 01244 } 01245 01246 // We can't combine several GEPs into one vector if they operate on 01247 // different types. 01248 Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType(); 01249 for (unsigned j = 0; j < VL.size(); ++j) { 01250 Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); 01251 if (Ty0 != CurTy) { 01252 DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); 01253 BS.cancelScheduling(VL); 01254 newTreeEntry(VL, false); 01255 return; 01256 } 01257 } 01258 01259 // We don't combine GEPs with non-constant indexes. 
01260 for (unsigned j = 0; j < VL.size(); ++j) { 01261 auto Op = cast<Instruction>(VL[j])->getOperand(1); 01262 if (!isa<ConstantInt>(Op)) { 01263 DEBUG( 01264 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); 01265 BS.cancelScheduling(VL); 01266 newTreeEntry(VL, false); 01267 return; 01268 } 01269 } 01270 01271 newTreeEntry(VL, true); 01272 DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); 01273 for (unsigned i = 0, e = 2; i < e; ++i) { 01274 ValueList Operands; 01275 // Prepare the operand vector. 01276 for (unsigned j = 0; j < VL.size(); ++j) 01277 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 01278 01279 buildTree_rec(Operands, Depth + 1); 01280 } 01281 return; 01282 } 01283 case Instruction::Store: { 01284 // Check if the stores are consecutive or of we need to swizzle them. 01285 for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) 01286 if (!isConsecutiveAccess(VL[i], VL[i + 1])) { 01287 BS.cancelScheduling(VL); 01288 newTreeEntry(VL, false); 01289 DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); 01290 return; 01291 } 01292 01293 newTreeEntry(VL, true); 01294 DEBUG(dbgs() << "SLP: added a vector of stores.\n"); 01295 01296 ValueList Operands; 01297 for (unsigned j = 0; j < VL.size(); ++j) 01298 Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); 01299 01300 buildTree_rec(Operands, Depth + 1); 01301 return; 01302 } 01303 case Instruction::Call: { 01304 // Check if the calls are all to the same vectorizable intrinsic. 01305 CallInst *CI = cast<CallInst>(VL[0]); 01306 // Check if this is an Intrinsic call or something that can be 01307 // represented by an intrinsic call 01308 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 01309 if (!isTriviallyVectorizable(ID)) { 01310 BS.cancelScheduling(VL); 01311 newTreeEntry(VL, false); 01312 DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); 01313 return; 01314 } 01315 Function *Int = CI->getCalledFunction(); 01316 Value *A1I = nullptr; 01317 if (hasVectorInstrinsicScalarOpd(ID, 1)) 01318 A1I = CI->getArgOperand(1); 01319 for (unsigned i = 1, e = VL.size(); i != e; ++i) { 01320 CallInst *CI2 = dyn_cast<CallInst>(VL[i]); 01321 if (!CI2 || CI2->getCalledFunction() != Int || 01322 getIntrinsicIDForCall(CI2, TLI) != ID) { 01323 BS.cancelScheduling(VL); 01324 newTreeEntry(VL, false); 01325 DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] 01326 << "\n"); 01327 return; 01328 } 01329 // ctlz,cttz and powi are special intrinsics whose second argument 01330 // should be same in order for them to be vectorized. 01331 if (hasVectorInstrinsicScalarOpd(ID, 1)) { 01332 Value *A1J = CI2->getArgOperand(1); 01333 if (A1I != A1J) { 01334 BS.cancelScheduling(VL); 01335 newTreeEntry(VL, false); 01336 DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI 01337 << " argument "<< A1I<<"!=" << A1J 01338 << "\n"); 01339 return; 01340 } 01341 } 01342 } 01343 01344 newTreeEntry(VL, true); 01345 for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { 01346 ValueList Operands; 01347 // Prepare the operand vector. 01348 for (unsigned j = 0; j < VL.size(); ++j) { 01349 CallInst *CI2 = dyn_cast<CallInst>(VL[j]); 01350 Operands.push_back(CI2->getArgOperand(i)); 01351 } 01352 buildTree_rec(Operands, Depth + 1); 01353 } 01354 return; 01355 } 01356 case Instruction::ShuffleVector: { 01357 // If this is not an alternate sequence of opcode like add-sub 01358 // then do not vectorize this instruction. 
01359 if (!isAltShuffle) { 01360 BS.cancelScheduling(VL); 01361 newTreeEntry(VL, false); 01362 DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); 01363 return; 01364 } 01365 newTreeEntry(VL, true); 01366 DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); 01367 for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { 01368 ValueList Operands; 01369 // Prepare the operand vector. 01370 for (unsigned j = 0; j < VL.size(); ++j) 01371 Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); 01372 01373 buildTree_rec(Operands, Depth + 1); 01374 } 01375 return; 01376 } 01377 default: 01378 BS.cancelScheduling(VL); 01379 newTreeEntry(VL, false); 01380 DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); 01381 return; 01382 } 01383 } 01384 01385 int BoUpSLP::getEntryCost(TreeEntry *E) { 01386 ArrayRef<Value*> VL = E->Scalars; 01387 01388 Type *ScalarTy = VL[0]->getType(); 01389 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 01390 ScalarTy = SI->getValueOperand()->getType(); 01391 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 01392 01393 if (E->NeedToGather) { 01394 if (allConstant(VL)) 01395 return 0; 01396 if (isSplat(VL)) { 01397 return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); 01398 } 01399 return getGatherCost(E->Scalars); 01400 } 01401 unsigned Opcode = getSameOpcode(VL); 01402 assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); 01403 Instruction *VL0 = cast<Instruction>(VL[0]); 01404 switch (Opcode) { 01405 case Instruction::PHI: { 01406 return 0; 01407 } 01408 case Instruction::ExtractElement: { 01409 if (CanReuseExtract(VL)) { 01410 int DeadCost = 0; 01411 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 01412 ExtractElementInst *E = cast<ExtractElementInst>(VL[i]); 01413 if (E->hasOneUse()) 01414 // Take credit for instruction that will become dead. 01415 DeadCost += 01416 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); 01417 } 01418 return -DeadCost; 01419 } 01420 return getGatherCost(VecTy); 01421 } 01422 case Instruction::ZExt: 01423 case Instruction::SExt: 01424 case Instruction::FPToUI: 01425 case Instruction::FPToSI: 01426 case Instruction::FPExt: 01427 case Instruction::PtrToInt: 01428 case Instruction::IntToPtr: 01429 case Instruction::SIToFP: 01430 case Instruction::UIToFP: 01431 case Instruction::Trunc: 01432 case Instruction::FPTrunc: 01433 case Instruction::BitCast: { 01434 Type *SrcTy = VL0->getOperand(0)->getType(); 01435 01436 // Calculate the cost of this instruction. 01437 int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), 01438 VL0->getType(), SrcTy); 01439 01440 VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); 01441 int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); 01442 return VecCost - ScalarCost; 01443 } 01444 case Instruction::FCmp: 01445 case Instruction::ICmp: 01446 case Instruction::Select: 01447 case Instruction::Add: 01448 case Instruction::FAdd: 01449 case Instruction::Sub: 01450 case Instruction::FSub: 01451 case Instruction::Mul: 01452 case Instruction::FMul: 01453 case Instruction::UDiv: 01454 case Instruction::SDiv: 01455 case Instruction::FDiv: 01456 case Instruction::URem: 01457 case Instruction::SRem: 01458 case Instruction::FRem: 01459 case Instruction::Shl: 01460 case Instruction::LShr: 01461 case Instruction::AShr: 01462 case Instruction::And: 01463 case Instruction::Or: 01464 case Instruction::Xor: { 01465 // Calculate the cost of this instruction. 
01466 int ScalarCost = 0; 01467 int VecCost = 0; 01468 if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp || 01469 Opcode == Instruction::Select) { 01470 VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); 01471 ScalarCost = VecTy->getNumElements() * 01472 TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); 01473 VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); 01474 } else { 01475 // Certain instructions can be cheaper to vectorize if they have a 01476 // constant second vector operand. 01477 TargetTransformInfo::OperandValueKind Op1VK = 01478 TargetTransformInfo::OK_AnyValue; 01479 TargetTransformInfo::OperandValueKind Op2VK = 01480 TargetTransformInfo::OK_UniformConstantValue; 01481 TargetTransformInfo::OperandValueProperties Op1VP = 01482 TargetTransformInfo::OP_None; 01483 TargetTransformInfo::OperandValueProperties Op2VP = 01484 TargetTransformInfo::OP_None; 01485 01486 // If all operands are exactly the same ConstantInt then set the 01487 // operand kind to OK_UniformConstantValue. 01488 // If instead not all operands are constants, then set the operand kind 01489 // to OK_AnyValue. If all operands are constants but not the same, 01490 // then set the operand kind to OK_NonUniformConstantValue. 01491 ConstantInt *CInt = nullptr; 01492 for (unsigned i = 0; i < VL.size(); ++i) { 01493 const Instruction *I = cast<Instruction>(VL[i]); 01494 if (!isa<ConstantInt>(I->getOperand(1))) { 01495 Op2VK = TargetTransformInfo::OK_AnyValue; 01496 break; 01497 } 01498 if (i == 0) { 01499 CInt = cast<ConstantInt>(I->getOperand(1)); 01500 continue; 01501 } 01502 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && 01503 CInt != cast<ConstantInt>(I->getOperand(1))) 01504 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; 01505 } 01506 // FIXME: Currently cost of model modification for division by 01507 // power of 2 is handled only for X86. Add support for other targets. 01508 if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && 01509 CInt->getValue().isPowerOf2()) 01510 Op2VP = TargetTransformInfo::OP_PowerOf2; 01511 01512 ScalarCost = VecTy->getNumElements() * 01513 TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, 01514 Op1VP, Op2VP); 01515 VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, 01516 Op1VP, Op2VP); 01517 } 01518 return VecCost - ScalarCost; 01519 } 01520 case Instruction::GetElementPtr: { 01521 TargetTransformInfo::OperandValueKind Op1VK = 01522 TargetTransformInfo::OK_AnyValue; 01523 TargetTransformInfo::OperandValueKind Op2VK = 01524 TargetTransformInfo::OK_UniformConstantValue; 01525 01526 int ScalarCost = 01527 VecTy->getNumElements() * 01528 TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); 01529 int VecCost = 01530 TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); 01531 01532 return VecCost - ScalarCost; 01533 } 01534 case Instruction::Load: { 01535 // Cost of wide load - cost of scalar loads. 01536 int ScalarLdCost = VecTy->getNumElements() * 01537 TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); 01538 int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0); 01539 return VecLdCost - ScalarLdCost; 01540 } 01541 case Instruction::Store: { 01542 // We know that we can merge the stores. Calculate the cost. 
01543 int ScalarStCost = VecTy->getNumElements() * 01544 TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); 01545 int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0); 01546 return VecStCost - ScalarStCost; 01547 } 01548 case Instruction::Call: { 01549 CallInst *CI = cast<CallInst>(VL0); 01550 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 01551 01552 // Calculate the cost of the scalar and vector calls. 01553 SmallVector<Type*, 4> ScalarTys, VecTys; 01554 for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) { 01555 ScalarTys.push_back(CI->getArgOperand(op)->getType()); 01556 VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(), 01557 VecTy->getNumElements())); 01558 } 01559 01560 int ScalarCallCost = VecTy->getNumElements() * 01561 TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys); 01562 01563 int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys); 01564 01565 DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost 01566 << " (" << VecCallCost << "-" << ScalarCallCost << ")" 01567 << " for " << *CI << "\n"); 01568 01569 return VecCallCost - ScalarCallCost; 01570 } 01571 case Instruction::ShuffleVector: { 01572 TargetTransformInfo::OperandValueKind Op1VK = 01573 TargetTransformInfo::OK_AnyValue; 01574 TargetTransformInfo::OperandValueKind Op2VK = 01575 TargetTransformInfo::OK_AnyValue; 01576 int ScalarCost = 0; 01577 int VecCost = 0; 01578 for (unsigned i = 0; i < VL.size(); ++i) { 01579 Instruction *I = cast<Instruction>(VL[i]); 01580 if (!I) 01581 break; 01582 ScalarCost += 01583 TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); 01584 } 01585 // VecCost is equal to sum of the cost of creating 2 vectors 01586 // and the cost of creating shuffle. 01587 Instruction *I0 = cast<Instruction>(VL[0]); 01588 VecCost = 01589 TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); 01590 Instruction *I1 = cast<Instruction>(VL[1]); 01591 VecCost += 01592 TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); 01593 VecCost += 01594 TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); 01595 return VecCost - ScalarCost; 01596 } 01597 default: 01598 llvm_unreachable("Unknown instruction"); 01599 } 01600 } 01601 01602 bool BoUpSLP::isFullyVectorizableTinyTree() { 01603 DEBUG(dbgs() << "SLP: Check whether the tree with height " << 01604 VectorizableTree.size() << " is fully vectorizable .\n"); 01605 01606 // We only handle trees of height 2. 01607 if (VectorizableTree.size() != 2) 01608 return false; 01609 01610 // Handle splat stores. 01611 if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars)) 01612 return true; 01613 01614 // Gathering cost would be too much for tiny trees. 01615 if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 01616 return false; 01617 01618 return true; 01619 } 01620 01621 int BoUpSLP::getSpillCost() { 01622 // Walk from the bottom of the tree to the top, tracking which values are 01623 // live. When we see a call instruction that is not part of our tree, 01624 // query TTI to see if there is a cost to keeping values live over it 01625 // (for example, if spills and fills are required). 
01626 unsigned BundleWidth = VectorizableTree.front().Scalars.size(); 01627 int Cost = 0; 01628 01629 SmallPtrSet<Instruction*, 4> LiveValues; 01630 Instruction *PrevInst = nullptr; 01631 01632 for (unsigned N = 0; N < VectorizableTree.size(); ++N) { 01633 Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]); 01634 if (!Inst) 01635 continue; 01636 01637 if (!PrevInst) { 01638 PrevInst = Inst; 01639 continue; 01640 } 01641 01642 DEBUG( 01643 dbgs() << "SLP: #LV: " << LiveValues.size(); 01644 for (auto *X : LiveValues) 01645 dbgs() << " " << X->getName(); 01646 dbgs() << ", Looking at "; 01647 Inst->dump(); 01648 ); 01649 01650 // Update LiveValues. 01651 LiveValues.erase(PrevInst); 01652 for (auto &J : PrevInst->operands()) { 01653 if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J)) 01654 LiveValues.insert(cast<Instruction>(&*J)); 01655 } 01656 01657 // Now find the sequence of instructions between PrevInst and Inst. 01658 BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst); 01659 --PrevInstIt; 01660 while (InstIt != PrevInstIt) { 01661 if (PrevInstIt == PrevInst->getParent()->rend()) { 01662 PrevInstIt = Inst->getParent()->rbegin(); 01663 continue; 01664 } 01665 01666 if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) { 01667 SmallVector<Type*, 4> V; 01668 for (auto *II : LiveValues) 01669 V.push_back(VectorType::get(II->getType(), BundleWidth)); 01670 Cost += TTI->getCostOfKeepingLiveOverCall(V); 01671 } 01672 01673 ++PrevInstIt; 01674 } 01675 01676 PrevInst = Inst; 01677 } 01678 01679 DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n"); 01680 return Cost; 01681 } 01682 01683 int BoUpSLP::getTreeCost() { 01684 int Cost = 0; 01685 DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << 01686 VectorizableTree.size() << ".\n"); 01687 01688 // We only vectorize tiny trees if it is fully vectorizable. 01689 if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) { 01690 if (!VectorizableTree.size()) { 01691 assert(!ExternalUses.size() && "We should not have any external users"); 01692 } 01693 return INT_MAX; 01694 } 01695 01696 unsigned BundleWidth = VectorizableTree[0].Scalars.size(); 01697 01698 for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) { 01699 int C = getEntryCost(&VectorizableTree[i]); 01700 DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " 01701 << *VectorizableTree[i].Scalars[0] << " .\n"); 01702 Cost += C; 01703 } 01704 01705 SmallSet<Value *, 16> ExtractCostCalculated; 01706 int ExtractCost = 0; 01707 for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end(); 01708 I != E; ++I) { 01709 // We only add extract cost once for the same scalar. 01710 if (!ExtractCostCalculated.insert(I->Scalar)) 01711 continue; 01712 01713 VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth); 01714 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, 01715 I->Lane); 01716 } 01717 01718 Cost += getSpillCost(); 01719 01720 DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n"); 01721 return Cost + ExtractCost; 01722 } 01723 01724 int BoUpSLP::getGatherCost(Type *Ty) { 01725 int Cost = 0; 01726 for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) 01727 Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); 01728 return Cost; 01729 } 01730 01731 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) { 01732 // Find the type of the operands in VL. 
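// For illustration: gathering four scalar floats into a <4 x float> is
// modelled above as four insertelement instructions, so the cost returned
// for VL is the sum of the per-lane InsertElement costs for lanes 0..3.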
01733 Type *ScalarTy = VL[0]->getType(); 01734 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 01735 ScalarTy = SI->getValueOperand()->getType(); 01736 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 01737 // Find the cost of inserting/extracting values from the vector. 01738 return getGatherCost(VecTy); 01739 } 01740 01741 Value *BoUpSLP::getPointerOperand(Value *I) { 01742 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 01743 return LI->getPointerOperand(); 01744 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 01745 return SI->getPointerOperand(); 01746 return nullptr; 01747 } 01748 01749 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { 01750 if (LoadInst *L = dyn_cast<LoadInst>(I)) 01751 return L->getPointerAddressSpace(); 01752 if (StoreInst *S = dyn_cast<StoreInst>(I)) 01753 return S->getPointerAddressSpace(); 01754 return -1; 01755 } 01756 01757 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { 01758 Value *PtrA = getPointerOperand(A); 01759 Value *PtrB = getPointerOperand(B); 01760 unsigned ASA = getAddressSpaceOperand(A); 01761 unsigned ASB = getAddressSpaceOperand(B); 01762 01763 // Check that the address spaces match and that the pointers are valid. 01764 if (!PtrA || !PtrB || (ASA != ASB)) 01765 return false; 01766 01767 // Make sure that A and B are different pointers of the same type. 01768 if (PtrA == PtrB || PtrA->getType() != PtrB->getType()) 01769 return false; 01770 01771 unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA); 01772 Type *Ty = cast<PointerType>(PtrA->getType())->getElementType(); 01773 APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty)); 01774 01775 APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0); 01776 PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA); 01777 PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB); 01778 01779 APInt OffsetDelta = OffsetB - OffsetA; 01780 01781 // Check if they are based on the same pointer. That makes the offsets 01782 // sufficient. 01783 if (PtrA == PtrB) 01784 return OffsetDelta == Size; 01785 01786 // Compute the necessary base pointer delta to have the necessary final delta 01787 // equal to the size. 01788 APInt BaseDelta = Size - OffsetDelta; 01789 01790 // Otherwise compute the distance with SCEV between the base pointers. 01791 const SCEV *PtrSCEVA = SE->getSCEV(PtrA); 01792 const SCEV *PtrSCEVB = SE->getSCEV(PtrB); 01793 const SCEV *C = SE->getConstant(BaseDelta); 01794 const SCEV *X = SE->getAddExpr(PtrSCEVA, C); 01795 return X == PtrSCEVB; 01796 } 01797 01798 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) { 01799 Instruction *VL0 = cast<Instruction>(VL[0]); 01800 BasicBlock::iterator NextInst = VL0; 01801 ++NextInst; 01802 Builder.SetInsertPoint(VL0->getParent(), NextInst); 01803 Builder.SetCurrentDebugLocation(VL0->getDebugLoc()); 01804 } 01805 01806 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { 01807 Value *Vec = UndefValue::get(Ty); 01808 // Generate the 'InsertElement' instruction. 01809 for (unsigned i = 0; i < Ty->getNumElements(); ++i) { 01810 Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); 01811 if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { 01812 GatherSeq.insert(Insrt); 01813 CSEBlocks.insert(Insrt->getParent()); 01814 01815 // Add to our 'need-to-extract' list. 01816 if (ScalarToTreeEntry.count(VL[i])) { 01817 int Idx = ScalarToTreeEntry[VL[i]]; 01818 TreeEntry *E = &VectorizableTree[Idx]; 01819 // Find which lane we need to extract. 
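// For illustration: if VL[i] happens to be the third scalar of tree entry E,
// FoundLane below becomes 2, and the ExternalUser record ensures that an
// extractelement of lane 2 is emitted later, after E has been vectorized and
// its scalars are erased.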
01820 int FoundLane = -1; 01821 for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { 01822 // Is this the lane of the scalar that we are looking for ? 01823 if (E->Scalars[Lane] == VL[i]) { 01824 FoundLane = Lane; 01825 break; 01826 } 01827 } 01828 assert(FoundLane >= 0 && "Could not find the correct lane"); 01829 ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); 01830 } 01831 } 01832 } 01833 01834 return Vec; 01835 } 01836 01837 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const { 01838 SmallDenseMap<Value*, int>::const_iterator Entry 01839 = ScalarToTreeEntry.find(VL[0]); 01840 if (Entry != ScalarToTreeEntry.end()) { 01841 int Idx = Entry->second; 01842 const TreeEntry *En = &VectorizableTree[Idx]; 01843 if (En->isSame(VL) && En->VectorizedValue) 01844 return En->VectorizedValue; 01845 } 01846 return nullptr; 01847 } 01848 01849 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { 01850 if (ScalarToTreeEntry.count(VL[0])) { 01851 int Idx = ScalarToTreeEntry[VL[0]]; 01852 TreeEntry *E = &VectorizableTree[Idx]; 01853 if (E->isSame(VL)) 01854 return vectorizeTree(E); 01855 } 01856 01857 Type *ScalarTy = VL[0]->getType(); 01858 if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) 01859 ScalarTy = SI->getValueOperand()->getType(); 01860 VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); 01861 01862 return Gather(VL, VecTy); 01863 } 01864 01865 Value *BoUpSLP::vectorizeTree(TreeEntry *E) { 01866 IRBuilder<>::InsertPointGuard Guard(Builder); 01867 01868 if (E->VectorizedValue) { 01869 DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); 01870 return E->VectorizedValue; 01871 } 01872 01873 Instruction *VL0 = cast<Instruction>(E->Scalars[0]); 01874 Type *ScalarTy = VL0->getType(); 01875 if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) 01876 ScalarTy = SI->getValueOperand()->getType(); 01877 VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); 01878 01879 if (E->NeedToGather) { 01880 setInsertPointAfterBundle(E->Scalars); 01881 return Gather(E->Scalars, VecTy); 01882 } 01883 01884 unsigned Opcode = getSameOpcode(E->Scalars); 01885 01886 switch (Opcode) { 01887 case Instruction::PHI: { 01888 PHINode *PH = dyn_cast<PHINode>(VL0); 01889 Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); 01890 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 01891 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); 01892 E->VectorizedValue = NewPhi; 01893 01894 // PHINodes may have multiple entries from the same block. We want to 01895 // visit every block once. 01896 SmallSet<BasicBlock*, 4> VisitedBBs; 01897 01898 for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { 01899 ValueList Operands; 01900 BasicBlock *IBB = PH->getIncomingBlock(i); 01901 01902 if (!VisitedBBs.insert(IBB)) { 01903 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB); 01904 continue; 01905 } 01906 01907 // Prepare the operand vector. 
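// For illustration (made-up names): for a bundle of two scalar phis
//   %x = phi i32 [ %x0, %bb0 ], [ %x1, %bb1 ]
//   %y = phi i32 [ %y0, %bb0 ], [ %y1, %bb1 ]
// the operand vector gathered below for %bb0 is {%x0, %y0}; vectorizing it
// produces the <2 x i32> incoming value of NewPhi for %bb0.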
01908 for (unsigned j = 0; j < E->Scalars.size(); ++j) 01909 Operands.push_back(cast<PHINode>(E->Scalars[j])-> 01910 getIncomingValueForBlock(IBB)); 01911 01912 Builder.SetInsertPoint(IBB->getTerminator()); 01913 Builder.SetCurrentDebugLocation(PH->getDebugLoc()); 01914 Value *Vec = vectorizeTree(Operands); 01915 NewPhi->addIncoming(Vec, IBB); 01916 } 01917 01918 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && 01919 "Invalid number of incoming values"); 01920 return NewPhi; 01921 } 01922 01923 case Instruction::ExtractElement: { 01924 if (CanReuseExtract(E->Scalars)) { 01925 Value *V = VL0->getOperand(0); 01926 E->VectorizedValue = V; 01927 return V; 01928 } 01929 return Gather(E->Scalars, VecTy); 01930 } 01931 case Instruction::ZExt: 01932 case Instruction::SExt: 01933 case Instruction::FPToUI: 01934 case Instruction::FPToSI: 01935 case Instruction::FPExt: 01936 case Instruction::PtrToInt: 01937 case Instruction::IntToPtr: 01938 case Instruction::SIToFP: 01939 case Instruction::UIToFP: 01940 case Instruction::Trunc: 01941 case Instruction::FPTrunc: 01942 case Instruction::BitCast: { 01943 ValueList INVL; 01944 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 01945 INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 01946 01947 setInsertPointAfterBundle(E->Scalars); 01948 01949 Value *InVec = vectorizeTree(INVL); 01950 01951 if (Value *V = alreadyVectorized(E->Scalars)) 01952 return V; 01953 01954 CastInst *CI = dyn_cast<CastInst>(VL0); 01955 Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); 01956 E->VectorizedValue = V; 01957 ++NumVectorInstructions; 01958 return V; 01959 } 01960 case Instruction::FCmp: 01961 case Instruction::ICmp: { 01962 ValueList LHSV, RHSV; 01963 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 01964 LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 01965 RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 01966 } 01967 01968 setInsertPointAfterBundle(E->Scalars); 01969 01970 Value *L = vectorizeTree(LHSV); 01971 Value *R = vectorizeTree(RHSV); 01972 01973 if (Value *V = alreadyVectorized(E->Scalars)) 01974 return V; 01975 01976 CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate(); 01977 Value *V; 01978 if (Opcode == Instruction::FCmp) 01979 V = Builder.CreateFCmp(P0, L, R); 01980 else 01981 V = Builder.CreateICmp(P0, L, R); 01982 01983 E->VectorizedValue = V; 01984 ++NumVectorInstructions; 01985 return V; 01986 } 01987 case Instruction::Select: { 01988 ValueList TrueVec, FalseVec, CondVec; 01989 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 01990 CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 01991 TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 01992 FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2)); 01993 } 01994 01995 setInsertPointAfterBundle(E->Scalars); 01996 01997 Value *Cond = vectorizeTree(CondVec); 01998 Value *True = vectorizeTree(TrueVec); 01999 Value *False = vectorizeTree(FalseVec); 02000 02001 if (Value *V = alreadyVectorized(E->Scalars)) 02002 return V; 02003 02004 Value *V = Builder.CreateSelect(Cond, True, False); 02005 E->VectorizedValue = V; 02006 ++NumVectorInstructions; 02007 return V; 02008 } 02009 case Instruction::Add: 02010 case Instruction::FAdd: 02011 case Instruction::Sub: 02012 case Instruction::FSub: 02013 case Instruction::Mul: 02014 case Instruction::FMul: 02015 case Instruction::UDiv: 02016 case Instruction::SDiv: 02017 case Instruction::FDiv: 02018 case Instruction::URem: 
02019 case Instruction::SRem: 02020 case Instruction::FRem: 02021 case Instruction::Shl: 02022 case Instruction::LShr: 02023 case Instruction::AShr: 02024 case Instruction::And: 02025 case Instruction::Or: 02026 case Instruction::Xor: { 02027 ValueList LHSVL, RHSVL; 02028 if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) 02029 reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); 02030 else 02031 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 02032 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 02033 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 02034 } 02035 02036 setInsertPointAfterBundle(E->Scalars); 02037 02038 Value *LHS = vectorizeTree(LHSVL); 02039 Value *RHS = vectorizeTree(RHSVL); 02040 02041 if (LHS == RHS && isa<Instruction>(LHS)) { 02042 assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order"); 02043 } 02044 02045 if (Value *V = alreadyVectorized(E->Scalars)) 02046 return V; 02047 02048 BinaryOperator *BinOp = cast<BinaryOperator>(VL0); 02049 Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); 02050 E->VectorizedValue = V; 02051 propagateIRFlags(E->VectorizedValue, E->Scalars); 02052 ++NumVectorInstructions; 02053 02054 if (Instruction *I = dyn_cast<Instruction>(V)) 02055 return propagateMetadata(I, E->Scalars); 02056 02057 return V; 02058 } 02059 case Instruction::Load: { 02060 // Loads are inserted at the head of the tree because we don't want to 02061 // sink them all the way down past store instructions. 02062 setInsertPointAfterBundle(E->Scalars); 02063 02064 LoadInst *LI = cast<LoadInst>(VL0); 02065 Type *ScalarLoadTy = LI->getType(); 02066 unsigned AS = LI->getPointerAddressSpace(); 02067 02068 Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), 02069 VecTy->getPointerTo(AS)); 02070 02071 // The pointer operand uses an in-tree scalar so we add the new BitCast to 02072 // ExternalUses list to make sure that an extract will be generated in the 02073 // future. 02074 if (ScalarToTreeEntry.count(LI->getPointerOperand())) 02075 ExternalUses.push_back( 02076 ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0)); 02077 02078 unsigned Alignment = LI->getAlignment(); 02079 LI = Builder.CreateLoad(VecPtr); 02080 if (!Alignment) 02081 Alignment = DL->getABITypeAlignment(ScalarLoadTy); 02082 LI->setAlignment(Alignment); 02083 E->VectorizedValue = LI; 02084 ++NumVectorInstructions; 02085 return propagateMetadata(LI, E->Scalars); 02086 } 02087 case Instruction::Store: { 02088 StoreInst *SI = cast<StoreInst>(VL0); 02089 unsigned Alignment = SI->getAlignment(); 02090 unsigned AS = SI->getPointerAddressSpace(); 02091 02092 ValueList ValueOp; 02093 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 02094 ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand()); 02095 02096 setInsertPointAfterBundle(E->Scalars); 02097 02098 Value *VecValue = vectorizeTree(ValueOp); 02099 Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), 02100 VecTy->getPointerTo(AS)); 02101 StoreInst *S = Builder.CreateStore(VecValue, VecPtr); 02102 02103 // The pointer operand uses an in-tree scalar so we add the new BitCast to 02104 // ExternalUses list to make sure that an extract will be generated in the 02105 // future. 
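// For illustration: if the stored-to pointer is itself one of the scalars
// being vectorized by this tree, the scalar pointer will be erased once the
// tree is vectorized, so the bitcast created above is recorded as an external
// user and an extract of lane 0 is generated for it later.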
02106 if (ScalarToTreeEntry.count(SI->getPointerOperand())) 02107 ExternalUses.push_back( 02108 ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0)); 02109 02110 if (!Alignment) 02111 Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); 02112 S->setAlignment(Alignment); 02113 E->VectorizedValue = S; 02114 ++NumVectorInstructions; 02115 return propagateMetadata(S, E->Scalars); 02116 } 02117 case Instruction::GetElementPtr: { 02118 setInsertPointAfterBundle(E->Scalars); 02119 02120 ValueList Op0VL; 02121 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 02122 Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0)); 02123 02124 Value *Op0 = vectorizeTree(Op0VL); 02125 02126 std::vector<Value *> OpVecs; 02127 for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; 02128 ++j) { 02129 ValueList OpVL; 02130 for (int i = 0, e = E->Scalars.size(); i < e; ++i) 02131 OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j)); 02132 02133 Value *OpVec = vectorizeTree(OpVL); 02134 OpVecs.push_back(OpVec); 02135 } 02136 02137 Value *V = Builder.CreateGEP(Op0, OpVecs); 02138 E->VectorizedValue = V; 02139 ++NumVectorInstructions; 02140 02141 if (Instruction *I = dyn_cast<Instruction>(V)) 02142 return propagateMetadata(I, E->Scalars); 02143 02144 return V; 02145 } 02146 case Instruction::Call: { 02147 CallInst *CI = cast<CallInst>(VL0); 02148 setInsertPointAfterBundle(E->Scalars); 02149 Function *FI; 02150 Intrinsic::ID IID = Intrinsic::not_intrinsic; 02151 Value *ScalarArg = nullptr; 02152 if (CI && (FI = CI->getCalledFunction())) { 02153 IID = (Intrinsic::ID) FI->getIntrinsicID(); 02154 } 02155 std::vector<Value *> OpVecs; 02156 for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { 02157 ValueList OpVL; 02158 // ctlz,cttz and powi are special intrinsics whose second argument is 02159 // a scalar. This argument should not be vectorized. 02160 if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { 02161 CallInst *CEI = cast<CallInst>(E->Scalars[0]); 02162 ScalarArg = CEI->getArgOperand(j); 02163 OpVecs.push_back(CEI->getArgOperand(j)); 02164 continue; 02165 } 02166 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 02167 CallInst *CEI = cast<CallInst>(E->Scalars[i]); 02168 OpVL.push_back(CEI->getArgOperand(j)); 02169 } 02170 02171 Value *OpVec = vectorizeTree(OpVL); 02172 DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); 02173 OpVecs.push_back(OpVec); 02174 } 02175 02176 Module *M = F->getParent(); 02177 Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); 02178 Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) }; 02179 Function *CF = Intrinsic::getDeclaration(M, ID, Tys); 02180 Value *V = Builder.CreateCall(CF, OpVecs); 02181 02182 // The scalar argument uses an in-tree scalar so we add the new vectorized 02183 // call to ExternalUses list to make sure that an extract will be 02184 // generated in the future. 
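// For illustration: for a bundle of llvm.powi calls the exponent stays scalar
// (see the hasVectorInstrinsicScalarOpd check above); if that exponent is
// itself part of the tree, the new vector call is recorded as an external
// user of it so an extract is emitted instead of the soon-to-be-erased scalar.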
02185 if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) 02186 ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); 02187 02188 E->VectorizedValue = V; 02189 ++NumVectorInstructions; 02190 return V; 02191 } 02192 case Instruction::ShuffleVector: { 02193 ValueList LHSVL, RHSVL; 02194 for (int i = 0, e = E->Scalars.size(); i < e; ++i) { 02195 LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0)); 02196 RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1)); 02197 } 02198 setInsertPointAfterBundle(E->Scalars); 02199 02200 Value *LHS = vectorizeTree(LHSVL); 02201 Value *RHS = vectorizeTree(RHSVL); 02202 02203 if (Value *V = alreadyVectorized(E->Scalars)) 02204 return V; 02205 02206 // Create a vector of LHS op1 RHS 02207 BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0); 02208 Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); 02209 02210 // Create a vector of LHS op2 RHS 02211 Instruction *VL1 = cast<Instruction>(E->Scalars[1]); 02212 BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1); 02213 Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); 02214 02215 // Create shuffle to take alternate operations from the vector. 02216 // Also, gather up odd and even scalar ops to propagate IR flags to 02217 // each vector operation. 02218 ValueList OddScalars, EvenScalars; 02219 unsigned e = E->Scalars.size(); 02220 SmallVector<Constant *, 8> Mask(e); 02221 for (unsigned i = 0; i < e; ++i) { 02222 if (i & 1) { 02223 Mask[i] = Builder.getInt32(e + i); 02224 OddScalars.push_back(E->Scalars[i]); 02225 } else { 02226 Mask[i] = Builder.getInt32(i); 02227 EvenScalars.push_back(E->Scalars[i]); 02228 } 02229 } 02230 02231 Value *ShuffleMask = ConstantVector::get(Mask); 02232 propagateIRFlags(V0, EvenScalars); 02233 propagateIRFlags(V1, OddScalars); 02234 02235 Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); 02236 E->VectorizedValue = V; 02237 ++NumVectorInstructions; 02238 if (Instruction *I = dyn_cast<Instruction>(V)) 02239 return propagateMetadata(I, E->Scalars); 02240 02241 return V; 02242 } 02243 default: 02244 llvm_unreachable("unknown inst"); 02245 } 02246 return nullptr; 02247 } 02248 02249 Value *BoUpSLP::vectorizeTree() { 02250 02251 // All blocks must be scheduled before any instructions are inserted. 02252 for (auto &BSIter : BlocksSchedules) { 02253 scheduleBlock(BSIter.second.get()); 02254 } 02255 02256 Builder.SetInsertPoint(F->getEntryBlock().begin()); 02257 vectorizeTree(&VectorizableTree[0]); 02258 02259 DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); 02260 02261 // Extract all of the elements with the external uses. 02262 for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end(); 02263 it != e; ++it) { 02264 Value *Scalar = it->Scalar; 02265 llvm::User *User = it->User; 02266 02267 // Skip users that we already RAUW. This happens when one instruction 02268 // has multiple uses of the same value. 02269 if (std::find(Scalar->user_begin(), Scalar->user_end(), User) == 02270 Scalar->user_end()) 02271 continue; 02272 assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); 02273 02274 int Idx = ScalarToTreeEntry[Scalar]; 02275 TreeEntry *E = &VectorizableTree[Idx]; 02276 assert(!E->NeedToGather && "Extracting from a gather list"); 02277 02278 Value *Vec = E->VectorizedValue; 02279 assert(Vec && "Can't find vectorizable value"); 02280 02281 Value *Lane = Builder.getInt32(it->Lane); 02282 // Generate extracts for out-of-tree users. 
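// The extract is placed as close to its user as possible: for a phi user it
// goes before the terminator of the matching incoming block, for any other
// instruction it goes right before that user, and if the vectorized value is
// not an instruction the extract is emitted in the function entry block.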
02283 // Find the insertion point for the extractelement lane. 02284 if (isa<Instruction>(Vec)){ 02285 if (PHINode *PH = dyn_cast<PHINode>(User)) { 02286 for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) { 02287 if (PH->getIncomingValue(i) == Scalar) { 02288 Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); 02289 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 02290 CSEBlocks.insert(PH->getIncomingBlock(i)); 02291 PH->setOperand(i, Ex); 02292 } 02293 } 02294 } else { 02295 Builder.SetInsertPoint(cast<Instruction>(User)); 02296 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 02297 CSEBlocks.insert(cast<Instruction>(User)->getParent()); 02298 User->replaceUsesOfWith(Scalar, Ex); 02299 } 02300 } else { 02301 Builder.SetInsertPoint(F->getEntryBlock().begin()); 02302 Value *Ex = Builder.CreateExtractElement(Vec, Lane); 02303 CSEBlocks.insert(&F->getEntryBlock()); 02304 User->replaceUsesOfWith(Scalar, Ex); 02305 } 02306 02307 DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); 02308 } 02309 02310 // For each vectorized value: 02311 for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { 02312 TreeEntry *Entry = &VectorizableTree[EIdx]; 02313 02314 // For each lane: 02315 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { 02316 Value *Scalar = Entry->Scalars[Lane]; 02317 // No need to handle users of gathered values. 02318 if (Entry->NeedToGather) 02319 continue; 02320 02321 assert(Entry->VectorizedValue && "Can't find vectorizable value"); 02322 02323 Type *Ty = Scalar->getType(); 02324 if (!Ty->isVoidTy()) { 02325 #ifndef NDEBUG 02326 for (User *U : Scalar->users()) { 02327 DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); 02328 02329 assert((ScalarToTreeEntry.count(U) || 02330 // It is legal to replace users in the ignorelist by undef. 02331 (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != 02332 UserIgnoreList.end())) && 02333 "Replacing out-of-tree value with undef"); 02334 } 02335 #endif 02336 Value *Undef = UndefValue::get(Ty); 02337 Scalar->replaceAllUsesWith(Undef); 02338 } 02339 DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); 02340 cast<Instruction>(Scalar)->eraseFromParent(); 02341 } 02342 } 02343 02344 Builder.ClearInsertionPoint(); 02345 02346 return VectorizableTree[0].VectorizedValue; 02347 } 02348 02349 void BoUpSLP::optimizeGatherSequence() { 02350 DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size() 02351 << " gather sequences instructions.\n"); 02352 // LICM InsertElementInst sequences. 02353 for (SetVector<Instruction *>::iterator it = GatherSeq.begin(), 02354 e = GatherSeq.end(); it != e; ++it) { 02355 InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); 02356 02357 if (!Insert) 02358 continue; 02359 02360 // Check if this block is inside a loop. 02361 Loop *L = LI->getLoopFor(Insert->getParent()); 02362 if (!L) 02363 continue; 02364 02365 // Check if it has a preheader. 02366 BasicBlock *PreHeader = L->getLoopPreheader(); 02367 if (!PreHeader) 02368 continue; 02369 02370 // If the vector or the element that we insert into it are 02371 // instructions that are defined in this basic block then we can't 02372 // hoist this instruction. 02373 Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); 02374 Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); 02375 if (CurrVec && L->contains(CurrVec)) 02376 continue; 02377 if (NewElem && L->contains(NewElem)) 02378 continue; 02379 02380 // We can hoist this instruction. 
Move it to the pre-header. 02381 Insert->moveBefore(PreHeader->getTerminator()); 02382 } 02383 02384 // Make a list of all reachable blocks in our CSE queue. 02385 SmallVector<const DomTreeNode *, 8> CSEWorkList; 02386 CSEWorkList.reserve(CSEBlocks.size()); 02387 for (BasicBlock *BB : CSEBlocks) 02388 if (DomTreeNode *N = DT->getNode(BB)) { 02389 assert(DT->isReachableFromEntry(N)); 02390 CSEWorkList.push_back(N); 02391 } 02392 02393 // Sort blocks by domination. This ensures we visit a block after all blocks 02394 // dominating it are visited. 02395 std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), 02396 [this](const DomTreeNode *A, const DomTreeNode *B) { 02397 return DT->properlyDominates(A, B); 02398 }); 02399 02400 // Perform O(N^2) search over the gather sequences and merge identical 02401 // instructions. TODO: We can further optimize this scan if we split the 02402 // instructions into different buckets based on the insert lane. 02403 SmallVector<Instruction *, 16> Visited; 02404 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { 02405 assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && 02406 "Worklist not sorted properly!"); 02407 BasicBlock *BB = (*I)->getBlock(); 02408 // For all instructions in blocks containing gather sequences: 02409 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { 02410 Instruction *In = it++; 02411 if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) 02412 continue; 02413 02414 // Check if we can replace this instruction with any of the 02415 // visited instructions. 02416 for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(), 02417 ve = Visited.end(); 02418 v != ve; ++v) { 02419 if (In->isIdenticalTo(*v) && 02420 DT->dominates((*v)->getParent(), In->getParent())) { 02421 In->replaceAllUsesWith(*v); 02422 In->eraseFromParent(); 02423 In = nullptr; 02424 break; 02425 } 02426 } 02427 if (In) { 02428 assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end()); 02429 Visited.push_back(In); 02430 } 02431 } 02432 } 02433 CSEBlocks.clear(); 02434 GatherSeq.clear(); 02435 } 02436 02437 // Groups the instructions to a bundle (which is then a single scheduling entity) 02438 // and schedules instructions until the bundle gets ready. 02439 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, 02440 AliasAnalysis *AA) { 02441 if (isa<PHINode>(VL[0])) 02442 return true; 02443 02444 // Initialize the instruction bundle. 02445 Instruction *OldScheduleEnd = ScheduleEnd; 02446 ScheduleData *PrevInBundle = nullptr; 02447 ScheduleData *Bundle = nullptr; 02448 bool ReSchedule = false; 02449 DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); 02450 for (Value *V : VL) { 02451 extendSchedulingRegion(V); 02452 ScheduleData *BundleMember = getScheduleData(V); 02453 assert(BundleMember && 02454 "no ScheduleData for bundle member (maybe not in same basic block)"); 02455 if (BundleMember->IsScheduled) { 02456 // A bundle member was scheduled as single instruction before and now 02457 // needs to be scheduled as part of the bundle. We just get rid of the 02458 // existing schedule. 
02459 DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember 02460 << " was already scheduled\n"); 02461 ReSchedule = true; 02462 } 02463 assert(BundleMember->isSchedulingEntity() && 02464 "bundle member already part of other bundle"); 02465 if (PrevInBundle) { 02466 PrevInBundle->NextInBundle = BundleMember; 02467 } else { 02468 Bundle = BundleMember; 02469 } 02470 BundleMember->UnscheduledDepsInBundle = 0; 02471 Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; 02472 02473 // Group the instructions to a bundle. 02474 BundleMember->FirstInBundle = Bundle; 02475 PrevInBundle = BundleMember; 02476 } 02477 if (ScheduleEnd != OldScheduleEnd) { 02478 // The scheduling region got new instructions at the lower end (or it is a 02479 // new region for the first bundle). This makes it necessary to 02480 // recalculate all dependencies. 02481 // It is seldom that this needs to be done a second time after adding the 02482 // initial bundle to the region. 02483 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 02484 ScheduleData *SD = getScheduleData(I); 02485 SD->clearDependencies(); 02486 } 02487 ReSchedule = true; 02488 } 02489 if (ReSchedule) { 02490 resetSchedule(); 02491 initialFillReadyList(ReadyInsts); 02492 } 02493 02494 DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " 02495 << BB->getName() << "\n"); 02496 02497 calculateDependencies(Bundle, true, AA); 02498 02499 // Now try to schedule the new bundle. As soon as the bundle is "ready" it 02500 // means that there are no cyclic dependencies and we can schedule it. 02501 // Note that's important that we don't "schedule" the bundle yet (see 02502 // cancelScheduling). 02503 while (!Bundle->isReady() && !ReadyInsts.empty()) { 02504 02505 ScheduleData *pickedSD = ReadyInsts.back(); 02506 ReadyInsts.pop_back(); 02507 02508 if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { 02509 schedule(pickedSD, ReadyInsts); 02510 } 02511 } 02512 return Bundle->isReady(); 02513 } 02514 02515 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) { 02516 if (isa<PHINode>(VL[0])) 02517 return; 02518 02519 ScheduleData *Bundle = getScheduleData(VL[0]); 02520 DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); 02521 assert(!Bundle->IsScheduled && 02522 "Can't cancel bundle which is already scheduled"); 02523 assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && 02524 "tried to unbundle something which is not a bundle"); 02525 02526 // Un-bundle: make single instructions out of the bundle. 02527 ScheduleData *BundleMember = Bundle; 02528 while (BundleMember) { 02529 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); 02530 BundleMember->FirstInBundle = BundleMember; 02531 ScheduleData *Next = BundleMember->NextInBundle; 02532 BundleMember->NextInBundle = nullptr; 02533 BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; 02534 if (BundleMember->UnscheduledDepsInBundle == 0) { 02535 ReadyInsts.insert(BundleMember); 02536 } 02537 BundleMember = Next; 02538 } 02539 } 02540 02541 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { 02542 if (getScheduleData(V)) 02543 return; 02544 Instruction *I = dyn_cast<Instruction>(V); 02545 assert(I && "bundle member must be an instruction"); 02546 assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); 02547 if (!ScheduleStart) { 02548 // It's the first instruction in the new region. 
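// The scheduling region is the half-open range [ScheduleStart, ScheduleEnd)
// of instructions in this block; it initially covers only I and is grown
// below when later bundle members turn out to lie outside of it.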
02549 initScheduleData(I, I->getNextNode(), nullptr, nullptr); 02550 ScheduleStart = I; 02551 ScheduleEnd = I->getNextNode(); 02552 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 02553 DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); 02554 return; 02555 } 02556 // Search up and down at the same time, because we don't know if the new 02557 // instruction is above or below the existing scheduling region. 02558 BasicBlock::reverse_iterator UpIter(ScheduleStart); 02559 BasicBlock::reverse_iterator UpperEnd = BB->rend(); 02560 BasicBlock::iterator DownIter(ScheduleEnd); 02561 BasicBlock::iterator LowerEnd = BB->end(); 02562 for (;;) { 02563 if (UpIter != UpperEnd) { 02564 if (&*UpIter == I) { 02565 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); 02566 ScheduleStart = I; 02567 DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); 02568 return; 02569 } 02570 UpIter++; 02571 } 02572 if (DownIter != LowerEnd) { 02573 if (&*DownIter == I) { 02574 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, 02575 nullptr); 02576 ScheduleEnd = I->getNextNode(); 02577 assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); 02578 DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); 02579 return; 02580 } 02581 DownIter++; 02582 } 02583 assert((UpIter != UpperEnd || DownIter != LowerEnd) && 02584 "instruction not found in block"); 02585 } 02586 } 02587 02588 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, 02589 Instruction *ToI, 02590 ScheduleData *PrevLoadStore, 02591 ScheduleData *NextLoadStore) { 02592 ScheduleData *CurrentLoadStore = PrevLoadStore; 02593 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { 02594 ScheduleData *SD = ScheduleDataMap[I]; 02595 if (!SD) { 02596 // Allocate a new ScheduleData for the instruction. 02597 if (ChunkPos >= ChunkSize) { 02598 ScheduleDataChunks.push_back( 02599 llvm::make_unique<ScheduleData[]>(ChunkSize)); 02600 ChunkPos = 0; 02601 } 02602 SD = &(ScheduleDataChunks.back()[ChunkPos++]); 02603 ScheduleDataMap[I] = SD; 02604 SD->Inst = I; 02605 } 02606 assert(!isInSchedulingRegion(SD) && 02607 "new ScheduleData already in scheduling region"); 02608 SD->init(SchedulingRegionID); 02609 02610 if (I->mayReadOrWriteMemory()) { 02611 // Update the linked list of memory accessing instructions. 02612 if (CurrentLoadStore) { 02613 CurrentLoadStore->NextLoadStore = SD; 02614 } else { 02615 FirstLoadStoreInRegion = SD; 02616 } 02617 CurrentLoadStore = SD; 02618 } 02619 } 02620 if (NextLoadStore) { 02621 if (CurrentLoadStore) 02622 CurrentLoadStore->NextLoadStore = NextLoadStore; 02623 } else { 02624 LastLoadStoreInRegion = CurrentLoadStore; 02625 } 02626 } 02627 02628 /// \returns the AA location that is being access by the instruction. 
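/// Only loads and stores are handled; any other instruction yields an empty
/// Location whose null pointer makes the caller below conservatively assume
/// a possible alias.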
02629 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) { 02630 if (StoreInst *SI = dyn_cast<StoreInst>(I)) 02631 return AA->getLocation(SI); 02632 if (LoadInst *LI = dyn_cast<LoadInst>(I)) 02633 return AA->getLocation(LI); 02634 return AliasAnalysis::Location(); 02635 } 02636 02637 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD, 02638 bool InsertInReadyList, 02639 AliasAnalysis *AA) { 02640 assert(SD->isSchedulingEntity()); 02641 02642 SmallVector<ScheduleData *, 10> WorkList; 02643 WorkList.push_back(SD); 02644 02645 while (!WorkList.empty()) { 02646 ScheduleData *SD = WorkList.back(); 02647 WorkList.pop_back(); 02648 02649 ScheduleData *BundleMember = SD; 02650 while (BundleMember) { 02651 assert(isInSchedulingRegion(BundleMember)); 02652 if (!BundleMember->hasValidDependencies()) { 02653 02654 DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); 02655 BundleMember->Dependencies = 0; 02656 BundleMember->resetUnscheduledDeps(); 02657 02658 // Handle def-use chain dependencies. 02659 for (User *U : BundleMember->Inst->users()) { 02660 if (isa<Instruction>(U)) { 02661 ScheduleData *UseSD = getScheduleData(U); 02662 if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { 02663 BundleMember->Dependencies++; 02664 ScheduleData *DestBundle = UseSD->FirstInBundle; 02665 if (!DestBundle->IsScheduled) { 02666 BundleMember->incrementUnscheduledDeps(1); 02667 } 02668 if (!DestBundle->hasValidDependencies()) { 02669 WorkList.push_back(DestBundle); 02670 } 02671 } 02672 } else { 02673 // I'm not sure if this can ever happen. But we need to be safe. 02674 // This keeps the instruction/bundle from ever being scheduled and 02675 // eventually disables vectorization. 02676 BundleMember->Dependencies++; 02677 BundleMember->incrementUnscheduledDeps(1); 02678 } 02679 } 02680 02681 // Handle the memory dependencies.
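// For illustration (made-up names): given
//   store i32 %v, i32* %p      ; the current bundle member
//   %x = load i32* %q          ; a later load/store in the region
// a dependency is recorded between the two unless AA can prove that %p and %q
// do not alias; two accesses that both only read never create a dependency.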
02682 ScheduleData *DepDest = BundleMember->NextLoadStore; 02683 if (DepDest) { 02684 AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA); 02685 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); 02686 02687 while (DepDest) { 02688 assert(isInSchedulingRegion(DepDest)); 02689 if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) { 02690 AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA); 02691 if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) { 02692 DepDest->MemoryDependencies.push_back(BundleMember); 02693 BundleMember->Dependencies++; 02694 ScheduleData *DestBundle = DepDest->FirstInBundle; 02695 if (!DestBundle->IsScheduled) { 02696 BundleMember->incrementUnscheduledDeps(1); 02697 } 02698 if (!DestBundle->hasValidDependencies()) { 02699 WorkList.push_back(DestBundle); 02700 } 02701 } 02702 } 02703 DepDest = DepDest->NextLoadStore; 02704 } 02705 } 02706 } 02707 BundleMember = BundleMember->NextInBundle; 02708 } 02709 if (InsertInReadyList && SD->isReady()) { 02710 ReadyInsts.push_back(SD); 02711 DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n"); 02712 } 02713 } 02714 } 02715 02716 void BoUpSLP::BlockScheduling::resetSchedule() { 02717 assert(ScheduleStart && 02718 "tried to reset schedule on block which has not been scheduled"); 02719 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { 02720 ScheduleData *SD = getScheduleData(I); 02721 assert(isInSchedulingRegion(SD)); 02722 SD->IsScheduled = false; 02723 SD->resetUnscheduledDeps(); 02724 } 02725 ReadyInsts.clear(); 02726 } 02727 02728 void BoUpSLP::scheduleBlock(BlockScheduling *BS) { 02729 02730 if (!BS->ScheduleStart) 02731 return; 02732 02733 DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); 02734 02735 BS->resetSchedule(); 02736 02737 // For the real scheduling we use a more sophisticated ready-list: it is 02738 // sorted by the original instruction location. This lets the final schedule 02739 // be as close as possible to the original instruction order. 02740 struct ScheduleDataCompare { 02741 bool operator()(ScheduleData *SD1, ScheduleData *SD2) { 02742 return SD2->SchedulingPriority < SD1->SchedulingPriority; 02743 } 02744 }; 02745 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts; 02746 02747 // Ensure that all depencency data is updated and fill the ready-list with 02748 // initial instructions. 02749 int Idx = 0; 02750 int NumToSchedule = 0; 02751 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; 02752 I = I->getNextNode()) { 02753 ScheduleData *SD = BS->getScheduleData(I); 02754 assert( 02755 SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && 02756 "scheduler and vectorizer have different opinion on what is a bundle"); 02757 SD->FirstInBundle->SchedulingPriority = Idx++; 02758 if (SD->isSchedulingEntity()) { 02759 BS->calculateDependencies(SD, false, AA); 02760 NumToSchedule++; 02761 } 02762 } 02763 BS->initialFillReadyList(ReadyInsts); 02764 02765 Instruction *LastScheduledInst = BS->ScheduleEnd; 02766 02767 // Do the "real" scheduling. 02768 while (!ReadyInsts.empty()) { 02769 ScheduleData *picked = *ReadyInsts.begin(); 02770 ReadyInsts.erase(ReadyInsts.begin()); 02771 02772 // Move the scheduled instruction(s) to their dedicated places, if not 02773 // there yet. 
02774 ScheduleData *BundleMember = picked; 02775 while (BundleMember) { 02776 Instruction *pickedInst = BundleMember->Inst; 02777 if (LastScheduledInst->getNextNode() != pickedInst) { 02778 BS->BB->getInstList().remove(pickedInst); 02779 BS->BB->getInstList().insert(LastScheduledInst, pickedInst); 02780 } 02781 LastScheduledInst = pickedInst; 02782 BundleMember = BundleMember->NextInBundle; 02783 } 02784 02785 BS->schedule(picked, ReadyInsts); 02786 NumToSchedule--; 02787 } 02788 assert(NumToSchedule == 0 && "could not schedule all instructions"); 02789 02790 // Avoid duplicate scheduling of the block. 02791 BS->ScheduleStart = nullptr; 02792 } 02793 02794 /// The SLPVectorizer Pass. 02795 struct SLPVectorizer : public FunctionPass { 02796 typedef SmallVector<StoreInst *, 8> StoreList; 02797 typedef MapVector<Value *, StoreList> StoreListMap; 02798 02799 /// Pass identification, replacement for typeid 02800 static char ID; 02801 02802 explicit SLPVectorizer() : FunctionPass(ID) { 02803 initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); 02804 } 02805 02806 ScalarEvolution *SE; 02807 const DataLayout *DL; 02808 TargetTransformInfo *TTI; 02809 TargetLibraryInfo *TLI; 02810 AliasAnalysis *AA; 02811 LoopInfo *LI; 02812 DominatorTree *DT; 02813 02814 bool runOnFunction(Function &F) override { 02815 if (skipOptnoneFunction(F)) 02816 return false; 02817 02818 SE = &getAnalysis<ScalarEvolution>(); 02819 DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>(); 02820 DL = DLP ? &DLP->getDataLayout() : nullptr; 02821 TTI = &getAnalysis<TargetTransformInfo>(); 02822 TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); 02823 AA = &getAnalysis<AliasAnalysis>(); 02824 LI = &getAnalysis<LoopInfo>(); 02825 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 02826 02827 StoreRefs.clear(); 02828 bool Changed = false; 02829 02830 // If the target claims to have no vector registers don't attempt 02831 // vectorization. 02832 if (!TTI->getNumberOfRegisters(true)) 02833 return false; 02834 02835 // Must have DataLayout. We can't require it because some tests run w/o 02836 // triple. 02837 if (!DL) 02838 return false; 02839 02840 // Don't vectorize when the attribute NoImplicitFloat is used. 02841 if (F.hasFnAttribute(Attribute::NoImplicitFloat)) 02842 return false; 02843 02844 DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n"); 02845 02846 // Use the bottom up slp vectorizer to construct chains that start with 02847 // store instructions. 02848 BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT); 02849 02850 // Scan the blocks in the function in post order. 02851 for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()), 02852 e = po_end(&F.getEntryBlock()); it != e; ++it) { 02853 BasicBlock *BB = *it; 02854 // Vectorize trees that end at stores. 02855 if (unsigned count = collectStores(BB, R)) { 02856 (void)count; 02857 DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); 02858 Changed |= vectorizeStoreChains(R); 02859 } 02860 02861 // Vectorize trees that end at reductions. 
02862 Changed |= vectorizeChainsInBlock(BB, R); 02863 } 02864 02865 if (Changed) { 02866 R.optimizeGatherSequence(); 02867 DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n"); 02868 DEBUG(verifyFunction(F)); 02869 } 02870 return Changed; 02871 } 02872 02873 void getAnalysisUsage(AnalysisUsage &AU) const override { 02874 FunctionPass::getAnalysisUsage(AU); 02875 AU.addRequired<ScalarEvolution>(); 02876 AU.addRequired<AliasAnalysis>(); 02877 AU.addRequired<TargetTransformInfo>(); 02878 AU.addRequired<LoopInfo>(); 02879 AU.addRequired<DominatorTreeWrapperPass>(); 02880 AU.addPreserved<LoopInfo>(); 02881 AU.addPreserved<DominatorTreeWrapperPass>(); 02882 AU.setPreservesCFG(); 02883 } 02884 02885 private: 02886 02887 /// \brief Collect memory references and sort them according to their base 02888 /// object. We sort the stores to their base objects to reduce the cost of the 02889 /// quadratic search on the stores. TODO: We can further reduce this cost 02890 /// if we flush the chain creation every time we run into a memory barrier. 02891 unsigned collectStores(BasicBlock *BB, BoUpSLP &R); 02892 02893 /// \brief Try to vectorize a chain that starts at two arithmetic instrs. 02894 bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); 02895 02896 /// \brief Try to vectorize a list of operands. 02897 /// \@param BuildVector A list of users to ignore for the purpose of 02898 /// scheduling and that don't need extracting. 02899 /// \returns true if a value was vectorized. 02900 bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 02901 ArrayRef<Value *> BuildVector = None, 02902 bool allowReorder = false); 02903 02904 /// \brief Try to vectorize a chain that may start at the operands of \V; 02905 bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); 02906 02907 /// \brief Vectorize the stores that were collected in StoreRefs. 02908 bool vectorizeStoreChains(BoUpSLP &R); 02909 02910 /// \brief Scan the basic block and look for patterns that are likely to start 02911 /// a vectorization chain. 02912 bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); 02913 02914 bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, 02915 BoUpSLP &R); 02916 02917 bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, 02918 BoUpSLP &R); 02919 private: 02920 StoreListMap StoreRefs; 02921 }; 02922 02923 /// \brief Check that the Values in the slice in VL array are still existent in 02924 /// the WeakVH array. 02925 /// Vectorization of part of the VL array may cause later values in the VL array 02926 /// to become invalid. We track when this has happened in the WeakVH array. 02927 static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL, 02928 SmallVectorImpl<WeakVH> &VH, 02929 unsigned SliceBegin, 02930 unsigned SliceSize) { 02931 for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i) 02932 if (VH[i] != VL[i]) 02933 return true; 02934 02935 return false; 02936 } 02937 02938 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, 02939 int CostThreshold, BoUpSLP &R) { 02940 unsigned ChainLen = Chain.size(); 02941 DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen 02942 << "\n"); 02943 Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); 02944 unsigned Sz = DL->getTypeSizeInBits(StoreTy); 02945 unsigned VF = MinVecRegSize / Sz; 02946 02947 if (!isPowerOf2_32(Sz) || VF < 2) 02948 return false; 02949 02950 // Keep track of values that were deleted by vectorizing in the loop below. 
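// For illustration: with ChainLen = 6 and VF = 4 the loop below tries the
// slices starting at offsets 0, 1 and 2; when a slice is vectorized, i is
// advanced past it, and the WeakVH check skips slices whose stores were
// already deleted by an earlier vectorization.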
02951 SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end()); 02952 02953 bool Changed = false; 02954 // Look for profitable vectorizable trees at all offsets, starting at zero. 02955 for (unsigned i = 0, e = ChainLen; i < e; ++i) { 02956 if (i + VF > e) 02957 break; 02958 02959 // Check that a previous iteration of this loop did not delete the Value. 02960 if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) 02961 continue; 02962 02963 DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i 02964 << "\n"); 02965 ArrayRef<Value *> Operands = Chain.slice(i, VF); 02966 02967 R.buildTree(Operands); 02968 02969 int Cost = R.getTreeCost(); 02970 02971 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); 02972 if (Cost < CostThreshold) { 02973 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); 02974 R.vectorizeTree(); 02975 02976 // Move to the next bundle. 02977 i += VF - 1; 02978 Changed = true; 02979 } 02980 } 02981 02982 return Changed; 02983 } 02984 02985 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, 02986 int costThreshold, BoUpSLP &R) { 02987 SetVector<Value *> Heads, Tails; 02988 SmallDenseMap<Value *, Value *> ConsecutiveChain; 02989 02990 // We may run into multiple chains that merge into a single chain. We mark the 02991 // stores that we vectorized so that we don't visit the same store twice. 02992 BoUpSLP::ValueSet VectorizedStores; 02993 bool Changed = false; 02994 02995 // Do a quadratic search on all of the given stores and find 02996 // all of the pairs of stores that follow each other. 02997 for (unsigned i = 0, e = Stores.size(); i < e; ++i) { 02998 for (unsigned j = 0; j < e; ++j) { 02999 if (i == j) 03000 continue; 03001 03002 if (R.isConsecutiveAccess(Stores[i], Stores[j])) { 03003 Tails.insert(Stores[j]); 03004 Heads.insert(Stores[i]); 03005 ConsecutiveChain[Stores[i]] = Stores[j]; 03006 } 03007 } 03008 } 03009 03010 // For stores that start but don't end a link in the chain: 03011 for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end(); 03012 it != e; ++it) { 03013 if (Tails.count(*it)) 03014 continue; 03015 03016 // We found a store instr that starts a chain. Now follow the chain and try 03017 // to vectorize it. 03018 BoUpSLP::ValueList Operands; 03019 Value *I = *it; 03020 // Collect the chain into a list. 03021 while (Tails.count(I) || Heads.count(I)) { 03022 if (VectorizedStores.count(I)) 03023 break; 03024 Operands.push_back(I); 03025 // Move to the next value in the chain. 03026 I = ConsecutiveChain[I]; 03027 } 03028 03029 bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R); 03030 03031 // Mark the vectorized stores so that we don't vectorize them again. 03032 if (Vectorized) 03033 VectorizedStores.insert(Operands.begin(), Operands.end()); 03034 Changed |= Vectorized; 03035 } 03036 03037 return Changed; 03038 } 03039 03040 03041 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { 03042 unsigned count = 0; 03043 StoreRefs.clear(); 03044 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { 03045 StoreInst *SI = dyn_cast<StoreInst>(it); 03046 if (!SI) 03047 continue; 03048 03049 // Don't touch volatile stores. 03050 if (!SI->isSimple()) 03051 continue; 03052 03053 // Check that the pointer points to scalars. 03054 Type *Ty = SI->getValueOperand()->getType(); 03055 if (Ty->isAggregateType() || Ty->isVectorTy()) 03056 continue; 03057 03058 // Find the base pointer. 
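// For illustration: stores to a[i] and a[i+1] both report the array 'a' as
// their underlying object, so they land in the same StoreRefs bucket, and the
// later quadratic consecutive-access search only has to look within a bucket.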
03059 Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); 03060 03061 // Save the store locations. 03062 StoreRefs[Ptr].push_back(SI); 03063 count++; 03064 } 03065 return count; 03066 } 03067 03068 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { 03069 if (!A || !B) 03070 return false; 03071 Value *VL[] = { A, B }; 03072 return tryToVectorizeList(VL, R, None, true); 03073 } 03074 03075 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, 03076 ArrayRef<Value *> BuildVector, 03077 bool allowReorder) { 03078 if (VL.size() < 2) 03079 return false; 03080 03081 DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n"); 03082 03083 // Check that all of the parts are scalar instructions of the same type. 03084 Instruction *I0 = dyn_cast<Instruction>(VL[0]); 03085 if (!I0) 03086 return false; 03087 03088 unsigned Opcode0 = I0->getOpcode(); 03089 03090 Type *Ty0 = I0->getType(); 03091 unsigned Sz = DL->getTypeSizeInBits(Ty0); 03092 unsigned VF = MinVecRegSize / Sz; 03093 03094 for (int i = 0, e = VL.size(); i < e; ++i) { 03095 Type *Ty = VL[i]->getType(); 03096 if (Ty->isAggregateType() || Ty->isVectorTy()) 03097 return false; 03098 Instruction *Inst = dyn_cast<Instruction>(VL[i]); 03099 if (!Inst || Inst->getOpcode() != Opcode0) 03100 return false; 03101 } 03102 03103 bool Changed = false; 03104 03105 // Keep track of values that were deleted by vectorizing in the loop below. 03106 SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end()); 03107 03108 for (unsigned i = 0, e = VL.size(); i < e; ++i) { 03109 unsigned OpsWidth = 0; 03110 03111 if (i + VF > e) 03112 OpsWidth = e - i; 03113 else 03114 OpsWidth = VF; 03115 03116 if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) 03117 break; 03118 03119 // Check that a previous iteration of this loop did not delete the Value. 03120 if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth)) 03121 continue; 03122 03123 DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " 03124 << "\n"); 03125 ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); 03126 03127 ArrayRef<Value *> BuildVectorSlice; 03128 if (!BuildVector.empty()) 03129 BuildVectorSlice = BuildVector.slice(i, OpsWidth); 03130 03131 R.buildTree(Ops, BuildVectorSlice); 03132 // TODO: check if we can allow reordering also for other cases than 03133 // tryToVectorizePair() 03134 if (allowReorder && R.shouldReorder()) { 03135 assert(Ops.size() == 2); 03136 assert(BuildVectorSlice.empty()); 03137 Value *ReorderedOps[] = { Ops[1], Ops[0] }; 03138 R.buildTree(ReorderedOps, None); 03139 } 03140 int Cost = R.getTreeCost(); 03141 03142 if (Cost < -SLPCostThreshold) { 03143 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); 03144 Value *VectorizedRoot = R.vectorizeTree(); 03145 03146 // Reconstruct the build vector by extracting the vectorized root. This 03147 // way we handle the case where some elements of the vector are undefined. 03148 // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) 03149 if (!BuildVectorSlice.empty()) { 03150 // The insert point is the last build vector instruction. The vectorized 03151 // root will precede it. This guarantees that we get an instruction. The 03152 // vectorized tree could have been constant folded. 
03153 Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); 03154 unsigned VecIdx = 0; 03155 for (auto &V : BuildVectorSlice) { 03156 IRBuilder<true, NoFolder> Builder( 03157 ++BasicBlock::iterator(InsertAfter)); 03158 InsertElementInst *IE = cast<InsertElementInst>(V); 03159 Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( 03160 VectorizedRoot, Builder.getInt32(VecIdx++))); 03161 IE->setOperand(1, Extract); 03162 IE->removeFromParent(); 03163 IE->insertAfter(Extract); 03164 InsertAfter = IE; 03165 } 03166 } 03167 // Move to the next bundle. 03168 i += VF - 1; 03169 Changed = true; 03170 } 03171 } 03172 03173 return Changed; 03174 } 03175 03176 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { 03177 if (!V) 03178 return false; 03179 03180 // Try to vectorize V. 03181 if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) 03182 return true; 03183 03184 BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); 03185 BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); 03186 // Try to skip B. 03187 if (B && B->hasOneUse()) { 03188 BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); 03189 BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); 03190 if (tryToVectorizePair(A, B0, R)) { 03191 return true; 03192 } 03193 if (tryToVectorizePair(A, B1, R)) { 03194 return true; 03195 } 03196 } 03197 03198 // Try to skip A. 03199 if (A && A->hasOneUse()) { 03200 BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); 03201 BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); 03202 if (tryToVectorizePair(A0, B, R)) { 03203 return true; 03204 } 03205 if (tryToVectorizePair(A1, B, R)) { 03206 return true; 03207 } 03208 } 03209 return 0; 03210 } 03211 03212 /// \brief Generate a shuffle mask to be used in a reduction tree. 03213 /// 03214 /// \param VecLen The length of the vector to be reduced. 03215 /// \param NumEltsToRdx The number of elements that should be reduced in the 03216 /// vector. 03217 /// \param IsPairwise Whether the reduction is a pairwise or splitting 03218 /// reduction. A pairwise reduction will generate a mask of 03219 /// <0,2,...> or <1,3,..> while a splitting reduction will generate 03220 /// <2,3, undef,undef> for a vector of 4 and NumElts = 2. 03221 /// \param IsLeft True will generate a mask of even elements, odd otherwise. 03222 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, 03223 bool IsPairwise, bool IsLeft, 03224 IRBuilder<> &Builder) { 03225 assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask"); 03226 03227 SmallVector<Constant *, 32> ShuffleMask( 03228 VecLen, UndefValue::get(Builder.getInt32Ty())); 03229 03230 if (IsPairwise) 03231 // Build a mask of 0, 2, ... (left) or 1, 3, ... (right). 03232 for (unsigned i = 0; i != NumEltsToRdx; ++i) 03233 ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft); 03234 else 03235 // Move the upper half of the vector to the lower half. 03236 for (unsigned i = 0; i != NumEltsToRdx; ++i) 03237 ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i); 03238 03239 return ConstantVector::get(ShuffleMask); 03240 } 03241 03242 03243 /// Model horizontal reductions. 03244 /// 03245 /// A horizontal reduction is a tree of reduction operations (currently add and 03246 /// fadd) that has operations that can be put into a vector as its leaf. 
03247 /// For example, this tree:
03248 ///
03249 ///   mul mul mul mul
03250 ///    \  /    \  /
03251 ///     +       +
03252 ///      \     /
03253 ///         +
03254 /// This tree has "mul" as its reduced values and "+" as its reduction
03255 /// operations. A reduction might be feeding into a store or a binary operation
03256 /// feeding a phi.
03257 ///    ...
03258 ///    \  /
03259 ///     +
03260 ///     |
03261 ///  phi +=
03262 ///
03263 ///  Or:
03264 ///    ...
03265 ///    \  /
03266 ///     +
03267 ///     |
03268 ///  *p =
03269 ///
03270 class HorizontalReduction {
03271 SmallVector<Value *, 16> ReductionOps;
03272 SmallVector<Value *, 32> ReducedVals;
03273
03274 BinaryOperator *ReductionRoot;
03275 PHINode *ReductionPHI;
03276
03277 /// The opcode of the reduction.
03278 unsigned ReductionOpcode;
03279 /// The opcode of the values we perform a reduction on.
03280 unsigned ReducedValueOpcode;
03281 /// The width of one full horizontal reduction operation.
03282 unsigned ReduxWidth;
03283 /// Should we model this reduction as a pairwise reduction tree or a tree that
03284 /// splits the vector in halves and adds those halves.
03285 bool IsPairwiseReduction;
03286
03287 public:
03288 HorizontalReduction()
03289 : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
03290 ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
03291
03292 /// \brief Try to find a reduction tree.
03293 bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
03294 const DataLayout *DL) {
03295 assert((!Phi ||
03296 std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
03297 "This phi needs to use the binary operator");
03298
03299 // We could have an initial reduction that is not an add.
03300 // r *= v1 + v2 + v3 + v4
03301 // In such a case start looking for a tree rooted in the first '+'.
03302 if (Phi) {
03303 if (B->getOperand(0) == Phi) {
03304 Phi = nullptr;
03305 B = dyn_cast<BinaryOperator>(B->getOperand(1));
03306 } else if (B->getOperand(1) == Phi) {
03307 Phi = nullptr;
03308 B = dyn_cast<BinaryOperator>(B->getOperand(0));
03309 }
03310 }
03311
03312 if (!B)
03313 return false;
03314
03315 Type *Ty = B->getType();
03316 if (Ty->isVectorTy())
03317 return false;
03318
03319 ReductionOpcode = B->getOpcode();
03320 ReducedValueOpcode = 0;
03321 ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
03322 ReductionRoot = B;
03323 ReductionPHI = Phi;
03324
03325 if (ReduxWidth < 4)
03326 return false;
03327
03328 // We currently only support adds.
03329 if (ReductionOpcode != Instruction::Add &&
03330 ReductionOpcode != Instruction::FAdd)
03331 return false;
03332
03333 // Post-order traverse the reduction tree starting at B. We only handle true
03334 // trees containing only binary operators.
03335 SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
03336 Stack.push_back(std::make_pair(B, 0));
03337 while (!Stack.empty()) {
03338 BinaryOperator *TreeN = Stack.back().first;
03339 unsigned EdgeToVist = Stack.back().second++;
03340 bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
03341
03342 // Only handle trees in the current basic block.
03343 if (TreeN->getParent() != B->getParent())
03344 return false;
03345
03346 // Each tree node needs to have one user except for the ultimate
03347 // reduction.
03348 if (!TreeN->hasOneUse() && TreeN != B)
03349 return false;
03350
03351 // Post-order visit.
03352 if (EdgeToVist == 2 || IsReducedValue) {
03353 if (IsReducedValue) {
03354 // Make sure that the opcodes of the operations that we are going to
03355 // reduce match.
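// E.g. for (a*b + c*d) + (e*f + g*h) every leaf is a 'mul' and the match
// succeeds; a tree mixing 'mul' and 'udiv' leaves is rejected (illustrative
// example, any single leaf opcode is accepted).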
03356 if (!ReducedValueOpcode)
03357 ReducedValueOpcode = TreeN->getOpcode();
03358 else if (ReducedValueOpcode != TreeN->getOpcode())
03359 return false;
03360 ReducedVals.push_back(TreeN);
03361 } else {
03362 // We need to be able to reassociate the adds.
03363 if (!TreeN->isAssociative())
03364 return false;
03365 ReductionOps.push_back(TreeN);
03366 }
03367 // Retract.
03368 Stack.pop_back();
03369 continue;
03370 }
03371
03372 // Visit left or right.
03373 Value *NextV = TreeN->getOperand(EdgeToVist);
03374 BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
03375 if (Next)
03376 Stack.push_back(std::make_pair(Next, 0));
03377 else if (NextV != Phi)
03378 return false;
03379 }
03380 return true;
03381 }
03382
03383 /// \brief Attempt to vectorize the tree found by
03384 /// matchAssociativeReduction.
03385 bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
03386 if (ReducedVals.empty())
03387 return false;
03388
03389 unsigned NumReducedVals = ReducedVals.size();
03390 if (NumReducedVals < ReduxWidth)
03391 return false;
03392
03393 Value *VectorizedTree = nullptr;
03394 IRBuilder<> Builder(ReductionRoot);
03395 FastMathFlags Unsafe;
03396 Unsafe.setUnsafeAlgebra();
03397 Builder.SetFastMathFlags(Unsafe);
03398 unsigned i = 0;
03399
03400 for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
03401 V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
03402
03403 // Estimate cost.
03404 int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
03405 if (Cost >= -SLPCostThreshold)
03406 break;
03407
03408 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
03409 << ". (HorRdx)\n");
03410
03411 // Vectorize a tree.
03412 DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
03413 Value *VectorizedRoot = V.vectorizeTree();
03414
03415 // Emit a reduction.
03416 Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
03417 if (VectorizedTree) {
03418 Builder.SetCurrentDebugLocation(Loc);
03419 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
03420 ReducedSubTree, "bin.rdx");
03421 } else
03422 VectorizedTree = ReducedSubTree;
03423 }
03424
03425 if (VectorizedTree) {
03426 // Finish the reduction.
03427 for (; i < NumReducedVals; ++i) {
03428 Builder.SetCurrentDebugLocation(
03429 cast<Instruction>(ReducedVals[i])->getDebugLoc());
03430 VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
03431 ReducedVals[i]);
03432 }
03433 // Update users.
03434 if (ReductionPHI) {
03435 assert(ReductionRoot && "Need a reduction operation");
03436 ReductionRoot->setOperand(0, VectorizedTree);
03437 ReductionRoot->setOperand(1, ReductionPHI);
03438 } else
03439 ReductionRoot->replaceAllUsesWith(VectorizedTree);
03440 }
03441 return VectorizedTree != nullptr;
03442 }
03443
03444 private:
03445
03446 /// \brief Calculate the cost of a reduction.
03447 int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
03448 Type *ScalarTy = FirstReducedVal->getType();
03449 Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
03450
03451 int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
03452 int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
03453
03454 IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
03455 int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
03456
03457 int ScalarReduxCost =
03458 ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
03459
03460 DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
03461 << " for reduction that starts with " << *FirstReducedVal
03462 << " (It is a "
03463 << (IsPairwiseReduction ? "pairwise" : "splitting")
03464 << " reduction)\n");
03465
03466 return VecReduxCost - ScalarReduxCost;
03467 }
03468
03469 static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
03470 Value *R, const Twine &Name = "") {
03471 if (Opcode == Instruction::FAdd)
03472 return Builder.CreateFAdd(L, R, Name);
03473 return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
03474 }
03475
03476 /// \brief Emit a horizontal reduction of the vectorized value.
03477 Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
03478 assert(VectorizedValue && "Need to have a vectorized tree node");
03479 Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
03480 assert(isPowerOf2_32(ReduxWidth) &&
03481 "We only handle power-of-two reductions for now");
03482
03483 Value *TmpVec = ValToReduce;
03484 for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
03485 if (IsPairwiseReduction) {
03486 Value *LeftMask =
03487 createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
03488 Value *RightMask =
03489 createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
03490
03491 Value *LeftShuf = Builder.CreateShuffleVector(
03492 TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
03493 Value *RightShuf = Builder.CreateShuffleVector(
03494 TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
03495 "rdx.shuf.r");
03496 TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
03497 "bin.rdx");
03498 } else {
03499 Value *UpperHalf =
03500 createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
03501 Value *Shuf = Builder.CreateShuffleVector(
03502 TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
03503 TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
03504 }
03505 }
03506
03507 // The result is in the first element of the vector.
03508 return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
03509 }
03510 };
03511
03512 /// \brief Recognize construction of vectors like
03513 /// %ra = insertelement <4 x float> undef, float %s0, i32 0
03514 /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
03515 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
03516 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
03517 ///
03518 /// Returns true if it matches
03519 ///
03520 static bool findBuildVector(InsertElementInst *FirstInsertElem,
03521 SmallVectorImpl<Value *> &BuildVector,
03522 SmallVectorImpl<Value *> &BuildVectorOpds) {
03523 if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
03524 return false;
03525
03526 InsertElementInst *IE = FirstInsertElem;
03527 while (true) {
03528 BuildVector.push_back(IE);
03529 BuildVectorOpds.push_back(IE->getOperand(1));
03530
03531 if (IE->use_empty())
03532 return false;
03533
03534 InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
03535 if (!NextUse)
03536 return true;
03537
03538 // If this isn't the final use, make sure the next insertelement is the only
03539 // use. It's OK if the final constructed vector is used multiple times
03540 if (!IE->hasOneUse())
03541 return false;
03542
03543 IE = NextUse;
03544 }
03545
03546 return false;
03547 }
03548
03549 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
03550 return V->getType() < V2->getType();
03551 }
03552
03553 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
03554 bool Changed = false;
03555 SmallVector<Value *, 4> Incoming;
03556 SmallSet<Value *, 16> VisitedInstrs;
03557
03558 bool HaveVectorizedPhiNodes = true;
03559 while (HaveVectorizedPhiNodes) {
03560 HaveVectorizedPhiNodes = false;
03561
03562 // Collect the incoming values from the PHIs.
03563 Incoming.clear();
03564 for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
03565 ++instr) {
03566 PHINode *P = dyn_cast<PHINode>(instr);
03567 if (!P)
03568 break;
03569
03570 if (!VisitedInstrs.count(P))
03571 Incoming.push_back(P);
03572 }
03573
03574 // Sort by type.
03575 std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
03576
03577 // Try to vectorize elements based on their type.
03578 for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
03579 E = Incoming.end();
03580 IncIt != E;) {
03581
03582 // Look for the next elements with the same type.
03583 SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
03584 while (SameTypeIt != E &&
03585 (*SameTypeIt)->getType() == (*IncIt)->getType()) {
03586 VisitedInstrs.insert(*SameTypeIt);
03587 ++SameTypeIt;
03588 }
03589
03590 // Try to vectorize them.
03591 unsigned NumElts = (SameTypeIt - IncIt);
03592 DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
03593 if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
03594 // Success, start over because instructions might have been changed.
03595 HaveVectorizedPhiNodes = true;
03596 Changed = true;
03597 break;
03598 }
03599
03600 // Start over at the next instruction of a different type (or the end).
03601 IncIt = SameTypeIt;
03602 }
03603 }
03604
03605 VisitedInstrs.clear();
03606
03607 for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
03608 // We may go through BB multiple times so skip the ones we have checked.
03609 if (!VisitedInstrs.insert(it))
03610 continue;
03611
03612 if (isa<DbgInfoIntrinsic>(it))
03613 continue;
03614
03615 // Try to vectorize reductions that use PHINodes.
03616 if (PHINode *P = dyn_cast<PHINode>(it)) {
03617 // Check that the PHI is a reduction PHI.
03618 if (P->getNumIncomingValues() != 2)
03619 return Changed;
03620 Value *Rdx =
03621 (P->getIncomingBlock(0) == BB
03622 ? (P->getIncomingValue(0))
03623 : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
03624 : nullptr));
03625 // Check if this is a Binary Operator.
03626 BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
03627 if (!BI)
03628 continue;
03629
03630 // Try to match and vectorize a horizontal reduction.
03631 HorizontalReduction HorRdx;
03632 if (ShouldVectorizeHor &&
03633 HorRdx.matchAssociativeReduction(P, BI, DL) &&
03634 HorRdx.tryToReduce(R, TTI)) {
03635 Changed = true;
03636 it = BB->begin();
03637 e = BB->end();
03638 continue;
03639 }
03640
03641 Value *Inst = BI->getOperand(0);
03642 if (Inst == P)
03643 Inst = BI->getOperand(1);
03644
03645 if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
03646 // We would like to start over since some instructions are deleted
03647 // and the iterator may become invalid.
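// (Restarting the scan from BB->begin() is safe here: the VisitedInstrs check
// at the top of this loop skips instructions that were already analyzed.)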
03648 Changed = true;
03649 it = BB->begin();
03650 e = BB->end();
03651 continue;
03652 }
03653
03654 continue;
03655 }
03656
03657 // Try to vectorize horizontal reductions feeding into a store.
03658 if (ShouldStartVectorizeHorAtStore)
03659 if (StoreInst *SI = dyn_cast<StoreInst>(it))
03660 if (BinaryOperator *BinOp =
03661 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
03662 HorizontalReduction HorRdx;
03663 if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
03664 HorRdx.tryToReduce(R, TTI)) ||
03665 tryToVectorize(BinOp, R))) {
03666 Changed = true;
03667 it = BB->begin();
03668 e = BB->end();
03669 continue;
03670 }
03671 }
03672
03673 // Try to vectorize trees that start at compare instructions.
03674 if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
03675 if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
03676 Changed = true;
03677 // We would like to start over since some instructions are deleted
03678 // and the iterator may become invalid.
03679 it = BB->begin();
03680 e = BB->end();
03681 continue;
03682 }
03683
03684 for (int i = 0; i < 2; ++i) {
03685 if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
03686 if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
03687 Changed = true;
03688 // We would like to start over since some instructions are deleted
03689 // and the iterator may become invalid.
03690 it = BB->begin();
03691 e = BB->end();
03692 }
03693 }
03694 }
03695 continue;
03696 }
03697
03698 // Try to vectorize trees that start at insertelement instructions.
03699 if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
03700 SmallVector<Value *, 16> BuildVector;
03701 SmallVector<Value *, 16> BuildVectorOpds;
03702 if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
03703 continue;
03704
03705 // Vectorize starting with the build vector operands ignoring the
03706 // BuildVector instructions for the purpose of scheduling and user
03707 // extraction.
03708 if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
03709 Changed = true;
03710 it = BB->begin();
03711 e = BB->end();
03712 }
03713
03714 continue;
03715 }
03716 }
03717
03718 return Changed;
03719 }
03720
03721 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
03722 bool Changed = false;
03723 // Attempt to sort and vectorize each of the store-groups.
03724 for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
03725 it != e; ++it) {
03726 if (it->second.size() < 2)
03727 continue;
03728
03729 DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
03730 << it->second.size() << ".\n");
03731
03732 // Process the stores in chunks of 16.
03733 for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
03734 unsigned Len = std::min<unsigned>(CE - CI, 16);
03735 Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
03736 -SLPCostThreshold, R);
03737 }
03738 }
03739 return Changed;
03740 }
03741
03742 } // end anonymous namespace
03743
03744 char SLPVectorizer::ID = 0;
03745 static const char lv_name[] = "SLP Vectorizer";
03746 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
03747 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
03748 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
03749 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
03750 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
03751 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
03752
03753 namespace llvm {
03754 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
03755 }
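The listing ends with the factory function used to create the pass. A minimal usage sketch follows (illustrative only, not part of SLPVectorizer.cpp): it assumes an existing llvm::Module M and the legacy pass manager of this LLVM version, and simply schedules the pass returned by createSLPVectorizerPass(); the helper name runSLPOnly is invented for the example.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Vectorize.h"

// Run only the SLP vectorizer over a module. Required analyses declared by
// the pass are resolved by the legacy pass manager.
void runSLPOnly(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createSLPVectorizerPass()); // declared in Vectorize.h
  PM.run(M);
}

In a full optimization pipeline the pass is normally scheduled by the standard pass builder rather than added by hand as above.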