SLPVectorizer.cpp

//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
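//
// Illustrative example (not from the original source): consecutive scalar
// stores such as
//
//   store double %x0, double* %p
//   %p1 = getelementptr inbounds double* %p, i64 1
//   store double %x1, double* %p1
//
// seed the tree; when the tree is profitable they are rewritten into a single
// wide store, e.g. store <2 x double> %x01, <2 x double>* %pv.
//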
#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/VectorUtils.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number"));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

namespace {

static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 12;
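
// For example, with MinVecRegSize = 128 a full vector holds <4 x i32>,
// <4 x float>, or <2 x double>, so bundles of that many scalars are the
// natural target.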

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

///\returns the opcode that can be paired with \p Op to create an alternating
/// sequence which can later be merged into a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}
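
// Illustrative example: the scalar sequence
//   c[0] = a[0] + b[0];  c[1] = a[1] - b[1];
//   c[2] = a[2] + b[2];  c[3] = a[3] - b[3];
// alternates Add/Sub, so it can be emitted as one vector add, one vector sub,
// and a shufflevector selecting even lanes from the add and odd lanes from
// the sub.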

///\returns true if Opcode \p Op can be part of an alternating sequence
/// which can later be merged into a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
      Op == Instruction::Sub || Op == Instruction::Add)
    return true;
  return false;
}

/// \returns ShuffleVector instruction if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub, or sub/add sequence
/// (e.g. opcodes of fadd, fsub, fadd, fsub, ...), or zero otherwise.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}
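
// Example of the intersection semantics: for VL = { add nsw, add nsw nuw }
// the vectorized add keeps only 'nsw'; a flag missing from any scalar is
// dropped from the vector instruction.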

/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int i = 1, e = VL.size(); MD && i != e; i++) {
      Instruction *I = cast<Instruction>(VL[i]);
      MDNode *IMD = I->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}
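
// Illustrative example: for VL = { extractelement <4 x i32> %v, i32 0, ...,
// extractelement <4 x i32> %v, i32 3 } all lanes come from %v in order
// 0..3, so the tree can reuse %v directly instead of gathering the scalars
// into a new vector.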

static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                           SmallVectorImpl<Value *> &Left,
                                           SmallVectorImpl<Value *> &Right) {

  SmallVector<Value *, 16> OrigLeft, OrigRight;

  bool AllSameOpcodeLeft = true;
  bool AllSameOpcodeRight = true;
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    Instruction *I = cast<Instruction>(VL[i]);
    Value *V0 = I->getOperand(0);
    Value *V1 = I->getOperand(1);

    OrigLeft.push_back(V0);
    OrigRight.push_back(V1);

    Instruction *I0 = dyn_cast<Instruction>(V0);
    Instruction *I1 = dyn_cast<Instruction>(V1);

    // Check whether all operands on one side have the same opcode. In this case
    // we want to preserve the original order and not make things worse by
    // reordering.
    AllSameOpcodeLeft = I0;
    AllSameOpcodeRight = I1;

    if (i && AllSameOpcodeLeft) {
      if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
        if(P0->getOpcode() != I0->getOpcode())
          AllSameOpcodeLeft = false;
      } else
        AllSameOpcodeLeft = false;
    }
    if (i && AllSameOpcodeRight) {
      if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
        if(P1->getOpcode() != I1->getOpcode())
          AllSameOpcodeRight = false;
      } else
        AllSameOpcodeRight = false;
    }

    // Sort two opcodes. In the code below we try to preserve the ability to use
    // broadcast of values instead of individual inserts.
    // vl1 = load
    // vl2 = phi
    // vr1 = load
    // vr2 = vr1
    //    = vl1 x vr1
    //    = vl2 x vr2
    // If we just sorted according to opcode we would leave the first line
    // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
    //    = vl1 x vr1
    //    = vr2 x vl2
    // Because vr2 and vr1 are from the same load we lose the opportunity of a
    // broadcast for the packed right side in the backend: we have [vr1, vl2]
    // instead of [vr1, vr2=vr1].
    if (I0 && I1) {
       if(!i && I0->getOpcode() > I1->getOpcode()) {
         Left.push_back(I1);
         Right.push_back(I0);
       } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
         // Try not to destroy a broadcast for no apparent benefit.
         Left.push_back(I1);
         Right.push_back(I0);
       } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
         // Try to preserve broadcasts.
         Left.push_back(I1);
         Right.push_back(I0);
       } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
         // Try to preserve broadcasts.
         Left.push_back(I1);
         Right.push_back(I0);
       } else {
         Left.push_back(I0);
         Right.push_back(I1);
       }
       continue;
    }
    // One opcode, put the instruction on the right.
    if (I0) {
      Left.push_back(V1);
      Right.push_back(I0);
      continue;
    }
    Left.push_back(V0);
    Right.push_back(V1);
  }

  bool LeftBroadcast = isSplat(Left);
  bool RightBroadcast = isSplat(Right);

  // Don't reorder if the operands were good to begin with.
  if (!(LeftBroadcast || RightBroadcast) &&
      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
    Left = OrigLeft;
    Right = OrigRight;
  }
}

/// \returns True if in-tree use also needs extract. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
  }
  default:
    return false;
  }
}
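
// Example: if %p is itself part of a vectorized bundle but is also the
// pointer operand of 'store i32 %x, i32* %p', the store still needs the
// scalar %p, so an extract has to be generated for that in-tree use.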

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, const DataLayout *Dl,
          TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa,
          LoopInfo *Li, DominatorTree *Dt)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0),
        F(Func), SE(Se), DL(Dl), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
        Builder(Se->getContext()) {}

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
  }

  /// \returns true if the memory operations A and B are consecutive.
  bool isConsecutiveAccess(Value *A, Value *B);

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \brief Take the pointer operand from the Load/Store instruction.
  /// \returns NULL if this is not a valid Load/Store instruction.
  static Value *getPointerOperand(Value *I);

  /// \brief Take the address space operand from the Load/Store instruction.
  /// \returns -1 if this is not a valid Load/Store instruction.
  static unsigned getAddressSpaceOperand(Value *I);

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even when the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
    NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.push_back(TreeEntry());
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser (Value *S, llvm::User *U, int L) :
      Scalar(S), User(U), Lane(L){};
    // Which scalar in our function.
    Value *Scalar;
    // Which user uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;
  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Singly linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Singly linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    ///
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equal to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };
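
  // Illustrative layout (not part of the original source): for a bundle of
  // two instructions I0 and I1,
  //   SD(I0).FirstInBundle == SD(I0), SD(I0).NextInBundle == SD(I1)
  //   SD(I1).FirstInBundle == SD(I0), SD(I1).NextInBundle == nullptr
  // so SD(I0) is the scheduling entity that represents the whole bundle.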

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
#endif

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP:   schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP:    gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP:    gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP:    initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, AliasAnalysis *AA);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    void extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               AliasAnalysis *AA);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  DenseMap<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;

  // Number of load-bundles, which contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load-bundles of size 2, which are consecutive loads if reversed.
  int NumLoadsWantToChangeOrder;

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  const DataLayout *DL;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;
};

#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}
#endif

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors.
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}
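
// Example (illustrative): if a vectorized scalar %a is also used by an
// instruction outside the tree (say, a return or a call), an
// ExternalUser(%a, U, Lane) record is added above, and vectorizeTree() later
// emits an extractelement for that lane to feed the scalar user.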


void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  bool SameTy = getSameType(VL); (void)SameTy;
  bool isAltShuffle = false;
  assert(SameTy && "Invalid types!");

  if (Depth == RecursionMaxDepth) {
    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) {
    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, false);
    return;
  }

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) {
      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
      newTreeEntry(VL, false);
      return;
    }
  unsigned Opcode = getSameOpcode(VL);

  // Check that this shuffle vector refers to the alternate
  // sequence of opcodes.
  if (Opcode == Instruction::ShuffleVector) {
    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
    unsigned Op = I0->getOpcode();
    if (Op != Instruction::ShuffleVector)
      isAltShuffle = true;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
    newTreeEntry(VL, false);
    return;
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check if this is a duplicate of another entry.
  if (ScalarToTreeEntry.count(VL[0])) {
    int Idx = ScalarToTreeEntry[VL[0]];
    TreeEntry *E = &VectorizableTree[Idx];
    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
      if (E->Scalars[i] != VL[i]) {
        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
        newTreeEntry(VL, false);
        return;
      }
    }
    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
    return;
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i])) {
      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
            ") is already in tree.\n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // If any of the scalars appears in the table OR it is marked as a value that
  // needs to stay scalar then we need to gather the scalars.
  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
    if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
      newTreeEntry(VL, false);
      return;
    }
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = cast<Instruction>(VL[0]);
  BasicBlock *BB = cast<Instruction>(VL0)->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, false);
    return;
  }

  // Check that every instruction appears once in this bundle.
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    for (unsigned j = i+1; j < e; ++j)
      if (VL[i] == VL[j]) {
        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, false);
        return;
      }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef) {
    BSRef = llvm::make_unique<BlockScheduling>(BB);
  }
  BlockScheduling &BS = *BSRef.get();

  if (!BS.tryScheduleBundle(VL, AA)) {
    DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    BS.cancelScheduling(VL);
    newTreeEntry(VL, false);
    return;
  }
  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  switch (Opcode) {
    case Instruction::PHI: {
      PHINode *PH = dyn_cast<PHINode>(VL0);

      // Check for terminator values (e.g. invoke).
      for (unsigned j = 0; j < VL.size(); ++j)
        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
          TerminatorInst *Term = dyn_cast<TerminatorInst>(
              cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
          if (Term) {
            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            return;
          }
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");

      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
              PH->getIncomingBlock(i)));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ExtractElement: {
      bool Reuse = CanReuseExtract(VL);
      if (Reuse) {
        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
      } else {
        BS.cancelScheduling(VL);
      }
      newTreeEntry(VL, Reuse);
      return;
    }
    case Instruction::Load: {
      // Check if the loads are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
        LoadInst *L = cast<LoadInst>(VL[i]);
        if (!L->isSimple()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
          return;
        }
        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
          if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0])) {
            ++NumLoadsWantToChangeOrder;
          }
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
          return;
        }
      }
      ++NumLoadsWantToKeepOrder;
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
      return;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      Type *SrcTy = VL0->getOperand(0)->getType();
      for (unsigned i = 0; i < VL.size(); ++i) {
        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
        if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
          return;
        }
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of casts.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Check that all of the compares have the same predicate.
      CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
        CmpInst *Cmp = cast<CmpInst>(VL[i]);
        if (Cmp->getPredicate() != P0 ||
            Cmp->getOperand(0)->getType() != ComparedTy) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of compares.\n");

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::Select:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");

      // Sort operands of the instructions so that each side is more likely to
      // have the same opcode.
      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
        ValueList Left, Right;
        reorderInputsAccordingToOpcode(VL, Left, Right);
        buildTree_rec(Left, Depth + 1);
        buildTree_rec(Right, Depth + 1);
        return;
      }

      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::GetElementPtr: {
      // We don't combine GEPs with complicated (nested) indexing.
      for (unsigned j = 0; j < VL.size(); ++j) {
        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We can't combine several GEPs into one vector if they operate on
      // different types.
      Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
      for (unsigned j = 0; j < VL.size(); ++j) {
        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
        if (Ty0 != CurTy) {
          DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      // We don't combine GEPs with non-constant indexes.
      for (unsigned j = 0; j < VL.size(); ++j) {
        auto Op = cast<Instruction>(VL[j])->getOperand(1);
        if (!isa<ConstantInt>(Op)) {
          DEBUG(
              dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          return;
        }
      }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
      for (unsigned i = 0, e = 2; i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::Store: {
      // Check if the stores are consecutive or if we need to swizzle them.
      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
          return;
        }

      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a vector of stores.\n");

      ValueList Operands;
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));

      buildTree_rec(Operands, Depth + 1);
      return;
    }
    case Instruction::Call: {
      // Check if the calls are all to the same vectorizable intrinsic.
      CallInst *CI = cast<CallInst>(VL[0]);
      // Check if this is an Intrinsic call or something that can be
      // represented by an intrinsic call.
      Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
      if (!isTriviallyVectorizable(ID)) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
        return;
      }
      Function *Int = CI->getCalledFunction();
      Value *A1I = nullptr;
      if (hasVectorInstrinsicScalarOpd(ID, 1))
        A1I = CI->getArgOperand(1);
      for (unsigned i = 1, e = VL.size(); i != e; ++i) {
        CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
        if (!CI2 || CI2->getCalledFunction() != Int ||
            getIntrinsicIDForCall(CI2, TLI) != ID) {
          BS.cancelScheduling(VL);
          newTreeEntry(VL, false);
          DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                       << "\n");
          return;
        }
        // ctlz, cttz and powi are special intrinsics whose second argument
        // should be the same in order for them to be vectorized.
        if (hasVectorInstrinsicScalarOpd(ID, 1)) {
          Value *A1J = CI2->getArgOperand(1);
          if (A1I != A1J) {
            BS.cancelScheduling(VL);
            newTreeEntry(VL, false);
            DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                         << " argument "<< A1I<<"!=" << A1J
                         << "\n");
            return;
          }
        }
      }

      newTreeEntry(VL, true);
      for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j) {
          CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
          Operands.push_back(CI2->getArgOperand(i));
        }
        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    case Instruction::ShuffleVector: {
      // If this is not an alternating sequence of opcodes like add-sub
      // then do not vectorize this instruction.
      if (!isAltShuffle) {
        BS.cancelScheduling(VL);
        newTreeEntry(VL, false);
        DEBUG(dbgs() << "SLP: ShuffleVector is not vectorized.\n");
        return;
      }
      newTreeEntry(VL, true);
      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        buildTree_rec(Operands, Depth + 1);
      }
      return;
    }
    default:
      BS.cancelScheduling(VL);
      newTreeEntry(VL, false);
      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
      return;
  }
}
01384 
01385 int BoUpSLP::getEntryCost(TreeEntry *E) {
01386   ArrayRef<Value*> VL = E->Scalars;
01387 
01388   Type *ScalarTy = VL[0]->getType();
01389   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
01390     ScalarTy = SI->getValueOperand()->getType();
01391   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
01392 
01393   if (E->NeedToGather) {
01394     if (allConstant(VL))
01395       return 0;
01396     if (isSplat(VL)) {
01397       return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
01398     }
01399     return getGatherCost(E->Scalars);
01400   }
01401   unsigned Opcode = getSameOpcode(VL);
01402   assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
01403   Instruction *VL0 = cast<Instruction>(VL[0]);
01404   switch (Opcode) {
01405     case Instruction::PHI: {
01406       return 0;
01407     }
01408     case Instruction::ExtractElement: {
01409       if (CanReuseExtract(VL)) {
01410         int DeadCost = 0;
01411         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
01412           ExtractElementInst *EE = cast<ExtractElementInst>(VL[i]);
01413           if (EE->hasOneUse())
01414             // Take credit for instruction that will become dead.
01415             DeadCost +=
01416                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
01417         }
01418         return -DeadCost;
01419       }
01420       return getGatherCost(VecTy);
01421     }
01422     case Instruction::ZExt:
01423     case Instruction::SExt:
01424     case Instruction::FPToUI:
01425     case Instruction::FPToSI:
01426     case Instruction::FPExt:
01427     case Instruction::PtrToInt:
01428     case Instruction::IntToPtr:
01429     case Instruction::SIToFP:
01430     case Instruction::UIToFP:
01431     case Instruction::Trunc:
01432     case Instruction::FPTrunc:
01433     case Instruction::BitCast: {
01434       Type *SrcTy = VL0->getOperand(0)->getType();
01435 
01436       // Calculate the cost of this instruction.
01437       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
01438                                                          VL0->getType(), SrcTy);
01439 
01440       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
01441       int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
01442       return VecCost - ScalarCost;
01443     }
01444     case Instruction::FCmp:
01445     case Instruction::ICmp:
01446     case Instruction::Select:
01447     case Instruction::Add:
01448     case Instruction::FAdd:
01449     case Instruction::Sub:
01450     case Instruction::FSub:
01451     case Instruction::Mul:
01452     case Instruction::FMul:
01453     case Instruction::UDiv:
01454     case Instruction::SDiv:
01455     case Instruction::FDiv:
01456     case Instruction::URem:
01457     case Instruction::SRem:
01458     case Instruction::FRem:
01459     case Instruction::Shl:
01460     case Instruction::LShr:
01461     case Instruction::AShr:
01462     case Instruction::And:
01463     case Instruction::Or:
01464     case Instruction::Xor: {
01465       // Calculate the cost of this instruction.
01466       int ScalarCost = 0;
01467       int VecCost = 0;
01468       if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
01469           Opcode == Instruction::Select) {
01470         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
01471         ScalarCost = VecTy->getNumElements() *
01472             TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
01473         VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
01474       } else {
01475         // Certain instructions can be cheaper to vectorize if they have a
01476         // constant second vector operand.
01477         TargetTransformInfo::OperandValueKind Op1VK =
01478             TargetTransformInfo::OK_AnyValue;
01479         TargetTransformInfo::OperandValueKind Op2VK =
01480             TargetTransformInfo::OK_UniformConstantValue;
01481         TargetTransformInfo::OperandValueProperties Op1VP =
01482             TargetTransformInfo::OP_None;
01483         TargetTransformInfo::OperandValueProperties Op2VP =
01484             TargetTransformInfo::OP_None;
01485 
01486         // If all operands are exactly the same ConstantInt then set the
01487         // operand kind to OK_UniformConstantValue.
01488         // If instead not all operands are constants, then set the operand kind
01489         // to OK_AnyValue. If all operands are constants but not the same,
01490         // then set the operand kind to OK_NonUniformConstantValue.
01491         ConstantInt *CInt = nullptr;
01492         for (unsigned i = 0; i < VL.size(); ++i) {
01493           const Instruction *I = cast<Instruction>(VL[i]);
01494           if (!isa<ConstantInt>(I->getOperand(1))) {
01495             Op2VK = TargetTransformInfo::OK_AnyValue;
01496             break;
01497           }
01498           if (i == 0) {
01499             CInt = cast<ConstantInt>(I->getOperand(1));
01500             continue;
01501           }
01502           if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
01503               CInt != cast<ConstantInt>(I->getOperand(1)))
01504             Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
01505         }
01506         // FIXME: Currently the cost model modification for division by a
01507         // power of 2 is handled only for X86. Add support for other targets.
01508         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
01509             CInt->getValue().isPowerOf2())
01510           Op2VP = TargetTransformInfo::OP_PowerOf2;
01511 
01512         ScalarCost = VecTy->getNumElements() *
01513                      TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
01514                                                  Op1VP, Op2VP);
01515         VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
01516                                               Op1VP, Op2VP);
01517       }
01518       return VecCost - ScalarCost;
01519     }
01520     case Instruction::GetElementPtr: {
01521       TargetTransformInfo::OperandValueKind Op1VK =
01522           TargetTransformInfo::OK_AnyValue;
01523       TargetTransformInfo::OperandValueKind Op2VK =
01524           TargetTransformInfo::OK_UniformConstantValue;
01525 
01526       int ScalarCost =
01527           VecTy->getNumElements() *
01528           TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
01529       int VecCost =
01530           TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
01531 
01532       return VecCost - ScalarCost;
01533     }
01534     case Instruction::Load: {
01535       // Cost of wide load - cost of scalar loads.
01536       int ScalarLdCost = VecTy->getNumElements() *
01537           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
01538       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
01539       return VecLdCost - ScalarLdCost;
01540     }
01541     case Instruction::Store: {
01542       // We know that we can merge the stores. Calculate the cost.
01543       int ScalarStCost = VecTy->getNumElements() *
01544           TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
01545       int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
01546       return VecStCost - ScalarStCost;
01547     }
01548     case Instruction::Call: {
01549       CallInst *CI = cast<CallInst>(VL0);
01550       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
01551 
01552       // Calculate the cost of the scalar and vector calls.
01553       SmallVector<Type*, 4> ScalarTys, VecTys;
01554       for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op) {
01555         ScalarTys.push_back(CI->getArgOperand(op)->getType());
01556         VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
01557                                          VecTy->getNumElements()));
01558       }
01559 
01560       int ScalarCallCost = VecTy->getNumElements() *
01561           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
01562 
01563       int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
01564 
01565       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
01566             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
01567             << " for " << *CI << "\n");
01568 
01569       return VecCallCost - ScalarCallCost;
01570     }
01571     case Instruction::ShuffleVector: {
01572       TargetTransformInfo::OperandValueKind Op1VK =
01573           TargetTransformInfo::OK_AnyValue;
01574       TargetTransformInfo::OperandValueKind Op2VK =
01575           TargetTransformInfo::OK_AnyValue;
01576       int ScalarCost = 0;
01577       int VecCost = 0;
01578       for (unsigned i = 0; i < VL.size(); ++i) {
01579         Instruction *I = cast<Instruction>(VL[i]);
01580         if (!I)
01581           break;
01582         ScalarCost +=
01583             TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
01584       }
01585       // VecCost is equal to sum of the cost of creating 2 vectors
01586       // and the cost of creating shuffle.
01587       Instruction *I0 = cast<Instruction>(VL[0]);
01588       VecCost =
01589           TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
01590       Instruction *I1 = cast<Instruction>(VL[1]);
01591       VecCost +=
01592           TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
01593       VecCost +=
01594           TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
01595       return VecCost - ScalarCost;
01596     }
01597     default:
01598       llvm_unreachable("Unknown instruction");
01599   }
01600 }
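// A minimal sketch of the sign convention used throughout getEntryCost
// (hypothetical unit costs, not real TargetTransformInfo data): each entry
// reports VecCost - ScalarCost, so a negative result marks a bundle that is
// expected to profit from vectorization.
static int entryCostSketch() {
  int ScalarCost = 4 * 1;      // four scalar i32 adds at unit cost each
  int VecCost = 1;             // one <4 x i32> add at unit cost
  return VecCost - ScalarCost; // -3: vectorizing this bundle looks profitable
}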
01601 
01602 bool BoUpSLP::isFullyVectorizableTinyTree() {
01603   DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
01604         VectorizableTree.size() << " is fully vectorizable.\n");
01605 
01606   // We only handle trees of height 2.
01607   if (VectorizableTree.size() != 2)
01608     return false;
01609 
01610   // Handle splat stores.
01611   if (!VectorizableTree[0].NeedToGather && isSplat(VectorizableTree[1].Scalars))
01612     return true;
01613 
01614   // Gathering cost would be too much for tiny trees.
01615   if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
01616     return false;
01617 
01618   return true;
01619 }
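// Example of a tiny tree accepted above (hypothetical IR): a height-2 tree
// rooted at a store bundle whose operand bundle is the splat {%v, %v},
//
//   store float %v, float* %p0
//   store float %v, float* %p1
//
// passes the splat-store check even though the splat operand itself is a
// gather, because broadcasting a single scalar is cheap.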
01620 
01621 int BoUpSLP::getSpillCost() {
01622   // Walk from the bottom of the tree to the top, tracking which values are
01623   // live. When we see a call instruction that is not part of our tree,
01624   // query TTI to see if there is a cost to keeping values live over it
01625   // (for example, if spills and fills are required).
01626   unsigned BundleWidth = VectorizableTree.front().Scalars.size();
01627   int Cost = 0;
01628 
01629   SmallPtrSet<Instruction*, 4> LiveValues;
01630   Instruction *PrevInst = nullptr; 
01631 
01632   for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
01633     Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
01634     if (!Inst)
01635       continue;
01636 
01637     if (!PrevInst) {
01638       PrevInst = Inst;
01639       continue;
01640     }
01641 
01642     DEBUG(
01643       dbgs() << "SLP: #LV: " << LiveValues.size();
01644       for (auto *X : LiveValues)
01645         dbgs() << " " << X->getName();
01646       dbgs() << ", Looking at ";
01647       Inst->dump();
01648       );
01649 
01650     // Update LiveValues.
01651     LiveValues.erase(PrevInst);
01652     for (auto &J : PrevInst->operands()) {
01653       if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
01654         LiveValues.insert(cast<Instruction>(&*J));
01655     }    
01656 
01657     // Now find the sequence of instructions between PrevInst and Inst.
01658     BasicBlock::reverse_iterator InstIt(Inst), PrevInstIt(PrevInst);
01659     --PrevInstIt;
01660     while (InstIt != PrevInstIt) {
01661       if (PrevInstIt == PrevInst->getParent()->rend()) {
01662         PrevInstIt = Inst->getParent()->rbegin();
01663         continue;
01664       }
01665 
01666       if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
01667         SmallVector<Type*, 4> V;
01668         for (auto *II : LiveValues)
01669           V.push_back(VectorType::get(II->getType(), BundleWidth));
01670         Cost += TTI->getCostOfKeepingLiveOverCall(V);
01671       }
01672 
01673       ++PrevInstIt;
01674     }
01675 
01676     PrevInst = Inst;
01677   }
01678 
01679   DEBUG(dbgs() << "SLP: SpillCost=" << Cost << "\n");
01680   return Cost;
01681 }
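// Illustrative spill scenario (hypothetical IR): a tree value that is live
// across an unrelated call may force the target to spill and reload it:
//
//   %a = load i32* %p    ; in the tree, still live below
//   call void @foo()     ; not in the tree: %a is live over this call
//   %b = add i32 %a, 1   ; in-tree use of %a
//
// The walk above charges getCostOfKeepingLiveOverCall for keeping
// <BundleWidth x i32> values like %a alive across @foo.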
01682 
01683 int BoUpSLP::getTreeCost() {
01684   int Cost = 0;
01685   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
01686         VectorizableTree.size() << ".\n");
01687 
01688   // We only vectorize tiny trees if they are fully vectorizable.
01689   if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
01690     if (!VectorizableTree.size()) {
01691       assert(!ExternalUses.size() && "We should not have any external users");
01692     }
01693     return INT_MAX;
01694   }
01695 
01696   unsigned BundleWidth = VectorizableTree[0].Scalars.size();
01697 
01698   for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
01699     int C = getEntryCost(&VectorizableTree[i]);
01700     DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
01701           << *VectorizableTree[i].Scalars[0] << " .\n");
01702     Cost += C;
01703   }
01704 
01705   SmallSet<Value *, 16> ExtractCostCalculated;
01706   int ExtractCost = 0;
01707   for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
01708        I != E; ++I) {
01709     // We only add extract cost once for the same scalar.
01710     if (!ExtractCostCalculated.insert(I->Scalar))
01711       continue;
01712 
01713     VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
01714     ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
01715                                            I->Lane);
01716   }
01717 
01718   Cost += getSpillCost();
01719 
01720   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
01721   return  Cost + ExtractCost;
01722 }
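// Worked total (hypothetical numbers): a tree whose entries report costs
// {-3, -2}, with one external user whose extract costs 1 and no spill cost,
// yields -3 + -2 + 1 + 0 = -4. Callers compare this total against the
// slp-threshold option to decide whether vectorization pays off.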
01723 
01724 int BoUpSLP::getGatherCost(Type *Ty) {
01725   int Cost = 0;
01726   for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
01727     Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
01728   return Cost;
01729 }
01730 
01731 int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
01732   // Find the type of the operands in VL.
01733   Type *ScalarTy = VL[0]->getType();
01734   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
01735     ScalarTy = SI->getValueOperand()->getType();
01736   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
01737   // Find the cost of inserting/extracting values from the vector.
01738   return getGatherCost(VecTy);
01739 }
01740 
01741 Value *BoUpSLP::getPointerOperand(Value *I) {
01742   if (LoadInst *LI = dyn_cast<LoadInst>(I))
01743     return LI->getPointerOperand();
01744   if (StoreInst *SI = dyn_cast<StoreInst>(I))
01745     return SI->getPointerOperand();
01746   return nullptr;
01747 }
01748 
01749 unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
01750   if (LoadInst *L = dyn_cast<LoadInst>(I))
01751     return L->getPointerAddressSpace();
01752   if (StoreInst *S = dyn_cast<StoreInst>(I))
01753     return S->getPointerAddressSpace();
01754   return -1;
01755 }
01756 
01757 bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
01758   Value *PtrA = getPointerOperand(A);
01759   Value *PtrB = getPointerOperand(B);
01760   unsigned ASA = getAddressSpaceOperand(A);
01761   unsigned ASB = getAddressSpaceOperand(B);
01762 
01763   // Check that the address spaces match and that the pointers are valid.
01764   if (!PtrA || !PtrB || (ASA != ASB))
01765     return false;
01766 
01767   // Make sure that A and B are different pointers of the same type.
01768   if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
01769     return false;
01770 
01771   unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
01772   Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
01773   APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
01774 
01775   APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
01776   PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
01777   PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
01778 
01779   APInt OffsetDelta = OffsetB - OffsetA;
01780 
01781   // Check if they are based on the same pointer. That makes the offsets
01782   // sufficient.
01783   if (PtrA == PtrB)
01784     return OffsetDelta == Size;
01785 
01786   // Compute the base pointer delta needed to make the final offset delta
01787   // equal to the size.
01788   APInt BaseDelta = Size - OffsetDelta;
01789 
01790   // Otherwise compute the distance with SCEV between the base pointers.
01791   const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
01792   const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
01793   const SCEV *C = SE->getConstant(BaseDelta);
01794   const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
01795   return X == PtrSCEVB;
01796 }
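// Worked example for the fast path above (hypothetical IR): given
//
//   store i32 %x, i32* %p
//   %q = getelementptr inbounds i32* %p, i64 1
//   store i32 %y, i32* %q
//
// stripping in-bounds constant offsets leaves PtrA == PtrB == %p with
// OffsetA = 0 and OffsetB = 4. The i32 store size is 4 bytes, so
// OffsetDelta == Size and the two stores are consecutive.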
01797 
01798 void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
01799   Instruction *VL0 = cast<Instruction>(VL[0]);
01800   BasicBlock::iterator NextInst = VL0;
01801   ++NextInst;
01802   Builder.SetInsertPoint(VL0->getParent(), NextInst);
01803   Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
01804 }
01805 
01806 Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
01807   Value *Vec = UndefValue::get(Ty);
01808   // Generate the 'InsertElement' instruction.
01809   for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
01810     Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
01811     if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
01812       GatherSeq.insert(Insrt);
01813       CSEBlocks.insert(Insrt->getParent());
01814 
01815       // Add to our 'need-to-extract' list.
01816       if (ScalarToTreeEntry.count(VL[i])) {
01817         int Idx = ScalarToTreeEntry[VL[i]];
01818         TreeEntry *E = &VectorizableTree[Idx];
01819         // Find which lane we need to extract.
01820         int FoundLane = -1;
01821         for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
01822           // Is this the lane of the scalar that we are looking for?
01823           if (E->Scalars[Lane] == VL[i]) {
01824             FoundLane = Lane;
01825             break;
01826           }
01827         }
01828         assert(FoundLane >= 0 && "Could not find the correct lane");
01829         ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
01830       }
01831     }
01832   }
01833 
01834   return Vec;
01835 }
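// The emitted gather for a two-element bundle {%a, %b} looks like this
// (hypothetical IR):
//
//   %v0 = insertelement <2 x float> undef, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0, float %b, i32 1
//
// Each insertelement is recorded in GatherSeq and CSEBlocks so that
// optimizeGatherSequence can later hoist or CSE it.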
01836 
01837 Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
01838   SmallDenseMap<Value*, int>::const_iterator Entry
01839     = ScalarToTreeEntry.find(VL[0]);
01840   if (Entry != ScalarToTreeEntry.end()) {
01841     int Idx = Entry->second;
01842     const TreeEntry *En = &VectorizableTree[Idx];
01843     if (En->isSame(VL) && En->VectorizedValue)
01844       return En->VectorizedValue;
01845   }
01846   return nullptr;
01847 }
01848 
01849 Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
01850   if (ScalarToTreeEntry.count(VL[0])) {
01851     int Idx = ScalarToTreeEntry[VL[0]];
01852     TreeEntry *E = &VectorizableTree[Idx];
01853     if (E->isSame(VL))
01854       return vectorizeTree(E);
01855   }
01856 
01857   Type *ScalarTy = VL[0]->getType();
01858   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
01859     ScalarTy = SI->getValueOperand()->getType();
01860   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
01861 
01862   return Gather(VL, VecTy);
01863 }
01864 
01865 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
01866   IRBuilder<>::InsertPointGuard Guard(Builder);
01867 
01868   if (E->VectorizedValue) {
01869     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
01870     return E->VectorizedValue;
01871   }
01872 
01873   Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
01874   Type *ScalarTy = VL0->getType();
01875   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
01876     ScalarTy = SI->getValueOperand()->getType();
01877   VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
01878 
01879   if (E->NeedToGather) {
01880     setInsertPointAfterBundle(E->Scalars);
01881     return Gather(E->Scalars, VecTy);
01882   }
01883 
01884   unsigned Opcode = getSameOpcode(E->Scalars);
01885 
01886   switch (Opcode) {
01887     case Instruction::PHI: {
01888       PHINode *PH = cast<PHINode>(VL0);
01889       Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
01890       Builder.SetCurrentDebugLocation(PH->getDebugLoc());
01891       PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
01892       E->VectorizedValue = NewPhi;
01893 
01894       // PHINodes may have multiple entries from the same block. We want to
01895       // visit every block once.
01896       SmallSet<BasicBlock*, 4> VisitedBBs;
01897 
01898       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
01899         ValueList Operands;
01900         BasicBlock *IBB = PH->getIncomingBlock(i);
01901 
01902         if (!VisitedBBs.insert(IBB)) {
01903           NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
01904           continue;
01905         }
01906 
01907         // Prepare the operand vector.
01908         for (unsigned j = 0; j < E->Scalars.size(); ++j)
01909           Operands.push_back(cast<PHINode>(E->Scalars[j])->
01910                              getIncomingValueForBlock(IBB));
01911 
01912         Builder.SetInsertPoint(IBB->getTerminator());
01913         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
01914         Value *Vec = vectorizeTree(Operands);
01915         NewPhi->addIncoming(Vec, IBB);
01916       }
01917 
01918       assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
01919              "Invalid number of incoming values");
01920       return NewPhi;
01921     }
01922 
01923     case Instruction::ExtractElement: {
01924       if (CanReuseExtract(E->Scalars)) {
01925         Value *V = VL0->getOperand(0);
01926         E->VectorizedValue = V;
01927         return V;
01928       }
01929       return Gather(E->Scalars, VecTy);
01930     }
01931     case Instruction::ZExt:
01932     case Instruction::SExt:
01933     case Instruction::FPToUI:
01934     case Instruction::FPToSI:
01935     case Instruction::FPExt:
01936     case Instruction::PtrToInt:
01937     case Instruction::IntToPtr:
01938     case Instruction::SIToFP:
01939     case Instruction::UIToFP:
01940     case Instruction::Trunc:
01941     case Instruction::FPTrunc:
01942     case Instruction::BitCast: {
01943       ValueList INVL;
01944       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
01945         INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
01946 
01947       setInsertPointAfterBundle(E->Scalars);
01948 
01949       Value *InVec = vectorizeTree(INVL);
01950 
01951       if (Value *V = alreadyVectorized(E->Scalars))
01952         return V;
01953 
01954       CastInst *CI = cast<CastInst>(VL0);
01955       Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
01956       E->VectorizedValue = V;
01957       ++NumVectorInstructions;
01958       return V;
01959     }
01960     case Instruction::FCmp:
01961     case Instruction::ICmp: {
01962       ValueList LHSV, RHSV;
01963       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
01964         LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
01965         RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
01966       }
01967 
01968       setInsertPointAfterBundle(E->Scalars);
01969 
01970       Value *L = vectorizeTree(LHSV);
01971       Value *R = vectorizeTree(RHSV);
01972 
01973       if (Value *V = alreadyVectorized(E->Scalars))
01974         return V;
01975 
01976       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
01977       Value *V;
01978       if (Opcode == Instruction::FCmp)
01979         V = Builder.CreateFCmp(P0, L, R);
01980       else
01981         V = Builder.CreateICmp(P0, L, R);
01982 
01983       E->VectorizedValue = V;
01984       ++NumVectorInstructions;
01985       return V;
01986     }
01987     case Instruction::Select: {
01988       ValueList TrueVec, FalseVec, CondVec;
01989       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
01990         CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
01991         TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
01992         FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
01993       }
01994 
01995       setInsertPointAfterBundle(E->Scalars);
01996 
01997       Value *Cond = vectorizeTree(CondVec);
01998       Value *True = vectorizeTree(TrueVec);
01999       Value *False = vectorizeTree(FalseVec);
02000 
02001       if (Value *V = alreadyVectorized(E->Scalars))
02002         return V;
02003 
02004       Value *V = Builder.CreateSelect(Cond, True, False);
02005       E->VectorizedValue = V;
02006       ++NumVectorInstructions;
02007       return V;
02008     }
02009     case Instruction::Add:
02010     case Instruction::FAdd:
02011     case Instruction::Sub:
02012     case Instruction::FSub:
02013     case Instruction::Mul:
02014     case Instruction::FMul:
02015     case Instruction::UDiv:
02016     case Instruction::SDiv:
02017     case Instruction::FDiv:
02018     case Instruction::URem:
02019     case Instruction::SRem:
02020     case Instruction::FRem:
02021     case Instruction::Shl:
02022     case Instruction::LShr:
02023     case Instruction::AShr:
02024     case Instruction::And:
02025     case Instruction::Or:
02026     case Instruction::Xor: {
02027       ValueList LHSVL, RHSVL;
02028       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
02029         reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
02030       else
02031         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
02032           LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
02033           RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
02034         }
02035 
02036       setInsertPointAfterBundle(E->Scalars);
02037 
02038       Value *LHS = vectorizeTree(LHSVL);
02039       Value *RHS = vectorizeTree(RHSVL);
02040 
02041       if (LHS == RHS && isa<Instruction>(LHS)) {
02042         assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
02043       }
02044 
02045       if (Value *V = alreadyVectorized(E->Scalars))
02046         return V;
02047 
02048       BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
02049       Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
02050       E->VectorizedValue = V;
02051       propagateIRFlags(E->VectorizedValue, E->Scalars);
02052       ++NumVectorInstructions;
02053 
02054       if (Instruction *I = dyn_cast<Instruction>(V))
02055         return propagateMetadata(I, E->Scalars);
02056 
02057       return V;
02058     }
02059     case Instruction::Load: {
02060       // Loads are inserted at the head of the tree because we don't want to
02061       // sink them all the way down past store instructions.
02062       setInsertPointAfterBundle(E->Scalars);
02063 
02064       LoadInst *LI = cast<LoadInst>(VL0);
02065       Type *ScalarLoadTy = LI->getType();
02066       unsigned AS = LI->getPointerAddressSpace();
02067 
02068       Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
02069                                             VecTy->getPointerTo(AS));
02070 
02071       // The pointer operand uses an in-tree scalar, so we add the new BitCast
02072       // to the ExternalUses list to make sure that an extract will be
02073       // generated in the future.
02074       if (ScalarToTreeEntry.count(LI->getPointerOperand()))
02075         ExternalUses.push_back(
02076             ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
02077 
02078       unsigned Alignment = LI->getAlignment();
02079       LI = Builder.CreateLoad(VecPtr);
02080       if (!Alignment)
02081         Alignment = DL->getABITypeAlignment(ScalarLoadTy);
02082       LI->setAlignment(Alignment);
02083       E->VectorizedValue = LI;
02084       ++NumVectorInstructions;
02085       return propagateMetadata(LI, E->Scalars);
02086     }
02087     case Instruction::Store: {
02088       StoreInst *SI = cast<StoreInst>(VL0);
02089       unsigned Alignment = SI->getAlignment();
02090       unsigned AS = SI->getPointerAddressSpace();
02091 
02092       ValueList ValueOp;
02093       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
02094         ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
02095 
02096       setInsertPointAfterBundle(E->Scalars);
02097 
02098       Value *VecValue = vectorizeTree(ValueOp);
02099       Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
02100                                             VecTy->getPointerTo(AS));
02101       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
02102 
02103       // The pointer operand uses an in-tree scalar, so we add the new BitCast
02104       // to the ExternalUses list to make sure that an extract will be
02105       // generated in the future.
02106       if (ScalarToTreeEntry.count(SI->getPointerOperand()))
02107         ExternalUses.push_back(
02108             ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
02109 
02110       if (!Alignment)
02111         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
02112       S->setAlignment(Alignment);
02113       E->VectorizedValue = S;
02114       ++NumVectorInstructions;
02115       return propagateMetadata(S, E->Scalars);
02116     }
02117     case Instruction::GetElementPtr: {
02118       setInsertPointAfterBundle(E->Scalars);
02119 
02120       ValueList Op0VL;
02121       for (int i = 0, e = E->Scalars.size(); i < e; ++i)
02122         Op0VL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(0));
02123 
02124       Value *Op0 = vectorizeTree(Op0VL);
02125 
02126       std::vector<Value *> OpVecs;
02127       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
02128            ++j) {
02129         ValueList OpVL;
02130         for (int i = 0, e = E->Scalars.size(); i < e; ++i)
02131           OpVL.push_back(cast<GetElementPtrInst>(E->Scalars[i])->getOperand(j));
02132 
02133         Value *OpVec = vectorizeTree(OpVL);
02134         OpVecs.push_back(OpVec);
02135       }
02136 
02137       Value *V = Builder.CreateGEP(Op0, OpVecs);
02138       E->VectorizedValue = V;
02139       ++NumVectorInstructions;
02140 
02141       if (Instruction *I = dyn_cast<Instruction>(V))
02142         return propagateMetadata(I, E->Scalars);
02143 
02144       return V;
02145     }
02146     case Instruction::Call: {
02147       CallInst *CI = cast<CallInst>(VL0);
02148       setInsertPointAfterBundle(E->Scalars);
02149       Function *FI;
02150       Intrinsic::ID IID  = Intrinsic::not_intrinsic;
02151       Value *ScalarArg = nullptr;
02152       if (CI && (FI = CI->getCalledFunction())) {
02153         IID = (Intrinsic::ID) FI->getIntrinsicID();
02154       }
02155       std::vector<Value *> OpVecs;
02156       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
02157         ValueList OpVL;
02158         // ctlz, cttz and powi are special intrinsics whose second argument
02159         // is a scalar. This argument should not be vectorized.
02160         if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
02161           CallInst *CEI = cast<CallInst>(E->Scalars[0]);
02162           ScalarArg = CEI->getArgOperand(j);
02163           OpVecs.push_back(CEI->getArgOperand(j));
02164           continue;
02165         }
02166         for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
02167           CallInst *CEI = cast<CallInst>(E->Scalars[i]);
02168           OpVL.push_back(CEI->getArgOperand(j));
02169         }
02170 
02171         Value *OpVec = vectorizeTree(OpVL);
02172         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
02173         OpVecs.push_back(OpVec);
02174       }
02175 
02176       Module *M = F->getParent();
02177       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
02178       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
02179       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
02180       Value *V = Builder.CreateCall(CF, OpVecs);
02181 
02182       // The scalar argument uses an in-tree scalar, so we add the new
02183       // vectorized call to the ExternalUses list to make sure that an
02184       // extract will be generated in the future.
02185       if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
02186         ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
02187 
02188       E->VectorizedValue = V;
02189       ++NumVectorInstructions;
02190       return V;
02191     }
02192     case Instruction::ShuffleVector: {
02193       ValueList LHSVL, RHSVL;
02194       for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
02195         LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
02196         RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
02197       }
02198       setInsertPointAfterBundle(E->Scalars);
02199 
02200       Value *LHS = vectorizeTree(LHSVL);
02201       Value *RHS = vectorizeTree(RHSVL);
02202 
02203       if (Value *V = alreadyVectorized(E->Scalars))
02204         return V;
02205 
02206       // Create a vector of LHS op1 RHS
02207       BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
02208       Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
02209 
02210       // Create a vector of LHS op2 RHS
02211       Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
02212       BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
02213       Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
02214 
02215       // Create a shuffle to take alternating operations from the two vectors.
02216       // Also, gather up odd and even scalar ops to propagate IR flags to
02217       // each vector operation.
02218       ValueList OddScalars, EvenScalars;
02219       unsigned e = E->Scalars.size();
02220       SmallVector<Constant *, 8> Mask(e);
02221       for (unsigned i = 0; i < e; ++i) {
02222         if (i & 1) {
02223           Mask[i] = Builder.getInt32(e + i);
02224           OddScalars.push_back(E->Scalars[i]);
02225         } else {
02226           Mask[i] = Builder.getInt32(i);
02227           EvenScalars.push_back(E->Scalars[i]);
02228         }
02229       }
02230 
02231       Value *ShuffleMask = ConstantVector::get(Mask);
02232       propagateIRFlags(V0, EvenScalars);
02233       propagateIRFlags(V1, OddScalars);
02234 
02235       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
02236       E->VectorizedValue = V;
02237       ++NumVectorInstructions;
02238       if (Instruction *I = dyn_cast<Instruction>(V))
02239         return propagateMetadata(I, E->Scalars);
02240 
02241       return V;
02242     }
02243     default:
02244     llvm_unreachable("unknown inst");
02245   }
02246   return nullptr;
02247 }
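// Sketch of the alternate-shuffle emission in the ShuffleVector case above
// (hypothetical IR): for the scalar sequence {add, sub, add, sub}, both
// vector forms are created and blended with the even/odd mask built in the
// loop (<0, e+1, 2, e+3> with e = 4):
//
//   %vadd = add <4 x i32> %L, %R
//   %vsub = sub <4 x i32> %L, %R
//   %v = shufflevector <4 x i32> %vadd, <4 x i32> %vsub,
//                      <4 x i32> <i32 0, i32 5, i32 2, i32 7>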
02248 
02249 Value *BoUpSLP::vectorizeTree() {
02250   
02251   // All blocks must be scheduled before any instructions are inserted.
02252   for (auto &BSIter : BlocksSchedules) {
02253     scheduleBlock(BSIter.second.get());
02254   }
02255 
02256   Builder.SetInsertPoint(F->getEntryBlock().begin());
02257   vectorizeTree(&VectorizableTree[0]);
02258 
02259   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
02260 
02261   // Extract all of the elements with the external uses.
02262   for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
02263        it != e; ++it) {
02264     Value *Scalar = it->Scalar;
02265     llvm::User *User = it->User;
02266 
02267     // Skip users that we already RAUW'd. This happens when one instruction
02268     // has multiple uses of the same value.
02269     if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
02270         Scalar->user_end())
02271       continue;
02272     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
02273 
02274     int Idx = ScalarToTreeEntry[Scalar];
02275     TreeEntry *E = &VectorizableTree[Idx];
02276     assert(!E->NeedToGather && "Extracting from a gather list");
02277 
02278     Value *Vec = E->VectorizedValue;
02279     assert(Vec && "Can't find vectorizable value");
02280 
02281     Value *Lane = Builder.getInt32(it->Lane);
02282     // Generate extracts for out-of-tree users.
02283     // Find the insertion point for the extractelement lane.
02284     if (isa<Instruction>(Vec)) {
02285       if (PHINode *PH = dyn_cast<PHINode>(User)) {
02286         for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
02287           if (PH->getIncomingValue(i) == Scalar) {
02288             Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
02289             Value *Ex = Builder.CreateExtractElement(Vec, Lane);
02290             CSEBlocks.insert(PH->getIncomingBlock(i));
02291             PH->setOperand(i, Ex);
02292           }
02293         }
02294       } else {
02295         Builder.SetInsertPoint(cast<Instruction>(User));
02296         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
02297         CSEBlocks.insert(cast<Instruction>(User)->getParent());
02298         User->replaceUsesOfWith(Scalar, Ex);
02299       }
02300     } else {
02301       Builder.SetInsertPoint(F->getEntryBlock().begin());
02302       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
02303       CSEBlocks.insert(&F->getEntryBlock());
02304       User->replaceUsesOfWith(Scalar, Ex);
02305     }
02306 
02307     DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
02308   }
02309 
02310   // For each vectorized value:
02311   for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
02312     TreeEntry *Entry = &VectorizableTree[EIdx];
02313 
02314     // For each lane:
02315     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
02316       Value *Scalar = Entry->Scalars[Lane];
02317       // No need to handle users of gathered values.
02318       if (Entry->NeedToGather)
02319         continue;
02320 
02321       assert(Entry->VectorizedValue && "Can't find vectorizable value");
02322 
02323       Type *Ty = Scalar->getType();
02324       if (!Ty->isVoidTy()) {
02325 #ifndef NDEBUG
02326         for (User *U : Scalar->users()) {
02327           DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
02328 
02329           assert((ScalarToTreeEntry.count(U) ||
02330                   // It is legal to replace users in the ignorelist by undef.
02331                   (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
02332                    UserIgnoreList.end())) &&
02333                  "Replacing out-of-tree value with undef");
02334         }
02335 #endif
02336         Value *Undef = UndefValue::get(Ty);
02337         Scalar->replaceAllUsesWith(Undef);
02338       }
02339       DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
02340       cast<Instruction>(Scalar)->eraseFromParent();
02341     }
02342   }
02343 
02344   Builder.ClearInsertionPoint();
02345 
02346   return VectorizableTree[0].VectorizedValue;
02347 }
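// Extraction sketch (hypothetical IR): for an out-of-tree user of lane 2,
// the loop above rewrites
//
//   %use = fadd float %s2, 1.0
//
// into
//
//   %ex = extractelement <4 x float> %vec, i32 2
//   %use = fadd float %ex, 1.0
//
// and only afterwards are the now-dead in-tree scalars erased.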
02348 
02349 void BoUpSLP::optimizeGatherSequence() {
02350   DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
02351         << " gather sequences instructions.\n");
02352   // LICM InsertElementInst sequences.
02353   for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
02354        e = GatherSeq.end(); it != e; ++it) {
02355     InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
02356 
02357     if (!Insert)
02358       continue;
02359 
02360     // Check if this block is inside a loop.
02361     Loop *L = LI->getLoopFor(Insert->getParent());
02362     if (!L)
02363       continue;
02364 
02365     // Check if it has a preheader.
02366     BasicBlock *PreHeader = L->getLoopPreheader();
02367     if (!PreHeader)
02368       continue;
02369 
02370     // If the vector or the element that we insert into it are
02371     // instructions that are defined in this basic block then we can't
02372     // hoist this instruction.
02373     Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
02374     Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
02375     if (CurrVec && L->contains(CurrVec))
02376       continue;
02377     if (NewElem && L->contains(NewElem))
02378       continue;
02379 
02380     // We can hoist this instruction. Move it to the pre-header.
02381     Insert->moveBefore(PreHeader->getTerminator());
02382   }
02383 
02384   // Make a list of all reachable blocks in our CSE queue.
02385   SmallVector<const DomTreeNode *, 8> CSEWorkList;
02386   CSEWorkList.reserve(CSEBlocks.size());
02387   for (BasicBlock *BB : CSEBlocks)
02388     if (DomTreeNode *N = DT->getNode(BB)) {
02389       assert(DT->isReachableFromEntry(N));
02390       CSEWorkList.push_back(N);
02391     }
02392 
02393   // Sort blocks by domination. This ensures we visit a block after all blocks
02394   // dominating it are visited.
02395   std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
02396                    [this](const DomTreeNode *A, const DomTreeNode *B) {
02397     return DT->properlyDominates(A, B);
02398   });
02399 
02400   // Perform an O(N^2) search over the gather sequences and merge identical
02401   // instructions. TODO: We can further optimize this scan if we split the
02402   // instructions into different buckets based on the insert lane.
02403   SmallVector<Instruction *, 16> Visited;
02404   for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
02405     assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
02406            "Worklist not sorted properly!");
02407     BasicBlock *BB = (*I)->getBlock();
02408     // For all instructions in blocks containing gather sequences:
02409     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
02410       Instruction *In = it++;
02411       if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
02412         continue;
02413 
02414       // Check if we can replace this instruction with any of the
02415       // visited instructions.
02416       for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
02417                                                     ve = Visited.end();
02418            v != ve; ++v) {
02419         if (In->isIdenticalTo(*v) &&
02420             DT->dominates((*v)->getParent(), In->getParent())) {
02421           In->replaceAllUsesWith(*v);
02422           In->eraseFromParent();
02423           In = nullptr;
02424           break;
02425         }
02426       }
02427       if (In) {
02428         assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
02429         Visited.push_back(In);
02430       }
02431     }
02432   }
02433   CSEBlocks.clear();
02434   GatherSeq.clear();
02435 }
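// Hoisting sketch (hypothetical IR): an insertelement built inside a loop
// from loop-invariant operands,
//
//   loop:
//     %g = insertelement <2 x i32> %inv.vec, i32 %inv, i32 1
//
// is moved to the preheader by the LICM walk above; the dominance-ordered
// CSE walk then merges any identical gathers in dominated blocks.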
02436 
02437 // Groups the instructions into a bundle (which is then a single scheduling
02438 // entity) and schedules instructions until the bundle gets ready.
02439 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
02440                                                  AliasAnalysis *AA) {
02441   if (isa<PHINode>(VL[0]))
02442     return true;
02443 
02444   // Initialize the instruction bundle.
02445   Instruction *OldScheduleEnd = ScheduleEnd;
02446   ScheduleData *PrevInBundle = nullptr;
02447   ScheduleData *Bundle = nullptr;
02448   bool ReSchedule = false;
02449   DEBUG(dbgs() << "SLP:  bundle: " << *VL[0] << "\n");
02450   for (Value *V : VL) {
02451     extendSchedulingRegion(V);
02452     ScheduleData *BundleMember = getScheduleData(V);
02453     assert(BundleMember &&
02454            "no ScheduleData for bundle member (maybe not in same basic block)");
02455     if (BundleMember->IsScheduled) {
02456       // A bundle member was scheduled as a single instruction before and
02457       // now needs to be scheduled as part of the bundle. We just get rid
02458       // of the existing schedule.
02459       DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
02460                    << " was already scheduled\n");
02461       ReSchedule = true;
02462     }
02463     assert(BundleMember->isSchedulingEntity() &&
02464            "bundle member already part of other bundle");
02465     if (PrevInBundle) {
02466       PrevInBundle->NextInBundle = BundleMember;
02467     } else {
02468       Bundle = BundleMember;
02469     }
02470     BundleMember->UnscheduledDepsInBundle = 0;
02471     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
02472 
02473     // Group the instructions to a bundle.
02474     BundleMember->FirstInBundle = Bundle;
02475     PrevInBundle = BundleMember;
02476   }
02477   if (ScheduleEnd != OldScheduleEnd) {
02478     // The scheduling region got new instructions at the lower end (or it is a
02479     // new region for the first bundle). This makes it necessary to
02480     // recalculate all dependencies.
02481     // It is seldom that this needs to be done a second time after adding the
02482     // initial bundle to the region.
02483     for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
02484       ScheduleData *SD = getScheduleData(I);
02485       SD->clearDependencies();
02486     }
02487     ReSchedule = true;
02488   }
02489   if (ReSchedule) {
02490     resetSchedule();
02491     initialFillReadyList(ReadyInsts);
02492   }
02493 
02494   DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
02495                << BB->getName() << "\n");
02496 
02497   calculateDependencies(Bundle, true, AA);
02498 
02499   // Now try to schedule the new bundle. As soon as the bundle is "ready" it
02500   // means that there are no cyclic dependencies and we can schedule it.
02501   // Note that it's important that we don't "schedule" the bundle yet (see
02502   // cancelScheduling).
02503   while (!Bundle->isReady() && !ReadyInsts.empty()) {
02504 
02505     ScheduleData *pickedSD = ReadyInsts.back();
02506     ReadyInsts.pop_back();
02507 
02508     if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
02509       schedule(pickedSD, ReadyInsts);
02510     }
02511   }
02512   return Bundle->isReady();
02513 }
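// A condensed model of the readiness rule used above (the field name follows
// the real ScheduleData; the rest is a simplified sketch): a bundle can be
// scheduled only once no member is still waiting on an unscheduled
// dependency.
struct BundleReadinessSketch {
  int UnscheduledDepsInBundle; // summed over all bundle members
  bool isReady() const { return UnscheduledDepsInBundle == 0; }
};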
02514 
02515 void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
02516   if (isa<PHINode>(VL[0]))
02517     return;
02518 
02519   ScheduleData *Bundle = getScheduleData(VL[0]);
02520   DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
02521   assert(!Bundle->IsScheduled &&
02522          "Can't cancel bundle which is already scheduled");
02523   assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
02524          "tried to unbundle something which is not a bundle");
02525 
02526   // Un-bundle: make single instructions out of the bundle.
02527   ScheduleData *BundleMember = Bundle;
02528   while (BundleMember) {
02529     assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
02530     BundleMember->FirstInBundle = BundleMember;
02531     ScheduleData *Next = BundleMember->NextInBundle;
02532     BundleMember->NextInBundle = nullptr;
02533     BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
02534     if (BundleMember->UnscheduledDepsInBundle == 0) {
02535       ReadyInsts.insert(BundleMember);
02536     }
02537     BundleMember = Next;
02538   }
02539 }
02540 
02541 void BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
02542   if (getScheduleData(V))
02543     return;
02544   Instruction *I = dyn_cast<Instruction>(V);
02545   assert(I && "bundle member must be an instruction");
02546   assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
02547   if (!ScheduleStart) {
02548     // It's the first instruction in the new region.
02549     initScheduleData(I, I->getNextNode(), nullptr, nullptr);
02550     ScheduleStart = I;
02551     ScheduleEnd = I->getNextNode();
02552     assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
02553     DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
02554     return;
02555   }
02556   // Search up and down at the same time, because we don't know if the new
02557   // instruction is above or below the existing scheduling region.
02558   BasicBlock::reverse_iterator UpIter(ScheduleStart);
02559   BasicBlock::reverse_iterator UpperEnd = BB->rend();
02560   BasicBlock::iterator DownIter(ScheduleEnd);
02561   BasicBlock::iterator LowerEnd = BB->end();
02562   for (;;) {
02563     if (UpIter != UpperEnd) {
02564       if (&*UpIter == I) {
02565         initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
02566         ScheduleStart = I;
02567         DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I << "\n");
02568         return;
02569       }
02570       UpIter++;
02571     }
02572     if (DownIter != LowerEnd) {
02573       if (&*DownIter == I) {
02574         initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
02575                          nullptr);
02576         ScheduleEnd = I->getNextNode();
02577         assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
02578         DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
02579         return;
02580       }
02581       DownIter++;
02582     }
02583     assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
02584            "instruction not found in block");
02585   }
02586 }
02587 
02588 void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
02589                                                 Instruction *ToI,
02590                                                 ScheduleData *PrevLoadStore,
02591                                                 ScheduleData *NextLoadStore) {
02592   ScheduleData *CurrentLoadStore = PrevLoadStore;
02593   for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
02594     ScheduleData *SD = ScheduleDataMap[I];
02595     if (!SD) {
02596       // Allocate a new ScheduleData for the instruction.
02597       if (ChunkPos >= ChunkSize) {
02598         ScheduleDataChunks.push_back(
02599             llvm::make_unique<ScheduleData[]>(ChunkSize));
02600         ChunkPos = 0;
02601       }
02602       SD = &(ScheduleDataChunks.back()[ChunkPos++]);
02603       ScheduleDataMap[I] = SD;
02604       SD->Inst = I;
02605     }
02606     assert(!isInSchedulingRegion(SD) &&
02607            "new ScheduleData already in scheduling region");
02608     SD->init(SchedulingRegionID);
02609 
02610     if (I->mayReadOrWriteMemory()) {
02611       // Update the linked list of memory accessing instructions.
02612       if (CurrentLoadStore) {
02613         CurrentLoadStore->NextLoadStore = SD;
02614       } else {
02615         FirstLoadStoreInRegion = SD;
02616       }
02617       CurrentLoadStore = SD;
02618     }
02619   }
02620   if (NextLoadStore) {
02621     if (CurrentLoadStore)
02622       CurrentLoadStore->NextLoadStore = NextLoadStore;
02623   } else {
02624     LastLoadStoreInRegion = CurrentLoadStore;
02625   }
02626 }
02627 
02628 /// \returns the AA location that is being accessed by the instruction.
02629 static AliasAnalysis::Location getLocation(Instruction *I, AliasAnalysis *AA) {
02630   if (StoreInst *SI = dyn_cast<StoreInst>(I))
02631     return AA->getLocation(SI);
02632   if (LoadInst *LI = dyn_cast<LoadInst>(I))
02633     return AA->getLocation(LI);
02634   return AliasAnalysis::Location();
02635 }
02636 
02637 void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
02638                                                      bool InsertInReadyList,
02639                                                      AliasAnalysis *AA) {
02640   assert(SD->isSchedulingEntity());
02641 
02642   SmallVector<ScheduleData *, 10> WorkList;
02643   WorkList.push_back(SD);
02644 
02645   while (!WorkList.empty()) {
02646     ScheduleData *SD = WorkList.back();
02647     WorkList.pop_back();
02648 
02649     ScheduleData *BundleMember = SD;
02650     while (BundleMember) {
02651       assert(isInSchedulingRegion(BundleMember));
02652       if (!BundleMember->hasValidDependencies()) {
02653 
02654         DEBUG(dbgs() << "SLP:       update deps of " << *BundleMember << "\n");
02655         BundleMember->Dependencies = 0;
02656         BundleMember->resetUnscheduledDeps();
02657 
02658         // Handle def-use chain dependencies.
02659         for (User *U : BundleMember->Inst->users()) {
02660           if (isa<Instruction>(U)) {
02661             ScheduleData *UseSD = getScheduleData(U);
02662             if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
02663               BundleMember->Dependencies++;
02664               ScheduleData *DestBundle = UseSD->FirstInBundle;
02665               if (!DestBundle->IsScheduled) {
02666                 BundleMember->incrementUnscheduledDeps(1);
02667               }
02668               if (!DestBundle->hasValidDependencies()) {
02669                 WorkList.push_back(DestBundle);
02670               }
02671             }
02672           } else {
02673             // I'm not sure if this can ever happen. But we need to be safe.
02674             // This prevents the instruction/bundle from ever being scheduled,
02675             // eventually disabling vectorization.
02676             BundleMember->Dependencies++;
02677             BundleMember->incrementUnscheduledDeps(1);
02678           }
02679         }
02680 
02681         // Handle the memory dependencies.
02682         ScheduleData *DepDest = BundleMember->NextLoadStore;
02683         if (DepDest) {
02684           AliasAnalysis::Location SrcLoc = getLocation(BundleMember->Inst, AA);
02685           bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
02686 
02687           while (DepDest) {
02688             assert(isInSchedulingRegion(DepDest));
02689             if (SrcMayWrite || DepDest->Inst->mayWriteToMemory()) {
02690               AliasAnalysis::Location DstLoc = getLocation(DepDest->Inst, AA);
02691               if (!SrcLoc.Ptr || !DstLoc.Ptr || AA->alias(SrcLoc, DstLoc)) {
02692                 DepDest->MemoryDependencies.push_back(BundleMember);
02693                 BundleMember->Dependencies++;
02694                 ScheduleData *DestBundle = DepDest->FirstInBundle;
02695                 if (!DestBundle->IsScheduled) {
02696                   BundleMember->incrementUnscheduledDeps(1);
02697                 }
02698                 if (!DestBundle->hasValidDependencies()) {
02699                   WorkList.push_back(DestBundle);
02700                 }
02701               }
02702             }
02703             DepDest = DepDest->NextLoadStore;
02704           }
02705         }
02706       }
02707       BundleMember = BundleMember->NextInBundle;
02708     }
02709     if (InsertInReadyList && SD->isReady()) {
02710       ReadyInsts.push_back(SD);
02711       DEBUG(dbgs() << "SLP:     gets ready on update: " << *SD->Inst << "\n");
02712     }
02713   }
02714 }
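// Memory-dependency example (hypothetical IR): in the region
//
//   store i32 0, i32* %p   ; BundleMember
//   %x = load i32* %q      ; a later DepDest
//
// the store may write memory, so unless AA proves %p and %q cannot alias,
// the load records the store in its MemoryDependencies and the store gains
// one more dependency. The two accesses therefore keep their relative order
// in any final schedule.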
02715 
02716 void BoUpSLP::BlockScheduling::resetSchedule() {
02717   assert(ScheduleStart &&
02718          "tried to reset schedule on block which has not been scheduled");
02719   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
02720     ScheduleData *SD = getScheduleData(I);
02721     assert(isInSchedulingRegion(SD));
02722     SD->IsScheduled = false;
02723     SD->resetUnscheduledDeps();
02724   }
02725   ReadyInsts.clear();
02726 }
02727 
02728 void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
02729   
02730   if (!BS->ScheduleStart)
02731     return;
02732   
02733   DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
02734 
02735   BS->resetSchedule();
02736 
02737   // For the real scheduling we use a more sophisticated ready-list: it is
02738   // sorted by the original instruction location. This lets the final schedule
02739   // be as close as possible to the original instruction order.
02740   struct ScheduleDataCompare {
02741     bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
02742       return SD2->SchedulingPriority < SD1->SchedulingPriority;
02743     }
02744   };
02745   std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
02746 
02747   // Ensure that all dependency data is updated and fill the ready-list with
02748   // initial instructions.
02749   int Idx = 0;
02750   int NumToSchedule = 0;
02751   for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
02752        I = I->getNextNode()) {
02753     ScheduleData *SD = BS->getScheduleData(I);
02754     assert(
02755         SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
02756         "scheduler and vectorizer have different opinion on what is a bundle");
02757     SD->FirstInBundle->SchedulingPriority = Idx++;
02758     if (SD->isSchedulingEntity()) {
02759       BS->calculateDependencies(SD, false, AA);
02760       NumToSchedule++;
02761     }
02762   }
02763   BS->initialFillReadyList(ReadyInsts);
02764 
02765   Instruction *LastScheduledInst = BS->ScheduleEnd;
02766 
02767   // Do the "real" scheduling.
02768   while (!ReadyInsts.empty()) {
02769     ScheduleData *picked = *ReadyInsts.begin();
02770     ReadyInsts.erase(ReadyInsts.begin());
02771 
02772     // Move the scheduled instruction(s) to their dedicated places, if not
02773     // there yet.
02774     ScheduleData *BundleMember = picked;
02775     while (BundleMember) {
02776       Instruction *pickedInst = BundleMember->Inst;
02777       if (LastScheduledInst->getNextNode() != pickedInst) {
02778         BS->BB->getInstList().remove(pickedInst);
02779         BS->BB->getInstList().insert(LastScheduledInst, pickedInst);
02780       }
02781       LastScheduledInst = pickedInst;
02782       BundleMember = BundleMember->NextInBundle;
02783     }
02784 
02785     BS->schedule(picked, ReadyInsts);
02786     NumToSchedule--;
02787   }
02788   assert(NumToSchedule == 0 && "could not schedule all instructions");
02789 
02790   // Avoid duplicate scheduling of the block.
02791   BS->ScheduleStart = nullptr;
02792 }
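
// Editor's reading of the loop above (not in the original source): bundles
// are picked bottom-up. Among the ready bundles, the comparator selects the
// one that was latest in the original block order, and its instructions are
// moved in front of the previously placed instruction, rebuilding the region
// upward from ScheduleEnd. Bundle members thereby become adjacent, and
// definitions end up above their users.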
02793 
02794 /// The SLPVectorizer Pass.
02795 struct SLPVectorizer : public FunctionPass {
02796   typedef SmallVector<StoreInst *, 8> StoreList;
02797   typedef MapVector<Value *, StoreList> StoreListMap;
02798 
02799   /// Pass identification, replacement for typeid
02800   static char ID;
02801 
02802   explicit SLPVectorizer() : FunctionPass(ID) {
02803     initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
02804   }
02805 
02806   ScalarEvolution *SE;
02807   const DataLayout *DL;
02808   TargetTransformInfo *TTI;
02809   TargetLibraryInfo *TLI;
02810   AliasAnalysis *AA;
02811   LoopInfo *LI;
02812   DominatorTree *DT;
02813 
02814   bool runOnFunction(Function &F) override {
02815     if (skipOptnoneFunction(F))
02816       return false;
02817 
02818     SE = &getAnalysis<ScalarEvolution>();
02819     DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
02820     DL = DLP ? &DLP->getDataLayout() : nullptr;
02821     TTI = &getAnalysis<TargetTransformInfo>();
02822     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
02823     AA = &getAnalysis<AliasAnalysis>();
02824     LI = &getAnalysis<LoopInfo>();
02825     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
02826 
02827     StoreRefs.clear();
02828     bool Changed = false;
02829 
02830     // If the target claims to have no vector registers, don't attempt
02831     // vectorization.
02832     if (!TTI->getNumberOfRegisters(true))
02833       return false;
02834 
02835     // Must have DataLayout. We can't require it because some tests run
02836     // without a triple.
02837     if (!DL)
02838       return false;
02839 
02840     // Don't vectorize when the attribute NoImplicitFloat is used.
02841     if (F.hasFnAttribute(Attribute::NoImplicitFloat))
02842       return false;
02843 
02844     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
02845 
02846     // Use the bottom-up SLP vectorizer to construct chains that start with
02847     // store instructions.
02848     BoUpSLP R(&F, SE, DL, TTI, TLI, AA, LI, DT);
02849 
02850     // Scan the blocks in the function in post order.
02851     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
02852          e = po_end(&F.getEntryBlock()); it != e; ++it) {
02853       BasicBlock *BB = *it;
02854       // Vectorize trees that end at stores.
02855       if (unsigned count = collectStores(BB, R)) {
02856         (void)count;
02857         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
02858         Changed |= vectorizeStoreChains(R);
02859       }
02860 
02861       // Vectorize trees that end at reductions.
02862       Changed |= vectorizeChainsInBlock(BB, R);
02863     }
02864 
02865     if (Changed) {
02866       R.optimizeGatherSequence();
02867       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
02868       DEBUG(verifyFunction(F));
02869     }
02870     return Changed;
02871   }
02872 
02873   void getAnalysisUsage(AnalysisUsage &AU) const override {
02874     FunctionPass::getAnalysisUsage(AU);
02875     AU.addRequired<ScalarEvolution>();
02876     AU.addRequired<AliasAnalysis>();
02877     AU.addRequired<TargetTransformInfo>();
02878     AU.addRequired<LoopInfo>();
02879     AU.addRequired<DominatorTreeWrapperPass>();
02880     AU.addPreserved<LoopInfo>();
02881     AU.addPreserved<DominatorTreeWrapperPass>();
02882     AU.setPreservesCFG();
02883   }
02884 
02885 private:
02886 
02887   /// \brief Collect memory references and sort them according to their base
02888   /// object. Grouping stores by base object reduces the cost of the quadratic
02889   /// search for consecutive stores. TODO: We can further reduce this cost
02890   /// if we flush the chain creation every time we run into a memory barrier.
02891   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
02892 
02893   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
02894   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
02895 
02896   /// \brief Try to vectorize a list of operands.
02897   /// \param BuildVector A list of users to ignore for the purpose of
02898   ///                     scheduling and that don't need extracting.
02899   /// \returns true if a value was vectorized.
02900   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
02901                           ArrayRef<Value *> BuildVector = None,
02902                           bool allowReorder = false);
02903 
02904   /// \brief Try to vectorize a chain that may start at the operands of \p V.
02905   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
02906 
02907   /// \brief Vectorize the stores that were collected in StoreRefs.
02908   bool vectorizeStoreChains(BoUpSLP &R);
02909 
02910   /// \brief Scan the basic block and look for patterns that are likely to start
02911   /// a vectorization chain.
02912   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
02913 
02914   bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
02915                            BoUpSLP &R);
02916 
02917   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
02918                        BoUpSLP &R);
02919 private:
02920   StoreListMap StoreRefs;
02921 };
02922 
02923 /// \brief Check that the values in the slice of the VL array still exist in
02924 /// the WeakVH array.
02925 /// Vectorization of part of the VL array may cause later values in the VL array
02926 /// to become invalid. We track when this has happened in the WeakVH array.
02927 static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
02928                                SmallVectorImpl<WeakVH> &VH,
02929                                unsigned SliceBegin,
02930                                unsigned SliceSize) {
02931   for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
02932     if (VH[i] != VL[i])
02933       return true;
02934 
02935   return false;
02936 }
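
// Editor's note (not in the original source): callers snapshot VL into a
// parallel SmallVector<WeakVH> before vectorizing. If a later vectorizeTree()
// call RAUWs or erases VL[i], the corresponding WeakVH is retargeted or
// nulled and no longer compares equal to VL[i], so the stale slice is
// detected here and skipped.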
02937 
02938 bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
02939                                           int CostThreshold, BoUpSLP &R) {
02940   unsigned ChainLen = Chain.size();
02941   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
02942         << "\n");
02943   Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
02944   unsigned Sz = DL->getTypeSizeInBits(StoreTy);
02945   unsigned VF = MinVecRegSize / Sz;
02946 
02947   if (!isPowerOf2_32(Sz) || VF < 2)
02948     return false;
02949 
02950   // Keep track of values that were deleted by vectorizing in the loop below.
02951   SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
02952 
02953   bool Changed = false;
02954   // Look for profitable vectorizable trees at all offsets, starting at zero.
02955   for (unsigned i = 0, e = ChainLen; i < e; ++i) {
02956     if (i + VF > e)
02957       break;
02958 
02959     // Check that a previous iteration of this loop did not delete the Value.
02960     if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
02961       continue;
02962 
02963     DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
02964           << "\n");
02965     ArrayRef<Value *> Operands = Chain.slice(i, VF);
02966 
02967     R.buildTree(Operands);
02968 
02969     int Cost = R.getTreeCost();
02970 
02971     DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
02972     if (Cost < CostThreshold) {
02973       DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
02974       R.vectorizeTree();
02975 
02976       // Move to the next bundle.
02977       i += VF - 1;
02978       Changed = true;
02979     }
02980   }
02981 
02982   return Changed;
02983 }
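
// Worked example (editor's addition): for a chain of 6 consecutive i32
// stores and the default MinVecRegSize of 128 bits, Sz = 32 and
// VF = 128 / 32 = 4, so the loop examines the slices [0,4), [1,5) and
// [2,6). If [0,4) vectorizes, i jumps by VF - 1 and the next candidate
// would be [4,8), which fails the i + VF > e check and ends the scan.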
02984 
02985 bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
02986                                     int costThreshold, BoUpSLP &R) {
02987   SetVector<Value *> Heads, Tails;
02988   SmallDenseMap<Value *, Value *> ConsecutiveChain;
02989 
02990   // We may run into multiple chains that merge into a single chain. We mark the
02991   // stores that we vectorized so that we don't visit the same store twice.
02992   BoUpSLP::ValueSet VectorizedStores;
02993   bool Changed = false;
02994 
02995   // Do a quadratic search on all of the given stores and find
02996   // all of the pairs of stores that follow each other.
02997   for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
02998     for (unsigned j = 0; j < e; ++j) {
02999       if (i == j)
03000         continue;
03001 
03002       if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
03003         Tails.insert(Stores[j]);
03004         Heads.insert(Stores[i]);
03005         ConsecutiveChain[Stores[i]] = Stores[j];
03006       }
03007     }
03008   }
03009 
03010   // Process the chain heads: stores that start a chain but don't continue one.
03011   for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
03012        it != e; ++it) {
03013     if (Tails.count(*it))
03014       continue;
03015 
03016     // We found a store instr that starts a chain. Now follow the chain and try
03017     // to vectorize it.
03018     BoUpSLP::ValueList Operands;
03019     Value *I = *it;
03020     // Collect the chain into a list.
03021     while (Tails.count(I) || Heads.count(I)) {
03022       if (VectorizedStores.count(I))
03023         break;
03024       Operands.push_back(I);
03025       // Move to the next value in the chain.
03026       I = ConsecutiveChain[I];
03027     }
03028 
03029     bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
03030 
03031     // Mark the vectorized stores so that we don't vectorize them again.
03032     if (Vectorized)
03033       VectorizedStores.insert(Operands.begin(), Operands.end());
03034     Changed |= Vectorized;
03035   }
03036 
03037   return Changed;
03038 }
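
// Worked example (editor's addition): for three consecutive stores
// S0 -> S1 -> S2, the quadratic search records Heads = {S0, S1},
// Tails = {S1, S2} and ConsecutiveChain = {S0: S1, S1: S2}. S0 is the only
// head that is not also a tail, so the collection loop gathers
// Operands = [S0, S1, S2] and hands the whole chain to vectorizeStoreChain.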
03039 
03040 
03041 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
03042   unsigned count = 0;
03043   StoreRefs.clear();
03044   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
03045     StoreInst *SI = dyn_cast<StoreInst>(it);
03046     if (!SI)
03047       continue;
03048 
03049     // Don't touch volatile stores.
03050     if (!SI->isSimple())
03051       continue;
03052 
03053     // Check that the pointer points to scalars.
03054     Type *Ty = SI->getValueOperand()->getType();
03055     if (Ty->isAggregateType() || Ty->isVectorTy())
03056       continue;
03057 
03058     // Find the base pointer.
03059     Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
03060 
03061     // Save the store locations.
03062     StoreRefs[Ptr].push_back(SI);
03063     count++;
03064   }
03065   return count;
03066 }
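
// Editor's illustration (not in the original source): given
//
//   store i32 %v0, i32* %a      ; underlying object %a
//   store i32 %v1, i32* %a.gep  ; underlying object %a
//   store i32 %v2, i32* %b      ; underlying object %b
//
// StoreRefs ends up with two lists, {S0, S1} keyed by %a and {S2} keyed by
// %b, so the quadratic isConsecutiveAccess search in vectorizeStores never
// crosses base objects.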
03067 
03068 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
03069   if (!A || !B)
03070     return false;
03071   Value *VL[] = { A, B };
03072   return tryToVectorizeList(VL, R, None, true);
03073 }
03074 
03075 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
03076                                        ArrayRef<Value *> BuildVector,
03077                                        bool allowReorder) {
03078   if (VL.size() < 2)
03079     return false;
03080 
03081   DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
03082 
03083   // Check that all of the parts are scalar instructions of the same type.
03084   Instruction *I0 = dyn_cast<Instruction>(VL[0]);
03085   if (!I0)
03086     return false;
03087 
03088   unsigned Opcode0 = I0->getOpcode();
03089 
03090   Type *Ty0 = I0->getType();
03091   unsigned Sz = DL->getTypeSizeInBits(Ty0);
03092   unsigned VF = MinVecRegSize / Sz;
03093 
03094   for (int i = 0, e = VL.size(); i < e; ++i) {
03095     Type *Ty = VL[i]->getType();
03096     if (Ty->isAggregateType() || Ty->isVectorTy())
03097       return false;
03098     Instruction *Inst = dyn_cast<Instruction>(VL[i]);
03099     if (!Inst || Inst->getOpcode() != Opcode0)
03100       return false;
03101   }
03102 
03103   bool Changed = false;
03104 
03105   // Keep track of values that were deleted by vectorizing in the loop below.
03106   SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
03107 
03108   for (unsigned i = 0, e = VL.size(); i < e; ++i) {
03109     unsigned OpsWidth = 0;
03110 
03111     if (i + VF > e)
03112       OpsWidth = e - i;
03113     else
03114       OpsWidth = VF;
03115 
03116     if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
03117       break;
03118 
03119     // Check that a previous iteration of this loop did not delete the Value.
03120     if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
03121       continue;
03122 
03123     DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth
03124                  << " operations\n");
03125     ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
03126 
03127     ArrayRef<Value *> BuildVectorSlice;
03128     if (!BuildVector.empty())
03129       BuildVectorSlice = BuildVector.slice(i, OpsWidth);
03130 
03131     R.buildTree(Ops, BuildVectorSlice);
03132     // TODO: check if we can allow reordering also for other cases than
03133     // tryToVectorizePair()
03134     if (allowReorder && R.shouldReorder()) {
03135       assert(Ops.size() == 2);
03136       assert(BuildVectorSlice.empty());
03137       Value *ReorderedOps[] = { Ops[1], Ops[0] };
03138       R.buildTree(ReorderedOps, None);
03139     }
03140     int Cost = R.getTreeCost();
03141 
03142     if (Cost < -SLPCostThreshold) {
03143       DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
03144       Value *VectorizedRoot = R.vectorizeTree();
03145 
03146       // Reconstruct the build vector by extracting the vectorized root. This
03147       // way we handle the case where some elements of the vector are undefined.
03148       //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
03149       if (!BuildVectorSlice.empty()) {
03150         // The insert point is the last build vector instruction. The vectorized
03151         // root will precede it. This guarantees that we get an instruction. The
03152         // vectorized tree could have been constant folded.
03153         Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
03154         unsigned VecIdx = 0;
03155         for (auto &V : BuildVectorSlice) {
03156           IRBuilder<true, NoFolder> Builder(
03157               ++BasicBlock::iterator(InsertAfter));
03158           InsertElementInst *IE = cast<InsertElementInst>(V);
03159           Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
03160               VectorizedRoot, Builder.getInt32(VecIdx++)));
03161           IE->setOperand(1, Extract);
03162           IE->removeFromParent();
03163           IE->insertAfter(Extract);
03164           InsertAfter = IE;
03165         }
03166       }
03167       // Move to the next bundle.
03168       i += VF - 1;
03169       Changed = true;
03170     }
03171   }
03172 
03173   return Changed;
03174 }
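
// Worked example (editor's addition): for a list of 6 i32 operations and
// VF = 4, the loop tries the width-4 slices [0,4), [1,5) and [2,6). Once i
// reaches 3, OpsWidth becomes e - i = 3, which is not a power of two, so
// the scan stops instead of building a width-3 bundle.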
03175 
03176 bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
03177   if (!V)
03178     return false;
03179 
03180   // Try to vectorize V.
03181   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
03182     return true;
03183 
03184   BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
03185   BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
03186   // Try to skip B.
03187   if (B && B->hasOneUse()) {
03188     BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
03189     BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
03190     if (tryToVectorizePair(A, B0, R)) {
03191       return true;
03192     }
03193     if (tryToVectorizePair(A, B1, R)) {
03194       return true;
03195     }
03196   }
03197 
03198   // Try to skip A.
03199   if (A && A->hasOneUse()) {
03200     BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
03201     BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
03202     if (tryToVectorizePair(A0, B, R)) {
03203       return true;
03204     }
03205     if (tryToVectorizePair(A1, B, R)) {
03206       return true;
03207     }
03208   }
03209   return false;
03210 }
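
// Editor's illustration (not in the original source): for a root
// V = (A0 op A1) op (B0 op B1), the code first tries the pair (A, B); if
// that fails, it tries (A, B0) and (A, B1), and finally (A0, B) and (A1, B).
// This catches shapes where the profitable pair sits one level down in the
// expression tree.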
03211 
03212 /// \brief Generate a shuffle mask to be used in a reduction tree.
03213 ///
03214 /// \param VecLen The length of the vector to be reduced.
03215 /// \param NumEltsToRdx The number of elements that should be reduced in the
03216 ///        vector.
03217 /// \param IsPairwise Whether the reduction is a pairwise or splitting
03218 ///        reduction. A pairwise reduction will generate a mask of
03219 ///        <0,2,...> or <1,3,...> while a splitting reduction will generate
03220 ///        <2,3, undef,undef> for a vector of 4 and NumEltsToRdx = 2.
03221 /// \param IsLeft True will generate a mask of even elements, odd otherwise.
03222 static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
03223                                    bool IsPairwise, bool IsLeft,
03224                                    IRBuilder<> &Builder) {
03225   assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
03226 
03227   SmallVector<Constant *, 32> ShuffleMask(
03228       VecLen, UndefValue::get(Builder.getInt32Ty()));
03229 
03230   if (IsPairwise)
03231     // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
03232     for (unsigned i = 0; i != NumEltsToRdx; ++i)
03233       ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
03234   else
03235     // Move the upper half of the vector to the lower half.
03236     for (unsigned i = 0; i != NumEltsToRdx; ++i)
03237       ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
03238 
03239   return ConstantVector::get(ShuffleMask);
03240 }
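
// Concrete masks produced by the code above (editor's addition), for
// VecLen = 4 and NumEltsToRdx = 2:
//   pairwise, left  (IsPairwise = true,  IsLeft = true):  <0, 2, undef, undef>
//   pairwise, right (IsPairwise = true,  IsLeft = false): <1, 3, undef, undef>
//   splitting       (IsPairwise = false, IsLeft = false): <2, 3, undef, undef>
// A splitting reduction of a 4-wide vector therefore takes log2(4) = 2
// shuffle+op rounds, using <2,3,undef,undef> and then <1,undef,undef,undef>,
// leaving the result in lane 0.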
03241 
03242 
03243 /// Model horizontal reductions.
03244 ///
03245 /// A horizontal reduction is a tree of reduction operations (currently add and
03246 /// fadd) whose leaves are operations that can be put into a vector.
03247 /// For example, this tree:
03248 ///
03249 /// mul mul mul mul
03250 ///  \  /    \  /
03251 ///   +       +
03252 ///    \     /
03253 ///       +
03254 /// This tree has "mul" as its reduced values and "+" as its reduction
03255 /// operations. A reduction might be feeding into a store or a binary operation
03256 /// feeding a phi.
03257 ///    ...
03258 ///    \  /
03259 ///     +
03260 ///     |
03261 ///  phi +=
03262 ///
03263 ///  Or:
03264 ///    ...
03265 ///    \  /
03266 ///     +
03267 ///     |
03268 ///   *p =
03269 ///
03270 class HorizontalReduction {
03271   SmallVector<Value *, 16> ReductionOps;
03272   SmallVector<Value *, 32> ReducedVals;
03273 
03274   BinaryOperator *ReductionRoot;
03275   PHINode *ReductionPHI;
03276 
03277   /// The opcode of the reduction.
03278   unsigned ReductionOpcode;
03279   /// The opcode of the values we perform a reduction on.
03280   unsigned ReducedValueOpcode;
03281   /// The width of one full horizontal reduction operation.
03282   unsigned ReduxWidth;
03283   /// Should we model this reduction as a pairwise reduction tree or a tree that
03284   /// splits the vector in halves and adds those halves.
03285   bool IsPairwiseReduction;
03286 
03287 public:
03288   HorizontalReduction()
03289     : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
03290     ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
03291 
03292   /// \brief Try to find a reduction tree.
03293   bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
03294                                  const DataLayout *DL) {
03295     assert((!Phi ||
03296             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
03297            "The phi needs to use the binary operator");
03298 
03299     // We could have an initial reduction that is not an add.
03300     //  r *= v1 + v2 + v3 + v4
03301     // In such a case start looking for a tree rooted in the first '+'.
03302     if (Phi) {
03303       if (B->getOperand(0) == Phi) {
03304         Phi = nullptr;
03305         B = dyn_cast<BinaryOperator>(B->getOperand(1));
03306       } else if (B->getOperand(1) == Phi) {
03307         Phi = nullptr;
03308         B = dyn_cast<BinaryOperator>(B->getOperand(0));
03309       }
03310     }
03311 
03312     if (!B)
03313       return false;
03314 
03315     Type *Ty = B->getType();
03316     if (Ty->isVectorTy())
03317       return false;
03318 
03319     ReductionOpcode = B->getOpcode();
03320     ReducedValueOpcode = 0;
03321     ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
03322     ReductionRoot = B;
03323     ReductionPHI = Phi;
03324 
03325     if (ReduxWidth < 4)
03326       return false;
03327 
03328     // We currently only support integer add and fadd reductions.
03329     if (ReductionOpcode != Instruction::Add &&
03330         ReductionOpcode != Instruction::FAdd)
03331       return false;
03332 
03333     // Post-order traverse the reduction tree starting at B. We only handle true
03334     // trees containing only binary operators.
03335     SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
03336     Stack.push_back(std::make_pair(B, 0));
03337     while (!Stack.empty()) {
03338       BinaryOperator *TreeN = Stack.back().first;
03339       unsigned EdgeToVisit = Stack.back().second++;
03340       bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
03341 
03342       // Only handle trees in the current basic block.
03343       if (TreeN->getParent() != B->getParent())
03344         return false;
03345 
03346       // Each tree node needs to have one user except for the ultimate
03347       // reduction.
03348       if (!TreeN->hasOneUse() && TreeN != B)
03349         return false;
03350 
03351       // Postorder visit.
03352       if (EdgeToVisit == 2 || IsReducedValue) {
03353         if (IsReducedValue) {
03354           // Make sure that the opcodes of the operations that we are going to
03355           // reduce match.
03356           if (!ReducedValueOpcode)
03357             ReducedValueOpcode = TreeN->getOpcode();
03358           else if (ReducedValueOpcode != TreeN->getOpcode())
03359             return false;
03360           ReducedVals.push_back(TreeN);
03361         } else {
03362           // We need to be able to reassociate the adds.
03363           if (!TreeN->isAssociative())
03364             return false;
03365           ReductionOps.push_back(TreeN);
03366         }
03367         // Retract.
03368         Stack.pop_back();
03369         continue;
03370       }
03371 
03372       // Visit left or right.
03373       Value *NextV = TreeN->getOperand(EdgeToVisit);
03374       BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
03375       if (Next)
03376         Stack.push_back(std::make_pair(Next, 0));
03377       else if (NextV != Phi)
03378         return false;
03379     }
03380     return true;
03381   }
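
  // Editor's walkthrough (not in the original source): matching
  // ((a*b + c*d) + (e*f + g*h)) with ReductionOpcode == Add pushes each '+'
  // onto the stack and, once both of its edges have been visited, records it
  // in ReductionOps; each 'mul' has a different opcode, so it is recorded in
  // ReducedVals and popped immediately. Any operand that is neither a binary
  // operator nor the reduction phi aborts the match.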
03382 
03383   /// \brief Attempt to vectorize the tree found by
03384   /// matchAssociativeReduction.
03385   bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
03386     if (ReducedVals.empty())
03387       return false;
03388 
03389     unsigned NumReducedVals = ReducedVals.size();
03390     if (NumReducedVals < ReduxWidth)
03391       return false;
03392 
03393     Value *VectorizedTree = nullptr;
03394     IRBuilder<> Builder(ReductionRoot);
03395     FastMathFlags Unsafe;
03396     Unsafe.setUnsafeAlgebra();
03397     Builder.SetFastMathFlags(Unsafe);
03398     unsigned i = 0;
03399 
03400     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
03401       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
03402 
03403       // Estimate cost.
03404       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
03405       if (Cost >= -SLPCostThreshold)
03406         break;
03407 
03408       DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
03409                    << ". (HorRdx)\n");
03410 
03411       // Vectorize a tree.
03412       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
03413       Value *VectorizedRoot = V.vectorizeTree();
03414 
03415       // Emit a reduction.
03416       Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
03417       if (VectorizedTree) {
03418         Builder.SetCurrentDebugLocation(Loc);
03419         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
03420                                      ReducedSubTree, "bin.rdx");
03421       } else
03422         VectorizedTree = ReducedSubTree;
03423     }
03424 
03425     if (VectorizedTree) {
03426       // Finish the reduction.
03427       for (; i < NumReducedVals; ++i) {
03428         Builder.SetCurrentDebugLocation(
03429           cast<Instruction>(ReducedVals[i])->getDebugLoc());
03430         VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
03431                                      ReducedVals[i]);
03432       }
03433       // Update users.
03434       if (ReductionPHI) {
03435         assert(ReductionRoot && "Need a reduction operation");
03436         ReductionRoot->setOperand(0, VectorizedTree);
03437         ReductionRoot->setOperand(1, ReductionPHI);
03438       } else
03439         ReductionRoot->replaceAllUsesWith(VectorizedTree);
03440     }
03441     return VectorizedTree != nullptr;
03442   }
03443 
03444 private:
03445 
03446   /// \brief Calculate the cost of a reduction.
03447   int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
03448     Type *ScalarTy = FirstReducedVal->getType();
03449     Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
03450 
03451     int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
03452     int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
03453 
03454     IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
03455     int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
03456 
03457     int ScalarReduxCost =
03458         ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
03459 
03460     DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
03461                  << " for reduction that starts with " << *FirstReducedVal
03462                  << " (It is a "
03463                  << (IsPairwiseReduction ? "pairwise" : "splitting")
03464                  << " reduction)\n");
03465 
03466     return VecReduxCost - ScalarReduxCost;
03467   }
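
  // Editor's example with made-up TTI numbers: for a <4 x i32> add reduction,
  // suppose getReductionCost returns 3 (pairwise) and 2 (splitting) and the
  // scalar add costs 1. The splitting form is chosen, VecReduxCost = 2,
  // ScalarReduxCost = 4 * 1 = 4, and the reported delta is 2 - 4 = -2, a
  // saving of two cost units for the vectorized form.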
03468 
03469   static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
03470                             Value *R, const Twine &Name = "") {
03471     if (Opcode == Instruction::FAdd)
03472       return Builder.CreateFAdd(L, R, Name);
03473     return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
03474   }
03475 
03476   /// \brief Emit a horizontal reduction of the vectorized value.
03477   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
03478     assert(VectorizedValue && "Need to have a vectorized tree node");
03479     // The vectorized root may not be an Instruction (e.g. it could have
03480     // been constant folded), so keep it as a plain Value.
03481     assert(isPowerOf2_32(ReduxWidth) &&
03482            "We only handle power-of-two reductions for now");
03483     Value *TmpVec = VectorizedValue;
03484     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
03485       if (IsPairwiseReduction) {
03486         Value *LeftMask =
03487           createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
03488         Value *RightMask =
03489           createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
03490 
03491         Value *LeftShuf = Builder.CreateShuffleVector(
03492           TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
03493         Value *RightShuf = Builder.CreateShuffleVector(
03494           TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
03495           "rdx.shuf.r");
03496         TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
03497                              "bin.rdx");
03498       } else {
03499         Value *UpperHalf =
03500           createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
03501         Value *Shuf = Builder.CreateShuffleVector(
03502           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
03503         TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
03504       }
03505     }
03506 
03507     // The result is in the first element of the vector.
03508     return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
03509   }
03510 };
03511 
03512 /// \brief Recognize construction of vectors like
03513 ///  %ra = insertelement <4 x float> undef, float %s0, i32 0
03514 ///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
03515 ///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
03516 ///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
03517 ///
03518 /// \returns true if it matches.
03519 ///
03520 static bool findBuildVector(InsertElementInst *FirstInsertElem,
03521                             SmallVectorImpl<Value *> &BuildVector,
03522                             SmallVectorImpl<Value *> &BuildVectorOpds) {
03523   if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
03524     return false;
03525 
03526   InsertElementInst *IE = FirstInsertElem;
03527   while (true) {
03528     BuildVector.push_back(IE);
03529     BuildVectorOpds.push_back(IE->getOperand(1));
03530 
03531     if (IE->use_empty())
03532       return false;
03533 
03534     InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
03535     if (!NextUse)
03536       return true;
03537 
03538     // If this isn't the final use, make sure the next insertelement is the only
03539     // use. It's OK if the final constructed vector is used multiple times
03540     if (!IE->hasOneUse())
03541       return false;
03542 
03543     IE = NextUse;
03544   }
03545 
03546   return false;
03547 }
03548 
03549 static bool PhiTypeSorterFunc(Value *V, Value *V2) {
03550   return V->getType() < V2->getType();
03551 }
03552 
03553 bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
03554   bool Changed = false;
03555   SmallVector<Value *, 4> Incoming;
03556   SmallSet<Value *, 16> VisitedInstrs;
03557 
03558   bool HaveVectorizedPhiNodes = true;
03559   while (HaveVectorizedPhiNodes) {
03560     HaveVectorizedPhiNodes = false;
03561 
03562     // Collect the incoming values from the PHIs.
03563     Incoming.clear();
03564     for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
03565          ++instr) {
03566       PHINode *P = dyn_cast<PHINode>(instr);
03567       if (!P)
03568         break;
03569 
03570       if (!VisitedInstrs.count(P))
03571         Incoming.push_back(P);
03572     }
03573 
03574     // Sort by type.
03575     std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
03576 
03577     // Try to vectorize elements based on their type.
03578     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
03579                                            E = Incoming.end();
03580          IncIt != E;) {
03581 
03582       // Look for the next elements with the same type.
03583       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
03584       while (SameTypeIt != E &&
03585              (*SameTypeIt)->getType() == (*IncIt)->getType()) {
03586         VisitedInstrs.insert(*SameTypeIt);
03587         ++SameTypeIt;
03588       }
03589 
03590       // Try to vectorize them.
03591       unsigned NumElts = (SameTypeIt - IncIt);
03592       DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
03593       if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
03594         // Success. Start over because instructions might have been changed.
03595         HaveVectorizedPhiNodes = true;
03596         Changed = true;
03597         break;
03598       }
03599 
03600       // Start over at the next instruction of a different type (or the end).
03601       IncIt = SameTypeIt;
03602     }
03603   }
03604 
03605   VisitedInstrs.clear();
03606 
03607   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
03608     // We may go through BB multiple times, so skip instructions we've already checked.
03609     if (!VisitedInstrs.insert(it))
03610       continue;
03611 
03612     if (isa<DbgInfoIntrinsic>(it))
03613       continue;
03614 
03615     // Try to vectorize reductions that use PHINodes.
03616     if (PHINode *P = dyn_cast<PHINode>(it)) {
03617       // Check that the PHI is a reduction PHI.
03618       if (P->getNumIncomingValues() != 2)
03619         return Changed;
03620       Value *Rdx =
03621           (P->getIncomingBlock(0) == BB
03622                ? (P->getIncomingValue(0))
03623                : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1)
03624                                                : nullptr));
03625       // Check if this is a Binary Operator.
03626       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
03627       if (!BI)
03628         continue;
03629 
03630       // Try to match and vectorize a horizontal reduction.
03631       HorizontalReduction HorRdx;
03632       if (ShouldVectorizeHor &&
03633           HorRdx.matchAssociativeReduction(P, BI, DL) &&
03634           HorRdx.tryToReduce(R, TTI)) {
03635         Changed = true;
03636         it = BB->begin();
03637         e = BB->end();
03638         continue;
03639       }
03640 
03641       Value *Inst = BI->getOperand(0);
03642       if (Inst == P)
03643         Inst = BI->getOperand(1);
03644 
03645       if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
03646         // We would like to start over since some instructions are deleted
03647         // and the iterator may become invalid.
03648         Changed = true;
03649         it = BB->begin();
03650         e = BB->end();
03651         continue;
03652       }
03653 
03654       continue;
03655     }
03656 
03657     // Try to vectorize horizontal reductions feeding into a store.
03658     if (ShouldStartVectorizeHorAtStore)
03659       if (StoreInst *SI = dyn_cast<StoreInst>(it))
03660         if (BinaryOperator *BinOp =
03661                 dyn_cast<BinaryOperator>(SI->getValueOperand())) {
03662           HorizontalReduction HorRdx;
03663           if (((HorRdx.matchAssociativeReduction(nullptr, BinOp, DL) &&
03664                 HorRdx.tryToReduce(R, TTI)) ||
03665                tryToVectorize(BinOp, R))) {
03666             Changed = true;
03667             it = BB->begin();
03668             e = BB->end();
03669             continue;
03670           }
03671         }
03672 
03673     // Try to vectorize trees that start at compare instructions.
03674     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
03675       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
03676         Changed = true;
03677         // We would like to start over since some instructions are deleted
03678         // and the iterator may become invalid.
03679         it = BB->begin();
03680         e = BB->end();
03681         continue;
03682       }
03683 
03684       for (int i = 0; i < 2; ++i) {
03685         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
03686           if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
03687             Changed = true;
03688             // We would like to start over since some instructions are deleted
03689             // and the iterator may become invalid.
03690             it = BB->begin();
03691             e = BB->end();
03692           }
03693         }
03694       }
03695       continue;
03696     }
03697 
03698     // Try to vectorize trees that start at insertelement instructions.
03699     if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
03700       SmallVector<Value *, 16> BuildVector;
03701       SmallVector<Value *, 16> BuildVectorOpds;
03702       if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
03703         continue;
03704 
03705       // Vectorize starting with the build vector operands ignoring the
03706       // BuildVector instructions for the purpose of scheduling and user
03707       // extraction.
03708       if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
03709         Changed = true;
03710         it = BB->begin();
03711         e = BB->end();
03712       }
03713 
03714       continue;
03715     }
03716   }
03717 
03718   return Changed;
03719 }
03720 
03721 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
03722   bool Changed = false;
03723   // Attempt to sort and vectorize each of the store-groups.
03724   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
03725        it != e; ++it) {
03726     if (it->second.size() < 2)
03727       continue;
03728 
03729     DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
03730           << it->second.size() << ".\n");
03731 
03732     // Process the stores in chunks of 16.
03733     for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
03734       unsigned Len = std::min<unsigned>(CE - CI, 16);
03735       Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
03736                                  -SLPCostThreshold, R);
03737     }
03738   }
03739   return Changed;
03740 }
03741 
03742 } // end anonymous namespace
03743 
03744 char SLPVectorizer::ID = 0;
03745 static const char lv_name[] = "SLP Vectorizer";
03746 INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
03747 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
03748 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
03749 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
03750 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
03751 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
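
// Usage note (editor's addition): the pass is registered under SV_NAME, so
// it can be exercised in isolation with opt, e.g.
//   opt -slp-vectorizer -S input.ll
// possibly together with an alias-analysis pass such as -basicaa, depending
// on what the input needs.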
03752 
03753 namespace llvm {
03754 Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
03755 }