LLVM API Documentation
00001 //===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // For best-case performance on Cortex-A57, we should try to use a balanced 00010 // mix of odd and even D-registers when performing a critical sequence of 00011 // independent, non-quadword FP/ASIMD floating-point multiply or 00012 // multiply-accumulate operations. 00013 // 00014 // This pass attempts to detect situations where the register allocation may 00015 // adversely affect this load balancing and to change the registers used so as 00016 // to better utilize the CPU. 00017 // 00018 // Ideally we'd just take each multiply or multiply-accumulate in turn and 00019 // allocate it alternating even or odd registers. However, multiply-accumulates 00020 // are most efficiently performed in the same functional unit as their 00021 // accumulation operand. Therefore this pass tries to find maximal sequences 00022 // ("Chains") of multiply-accumulates linked via their accumulation operand, 00023 // and assign them all the same "color" (oddness/evenness). 00024 // 00025 // This optimization affects S-register and D-register floating point 00026 // multiplies and FMADD/FMAs, as well as vector (floating point only) muls and 00027 // FMADD/FMA. Q register instructions (and 128-bit vector instructions) are 00028 // not affected. 00029 //===----------------------------------------------------------------------===// 00030 00031 #include "AArch64.h" 00032 #include "AArch64InstrInfo.h" 00033 #include "AArch64Subtarget.h" 00034 #include "llvm/ADT/BitVector.h" 00035 #include "llvm/ADT/EquivalenceClasses.h" 00036 #include "llvm/CodeGen/MachineFunction.h" 00037 #include "llvm/CodeGen/MachineFunctionPass.h" 00038 #include "llvm/CodeGen/MachineInstr.h" 00039 #include "llvm/CodeGen/MachineInstrBuilder.h" 00040 #include "llvm/CodeGen/MachineRegisterInfo.h" 00041 #include "llvm/CodeGen/RegisterScavenging.h" 00042 #include "llvm/CodeGen/RegisterClassInfo.h" 00043 #include "llvm/Support/CommandLine.h" 00044 #include "llvm/Support/Debug.h" 00045 #include "llvm/Support/raw_ostream.h" 00046 #include <list> 00047 using namespace llvm; 00048 00049 #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" 00050 00051 // Enforce the algorithm to use the scavenged register even when the original 00052 // destination register is the correct color. Used for testing. 00053 static cl::opt<bool> 00054 TransformAll("aarch64-a57-fp-load-balancing-force-all", 00055 cl::desc("Always modify dest registers regardless of color"), 00056 cl::init(false), cl::Hidden); 00057 00058 // Never use the balance information obtained from chains - return a specific 00059 // color always. Used for testing. 00060 static cl::opt<unsigned> 00061 OverrideBalance("aarch64-a57-fp-load-balancing-override", 00062 cl::desc("Ignore balance information, always return " 00063 "(1: Even, 2: Odd)."), 00064 cl::init(0), cl::Hidden); 00065 00066 //===----------------------------------------------------------------------===// 00067 // Helper functions 00068 00069 // Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs? 00070 static bool isMul(MachineInstr *MI) { 00071 switch (MI->getOpcode()) { 00072 case AArch64::FMULSrr: 00073 case AArch64::FNMULSrr: 00074 case AArch64::FMULDrr: 00075 case AArch64::FNMULDrr: 00076 return true; 00077 default: 00078 return false; 00079 } 00080 } 00081 00082 // Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs? 00083 static bool isMla(MachineInstr *MI) { 00084 switch (MI->getOpcode()) { 00085 case AArch64::FMSUBSrrr: 00086 case AArch64::FMADDSrrr: 00087 case AArch64::FNMSUBSrrr: 00088 case AArch64::FNMADDSrrr: 00089 case AArch64::FMSUBDrrr: 00090 case AArch64::FMADDDrrr: 00091 case AArch64::FNMSUBDrrr: 00092 case AArch64::FNMADDDrrr: 00093 return true; 00094 default: 00095 return false; 00096 } 00097 } 00098 00099 //===----------------------------------------------------------------------===// 00100 00101 namespace { 00102 /// A "color", which is either even or odd. Yes, these aren't really colors 00103 /// but the algorithm is conceptually doing two-color graph coloring. 00104 enum class Color { Even, Odd }; 00105 #ifndef NDEBUG 00106 static const char *ColorNames[2] = { "Even", "Odd" }; 00107 #endif 00108 00109 class Chain; 00110 00111 class AArch64A57FPLoadBalancing : public MachineFunctionPass { 00112 const AArch64InstrInfo *TII; 00113 MachineRegisterInfo *MRI; 00114 const TargetRegisterInfo *TRI; 00115 RegisterClassInfo RCI; 00116 00117 public: 00118 static char ID; 00119 explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {} 00120 00121 bool runOnMachineFunction(MachineFunction &F) override; 00122 00123 const char *getPassName() const override { 00124 return "A57 FP Anti-dependency breaker"; 00125 } 00126 00127 void getAnalysisUsage(AnalysisUsage &AU) const override { 00128 AU.setPreservesCFG(); 00129 MachineFunctionPass::getAnalysisUsage(AU); 00130 } 00131 00132 private: 00133 bool runOnBasicBlock(MachineBasicBlock &MBB); 00134 bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB, 00135 int &Balance); 00136 bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB); 00137 int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB); 00138 void scanInstruction(MachineInstr *MI, unsigned Idx, 00139 std::map<unsigned, Chain*> &Active, 00140 std::set<std::unique_ptr<Chain>> &AllChains); 00141 void maybeKillChain(MachineOperand &MO, unsigned Idx, 00142 std::map<unsigned, Chain*> &RegChains); 00143 Color getColor(unsigned Register); 00144 Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L); 00145 }; 00146 char AArch64A57FPLoadBalancing::ID = 0; 00147 00148 /// A Chain is a sequence of instructions that are linked together by 00149 /// an accumulation operand. For example: 00150 /// 00151 /// fmul d0<def>, ? 00152 /// fmla d1<def>, ?, ?, d0<kill> 00153 /// fmla d2<def>, ?, ?, d1<kill> 00154 /// 00155 /// There may be other instructions interleaved in the sequence that 00156 /// do not belong to the chain. These other instructions must not use 00157 /// the "chain" register at any point. 00158 /// 00159 /// We currently only support chains where the "chain" operand is killed 00160 /// at each link in the chain for simplicity. 00161 /// A chain has three important instructions - Start, Last and Kill. 00162 /// * The start instruction is the first instruction in the chain. 00163 /// * Last is the final instruction in the chain. 00164 /// * Kill may or may not be defined. If defined, Kill is the instruction 00165 /// where the outgoing value of the Last instruction is killed. 00166 /// This information is important as if we know the outgoing value is 00167 /// killed with no intervening uses, we can safely change its register. 00168 /// 00169 /// Without a kill instruction, we must assume the outgoing value escapes 00170 /// beyond our model and either must not change its register or must 00171 /// create a fixup FMOV to keep the old register value consistent. 00172 /// 00173 class Chain { 00174 public: 00175 /// The important (marker) instructions. 00176 MachineInstr *StartInst, *LastInst, *KillInst; 00177 /// The index, from the start of the basic block, that each marker 00178 /// appears. These are stored so we can do quick interval tests. 00179 unsigned StartInstIdx, LastInstIdx, KillInstIdx; 00180 /// All instructions in the chain. 00181 std::set<MachineInstr*> Insts; 00182 /// True if KillInst cannot be modified. If this is true, 00183 /// we cannot change LastInst's outgoing register. 00184 /// This will be true for tied values and regmasks. 00185 bool KillIsImmutable; 00186 /// The "color" of LastInst. This will be the preferred chain color, 00187 /// as changing intermediate nodes is easy but changing the last 00188 /// instruction can be more tricky. 00189 Color LastColor; 00190 00191 Chain(MachineInstr *MI, unsigned Idx, Color C) 00192 : StartInst(MI), LastInst(MI), KillInst(nullptr), 00193 StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0), 00194 LastColor(C) { 00195 Insts.insert(MI); 00196 } 00197 00198 /// Add a new instruction into the chain. The instruction's dest operand 00199 /// has the given color. 00200 void add(MachineInstr *MI, unsigned Idx, Color C) { 00201 LastInst = MI; 00202 LastInstIdx = Idx; 00203 LastColor = C; 00204 assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && 00205 "Chain: broken invariant. A Chain can only be killed after its last " 00206 "def"); 00207 00208 Insts.insert(MI); 00209 } 00210 00211 /// Return true if MI is a member of the chain. 00212 bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } 00213 00214 /// Return the number of instructions in the chain. 00215 unsigned size() const { 00216 return Insts.size(); 00217 } 00218 00219 /// Inform the chain that its last active register (the dest register of 00220 /// LastInst) is killed by MI with no intervening uses or defs. 00221 void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) { 00222 KillInst = MI; 00223 KillInstIdx = Idx; 00224 KillIsImmutable = Immutable; 00225 assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) && 00226 "Chain: broken invariant. A Chain can only be killed after its last " 00227 "def"); 00228 } 00229 00230 /// Return the first instruction in the chain. 00231 MachineInstr *getStart() const { return StartInst; } 00232 /// Return the last instruction in the chain. 00233 MachineInstr *getLast() const { return LastInst; } 00234 /// Return the "kill" instruction (as set with setKill()) or NULL. 00235 MachineInstr *getKill() const { return KillInst; } 00236 /// Return an instruction that can be used as an iterator for the end 00237 /// of the chain. This is the maximum of KillInst (if set) and LastInst. 00238 MachineBasicBlock::iterator getEnd() const { 00239 return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); 00240 } 00241 00242 /// Can the Kill instruction (assuming one exists) be modified? 00243 bool isKillImmutable() const { return KillIsImmutable; } 00244 00245 /// Return the preferred color of this chain. 00246 Color getPreferredColor() { 00247 if (OverrideBalance != 0) 00248 return OverrideBalance == 1 ? Color::Even : Color::Odd; 00249 return LastColor; 00250 } 00251 00252 /// Return true if this chain (StartInst..KillInst) overlaps with Other. 00253 bool rangeOverlapsWith(const Chain &Other) const { 00254 unsigned End = KillInst ? KillInstIdx : LastInstIdx; 00255 unsigned OtherEnd = Other.KillInst ? 00256 Other.KillInstIdx : Other.LastInstIdx; 00257 00258 return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End; 00259 } 00260 00261 /// Return true if this chain starts before Other. 00262 bool startsBefore(Chain *Other) { 00263 return StartInstIdx < Other->StartInstIdx; 00264 } 00265 00266 /// Return true if the group will require a fixup MOV at the end. 00267 bool requiresFixup() const { 00268 return (getKill() && isKillImmutable()) || !getKill(); 00269 } 00270 00271 /// Return a simple string representation of the chain. 00272 std::string str() const { 00273 std::string S; 00274 raw_string_ostream OS(S); 00275 00276 OS << "{"; 00277 StartInst->print(OS, NULL, true); 00278 OS << " -> "; 00279 LastInst->print(OS, NULL, true); 00280 if (KillInst) { 00281 OS << " (kill @ "; 00282 KillInst->print(OS, NULL, true); 00283 OS << ")"; 00284 } 00285 OS << "}"; 00286 00287 return OS.str(); 00288 } 00289 00290 }; 00291 00292 } // end anonymous namespace 00293 00294 //===----------------------------------------------------------------------===// 00295 00296 bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { 00297 bool Changed = false; 00298 DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n"); 00299 00300 const TargetMachine &TM = F.getTarget(); 00301 MRI = &F.getRegInfo(); 00302 TRI = F.getRegInfo().getTargetRegisterInfo(); 00303 TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo(); 00304 RCI.runOnMachineFunction(F); 00305 00306 for (auto &MBB : F) { 00307 Changed |= runOnBasicBlock(MBB); 00308 } 00309 00310 return Changed; 00311 } 00312 00313 bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { 00314 bool Changed = false; 00315 DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n"); 00316 00317 // First, scan the basic block producing a set of chains. 00318 00319 // The currently "active" chains - chains that can be added to and haven't 00320 // been killed yet. This is keyed by register - all chains can only have one 00321 // "link" register between each inst in the chain. 00322 std::map<unsigned, Chain*> ActiveChains; 00323 std::set<std::unique_ptr<Chain>> AllChains; 00324 unsigned Idx = 0; 00325 for (auto &MI : MBB) 00326 scanInstruction(&MI, Idx++, ActiveChains, AllChains); 00327 00328 DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n"); 00329 00330 // Group the chains into disjoint sets based on their liveness range. This is 00331 // a poor-man's version of graph coloring. Ideally we'd create an interference 00332 // graph and perform full-on graph coloring on that, but; 00333 // (a) That's rather heavyweight for only two colors. 00334 // (b) We expect multiple disjoint interference regions - in practice the live 00335 // range of chains is quite small and they are clustered between loads 00336 // and stores. 00337 EquivalenceClasses<Chain*> EC; 00338 for (auto &I : AllChains) 00339 EC.insert(I.get()); 00340 00341 for (auto &I : AllChains) 00342 for (auto &J : AllChains) 00343 if (I != J && I->rangeOverlapsWith(*J)) 00344 EC.unionSets(I.get(), J.get()); 00345 DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n"); 00346 00347 // Now we assume that every member of an equivalence class interferes 00348 // with every other member of that class, and with no members of other classes. 00349 00350 // Convert the EquivalenceClasses to a simpler set of sets. 00351 std::vector<std::vector<Chain*> > V; 00352 for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { 00353 std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end()); 00354 if (Cs.empty()) continue; 00355 V.push_back(Cs); 00356 } 00357 00358 // Now we have a set of sets, order them by start address so 00359 // we can iterate over them sequentially. 00360 std::sort(V.begin(), V.end(), 00361 [](const std::vector<Chain*> &A, 00362 const std::vector<Chain*> &B) { 00363 return A.front()->startsBefore(B.front()); 00364 }); 00365 00366 // As we only have two colors, we can track the global (BB-level) balance of 00367 // odds versus evens. We aim to keep this near zero to keep both execution 00368 // units fed. 00369 // Positive means we're even-heavy, negative we're odd-heavy. 00370 // 00371 // FIXME: If chains have interdependencies, for example: 00372 // mul r0, r1, r2 00373 // mul r3, r0, r1 00374 // We do not model this and may color each one differently, assuming we'll 00375 // get ILP when we obviously can't. This hasn't been seen to be a problem 00376 // in practice so far, so we simplify the algorithm by ignoring it. 00377 int Parity = 0; 00378 00379 for (auto &I : V) 00380 Changed |= colorChainSet(I, MBB, Parity); 00381 00382 return Changed; 00383 } 00384 00385 Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor, 00386 std::vector<Chain*> &L) { 00387 if (L.empty()) 00388 return nullptr; 00389 00390 // We try and get the best candidate from L to color next, given that our 00391 // preferred color is "PreferredColor". L is ordered from larger to smaller 00392 // chains. It is beneficial to color the large chains before the small chains, 00393 // but if we can't find a chain of the maximum length with the preferred color, 00394 // we fuzz the size and look for slightly smaller chains before giving up and 00395 // returning a chain that must be recolored. 00396 00397 // FIXME: Does this need to be configurable? 00398 const unsigned SizeFuzz = 1; 00399 unsigned MinSize = L.front()->size() - SizeFuzz; 00400 for (auto I = L.begin(), E = L.end(); I != E; ++I) { 00401 if ((*I)->size() <= MinSize) { 00402 // We've gone past the size limit. Return the previous item. 00403 Chain *Ch = *--I; 00404 L.erase(I); 00405 return Ch; 00406 } 00407 00408 if ((*I)->getPreferredColor() == PreferredColor) { 00409 Chain *Ch = *I; 00410 L.erase(I); 00411 return Ch; 00412 } 00413 } 00414 00415 // Bailout case - just return the first item. 00416 Chain *Ch = L.front(); 00417 L.erase(L.begin()); 00418 return Ch; 00419 } 00420 00421 bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, 00422 MachineBasicBlock &MBB, 00423 int &Parity) { 00424 bool Changed = false; 00425 DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n"); 00426 00427 // Sort by descending size order so that we allocate the most important 00428 // sets first. 00429 // Tie-break equivalent sizes by sorting chains requiring fixups before 00430 // those without fixups. The logic here is that we should look at the 00431 // chains that we cannot change before we look at those we can, 00432 // so the parity counter is updated and we know what color we should 00433 // change them to! 00434 std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) { 00435 if (G1->size() != G2->size()) 00436 return G1->size() > G2->size(); 00437 return G1->requiresFixup() > G2->requiresFixup(); 00438 }); 00439 00440 Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd; 00441 while (Chain *G = getAndEraseNext(PreferredColor, GV)) { 00442 // Start off by assuming we'll color to our own preferred color. 00443 Color C = PreferredColor; 00444 if (Parity == 0) 00445 // But if we really don't care, use the chain's preferred color. 00446 C = G->getPreferredColor(); 00447 00448 DEBUG(dbgs() << " - Parity=" << Parity << ", Color=" 00449 << ColorNames[(int)C] << "\n"); 00450 00451 // If we'll need a fixup FMOV, don't bother. Testing has shown that this 00452 // happens infrequently and when it does it has at least a 50% chance of 00453 // slowing code down instead of speeding it up. 00454 if (G->requiresFixup() && C != G->getPreferredColor()) { 00455 C = G->getPreferredColor(); 00456 DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; " 00457 "color remains " << ColorNames[(int)C] << "\n"); 00458 } 00459 00460 Changed |= colorChain(G, C, MBB); 00461 00462 Parity += (C == Color::Even) ? G->size() : -G->size(); 00463 PreferredColor = Parity < 0 ? Color::Even : Color::Odd; 00464 } 00465 00466 return Changed; 00467 } 00468 00469 int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, 00470 MachineBasicBlock &MBB) { 00471 RegScavenger RS; 00472 RS.enterBasicBlock(&MBB); 00473 RS.forward(MachineBasicBlock::iterator(G->getStart())); 00474 00475 // Can we find an appropriate register that is available throughout the life 00476 // of the chain? 00477 unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; 00478 BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); 00479 for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); 00480 I != E; ++I) { 00481 RS.forward(I); 00482 AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); 00483 00484 // Remove any registers clobbered by a regmask. 00485 for (auto J : I->operands()) { 00486 if (J.isRegMask()) 00487 AvailableRegs.clearBitsNotInMask(J.getRegMask()); 00488 } 00489 } 00490 00491 // Make sure we allocate in-order, to get the cheapest registers first. 00492 auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID)); 00493 for (auto Reg : Ord) { 00494 if (!AvailableRegs[Reg]) 00495 continue; 00496 if ((C == Color::Even && (Reg % 2) == 0) || 00497 (C == Color::Odd && (Reg % 2) == 1)) 00498 return Reg; 00499 } 00500 00501 return -1; 00502 } 00503 00504 bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, 00505 MachineBasicBlock &MBB) { 00506 bool Changed = false; 00507 DEBUG(dbgs() << " - colorChain(" << G->str() << ", " 00508 << ColorNames[(int)C] << ")\n"); 00509 00510 // Try and obtain a free register of the right class. Without a register 00511 // to play with we cannot continue. 00512 int Reg = scavengeRegister(G, C, MBB); 00513 if (Reg == -1) { 00514 DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n"); 00515 return false; 00516 } 00517 DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); 00518 00519 std::map<unsigned, unsigned> Substs; 00520 for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); 00521 I != E; ++I) { 00522 if (!G->contains(I) && 00523 (&*I != G->getKill() || G->isKillImmutable())) 00524 continue; 00525 00526 // I is a member of G, or I is a mutable instruction that kills G. 00527 00528 std::vector<unsigned> ToErase; 00529 for (auto &U : I->operands()) { 00530 if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { 00531 unsigned OrigReg = U.getReg(); 00532 U.setReg(Substs[OrigReg]); 00533 if (U.isKill()) 00534 // Don't erase straight away, because there may be other operands 00535 // that also reference this substitution! 00536 ToErase.push_back(OrigReg); 00537 } else if (U.isRegMask()) { 00538 for (auto J : Substs) { 00539 if (U.clobbersPhysReg(J.first)) 00540 ToErase.push_back(J.first); 00541 } 00542 } 00543 } 00544 // Now it's safe to remove the substs identified earlier. 00545 for (auto J : ToErase) 00546 Substs.erase(J); 00547 00548 // Only change the def if this isn't the last instruction. 00549 if (&*I != G->getKill()) { 00550 MachineOperand &MO = I->getOperand(0); 00551 00552 bool Change = TransformAll || getColor(MO.getReg()) != C; 00553 if (G->requiresFixup() && &*I == G->getLast()) 00554 Change = false; 00555 00556 if (Change) { 00557 Substs[MO.getReg()] = Reg; 00558 MO.setReg(Reg); 00559 MRI->setPhysRegUsed(Reg); 00560 00561 Changed = true; 00562 } 00563 } 00564 } 00565 assert(Substs.size() == 0 && "No substitutions should be left active!"); 00566 00567 if (G->getKill()) { 00568 DEBUG(dbgs() << " - Kill instruction seen.\n"); 00569 } else { 00570 // We didn't have a kill instruction, but we didn't seem to need to change 00571 // the destination register anyway. 00572 DEBUG(dbgs() << " - Destination register not changed.\n"); 00573 } 00574 return Changed; 00575 } 00576 00577 void AArch64A57FPLoadBalancing:: 00578 scanInstruction(MachineInstr *MI, unsigned Idx, 00579 std::map<unsigned, Chain*> &ActiveChains, 00580 std::set<std::unique_ptr<Chain>> &AllChains) { 00581 // Inspect "MI", updating ActiveChains and AllChains. 00582 00583 if (isMul(MI)) { 00584 00585 for (auto &I : MI->uses()) 00586 maybeKillChain(I, Idx, ActiveChains); 00587 for (auto &I : MI->defs()) 00588 maybeKillChain(I, Idx, ActiveChains); 00589 00590 // Create a new chain. Multiplies don't require forwarding so can go on any 00591 // unit. 00592 unsigned DestReg = MI->getOperand(0).getReg(); 00593 00594 DEBUG(dbgs() << "New chain started for register " 00595 << TRI->getName(DestReg) << " at " << *MI); 00596 00597 auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); 00598 ActiveChains[DestReg] = G.get(); 00599 AllChains.insert(std::move(G)); 00600 00601 } else if (isMla(MI)) { 00602 00603 // It is beneficial to keep MLAs on the same functional unit as their 00604 // accumulator operand. 00605 unsigned DestReg = MI->getOperand(0).getReg(); 00606 unsigned AccumReg = MI->getOperand(3).getReg(); 00607 00608 maybeKillChain(MI->getOperand(1), Idx, ActiveChains); 00609 maybeKillChain(MI->getOperand(2), Idx, ActiveChains); 00610 if (DestReg != AccumReg) 00611 maybeKillChain(MI->getOperand(0), Idx, ActiveChains); 00612 00613 if (ActiveChains.find(AccumReg) != ActiveChains.end()) { 00614 DEBUG(dbgs() << "Chain found for accumulator register " 00615 << TRI->getName(AccumReg) << " in MI " << *MI); 00616 00617 // For simplicity we only chain together sequences of MULs/MLAs where the 00618 // accumulator register is killed on each instruction. This means we don't 00619 // need to track other uses of the registers we want to rewrite. 00620 // 00621 // FIXME: We could extend to handle the non-kill cases for more coverage. 00622 if (MI->getOperand(3).isKill()) { 00623 // Add to chain. 00624 DEBUG(dbgs() << "Instruction was successfully added to chain.\n"); 00625 ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg)); 00626 // Handle cases where the destination is not the same as the accumulator. 00627 if (DestReg != AccumReg) { 00628 ActiveChains[DestReg] = ActiveChains[AccumReg]; 00629 ActiveChains.erase(AccumReg); 00630 } 00631 return; 00632 } 00633 00634 DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't " 00635 << "marked <kill>!\n"); 00636 maybeKillChain(MI->getOperand(3), Idx, ActiveChains); 00637 } 00638 00639 DEBUG(dbgs() << "Creating new chain for dest register " 00640 << TRI->getName(DestReg) << "\n"); 00641 auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg)); 00642 ActiveChains[DestReg] = G.get(); 00643 AllChains.insert(std::move(G)); 00644 00645 } else { 00646 00647 // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs 00648 // lists. 00649 for (auto &I : MI->uses()) 00650 maybeKillChain(I, Idx, ActiveChains); 00651 for (auto &I : MI->defs()) 00652 maybeKillChain(I, Idx, ActiveChains); 00653 00654 } 00655 } 00656 00657 void AArch64A57FPLoadBalancing:: 00658 maybeKillChain(MachineOperand &MO, unsigned Idx, 00659 std::map<unsigned, Chain*> &ActiveChains) { 00660 // Given an operand and the set of active chains (keyed by register), 00661 // determine if a chain should be ended and remove from ActiveChains. 00662 MachineInstr *MI = MO.getParent(); 00663 00664 if (MO.isReg()) { 00665 00666 // If this is a KILL of a current chain, record it. 00667 if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) { 00668 DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg()) 00669 << "\n"); 00670 ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied()); 00671 } 00672 ActiveChains.erase(MO.getReg()); 00673 00674 } else if (MO.isRegMask()) { 00675 00676 for (auto I = ActiveChains.begin(), E = ActiveChains.end(); 00677 I != E;) { 00678 if (MO.clobbersPhysReg(I->first)) { 00679 DEBUG(dbgs() << "Kill (regmask) seen for chain " 00680 << TRI->getName(I->first) << "\n"); 00681 I->second->setKill(MI, Idx, /*Immutable=*/true); 00682 ActiveChains.erase(I++); 00683 } else 00684 ++I; 00685 } 00686 00687 } 00688 } 00689 00690 Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) { 00691 if ((TRI->getEncodingValue(Reg) % 2) == 0) 00692 return Color::Even; 00693 else 00694 return Color::Odd; 00695 } 00696 00697 // Factory function used by AArch64TargetMachine to add the pass to the passmanager. 00698 FunctionPass *llvm::createAArch64A57FPLoadBalancing() { 00699 return new AArch64A57FPLoadBalancing(); 00700 }