LLVM API Documentation

R600ControlFlowFinalizer.cpp
Go to the documentation of this file.
00001 //===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 /// \file
00011 /// This pass compute turns all control flow pseudo instructions into native one
00012 /// computing their address on the fly ; it also sets STACK_SIZE info.
00013 //===----------------------------------------------------------------------===//
00014 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <set>
#include <utility>
#include <vector>
00026 
00027 using namespace llvm;
00028 
00029 #define DEBUG_TYPE "r600cf"
00030 
00031 namespace {
00032 
/// Models the hardware control-flow stack so the pass can compute the
/// STACK_SIZE each shader needs.  Branches may consume either a full stack
/// entry or a fraction of one ("sub-entry"); loops always consume a full
/// entry.
struct CFStack {

  // Kinds of items that can be pushed on the modeled stack.  The
  // FIRST_NON_WQM_* variants account for extra space the first non-WQM
  // push requires on some generations (see getSubEntrySize()).
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget &ST;
  std::vector<StackItem> BranchStack; // Currently open branch constructs.
  std::vector<StackItem> LoopStack;   // Currently open loop constructs.
  unsigned MaxStackSize;              // High-water mark, in full entries.
  unsigned CurrentEntries;            // Full entries currently in use.
  unsigned CurrentSubEntries;         // Sub-entries currently in use.

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};
00064 
00065 unsigned CFStack::getLoopDepth() {
00066   return LoopStack.size();
00067 }
00068 
00069 bool CFStack::branchStackContains(CFStack::StackItem Item) {
00070   for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
00071        E = BranchStack.end(); I != E; ++I) {
00072     if (*I == Item)
00073       return true;
00074   }
00075   return false;
00076 }
00077 
/// \returns true if emitting \p Opcode with the current stack state would
/// trip a known hardware bug, so the caller must emit a separate explicit
/// push (see the ALU_PUSH_BEFORE handling in runOnMachineFunction).
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  // Cayman: ALU_PUSH_BEFORE inside nested loops needs the work-around
  // unconditionally.
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  // Other subtargets only need it when the CF ALU bug is present.
  if (!ST.hasCFAluBug())
    return false;

  switch(Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST.getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
      // resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST.getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}
00115 
00116 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
00117   switch(Item) {
00118   default:
00119     return 0;
00120   case CFStack::FIRST_NON_WQM_PUSH:
00121   assert(!ST.hasCaymanISA());
00122   if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
00123     // +1 For the push operation.
00124     // +2 Extra space required.
00125     return 3;
00126   } else {
00127     // Some documentation says that this is not necessary on Evergreen,
00128     // but experimentation has show that we need to allocate 1 extra
00129     // sub-entry for the first non-WQM push.
00130     // +1 For the push operation.
00131     // +1 Extra space required.
00132     return 2;
00133   }
00134   case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
00135     assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
00136     // +1 For the push operation.
00137     // +1 Extra space required.
00138     return 2;
00139   case CFStack::SUB_ENTRY:
00140     return 1;
00141   }
00142 }
00143 
00144 void CFStack::updateMaxStackSize() {
00145   unsigned CurrentStackSize = CurrentEntries +
00146                               (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
00147   MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
00148 }
00149 
/// Record that a branch push instruction \p Opcode has been emitted.
/// \p isWQM indicates the push happens while whole-quad-mode is active,
/// in which case a full entry is consumed; otherwise the push is classified
/// as one of the (cheaper) sub-entry kinds.  The order of the checks below
/// matters: the FIRST_NON_WQM_* kinds may be used at most once each.
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch(Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
                                             // See comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST.hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  // Full entries and sub-entries are tracked in separate counters; see
  // updateMaxStackSize() for how they combine.
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}
00178 
00179 void CFStack::pushLoop() {
00180   LoopStack.push_back(CFStack::ENTRY);
00181   CurrentEntries++;
00182   updateMaxStackSize();
00183 }
00184 
00185 void CFStack::popBranch() {
00186   CFStack::StackItem Top = BranchStack.back();
00187   if (Top == CFStack::ENTRY)
00188     CurrentEntries--;
00189   else
00190     CurrentSubEntries-= getSubEntrySize(Top);
00191   BranchStack.pop_back();
00192 }
00193 
00194 void CFStack::popLoop() {
00195   CurrentEntries--;
00196   LoopStack.pop_back();
00197 }
00198 
00199 class R600ControlFlowFinalizer : public MachineFunctionPass {
00200 
00201 private:
00202   typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
00203 
00204   enum ControlFlowInstruction {
00205     CF_TC,
00206     CF_VC,
00207     CF_CALL_FS,
00208     CF_WHILE_LOOP,
00209     CF_END_LOOP,
00210     CF_LOOP_BREAK,
00211     CF_LOOP_CONTINUE,
00212     CF_JUMP,
00213     CF_ELSE,
00214     CF_POP,
00215     CF_END
00216   };
00217 
00218   static char ID;
00219   const R600InstrInfo *TII;
00220   const R600RegisterInfo *TRI;
00221   unsigned MaxFetchInst;
00222   const AMDGPUSubtarget &ST;
00223 
00224   bool IsTrivialInst(MachineInstr *MI) const {
00225     switch (MI->getOpcode()) {
00226     case AMDGPU::KILL:
00227     case AMDGPU::RETURN:
00228       return true;
00229     default:
00230       return false;
00231     }
00232   }
00233 
00234   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
00235     unsigned Opcode = 0;
00236     bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
00237     switch (CFI) {
00238     case CF_TC:
00239       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
00240       break;
00241     case CF_VC:
00242       Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
00243       break;
00244     case CF_CALL_FS:
00245       Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
00246       break;
00247     case CF_WHILE_LOOP:
00248       Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
00249       break;
00250     case CF_END_LOOP:
00251       Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
00252       break;
00253     case CF_LOOP_BREAK:
00254       Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
00255       break;
00256     case CF_LOOP_CONTINUE:
00257       Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
00258       break;
00259     case CF_JUMP:
00260       Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
00261       break;
00262     case CF_ELSE:
00263       Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
00264       break;
00265     case CF_POP:
00266       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
00267       break;
00268     case CF_END:
00269       if (ST.hasCaymanISA()) {
00270         Opcode = AMDGPU::CF_END_CM;
00271         break;
00272       }
00273       Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
00274       break;
00275     }
00276     assert (Opcode && "No opcode selected");
00277     return TII->get(Opcode);
00278   }
00279 
00280   bool isCompatibleWithClause(const MachineInstr *MI,
00281       std::set<unsigned> &DstRegs) const {
00282     unsigned DstMI, SrcMI;
00283     for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
00284         E = MI->operands_end(); I != E; ++I) {
00285       const MachineOperand &MO = *I;
00286       if (!MO.isReg())
00287         continue;
00288       if (MO.isDef()) {
00289         unsigned Reg = MO.getReg();
00290         if (AMDGPU::R600_Reg128RegClass.contains(Reg))
00291           DstMI = Reg;
00292         else
00293           DstMI = TRI->getMatchingSuperReg(Reg,
00294               TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
00295               &AMDGPU::R600_Reg128RegClass);
00296       }
00297       if (MO.isUse()) {
00298         unsigned Reg = MO.getReg();
00299         if (AMDGPU::R600_Reg128RegClass.contains(Reg))
00300           SrcMI = Reg;
00301         else
00302           SrcMI = TRI->getMatchingSuperReg(Reg,
00303               TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
00304               &AMDGPU::R600_Reg128RegClass);
00305       }
00306     }
00307     if ((DstRegs.find(SrcMI) == DstRegs.end())) {
00308       DstRegs.insert(DstMI);
00309       return true;
00310     } else
00311       return false;
00312   }
00313 
00314   ClauseFile
00315   MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
00316       const {
00317     MachineBasicBlock::iterator ClauseHead = I;
00318     std::vector<MachineInstr *> ClauseContent;
00319     unsigned AluInstCount = 0;
00320     bool IsTex = TII->usesTextureCache(ClauseHead);
00321     std::set<unsigned> DstRegs;
00322     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
00323       if (IsTrivialInst(I))
00324         continue;
00325       if (AluInstCount >= MaxFetchInst)
00326         break;
00327       if ((IsTex && !TII->usesTextureCache(I)) ||
00328           (!IsTex && !TII->usesVertexCache(I)))
00329         break;
00330       if (!isCompatibleWithClause(I, DstRegs))
00331         break;
00332       AluInstCount ++;
00333       ClauseContent.push_back(I);
00334     }
00335     MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
00336         getHWInstrDesc(IsTex?CF_TC:CF_VC))
00337         .addImm(0) // ADDR
00338         .addImm(AluInstCount - 1); // COUNT
00339     return ClauseFile(MIb, ClauseContent);
00340   }
00341 
00342   void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
00343     static const unsigned LiteralRegs[] = {
00344       AMDGPU::ALU_LITERAL_X,
00345       AMDGPU::ALU_LITERAL_Y,
00346       AMDGPU::ALU_LITERAL_Z,
00347       AMDGPU::ALU_LITERAL_W
00348     };
00349     const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
00350         TII->getSrcs(MI);
00351     for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
00352       if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
00353         continue;
00354       int64_t Imm = Srcs[i].second;
00355       std::vector<int64_t>::iterator It =
00356           std::find(Lits.begin(), Lits.end(), Imm);
00357       if (It != Lits.end()) {
00358         unsigned Index = It - Lits.begin();
00359         Srcs[i].first->setReg(LiteralRegs[Index]);
00360       } else {
00361         assert(Lits.size() < 4 && "Too many literals in Instruction Group");
00362         Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
00363         Lits.push_back(Imm);
00364       }
00365     }
00366   }
00367 
00368   MachineBasicBlock::iterator insertLiterals(
00369       MachineBasicBlock::iterator InsertPos,
00370       const std::vector<unsigned> &Literals) const {
00371     MachineBasicBlock *MBB = InsertPos->getParent();
00372     for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
00373       unsigned LiteralPair0 = Literals[i];
00374       unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
00375       InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
00376           TII->get(AMDGPU::LITERALS))
00377           .addImm(LiteralPair0)
00378           .addImm(LiteralPair1);
00379     }
00380     return InsertPos;
00381   }
00382 
00383   ClauseFile
00384   MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
00385       const {
00386     MachineBasicBlock::iterator ClauseHead = I;
00387     std::vector<MachineInstr *> ClauseContent;
00388     I++;
00389     for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
00390       if (IsTrivialInst(I)) {
00391         ++I;
00392         continue;
00393       }
00394       if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
00395         break;
00396       std::vector<int64_t> Literals;
00397       if (I->isBundle()) {
00398         MachineInstr *DeleteMI = I;
00399         MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
00400         while (++BI != E && BI->isBundledWithPred()) {
00401           BI->unbundleFromPred();
00402           for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
00403             MachineOperand &MO = BI->getOperand(i);
00404             if (MO.isReg() && MO.isInternalRead())
00405               MO.setIsInternalRead(false);
00406           }
00407           getLiteral(BI, Literals);
00408           ClauseContent.push_back(BI);
00409         }
00410         I = BI;
00411         DeleteMI->eraseFromParent();
00412       } else {
00413         getLiteral(I, Literals);
00414         ClauseContent.push_back(I);
00415         I++;
00416       }
00417       for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
00418         unsigned literal0 = Literals[i];
00419         unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
00420         MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
00421             TII->get(AMDGPU::LITERALS))
00422             .addImm(literal0)
00423             .addImm(literal2);
00424         ClauseContent.push_back(MILit);
00425       }
00426     }
00427     assert(ClauseContent.size() < 128 && "ALU clause is too big");
00428     ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
00429     return ClauseFile(ClauseHead, ClauseContent);
00430   }
00431 
00432   void
00433   EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
00434       unsigned &CfCount) {
00435     CounterPropagateAddr(Clause.first, CfCount);
00436     MachineBasicBlock *BB = Clause.first->getParent();
00437     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
00438         .addImm(CfCount);
00439     for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
00440       BB->splice(InsertPos, BB, Clause.second[i]);
00441     }
00442     CfCount += 2 * Clause.second.size();
00443   }
00444 
00445   void
00446   EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
00447       unsigned &CfCount) {
00448     Clause.first->getOperand(0).setImm(0);
00449     CounterPropagateAddr(Clause.first, CfCount);
00450     MachineBasicBlock *BB = Clause.first->getParent();
00451     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
00452         .addImm(CfCount);
00453     for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
00454       BB->splice(InsertPos, BB, Clause.second[i]);
00455     }
00456     CfCount += Clause.second.size();
00457   }
00458 
00459   void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
00460     MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
00461   }
00462   void CounterPropagateAddr(std::set<MachineInstr *> MIs, unsigned Addr)
00463       const {
00464     for (std::set<MachineInstr *>::iterator It = MIs.begin(), E = MIs.end();
00465         It != E; ++It) {
00466       MachineInstr *MI = *It;
00467       CounterPropagateAddr(MI, Addr);
00468     }
00469   }
00470 
00471 public:
00472   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
00473     TII (nullptr), TRI(nullptr),
00474     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
00475       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
00476       MaxFetchInst = ST.getTexVTXClauseSize();
00477   }
00478 
00479   bool runOnMachineFunction(MachineFunction &MF) override {
00480     TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
00481     TRI = static_cast<const R600RegisterInfo *>(
00482         MF.getSubtarget().getRegisterInfo());
00483     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
00484 
00485     CFStack CFStack(ST, MFI->getShaderType());
00486     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
00487         ++MB) {
00488       MachineBasicBlock &MBB = *MB;
00489       unsigned CfCount = 0;
00490       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
00491       std::vector<MachineInstr * > IfThenElseStack;
00492       if (MFI->getShaderType() == ShaderType::VERTEX) {
00493         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
00494             getHWInstrDesc(CF_CALL_FS));
00495         CfCount++;
00496       }
00497       std::vector<ClauseFile> FetchClauses, AluClauses;
00498       std::vector<MachineInstr *> LastAlu(1);
00499       std::vector<MachineInstr *> ToPopAfter;
00500       
00501       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
00502           I != E;) {
00503         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
00504           DEBUG(dbgs() << CfCount << ":"; I->dump(););
00505           FetchClauses.push_back(MakeFetchClause(MBB, I));
00506           CfCount++;
00507           LastAlu.back() = nullptr;
00508           continue;
00509         }
00510 
00511         MachineBasicBlock::iterator MI = I;
00512         if (MI->getOpcode() != AMDGPU::ENDIF)
00513           LastAlu.back() = nullptr;
00514         if (MI->getOpcode() == AMDGPU::CF_ALU)
00515           LastAlu.back() = MI;
00516         I++;
00517         bool RequiresWorkAround =
00518             CFStack.requiresWorkAroundForInst(MI->getOpcode());
00519         switch (MI->getOpcode()) {
00520         case AMDGPU::CF_ALU_PUSH_BEFORE:
00521           if (RequiresWorkAround) {
00522             DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
00523             BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
00524                 .addImm(CfCount + 1)
00525                 .addImm(1);
00526             MI->setDesc(TII->get(AMDGPU::CF_ALU));
00527             CfCount++;
00528             CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
00529           } else
00530             CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
00531 
00532         case AMDGPU::CF_ALU:
00533           I = MI;
00534           AluClauses.push_back(MakeALUClause(MBB, I));
00535           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
00536           CfCount++;
00537           break;
00538         case AMDGPU::WHILELOOP: {
00539           CFStack.pushLoop();
00540           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00541               getHWInstrDesc(CF_WHILE_LOOP))
00542               .addImm(1);
00543           std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
00544               std::set<MachineInstr *>());
00545           Pair.second.insert(MIb);
00546           LoopStack.push_back(Pair);
00547           MI->eraseFromParent();
00548           CfCount++;
00549           break;
00550         }
00551         case AMDGPU::ENDLOOP: {
00552           CFStack.popLoop();
00553           std::pair<unsigned, std::set<MachineInstr *> > Pair =
00554               LoopStack.back();
00555           LoopStack.pop_back();
00556           CounterPropagateAddr(Pair.second, CfCount);
00557           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
00558               .addImm(Pair.first + 1);
00559           MI->eraseFromParent();
00560           CfCount++;
00561           break;
00562         }
00563         case AMDGPU::IF_PREDICATE_SET: {
00564           LastAlu.push_back(nullptr);
00565           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00566               getHWInstrDesc(CF_JUMP))
00567               .addImm(0)
00568               .addImm(0);
00569           IfThenElseStack.push_back(MIb);
00570           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
00571           MI->eraseFromParent();
00572           CfCount++;
00573           break;
00574         }
00575         case AMDGPU::ELSE: {
00576           MachineInstr * JumpInst = IfThenElseStack.back();
00577           IfThenElseStack.pop_back();
00578           CounterPropagateAddr(JumpInst, CfCount);
00579           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00580               getHWInstrDesc(CF_ELSE))
00581               .addImm(0)
00582               .addImm(0);
00583           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
00584           IfThenElseStack.push_back(MIb);
00585           MI->eraseFromParent();
00586           CfCount++;
00587           break;
00588         }
00589         case AMDGPU::ENDIF: {
00590           CFStack.popBranch();
00591           if (LastAlu.back()) {
00592             ToPopAfter.push_back(LastAlu.back());
00593           } else {
00594             MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00595                 getHWInstrDesc(CF_POP))
00596                 .addImm(CfCount + 1)
00597                 .addImm(1);
00598             (void)MIb;
00599             DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
00600             CfCount++;
00601           }
00602           
00603           MachineInstr *IfOrElseInst = IfThenElseStack.back();
00604           IfThenElseStack.pop_back();
00605           CounterPropagateAddr(IfOrElseInst, CfCount);
00606           IfOrElseInst->getOperand(1).setImm(1);
00607           LastAlu.pop_back();
00608           MI->eraseFromParent();
00609           break;
00610         }
00611         case AMDGPU::BREAK: {
00612           CfCount ++;
00613           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00614               getHWInstrDesc(CF_LOOP_BREAK))
00615               .addImm(0);
00616           LoopStack.back().second.insert(MIb);
00617           MI->eraseFromParent();
00618           break;
00619         }
00620         case AMDGPU::CONTINUE: {
00621           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
00622               getHWInstrDesc(CF_LOOP_CONTINUE))
00623               .addImm(0);
00624           LoopStack.back().second.insert(MIb);
00625           MI->eraseFromParent();
00626           CfCount++;
00627           break;
00628         }
00629         case AMDGPU::RETURN: {
00630           BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
00631           CfCount++;
00632           MI->eraseFromParent();
00633           if (CfCount % 2) {
00634             BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
00635             CfCount++;
00636           }
00637           for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
00638             EmitFetchClause(I, FetchClauses[i], CfCount);
00639           for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
00640             EmitALUClause(I, AluClauses[i], CfCount);
00641         }
00642         default:
00643           if (TII->isExport(MI->getOpcode())) {
00644             DEBUG(dbgs() << CfCount << ":"; MI->dump(););
00645             CfCount++;
00646           }
00647           break;
00648         }
00649       }
00650       for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
00651         MachineInstr *Alu = ToPopAfter[i];
00652         BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
00653             TII->get(AMDGPU::CF_ALU_POP_AFTER))
00654             .addImm(Alu->getOperand(0).getImm())
00655             .addImm(Alu->getOperand(1).getImm())
00656             .addImm(Alu->getOperand(2).getImm())
00657             .addImm(Alu->getOperand(3).getImm())
00658             .addImm(Alu->getOperand(4).getImm())
00659             .addImm(Alu->getOperand(5).getImm())
00660             .addImm(Alu->getOperand(6).getImm())
00661             .addImm(Alu->getOperand(7).getImm())
00662             .addImm(Alu->getOperand(8).getImm());
00663         Alu->eraseFromParent();
00664       }
00665       MFI->StackSize = CFStack.MaxStackSize;
00666     }
00667 
00668     return false;
00669   }
00670 
00671   const char *getPassName() const override {
00672     return "R600 Control Flow Finalizer Pass";
00673   }
00674 };
00675 
00676 char R600ControlFlowFinalizer::ID = 0;
00677 
00678 } // end anonymous namespace
00679 
00680 
/// Factory entry point used by the AMDGPU target to add this pass to the
/// codegen pipeline.
llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}