LLVM API Documentation

R600EmitClauseMarkers.cpp
Go to the documentation of this file.
00001 //===-- R600EmitClauseMarkers.cpp - Emit CF_ALU ---------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 /// \file
00011 /// Add CF_ALU. R600 ALU instructions are grouped in clauses which can hold
00012 /// 128 ALU instructions; these instructions can access up to 4 prefetched
00013 /// lines of 16 registers from constant buffers. Such ALU clauses are
00014 /// initiated by CF_ALU instructions.
00015 //===----------------------------------------------------------------------===//
00016 
00017 #include "AMDGPU.h"
00018 #include "R600Defines.h"
00019 #include "R600InstrInfo.h"
00020 #include "R600MachineFunctionInfo.h"
00021 #include "R600RegisterInfo.h"
00022 #include "AMDGPUSubtarget.h"
00023 #include "llvm/CodeGen/MachineFunctionPass.h"
00024 #include "llvm/CodeGen/MachineInstrBuilder.h"
00025 #include "llvm/CodeGen/MachineRegisterInfo.h"
00026 
00027 using namespace llvm;
00028 
00029 namespace llvm {
00030   void initializeR600EmitClauseMarkersPass(PassRegistry&);
00031 }
00032 
00033 namespace {
00034 
00035 class R600EmitClauseMarkers : public MachineFunctionPass {
00036 
00037 private:
00038   const R600InstrInfo *TII;
00039   int Address;
00040 
00041   unsigned OccupiedDwords(MachineInstr *MI) const {
00042     switch (MI->getOpcode()) {
00043     case AMDGPU::INTERP_PAIR_XY:
00044     case AMDGPU::INTERP_PAIR_ZW:
00045     case AMDGPU::INTERP_VEC_LOAD:
00046     case AMDGPU::DOT_4:
00047       return 4;
00048     case AMDGPU::KILL:
00049       return 0;
00050     default:
00051       break;
00052     }
00053 
00054     // These will be expanded to two ALU instructions in the
00055     // ExpandSpecialInstructions pass.
00056     if (TII->isLDSRetInstr(MI->getOpcode()))
00057       return 2;
00058 
00059     if(TII->isVector(*MI) ||
00060         TII->isCubeOp(MI->getOpcode()) ||
00061         TII->isReductionOp(MI->getOpcode()))
00062       return 4;
00063 
00064     unsigned NumLiteral = 0;
00065     for (MachineInstr::mop_iterator It = MI->operands_begin(),
00066         E = MI->operands_end(); It != E; ++It) {
00067       MachineOperand &MO = *It;
00068       if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
00069         ++NumLiteral;
00070     }
00071     return 1 + NumLiteral;
00072   }
00073 
00074   bool isALU(const MachineInstr *MI) const {
00075     if (TII->isALUInstr(MI->getOpcode()))
00076       return true;
00077     if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()))
00078       return true;
00079     switch (MI->getOpcode()) {
00080     case AMDGPU::PRED_X:
00081     case AMDGPU::INTERP_PAIR_XY:
00082     case AMDGPU::INTERP_PAIR_ZW:
00083     case AMDGPU::INTERP_VEC_LOAD:
00084     case AMDGPU::COPY:
00085     case AMDGPU::DOT_4:
00086       return true;
00087     default:
00088       return false;
00089     }
00090   }
00091 
00092   bool IsTrivialInst(MachineInstr *MI) const {
00093     switch (MI->getOpcode()) {
00094     case AMDGPU::KILL:
00095     case AMDGPU::RETURN:
00096     case AMDGPU::IMPLICIT_DEF:
00097       return true;
00098     default:
00099       return false;
00100     }
00101   }
00102 
00103   std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
00104     // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
00105     // (See also R600ISelLowering.cpp)
00106     // ConstIndex value is in [0, 4095];
00107     return std::pair<unsigned, unsigned>(
00108         ((Sel >> 2) - 512) >> 12, // KC_BANK
00109         // Line Number of ConstIndex
00110         // A line contains 16 constant registers however KCX bank can lock
00111         // two line at the same time ; thus we want to get an even line number.
00112         // Line number can be retrieved with (>>4), using (>>5) <<1 generates
00113         // an even number.
00114         ((((Sel >> 2) - 512) & 4095) >> 5) << 1);
00115   }
00116 
00117   bool SubstituteKCacheBank(MachineInstr *MI,
00118       std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
00119       bool UpdateInstr = true) const {
00120     std::vector<std::pair<unsigned, unsigned> > UsedKCache;
00121 
00122     if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
00123       return true;
00124 
00125     const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
00126         TII->getSrcs(MI);
00127     assert((TII->isALUInstr(MI->getOpcode()) ||
00128         MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
00129     for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
00130       if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
00131         continue;
00132       unsigned Sel = Consts[i].second;
00133       unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
00134       unsigned KCacheIndex = Index * 4 + Chan;
00135       const std::pair<unsigned, unsigned> &BankLine = getAccessedBankLine(Sel);
00136       if (CachedConsts.empty()) {
00137         CachedConsts.push_back(BankLine);
00138         UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
00139         continue;
00140       }
00141       if (CachedConsts[0] == BankLine) {
00142         UsedKCache.push_back(std::pair<unsigned, unsigned>(0, KCacheIndex));
00143         continue;
00144       }
00145       if (CachedConsts.size() == 1) {
00146         CachedConsts.push_back(BankLine);
00147         UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
00148         continue;
00149       }
00150       if (CachedConsts[1] == BankLine) {
00151         UsedKCache.push_back(std::pair<unsigned, unsigned>(1, KCacheIndex));
00152         continue;
00153       }
00154       return false;
00155     }
00156 
00157     if (!UpdateInstr)
00158       return true;
00159 
00160     for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
00161       if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
00162         continue;
00163       switch(UsedKCache[j].first) {
00164       case 0:
00165         Consts[i].first->setReg(
00166             AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
00167         break;
00168       case 1:
00169         Consts[i].first->setReg(
00170             AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
00171         break;
00172       default:
00173         llvm_unreachable("Wrong Cache Line");
00174       }
00175       j++;
00176     }
00177     return true;
00178   }
00179 
00180   bool canClauseLocalKillFitInClause(
00181                         unsigned AluInstCount,
00182                         std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
00183                         MachineBasicBlock::iterator Def,
00184                         MachineBasicBlock::iterator BBEnd) {
00185     const R600RegisterInfo &TRI = TII->getRegisterInfo();
00186     for (MachineInstr::const_mop_iterator
00187            MOI = Def->operands_begin(),
00188            MOE = Def->operands_end(); MOI != MOE; ++MOI) {
00189       if (!MOI->isReg() || !MOI->isDef() ||
00190           TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
00191         continue;
00192 
00193       // Def defines a clause local register, so check that its use will fit
00194       // in the clause.
00195       unsigned LastUseCount = 0;
00196       for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
00197         AluInstCount += OccupiedDwords(UseI);
00198         // Make sure we won't need to end the clause due to KCache limitations.
00199         if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
00200           return false;
00201 
00202         // We have reached the maximum instruction limit before finding the
00203         // use that kills this register, so we cannot use this def in the
00204         // current clause.
00205         if (AluInstCount >= TII->getMaxAlusPerClause())
00206           return false;
00207 
00208         // Register kill flags have been cleared by the time we get to this
00209         // pass, but it is safe to assume that all uses of this register
00210         // occur in the same basic block as its definition, because
00211         // it is illegal for the scheduler to schedule them in
00212         // different blocks.
00213         if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
00214           LastUseCount = AluInstCount;
00215 
00216         if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
00217           break;
00218       }
00219       if (LastUseCount)
00220         return LastUseCount <= TII->getMaxAlusPerClause();
00221       llvm_unreachable("Clause local register live at end of clause.");
00222     }
00223     return true;
00224   }
00225 
00226   MachineBasicBlock::iterator
00227   MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
00228     MachineBasicBlock::iterator ClauseHead = I;
00229     std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
00230     bool PushBeforeModifier = false;
00231     unsigned AluInstCount = 0;
00232     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
00233       if (IsTrivialInst(I))
00234         continue;
00235       if (!isALU(I))
00236         break;
00237       if (AluInstCount > TII->getMaxAlusPerClause())
00238         break;
00239       if (I->getOpcode() == AMDGPU::PRED_X) {
00240         // We put PRED_X in its own clause to ensure that ifcvt won't create
00241         // clauses with more than 128 insts.
00242         // IfCvt is indeed checking that "then" and "else" branches of an if
00243         // statement have less than ~60 insts thus converted clauses can't be
00244         // bigger than ~121 insts (predicate setter needs to be in the same
00245         // clause as predicated alus).
00246         if (AluInstCount > 0)
00247           break;
00248         if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
00249           PushBeforeModifier = true;
00250         AluInstCount ++;
00251         continue;
00252       }
00253       // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
00254       //
00255       // * KILL or INTERP instructions
00256       // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
00257       // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
00258       //
00259       // XXX: These checks have not been implemented yet.
00260       if (TII->mustBeLastInClause(I->getOpcode())) {
00261         I++;
00262         break;
00263       }
00264 
00265       // If this instruction defines a clause local register, make sure
00266       // its use can fit in this clause.
00267       if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
00268         break;
00269 
00270       if (!SubstituteKCacheBank(I, KCacheBanks))
00271         break;
00272       AluInstCount += OccupiedDwords(I);
00273     }
00274     unsigned Opcode = PushBeforeModifier ?
00275         AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
00276     BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
00277     // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
00278     // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
00279     // pass may assume that identical ALU clause starter at the beginning of a 
00280     // true and false branch can be factorized which is not the case.
00281         .addImm(Address++) // ADDR
00282         .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
00283         .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
00284         .addImm(KCacheBanks.empty()?0:2) // KM0
00285         .addImm((KCacheBanks.size() < 2)?0:2) // KM1
00286         .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
00287         .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
00288         .addImm(AluInstCount) // COUNT
00289         .addImm(1); // Enabled
00290     return I;
00291   }
00292 
00293 public:
00294   static char ID;
00295   R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
00296 
00297     initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
00298   }
00299 
00300   bool runOnMachineFunction(MachineFunction &MF) override {
00301     TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
00302 
00303     for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
00304                                                     BB != BB_E; ++BB) {
00305       MachineBasicBlock &MBB = *BB;
00306       MachineBasicBlock::iterator I = MBB.begin();
00307       if (I->getOpcode() == AMDGPU::CF_ALU)
00308         continue; // BB was already parsed
00309       for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
00310         if (isALU(I))
00311           I = MakeALUClause(MBB, I);
00312         else
00313           ++I;
00314       }
00315     }
00316     return false;
00317   }
00318 
00319   const char *getPassName() const override {
00320     return "R600 Emit Clause Markers Pass";
00321   }
00322 };
00323 
00324 char R600EmitClauseMarkers::ID = 0;
00325 
00326 } // end anonymous namespace
00327 
00328 INITIALIZE_PASS_BEGIN(R600EmitClauseMarkers, "emitclausemarkers",
00329                       "R600 Emit Clause Markters", false, false)
00330 INITIALIZE_PASS_END(R600EmitClauseMarkers, "emitclausemarkers",
00331                       "R600 Emit Clause Markters", false, false)
00332 
00333 llvm::FunctionPass *llvm::createR600EmitClauseMarkers() {
00334   return new R600EmitClauseMarkers();
00335 }
00336