//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;

  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}
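// Editor's note (illustrative, not part of the original source): the 248
// constant above presumably approximates the number of GPRs a SIMD can
// divide among its in-flight wavefronts. A worked example of the
// clause-switch heuristic in pickNode() below, with made-up counts:
//
//   10 ALU-side instructions (emitted + available + pending) and 10
//   available fetches:
//     ALUFetchRatioEstimate   = 10 / 10.0f   = 1.0
//     NeededWF                = 62.5f / 1.0  = 62
//     NearRegisterRequirement = 2 * 10       = 20 GPRs
//     getWFCountLimitedByGPR(20) = 248 / 20  = 12 wavefronts
//   62 > 12, so the scheduler allows switching away from the ALU clause to
//   flush fetches and relieve 128-bit register pressure.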
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel
    // Processing OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    // Note the float cast: integer division would truncate the estimate.
    float ALUFetchRatioEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (float)(FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRatioEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
      DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2: GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (need one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush fetch instructions to
      // lower the register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick an ALU.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick a FETCH.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick other.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE\n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
           E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        // Each inline literal operand consumes an extra slot in the clause.
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause; we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}
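// Editor's note (illustrative, not part of the original source):
// regBelongsToClass() below must distinguish physical from virtual
// registers because the two answer the question differently: a physical
// register is tested for membership in the class itself
// (RC->contains(Reg)), while a virtual register has exactly one class
// recorded in MachineRegisterInfo, so the code compares against that
// recorded class directly.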
bool
R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                     const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case AMDGPU::PRED_X:
    return AluPredX;
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(MI))
    return AluT_XYZW;

  return AluAny;
}
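// Editor's note (illustrative, not part of the original source): the AluKind
// values mirror the R600-family VLIW issue model: every ALU instruction
// group has four vector slots (X, Y, Z, W) plus, on VLIW5 variants, a fifth
// "trans" slot; Cayman is VLIW4 and drops the trans slot, which is why
// initialize() sets VLIW5 = !hasCaymanISA(). getAluKind() therefore
// classifies each SUnit by the slot(s) it can legally occupy, and pickAlu()
// uses that classification to pack instruction groups.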
int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::PRED_X:
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def'd and used in the same
  // instruction and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
       E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg so that it is assigned to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlottedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlottedSU)
    return SlottedSU;
  SUnit *UnslottedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslottedSU)
    AssignSlot(UnslottedSU->getInstr(), Slot);
  return UnslottedSU;
}
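// Editor's note (illustrative, not part of the original source): in
// pickAlu() below, OccupedSlotsMask is a 5-bit occupancy mask for the
// current instruction group: bits 0-3 track the X/Y/Z/W vector slots
// (1 << Chan) and bit 4 (value 16) tracks the trans slot. Writing 15 marks
// the four vector slots occupied; 31 marks all five, so the next iteration
// finds no free slot and falls through to PrepareNextSlot(), which opens a
// fresh group.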
unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
         AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
         AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
         AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
         AvailableAlus[AluPredX].size();
}

SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW ALU available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccupied = OccupedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}
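// Editor's note (sketch, not part of the original file): a strategy like
// this is plugged into the generic machine scheduler via a factory that
// wraps it in a ScheduleDAGMILive. Roughly, based on the AMDGPU target code
// of the same LLVM era (exact names and signatures may differ; newer trees
// pass the strategy as a std::unique_ptr):
//
//   static ScheduleDAGInstrs *
//   createR600MachineScheduler(MachineSchedContext *C) {
//     return new ScheduleDAGMILive(C, new R600SchedStrategy());
//   }
//
//   static MachineSchedRegistry
//   SchedCustomRegistry("r600", "Run R600's custom scheduler",
//                       createR600MachineScheduler);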