//===-- SILowerControlFlow.cpp - Use predicates for control flow ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the THEN block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
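// Illustrative sketch (not part of the pass): the exec-mask bookkeeping in
// the example above can be modeled with ordinary 64-bit integer arithmetic.
// The variable names here are hypothetical and exist only for exposition.
//
//   uint64_t Exec = ~0ull;                 // all 64 lanes active
//   uint64_t Vcc  = 0x00000000ffffffffull; // lanes 0-31 took the condition
//
//   // S_AND_SAVEEXEC_B64 %SGPR0, %VCC
//   uint64_t Sgpr0 = Exec;                 // save the old mask...
//   Exec &= Vcc;                           // ...and run only the THEN lanes
//
//   // S_XOR_B64 %SGPR0, %SGPR0, %EXEC
//   Sgpr0 ^= Exec;                         // lanes that still need the ELSE
//
//   // SI_END_CF: S_OR_B64 %EXEC, %EXEC, %SGPR0 then re-enables whatever
//   // was masked off for the structured control flow region.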
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void InitM0ForLDS(MachineBasicBlock::iterator MI);
  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

// Estimate whether a branch from \p From to \p To would skip enough
// instructions to be worthwhile; short runs are cheaper to execute
// predicated-off than to branch over.
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}

void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To)
    .addReg(AMDGPU::EXEC);
}
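// Illustrative sketch (not part of the pass): the trade-off SkipThreshold
// models in shouldSkip()/Skip() above. A predicated-off instruction still
// consumes an issue slot, so once a region is long enough the conditional
// branch wins. The numbers below are made up for exposition, not measured.
//
//   unsigned BranchCost    = 12; // roughly what SkipThreshold stands in for
//   unsigned SkippedInstrs = 20; // instructions the whole wave would idle
//                                // through with EXEC == 0
//   bool EmitSkip = SkippedInstrs >= BranchCost; // -> emit S_CBRANCH_EXECZ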
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3)
    .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  // Save the current exec mask and restrict it to the lanes where the
  // condition holds.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
          .addReg(Vcc);

  // Reg now holds the lanes that were live but failed the condition,
  // i.e. the mask for the matching SI_ELSE.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
          .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  // Accumulate the currently active lanes into the break mask.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Vcc)
          .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
          .addReg(Saved)
          .addReg(Src);

  MI.eraseFromParent();
}
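// Illustrative sketch (not part of the pass): how the break mask built by
// Break() interacts with the SI_LOOP lowering below. Hypothetical 4-lane
// masks, for exposition only.
//
//   // Lanes 0 and 1 reach SI_BREAK (EXEC was already narrowed to them by
//   // the enclosing SI_IF): S_OR_B64 %Dst, %EXEC, %Src accumulates them.
//   uint64_t Exec      = 0x3; // lanes inside the 'break' branch
//   uint64_t BreakMask = 0x0;
//   BreakMask = Exec | BreakMask;   // BreakMask == 0x3
//
//   // At the loop back-edge, SI_LOOP removes the finished lanes:
//   //   S_ANDN2_B64 %EXEC, %EXEC, %BreakMask
//   uint64_t FullExec = 0xF;        // all lanes of this iteration
//   FullExec &= ~BreakMask;         // 0xC: two lanes keep looping
//   bool LoopAgain = FullExec != 0; // S_CBRANCH_EXECNZ takes the back-edge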
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  // Remove the lanes that have finished the loop from the exec mask.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Src);

  // Branch back to the loop header while any lane is still active.
  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1))
    .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
          .addReg(AMDGPU::EXEC)
          .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  // A branch to the immediately following block is redundant and can simply
  // be erased.
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative.
  if ((Op.isImm() || Op.isFPImm())) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.isImm() ? (Op.getImm() & 0x80000000) :
        Op.getFPImm()->isNegative()) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
              .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
            .addImm(0)
            .addOperand(Op);
  }

  MI.eraseFromParent();
}
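// Illustrative sketch (not part of the pass): the per-lane effect of the
// V_CMPX_LE_F32 emitted by Kill() above. Hypothetical values, for exposition
// only.
//
//   float LaneVal[4] = {1.0f, -2.0f, 0.0f, -0.5f};
//   uint64_t Exec = 0xF;
//   for (int L = 0; L < 4; ++L)
//     if (!(0.0f <= LaneVal[L]))  // V_CMPX also writes EXEC, killing...
//       Exec &= ~(1ull << L);     // ...every lane with a negative operand
//   // Exec == 0x5: lanes 1 and 3 are dead from here on.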
/// The m0 register stores the maximum allowable address for LDS reads and
/// writes. Its value must be at least the size in bytes of LDS allocated by
/// the shader. For simplicity, we set it to the maximum possible value.
void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
          AMDGPU::M0).addImm(0xffffffff);
}
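// Illustrative sketch (not part of the pass): the "waterfall" loop that
// LoadM0() below emits for a divergent index, written as scalar pseudo-code.
// FirstActiveLane() and LanesWhere() are hypothetical helpers, for exposition
// only.
//
//   uint64_t Save = Exec;                        // S_MOV_B64
//   do {
//     uint32_t Cur = Idx[FirstActiveLane(Exec)]; // V_READFIRSTLANE_B32
//     M0 = Cur;                                  // S_MOV_B32
//     uint64_t Eq = LanesWhere(Idx == Cur);      // V_CMP_EQ_U32 -> VCC
//     uint64_t Old = Exec;                       // S_AND_SAVEEXEC_B64:
//     Exec &= Eq;                                //   run the matching lanes
//     /* the MovRel executes here for the lanes left in Exec */
//     Exec ^= Old;                               // S_XOR_B64: drop done lanes
//   } while (Exec != 0);                         // S_CBRANCH_EXECNZ
//   Exec = Save;                                 // restore the full mask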
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    // Uniform index: a single scalar move into m0 suffices.
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(Idx);
    MBB.insert(I, MovRel);
  } else {
    // Divergent index: emit a waterfall loop that handles one index value
    // per iteration.
    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VReg_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
            .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
            .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
            .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
            .addReg(AMDGPU::M0)
            .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
            .addReg(AMDGPU::VCC);

    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
            .addImm(-7)
            .addReg(AMDGPU::EXEC);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
            .addReg(Save);

  }
  // FIXME: Are there any values other than the LDS address clamp that need to
  // be stored in the m0 register and may be live for more than a few
  // instructions? If so, we should save the m0 register at the beginning
  // of this function and restore it here.
  // FIXME: Add support for LDS direct loads.
  InitM0ForLDS(&MI);
  MI.eraseFromParent();
}

void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Vec;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
            .addReg(SubReg + Off)
            .addReg(AMDGPU::M0, RegState::Implicit)
            .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel);
}

void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Dst;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
            .addReg(SubReg + Off, RegState::Define)
            .addReg(Val)
            .addReg(AMDGPU::M0, RegState::Implicit)
            .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel);
}
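// Illustrative sketch (not part of the pass): V_MOVRELS/V_MOVRELD address a
// VGPR relative to m0, which is why IndirectSrc()/IndirectDst() above funnel
// through LoadM0(). Modeling the register file as an array (hypothetical
// names, for exposition only):
//
//   uint32_t Vgpr[256][64];        // [register][lane]
//   unsigned Base = SubRegPlusOff; // the .addReg(SubReg + Off) operand
//   // V_MOVRELS_B32 Dst, VBase: for each active lane L,
//   //   Vgpr[Dst][L] = Vgpr[Base + M0][L];
//   // V_MOVRELD_B32 VBase, Val: for each active lane L,
//   //   Vgpr[Base + M0][L] = Vgpr[Val][L];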
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedM0 = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isDS(MI.getOpcode())) {
        NeedM0 = true;
        NeedWQM = true;
      }

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode())) {
        NeedM0 = true;
        NeedFlat = true;
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:
          NeedWQM = true;
          break;
      }
    }
  }

  if (NeedM0) {
    MachineBasicBlock &MBB = MF.front();
    // Initialize M0 to a value that won't cause LDS access to be discarded
    // due to offset clamping
    InitM0ForLDS(MBB.getFirstNonPHI());
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting the stack size used by the scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should fit in 16 bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256 bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // The documentation says the size is the "per-thread scratch size in
    // bytes".
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}
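// Illustrative sketch (not part of the pass): the FLAT_SCR_LO computation in
// the flat-scratch prologue above, with made-up numbers. If the first
// indirectly addressed register index is 64, the register spill area starts
// 4 * 64 = 256 bytes into scratch, i.e. one 256-byte unit:
//
//   int IndirectBegin = 64;                         // hypothetical value
//   uint64_t StackOffset = 4 * IndirectBegin / 256; // == 1 unit of 256 bytes
//   // FLAT_SCR_LO <- StackOffset (1)
//   // FLAT_SCR_HI <- per-thread scratch size in bytes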