LLVM API Documentation
00001 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assebly printer --------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 /// \file 00011 /// 00012 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary 00013 /// code. When passed an MCAsmStreamer it prints assembly and when passed 00014 /// an MCObjectStreamer it outputs binary code. 00015 // 00016 //===----------------------------------------------------------------------===// 00017 // 00018 00019 #include "AMDGPUAsmPrinter.h" 00020 #include "AMDGPU.h" 00021 #include "AMDGPUSubtarget.h" 00022 #include "R600Defines.h" 00023 #include "R600MachineFunctionInfo.h" 00024 #include "R600RegisterInfo.h" 00025 #include "SIDefines.h" 00026 #include "SIMachineFunctionInfo.h" 00027 #include "SIRegisterInfo.h" 00028 #include "llvm/CodeGen/MachineFrameInfo.h" 00029 #include "llvm/MC/MCContext.h" 00030 #include "llvm/MC/MCSectionELF.h" 00031 #include "llvm/MC/MCStreamer.h" 00032 #include "llvm/Support/ELF.h" 00033 #include "llvm/Support/MathExtras.h" 00034 #include "llvm/Support/TargetRegistry.h" 00035 #include "llvm/Target/TargetLoweringObjectFile.h" 00036 00037 using namespace llvm; 00038 00039 // TODO: This should get the default rounding mode from the kernel. We just set 00040 // the default here, but this could change if the OpenCL rounding mode pragmas 00041 // are used. 00042 // 00043 // The denormal mode here should match what is reported by the OpenCL runtime 00044 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but 00045 // can also be override to flush with the -cl-denorms-are-zero compiler flag. 00046 // 00047 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double 00048 // precision, and leaves single precision to flush all and does not report 00049 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports 00050 // CL_FP_DENORM for both. 00051 // 00052 // FIXME: It seems some instructions do not support single precision denormals 00053 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32, 00054 // and sin_f32, cos_f32 on most parts). 00055 00056 // We want to use these instructions, and using fp32 denormals also causes 00057 // instructions to run at the double precision rate for the device so it's 00058 // probably best to just report no single precision denormals. 00059 static uint32_t getFPMode(const MachineFunction &F) { 00060 const AMDGPUSubtarget& ST = F.getTarget().getSubtarget<AMDGPUSubtarget>(); 00061 // TODO: Is there any real use for the flush in only / flush out only modes? 00062 00063 uint32_t FP32Denormals = 00064 ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 00065 00066 uint32_t FP64Denormals = 00067 ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 00068 00069 return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | 00070 FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | 00071 FP_DENORM_MODE_SP(FP32Denormals) | 00072 FP_DENORM_MODE_DP(FP64Denormals); 00073 } 00074 00075 static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, 00076 MCStreamer &Streamer) { 00077 return new AMDGPUAsmPrinter(tm, Streamer); 00078 } 00079 00080 extern "C" void LLVMInitializeR600AsmPrinter() { 00081 TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); 00082 } 00083 00084 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) 00085 : AsmPrinter(TM, Streamer) { 00086 DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode(); 00087 } 00088 00089 void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { 00090 00091 // This label is used to mark the end of the .text section. 00092 const TargetLoweringObjectFile &TLOF = getObjFileLowering(); 00093 OutStreamer.SwitchSection(TLOF.getTextSection()); 00094 MCSymbol *EndOfTextLabel = 00095 OutContext.GetOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); 00096 OutStreamer.EmitLabel(EndOfTextLabel); 00097 } 00098 00099 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { 00100 SetupMachineFunction(MF); 00101 00102 OutStreamer.emitRawComment(Twine('@') + MF.getName() + Twine(':')); 00103 00104 MCContext &Context = getObjFileLowering().getContext(); 00105 const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config", 00106 ELF::SHT_PROGBITS, 0, 00107 SectionKind::getReadOnly()); 00108 OutStreamer.SwitchSection(ConfigSection); 00109 00110 const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); 00111 SIProgramInfo KernelInfo; 00112 if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { 00113 getSIProgramInfo(KernelInfo, MF); 00114 EmitProgramInfoSI(MF, KernelInfo); 00115 } else { 00116 EmitProgramInfoR600(MF); 00117 } 00118 00119 DisasmLines.clear(); 00120 HexLines.clear(); 00121 DisasmLineMaxLen = 0; 00122 00123 OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); 00124 EmitFunctionBody(); 00125 00126 if (isVerbose()) { 00127 const MCSectionELF *CommentSection 00128 = Context.getELFSection(".AMDGPU.csdata", 00129 ELF::SHT_PROGBITS, 0, 00130 SectionKind::getReadOnly()); 00131 OutStreamer.SwitchSection(CommentSection); 00132 00133 if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { 00134 OutStreamer.emitRawComment(" Kernel info:", false); 00135 OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen), 00136 false); 00137 OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), 00138 false); 00139 OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), 00140 false); 00141 OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), 00142 false); 00143 OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), 00144 false); 00145 OutStreamer.emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), 00146 false); 00147 } else { 00148 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 00149 OutStreamer.emitRawComment( 00150 Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); 00151 } 00152 } 00153 00154 if (STM.dumpCode()) { 00155 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 00156 MF.dump(); 00157 #endif 00158 00159 if (DisasmEnabled) { 00160 OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm", 00161 ELF::SHT_NOTE, 0, 00162 SectionKind::getReadOnly())); 00163 00164 for (size_t i = 0; i < DisasmLines.size(); ++i) { 00165 std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' '); 00166 Comment += " ; " + HexLines[i] + "\n"; 00167 00168 OutStreamer.EmitBytes(StringRef(DisasmLines[i])); 00169 OutStreamer.EmitBytes(StringRef(Comment)); 00170 } 00171 } 00172 } 00173 00174 return false; 00175 } 00176 00177 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { 00178 unsigned MaxGPR = 0; 00179 bool killPixel = false; 00180 const R600RegisterInfo *RI = static_cast<const R600RegisterInfo *>( 00181 TM.getSubtargetImpl()->getRegisterInfo()); 00182 const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 00183 const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); 00184 00185 for (const MachineBasicBlock &MBB : MF) { 00186 for (const MachineInstr &MI : MBB) { 00187 if (MI.getOpcode() == AMDGPU::KILLGT) 00188 killPixel = true; 00189 unsigned numOperands = MI.getNumOperands(); 00190 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { 00191 const MachineOperand &MO = MI.getOperand(op_idx); 00192 if (!MO.isReg()) 00193 continue; 00194 unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff; 00195 00196 // Register with value > 127 aren't GPR 00197 if (HWReg > 127) 00198 continue; 00199 MaxGPR = std::max(MaxGPR, HWReg); 00200 } 00201 } 00202 } 00203 00204 unsigned RsrcReg; 00205 if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { 00206 // Evergreen / Northern Islands 00207 switch (MFI->getShaderType()) { 00208 default: // Fall through 00209 case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; 00210 case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; 00211 case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; 00212 case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; 00213 } 00214 } else { 00215 // R600 / R700 00216 switch (MFI->getShaderType()) { 00217 default: // Fall through 00218 case ShaderType::GEOMETRY: // Fall through 00219 case ShaderType::COMPUTE: // Fall through 00220 case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; 00221 case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; 00222 } 00223 } 00224 00225 OutStreamer.EmitIntValue(RsrcReg, 4); 00226 OutStreamer.EmitIntValue(S_NUM_GPRS(MaxGPR + 1) | 00227 S_STACK_SIZE(MFI->StackSize), 4); 00228 OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); 00229 OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); 00230 00231 if (MFI->getShaderType() == ShaderType::COMPUTE) { 00232 OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); 00233 OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); 00234 } 00235 } 00236 00237 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, 00238 const MachineFunction &MF) const { 00239 uint64_t CodeSize = 0; 00240 unsigned MaxSGPR = 0; 00241 unsigned MaxVGPR = 0; 00242 bool VCCUsed = false; 00243 bool FlatUsed = false; 00244 const SIRegisterInfo *RI = static_cast<const SIRegisterInfo *>( 00245 TM.getSubtargetImpl()->getRegisterInfo()); 00246 00247 for (const MachineBasicBlock &MBB : MF) { 00248 for (const MachineInstr &MI : MBB) { 00249 // TODO: CodeSize should account for multiple functions. 00250 CodeSize += MI.getDesc().Size; 00251 00252 unsigned numOperands = MI.getNumOperands(); 00253 for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { 00254 const MachineOperand &MO = MI.getOperand(op_idx); 00255 unsigned width = 0; 00256 bool isSGPR = false; 00257 00258 if (!MO.isReg()) { 00259 continue; 00260 } 00261 unsigned reg = MO.getReg(); 00262 if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || 00263 reg == AMDGPU::VCC_HI) { 00264 VCCUsed = true; 00265 continue; 00266 } else if (reg == AMDGPU::FLAT_SCR || 00267 reg == AMDGPU::FLAT_SCR_LO || 00268 reg == AMDGPU::FLAT_SCR_HI) { 00269 FlatUsed = true; 00270 continue; 00271 } 00272 00273 switch (reg) { 00274 default: break; 00275 case AMDGPU::SCC: 00276 case AMDGPU::EXEC: 00277 case AMDGPU::M0: 00278 continue; 00279 } 00280 00281 if (AMDGPU::SReg_32RegClass.contains(reg)) { 00282 isSGPR = true; 00283 width = 1; 00284 } else if (AMDGPU::VReg_32RegClass.contains(reg)) { 00285 isSGPR = false; 00286 width = 1; 00287 } else if (AMDGPU::SReg_64RegClass.contains(reg)) { 00288 isSGPR = true; 00289 width = 2; 00290 } else if (AMDGPU::VReg_64RegClass.contains(reg)) { 00291 isSGPR = false; 00292 width = 2; 00293 } else if (AMDGPU::VReg_96RegClass.contains(reg)) { 00294 isSGPR = false; 00295 width = 3; 00296 } else if (AMDGPU::SReg_128RegClass.contains(reg)) { 00297 isSGPR = true; 00298 width = 4; 00299 } else if (AMDGPU::VReg_128RegClass.contains(reg)) { 00300 isSGPR = false; 00301 width = 4; 00302 } else if (AMDGPU::SReg_256RegClass.contains(reg)) { 00303 isSGPR = true; 00304 width = 8; 00305 } else if (AMDGPU::VReg_256RegClass.contains(reg)) { 00306 isSGPR = false; 00307 width = 8; 00308 } else if (AMDGPU::SReg_512RegClass.contains(reg)) { 00309 isSGPR = true; 00310 width = 16; 00311 } else if (AMDGPU::VReg_512RegClass.contains(reg)) { 00312 isSGPR = false; 00313 width = 16; 00314 } else { 00315 llvm_unreachable("Unknown register class"); 00316 } 00317 unsigned hwReg = RI->getEncodingValue(reg) & 0xff; 00318 unsigned maxUsed = hwReg + width - 1; 00319 if (isSGPR) { 00320 MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; 00321 } else { 00322 MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR; 00323 } 00324 } 00325 } 00326 } 00327 00328 if (VCCUsed) 00329 MaxSGPR += 2; 00330 00331 if (FlatUsed) 00332 MaxSGPR += 2; 00333 00334 // We found the maximum register index. They start at 0, so add one to get the 00335 // number of registers. 00336 ProgInfo.NumVGPR = MaxVGPR + 1; 00337 ProgInfo.NumSGPR = MaxSGPR + 1; 00338 00339 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode 00340 // register. 00341 ProgInfo.FloatMode = getFPMode(MF); 00342 00343 // XXX: Not quite sure what this does, but sc seems to unset this. 00344 ProgInfo.IEEEMode = 0; 00345 00346 // Do not clamp NAN to 0. 00347 ProgInfo.DX10Clamp = 0; 00348 00349 const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); 00350 ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); 00351 00352 ProgInfo.FlatUsed = FlatUsed; 00353 ProgInfo.VCCUsed = VCCUsed; 00354 ProgInfo.CodeLen = CodeSize; 00355 } 00356 00357 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, 00358 const SIProgramInfo &KernelInfo) { 00359 const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); 00360 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 00361 00362 unsigned RsrcReg; 00363 switch (MFI->getShaderType()) { 00364 default: // Fall through 00365 case ShaderType::COMPUTE: RsrcReg = R_00B848_COMPUTE_PGM_RSRC1; break; 00366 case ShaderType::GEOMETRY: RsrcReg = R_00B228_SPI_SHADER_PGM_RSRC1_GS; break; 00367 case ShaderType::PIXEL: RsrcReg = R_00B028_SPI_SHADER_PGM_RSRC1_PS; break; 00368 case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; 00369 } 00370 00371 unsigned LDSAlignShift; 00372 if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { 00373 // LDS is allocated in 64 dword blocks. 00374 LDSAlignShift = 8; 00375 } else { 00376 // LDS is allocated in 128 dword blocks. 00377 LDSAlignShift = 9; 00378 } 00379 00380 unsigned LDSBlocks = 00381 RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; 00382 00383 // Scratch is allocated in 256 dword blocks. 00384 unsigned ScratchAlignShift = 10; 00385 // We need to program the hardware with the amount of scratch memory that 00386 // is used by the entire wave. KernelInfo.ScratchSize is the amount of 00387 // scratch memory used per thread. 00388 unsigned ScratchBlocks = 00389 RoundUpToAlignment(KernelInfo.ScratchSize * STM.getWavefrontSize(), 00390 1 << ScratchAlignShift) >> ScratchAlignShift; 00391 00392 if (MFI->getShaderType() == ShaderType::COMPUTE) { 00393 OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); 00394 00395 const uint32_t ComputePGMRSrc1 = 00396 S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | 00397 S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | 00398 S_00B848_PRIORITY(KernelInfo.Priority) | 00399 S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | 00400 S_00B848_PRIV(KernelInfo.Priv) | 00401 S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | 00402 S_00B848_IEEE_MODE(KernelInfo.DebugMode) | 00403 S_00B848_IEEE_MODE(KernelInfo.IEEEMode); 00404 00405 OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); 00406 00407 OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); 00408 const uint32_t ComputePGMRSrc2 = 00409 S_00B84C_LDS_SIZE(LDSBlocks) | 00410 S_00B02C_SCRATCH_EN(ScratchBlocks > 0); 00411 00412 OutStreamer.EmitIntValue(ComputePGMRSrc2, 4); 00413 00414 OutStreamer.EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); 00415 OutStreamer.EmitIntValue(S_00B860_WAVESIZE(ScratchBlocks), 4); 00416 00417 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = 00418 // 0" comment but I don't see a corresponding field in the register spec. 00419 } else { 00420 OutStreamer.EmitIntValue(RsrcReg, 4); 00421 OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | 00422 S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); 00423 } 00424 00425 if (MFI->getShaderType() == ShaderType::PIXEL) { 00426 OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); 00427 OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); 00428 OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); 00429 OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); 00430 } 00431 }