LLVM API Documentation
00001 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 /// \file 00011 /// \brief Custom DAG lowering for SI 00012 // 00013 //===----------------------------------------------------------------------===// 00014 00015 #ifdef _MSC_VER 00016 // Provide M_PI. 00017 #define _USE_MATH_DEFINES 00018 #include <cmath> 00019 #endif 00020 00021 #include "SIISelLowering.h" 00022 #include "AMDGPU.h" 00023 #include "AMDGPUIntrinsicInfo.h" 00024 #include "AMDGPUSubtarget.h" 00025 #include "SIInstrInfo.h" 00026 #include "SIMachineFunctionInfo.h" 00027 #include "SIRegisterInfo.h" 00028 #include "llvm/ADT/BitVector.h" 00029 #include "llvm/CodeGen/CallingConvLower.h" 00030 #include "llvm/CodeGen/MachineInstrBuilder.h" 00031 #include "llvm/CodeGen/MachineRegisterInfo.h" 00032 #include "llvm/CodeGen/SelectionDAG.h" 00033 #include "llvm/IR/Function.h" 00034 #include "llvm/ADT/SmallString.h" 00035 00036 using namespace llvm; 00037 00038 SITargetLowering::SITargetLowering(TargetMachine &TM) : 00039 AMDGPUTargetLowering(TM) { 00040 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); 00041 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 00042 00043 addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 00044 addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 00045 00046 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); 00047 addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); 00048 00049 addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); 00050 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); 00051 addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 00052 00053 addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); 00054 addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 00055 00056 addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); 00057 addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 00058 00059 addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); 00060 addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 00061 00062 computeRegisterProperties(); 00063 00064 // Condition Codes 00065 setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 00066 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 00067 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 00068 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 00069 setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 00070 setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 00071 00072 setCondCodeAction(ISD::SETONE, MVT::f64, Expand); 00073 setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); 00074 setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); 00075 setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); 00076 setCondCodeAction(ISD::SETULE, MVT::f64, Expand); 00077 setCondCodeAction(ISD::SETULT, MVT::f64, Expand); 00078 00079 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 00080 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 00081 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 00082 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 00083 00084 setOperationAction(ISD::ADD, MVT::i32, Legal); 00085 setOperationAction(ISD::ADDC, MVT::i32, Legal); 00086 setOperationAction(ISD::ADDE, MVT::i32, Legal); 00087 setOperationAction(ISD::SUBC, MVT::i32, 
Legal); 00088 setOperationAction(ISD::SUBE, MVT::i32, Legal); 00089 00090 setOperationAction(ISD::FSIN, MVT::f32, Custom); 00091 setOperationAction(ISD::FCOS, MVT::f32, Custom); 00092 00093 // We need to custom lower vector stores from local memory 00094 setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 00095 setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 00096 setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 00097 00098 setOperationAction(ISD::STORE, MVT::v8i32, Custom); 00099 setOperationAction(ISD::STORE, MVT::v16i32, Custom); 00100 00101 setOperationAction(ISD::STORE, MVT::i1, Custom); 00102 setOperationAction(ISD::STORE, MVT::i32, Custom); 00103 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 00104 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 00105 00106 setOperationAction(ISD::SELECT, MVT::f32, Promote); 00107 AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32); 00108 setOperationAction(ISD::SELECT, MVT::i64, Custom); 00109 setOperationAction(ISD::SELECT, MVT::f64, Promote); 00110 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); 00111 00112 setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); 00113 setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); 00114 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 00115 setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); 00116 00117 setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 00118 setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 00119 00120 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 00121 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); 00122 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); 00123 00124 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); 00125 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); 00126 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); 00127 00128 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); 00129 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); 00130 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); 00131 00132 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom); 00133 00134 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); 00135 00136 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 00137 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 00138 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 00139 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 00140 00141 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 00142 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 00143 00144 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 00145 setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom); 00146 setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom); 00147 setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); 00148 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); 00149 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); 00150 00151 setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote); 00152 setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); 00153 setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom); 00154 setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand); 00155 00156 setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote); 00157 setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); 00158 setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom); 00159 setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); 00160 setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); 00161 00162 setTruncStoreAction(MVT::i32, 
MVT::i8, Custom); 00163 setTruncStoreAction(MVT::i32, MVT::i16, Custom); 00164 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 00165 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 00166 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 00167 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 00168 00169 setOperationAction(ISD::LOAD, MVT::i1, Custom); 00170 00171 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); 00172 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); 00173 00174 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 00175 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); 00176 setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 00177 00178 // These should use UDIVREM, so set them to expand 00179 setOperationAction(ISD::UDIV, MVT::i64, Expand); 00180 setOperationAction(ISD::UREM, MVT::i64, Expand); 00181 00182 // We only support LOAD/STORE and vector manipulation ops for vectors 00183 // with > 4 elements. 00184 MVT VecTypes[] = { 00185 MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32 00186 }; 00187 00188 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); 00189 setOperationAction(ISD::SELECT, MVT::i1, Promote); 00190 00191 for (MVT VT : VecTypes) { 00192 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { 00193 switch(Op) { 00194 case ISD::LOAD: 00195 case ISD::STORE: 00196 case ISD::BUILD_VECTOR: 00197 case ISD::BITCAST: 00198 case ISD::EXTRACT_VECTOR_ELT: 00199 case ISD::INSERT_VECTOR_ELT: 00200 case ISD::INSERT_SUBVECTOR: 00201 case ISD::EXTRACT_SUBVECTOR: 00202 break; 00203 case ISD::CONCAT_VECTORS: 00204 setOperationAction(Op, VT, Custom); 00205 break; 00206 default: 00207 setOperationAction(Op, VT, Expand); 00208 break; 00209 } 00210 } 00211 } 00212 00213 for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) { 00214 MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I); 00215 setOperationAction(ISD::FTRUNC, VT, Expand); 00216 setOperationAction(ISD::FCEIL, VT, Expand); 00217 setOperationAction(ISD::FFLOOR, VT, Expand); 00218 } 00219 00220 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 00221 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 00222 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 00223 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 00224 setOperationAction(ISD::FRINT, MVT::f64, Legal); 00225 } 00226 00227 setOperationAction(ISD::FDIV, MVT::f32, Custom); 00228 00229 setTargetDAGCombine(ISD::FSUB); 00230 setTargetDAGCombine(ISD::SELECT_CC); 00231 setTargetDAGCombine(ISD::SETCC); 00232 00233 setTargetDAGCombine(ISD::UINT_TO_FP); 00234 00235 // All memory operations. Some folding on the pointer operand is done to help 00236 // matching the constant offsets in the addressing modes. 
00237 setTargetDAGCombine(ISD::LOAD); 00238 setTargetDAGCombine(ISD::STORE); 00239 setTargetDAGCombine(ISD::ATOMIC_LOAD); 00240 setTargetDAGCombine(ISD::ATOMIC_STORE); 00241 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP); 00242 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS); 00243 setTargetDAGCombine(ISD::ATOMIC_SWAP); 00244 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD); 00245 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB); 00246 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND); 00247 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR); 00248 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR); 00249 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND); 00250 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN); 00251 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX); 00252 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 00253 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); 00254 00255 setSchedulingPreference(Sched::RegPressure); 00256 } 00257 00258 //===----------------------------------------------------------------------===// 00259 // TargetLowering queries 00260 //===----------------------------------------------------------------------===// 00261 00262 // FIXME: This really needs an address space argument. The immediate offset 00263 // size is different for different sets of memory instruction sets. 00264 00265 // The single offset DS instructions have a 16-bit unsigned byte offset. 00266 // 00267 // MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r + 00268 // r + i with addr64. 32-bit has more addressing mode options. Depending on the 00269 // resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i). 00270 // 00271 // SMRD instructions have an 8-bit, dword offset. 00272 // 00273 bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM, 00274 Type *Ty) const { 00275 // No global is ever allowed as a base. 00276 if (AM.BaseGV) 00277 return false; 00278 00279 // Allow a 16-bit unsigned immediate field, since this is what DS instructions 00280 // use. 00281 if (!isUInt<16>(AM.BaseOffs)) 00282 return false; 00283 00284 // Only support r+r, 00285 switch (AM.Scale) { 00286 case 0: // "r+i" or just "i", depending on HasBaseReg. 00287 break; 00288 case 1: 00289 if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed. 00290 return false; 00291 // Otherwise we have r+r or r+i. 00292 break; 00293 case 2: 00294 if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed. 00295 return false; 00296 // Allow 2*r as r+r. 00297 break; 00298 default: // Don't allow n * r 00299 return false; 00300 } 00301 00302 return true; 00303 } 00304 00305 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, 00306 unsigned AddrSpace, 00307 unsigned Align, 00308 bool *IsFast) const { 00309 if (IsFast) 00310 *IsFast = false; 00311 00312 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, 00313 // which isn't a simple VT. 00314 if (!VT.isSimple() || VT == MVT::Other) 00315 return false; 00316 00317 // XXX - CI changes say "Support for unaligned memory accesses" but I don't 00318 // see what for specifically. The wording everywhere else seems to be the 00319 // same. 00320 00321 // XXX - The only mention I see of this in the ISA manual is for LDS direct 00322 // reads the "byte address and must be dword aligned". Is it also true for the 00323 // normal loads and stores? 00324 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { 00325 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte 00326 // aligned, 8 byte access in a single operation using ds_read2/write2_b32 00327 // with adjacent offsets. 
00328 return Align % 4 == 0; 00329 } 00330 00331 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the 00332 // byte-address are ignored, thus forcing Dword alignment. 00333 // This applies to private, global, and constant memory. 00334 if (IsFast) 00335 *IsFast = true; 00336 return VT.bitsGT(MVT::i32); 00337 } 00338 00339 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, 00340 unsigned SrcAlign, bool IsMemset, 00341 bool ZeroMemset, 00342 bool MemcpyStrSrc, 00343 MachineFunction &MF) const { 00344 // FIXME: Should account for address space here. 00345 00346 // The default fallback uses the private pointer size as a guess for a type to 00347 // use. Make sure we switch these to 64-bit accesses. 00348 00349 if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global 00350 return MVT::v4i32; 00351 00352 if (Size >= 8 && DstAlign >= 4) 00353 return MVT::v2i32; 00354 00355 // Use the default. 00356 return MVT::Other; 00357 } 00358 00359 TargetLoweringBase::LegalizeTypeAction 00360 SITargetLowering::getPreferredVectorAction(EVT VT) const { 00361 if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) 00362 return TypeSplitVector; 00363 00364 return TargetLoweringBase::getPreferredVectorAction(VT); 00365 } 00366 00367 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 00368 Type *Ty) const { 00369 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 00370 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 00371 return TII->isInlineConstant(Imm); 00372 } 00373 00374 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 00375 SDLoc SL, SDValue Chain, 00376 unsigned Offset, bool Signed) const { 00377 const DataLayout *DL = getDataLayout(); 00378 00379 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 00380 00381 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 00382 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); 00383 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, 00384 MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); 00385 SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr, 00386 DAG.getConstant(Offset, MVT::i64)); 00387 SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS)); 00388 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); 00389 00390 return DAG.getLoad(ISD::UNINDEXED, Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD, 00391 VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, 00392 false, // isVolatile 00393 true, // isNonTemporal 00394 true, // isInvariant 00395 DL->getABITypeAlignment(Ty)); // Alignment 00396 } 00397 00398 SDValue SITargetLowering::LowerFormalArguments( 00399 SDValue Chain, 00400 CallingConv::ID CallConv, 00401 bool isVarArg, 00402 const SmallVectorImpl<ISD::InputArg> &Ins, 00403 SDLoc DL, SelectionDAG &DAG, 00404 SmallVectorImpl<SDValue> &InVals) const { 00405 00406 const TargetRegisterInfo *TRI = 00407 getTargetMachine().getSubtargetImpl()->getRegisterInfo(); 00408 00409 MachineFunction &MF = DAG.getMachineFunction(); 00410 FunctionType *FType = MF.getFunction()->getFunctionType(); 00411 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 00412 00413 assert(CallConv == CallingConv::C); 00414 00415 SmallVector<ISD::InputArg, 16> Splits; 00416 BitVector Skipped(Ins.size()); 00417 00418 for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 00419 const ISD::InputArg &Arg = Ins[i]; 00420 00421 // First check if it's a PS input addr 00422 if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && 00423 !Arg.Flags.isByVal()) { 00424 00425 assert((PSInputNum <= 15) && "Too many PS inputs!"); 00426 00427 if (!Arg.Used) { 00428 // We can savely skip PS inputs 00429 Skipped.set(i); 00430 ++PSInputNum; 00431 continue; 00432 } 00433 00434 Info->PSInputAddr |= 1 << PSInputNum++; 00435 } 00436 00437 // Second split vertices into their elements 00438 if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { 00439 ISD::InputArg NewArg = Arg; 00440 NewArg.Flags.setSplit(); 00441 NewArg.VT = Arg.VT.getVectorElementType(); 00442 00443 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 00444 // three or five element vertex only needs three or five registers, 00445 // NOT four or eigth. 00446 Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 00447 unsigned NumElements = ParamType->getVectorNumElements(); 00448 00449 for (unsigned j = 0; j != NumElements; ++j) { 00450 Splits.push_back(NewArg); 00451 NewArg.PartOffset += NewArg.VT.getStoreSize(); 00452 } 00453 00454 } else if (Info->getShaderType() != ShaderType::COMPUTE) { 00455 Splits.push_back(Arg); 00456 } 00457 } 00458 00459 SmallVector<CCValAssign, 16> ArgLocs; 00460 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 00461 *DAG.getContext()); 00462 00463 // At least one interpolation mode must be enabled or else the GPU will hang. 
00464 if (Info->getShaderType() == ShaderType::PIXEL && 00465 (Info->PSInputAddr & 0x7F) == 0) { 00466 Info->PSInputAddr |= 1; 00467 CCInfo.AllocateReg(AMDGPU::VGPR0); 00468 CCInfo.AllocateReg(AMDGPU::VGPR1); 00469 } 00470 00471 // The pointer to the list of arguments is stored in SGPR0, SGPR1 00472 // The pointer to the scratch buffer is stored in SGPR2, SGPR3 00473 if (Info->getShaderType() == ShaderType::COMPUTE) { 00474 Info->NumUserSGPRs = 4; 00475 CCInfo.AllocateReg(AMDGPU::SGPR0); 00476 CCInfo.AllocateReg(AMDGPU::SGPR1); 00477 CCInfo.AllocateReg(AMDGPU::SGPR2); 00478 CCInfo.AllocateReg(AMDGPU::SGPR3); 00479 MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); 00480 MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass); 00481 } 00482 00483 if (Info->getShaderType() == ShaderType::COMPUTE) { 00484 getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 00485 Splits); 00486 } 00487 00488 AnalyzeFormalArguments(CCInfo, Splits); 00489 00490 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 00491 00492 const ISD::InputArg &Arg = Ins[i]; 00493 if (Skipped[i]) { 00494 InVals.push_back(DAG.getUNDEF(Arg.VT)); 00495 continue; 00496 } 00497 00498 CCValAssign &VA = ArgLocs[ArgIdx++]; 00499 EVT VT = VA.getLocVT(); 00500 00501 if (VA.isMemLoc()) { 00502 VT = Ins[i].VT; 00503 EVT MemVT = Splits[i].VT; 00504 // The first 36 bytes of the input buffer contains information about 00505 // thread group and global sizes. 00506 SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), 00507 36 + VA.getLocMemOffset(), 00508 Ins[i].Flags.isSExt()); 00509 00510 const PointerType *ParamTy = 00511 dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex)); 00512 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && 00513 ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { 00514 // On SI local pointers are just offsets into LDS, so they are always 00515 // less than 16-bits. On CI and newer they could potentially be 00516 // real pointers, so we can't guarantee their size. 
00517 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, 00518 DAG.getValueType(MVT::i16)); 00519 } 00520 00521 InVals.push_back(Arg); 00522 continue; 00523 } 00524 assert(VA.isRegLoc() && "Parameter must be in a register!"); 00525 00526 unsigned Reg = VA.getLocReg(); 00527 00528 if (VT == MVT::i64) { 00529 // For now assume it is a pointer 00530 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 00531 &AMDGPU::SReg_64RegClass); 00532 Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 00533 InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 00534 continue; 00535 } 00536 00537 const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 00538 00539 Reg = MF.addLiveIn(Reg, RC); 00540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 00541 00542 if (Arg.VT.isVector()) { 00543 00544 // Build a vector from the registers 00545 Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 00546 unsigned NumElements = ParamType->getVectorNumElements(); 00547 00548 SmallVector<SDValue, 4> Regs; 00549 Regs.push_back(Val); 00550 for (unsigned j = 1; j != NumElements; ++j) { 00551 Reg = ArgLocs[ArgIdx++].getLocReg(); 00552 Reg = MF.addLiveIn(Reg, RC); 00553 Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 00554 } 00555 00556 // Fill up the missing vector elements 00557 NumElements = Arg.VT.getVectorNumElements() - NumElements; 00558 for (unsigned j = 0; j != NumElements; ++j) 00559 Regs.push_back(DAG.getUNDEF(VT)); 00560 00561 InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); 00562 continue; 00563 } 00564 00565 InVals.push_back(Val); 00566 } 00567 return Chain; 00568 } 00569 00570 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 00571 MachineInstr * MI, MachineBasicBlock * BB) const { 00572 00573 MachineBasicBlock::iterator I = *MI; 00574 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 00575 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 00576 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 00577 00578 switch (MI->getOpcode()) { 00579 default: 00580 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 00581 case AMDGPU::BRANCH: return BB; 00582 case AMDGPU::SI_ADDR64_RSRC: { 00583 unsigned SuperReg = MI->getOperand(0).getReg(); 00584 unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 00585 unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass); 00586 unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 00587 unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 00588 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) 00589 .addOperand(MI->getOperand(1)); 00590 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) 00591 .addImm(0); 00592 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) 00593 .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32); 00594 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) 00595 .addReg(SubRegHiLo) 00596 .addImm(AMDGPU::sub0) 00597 .addReg(SubRegHiHi) 00598 .addImm(AMDGPU::sub1); 00599 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg) 00600 .addReg(SubRegLo) 00601 .addImm(AMDGPU::sub0_sub1) 00602 .addReg(SubRegHi) 00603 .addImm(AMDGPU::sub2_sub3); 00604 MI->eraseFromParent(); 00605 break; 00606 } 00607 case AMDGPU::SI_BUFFER_RSRC: { 00608 unsigned SuperReg = MI->getOperand(0).getReg(); 00609 unsigned Args[4]; 00610 for (unsigned i = 0, e = 4; i < e; ++i) { 00611 
MachineOperand &Arg = MI->getOperand(i + 1); 00612 00613 if (Arg.isReg()) { 00614 Args[i] = Arg.getReg(); 00615 continue; 00616 } 00617 00618 assert(Arg.isImm()); 00619 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); 00620 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg) 00621 .addImm(Arg.getImm()); 00622 Args[i] = Reg; 00623 } 00624 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), 00625 SuperReg) 00626 .addReg(Args[0]) 00627 .addImm(AMDGPU::sub0) 00628 .addReg(Args[1]) 00629 .addImm(AMDGPU::sub1) 00630 .addReg(Args[2]) 00631 .addImm(AMDGPU::sub2) 00632 .addReg(Args[3]) 00633 .addImm(AMDGPU::sub3); 00634 MI->eraseFromParent(); 00635 break; 00636 } 00637 case AMDGPU::V_SUB_F64: { 00638 unsigned DestReg = MI->getOperand(0).getReg(); 00639 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg) 00640 .addImm(0) // SRC0 modifiers 00641 .addReg(MI->getOperand(1).getReg()) 00642 .addImm(1) // SRC1 modifiers 00643 .addReg(MI->getOperand(2).getReg()) 00644 .addImm(0) // CLAMP 00645 .addImm(0); // OMOD 00646 MI->eraseFromParent(); 00647 break; 00648 } 00649 case AMDGPU::SI_RegisterStorePseudo: { 00650 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 00651 unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 00652 MachineInstrBuilder MIB = 00653 BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), 00654 Reg); 00655 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) 00656 MIB.addOperand(MI->getOperand(i)); 00657 00658 MI->eraseFromParent(); 00659 break; 00660 } 00661 case AMDGPU::FCLAMP_SI: { 00662 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 00663 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 00664 DebugLoc DL = MI->getDebugLoc(); 00665 unsigned DestReg = MI->getOperand(0).getReg(); 00666 BuildMI(*BB, I, DL, TII->get(AMDGPU::V_ADD_F32_e64), DestReg) 00667 .addImm(0) // SRC0 modifiers 00668 .addOperand(MI->getOperand(1)) 00669 .addImm(0) // SRC1 modifiers 00670 .addImm(0) // SRC1 00671 .addImm(1) // CLAMP 00672 .addImm(0); // OMOD 00673 MI->eraseFromParent(); 00674 } 00675 } 00676 return BB; 00677 } 00678 00679 EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 00680 if (!VT.isVector()) { 00681 return MVT::i1; 00682 } 00683 return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 00684 } 00685 00686 MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { 00687 return MVT::i32; 00688 } 00689 00690 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 00691 VT = VT.getScalarType(); 00692 00693 if (!VT.isSimple()) 00694 return false; 00695 00696 switch (VT.getSimpleVT().SimpleTy) { 00697 case MVT::f32: 00698 return false; /* There is V_MAD_F32 for f32 */ 00699 case MVT::f64: 00700 return true; 00701 default: 00702 break; 00703 } 00704 00705 return false; 00706 } 00707 00708 //===----------------------------------------------------------------------===// 00709 // Custom DAG Lowering Operations 00710 //===----------------------------------------------------------------------===// 00711 00712 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 00713 switch (Op.getOpcode()) { 00714 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 00715 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 00716 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 00717 case ISD::LOAD: { 00718 SDValue Result = LowerLOAD(Op, DAG); 00719 assert((!Result.getNode() || 00720 
Result.getNode()->getNumValues() == 2) && 00721 "Load should return a value and a chain"); 00722 return Result; 00723 } 00724 00725 case ISD::FSIN: 00726 case ISD::FCOS: 00727 return LowerTrig(Op, DAG); 00728 case ISD::SELECT: return LowerSELECT(Op, DAG); 00729 case ISD::FDIV: return LowerFDIV(Op, DAG); 00730 case ISD::STORE: return LowerSTORE(Op, DAG); 00731 case ISD::GlobalAddress: { 00732 MachineFunction &MF = DAG.getMachineFunction(); 00733 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 00734 return LowerGlobalAddress(MFI, Op, DAG); 00735 } 00736 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 00737 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); 00738 } 00739 return SDValue(); 00740 } 00741 00742 /// \brief Helper function for LowerBRCOND 00743 static SDNode *findUser(SDValue Value, unsigned Opcode) { 00744 00745 SDNode *Parent = Value.getNode(); 00746 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 00747 I != E; ++I) { 00748 00749 if (I.getUse().get() != Value) 00750 continue; 00751 00752 if (I->getOpcode() == Opcode) 00753 return *I; 00754 } 00755 return nullptr; 00756 } 00757 00758 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { 00759 00760 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); 00761 unsigned FrameIndex = FINode->getIndex(); 00762 00763 return DAG.getTargetFrameIndex(FrameIndex, MVT::i32); 00764 } 00765 00766 /// This transforms the control flow intrinsics to get the branch destination as 00767 /// last parameter, also switches branch target with BR if the need arise 00768 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 00769 SelectionDAG &DAG) const { 00770 00771 SDLoc DL(BRCOND); 00772 00773 SDNode *Intr = BRCOND.getOperand(1).getNode(); 00774 SDValue Target = BRCOND.getOperand(2); 00775 SDNode *BR = nullptr; 00776 00777 if (Intr->getOpcode() == ISD::SETCC) { 00778 // As long as we negate the condition everything is fine 00779 SDNode *SetCC = Intr; 00780 assert(SetCC->getConstantOperandVal(1) == 1); 00781 assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 00782 ISD::SETNE); 00783 Intr = SetCC->getOperand(0).getNode(); 00784 00785 } else { 00786 // Get the target from BR if we don't negate the condition 00787 BR = findUser(BRCOND, ISD::BR); 00788 Target = BR->getOperand(1); 00789 } 00790 00791 assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 00792 00793 // Build the result and 00794 SmallVector<EVT, 4> Res; 00795 for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) 00796 Res.push_back(Intr->getValueType(i)); 00797 00798 // operands of the new intrinsic call 00799 SmallVector<SDValue, 4> Ops; 00800 Ops.push_back(BRCOND.getOperand(0)); 00801 for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) 00802 Ops.push_back(Intr->getOperand(i)); 00803 Ops.push_back(Target); 00804 00805 // build the new intrinsic call 00806 SDNode *Result = DAG.getNode( 00807 Res.size() > 1 ? 
ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 00808 DAG.getVTList(Res), Ops).getNode(); 00809 00810 if (BR) { 00811 // Give the branch instruction our target 00812 SDValue Ops[] = { 00813 BR->getOperand(0), 00814 BRCOND.getOperand(2) 00815 }; 00816 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); 00817 DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); 00818 BR = NewBR.getNode(); 00819 } 00820 00821 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 00822 00823 // Copy the intrinsic results to registers 00824 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 00825 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 00826 if (!CopyToReg) 00827 continue; 00828 00829 Chain = DAG.getCopyToReg( 00830 Chain, DL, 00831 CopyToReg->getOperand(1), 00832 SDValue(Result, i - 1), 00833 SDValue()); 00834 00835 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 00836 } 00837 00838 // Remove the old intrinsic from the chain 00839 DAG.ReplaceAllUsesOfValueWith( 00840 SDValue(Intr, Intr->getNumValues() - 1), 00841 Intr->getOperand(0)); 00842 00843 return Chain; 00844 } 00845 00846 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, 00847 SDValue Op, 00848 SelectionDAG &DAG) const { 00849 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); 00850 00851 if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) 00852 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); 00853 00854 SDLoc DL(GSD); 00855 const GlobalValue *GV = GSD->getGlobal(); 00856 MVT PtrVT = getPointerTy(GSD->getAddressSpace()); 00857 00858 SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); 00859 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); 00860 00861 SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 00862 DAG.getConstant(0, MVT::i32)); 00863 SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, 00864 DAG.getConstant(1, MVT::i32)); 00865 00866 SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), 00867 PtrLo, GA); 00868 SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), 00869 PtrHi, DAG.getConstant(0, MVT::i32), 00870 SDValue(Lo.getNode(), 1)); 00871 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); 00872 } 00873 00874 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 00875 SelectionDAG &DAG) const { 00876 MachineFunction &MF = DAG.getMachineFunction(); 00877 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 00878 00879 EVT VT = Op.getValueType(); 00880 SDLoc DL(Op); 00881 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 00882 00883 switch (IntrinsicID) { 00884 case Intrinsic::r600_read_ngroups_x: 00885 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false); 00886 case Intrinsic::r600_read_ngroups_y: 00887 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false); 00888 case Intrinsic::r600_read_ngroups_z: 00889 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false); 00890 case Intrinsic::r600_read_global_size_x: 00891 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false); 00892 case Intrinsic::r600_read_global_size_y: 00893 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false); 00894 case Intrinsic::r600_read_global_size_z: 00895 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false); 00896 case Intrinsic::r600_read_local_size_x: 00897 return LowerParameter(DAG, VT, VT, DL, 
DAG.getEntryNode(), 24, false); 00898 case Intrinsic::r600_read_local_size_y: 00899 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false); 00900 case Intrinsic::r600_read_local_size_z: 00901 return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false); 00902 case Intrinsic::r600_read_tgid_x: 00903 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 00904 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT); 00905 case Intrinsic::r600_read_tgid_y: 00906 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 00907 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT); 00908 case Intrinsic::r600_read_tgid_z: 00909 return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 00910 AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT); 00911 case Intrinsic::r600_read_tidig_x: 00912 return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 00913 AMDGPU::VGPR0, VT); 00914 case Intrinsic::r600_read_tidig_y: 00915 return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 00916 AMDGPU::VGPR1, VT); 00917 case Intrinsic::r600_read_tidig_z: 00918 return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 00919 AMDGPU::VGPR2, VT); 00920 case AMDGPUIntrinsic::SI_load_const: { 00921 SDValue Ops[] = { 00922 Op.getOperand(1), 00923 Op.getOperand(2) 00924 }; 00925 00926 MachineMemOperand *MMO = MF.getMachineMemOperand( 00927 MachinePointerInfo(), 00928 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 00929 VT.getStoreSize(), 4); 00930 return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 00931 Op->getVTList(), Ops, VT, MMO); 00932 } 00933 case AMDGPUIntrinsic::SI_sample: 00934 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); 00935 case AMDGPUIntrinsic::SI_sampleb: 00936 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); 00937 case AMDGPUIntrinsic::SI_sampled: 00938 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); 00939 case AMDGPUIntrinsic::SI_samplel: 00940 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); 00941 case AMDGPUIntrinsic::SI_vs_load_input: 00942 return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 00943 Op.getOperand(1), 00944 Op.getOperand(2), 00945 Op.getOperand(3)); 00946 default: 00947 return AMDGPUTargetLowering::LowerOperation(Op, DAG); 00948 } 00949 } 00950 00951 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 00952 SelectionDAG &DAG) const { 00953 MachineFunction &MF = DAG.getMachineFunction(); 00954 SDValue Chain = Op.getOperand(0); 00955 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 00956 00957 switch (IntrinsicID) { 00958 case AMDGPUIntrinsic::SI_tbuffer_store: { 00959 SDLoc DL(Op); 00960 SDValue Ops[] = { 00961 Chain, 00962 Op.getOperand(2), 00963 Op.getOperand(3), 00964 Op.getOperand(4), 00965 Op.getOperand(5), 00966 Op.getOperand(6), 00967 Op.getOperand(7), 00968 Op.getOperand(8), 00969 Op.getOperand(9), 00970 Op.getOperand(10), 00971 Op.getOperand(11), 00972 Op.getOperand(12), 00973 Op.getOperand(13), 00974 Op.getOperand(14) 00975 }; 00976 00977 EVT VT = Op.getOperand(3).getValueType(); 00978 00979 MachineMemOperand *MMO = MF.getMachineMemOperand( 00980 MachinePointerInfo(), 00981 MachineMemOperand::MOStore, 00982 VT.getStoreSize(), 4); 00983 return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 00984 Op->getVTList(), Ops, VT, MMO); 00985 } 00986 default: 00987 return SDValue(); 00988 } 00989 } 00990 00991 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 00992 SDLoc DL(Op); 00993 
LoadSDNode *Load = cast<LoadSDNode>(Op); 00994 00995 if (Op.getValueType().isVector()) { 00996 assert(Op.getValueType().getVectorElementType() == MVT::i32 && 00997 "Custom lowering for non-i32 vectors hasn't been implemented."); 00998 unsigned NumElements = Op.getValueType().getVectorNumElements(); 00999 assert(NumElements != 2 && "v2 loads are supported for all address spaces."); 01000 switch (Load->getAddressSpace()) { 01001 default: break; 01002 case AMDGPUAS::GLOBAL_ADDRESS: 01003 case AMDGPUAS::PRIVATE_ADDRESS: 01004 // v4 loads are supported for private and global memory. 01005 if (NumElements <= 4) 01006 break; 01007 // fall-through 01008 case AMDGPUAS::LOCAL_ADDRESS: 01009 return ScalarizeVectorLoad(Op, DAG); 01010 } 01011 } 01012 01013 return AMDGPUTargetLowering::LowerLOAD(Op, DAG); 01014 } 01015 01016 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 01017 const SDValue &Op, 01018 SelectionDAG &DAG) const { 01019 return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 01020 Op.getOperand(2), 01021 Op.getOperand(3), 01022 Op.getOperand(4)); 01023 } 01024 01025 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 01026 if (Op.getValueType() != MVT::i64) 01027 return SDValue(); 01028 01029 SDLoc DL(Op); 01030 SDValue Cond = Op.getOperand(0); 01031 01032 SDValue Zero = DAG.getConstant(0, MVT::i32); 01033 SDValue One = DAG.getConstant(1, MVT::i32); 01034 01035 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1)); 01036 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2)); 01037 01038 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero); 01039 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero); 01040 01041 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1); 01042 01043 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One); 01044 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One); 01045 01046 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); 01047 01048 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); 01049 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); 01050 } 01051 01052 // Catch division cases where we can use shortcuts with rcp and rsq 01053 // instructions. 01054 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { 01055 SDLoc SL(Op); 01056 SDValue LHS = Op.getOperand(0); 01057 SDValue RHS = Op.getOperand(1); 01058 EVT VT = Op.getValueType(); 01059 bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; 01060 01061 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { 01062 if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) && 01063 CLHS->isExactlyValue(1.0)) { 01064 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 01065 // the CI documentation has a worst case error of 1 ulp. 01066 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 01067 // use it as long as we aren't trying to use denormals. 01068 01069 // 1.0 / sqrt(x) -> rsq(x) 01070 // 01071 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP 01072 // error seems really high at 2^29 ULP. 01073 if (RHS.getOpcode() == ISD::FSQRT) 01074 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); 01075 01076 // 1.0 / x -> rcp(x) 01077 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 01078 } 01079 } 01080 01081 if (Unsafe) { 01082 // Turn into multiply by the reciprocal. 
01083 // x / y -> x * (1.0 / y) 01084 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); 01085 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); 01086 } 01087 01088 return SDValue(); 01089 } 01090 01091 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { 01092 SDValue FastLowered = LowerFastFDIV(Op, DAG); 01093 if (FastLowered.getNode()) 01094 return FastLowered; 01095 01096 // This uses v_rcp_f32 which does not handle denormals. Let this hit a 01097 // selection error for now rather than do something incorrect. 01098 if (Subtarget->hasFP32Denormals()) 01099 return SDValue(); 01100 01101 SDLoc SL(Op); 01102 SDValue LHS = Op.getOperand(0); 01103 SDValue RHS = Op.getOperand(1); 01104 01105 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); 01106 01107 const APFloat K0Val(BitsToFloat(0x6f800000)); 01108 const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32); 01109 01110 const APFloat K1Val(BitsToFloat(0x2f800000)); 01111 const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32); 01112 01113 const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32); 01114 01115 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); 01116 01117 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); 01118 01119 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); 01120 01121 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); 01122 01123 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); 01124 01125 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); 01126 01127 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); 01128 } 01129 01130 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { 01131 return SDValue(); 01132 } 01133 01134 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { 01135 EVT VT = Op.getValueType(); 01136 01137 if (VT == MVT::f32) 01138 return LowerFDIV32(Op, DAG); 01139 01140 if (VT == MVT::f64) 01141 return LowerFDIV64(Op, DAG); 01142 01143 llvm_unreachable("Unexpected type for fdiv"); 01144 } 01145 01146 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 01147 SDLoc DL(Op); 01148 StoreSDNode *Store = cast<StoreSDNode>(Op); 01149 EVT VT = Store->getMemoryVT(); 01150 01151 // These stores are legal. 
01152 if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && 01153 VT.isVector() && VT.getVectorNumElements() == 2 && 01154 VT.getVectorElementType() == MVT::i32) 01155 return SDValue(); 01156 01157 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { 01158 if (VT.isVector() && VT.getVectorNumElements() > 4) 01159 return ScalarizeVectorStore(Op, DAG); 01160 return SDValue(); 01161 } 01162 01163 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 01164 if (Ret.getNode()) 01165 return Ret; 01166 01167 if (VT.isVector() && VT.getVectorNumElements() >= 8) 01168 return ScalarizeVectorStore(Op, DAG); 01169 01170 if (VT == MVT::i1) 01171 return DAG.getTruncStore(Store->getChain(), DL, 01172 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), 01173 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); 01174 01175 return SDValue(); 01176 } 01177 01178 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { 01179 EVT VT = Op.getValueType(); 01180 SDValue Arg = Op.getOperand(0); 01181 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT, 01182 DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg, 01183 DAG.getConstantFP(0.5 / M_PI, VT))); 01184 01185 switch (Op.getOpcode()) { 01186 case ISD::FCOS: 01187 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart); 01188 case ISD::FSIN: 01189 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart); 01190 default: 01191 llvm_unreachable("Wrong trig opcode"); 01192 } 01193 } 01194 01195 //===----------------------------------------------------------------------===// 01196 // Custom DAG optimizations 01197 //===----------------------------------------------------------------------===// 01198 01199 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, 01200 DAGCombinerInfo &DCI) { 01201 EVT VT = N->getValueType(0); 01202 EVT ScalarVT = VT.getScalarType(); 01203 if (ScalarVT != MVT::f32) 01204 return SDValue(); 01205 01206 SelectionDAG &DAG = DCI.DAG; 01207 SDLoc DL(N); 01208 01209 SDValue Src = N->getOperand(0); 01210 EVT SrcVT = Src.getValueType(); 01211 01212 // TODO: We could try to match extracting the higher bytes, which would be 01213 // easier if i8 vectors weren't promoted to i32 vectors, particularly after 01214 // types are legalized. v4i8 -> v4f32 is probably the only case to worry 01215 // about in practice. 01216 if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) { 01217 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { 01218 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); 01219 DCI.AddToWorklist(Cvt.getNode()); 01220 return Cvt; 01221 } 01222 } 01223 01224 // We are primarily trying to catch operations on illegal vector types 01225 // before they are expanded. 01226 // For scalars, we can use the more flexible method of checking masked bits 01227 // after legalization. 01228 if (!DCI.isBeforeLegalize() || 01229 !SrcVT.isVector() || 01230 SrcVT.getVectorElementType() != MVT::i8) { 01231 return SDValue(); 01232 } 01233 01234 assert(DCI.isBeforeLegalize() && "Unexpected legal type"); 01235 01236 // Weird sized vectors are a pain to handle, but we know 3 is really the same 01237 // size as 4. 01238 unsigned NElts = SrcVT.getVectorNumElements(); 01239 if (!SrcVT.isSimple() && NElts != 3) 01240 return SDValue(); 01241 01242 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to 01243 // prevent a mess from expanding to v4i32 and repacking. 
01244 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { 01245 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); 01246 EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); 01247 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); 01248 01249 LoadSDNode *Load = cast<LoadSDNode>(Src); 01250 SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, 01251 Load->getChain(), 01252 Load->getBasePtr(), 01253 LoadVT, 01254 Load->getMemOperand()); 01255 01256 // Make sure successors of the original load stay after it by updating 01257 // them to use the new Chain. 01258 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); 01259 01260 SmallVector<SDValue, 4> Elts; 01261 if (RegVT.isVector()) 01262 DAG.ExtractVectorElements(NewLoad, Elts); 01263 else 01264 Elts.push_back(NewLoad); 01265 01266 SmallVector<SDValue, 4> Ops; 01267 01268 unsigned EltIdx = 0; 01269 for (SDValue Elt : Elts) { 01270 unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); 01271 for (unsigned I = 0; I < ComponentsInElt; ++I) { 01272 unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; 01273 SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); 01274 DCI.AddToWorklist(Cvt.getNode()); 01275 Ops.push_back(Cvt); 01276 } 01277 01278 ++EltIdx; 01279 } 01280 01281 assert(Ops.size() == NElts); 01282 01283 return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); 01284 } 01285 01286 return SDValue(); 01287 } 01288 01289 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) 01290 01291 // This is a variant of 01292 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2), 01293 // 01294 // The normal DAG combiner will do this, but only if the add has one use since 01295 // that would increase the number of instructions. 01296 // 01297 // This prevents us from seeing a constant offset that can be folded into a 01298 // memory instruction's addressing mode. If we know the resulting add offset of 01299 // a pointer can be folded into an addressing offset, we can replace the pointer 01300 // operand with the add of new constant offset. This eliminates one of the uses, 01301 // and may allow the remaining use to also be simplified. 01302 // 01303 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, 01304 unsigned AddrSpace, 01305 DAGCombinerInfo &DCI) const { 01306 SDValue N0 = N->getOperand(0); 01307 SDValue N1 = N->getOperand(1); 01308 01309 if (N0.getOpcode() != ISD::ADD) 01310 return SDValue(); 01311 01312 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1); 01313 if (!CN1) 01314 return SDValue(); 01315 01316 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 01317 if (!CAdd) 01318 return SDValue(); 01319 01320 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 01321 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 01322 01323 // If the resulting offset is too large, we can't fold it into the addressing 01324 // mode offset. 
01325 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); 01326 if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace)) 01327 return SDValue(); 01328 01329 SelectionDAG &DAG = DCI.DAG; 01330 SDLoc SL(N); 01331 EVT VT = N->getValueType(0); 01332 01333 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); 01334 SDValue COffset = DAG.getConstant(Offset, MVT::i32); 01335 01336 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset); 01337 } 01338 01339 SDValue SITargetLowering::PerformDAGCombine(SDNode *N, 01340 DAGCombinerInfo &DCI) const { 01341 SelectionDAG &DAG = DCI.DAG; 01342 SDLoc DL(N); 01343 EVT VT = N->getValueType(0); 01344 01345 switch (N->getOpcode()) { 01346 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 01347 case ISD::SETCC: { 01348 SDValue Arg0 = N->getOperand(0); 01349 SDValue Arg1 = N->getOperand(1); 01350 SDValue CC = N->getOperand(2); 01351 ConstantSDNode * C = nullptr; 01352 ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); 01353 01354 // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) 01355 if (VT == MVT::i1 01356 && Arg0.getOpcode() == ISD::SIGN_EXTEND 01357 && Arg0.getOperand(0).getValueType() == MVT::i1 01358 && (C = dyn_cast<ConstantSDNode>(Arg1)) 01359 && C->isNullValue() 01360 && CCOp == ISD::SETNE) { 01361 return SimplifySetCC(VT, Arg0.getOperand(0), 01362 DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); 01363 } 01364 break; 01365 } 01366 01367 case AMDGPUISD::CVT_F32_UBYTE0: 01368 case AMDGPUISD::CVT_F32_UBYTE1: 01369 case AMDGPUISD::CVT_F32_UBYTE2: 01370 case AMDGPUISD::CVT_F32_UBYTE3: { 01371 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; 01372 01373 SDValue Src = N->getOperand(0); 01374 APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); 01375 01376 APInt KnownZero, KnownOne; 01377 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 01378 !DCI.isBeforeLegalizeOps()); 01379 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 01380 if (TLO.ShrinkDemandedConstant(Src, Demanded) || 01381 TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { 01382 DCI.CommitTargetLoweringOpt(TLO); 01383 } 01384 01385 break; 01386 } 01387 01388 case ISD::UINT_TO_FP: { 01389 return performUCharToFloatCombine(N, DCI); 01390 01391 case ISD::FSUB: { 01392 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) 01393 break; 01394 01395 EVT VT = N->getValueType(0); 01396 01397 // Try to get the fneg to fold into the source modifier. This undoes generic 01398 // DAG combines and folds them into the mad. 
01399 if (VT == MVT::f32) { 01400 SDValue LHS = N->getOperand(0); 01401 SDValue RHS = N->getOperand(1); 01402 01403 if (LHS.getOpcode() == ISD::FMUL) { 01404 // (fsub (fmul a, b), c) -> mad a, b, (fneg c) 01405 01406 SDValue A = LHS.getOperand(0); 01407 SDValue B = LHS.getOperand(1); 01408 SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS); 01409 01410 return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); 01411 } 01412 01413 if (RHS.getOpcode() == ISD::FMUL) { 01414 // (fsub c, (fmul a, b)) -> mad (fneg a), b, c 01415 01416 SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0)); 01417 SDValue B = RHS.getOperand(1); 01418 SDValue C = LHS; 01419 01420 return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C); 01421 } 01422 } 01423 01424 break; 01425 } 01426 } 01427 case ISD::LOAD: 01428 case ISD::STORE: 01429 case ISD::ATOMIC_LOAD: 01430 case ISD::ATOMIC_STORE: 01431 case ISD::ATOMIC_CMP_SWAP: 01432 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: 01433 case ISD::ATOMIC_SWAP: 01434 case ISD::ATOMIC_LOAD_ADD: 01435 case ISD::ATOMIC_LOAD_SUB: 01436 case ISD::ATOMIC_LOAD_AND: 01437 case ISD::ATOMIC_LOAD_OR: 01438 case ISD::ATOMIC_LOAD_XOR: 01439 case ISD::ATOMIC_LOAD_NAND: 01440 case ISD::ATOMIC_LOAD_MIN: 01441 case ISD::ATOMIC_LOAD_MAX: 01442 case ISD::ATOMIC_LOAD_UMIN: 01443 case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. 01444 if (DCI.isBeforeLegalize()) 01445 break; 01446 01447 MemSDNode *MemNode = cast<MemSDNode>(N); 01448 SDValue Ptr = MemNode->getBasePtr(); 01449 01450 // TODO: We could also do this for multiplies. 01451 unsigned AS = MemNode->getAddressSpace(); 01452 if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) { 01453 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI); 01454 if (NewPtr) { 01455 SmallVector<SDValue, 8> NewOps; 01456 for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I) 01457 NewOps.push_back(MemNode->getOperand(I)); 01458 01459 NewOps[N->getOpcode() == ISD::STORE ? 
2 : 1] = NewPtr; 01460 return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0); 01461 } 01462 } 01463 break; 01464 } 01465 } 01466 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); 01467 } 01468 01469 /// \brief Test if RegClass is one of the VSrc classes 01470 static bool isVSrc(unsigned RegClass) { 01471 return AMDGPU::VSrc_32RegClassID == RegClass || 01472 AMDGPU::VSrc_64RegClassID == RegClass; 01473 } 01474 01475 /// \brief Test if RegClass is one of the SSrc classes 01476 static bool isSSrc(unsigned RegClass) { 01477 return AMDGPU::SSrc_32RegClassID == RegClass || 01478 AMDGPU::SSrc_64RegClassID == RegClass; 01479 } 01480 01481 /// \brief Analyze the possible immediate value Op 01482 /// 01483 /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 01484 /// and the immediate value if it's a literal immediate 01485 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 01486 01487 union { 01488 int32_t I; 01489 float F; 01490 } Imm; 01491 01492 if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 01493 if (Node->getZExtValue() >> 32) { 01494 return -1; 01495 } 01496 Imm.I = Node->getSExtValue(); 01497 } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) { 01498 if (N->getValueType(0) != MVT::f32) 01499 return -1; 01500 Imm.F = Node->getValueAPF().convertToFloat(); 01501 } else 01502 return -1; // It isn't an immediate 01503 01504 if ((Imm.I >= -16 && Imm.I <= 64) || 01505 Imm.F == 0.5f || Imm.F == -0.5f || 01506 Imm.F == 1.0f || Imm.F == -1.0f || 01507 Imm.F == 2.0f || Imm.F == -2.0f || 01508 Imm.F == 4.0f || Imm.F == -4.0f) 01509 return 0; // It's an inline immediate 01510 01511 return Imm.I; // It's a literal immediate 01512 } 01513 01514 /// \brief Try to fold an immediate directly into an instruction 01515 bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, 01516 bool &ScalarSlotUsed) const { 01517 01518 MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); 01519 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 01520 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 01521 if (!Mov || !TII->isMov(Mov->getMachineOpcode())) 01522 return false; 01523 01524 const SDValue &Op = Mov->getOperand(0); 01525 int32_t Value = analyzeImmediate(Op.getNode()); 01526 if (Value == -1) { 01527 // Not an immediate at all 01528 return false; 01529 01530 } else if (Value == 0) { 01531 // Inline immediates can always be fold 01532 Operand = Op; 01533 return true; 01534 01535 } else if (Value == Immediate) { 01536 // Already fold literal immediate 01537 Operand = Op; 01538 return true; 01539 01540 } else if (!ScalarSlotUsed && !Immediate) { 01541 // Fold this literal immediate 01542 ScalarSlotUsed = true; 01543 Immediate = Value; 01544 Operand = Op; 01545 return true; 01546 01547 } 01548 01549 return false; 01550 } 01551 01552 const TargetRegisterClass *SITargetLowering::getRegClassForNode( 01553 SelectionDAG &DAG, const SDValue &Op) const { 01554 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>( 01555 getTargetMachine().getSubtargetImpl()->getInstrInfo()); 01556 const SIRegisterInfo &TRI = TII->getRegisterInfo(); 01557 01558 if (!Op->isMachineOpcode()) { 01559 switch(Op->getOpcode()) { 01560 case ISD::CopyFromReg: { 01561 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 01562 unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); 01563 if (TargetRegisterInfo::isVirtualRegister(Reg)) { 01564 return MRI.getRegClass(Reg); 01565 } 01566 return TRI.getPhysRegClass(Reg); 
01567     }
01568     default: return nullptr;
01569     }
01570   }
01571   const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
01572   int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
01573   if (OpClassID != -1) {
01574     return TRI.getRegClass(OpClassID);
01575   }
01576   switch(Op.getMachineOpcode()) {
01577   case AMDGPU::COPY_TO_REGCLASS:
01578     // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
01579     OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
01580 
01581     // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
01582     // class, then the register class for the value could be either a VReg
01583     // or an SReg. To get a more accurate answer, recurse on the operand.
01584     if (OpClassID == AMDGPU::VSrc_32RegClassID ||
01585         OpClassID == AMDGPU::VSrc_64RegClassID) {
01586       return getRegClassForNode(DAG, Op.getOperand(0));
01587     }
01588     return TRI.getRegClass(OpClassID);
01589   case AMDGPU::EXTRACT_SUBREG: {
01590     int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
01591     const TargetRegisterClass *SuperClass =
01592         getRegClassForNode(DAG, Op.getOperand(0));
01593     return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
01594   }
01595   case AMDGPU::REG_SEQUENCE:
01596     // Operand 0 is the register class id for REG_SEQUENCE instructions.
01597     return TRI.getRegClass(
01598         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
01599   default:
01600     return getRegClassFor(Op.getSimpleValueType());
01601   }
01602 }
01603 
01604 /// \brief Does "Op" fit into register class "RegClass"?
01605 bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
01606                                     unsigned RegClass) const {
01607   const TargetRegisterInfo *TRI =
01608       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
01609   const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
01610   if (!RC) {
01611     return false;
01612   }
01613   return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
01614 }
01615 
01616 /// \brief Make sure that we don't exceed the number of allowed scalars
01617 void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
01618                                        unsigned RegClass,
01619                                        bool &ScalarSlotUsed) const {
01620 
01621   // First map the operand's register class to a destination class
01622   if (RegClass == AMDGPU::VSrc_32RegClassID)
01623     RegClass = AMDGPU::VReg_32RegClassID;
01624   else if (RegClass == AMDGPU::VSrc_64RegClassID)
01625     RegClass = AMDGPU::VReg_64RegClassID;
01626   else
01627     return;
01628 
01629   // Nothing to do if they fit naturally
01630   if (fitsRegClass(DAG, Operand, RegClass))
01631     return;
01632 
01633   // If the scalar slot isn't used yet, use it now
01634   if (!ScalarSlotUsed) {
01635     ScalarSlotUsed = true;
01636     return;
01637   }
01638 
01639   // This is a conservative approach. It is possible that we can't determine
01640   // the correct register class and copy too often, but better safe than sorry.
01641 
01642   SDNode *Node;
01643   // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
01644   if (isa<FrameIndexSDNode>(Operand) ||
01645       isa<GlobalAddressSDNode>(Operand)) {
01646     unsigned Opcode = Operand.getValueType() == MVT::i32 ?
01647                           AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
01648     Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
01649                               Operand);
01650   } else {
01651     SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
01652     Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
01653                               Operand.getValueType(), Operand, RC);
01654   }
01655   Operand = SDValue(Node, 0);
01656 }
01657 
01658 /// \returns true if \p Node's operands are different from the SDValue list
01659 /// \p Ops
01660 static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
01661   for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
01662     if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
01663       return true;
01664     }
01665   }
01666   return false;
01667 }
01668 
01669 /// \brief Try to commute instructions and insert copies in order to satisfy
01670 /// the operand constraints.
01671 SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node,
01672                                            SelectionDAG &DAG) const {
01673   // Original encoding (either e32 or e64)
01674   int Opcode = Node->getMachineOpcode();
01675   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01676       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01677   const MCInstrDesc *Desc = &TII->get(Opcode);
01678 
01679   unsigned NumDefs = Desc->getNumDefs();
01680   unsigned NumOps = Desc->getNumOperands();
01681 
01682   // Commuted opcode, if available
01683   int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
01684   const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
01685 
01686   assert(!DescRev || DescRev->getNumDefs() == NumDefs);
01687   assert(!DescRev || DescRev->getNumOperands() == NumOps);
01688 
01689   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
01690   bool HaveVSrc = false, HaveSSrc = false;
01691 
01692   // First figure out what we already have in this instruction.
01693   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
01694        i != e && Op < NumOps; ++i, ++Op) {
01695 
01696     unsigned RegClass = Desc->OpInfo[Op].RegClass;
01697     if (isVSrc(RegClass))
01698       HaveVSrc = true;
01699     else if (isSSrc(RegClass))
01700       HaveSSrc = true;
01701     else
01702       continue;
01703 
01704     int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
01705     if (Imm != -1 && Imm != 0) {
01706       // Literal immediate
01707       Immediate = Imm;
01708     }
01709   }
01710 
01711   // If we have neither a VSrc nor an SSrc operand, it makes no sense to continue.
01712   if (!HaveVSrc && !HaveSSrc)
01713     return Node;
01714 
01715   // The scalar slot may not be used when we already have both VSrc and SSrc
01716   bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
01717 
01718   // Second, go over the operands and try to fold them
01719   std::vector<SDValue> Ops;
01720   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
01721        i != e && Op < NumOps; ++i, ++Op) {
01722 
01723     const SDValue &Operand = Node->getOperand(i);
01724     Ops.push_back(Operand);
01725 
01726     // Already folded immediate?
01727     if (isa<ConstantSDNode>(Operand.getNode()) ||
01728         isa<ConstantFPSDNode>(Operand.getNode()))
01729       continue;
01730 
01731     // Is this a VSrc or SSrc operand?
01732     unsigned RegClass = Desc->OpInfo[Op].RegClass;
01733     if (isVSrc(RegClass) || isSSrc(RegClass)) {
01734       // Try to fold the immediates
01735       if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
01736         // Folding didn't work; make sure we don't hit the SReg limit.
01737         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
01738       }
01739       continue;
01740     } else {
01741       // If it's not a VSrc or SSrc operand, check whether we have a GlobalAddress.
01742       // These will be lowered to immediates, so we will need to insert a MOV.
01743       if (isa<GlobalAddressSDNode>(Ops[i])) {
01744         SDNode *Node = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
01745                                           Operand.getValueType(), Operand);
01746         Ops[i] = SDValue(Node, 0);
01747       }
01748     }
01749 
01750     if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
01751 
01752       unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
01753       assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
01754 
01755       // Test if it makes sense to swap the operands
01756       if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
01757           (!fitsRegClass(DAG, Ops[1], RegClass) &&
01758            fitsRegClass(DAG, Ops[1], OtherRegClass))) {
01759 
01760         // Swap the commutable operands
01761         std::swap(Ops[0], Ops[1]);
01762 
01763         Desc = DescRev;
01764         DescRev = nullptr;
01765         continue;
01766       }
01767     }
01768   }
01769 
01770   // Add optional chain and glue
01771   for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
01772     Ops.push_back(Node->getOperand(i));
01773 
01774   // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
01775   // this case a brand new node is always created, even if the operands
01776   // are the same as before. So, manually check whether anything has changed.
01777   if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
01778     return Node;
01779   }
01780 
01781   // Create a completely new instruction
01782   return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
01783 }
01784 
01785 /// \brief Helper function for adjustWritemask
01786 static unsigned SubIdx2Lane(unsigned Idx) {
01787   switch (Idx) {
01788   default: return 0;
01789   case AMDGPU::sub0: return 0;
01790   case AMDGPU::sub1: return 1;
01791   case AMDGPU::sub2: return 2;
01792   case AMDGPU::sub3: return 3;
01793   }
01794 }
01795 
01796 /// \brief Adjust the writemask of MIMG instructions
01797 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
01798                                        SelectionDAG &DAG) const {
01799   SDNode *Users[4] = { };
01800   unsigned Lane = 0;
01801   unsigned OldDmask = Node->getConstantOperandVal(0);
01802   unsigned NewDmask = 0;
01803 
01804   // Try to figure out the used register components
01805   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
01806        I != E; ++I) {
01807 
01808     // Abort if we can't understand the usage
01809     if (!I->isMachineOpcode() ||
01810         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
01811       return;
01812 
01813     // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
01814     // Note that subregs are packed, i.e. Lane==0 is the first bit set
01815     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
01816     // set, etc.
01817     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
01818 
01819     // Set which texture component corresponds to the lane.
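    // As a worked example of the loop below: if OldDmask were 0b1010
    // (components Y and W enabled), then Lane 0 would select Comp 1 (the
    // first set bit) and Lane 1 would select Comp 3 (the next set bit once
    // bit 1 has been cleared).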
01820     unsigned Comp;
01821     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
01822       assert(Dmask);
01823       Comp = countTrailingZeros(Dmask);
01824       Dmask &= ~(1 << Comp);
01825     }
01826 
01827     // Abort if we have more than one user per component
01828     if (Users[Lane])
01829       return;
01830 
01831     Users[Lane] = *I;
01832     NewDmask |= 1 << Comp;
01833   }
01834 
01835   // Abort if there's no change
01836   if (NewDmask == OldDmask)
01837     return;
01838 
01839   // Adjust the writemask in the node
01840   std::vector<SDValue> Ops;
01841   Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
01842   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
01843     Ops.push_back(Node->getOperand(i));
01844   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
01845 
01846   // If we only got one lane, replace it with a copy
01847   // (if NewDmask has only one bit set...)
01848   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
01849     SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
01850     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
01851                                       SDLoc(), Users[Lane]->getValueType(0),
01852                                       SDValue(Node, 0), RC);
01853     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
01854     return;
01855   }
01856 
01857   // Update the users of the node with the new indices
01858   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
01859 
01860     SDNode *User = Users[i];
01861     if (!User)
01862       continue;
01863 
01864     SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
01865     DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
01866 
01867     switch (Idx) {
01868     default: break;
01869     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
01870     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
01871     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
01872     }
01873   }
01874 }
01875 
01876 /// \brief Fold the instructions after selecting them.
01877 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
01878                                           SelectionDAG &DAG) const {
01879   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01880       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01881   Node = AdjustRegClass(Node, DAG);
01882 
01883   if (TII->isMIMG(Node->getMachineOpcode()))
01884     adjustWritemask(Node, DAG);
01885 
01886   return legalizeOperands(Node, DAG);
01887 }
01888 
01889 /// \brief Assign the register class depending on the number of
01890 /// bits set in the writemask
01891 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
01892                                                      SDNode *Node) const {
01893   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01894       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01895 
01896   if (TII->isMIMG(MI->getOpcode())) {
01897     unsigned VReg = MI->getOperand(0).getReg();
01898     unsigned Writemask = MI->getOperand(1).getImm();
01899     unsigned BitsSet = 0;
01900     for (unsigned i = 0; i < 4; ++i)
01901       BitsSet += Writemask & (1 << i) ? 1 : 0;
01902 
01903     const TargetRegisterClass *RC;
01904     switch (BitsSet) {
01905     default: return;
01906     case 1: RC = &AMDGPU::VReg_32RegClass; break;
01907     case 2: RC = &AMDGPU::VReg_64RegClass; break;
01908     case 3: RC = &AMDGPU::VReg_96RegClass; break;
01909     }
01910 
01911     unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
01912     MI->setDesc(TII->get(NewOpcode));
01913     MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
01914     MRI.setRegClass(VReg, RC);
01915     return;
01916   }
01917 
01918   // Replace unused atomics with the no return version.
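  // For example, an atomic instruction whose loaded result (value 0 of the
  // node) has no uses can be switched to its no-return variant, which does
  // not define a destination register; getAtomicNoRetOp returns -1 when no
  // such variant exists, and the instruction is then left unchanged.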
01919   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
01920   if (NoRetAtomicOp != -1) {
01921     if (!Node->hasAnyUseOfValue(0)) {
01922       MI->setDesc(TII->get(NoRetAtomicOp));
01923       MI->RemoveOperand(0);
01924     }
01925 
01926     return;
01927   }
01928 }
01929 
01930 MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
01931                                                 SelectionDAG &DAG) const {
01932 
01933   SDLoc DL(N);
01934   unsigned NewOpcode = N->getMachineOpcode();
01935 
01936   switch (N->getMachineOpcode()) {
01937   default: return N;
01938   case AMDGPU::S_LOAD_DWORD_IMM:
01939     NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
01940     // Fall-through
01941   case AMDGPU::S_LOAD_DWORDX2_SGPR:
01942     if (NewOpcode == N->getMachineOpcode()) {
01943       NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
01944     }
01945     // Fall-through
01946   case AMDGPU::S_LOAD_DWORDX4_IMM:
01947   case AMDGPU::S_LOAD_DWORDX4_SGPR: {
01948     if (NewOpcode == N->getMachineOpcode()) {
01949       NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
01950     }
01951     if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
01952       return N;
01953     }
01954     ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
01955     MachineSDNode *RSrc = DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL,
01956                                              MVT::i128,
01957                                              DAG.getConstant(0, MVT::i64));
01958 
01959     SmallVector<SDValue, 8> Ops;
01960     Ops.push_back(SDValue(RSrc, 0));
01961     Ops.push_back(N->getOperand(0));
01962     Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32));
01963 
01964     // Copy remaining operands so we keep any chain and glue nodes that follow
01965     // the normal operands.
01966     for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
01967       Ops.push_back(N->getOperand(I));
01968 
01969     return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
01970   }
01971   }
01972 }
01973 
01974 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
01975                                                const TargetRegisterClass *RC,
01976                                                unsigned Reg, EVT VT) const {
01977   SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
01978 
01979   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
01980                             cast<RegisterSDNode>(VReg)->getReg(), VT);
01981 }
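
The classification rule implemented by SITargetLowering::analyzeImmediate above is compact enough to restate on its own. The following standalone sketch is illustrative only and is not part of the LLVM sources; the name classifyImmediate is invented here, and a plain int32_t input stands in for the SDNode handling. Integers in [-16, 64] and the bit patterns of +/-0.5, +/-1.0, +/-2.0 and +/-4.0 count as inline immediates; every other 32-bit constant is a literal.

// Illustrative sketch; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>
#include <cstring>

// Return 0 for an inline immediate, otherwise the value itself is a literal.
// (The "-1 / not an immediate" case of analyzeImmediate does not apply here,
// because the input is already known to be a 32-bit constant.)
static int32_t classifyImmediate(int32_t I) {
  float F;
  std::memcpy(&F, &I, sizeof(F));      // view the same 32 bits as a float

  if ((I >= -16 && I <= 64) ||
      F == 0.5f || F == -0.5f ||
      F == 1.0f || F == -1.0f ||
      F == 2.0f || F == -2.0f ||
      F == 4.0f || F == -4.0f)
    return 0;                          // inline immediate
  return I;                            // literal immediate
}

int main() {
  assert(classifyImmediate(64) == 0);       // largest inline integer
  assert(classifyImmediate(65) == 65);      // one past the range: literal
  int32_t TwoBits;
  float Two = 2.0f;
  std::memcpy(&TwoBits, &Two, sizeof(Two));
  assert(classifyImmediate(TwoBits) == 0);  // bit pattern of 2.0f is inline
  return 0;
}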
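
foldImm and legalizeOperands above cooperate on a simple policy: inline immediates may always be folded into VSrc/SSrc operands, but at most one distinct 32-bit literal fits into an instruction. The sketch below models only that bookkeeping with invented names (LiteralSlot, tryFold); the SDNode plumbing, the e64 case, and the register-class checks are deliberately omitted.

// Illustrative sketch; not part of SIISelLowering.cpp.
#include <cassert>
#include <cstdint>

struct LiteralSlot {
  bool Used = false;
  int32_t Value = 0;

  // Returns true if an operand with classification 'Class' (-1 = not an
  // immediate, 0 = inline, otherwise the literal value) may be folded.
  bool tryFold(int32_t Class) {
    if (Class == -1)
      return false;              // not an immediate at all
    if (Class == 0)
      return true;               // inline immediates always fold
    if (Used)
      return Value == Class;     // only the same literal may repeat
    Used = true;                 // claim the single literal slot
    Value = Class;
    return true;
  }
};

int main() {
  LiteralSlot Slot;
  assert(Slot.tryFold(0));       // inline immediate: always OK
  assert(Slot.tryFold(1234));    // first literal: takes the slot
  assert(Slot.tryFold(1234));    // same literal again: OK
  assert(!Slot.tryFold(5678));   // a second distinct literal does not fit
  return 0;
}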
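
AdjustInstrPostInstrSelection above shrinks the destination register class of a MIMG instruction according to how many writemask (dmask) bits remain set. The sketch below restates just that mapping with invented names (DstClass, classForWritemask); a mask with zero or four bits set leaves the instruction alone, which the sketch approximates by reporting the original 128-bit class.

// Illustrative sketch; not part of SIISelLowering.cpp.
#include <cassert>

enum class DstClass { VReg_32, VReg_64, VReg_96, VReg_128 };

static DstClass classForWritemask(unsigned Dmask) {
  unsigned BitsSet = 0;
  for (unsigned i = 0; i < 4; ++i)      // popcount of the low four bits
    BitsSet += (Dmask >> i) & 1;

  switch (BitsSet) {
  case 1:  return DstClass::VReg_32;
  case 2:  return DstClass::VReg_64;
  case 3:  return DstClass::VReg_96;
  default: return DstClass::VReg_128;   // 0 or 4 bits: class is left as is
  }
}

int main() {
  assert(classForWritemask(0x1) == DstClass::VReg_32);   // x only
  assert(classForWritemask(0x5) == DstClass::VReg_64);   // x and z
  assert(classForWritemask(0xF) == DstClass::VReg_128);  // all four components
  return 0;
}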