LLVM API Documentation
00001 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 /// \file 00011 /// \brief This is the parent TargetLowering class for hardware code gen 00012 /// targets. 00013 // 00014 //===----------------------------------------------------------------------===// 00015 00016 #include "AMDGPUISelLowering.h" 00017 #include "AMDGPU.h" 00018 #include "AMDGPUFrameLowering.h" 00019 #include "AMDGPUIntrinsicInfo.h" 00020 #include "AMDGPURegisterInfo.h" 00021 #include "AMDGPUSubtarget.h" 00022 #include "R600MachineFunctionInfo.h" 00023 #include "SIMachineFunctionInfo.h" 00024 #include "llvm/CodeGen/CallingConvLower.h" 00025 #include "llvm/CodeGen/MachineFunction.h" 00026 #include "llvm/CodeGen/MachineRegisterInfo.h" 00027 #include "llvm/CodeGen/SelectionDAG.h" 00028 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" 00029 #include "llvm/IR/DataLayout.h" 00030 #include "llvm/IR/DiagnosticInfo.h" 00031 #include "llvm/IR/DiagnosticPrinter.h" 00032 00033 using namespace llvm; 00034 00035 namespace { 00036 00037 /// Diagnostic information for unimplemented or unsupported feature reporting. 00038 class DiagnosticInfoUnsupported : public DiagnosticInfo { 00039 private: 00040 const Twine &Description; 00041 const Function &Fn; 00042 00043 static int KindID; 00044 00045 static int getKindID() { 00046 if (KindID == 0) 00047 KindID = llvm::getNextAvailablePluginDiagnosticKind(); 00048 return KindID; 00049 } 00050 00051 public: 00052 DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, 00053 DiagnosticSeverity Severity = DS_Error) 00054 : DiagnosticInfo(getKindID(), Severity), 00055 Description(Desc), 00056 Fn(Fn) { } 00057 00058 const Function &getFunction() const { return Fn; } 00059 const Twine &getDescription() const { return Description; } 00060 00061 void print(DiagnosticPrinter &DP) const override { 00062 DP << "unsupported " << getDescription() << " in " << Fn.getName(); 00063 } 00064 00065 static bool classof(const DiagnosticInfo *DI) { 00066 return DI->getKind() == getKindID(); 00067 } 00068 }; 00069 00070 int DiagnosticInfoUnsupported::KindID = 0; 00071 } 00072 00073 00074 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, 00075 CCValAssign::LocInfo LocInfo, 00076 ISD::ArgFlagsTy ArgFlags, CCState &State) { 00077 unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), 00078 ArgFlags.getOrigAlign()); 00079 State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); 00080 00081 return true; 00082 } 00083 00084 #include "AMDGPUGenCallingConv.inc" 00085 00086 // Find a larger type to do a load / store of a vector with. 00087 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { 00088 unsigned StoreSize = VT.getStoreSizeInBits(); 00089 if (StoreSize <= 32) 00090 return EVT::getIntegerVT(Ctx, StoreSize); 00091 00092 assert(StoreSize % 32 == 0 && "Store size not a multiple of 32"); 00093 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 00094 } 00095 00096 // Type for a vector that will be loaded to. 00097 EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { 00098 unsigned StoreSize = VT.getStoreSizeInBits(); 00099 if (StoreSize <= 32) 00100 return EVT::getIntegerVT(Ctx, 32); 00101 00102 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); 00103 } 00104 00105 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : 00106 TargetLowering(TM, new TargetLoweringObjectFileELF()) { 00107 00108 Subtarget = &TM.getSubtarget<AMDGPUSubtarget>(); 00109 00110 setOperationAction(ISD::Constant, MVT::i32, Legal); 00111 setOperationAction(ISD::Constant, MVT::i64, Legal); 00112 setOperationAction(ISD::ConstantFP, MVT::f32, Legal); 00113 setOperationAction(ISD::ConstantFP, MVT::f64, Legal); 00114 00115 setOperationAction(ISD::BR_JT, MVT::Other, Expand); 00116 setOperationAction(ISD::BRIND, MVT::Other, Expand); 00117 00118 // We need to custom lower some of the intrinsics 00119 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 00120 00121 // Library functions. These default to Expand, but we have instructions 00122 // for them. 00123 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 00124 setOperationAction(ISD::FEXP2, MVT::f32, Legal); 00125 setOperationAction(ISD::FPOW, MVT::f32, Legal); 00126 setOperationAction(ISD::FLOG2, MVT::f32, Legal); 00127 setOperationAction(ISD::FABS, MVT::f32, Legal); 00128 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 00129 setOperationAction(ISD::FRINT, MVT::f32, Legal); 00130 setOperationAction(ISD::FROUND, MVT::f32, Legal); 00131 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 00132 00133 setOperationAction(ISD::FREM, MVT::f32, Custom); 00134 setOperationAction(ISD::FREM, MVT::f64, Custom); 00135 00136 // Lower floating point store/load to integer store/load to reduce the number 00137 // of patterns in tablegen. 00138 setOperationAction(ISD::STORE, MVT::f32, Promote); 00139 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); 00140 00141 setOperationAction(ISD::STORE, MVT::v2f32, Promote); 00142 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32); 00143 00144 setOperationAction(ISD::STORE, MVT::i64, Promote); 00145 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); 00146 00147 setOperationAction(ISD::STORE, MVT::v4f32, Promote); 00148 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); 00149 00150 setOperationAction(ISD::STORE, MVT::v8f32, Promote); 00151 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32); 00152 00153 setOperationAction(ISD::STORE, MVT::v16f32, Promote); 00154 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); 00155 00156 setOperationAction(ISD::STORE, MVT::f64, Promote); 00157 AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); 00158 00159 setOperationAction(ISD::STORE, MVT::v2f64, Promote); 00160 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); 00161 00162 // Custom lowering of vector stores is required for local address space 00163 // stores. 00164 setOperationAction(ISD::STORE, MVT::v4i32, Custom); 00165 // XXX: Native v2i32 local address space stores are possible, but not 00166 // currently implemented. 00167 setOperationAction(ISD::STORE, MVT::v2i32, Custom); 00168 00169 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); 00170 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); 00171 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); 00172 00173 // XXX: This can be change to Custom, once ExpandVectorStores can 00174 // handle 64-bit stores. 00175 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); 00176 00177 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 00178 setTruncStoreAction(MVT::i64, MVT::i8, Expand); 00179 setTruncStoreAction(MVT::i64, MVT::i1, Expand); 00180 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); 00181 setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); 00182 00183 00184 setOperationAction(ISD::LOAD, MVT::f32, Promote); 00185 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); 00186 00187 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 00188 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); 00189 00190 setOperationAction(ISD::LOAD, MVT::i64, Promote); 00191 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); 00192 00193 setOperationAction(ISD::LOAD, MVT::v4f32, Promote); 00194 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); 00195 00196 setOperationAction(ISD::LOAD, MVT::v8f32, Promote); 00197 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); 00198 00199 setOperationAction(ISD::LOAD, MVT::v16f32, Promote); 00200 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); 00201 00202 setOperationAction(ISD::LOAD, MVT::f64, Promote); 00203 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); 00204 00205 setOperationAction(ISD::LOAD, MVT::v2f64, Promote); 00206 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); 00207 00208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 00209 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); 00210 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom); 00211 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom); 00212 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom); 00213 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom); 00214 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom); 00215 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom); 00216 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); 00217 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); 00218 00219 setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand); 00220 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand); 00221 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand); 00222 setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand); 00223 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand); 00224 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand); 00225 setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand); 00226 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand); 00227 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand); 00228 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand); 00229 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand); 00230 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand); 00231 00232 setOperationAction(ISD::BR_CC, MVT::i1, Expand); 00233 00234 if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { 00235 setOperationAction(ISD::FCEIL, MVT::f64, Custom); 00236 setOperationAction(ISD::FTRUNC, MVT::f64, Custom); 00237 setOperationAction(ISD::FRINT, MVT::f64, Custom); 00238 setOperationAction(ISD::FFLOOR, MVT::f64, Custom); 00239 } 00240 00241 if (!Subtarget->hasBFI()) { 00242 // fcopysign can be done in a single instruction with BFI. 00243 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 00244 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 00245 } 00246 00247 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 00248 00249 setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand); 00250 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 00251 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 00252 00253 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; 00254 for (MVT VT : ScalarIntVTs) { 00255 setOperationAction(ISD::SREM, VT, Expand); 00256 setOperationAction(ISD::SDIV, VT, Expand); 00257 00258 // GPU does not have divrem function for signed or unsigned. 00259 setOperationAction(ISD::SDIVREM, VT, Custom); 00260 setOperationAction(ISD::UDIVREM, VT, Custom); 00261 00262 // GPU does not have [S|U]MUL_LOHI functions as a single instruction. 00263 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 00264 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 00265 00266 setOperationAction(ISD::BSWAP, VT, Expand); 00267 setOperationAction(ISD::CTTZ, VT, Expand); 00268 setOperationAction(ISD::CTLZ, VT, Expand); 00269 } 00270 00271 if (!Subtarget->hasBCNT(32)) 00272 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 00273 00274 if (!Subtarget->hasBCNT(64)) 00275 setOperationAction(ISD::CTPOP, MVT::i64, Expand); 00276 00277 // The hardware supports 32-bit ROTR, but not ROTL. 00278 setOperationAction(ISD::ROTL, MVT::i32, Expand); 00279 setOperationAction(ISD::ROTL, MVT::i64, Expand); 00280 setOperationAction(ISD::ROTR, MVT::i64, Expand); 00281 00282 setOperationAction(ISD::MUL, MVT::i64, Expand); 00283 setOperationAction(ISD::MULHU, MVT::i64, Expand); 00284 setOperationAction(ISD::MULHS, MVT::i64, Expand); 00285 setOperationAction(ISD::UDIV, MVT::i32, Expand); 00286 setOperationAction(ISD::UREM, MVT::i32, Expand); 00287 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); 00288 setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); 00289 00290 if (!Subtarget->hasFFBH()) 00291 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); 00292 00293 if (!Subtarget->hasFFBL()) 00294 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); 00295 00296 static const MVT::SimpleValueType VectorIntTypes[] = { 00297 MVT::v2i32, MVT::v4i32 00298 }; 00299 00300 for (MVT VT : VectorIntTypes) { 00301 // Expand the following operations for the current type by default. 00302 setOperationAction(ISD::ADD, VT, Expand); 00303 setOperationAction(ISD::AND, VT, Expand); 00304 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 00305 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 00306 setOperationAction(ISD::MUL, VT, Expand); 00307 setOperationAction(ISD::OR, VT, Expand); 00308 setOperationAction(ISD::SHL, VT, Expand); 00309 setOperationAction(ISD::SRA, VT, Expand); 00310 setOperationAction(ISD::SRL, VT, Expand); 00311 setOperationAction(ISD::ROTL, VT, Expand); 00312 setOperationAction(ISD::ROTR, VT, Expand); 00313 setOperationAction(ISD::SUB, VT, Expand); 00314 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 00315 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 00316 setOperationAction(ISD::SDIV, VT, Expand); 00317 setOperationAction(ISD::UDIV, VT, Expand); 00318 setOperationAction(ISD::SREM, VT, Expand); 00319 setOperationAction(ISD::UREM, VT, Expand); 00320 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 00321 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 00322 setOperationAction(ISD::SDIVREM, VT, Custom); 00323 setOperationAction(ISD::UDIVREM, VT, Custom); 00324 setOperationAction(ISD::ADDC, VT, Expand); 00325 setOperationAction(ISD::SUBC, VT, Expand); 00326 setOperationAction(ISD::ADDE, VT, Expand); 00327 setOperationAction(ISD::SUBE, VT, Expand); 00328 setOperationAction(ISD::SELECT, VT, Expand); 00329 setOperationAction(ISD::VSELECT, VT, Expand); 00330 setOperationAction(ISD::SELECT_CC, VT, Expand); 00331 setOperationAction(ISD::XOR, VT, Expand); 00332 setOperationAction(ISD::BSWAP, VT, Expand); 00333 setOperationAction(ISD::CTPOP, VT, Expand); 00334 setOperationAction(ISD::CTTZ, VT, Expand); 00335 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); 00336 setOperationAction(ISD::CTLZ, VT, Expand); 00337 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); 00338 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 00339 } 00340 00341 static const MVT::SimpleValueType FloatVectorTypes[] = { 00342 MVT::v2f32, MVT::v4f32 00343 }; 00344 00345 for (MVT VT : FloatVectorTypes) { 00346 setOperationAction(ISD::FABS, VT, Expand); 00347 setOperationAction(ISD::FADD, VT, Expand); 00348 setOperationAction(ISD::FCEIL, VT, Expand); 00349 setOperationAction(ISD::FCOS, VT, Expand); 00350 setOperationAction(ISD::FDIV, VT, Expand); 00351 setOperationAction(ISD::FEXP2, VT, Expand); 00352 setOperationAction(ISD::FLOG2, VT, Expand); 00353 setOperationAction(ISD::FREM, VT, Expand); 00354 setOperationAction(ISD::FPOW, VT, Expand); 00355 setOperationAction(ISD::FFLOOR, VT, Expand); 00356 setOperationAction(ISD::FTRUNC, VT, Expand); 00357 setOperationAction(ISD::FMUL, VT, Expand); 00358 setOperationAction(ISD::FMA, VT, Expand); 00359 setOperationAction(ISD::FRINT, VT, Expand); 00360 setOperationAction(ISD::FNEARBYINT, VT, Expand); 00361 setOperationAction(ISD::FSQRT, VT, Expand); 00362 setOperationAction(ISD::FSIN, VT, Expand); 00363 setOperationAction(ISD::FSUB, VT, Expand); 00364 setOperationAction(ISD::FNEG, VT, Expand); 00365 setOperationAction(ISD::SELECT, VT, Expand); 00366 setOperationAction(ISD::VSELECT, VT, Expand); 00367 setOperationAction(ISD::SELECT_CC, VT, Expand); 00368 setOperationAction(ISD::FCOPYSIGN, VT, Expand); 00369 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); 00370 } 00371 00372 setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); 00373 setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); 00374 00375 setTargetDAGCombine(ISD::MUL); 00376 setTargetDAGCombine(ISD::SELECT_CC); 00377 setTargetDAGCombine(ISD::STORE); 00378 00379 setSchedulingPreference(Sched::RegPressure); 00380 setJumpIsExpensive(true); 00381 00382 // SI at least has hardware support for floating point exceptions, but no way 00383 // of using or handling them is implemented. They are also optional in OpenCL 00384 // (Section 7.3) 00385 setHasFloatingPointExceptions(false); 00386 00387 setSelectIsExpensive(false); 00388 PredictableSelectIsExpensive = false; 00389 00390 // There are no integer divide instructions, and these expand to a pretty 00391 // large sequence of instructions. 00392 setIntDivIsCheap(false); 00393 setPow2SDivIsCheap(false); 00394 00395 // TODO: Investigate this when 64-bit divides are implemented. 00396 addBypassSlowDiv(64, 32); 00397 00398 // FIXME: Need to really handle these. 00399 MaxStoresPerMemcpy = 4096; 00400 MaxStoresPerMemmove = 4096; 00401 MaxStoresPerMemset = 4096; 00402 } 00403 00404 //===----------------------------------------------------------------------===// 00405 // Target Information 00406 //===----------------------------------------------------------------------===// 00407 00408 MVT AMDGPUTargetLowering::getVectorIdxTy() const { 00409 return MVT::i32; 00410 } 00411 00412 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const { 00413 return true; 00414 } 00415 00416 // The backend supports 32 and 64 bit floating point immediates. 00417 // FIXME: Why are we reporting vectors of FP immediates as legal? 00418 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 00419 EVT ScalarVT = VT.getScalarType(); 00420 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64); 00421 } 00422 00423 // We don't want to shrink f64 / f32 constants. 00424 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const { 00425 EVT ScalarVT = VT.getScalarType(); 00426 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64); 00427 } 00428 00429 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, 00430 EVT CastTy) const { 00431 if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) 00432 return true; 00433 00434 unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); 00435 unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); 00436 00437 return ((LScalarSize <= CastScalarSize) || 00438 (CastScalarSize >= 32) || 00439 (LScalarSize < 32)); 00440 } 00441 00442 //===---------------------------------------------------------------------===// 00443 // Target Properties 00444 //===---------------------------------------------------------------------===// 00445 00446 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const { 00447 assert(VT.isFloatingPoint()); 00448 return VT == MVT::f32 || VT == MVT::f64; 00449 } 00450 00451 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const { 00452 assert(VT.isFloatingPoint()); 00453 return VT == MVT::f32 || VT == MVT::f64; 00454 } 00455 00456 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { 00457 // Truncate is just accessing a subregister. 00458 return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); 00459 } 00460 00461 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const { 00462 // Truncate is just accessing a subregister. 00463 return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() && 00464 (Dest->getPrimitiveSizeInBits() % 32 == 0); 00465 } 00466 00467 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const { 00468 const DataLayout *DL = getDataLayout(); 00469 unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType()); 00470 unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType()); 00471 00472 return SrcSize == 32 && DestSize == 64; 00473 } 00474 00475 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const { 00476 // Any register load of a 64-bit value really requires 2 32-bit moves. For all 00477 // practical purposes, the extra mov 0 to load a 64-bit is free. As used, 00478 // this will enable reducing 64-bit operations the 32-bit, which is always 00479 // good. 00480 return Src == MVT::i32 && Dest == MVT::i64; 00481 } 00482 00483 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 00484 return isZExtFree(Val.getValueType(), VT2); 00485 } 00486 00487 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { 00488 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a 00489 // limited number of native 64-bit operations. Shrinking an operation to fit 00490 // in a single 32-bit register should always be helpful. As currently used, 00491 // this is much less general than the name suggests, and is only used in 00492 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is 00493 // not profitable, and may actually be harmful. 00494 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32; 00495 } 00496 00497 //===---------------------------------------------------------------------===// 00498 // TargetLowering Callbacks 00499 //===---------------------------------------------------------------------===// 00500 00501 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State, 00502 const SmallVectorImpl<ISD::InputArg> &Ins) const { 00503 00504 State.AnalyzeFormalArguments(Ins, CC_AMDGPU); 00505 } 00506 00507 SDValue AMDGPUTargetLowering::LowerReturn( 00508 SDValue Chain, 00509 CallingConv::ID CallConv, 00510 bool isVarArg, 00511 const SmallVectorImpl<ISD::OutputArg> &Outs, 00512 const SmallVectorImpl<SDValue> &OutVals, 00513 SDLoc DL, SelectionDAG &DAG) const { 00514 return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); 00515 } 00516 00517 //===---------------------------------------------------------------------===// 00518 // Target specific lowering 00519 //===---------------------------------------------------------------------===// 00520 00521 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, 00522 SmallVectorImpl<SDValue> &InVals) const { 00523 SDValue Callee = CLI.Callee; 00524 SelectionDAG &DAG = CLI.DAG; 00525 00526 const Function &Fn = *DAG.getMachineFunction().getFunction(); 00527 00528 StringRef FuncName("<unknown>"); 00529 00530 if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) 00531 FuncName = G->getSymbol(); 00532 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 00533 FuncName = G->getGlobal()->getName(); 00534 00535 DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); 00536 DAG.getContext()->diagnose(NoCalls); 00537 return SDValue(); 00538 } 00539 00540 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, 00541 SelectionDAG &DAG) const { 00542 switch (Op.getOpcode()) { 00543 default: 00544 Op.getNode()->dump(); 00545 llvm_unreachable("Custom lowering code for this" 00546 "instruction is not implemented yet!"); 00547 break; 00548 case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); 00549 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 00550 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); 00551 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 00552 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 00553 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); 00554 case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); 00555 case ISD::FREM: return LowerFREM(Op, DAG); 00556 case ISD::FCEIL: return LowerFCEIL(Op, DAG); 00557 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); 00558 case ISD::FRINT: return LowerFRINT(Op, DAG); 00559 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); 00560 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); 00561 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 00562 } 00563 return Op; 00564 } 00565 00566 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, 00567 SmallVectorImpl<SDValue> &Results, 00568 SelectionDAG &DAG) const { 00569 switch (N->getOpcode()) { 00570 case ISD::SIGN_EXTEND_INREG: 00571 // Different parts of legalization seem to interpret which type of 00572 // sign_extend_inreg is the one to check for custom lowering. The extended 00573 // from type is what really matters, but some places check for custom 00574 // lowering of the result type. This results in trying to use 00575 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do 00576 // nothing here and let the illegal result integer be handled normally. 00577 return; 00578 case ISD::LOAD: { 00579 SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); 00580 if (!Node) 00581 return; 00582 00583 Results.push_back(SDValue(Node, 0)); 00584 Results.push_back(SDValue(Node, 1)); 00585 // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode 00586 // function 00587 DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); 00588 return; 00589 } 00590 case ISD::STORE: { 00591 SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); 00592 if (Lowered.getNode()) 00593 Results.push_back(Lowered); 00594 return; 00595 } 00596 default: 00597 return; 00598 } 00599 } 00600 00601 // FIXME: This implements accesses to initialized globals in the constant 00602 // address space by copying them to private and accessing that. It does not 00603 // properly handle illegal types or vectors. The private vector loads are not 00604 // scalarized, and the illegal scalars hit an assertion. This technique will not 00605 // work well with large initializers, and this should eventually be 00606 // removed. Initialized globals should be placed into a data section that the 00607 // runtime will load into a buffer before the kernel is executed. Uses of the 00608 // global need to be replaced with a pointer loaded from an implicit kernel 00609 // argument into this buffer holding the copy of the data, which will remove the 00610 // need for any of this. 00611 SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, 00612 const GlobalValue *GV, 00613 const SDValue &InitPtr, 00614 SDValue Chain, 00615 SelectionDAG &DAG) const { 00616 const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); 00617 SDLoc DL(InitPtr); 00618 Type *InitTy = Init->getType(); 00619 00620 if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) { 00621 EVT VT = EVT::getEVT(InitTy); 00622 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 00623 return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr, 00624 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 00625 TD->getPrefTypeAlignment(InitTy)); 00626 } 00627 00628 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { 00629 EVT VT = EVT::getEVT(CFP->getType()); 00630 PointerType *PtrTy = PointerType::get(CFP->getType(), 0); 00631 return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr, 00632 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 00633 TD->getPrefTypeAlignment(CFP->getType())); 00634 } 00635 00636 if (StructType *ST = dyn_cast<StructType>(InitTy)) { 00637 const StructLayout *SL = TD->getStructLayout(ST); 00638 00639 EVT PtrVT = InitPtr.getValueType(); 00640 SmallVector<SDValue, 8> Chains; 00641 00642 for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) { 00643 SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT); 00644 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 00645 00646 Constant *Elt = Init->getAggregateElement(I); 00647 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 00648 } 00649 00650 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 00651 } 00652 00653 if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) { 00654 EVT PtrVT = InitPtr.getValueType(); 00655 00656 unsigned NumElements; 00657 if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy)) 00658 NumElements = AT->getNumElements(); 00659 else if (VectorType *VT = dyn_cast<VectorType>(SeqTy)) 00660 NumElements = VT->getNumElements(); 00661 else 00662 llvm_unreachable("Unexpected type"); 00663 00664 unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType()); 00665 SmallVector<SDValue, 8> Chains; 00666 for (unsigned i = 0; i < NumElements; ++i) { 00667 SDValue Offset = DAG.getConstant(i * EltSize, PtrVT); 00668 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset); 00669 00670 Constant *Elt = Init->getAggregateElement(i); 00671 Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG)); 00672 } 00673 00674 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 00675 } 00676 00677 if (isa<UndefValue>(Init)) { 00678 EVT VT = EVT::getEVT(InitTy); 00679 PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); 00680 return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, 00681 MachinePointerInfo(UndefValue::get(PtrTy)), false, false, 00682 TD->getPrefTypeAlignment(InitTy)); 00683 } 00684 00685 Init->dump(); 00686 llvm_unreachable("Unhandled constant initializer"); 00687 } 00688 00689 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, 00690 SDValue Op, 00691 SelectionDAG &DAG) const { 00692 00693 const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout(); 00694 GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op); 00695 const GlobalValue *GV = G->getGlobal(); 00696 00697 switch (G->getAddressSpace()) { 00698 default: llvm_unreachable("Global Address lowering not implemented for this " 00699 "address space"); 00700 case AMDGPUAS::LOCAL_ADDRESS: { 00701 // XXX: What does the value of G->getOffset() mean? 00702 assert(G->getOffset() == 0 && 00703 "Do not know what to do with an non-zero offset"); 00704 00705 unsigned Offset; 00706 if (MFI->LocalMemoryObjects.count(GV) == 0) { 00707 uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType()); 00708 Offset = MFI->LDSSize; 00709 MFI->LocalMemoryObjects[GV] = Offset; 00710 // XXX: Account for alignment? 00711 MFI->LDSSize += Size; 00712 } else { 00713 Offset = MFI->LocalMemoryObjects[GV]; 00714 } 00715 00716 return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS)); 00717 } 00718 case AMDGPUAS::CONSTANT_ADDRESS: { 00719 MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); 00720 Type *EltType = GV->getType()->getElementType(); 00721 unsigned Size = TD->getTypeAllocSize(EltType); 00722 unsigned Alignment = TD->getPrefTypeAlignment(EltType); 00723 00724 MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS); 00725 MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS); 00726 00727 int FI = FrameInfo->CreateStackObject(Size, Alignment, false); 00728 SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); 00729 00730 const GlobalVariable *Var = cast<GlobalVariable>(GV); 00731 if (!Var->hasInitializer()) { 00732 // This has no use, but bugpoint will hit it. 00733 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 00734 } 00735 00736 const Constant *Init = Var->getInitializer(); 00737 SmallVector<SDNode*, 8> WorkList; 00738 00739 for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), 00740 E = DAG.getEntryNode()->use_end(); I != E; ++I) { 00741 if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) 00742 continue; 00743 WorkList.push_back(*I); 00744 } 00745 SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); 00746 for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), 00747 E = WorkList.end(); I != E; ++I) { 00748 SmallVector<SDValue, 8> Ops; 00749 Ops.push_back(Chain); 00750 for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { 00751 Ops.push_back((*I)->getOperand(i)); 00752 } 00753 DAG.UpdateNodeOperands(*I, Ops); 00754 } 00755 return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); 00756 } 00757 } 00758 } 00759 00760 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, 00761 SelectionDAG &DAG) const { 00762 SmallVector<SDValue, 8> Args; 00763 SDValue A = Op.getOperand(0); 00764 SDValue B = Op.getOperand(1); 00765 00766 DAG.ExtractVectorElements(A, Args); 00767 DAG.ExtractVectorElements(B, Args); 00768 00769 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 00770 } 00771 00772 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, 00773 SelectionDAG &DAG) const { 00774 00775 SmallVector<SDValue, 8> Args; 00776 unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 00777 EVT VT = Op.getValueType(); 00778 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, 00779 VT.getVectorNumElements()); 00780 00781 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); 00782 } 00783 00784 SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, 00785 SelectionDAG &DAG) const { 00786 00787 MachineFunction &MF = DAG.getMachineFunction(); 00788 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( 00789 getTargetMachine().getSubtargetImpl()->getFrameLowering()); 00790 00791 FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); 00792 00793 unsigned FrameIndex = FIN->getIndex(); 00794 unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); 00795 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), 00796 Op.getValueType()); 00797 } 00798 00799 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, 00800 SelectionDAG &DAG) const { 00801 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 00802 SDLoc DL(Op); 00803 EVT VT = Op.getValueType(); 00804 00805 switch (IntrinsicID) { 00806 default: return Op; 00807 case AMDGPUIntrinsic::AMDGPU_abs: 00808 case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. 00809 return LowerIntrinsicIABS(Op, DAG); 00810 case AMDGPUIntrinsic::AMDGPU_lrp: 00811 return LowerIntrinsicLRP(Op, DAG); 00812 case AMDGPUIntrinsic::AMDGPU_fract: 00813 case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 00814 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); 00815 00816 case AMDGPUIntrinsic::AMDGPU_clamp: 00817 case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. 00818 return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, 00819 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 00820 00821 case Intrinsic::AMDGPU_div_scale: { 00822 // 3rd parameter required to be a constant. 00823 const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); 00824 if (!Param) 00825 return DAG.getUNDEF(VT); 00826 00827 // Translate to the operands expected by the machine instruction. The 00828 // first parameter must be the same as the first instruction. 00829 SDValue Numerator = Op.getOperand(1); 00830 SDValue Denominator = Op.getOperand(2); 00831 SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; 00832 00833 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, 00834 Denominator, Numerator); 00835 } 00836 00837 case Intrinsic::AMDGPU_div_fmas: 00838 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, 00839 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 00840 00841 case Intrinsic::AMDGPU_div_fixup: 00842 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, 00843 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 00844 00845 case Intrinsic::AMDGPU_trig_preop: 00846 return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, 00847 Op.getOperand(1), Op.getOperand(2)); 00848 00849 case Intrinsic::AMDGPU_rcp: 00850 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); 00851 00852 case Intrinsic::AMDGPU_rsq: 00853 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); 00854 00855 case AMDGPUIntrinsic::AMDGPU_legacy_rsq: 00856 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); 00857 00858 case Intrinsic::AMDGPU_rsq_clamped: 00859 return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); 00860 00861 case Intrinsic::AMDGPU_ldexp: 00862 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), 00863 Op.getOperand(2)); 00864 00865 case AMDGPUIntrinsic::AMDGPU_imax: 00866 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), 00867 Op.getOperand(2)); 00868 case AMDGPUIntrinsic::AMDGPU_umax: 00869 return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), 00870 Op.getOperand(2)); 00871 case AMDGPUIntrinsic::AMDGPU_imin: 00872 return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), 00873 Op.getOperand(2)); 00874 case AMDGPUIntrinsic::AMDGPU_umin: 00875 return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), 00876 Op.getOperand(2)); 00877 00878 case AMDGPUIntrinsic::AMDGPU_umul24: 00879 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, 00880 Op.getOperand(1), Op.getOperand(2)); 00881 00882 case AMDGPUIntrinsic::AMDGPU_imul24: 00883 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, 00884 Op.getOperand(1), Op.getOperand(2)); 00885 00886 case AMDGPUIntrinsic::AMDGPU_umad24: 00887 return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, 00888 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 00889 00890 case AMDGPUIntrinsic::AMDGPU_imad24: 00891 return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, 00892 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 00893 00894 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: 00895 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); 00896 00897 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: 00898 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); 00899 00900 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: 00901 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); 00902 00903 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: 00904 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); 00905 00906 case AMDGPUIntrinsic::AMDGPU_bfe_i32: 00907 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, 00908 Op.getOperand(1), 00909 Op.getOperand(2), 00910 Op.getOperand(3)); 00911 00912 case AMDGPUIntrinsic::AMDGPU_bfe_u32: 00913 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, 00914 Op.getOperand(1), 00915 Op.getOperand(2), 00916 Op.getOperand(3)); 00917 00918 case AMDGPUIntrinsic::AMDGPU_bfi: 00919 return DAG.getNode(AMDGPUISD::BFI, DL, VT, 00920 Op.getOperand(1), 00921 Op.getOperand(2), 00922 Op.getOperand(3)); 00923 00924 case AMDGPUIntrinsic::AMDGPU_bfm: 00925 return DAG.getNode(AMDGPUISD::BFM, DL, VT, 00926 Op.getOperand(1), 00927 Op.getOperand(2)); 00928 00929 case AMDGPUIntrinsic::AMDGPU_brev: 00930 return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); 00931 00932 case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. 00933 return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); 00934 00935 case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. 00936 return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); 00937 case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. 00938 return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); 00939 } 00940 } 00941 00942 ///IABS(a) = SMAX(sub(0, a), a) 00943 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, 00944 SelectionDAG &DAG) const { 00945 SDLoc DL(Op); 00946 EVT VT = Op.getValueType(); 00947 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 00948 Op.getOperand(1)); 00949 00950 return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); 00951 } 00952 00953 /// Linear Interpolation 00954 /// LRP(a, b, c) = muladd(a, b, (1 - a) * c) 00955 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, 00956 SelectionDAG &DAG) const { 00957 SDLoc DL(Op); 00958 EVT VT = Op.getValueType(); 00959 SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, 00960 DAG.getConstantFP(1.0f, MVT::f32), 00961 Op.getOperand(1)); 00962 SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, 00963 Op.getOperand(3)); 00964 return DAG.getNode(ISD::FADD, DL, VT, 00965 DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), 00966 OneSubAC); 00967 } 00968 00969 /// \brief Generate Min/Max node 00970 SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, 00971 SelectionDAG &DAG) const { 00972 SDLoc DL(N); 00973 EVT VT = N->getValueType(0); 00974 00975 SDValue LHS = N->getOperand(0); 00976 SDValue RHS = N->getOperand(1); 00977 SDValue True = N->getOperand(2); 00978 SDValue False = N->getOperand(3); 00979 SDValue CC = N->getOperand(4); 00980 00981 if (VT != MVT::f32 || 00982 !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { 00983 return SDValue(); 00984 } 00985 00986 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 00987 switch (CCOpcode) { 00988 case ISD::SETOEQ: 00989 case ISD::SETONE: 00990 case ISD::SETUNE: 00991 case ISD::SETNE: 00992 case ISD::SETUEQ: 00993 case ISD::SETEQ: 00994 case ISD::SETFALSE: 00995 case ISD::SETFALSE2: 00996 case ISD::SETTRUE: 00997 case ISD::SETTRUE2: 00998 case ISD::SETUO: 00999 case ISD::SETO: 01000 llvm_unreachable("Operation should already be optimised!"); 01001 case ISD::SETULE: 01002 case ISD::SETULT: 01003 case ISD::SETOLE: 01004 case ISD::SETOLT: 01005 case ISD::SETLE: 01006 case ISD::SETLT: { 01007 unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; 01008 return DAG.getNode(Opc, DL, VT, LHS, RHS); 01009 } 01010 case ISD::SETGT: 01011 case ISD::SETGE: 01012 case ISD::SETUGE: 01013 case ISD::SETOGE: 01014 case ISD::SETUGT: 01015 case ISD::SETOGT: { 01016 unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; 01017 return DAG.getNode(Opc, DL, VT, LHS, RHS); 01018 } 01019 case ISD::SETCC_INVALID: 01020 llvm_unreachable("Invalid setcc condcode!"); 01021 } 01022 return SDValue(); 01023 } 01024 01025 SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, 01026 SelectionDAG &DAG) const { 01027 LoadSDNode *Load = cast<LoadSDNode>(Op); 01028 EVT MemVT = Load->getMemoryVT(); 01029 EVT MemEltVT = MemVT.getVectorElementType(); 01030 01031 EVT LoadVT = Op.getValueType(); 01032 EVT EltVT = LoadVT.getVectorElementType(); 01033 EVT PtrVT = Load->getBasePtr().getValueType(); 01034 01035 unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); 01036 SmallVector<SDValue, 8> Loads; 01037 SmallVector<SDValue, 8> Chains; 01038 01039 SDLoc SL(Op); 01040 unsigned MemEltSize = MemEltVT.getStoreSize(); 01041 MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); 01042 01043 for (unsigned i = 0; i < NumElts; ++i) { 01044 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), 01045 DAG.getConstant(i * MemEltSize, PtrVT)); 01046 01047 SDValue NewLoad 01048 = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, 01049 Load->getChain(), Ptr, 01050 SrcValue.getWithOffset(i * MemEltSize), 01051 MemEltVT, Load->isVolatile(), Load->isNonTemporal(), 01052 Load->isInvariant(), Load->getAlignment()); 01053 Loads.push_back(NewLoad.getValue(0)); 01054 Chains.push_back(NewLoad.getValue(1)); 01055 } 01056 01057 SDValue Ops[] = { 01058 DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), 01059 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) 01060 }; 01061 01062 return DAG.getMergeValues(Ops, SL); 01063 } 01064 01065 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, 01066 SelectionDAG &DAG) const { 01067 EVT VT = Op.getValueType(); 01068 01069 // If this is a 2 element vector, we really want to scalarize and not create 01070 // weird 1 element vectors. 01071 if (VT.getVectorNumElements() == 2) 01072 return ScalarizeVectorLoad(Op, DAG); 01073 01074 LoadSDNode *Load = cast<LoadSDNode>(Op); 01075 SDValue BasePtr = Load->getBasePtr(); 01076 EVT PtrVT = BasePtr.getValueType(); 01077 EVT MemVT = Load->getMemoryVT(); 01078 SDLoc SL(Op); 01079 MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); 01080 01081 EVT LoVT, HiVT; 01082 EVT LoMemVT, HiMemVT; 01083 SDValue Lo, Hi; 01084 01085 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); 01086 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); 01087 std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); 01088 SDValue LoLoad 01089 = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, 01090 Load->getChain(), BasePtr, 01091 SrcValue, 01092 LoMemVT, Load->isVolatile(), Load->isNonTemporal(), 01093 Load->isInvariant(), Load->getAlignment()); 01094 01095 SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 01096 DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); 01097 01098 SDValue HiLoad 01099 = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, 01100 Load->getChain(), HiPtr, 01101 SrcValue.getWithOffset(LoMemVT.getStoreSize()), 01102 HiMemVT, Load->isVolatile(), Load->isNonTemporal(), 01103 Load->isInvariant(), Load->getAlignment()); 01104 01105 SDValue Ops[] = { 01106 DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), 01107 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, 01108 LoLoad.getValue(1), HiLoad.getValue(1)) 01109 }; 01110 01111 return DAG.getMergeValues(Ops, SL); 01112 } 01113 01114 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, 01115 SelectionDAG &DAG) const { 01116 StoreSDNode *Store = cast<StoreSDNode>(Op); 01117 EVT MemVT = Store->getMemoryVT(); 01118 unsigned MemBits = MemVT.getSizeInBits(); 01119 01120 // Byte stores are really expensive, so if possible, try to pack 32-bit vector 01121 // truncating store into an i32 store. 01122 // XXX: We could also handle optimize other vector bitwidths. 01123 if (!MemVT.isVector() || MemBits > 32) { 01124 return SDValue(); 01125 } 01126 01127 SDLoc DL(Op); 01128 SDValue Value = Store->getValue(); 01129 EVT VT = Value.getValueType(); 01130 EVT ElemVT = VT.getVectorElementType(); 01131 SDValue Ptr = Store->getBasePtr(); 01132 EVT MemEltVT = MemVT.getVectorElementType(); 01133 unsigned MemEltBits = MemEltVT.getSizeInBits(); 01134 unsigned MemNumElements = MemVT.getVectorNumElements(); 01135 unsigned PackedSize = MemVT.getStoreSizeInBits(); 01136 SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32); 01137 01138 assert(Value.getValueType().getScalarSizeInBits() >= 32); 01139 01140 SDValue PackedValue; 01141 for (unsigned i = 0; i < MemNumElements; ++i) { 01142 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value, 01143 DAG.getConstant(i, MVT::i32)); 01144 Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32); 01145 Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg 01146 01147 SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32); 01148 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift); 01149 01150 if (i == 0) { 01151 PackedValue = Elt; 01152 } else { 01153 PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt); 01154 } 01155 } 01156 01157 if (PackedSize < 32) { 01158 EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); 01159 return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, 01160 Store->getMemOperand()->getPointerInfo(), 01161 PackedVT, 01162 Store->isNonTemporal(), Store->isVolatile(), 01163 Store->getAlignment()); 01164 } 01165 01166 return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, 01167 Store->getMemOperand()->getPointerInfo(), 01168 Store->isVolatile(), Store->isNonTemporal(), 01169 Store->getAlignment()); 01170 } 01171 01172 SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, 01173 SelectionDAG &DAG) const { 01174 StoreSDNode *Store = cast<StoreSDNode>(Op); 01175 EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); 01176 EVT EltVT = Store->getValue().getValueType().getVectorElementType(); 01177 EVT PtrVT = Store->getBasePtr().getValueType(); 01178 unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); 01179 SDLoc SL(Op); 01180 01181 SmallVector<SDValue, 8> Chains; 01182 01183 unsigned EltSize = MemEltVT.getStoreSize(); 01184 MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); 01185 01186 for (unsigned i = 0, e = NumElts; i != e; ++i) { 01187 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, 01188 Store->getValue(), 01189 DAG.getConstant(i, MVT::i32)); 01190 01191 SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT); 01192 SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); 01193 SDValue NewStore = 01194 DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, 01195 SrcValue.getWithOffset(i * EltSize), 01196 MemEltVT, Store->isNonTemporal(), Store->isVolatile(), 01197 Store->getAlignment()); 01198 Chains.push_back(NewStore); 01199 } 01200 01201 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); 01202 } 01203 01204 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, 01205 SelectionDAG &DAG) const { 01206 StoreSDNode *Store = cast<StoreSDNode>(Op); 01207 SDValue Val = Store->getValue(); 01208 EVT VT = Val.getValueType(); 01209 01210 // If this is a 2 element vector, we really want to scalarize and not create 01211 // weird 1 element vectors. 01212 if (VT.getVectorNumElements() == 2) 01213 return ScalarizeVectorStore(Op, DAG); 01214 01215 EVT MemVT = Store->getMemoryVT(); 01216 SDValue Chain = Store->getChain(); 01217 SDValue BasePtr = Store->getBasePtr(); 01218 SDLoc SL(Op); 01219 01220 EVT LoVT, HiVT; 01221 EVT LoMemVT, HiMemVT; 01222 SDValue Lo, Hi; 01223 01224 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); 01225 std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); 01226 std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT); 01227 01228 EVT PtrVT = BasePtr.getValueType(); 01229 SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, 01230 DAG.getConstant(LoMemVT.getStoreSize(), PtrVT)); 01231 01232 MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); 01233 SDValue LoStore 01234 = DAG.getTruncStore(Chain, SL, Lo, 01235 BasePtr, 01236 SrcValue, 01237 LoMemVT, 01238 Store->isNonTemporal(), 01239 Store->isVolatile(), 01240 Store->getAlignment()); 01241 SDValue HiStore 01242 = DAG.getTruncStore(Chain, SL, Hi, 01243 HiPtr, 01244 SrcValue.getWithOffset(LoMemVT.getStoreSize()), 01245 HiMemVT, 01246 Store->isNonTemporal(), 01247 Store->isVolatile(), 01248 Store->getAlignment()); 01249 01250 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); 01251 } 01252 01253 01254 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 01255 SDLoc DL(Op); 01256 LoadSDNode *Load = cast<LoadSDNode>(Op); 01257 ISD::LoadExtType ExtType = Load->getExtensionType(); 01258 EVT VT = Op.getValueType(); 01259 EVT MemVT = Load->getMemoryVT(); 01260 01261 if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) { 01262 // We can do the extload to 32-bits, and then need to separately extend to 01263 // 64-bits. 01264 01265 SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32, 01266 Load->getChain(), 01267 Load->getBasePtr(), 01268 MemVT, 01269 Load->getMemOperand()); 01270 01271 SDValue Ops[] = { 01272 DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32), 01273 ExtLoad32.getValue(1) 01274 }; 01275 01276 return DAG.getMergeValues(Ops, DL); 01277 } 01278 01279 if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { 01280 assert(VT == MVT::i1 && "Only i1 non-extloads expected"); 01281 // FIXME: Copied from PPC 01282 // First, load into 32 bits, then truncate to 1 bit. 01283 01284 SDValue Chain = Load->getChain(); 01285 SDValue BasePtr = Load->getBasePtr(); 01286 MachineMemOperand *MMO = Load->getMemOperand(); 01287 01288 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, 01289 BasePtr, MVT::i8, MMO); 01290 01291 SDValue Ops[] = { 01292 DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), 01293 NewLD.getValue(1) 01294 }; 01295 01296 return DAG.getMergeValues(Ops, DL); 01297 } 01298 01299 if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || 01300 Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || 01301 ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) 01302 return SDValue(); 01303 01304 01305 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), 01306 DAG.getConstant(2, MVT::i32)); 01307 SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 01308 Load->getChain(), Ptr, 01309 DAG.getTargetConstant(0, MVT::i32), 01310 Op.getOperand(2)); 01311 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, 01312 Load->getBasePtr(), 01313 DAG.getConstant(0x3, MVT::i32)); 01314 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 01315 DAG.getConstant(3, MVT::i32)); 01316 01317 Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); 01318 01319 EVT MemEltVT = MemVT.getScalarType(); 01320 if (ExtType == ISD::SEXTLOAD) { 01321 SDValue MemEltVTNode = DAG.getValueType(MemEltVT); 01322 01323 SDValue Ops[] = { 01324 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), 01325 Load->getChain() 01326 }; 01327 01328 return DAG.getMergeValues(Ops, DL); 01329 } 01330 01331 SDValue Ops[] = { 01332 DAG.getZeroExtendInReg(Ret, DL, MemEltVT), 01333 Load->getChain() 01334 }; 01335 01336 return DAG.getMergeValues(Ops, DL); 01337 } 01338 01339 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 01340 SDLoc DL(Op); 01341 SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); 01342 if (Result.getNode()) { 01343 return Result; 01344 } 01345 01346 StoreSDNode *Store = cast<StoreSDNode>(Op); 01347 SDValue Chain = Store->getChain(); 01348 if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 01349 Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 01350 Store->getValue().getValueType().isVector()) { 01351 return ScalarizeVectorStore(Op, DAG); 01352 } 01353 01354 EVT MemVT = Store->getMemoryVT(); 01355 if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && 01356 MemVT.bitsLT(MVT::i32)) { 01357 unsigned Mask = 0; 01358 if (Store->getMemoryVT() == MVT::i8) { 01359 Mask = 0xff; 01360 } else if (Store->getMemoryVT() == MVT::i16) { 01361 Mask = 0xffff; 01362 } 01363 SDValue BasePtr = Store->getBasePtr(); 01364 SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, 01365 DAG.getConstant(2, MVT::i32)); 01366 SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, 01367 Chain, Ptr, DAG.getTargetConstant(0, MVT::i32)); 01368 01369 SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, 01370 DAG.getConstant(0x3, MVT::i32)); 01371 01372 SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, 01373 DAG.getConstant(3, MVT::i32)); 01374 01375 SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, 01376 Store->getValue()); 01377 01378 SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); 01379 01380 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, 01381 MaskedValue, ShiftAmt); 01382 01383 SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(Mask, MVT::i32), 01384 ShiftAmt); 01385 DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, 01386 DAG.getConstant(0xffffffff, MVT::i32)); 01387 Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); 01388 01389 SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); 01390 return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 01391 Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); 01392 } 01393 return SDValue(); 01394 } 01395 01396 // This is a shortcut for integer division because we have fast i32<->f32 01397 // conversions, and fast f32 reciprocal instructions. The fractional part of a 01398 // float is enough to accurately represent up to a 24-bit integer. 01399 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { 01400 SDLoc DL(Op); 01401 EVT VT = Op.getValueType(); 01402 SDValue LHS = Op.getOperand(0); 01403 SDValue RHS = Op.getOperand(1); 01404 MVT IntVT = MVT::i32; 01405 MVT FltVT = MVT::f32; 01406 01407 ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; 01408 ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; 01409 01410 if (VT.isVector()) { 01411 unsigned NElts = VT.getVectorNumElements(); 01412 IntVT = MVT::getVectorVT(MVT::i32, NElts); 01413 FltVT = MVT::getVectorVT(MVT::f32, NElts); 01414 } 01415 01416 unsigned BitSize = VT.getScalarType().getSizeInBits(); 01417 01418 SDValue jq = DAG.getConstant(1, IntVT); 01419 01420 if (sign) { 01421 // char|short jq = ia ^ ib; 01422 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); 01423 01424 // jq = jq >> (bitsize - 2) 01425 jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT)); 01426 01427 // jq = jq | 0x1 01428 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT)); 01429 01430 // jq = (int)jq 01431 jq = DAG.getSExtOrTrunc(jq, DL, IntVT); 01432 } 01433 01434 // int ia = (int)LHS; 01435 SDValue ia = sign ? 01436 DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); 01437 01438 // int ib, (int)RHS; 01439 SDValue ib = sign ? 01440 DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); 01441 01442 // float fa = (float)ia; 01443 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); 01444 01445 // float fb = (float)ib; 01446 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); 01447 01448 // float fq = native_divide(fa, fb); 01449 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, 01450 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); 01451 01452 // fq = trunc(fq); 01453 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq); 01454 01455 // float fqneg = -fq; 01456 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); 01457 01458 // float fr = mad(fqneg, fb, fa); 01459 SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, 01460 DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); 01461 01462 // int iq = (int)fq; 01463 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); 01464 01465 // fr = fabs(fr); 01466 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr); 01467 01468 // fb = fabs(fb); 01469 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb); 01470 01471 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT); 01472 01473 // int cv = fr >= fb; 01474 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE); 01475 01476 // jq = (cv ? jq : 0); 01477 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT)); 01478 01479 // dst = trunc/extend to legal type 01480 iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); 01481 01482 // dst = iq + jq; 01483 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); 01484 01485 // Rem needs compensation, it's easier to recompute it 01486 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); 01487 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); 01488 01489 SDValue Res[2] = { 01490 Div, 01491 Rem 01492 }; 01493 return DAG.getMergeValues(Res, DL); 01494 } 01495 01496 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, 01497 SelectionDAG &DAG) const { 01498 SDLoc DL(Op); 01499 EVT VT = Op.getValueType(); 01500 01501 SDValue Num = Op.getOperand(0); 01502 SDValue Den = Op.getOperand(1); 01503 01504 if (VT == MVT::i32) { 01505 if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) && 01506 DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) { 01507 // TODO: We technically could do this for i64, but shouldn't that just be 01508 // handled by something generally reducing 64-bit division on 32-bit 01509 // values to 32-bit? 01510 return LowerDIVREM24(Op, DAG, false); 01511 } 01512 } 01513 01514 // RCP = URECIP(Den) = 2^32 / Den + e 01515 // e is rounding error. 01516 SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); 01517 01518 // RCP_LO = umulo(RCP, Den) */ 01519 SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); 01520 01521 // RCP_HI = mulhu (RCP, Den) */ 01522 SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); 01523 01524 // NEG_RCP_LO = -RCP_LO 01525 SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), 01526 RCP_LO); 01527 01528 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 01529 SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 01530 NEG_RCP_LO, RCP_LO, 01531 ISD::SETEQ); 01532 // Calculate the rounding error from the URECIP instruction 01533 // E = mulhu(ABS_RCP_LO, RCP) 01534 SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); 01535 01536 // RCP_A_E = RCP + E 01537 SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); 01538 01539 // RCP_S_E = RCP - E 01540 SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); 01541 01542 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 01543 SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), 01544 RCP_A_E, RCP_S_E, 01545 ISD::SETEQ); 01546 // Quotient = mulhu(Tmp0, Num) 01547 SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); 01548 01549 // Num_S_Remainder = Quotient * Den 01550 SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); 01551 01552 // Remainder = Num - Num_S_Remainder 01553 SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); 01554 01555 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 01556 SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, 01557 DAG.getConstant(-1, VT), 01558 DAG.getConstant(0, VT), 01559 ISD::SETUGE); 01560 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 01561 SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num, 01562 Num_S_Remainder, 01563 DAG.getConstant(-1, VT), 01564 DAG.getConstant(0, VT), 01565 ISD::SETUGE); 01566 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 01567 SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, 01568 Remainder_GE_Zero); 01569 01570 // Calculate Division result: 01571 01572 // Quotient_A_One = Quotient + 1 01573 SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, 01574 DAG.getConstant(1, VT)); 01575 01576 // Quotient_S_One = Quotient - 1 01577 SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, 01578 DAG.getConstant(1, VT)); 01579 01580 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 01581 SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 01582 Quotient, Quotient_A_One, ISD::SETEQ); 01583 01584 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 01585 Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 01586 Quotient_S_One, Div, ISD::SETEQ); 01587 01588 // Calculate Rem result: 01589 01590 // Remainder_S_Den = Remainder - Den 01591 SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); 01592 01593 // Remainder_A_Den = Remainder + Den 01594 SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); 01595 01596 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 01597 SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), 01598 Remainder, Remainder_S_Den, ISD::SETEQ); 01599 01600 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) 01601 Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), 01602 Remainder_A_Den, Rem, ISD::SETEQ); 01603 SDValue Ops[2] = { 01604 Div, 01605 Rem 01606 }; 01607 return DAG.getMergeValues(Ops, DL); 01608 } 01609 01610 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, 01611 SelectionDAG &DAG) const { 01612 SDLoc DL(Op); 01613 EVT VT = Op.getValueType(); 01614 01615 SDValue LHS = Op.getOperand(0); 01616 SDValue RHS = Op.getOperand(1); 01617 01618 if (VT == MVT::i32) { 01619 if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 && 01620 DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) { 01621 // TODO: We technically could do this for i64, but shouldn't that just be 01622 // handled by something generally reducing 64-bit division on 32-bit 01623 // values to 32-bit? 01624 return LowerDIVREM24(Op, DAG, true); 01625 } 01626 } 01627 01628 SDValue Zero = DAG.getConstant(0, VT); 01629 SDValue NegOne = DAG.getConstant(-1, VT); 01630 01631 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT); 01632 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT); 01633 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign); 01634 SDValue RSign = LHSign; // Remainder sign is the same as LHS 01635 01636 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign); 01637 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign); 01638 01639 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign); 01640 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign); 01641 01642 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS); 01643 SDValue Rem = Div.getValue(1); 01644 01645 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign); 01646 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign); 01647 01648 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign); 01649 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign); 01650 01651 SDValue Res[2] = { 01652 Div, 01653 Rem 01654 }; 01655 return DAG.getMergeValues(Res, DL); 01656 } 01657 01658 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y)) 01659 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { 01660 SDLoc SL(Op); 01661 EVT VT = Op.getValueType(); 01662 SDValue X = Op.getOperand(0); 01663 SDValue Y = Op.getOperand(1); 01664 01665 SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); 01666 SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); 01667 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); 01668 01669 return DAG.getNode(ISD::FSUB, SL, VT, X, Mul); 01670 } 01671 01672 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { 01673 SDLoc SL(Op); 01674 SDValue Src = Op.getOperand(0); 01675 01676 // result = trunc(src) 01677 // if (src > 0.0 && src != result) 01678 // result += 1.0 01679 01680 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 01681 01682 const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); 01683 const SDValue One = DAG.getConstantFP(1.0, MVT::f64); 01684 01685 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 01686 01687 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT); 01688 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 01689 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 01690 01691 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); 01692 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 01693 } 01694 01695 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { 01696 SDLoc SL(Op); 01697 SDValue Src = Op.getOperand(0); 01698 01699 assert(Op.getValueType() == MVT::f64); 01700 01701 const SDValue Zero = DAG.getConstant(0, MVT::i32); 01702 const SDValue One = DAG.getConstant(1, MVT::i32); 01703 01704 SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src); 01705 01706 // Extract the upper half, since this is where we will find the sign and 01707 // exponent. 01708 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); 01709 01710 const unsigned FractBits = 52; 01711 const unsigned ExpBits = 11; 01712 01713 // Extract the exponent. 01714 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32, 01715 Hi, 01716 DAG.getConstant(FractBits - 32, MVT::i32), 01717 DAG.getConstant(ExpBits, MVT::i32)); 01718 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, 01719 DAG.getConstant(1023, MVT::i32)); 01720 01721 // Extract the sign bit. 01722 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); 01723 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); 01724 01725 // Extend back to to 64-bits. 01726 SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, 01727 Zero, SignBit); 01728 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); 01729 01730 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); 01731 const SDValue FractMask 01732 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64); 01733 01734 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp); 01735 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64); 01736 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not); 01737 01738 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); 01739 01740 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32); 01741 01742 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); 01743 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); 01744 01745 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0); 01746 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1); 01747 01748 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2); 01749 } 01750 01751 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { 01752 SDLoc SL(Op); 01753 SDValue Src = Op.getOperand(0); 01754 01755 assert(Op.getValueType() == MVT::f64); 01756 01757 APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52"); 01758 SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64); 01759 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); 01760 01761 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); 01762 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); 01763 01764 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src); 01765 01766 APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51"); 01767 SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64); 01768 01769 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 01770 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT); 01771 01772 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2); 01773 } 01774 01775 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const { 01776 // FNEARBYINT and FRINT are the same, except in their handling of FP 01777 // exceptions. Those aren't really meaningful for us, and OpenCL only has 01778 // rint, so just treat them as equivalent. 01779 return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); 01780 } 01781 01782 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { 01783 SDLoc SL(Op); 01784 SDValue Src = Op.getOperand(0); 01785 01786 // result = trunc(src); 01787 // if (src < 0.0 && src != result) 01788 // result += -1.0. 01789 01790 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src); 01791 01792 const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64); 01793 const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64); 01794 01795 EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64); 01796 01797 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT); 01798 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE); 01799 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); 01800 01801 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); 01802 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); 01803 } 01804 01805 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, 01806 SelectionDAG &DAG) const { 01807 SDValue S0 = Op.getOperand(0); 01808 SDLoc DL(Op); 01809 if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64) 01810 return SDValue(); 01811 01812 // f32 uint_to_fp i64 01813 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 01814 DAG.getConstant(0, MVT::i32)); 01815 SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo); 01816 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, 01817 DAG.getConstant(1, MVT::i32)); 01818 SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); 01819 FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, 01820 DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32 01821 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); 01822 } 01823 01824 SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op, 01825 unsigned BitsDiff, 01826 SelectionDAG &DAG) const { 01827 MVT VT = Op.getSimpleValueType(); 01828 SDLoc DL(Op); 01829 SDValue Shift = DAG.getConstant(BitsDiff, VT); 01830 // Shift left by 'Shift' bits. 01831 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift); 01832 // Signed shift Right by 'Shift' bits. 01833 return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift); 01834 } 01835 01836 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, 01837 SelectionDAG &DAG) const { 01838 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT(); 01839 MVT VT = Op.getSimpleValueType(); 01840 MVT ScalarVT = VT.getScalarType(); 01841 01842 if (!VT.isVector()) 01843 return SDValue(); 01844 01845 SDValue Src = Op.getOperand(0); 01846 SDLoc DL(Op); 01847 01848 // TODO: Don't scalarize on Evergreen? 01849 unsigned NElts = VT.getVectorNumElements(); 01850 SmallVector<SDValue, 8> Args; 01851 DAG.ExtractVectorElements(Src, Args, 0, NElts); 01852 01853 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType()); 01854 for (unsigned I = 0; I < NElts; ++I) 01855 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); 01856 01857 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); 01858 } 01859 01860 //===----------------------------------------------------------------------===// 01861 // Custom DAG optimizations 01862 //===----------------------------------------------------------------------===// 01863 01864 static bool isU24(SDValue Op, SelectionDAG &DAG) { 01865 APInt KnownZero, KnownOne; 01866 EVT VT = Op.getValueType(); 01867 DAG.computeKnownBits(Op, KnownZero, KnownOne); 01868 01869 return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; 01870 } 01871 01872 static bool isI24(SDValue Op, SelectionDAG &DAG) { 01873 EVT VT = Op.getValueType(); 01874 01875 // In order for this to be a signed 24-bit value, bit 23, must 01876 // be a sign bit. 01877 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated 01878 // as unsigned 24-bit values. 01879 (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24; 01880 } 01881 01882 static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { 01883 01884 SelectionDAG &DAG = DCI.DAG; 01885 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 01886 EVT VT = Op.getValueType(); 01887 01888 APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24); 01889 APInt KnownZero, KnownOne; 01890 TargetLowering::TargetLoweringOpt TLO(DAG, true, true); 01891 if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) 01892 DCI.CommitTargetLoweringOpt(TLO); 01893 } 01894 01895 template <typename IntTy> 01896 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, 01897 uint32_t Offset, uint32_t Width) { 01898 if (Width + Offset < 32) { 01899 IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width); 01900 return DAG.getConstant(Result, MVT::i32); 01901 } 01902 01903 return DAG.getConstant(Src0 >> Offset, MVT::i32); 01904 } 01905 01906 static bool usesAllNormalStores(SDNode *LoadVal) { 01907 for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { 01908 if (!ISD::isNormalStore(*I)) 01909 return false; 01910 } 01911 01912 return true; 01913 } 01914 01915 // If we have a copy of an illegal type, replace it with a load / store of an 01916 // equivalently sized legal type. This avoids intermediate bit pack / unpack 01917 // instructions emitted when handling extloads and truncstores. Ideally we could 01918 // recognize the pack / unpack pattern to eliminate it. 01919 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, 01920 DAGCombinerInfo &DCI) const { 01921 if (!DCI.isBeforeLegalize()) 01922 return SDValue(); 01923 01924 StoreSDNode *SN = cast<StoreSDNode>(N); 01925 SDValue Value = SN->getValue(); 01926 EVT VT = Value.getValueType(); 01927 01928 if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode())) 01929 return SDValue(); 01930 01931 LoadSDNode *LoadVal = cast<LoadSDNode>(Value); 01932 if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) 01933 return SDValue(); 01934 01935 EVT MemVT = LoadVal->getMemoryVT(); 01936 01937 SDLoc SL(N); 01938 SelectionDAG &DAG = DCI.DAG; 01939 EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); 01940 01941 SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, 01942 LoadVT, SL, 01943 LoadVal->getChain(), 01944 LoadVal->getBasePtr(), 01945 LoadVal->getOffset(), 01946 LoadVT, 01947 LoadVal->getMemOperand()); 01948 01949 SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); 01950 DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); 01951 01952 return DAG.getStore(SN->getChain(), SL, NewLoad, 01953 SN->getBasePtr(), SN->getMemOperand()); 01954 } 01955 01956 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, 01957 DAGCombinerInfo &DCI) const { 01958 EVT VT = N->getValueType(0); 01959 01960 if (VT.isVector() || VT.getSizeInBits() > 32) 01961 return SDValue(); 01962 01963 SelectionDAG &DAG = DCI.DAG; 01964 SDLoc DL(N); 01965 01966 SDValue N0 = N->getOperand(0); 01967 SDValue N1 = N->getOperand(1); 01968 SDValue Mul; 01969 01970 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) { 01971 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32); 01972 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32); 01973 Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1); 01974 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) { 01975 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32); 01976 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32); 01977 Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1); 01978 } else { 01979 return SDValue(); 01980 } 01981 01982 // We need to use sext even for MUL_U24, because MUL_U24 is used 01983 // for signed multiply of 8 and 16-bit types. 01984 return DAG.getSExtOrTrunc(Mul, DL, VT); 01985 } 01986 01987 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, 01988 DAGCombinerInfo &DCI) const { 01989 SelectionDAG &DAG = DCI.DAG; 01990 SDLoc DL(N); 01991 01992 switch(N->getOpcode()) { 01993 default: break; 01994 case ISD::MUL: 01995 return performMulCombine(N, DCI); 01996 case AMDGPUISD::MUL_I24: 01997 case AMDGPUISD::MUL_U24: { 01998 SDValue N0 = N->getOperand(0); 01999 SDValue N1 = N->getOperand(1); 02000 simplifyI24(N0, DCI); 02001 simplifyI24(N1, DCI); 02002 return SDValue(); 02003 } 02004 case ISD::SELECT_CC: { 02005 return CombineMinMax(N, DAG); 02006 } 02007 case AMDGPUISD::BFE_I32: 02008 case AMDGPUISD::BFE_U32: { 02009 assert(!N->getValueType(0).isVector() && 02010 "Vector handling of BFE not implemented"); 02011 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2)); 02012 if (!Width) 02013 break; 02014 02015 uint32_t WidthVal = Width->getZExtValue() & 0x1f; 02016 if (WidthVal == 0) 02017 return DAG.getConstant(0, MVT::i32); 02018 02019 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1)); 02020 if (!Offset) 02021 break; 02022 02023 SDValue BitsFrom = N->getOperand(0); 02024 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f; 02025 02026 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32; 02027 02028 if (OffsetVal == 0) { 02029 // This is already sign / zero extended, so try to fold away extra BFEs. 02030 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal); 02031 02032 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom); 02033 if (OpSignBits >= SignBits) 02034 return BitsFrom; 02035 02036 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal); 02037 if (Signed) { 02038 // This is a sign_extend_inreg. Replace it to take advantage of existing 02039 // DAG Combines. If not eliminated, we will match back to BFE during 02040 // selection. 02041 02042 // TODO: The sext_inreg of extended types ends, although we can could 02043 // handle them in a single BFE. 02044 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom, 02045 DAG.getValueType(SmallVT)); 02046 } 02047 02048 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT); 02049 } 02050 02051 if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 02052 if (Signed) { 02053 return constantFoldBFE<int32_t>(DAG, 02054 Val->getSExtValue(), 02055 OffsetVal, 02056 WidthVal); 02057 } 02058 02059 return constantFoldBFE<uint32_t>(DAG, 02060 Val->getZExtValue(), 02061 OffsetVal, 02062 WidthVal); 02063 } 02064 02065 APInt Demanded = APInt::getBitsSet(32, 02066 OffsetVal, 02067 OffsetVal + WidthVal); 02068 02069 if ((OffsetVal + WidthVal) >= 32) { 02070 SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32); 02071 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, 02072 BitsFrom, ShiftVal); 02073 } 02074 02075 APInt KnownZero, KnownOne; 02076 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 02077 !DCI.isBeforeLegalizeOps()); 02078 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 02079 if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) || 02080 TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) { 02081 DCI.CommitTargetLoweringOpt(TLO); 02082 } 02083 02084 break; 02085 } 02086 02087 case ISD::STORE: 02088 return performStoreCombine(N, DCI); 02089 } 02090 return SDValue(); 02091 } 02092 02093 //===----------------------------------------------------------------------===// 02094 // Helper functions 02095 //===----------------------------------------------------------------------===// 02096 02097 void AMDGPUTargetLowering::getOriginalFunctionArgs( 02098 SelectionDAG &DAG, 02099 const Function *F, 02100 const SmallVectorImpl<ISD::InputArg> &Ins, 02101 SmallVectorImpl<ISD::InputArg> &OrigIns) const { 02102 02103 for (unsigned i = 0, e = Ins.size(); i < e; ++i) { 02104 if (Ins[i].ArgVT == Ins[i].VT) { 02105 OrigIns.push_back(Ins[i]); 02106 continue; 02107 } 02108 02109 EVT VT; 02110 if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) { 02111 // Vector has been split into scalars. 02112 VT = Ins[i].ArgVT.getVectorElementType(); 02113 } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() && 02114 Ins[i].ArgVT.getVectorElementType() != 02115 Ins[i].VT.getVectorElementType()) { 02116 // Vector elements have been promoted 02117 VT = Ins[i].ArgVT; 02118 } else { 02119 // Vector has been spilt into smaller vectors. 02120 VT = Ins[i].VT; 02121 } 02122 02123 ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used, 02124 Ins[i].OrigArgIndex, Ins[i].PartOffset); 02125 OrigIns.push_back(Arg); 02126 } 02127 } 02128 02129 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { 02130 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 02131 return CFP->isExactlyValue(1.0); 02132 } 02133 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 02134 return C->isAllOnesValue(); 02135 } 02136 return false; 02137 } 02138 02139 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { 02140 if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { 02141 return CFP->getValueAPF().isZero(); 02142 } 02143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 02144 return C->isNullValue(); 02145 } 02146 return false; 02147 } 02148 02149 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 02150 const TargetRegisterClass *RC, 02151 unsigned Reg, EVT VT) const { 02152 MachineFunction &MF = DAG.getMachineFunction(); 02153 MachineRegisterInfo &MRI = MF.getRegInfo(); 02154 unsigned VirtualRegister; 02155 if (!MRI.isLiveIn(Reg)) { 02156 VirtualRegister = MRI.createVirtualRegister(RC); 02157 MRI.addLiveIn(Reg, VirtualRegister); 02158 } else { 02159 VirtualRegister = MRI.getLiveInVirtReg(Reg); 02160 } 02161 return DAG.getRegister(VirtualRegister, VT); 02162 } 02163 02164 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; 02165 02166 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { 02167 switch (Opcode) { 02168 default: return nullptr; 02169 // AMDIL DAG nodes 02170 NODE_NAME_CASE(CALL); 02171 NODE_NAME_CASE(UMUL); 02172 NODE_NAME_CASE(RET_FLAG); 02173 NODE_NAME_CASE(BRANCH_COND); 02174 02175 // AMDGPU DAG nodes 02176 NODE_NAME_CASE(DWORDADDR) 02177 NODE_NAME_CASE(FRACT) 02178 NODE_NAME_CASE(CLAMP) 02179 NODE_NAME_CASE(MAD) 02180 NODE_NAME_CASE(FMAX) 02181 NODE_NAME_CASE(SMAX) 02182 NODE_NAME_CASE(UMAX) 02183 NODE_NAME_CASE(FMIN) 02184 NODE_NAME_CASE(SMIN) 02185 NODE_NAME_CASE(UMIN) 02186 NODE_NAME_CASE(URECIP) 02187 NODE_NAME_CASE(DIV_SCALE) 02188 NODE_NAME_CASE(DIV_FMAS) 02189 NODE_NAME_CASE(DIV_FIXUP) 02190 NODE_NAME_CASE(TRIG_PREOP) 02191 NODE_NAME_CASE(RCP) 02192 NODE_NAME_CASE(RSQ) 02193 NODE_NAME_CASE(RSQ_LEGACY) 02194 NODE_NAME_CASE(RSQ_CLAMPED) 02195 NODE_NAME_CASE(LDEXP) 02196 NODE_NAME_CASE(DOT4) 02197 NODE_NAME_CASE(BFE_U32) 02198 NODE_NAME_CASE(BFE_I32) 02199 NODE_NAME_CASE(BFI) 02200 NODE_NAME_CASE(BFM) 02201 NODE_NAME_CASE(BREV) 02202 NODE_NAME_CASE(MUL_U24) 02203 NODE_NAME_CASE(MUL_I24) 02204 NODE_NAME_CASE(MAD_U24) 02205 NODE_NAME_CASE(MAD_I24) 02206 NODE_NAME_CASE(EXPORT) 02207 NODE_NAME_CASE(CONST_ADDRESS) 02208 NODE_NAME_CASE(REGISTER_LOAD) 02209 NODE_NAME_CASE(REGISTER_STORE) 02210 NODE_NAME_CASE(LOAD_CONSTANT) 02211 NODE_NAME_CASE(LOAD_INPUT) 02212 NODE_NAME_CASE(SAMPLE) 02213 NODE_NAME_CASE(SAMPLEB) 02214 NODE_NAME_CASE(SAMPLED) 02215 NODE_NAME_CASE(SAMPLEL) 02216 NODE_NAME_CASE(CVT_F32_UBYTE0) 02217 NODE_NAME_CASE(CVT_F32_UBYTE1) 02218 NODE_NAME_CASE(CVT_F32_UBYTE2) 02219 NODE_NAME_CASE(CVT_F32_UBYTE3) 02220 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) 02221 NODE_NAME_CASE(CONST_DATA_PTR) 02222 NODE_NAME_CASE(STORE_MSKOR) 02223 NODE_NAME_CASE(TBUFFER_STORE_FORMAT) 02224 } 02225 } 02226 02227 static void computeKnownBitsForMinMax(const SDValue Op0, 02228 const SDValue Op1, 02229 APInt &KnownZero, 02230 APInt &KnownOne, 02231 const SelectionDAG &DAG, 02232 unsigned Depth) { 02233 APInt Op0Zero, Op0One; 02234 APInt Op1Zero, Op1One; 02235 DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); 02236 DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); 02237 02238 KnownZero = Op0Zero & Op1Zero; 02239 KnownOne = Op0One & Op1One; 02240 } 02241 02242 void AMDGPUTargetLowering::computeKnownBitsForTargetNode( 02243 const SDValue Op, 02244 APInt &KnownZero, 02245 APInt &KnownOne, 02246 const SelectionDAG &DAG, 02247 unsigned Depth) const { 02248 02249 KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything. 02250 02251 APInt KnownZero2; 02252 APInt KnownOne2; 02253 unsigned Opc = Op.getOpcode(); 02254 02255 switch (Opc) { 02256 default: 02257 break; 02258 case ISD::INTRINSIC_WO_CHAIN: { 02259 // FIXME: The intrinsic should just use the node. 02260 switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { 02261 case AMDGPUIntrinsic::AMDGPU_imax: 02262 case AMDGPUIntrinsic::AMDGPU_umax: 02263 case AMDGPUIntrinsic::AMDGPU_imin: 02264 case AMDGPUIntrinsic::AMDGPU_umin: 02265 computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), 02266 KnownZero, KnownOne, DAG, Depth); 02267 break; 02268 default: 02269 break; 02270 } 02271 02272 break; 02273 } 02274 case AMDGPUISD::SMAX: 02275 case AMDGPUISD::UMAX: 02276 case AMDGPUISD::SMIN: 02277 case AMDGPUISD::UMIN: 02278 computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1), 02279 KnownZero, KnownOne, DAG, Depth); 02280 break; 02281 02282 case AMDGPUISD::BFE_I32: 02283 case AMDGPUISD::BFE_U32: { 02284 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 02285 if (!CWidth) 02286 return; 02287 02288 unsigned BitWidth = 32; 02289 uint32_t Width = CWidth->getZExtValue() & 0x1f; 02290 if (Width == 0) { 02291 KnownZero = APInt::getAllOnesValue(BitWidth); 02292 KnownOne = APInt::getNullValue(BitWidth); 02293 return; 02294 } 02295 02296 // FIXME: This could do a lot more. If offset is 0, should be the same as 02297 // sign_extend_inreg implementation, but that involves duplicating it. 02298 if (Opc == AMDGPUISD::BFE_I32) 02299 KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width); 02300 else 02301 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width); 02302 02303 break; 02304 } 02305 } 02306 } 02307 02308 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( 02309 SDValue Op, 02310 const SelectionDAG &DAG, 02311 unsigned Depth) const { 02312 switch (Op.getOpcode()) { 02313 case AMDGPUISD::BFE_I32: { 02314 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 02315 if (!Width) 02316 return 1; 02317 02318 unsigned SignBits = 32 - Width->getZExtValue() + 1; 02319 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 02320 if (!Offset || !Offset->isNullValue()) 02321 return SignBits; 02322 02323 // TODO: Could probably figure something out with non-0 offsets. 02324 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); 02325 return std::max(SignBits, Op0SignBits); 02326 } 02327 02328 case AMDGPUISD::BFE_U32: { 02329 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2)); 02330 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1; 02331 } 02332 02333 default: 02334 return 1; 02335 } 02336 }