LLVM API Documentation

AMDGPUISelLowering.cpp
//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"

using namespace llvm;

namespace {

/// Diagnostic information for unimplemented or unsupported feature reporting.
class DiagnosticInfoUnsupported : public DiagnosticInfo {
private:
  const Twine &Description;
  const Function &Fn;

  static int KindID;

  static int getKindID() {
    if (KindID == 0)
      KindID = llvm::getNextAvailablePluginDiagnosticKind();
    return KindID;
  }

public:
  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
                            DiagnosticSeverity Severity = DS_Error)
    : DiagnosticInfo(getKindID(), Severity),
      Description(Desc),
      Fn(Fn) { }

  const Function &getFunction() const { return Fn; }
  const Twine &getDescription() const { return Description; }

  void print(DiagnosticPrinter &DP) const override {
    DP << "unsupported " << getDescription() << " in " << Fn.getName();
  }

  static bool classof(const DiagnosticInfo *DI) {
    return DI->getKind() == getKindID();
  }
};

int DiagnosticInfoUnsupported::KindID = 0;
} // End anonymous namespace

static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
  unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
                                        ArgFlags.getOrigAlign());
  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));

  return true;
}

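// The include below pulls in the tablegen-generated calling convention
// implementation (CC_AMDGPU and friends). allocateStack above is the style of
// custom handler such tables dispatch to, presumably referenced as
// CCCustom<"allocateStack"> in AMDGPUCallingConv.td.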
#include "AMDGPUGenCallingConv.inc"

// Find a larger type to use for a load / store of a vector.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, StoreSize);

  assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
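// For example, a <2 x i8> value (16 bits in memory) maps to i16, while a
// <4 x i16> value (64 bits) maps to <2 x i32>.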

// Type for the register a vector value will be loaded into.
EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
  unsigned StoreSize = VT.getStoreSizeInBits();
  if (StoreSize <= 32)
    return EVT::getIntegerVT(Ctx, 32);

  return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
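// Unlike getEquivalentMemType, this always returns at least a full 32-bit
// register type: a <2 x i8> load (16 bits in memory) still yields i32.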

AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
  TargetLowering(TM, new TargetLoweringObjectFileELF()) {

  Subtarget = &TM.getSubtarget<AMDGPUSubtarget>();

  setOperationAction(ISD::Constant, MVT::i32, Legal);
  setOperationAction(ISD::Constant, MVT::i64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  // We need to custom lower some of the intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Library functions. These default to Expand, but we have instructions
  // for them.
  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
  setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
  setOperationAction(ISD::FPOW,   MVT::f32, Legal);
  setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
  setOperationAction(ISD::FABS,   MVT::f32, Legal);
  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
  setOperationAction(ISD::FROUND, MVT::f32, Legal);
  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);

  setOperationAction(ISD::FREM, MVT::f32, Custom);
  setOperationAction(ISD::FREM, MVT::f64, Custom);

  // Lower floating point store/load to integer store/load to reduce the number
  // of patterns in tablegen.
  setOperationAction(ISD::STORE, MVT::f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::STORE, MVT::v4f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::STORE, MVT::f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);

  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);

  // Custom lowering of vector stores is required for local address space
  // stores.
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  // XXX: Native v2i32 local address space stores are possible, but not
  // currently implemented.
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);

  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);

  // XXX: This can be changed to Custom, once ExpandVectorStores can
  // handle 64-bit stores.
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);

  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i64, MVT::i1, Expand);
  setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);

  setOperationAction(ISD::LOAD, MVT::f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

  setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

  setOperationAction(ISD::LOAD, MVT::f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);

  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);

  setOperationAction(ISD::BR_CC, MVT::i1, Expand);

  if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
    setOperationAction(ISD::FRINT, MVT::f64, Custom);
    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
  }

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);

    // The GPU does not have a divrem instruction for signed or unsigned
    // operands.
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);

    // The GPU does not have [S|U]MUL_LOHI as a single instruction.
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);

    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  // The hardware supports 32-bit ROTR, but not ROTL.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  setOperationAction(ISD::MUL, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

  if (!Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);

  if (!Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);

  static const MVT::SimpleValueType VectorIntTypes[] = {
    MVT::v2i32, MVT::v4i32
  };

  for (MVT VT : VectorIntTypes) {
    // Expand the following operations for the current type by default.
    setOperationAction(ISD::ADD,  VT, Expand);
    setOperationAction(ISD::AND,  VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::MUL,  VT, Expand);
    setOperationAction(ISD::OR,   VT, Expand);
    setOperationAction(ISD::SHL,  VT, Expand);
    setOperationAction(ISD::SRA,  VT, Expand);
    setOperationAction(ISD::SRL,  VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::SUB,  VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Custom);
    setOperationAction(ISD::UDIVREM, VT, Custom);
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::XOR,  VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  static const MVT::SimpleValueType FloatVectorTypes[] = {
    MVT::v2f32, MVT::v4f32
  };

  for (MVT VT : FloatVectorTypes) {
    setOperationAction(ISD::FABS, VT, Expand);
    setOperationAction(ISD::FADD, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FMUL, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::FSQRT, VT, Expand);
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSUB, VT, Expand);
    setOperationAction(ISD::FNEG, VT, Expand);
    setOperationAction(ISD::SELECT, VT, Expand);
    setOperationAction(ISD::VSELECT, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
  }

  setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
  setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::STORE);

  setSchedulingPreference(Sched::RegPressure);
  setJumpIsExpensive(true);

  // SI at least has hardware support for floating point exceptions, but no way
  // of using or handling them is implemented. They are also optional in OpenCL
  // (Section 7.3).
  setHasFloatingPointExceptions(false);

  setSelectIsExpensive(false);
  PredictableSelectIsExpensive = false;

  // There are no integer divide instructions, and these expand to a pretty
  // large sequence of instructions.
  setIntDivIsCheap(false);
  setPow2SDivIsCheap(false);

  // TODO: Investigate this when 64-bit divides are implemented.
  addBypassSlowDiv(64, 32);

  // FIXME: Need to really handle these.
  MaxStoresPerMemcpy  = 4096;
  MaxStoresPerMemmove = 4096;
  MaxStoresPerMemset  = 4096;
}

//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//

MVT AMDGPUTargetLowering::getVectorIdxTy() const {
  return MVT::i32;
}

bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
  return true;
}

// The backend supports 32 and 64 bit floating point immediates.
// FIXME: Why are we reporting vectors of FP immediates as legal?
bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
}

// We don't want to shrink f64 / f32 constants.
bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
  EVT ScalarVT = VT.getScalarType();
  return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
}

bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
                                                   EVT CastTy) const {
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();

  return ((LScalarSize <= CastScalarSize) ||
          (CastScalarSize >= 32) ||
          (LScalarSize < 32));
}
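// For example, bitcasting a <4 x i8> load to i32 is reported as beneficial
// (same 32-bit total size), which steers the DAG combiner toward full-width
// 32-bit integer loads.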

//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//

bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64;
}

bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
  assert(VT.isFloatingPoint());
  return VT == MVT::f32 || VT == MVT::f64;
}

bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
  // Truncate is just accessing a subregister.
  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
  // Truncate is just accessing a subregister.
  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
         (Dest->getPrimitiveSizeInBits() % 32 == 0);
}

bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
  const DataLayout *DL = getDataLayout();
  unsigned SrcSize = DL->getTypeSizeInBits(Src->getScalarType());
  unsigned DestSize = DL->getTypeSizeInBits(Dest->getScalarType());

  return SrcSize == 32 && DestSize == 64;
}

bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit ones, which is
  // always good.
  return Src == MVT::i32 && Dest == MVT::i64;
}

bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  return isZExtFree(Val.getValueType(), VT2);
}

bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
  // limited number of native 64-bit operations. Shrinking an operation to fit
  // in a single 32-bit register should always be helpful. As currently used,
  // this is much less general than the name suggests, and is only used in
  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
  // not profitable, and may actually be harmful.
  return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
}

//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//

void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
}

SDValue AMDGPUTargetLowering::LowerReturn(
                                     SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     SDLoc DL, SelectionDAG &DAG) const {
  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
}

//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//

SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
  SDValue Callee = CLI.Callee;
  SelectionDAG &DAG = CLI.DAG;

  const Function &Fn = *DAG.getMachineFunction().getFunction();

  StringRef FuncName("<unknown>");

  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
    FuncName = G->getSymbol();
  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
    FuncName = G->getGlobal()->getName();

  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
  DAG.getContext()->diagnose(NoCalls);
  return SDValue();
}

SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
                                             SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    Op.getNode()->dump();
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
    break;
  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
  case ISD::FREM: return LowerFREM(Op, DAG);
  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
  case ISD::FRINT: return LowerFRINT(Op, DAG);
  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
  }
  return Op;
}

void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  case ISD::SIGN_EXTEND_INREG:
    // Different parts of legalization seem to interpret which type of
    // sign_extend_inreg is the one to check for custom lowering. The extended
    // from type is what really matters, but some places check for custom
    // lowering of the result type. This results in trying to use
    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
    // nothing here and let the illegal result integer be handled normally.
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    if (!Node)
      return;

    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE: {
    SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
    if (Lowered.getNode())
      Results.push_back(Lowered);
    return;
  }
  default:
    return;
  }
}

// FIXME: This implements accesses to initialized globals in the constant
// address space by copying them to private and accessing that. It does not
// properly handle illegal types or vectors. The private vector loads are not
// scalarized, and the illegal scalars hit an assertion. This technique will not
// work well with large initializers, and this should eventually be
// removed. Initialized globals should be placed into a data section that the
// runtime will load into a buffer before the kernel is executed. Uses of the
// global need to be replaced with a pointer loaded from an implicit kernel
// argument into this buffer holding the copy of the data, which will remove the
// need for any of this.
SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                       const GlobalValue *GV,
                                                       const SDValue &InitPtr,
                                                       SDValue Chain,
                                                       SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
  SDLoc DL(InitPtr);
  Type *InitTy = Init->getType();

  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getConstant(*CI, VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
    EVT VT = EVT::getEVT(CFP->getType());
    PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
    return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                 MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                 TD->getPrefTypeAlignment(CFP->getType()));
  }

  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
    const StructLayout *SL = TD->getStructLayout(ST);

    EVT PtrVT = InitPtr.getValueType();
    SmallVector<SDValue, 8> Chains;

    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(I);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
    EVT PtrVT = InitPtr.getValueType();

    unsigned NumElements;
    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
      NumElements = AT->getNumElements();
    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
      NumElements = VT->getNumElements();
    else
      llvm_unreachable("Unexpected type");

    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
    SmallVector<SDValue, 8> Chains;
    for (unsigned i = 0; i < NumElements; ++i) {
      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);

      Constant *Elt = Init->getAggregateElement(i);
      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
    }

    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  }

  if (isa<UndefValue>(Init)) {
    EVT VT = EVT::getEVT(InitTy);
    PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
    return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                        TD->getPrefTypeAlignment(InitTy));
  }

  Init->dump();
  llvm_unreachable("Unhandled constant initializer");
}
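// For example, a global initialized with { i32, float } recurses into the
// struct case above and emits one store per field at its StructLayout offset,
// with the individual store chains joined by a TokenFactor.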

SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
                                                 SDValue Op,
                                                 SelectionDAG &DAG) const {
  const DataLayout *TD = getTargetMachine().getSubtargetImpl()->getDataLayout();
  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = G->getGlobal();

  switch (G->getAddressSpace()) {
  default: llvm_unreachable("Global Address lowering not implemented for this "
                            "address space");
  case AMDGPUAS::LOCAL_ADDRESS: {
    // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");

    unsigned Offset;
    if (MFI->LocalMemoryObjects.count(GV) == 0) {
      uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
      Offset = MFI->LDSSize;
      MFI->LocalMemoryObjects[GV] = Offset;
      // XXX: Account for alignment?
      MFI->LDSSize += Size;
    } else {
      Offset = MFI->LocalMemoryObjects[GV];
    }

    return DAG.getConstant(Offset, getPointerTy(AMDGPUAS::LOCAL_ADDRESS));
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
    Type *EltType = GV->getType()->getElementType();
    unsigned Size = TD->getTypeAllocSize(EltType);
    unsigned Alignment = TD->getPrefTypeAlignment(EltType);

    MVT PrivPtrVT = getPointerTy(AMDGPUAS::PRIVATE_ADDRESS);
    MVT ConstPtrVT = getPointerTy(AMDGPUAS::CONSTANT_ADDRESS);

    int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
    SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);

    const GlobalVariable *Var = cast<GlobalVariable>(GV);
    if (!Var->hasInitializer()) {
      // This has no use, but bugpoint will hit it.
      return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
    }

    const Constant *Init = Var->getInitializer();
    SmallVector<SDNode*, 8> WorkList;

    for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
                              E = DAG.getEntryNode()->use_end(); I != E; ++I) {
      if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD &&
          I->getOpcode() != ISD::LOAD)
        continue;
      WorkList.push_back(*I);
    }
    SDValue Chain =
        LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
    for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
                                           E = WorkList.end(); I != E; ++I) {
      SmallVector<SDValue, 8> Ops;
      Ops.push_back(Chain);
      for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
        Ops.push_back((*I)->getOperand(i));
      }
      DAG.UpdateNodeOperands(*I, Ops);
    }
    return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
  }
  }
}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                  SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  DAG.ExtractVectorElements(A, Args);
  DAG.ExtractVectorElements(B, Args);

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
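// For example, concatenating two v2i32 operands extracts all four scalar
// elements and rebuilds them as a single v4i32 BUILD_VECTOR.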

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SmallVector<SDValue, 8> Args;
  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  EVT VT = Op.getValueType();
  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                            VT.getVectorNumElements());

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
}
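// For example, extracting a v2f32 subvector at index 2 from a v4f32 operand
// rebuilds elements 2 and 3 into a new two-element BUILD_VECTOR.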

SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
                         Op.getValueType());
}

SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    SelectionDAG &DAG) const {
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  switch (IntrinsicID) {
    default: return Op;
    case AMDGPUIntrinsic::AMDGPU_abs:
    case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
      return LowerIntrinsicIABS(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_lrp:
      return LowerIntrinsicLRP(Op, DAG);
    case AMDGPUIntrinsic::AMDGPU_fract:
    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_clamp:
    case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_scale: {
      // The third parameter is required to be a constant.
      const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
      if (!Param)
        return DAG.getUNDEF(VT);

      // Translate to the operands expected by the machine instruction. The
      // first operand must match the source that the constant selects: the
      // numerator if it is all ones, otherwise the denominator.
      SDValue Numerator = Op.getOperand(1);
      SDValue Denominator = Op.getOperand(2);
      SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

      return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                         Denominator, Numerator);
    }

    case Intrinsic::AMDGPU_div_fmas:
      return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_div_fixup:
      return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case Intrinsic::AMDGPU_trig_preop:
      return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case Intrinsic::AMDGPU_rcp:
      return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_rsq_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));

    case Intrinsic::AMDGPU_ldexp:
      return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_imax:
      return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umax:
      return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_imin:
      return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));
    case AMDGPUIntrinsic::AMDGPU_umin:
      return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                  Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umul24:
      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_imul24:
      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_umad24:
      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_imad24:
      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
      return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfi:
      return DAG.getNode(AMDGPUISD::BFI, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2),
                         Op.getOperand(3));

    case AMDGPUIntrinsic::AMDGPU_bfm:
      return DAG.getNode(AMDGPUISD::BFM, DL, VT,
                         Op.getOperand(1),
                         Op.getOperand(2));

    case AMDGPUIntrinsic::AMDGPU_brev:
      return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
      return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));

    case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
      return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
    case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
      return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
  }
}

/// IABS(a) = SMAX(sub(0, a), a)
SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                              Op.getOperand(1));

  return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1));
}
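// For example, with a = -5: Neg = 0 - (-5) = 5, and SMAX(5, -5) = 5.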

/// Linear Interpolation
/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                DAG.getConstantFP(1.0f, MVT::f32),
                                Op.getOperand(1));
  SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
                                                    Op.getOperand(3));
  return DAG.getNode(ISD::FADD, DL, VT,
      DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
      OneSubAC);
}

/// \brief Generate Min/Max node
SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
                                            SelectionDAG &DAG) const {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue True = N->getOperand(2);
  SDValue False = N->getOperand(3);
  SDValue CC = N->getOperand(4);

  if (VT != MVT::f32 ||
      !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
    return SDValue();
  }

  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  switch (CCOpcode) {
  case ISD::SETOEQ:
  case ISD::SETONE:
  case ISD::SETUNE:
  case ISD::SETNE:
  case ISD::SETUEQ:
  case ISD::SETEQ:
  case ISD::SETFALSE:
  case ISD::SETFALSE2:
  case ISD::SETTRUE:
  case ISD::SETTRUE2:
  case ISD::SETUO:
  case ISD::SETO:
    llvm_unreachable("Operation should already be optimised!");
  case ISD::SETULE:
  case ISD::SETULT:
  case ISD::SETOLE:
  case ISD::SETOLT:
  case ISD::SETLE:
  case ISD::SETLT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGE:
  case ISD::SETOGE:
  case ISD::SETUGT:
  case ISD::SETOGT: {
    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
    return DAG.getNode(Opc, DL, VT, LHS, RHS);
  }
  case ISD::SETCC_INVALID:
    llvm_unreachable("Invalid setcc condcode!");
  }
  return SDValue();
}

SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
                                                  SelectionDAG &DAG) const {
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  EVT MemVT = Load->getMemoryVT();
  EVT MemEltVT = MemVT.getVectorElementType();

  EVT LoadVT = Op.getValueType();
  EVT EltVT = LoadVT.getVectorElementType();
  EVT PtrVT = Load->getBasePtr().getValueType();

  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
  SmallVector<SDValue, 8> Loads;
  SmallVector<SDValue, 8> Chains;

  SDLoc SL(Op);
  unsigned MemEltSize = MemEltVT.getStoreSize();
  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());

  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
                              DAG.getConstant(i * MemEltSize, PtrVT));

    SDValue NewLoad
      = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
                       Load->getChain(), Ptr,
                       SrcValue.getWithOffset(i * MemEltSize),
                       MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                       Load->isInvariant(), Load->getAlignment());
    Loads.push_back(NewLoad.getValue(0));
    Chains.push_back(NewLoad.getValue(1));
  }

  SDValue Ops[] = {
    DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
  };

  return DAG.getMergeValues(Ops, SL);
}
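// For example, a v4i16 load is scalarized into four i16 element loads at byte
// offsets 0, 2, 4 and 6; the results are recombined with BUILD_VECTOR and the
// chains merged with a TokenFactor.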

SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return ScalarizeVectorLoad(Op, DAG);

  LoadSDNode *Load = cast<LoadSDNode>(Op);
  SDValue BasePtr = Load->getBasePtr();
  EVT PtrVT = BasePtr.getValueType();
  EVT MemVT = Load->getMemoryVT();
  SDLoc SL(Op);
  MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
  SDValue LoLoad
    = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                     Load->getChain(), BasePtr,
                     SrcValue,
                     LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
                     Load->isInvariant(), Load->getAlignment());

  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                              DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));

  SDValue HiLoad
    = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
                     Load->getChain(), HiPtr,
                     SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                     HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
                     Load->isInvariant(), Load->getAlignment());

  SDValue Ops[] = {
    DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
    DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
                LoLoad.getValue(1), HiLoad.getValue(1))
  };

  return DAG.getMergeValues(Ops, SL);
}
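// For example, a v8i32 load is split into two v4i32 loads, the high half
// addressed at BasePtr plus LoMemVT's store size (16 bytes here), and the
// halves are rejoined with CONCAT_VECTORS.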

SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemVT = Store->getMemoryVT();
  unsigned MemBits = MemVT.getSizeInBits();

  // Byte stores are really expensive, so if possible, try to pack a 32-bit
  // (or smaller) vector truncating store into a single i32 store.
  // XXX: We could also optimize other vector bitwidths.
  if (!MemVT.isVector() || MemBits > 32) {
    return SDValue();
  }

  SDLoc DL(Op);
  SDValue Value = Store->getValue();
  EVT VT = Value.getValueType();
  EVT ElemVT = VT.getVectorElementType();
  SDValue Ptr = Store->getBasePtr();
  EVT MemEltVT = MemVT.getVectorElementType();
  unsigned MemEltBits = MemEltVT.getSizeInBits();
  unsigned MemNumElements = MemVT.getVectorNumElements();
  unsigned PackedSize = MemVT.getStoreSizeInBits();
  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);

  assert(Value.getValueType().getScalarSizeInBits() >= 32);

  SDValue PackedValue;
  for (unsigned i = 0; i < MemNumElements; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                              DAG.getConstant(i, MVT::i32));
    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg

    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);

    if (i == 0) {
      PackedValue = Elt;
    } else {
      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
    }
  }

  if (PackedSize < 32) {
    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
                             Store->getMemOperand()->getPointerInfo(),
                             PackedVT,
                             Store->isNonTemporal(), Store->isVolatile(),
                             Store->getAlignment());
  }

  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
                      Store->getMemOperand()->getPointerInfo(),
                      Store->isVolatile(), Store->isNonTemporal(),
                      Store->getAlignment());
}
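// For example, a v4i8 truncating store packs the four bytes into one i32 with
// shifts and ORs and emits a single 32-bit store; a v2i8 store packs only
// 16 bits, so it is emitted as an i16 truncating store instead.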

SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
                                                   SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
  EVT PtrVT = Store->getBasePtr().getValueType();
  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
  SDLoc SL(Op);

  SmallVector<SDValue, 8> Chains;

  unsigned EltSize = MemEltVT.getStoreSize();
  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
                              Store->getValue(),
                              DAG.getConstant(i, MVT::i32));

    SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), PtrVT);
    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
    SDValue NewStore =
      DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
                        SrcValue.getWithOffset(i * EltSize),
                        MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
                        Store->getAlignment());
    Chains.push_back(NewStore);
  }

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
}
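// As with ScalarizeVectorLoad, every element is extracted and stored
// individually; e.g. a v2f32 store becomes two element stores at byte
// offsets 0 and 4, chained by a TokenFactor.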

SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                                               SelectionDAG &DAG) const {
  StoreSDNode *Store = cast<StoreSDNode>(Op);
  SDValue Val = Store->getValue();
  EVT VT = Val.getValueType();

  // If this is a 2 element vector, we really want to scalarize and not create
  // weird 1 element vectors.
  if (VT.getVectorNumElements() == 2)
    return ScalarizeVectorStore(Op, DAG);

  EVT MemVT = Store->getMemoryVT();
  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  SDLoc SL(Op);

  EVT LoVT, HiVT;
  EVT LoMemVT, HiMemVT;
  SDValue Lo, Hi;

  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
  std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);

  EVT PtrVT = BasePtr.getValueType();
  SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                              DAG.getConstant(LoMemVT.getStoreSize(), PtrVT));

  MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
  SDValue LoStore
    = DAG.getTruncStore(Chain, SL, Lo,
                        BasePtr,
                        SrcValue,
                        LoMemVT,
                        Store->isNonTemporal(),
                        Store->isVolatile(),
                        Store->getAlignment());
  SDValue HiStore
    = DAG.getTruncStore(Chain, SL, Hi,
                        HiPtr,
                        SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                        HiMemVT,
                        Store->isNonTemporal(),
                        Store->isVolatile(),
                        Store->getAlignment());

  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
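// This mirrors SplitVectorLoad: e.g. a v8i32 store is split into two v4i32
// truncating stores, with the high half written at BasePtr + 16 bytes.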
01252 
01253 
01254 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
01255   SDLoc DL(Op);
01256   LoadSDNode *Load = cast<LoadSDNode>(Op);
01257   ISD::LoadExtType ExtType = Load->getExtensionType();
01258   EVT VT = Op.getValueType();
01259   EVT MemVT = Load->getMemoryVT();
01260 
01261   if (ExtType != ISD::NON_EXTLOAD && !VT.isVector() && VT.getSizeInBits() > 32) {
01262     // We can do the extload to 32-bits, and then need to separately extend to
01263     // 64-bits.
01264 
01265     SDValue ExtLoad32 = DAG.getExtLoad(ExtType, DL, MVT::i32,
01266                                        Load->getChain(),
01267                                        Load->getBasePtr(),
01268                                        MemVT,
01269                                        Load->getMemOperand());
01270 
01271     SDValue Ops[] = {
01272       DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32),
01273       ExtLoad32.getValue(1)
01274     };
01275 
01276     return DAG.getMergeValues(Ops, DL);
01277   }
01278 
01279   if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
01280     assert(VT == MVT::i1 && "Only i1 non-extloads expected");
01281     // FIXME: Copied from PPC
01282     // First, load into 32 bits, then truncate to 1 bit.
01283 
01284     SDValue Chain = Load->getChain();
01285     SDValue BasePtr = Load->getBasePtr();
01286     MachineMemOperand *MMO = Load->getMemOperand();
01287 
01288     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
01289                                    BasePtr, MVT::i8, MMO);
01290 
01291     SDValue Ops[] = {
01292       DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
01293       NewLD.getValue(1)
01294     };
01295 
01296     return DAG.getMergeValues(Ops, DL);
01297   }
01298 
01299   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
01300       Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
01301       ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
01302     return SDValue();
01303 
01304 
01305   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
01306                             DAG.getConstant(2, MVT::i32));
01307   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
01308                             Load->getChain(), Ptr,
01309                             DAG.getTargetConstant(0, MVT::i32),
01310                             Op.getOperand(2));
01311   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
01312                                 Load->getBasePtr(),
01313                                 DAG.getConstant(0x3, MVT::i32));
01314   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
01315                                  DAG.getConstant(3, MVT::i32));
01316 
01317   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
01318 
01319   EVT MemEltVT = MemVT.getScalarType();
01320   if (ExtType == ISD::SEXTLOAD) {
01321     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
01322 
01323     SDValue Ops[] = {
01324       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
01325       Load->getChain()
01326     };
01327 
01328     return DAG.getMergeValues(Ops, DL);
01329   }
01330 
01331   SDValue Ops[] = {
01332     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
01333     Load->getChain()
01334   };
01335 
01336   return DAG.getMergeValues(Ops, DL);
01337 }
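
// A scalar model of the private-address path above: fetch the containing
// 32-bit register word, shift the addressed byte or short into place, and
// extend it. A sketch only; `Regs` standing in for the register file and
// `MemBits` (8 or 16) are illustrative assumptions.
static uint32_t subDwordLoadRef(const uint32_t *Regs, uint32_t Addr,
                                unsigned MemBits, bool IsSigned) {
  uint32_t Word = Regs[Addr >> 2];     // REGISTER_LOAD of the word at Addr / 4
  uint32_t Shift = (Addr & 0x3) * 8;   // bit offset of the value in the word
  uint32_t Val = Word >> Shift;
  if (IsSigned)                        // SIGN_EXTEND_INREG
    return (uint32_t)((int32_t)(Val << (32 - MemBits)) >> (32 - MemBits));
  return Val & ((1u << MemBits) - 1);  // getZeroExtendInReg
}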
01338 
01339 SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
01340   SDLoc DL(Op);
01341   SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
01342   if (Result.getNode()) {
01343     return Result;
01344   }
01345 
01346   StoreSDNode *Store = cast<StoreSDNode>(Op);
01347   SDValue Chain = Store->getChain();
01348   if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
01349        Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
01350       Store->getValue().getValueType().isVector()) {
01351     return ScalarizeVectorStore(Op, DAG);
01352   }
01353 
01354   EVT MemVT = Store->getMemoryVT();
01355   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
01356       MemVT.bitsLT(MVT::i32)) {
01357     unsigned Mask = 0;
01358     if (Store->getMemoryVT() == MVT::i8) {
01359       Mask = 0xff;
01360     } else if (Store->getMemoryVT() == MVT::i16) {
01361       Mask = 0xffff;
01362     }
01363     SDValue BasePtr = Store->getBasePtr();
01364     SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
01365                               DAG.getConstant(2, MVT::i32));
01366     SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
01367                               Chain, Ptr, DAG.getTargetConstant(0, MVT::i32));
01368 
01369     SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
01370                                   DAG.getConstant(0x3, MVT::i32));
01371 
01372     SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
01373                                    DAG.getConstant(3, MVT::i32));
01374 
01375     SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
01376                                     Store->getValue());
01377 
01378     SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
01379 
01380     SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
01381                                        MaskedValue, ShiftAmt);
01382 
01383     SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
01384                                   DAG.getConstant(Mask, MVT::i32), ShiftAmt);
01385     DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
01386                           DAG.getConstant(0xffffffff, MVT::i32));
01387     Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
01388 
01389     SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
01390     return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
01391                        Chain, Value, Ptr, DAG.getTargetConstant(0, MVT::i32));
01392   }
01393   return SDValue();
01394 }
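
// The matching scalar model of the sub-dword private store above: a
// read-modify-write of the containing 32-bit register word. A sketch only;
// `Regs` and `StoreBits` (8 or 16) are illustrative assumptions.
static void subDwordStoreRef(uint32_t *Regs, uint32_t Addr,
                             uint32_t Value, unsigned StoreBits) {
  uint32_t Mask = StoreBits == 8 ? 0xff : 0xffff;
  uint32_t Shift = (Addr & 0x3) * 8;
  uint32_t Word = Regs[Addr >> 2];     // REGISTER_LOAD
  Word &= ~(Mask << Shift);            // clear the destination bits (DstMask)
  Word |= (Value & Mask) << Shift;     // merge in the shifted value
  Regs[Addr >> 2] = Word;              // REGISTER_STORE
}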
01395 
01396 // This is a shortcut for integer division because we have fast i32<->f32
01397 // conversions, and fast f32 reciprocal instructions. The 24-bit significand
01398 // of an f32 is enough to accurately represent integers of up to 24 bits.
01399 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
01400   SDLoc DL(Op);
01401   EVT VT = Op.getValueType();
01402   SDValue LHS = Op.getOperand(0);
01403   SDValue RHS = Op.getOperand(1);
01404   MVT IntVT = MVT::i32;
01405   MVT FltVT = MVT::f32;
01406 
01407   ISD::NodeType ToFp  = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
01408   ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
01409 
01410   if (VT.isVector()) {
01411     unsigned NElts = VT.getVectorNumElements();
01412     IntVT = MVT::getVectorVT(MVT::i32, NElts);
01413     FltVT = MVT::getVectorVT(MVT::f32, NElts);
01414   }
01415 
01416   unsigned BitSize = VT.getScalarType().getSizeInBits();
01417 
01418   SDValue jq = DAG.getConstant(1, IntVT);
01419 
01420   if (sign) {
01421     // char|short jq = ia ^ ib;
01422     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
01423 
01424     // jq = jq >> (bitsize - 2)
01425     jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
01426 
01427     // jq = jq | 0x1
01428     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
01429 
01430     // jq = (int)jq
01431     jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
01432   }
01433 
01434   // int ia = (int)LHS;
01435   SDValue ia = sign ?
01436     DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
01437 
01438   // int ib = (int)RHS;
01439   SDValue ib = sign ?
01440     DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
01441 
01442   // float fa = (float)ia;
01443   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
01444 
01445   // float fb = (float)ib;
01446   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
01447 
01448   // float fq = native_divide(fa, fb);
01449   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
01450                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
01451 
01452   // fq = trunc(fq);
01453   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
01454 
01455   // float fqneg = -fq;
01456   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
01457 
01458   // float fr = mad(fqneg, fb, fa);
01459   SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
01460                            DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
01461 
01462   // int iq = (int)fq;
01463   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
01464 
01465   // fr = fabs(fr);
01466   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
01467 
01468   // fb = fabs(fb);
01469   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
01470 
01471   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), VT);
01472 
01473   // int cv = fr >= fb;
01474   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
01475 
01476   // jq = (cv ? jq : 0);
01477   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
01478 
01479   // dst = trunc/extend to legal type
01480   iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
01481 
01482   // dst = iq + jq;
01483   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
01484 
01485   // Rem needs compensation; it's easier to recompute it.
01486   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
01487   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
01488 
01489   SDValue Res[2] = {
01490     Div,
01491     Rem
01492   };
01493   return DAG.getMergeValues(Res, DL);
01494 }
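
// A scalar C++ sketch of the sequence above, assuming the operands are
// already sign- or zero-extended 24-bit values in an int, that signed right
// shifts are arithmetic, and with the hardware RCP approximated by 1.0f / fb
// (truncf/fabsf from <cmath>; the helper name is illustrative):
static void divRem24Ref(int ia, int ib, bool IsSigned, int &Div, int &Rem) {
  // Correction step: +1 when the exact quotient is positive, -1 when negative.
  int jq = IsSigned ? (((ia ^ ib) >> 30) | 1) : 1;
  float fa = (float)ia;
  float fb = (float)ib;
  // Approximate quotient from the reciprocal, truncated toward zero.
  float fq = truncf(fa * (1.0f / fb));
  // Absolute remainder of the truncated quotient: |fa - fq * fb|.
  float fr = fabsf(-fq * fb + fa);
  int iq = (int)fq;
  // If that remainder reached |fb|, the truncated quotient is one step short.
  if (fr >= fabsf(fb))
    iq += jq;
  Div = iq;
  // As noted above, the remainder is cheapest to recompute from the quotient.
  Rem = ia - Div * ib;
}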
01495 
01496 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
01497                                            SelectionDAG &DAG) const {
01498   SDLoc DL(Op);
01499   EVT VT = Op.getValueType();
01500 
01501   SDValue Num = Op.getOperand(0);
01502   SDValue Den = Op.getOperand(1);
01503 
01504   if (VT == MVT::i32) {
01505     if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xffu << 24)) &&
01506         DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xffu << 24))) {
01507       // TODO: We technically could do this for i64, but shouldn't that just be
01508       // handled by something generally reducing 64-bit division on 32-bit
01509       // values to 32-bit?
01510       return LowerDIVREM24(Op, DAG, false);
01511     }
01512   }
01513 
01514   // RCP = URECIP(Den) = 2^32 / Den + e,
01515   // where e is the rounding error.
01516   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
01517 
01518   // RCP_LO = umulo(RCP, Den)
01519   SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den);
01520 
01521   // RCP_HI = mulhu(RCP, Den)
01522   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
01523 
01524   // NEG_RCP_LO = -RCP_LO
01525   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
01526                                                      RCP_LO);
01527 
01528   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
01529   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
01530                                            NEG_RCP_LO, RCP_LO,
01531                                            ISD::SETEQ);
01532   // Calculate the rounding error from the URECIP instruction
01533   // E = mulhu(ABS_RCP_LO, RCP)
01534   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
01535 
01536   // RCP_A_E = RCP + E
01537   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
01538 
01539   // RCP_S_E = RCP - E
01540   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
01541 
01542   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
01543   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT),
01544                                      RCP_A_E, RCP_S_E,
01545                                      ISD::SETEQ);
01546   // Quotient = mulhu(Tmp0, Num)
01547   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
01548 
01549   // Num_S_Remainder = Quotient * Den
01550   SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den);
01551 
01552   // Remainder = Num - Num_S_Remainder
01553   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
01554 
01555   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
01556   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
01557                                                  DAG.getConstant(-1, VT),
01558                                                  DAG.getConstant(0, VT),
01559                                                  ISD::SETUGE);
01560   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
01561   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
01562                                                   Num_S_Remainder,
01563                                                   DAG.getConstant(-1, VT),
01564                                                   DAG.getConstant(0, VT),
01565                                                   ISD::SETUGE);
01566   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
01567   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
01568                                                Remainder_GE_Zero);
01569 
01570   // Calculate Division result:
01571 
01572   // Quotient_A_One = Quotient + 1
01573   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
01574                                                          DAG.getConstant(1, VT));
01575 
01576   // Quotient_S_One = Quotient - 1
01577   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
01578                                                          DAG.getConstant(1, VT));
01579 
01580   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
01581   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
01582                                      Quotient, Quotient_A_One, ISD::SETEQ);
01583 
01584   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
01585   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
01586                             Quotient_S_One, Div, ISD::SETEQ);
01587 
01588   // Calculate Rem result:
01589 
01590   // Remainder_S_Den = Remainder - Den
01591   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
01592 
01593   // Remainder_A_Den = Remainder + Den
01594   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
01595 
01596   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
01597   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT),
01598                                     Remainder, Remainder_S_Den, ISD::SETEQ);
01599 
01600   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
01601   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
01602                             Remainder_A_Den, Rem, ISD::SETEQ);
01603   SDValue Ops[2] = {
01604     Div,
01605     Rem
01606   };
01607   return DAG.getMergeValues(Ops, DL);
01608 }
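
// The same quotient-refinement sequence in scalar form, with URECIP and
// MULHU emulated via 64-bit arithmetic. A sketch only: it assumes Den > 1
// so the emulated 2^32 / Den fits in 32 bits, and it mirrors the node
// sequence above rather than any particular hardware behavior.
static void udivRem32Ref(uint32_t Num, uint32_t Den,
                         uint32_t &Div, uint32_t &Rem) {
  uint32_t RCP = (uint32_t)(0x100000000ULL / Den);             // URECIP
  uint32_t RCP_LO = RCP * Den;                                 // low 32 bits
  uint32_t RCP_HI = (uint32_t)(((uint64_t)RCP * Den) >> 32);   // MULHU
  uint32_t ABS_RCP_LO = RCP_HI == 0 ? 0u - RCP_LO : RCP_LO;
  uint32_t E = (uint32_t)(((uint64_t)ABS_RCP_LO * RCP) >> 32); // error term
  uint32_t Tmp0 = RCP_HI == 0 ? RCP + E : RCP - E;
  uint32_t Quotient = (uint32_t)(((uint64_t)Tmp0 * Num) >> 32);
  uint32_t Num_S_Remainder = Quotient * Den;
  uint32_t Remainder = Num - Num_S_Remainder;
  bool Remainder_GE_Den = Remainder >= Den;
  bool Remainder_GE_Zero = Num >= Num_S_Remainder;
  // The estimate is off by at most one in either direction; fix it up.
  if (!Remainder_GE_Zero) {
    Quotient -= 1;
    Remainder += Den;
  } else if (Remainder_GE_Den) {
    Quotient += 1;
    Remainder -= Den;
  }
  Div = Quotient;
  Rem = Remainder;
}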
01609 
01610 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
01611                                            SelectionDAG &DAG) const {
01612   SDLoc DL(Op);
01613   EVT VT = Op.getValueType();
01614 
01615   SDValue LHS = Op.getOperand(0);
01616   SDValue RHS = Op.getOperand(1);
01617 
01618   if (VT == MVT::i32) {
01619     if (DAG.ComputeNumSignBits(Op.getOperand(0)) > 8 &&
01620         DAG.ComputeNumSignBits(Op.getOperand(1)) > 8) {
01621       // TODO: We technically could do this for i64, but shouldn't that just be
01622       // handled by something generally reducing 64-bit division on 32-bit
01623       // values to 32-bit?
01624       return LowerDIVREM24(Op, DAG, true);
01625     }
01626   }
01627 
01628   SDValue Zero = DAG.getConstant(0, VT);
01629   SDValue NegOne = DAG.getConstant(-1, VT);
01630 
01631   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
01632   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
01633   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
01634   SDValue RSign = LHSign; // Remainder sign is the same as LHS
01635 
01636   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
01637   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
01638 
01639   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
01640   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
01641 
01642   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
01643   SDValue Rem = Div.getValue(1);
01644 
01645   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
01646   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
01647 
01648   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
01649   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
01650 
01651   SDValue Res[2] = {
01652     Div,
01653     Rem
01654   };
01655   return DAG.getMergeValues(Res, DL);
01656 }
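
// The sign handling above in scalar form: conditionally negate both operands
// with an add/xor pair, divide unsigned, then restore the signs (the quotient
// takes the XOR of the operand signs, the remainder the sign of the dividend).
// A sketch assuming arithmetic right shifts and ignoring INT_MIN overflow,
// which the DAG form simply wraps:
static void sdivRem32Ref(int32_t LHS, int32_t RHS,
                         int32_t &Div, int32_t &Rem) {
  int32_t LHSign = LHS >> 31;             // 0 or -1, like the selects above
  int32_t RHSign = RHS >> 31;
  int32_t DSign = LHSign ^ RHSign;
  int32_t RSign = LHSign;
  // (x + sign) ^ sign is x when sign == 0 and -x when sign == -1.
  uint32_t UL = (uint32_t)((LHS + LHSign) ^ LHSign);
  uint32_t UR = (uint32_t)((RHS + RHSign) ^ RHSign);
  uint32_t UDiv = UL / UR;                // ISD::UDIVREM
  uint32_t URem = UL % UR;
  Div = ((int32_t)UDiv ^ DSign) - DSign;  // conditional negate back
  Rem = ((int32_t)URem ^ RSign) - RSign;
}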
01657 
01658 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
01659 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
01660   SDLoc SL(Op);
01661   EVT VT = Op.getValueType();
01662   SDValue X = Op.getOperand(0);
01663   SDValue Y = Op.getOperand(1);
01664 
01665   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
01666   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
01667   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Trunc, Y);
01668 
01669   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
01670 }
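
// The expansion above in scalar form. It truncates rather than floors the
// quotient, so the result keeps the sign of X, matching frem semantics.
// A sketch only (truncf from <cmath>), ignoring the FP-exception behavior
// of a library fmodf:
static float fremRef(float X, float Y) {
  return X - truncf(X / Y) * Y;
}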
01671 
01672 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
01673   SDLoc SL(Op);
01674   SDValue Src = Op.getOperand(0);
01675 
01676   // result = trunc(src)
01677   // if (src > 0.0 && src != result)
01678   //   result += 1.0
01679 
01680   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
01681 
01682   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
01683   const SDValue One = DAG.getConstantFP(1.0, MVT::f64);
01684 
01685   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
01686 
01687   SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
01688   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
01689   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
01690 
01691   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
01692   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
01693 }
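
// The branch-free select sequence above is equivalent to this scalar logic
// (std::trunc from <cmath>; a sketch only):
static double fceilRef(double Src) {
  double Trunc = std::trunc(Src);
  // Truncation rounds toward zero, so a positive, non-integral input must
  // be bumped up by one.
  return (Src > 0.0 && Src != Trunc) ? Trunc + 1.0 : Trunc;
}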
01694 
01695 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
01696   SDLoc SL(Op);
01697   SDValue Src = Op.getOperand(0);
01698 
01699   assert(Op.getValueType() == MVT::f64);
01700 
01701   const SDValue Zero = DAG.getConstant(0, MVT::i32);
01702   const SDValue One = DAG.getConstant(1, MVT::i32);
01703 
01704   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
01705 
01706   // Extract the upper half, since this is where we will find the sign and
01707   // exponent.
01708   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
01709 
01710   const unsigned FractBits = 52;
01711   const unsigned ExpBits = 11;
01712 
01713   // Extract the exponent.
01714   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_I32, SL, MVT::i32,
01715                                 Hi,
01716                                 DAG.getConstant(FractBits - 32, MVT::i32),
01717                                 DAG.getConstant(ExpBits, MVT::i32));
01718   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
01719                             DAG.getConstant(1023, MVT::i32));
01720 
01721   // Extract the sign bit.
01722   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32);
01723   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
01724 
01725   // Extend back to 64 bits.
01726   SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
01727                                   Zero, SignBit);
01728   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
01729 
01730   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
01731   const SDValue FractMask
01732     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, MVT::i64);
01733 
01734   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
01735   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
01736   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
01737 
01738   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32);
01739 
01740   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, MVT::i32);
01741 
01742   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
01743   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
01744 
01745   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
01746   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
01747 
01748   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
01749 }
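
// The same bit manipulation in scalar form (std::memcpy from <cstring>;
// a sketch only). The unbiased exponent decides how many significand bits
// are integer bits; everything below them is cleared.
static double ftruncRef(double Src) {
  uint64_t Bits;
  std::memcpy(&Bits, &Src, sizeof(Bits));
  int Exp = (int)((Bits >> 52) & 0x7ff) - 1023;
  if (Exp < 0) {                          // |Src| < 1: keep only the sign bit
    Bits &= UINT64_C(1) << 63;
    std::memcpy(&Src, &Bits, sizeof(Src));
    return Src;
  }
  if (Exp > 51)                           // already integral (or inf/nan)
    return Src;
  const uint64_t FractMask = (UINT64_C(1) << 52) - 1;
  Bits &= ~(FractMask >> Exp);            // drop the fractional bits
  std::memcpy(&Src, &Bits, sizeof(Src));
  return Src;
}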
01750 
01751 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
01752   SDLoc SL(Op);
01753   SDValue Src = Op.getOperand(0);
01754 
01755   assert(Op.getValueType() == MVT::f64);
01756 
01757   APFloat C1Val(APFloat::IEEEdouble, "0x1.0p+52");
01758   SDValue C1 = DAG.getConstantFP(C1Val, MVT::f64);
01759   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
01760 
01761   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
01762   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
01763 
01764   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
01765 
01766   APFloat C2Val(APFloat::IEEEdouble, "0x1.fffffffffffffp+51");
01767   SDValue C2 = DAG.getConstantFP(C2Val, MVT::f64);
01768 
01769   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
01770   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
01771 
01772   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
01773 }
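
// The trick above in scalar form: adding and then subtracting 2^52, a value
// with no fractional precision, forces a round-to-nearest-even at the
// integer boundary. A sketch assuming the default rounding mode
// (std::copysign/std::fabs from <cmath>):
static double frintRef(double Src) {
  const double C1 = 4503599627370496.0;        // 2^52
  double Magic = std::copysign(C1, Src);
  double Rounded = (Src + Magic) - Magic;
  // Inputs with |Src| > 2^52 - 0.5 are already integral, and adding 2^52
  // would discard low bits, so pass them through unchanged.
  return std::fabs(Src) > 4503599627370495.5 ? Src : Rounded;
}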
01774 
01775 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
01776   // FNEARBYINT and FRINT are the same, except in their handling of FP
01777   // exceptions. Those aren't really meaningful for us, and OpenCL only has
01778   // rint, so just treat them as equivalent.
01779   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
01780 }
01781 
01782 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
01783   SDLoc SL(Op);
01784   SDValue Src = Op.getOperand(0);
01785 
01786   // result = trunc(src);
01787   // if (src < 0.0 && src != result)
01788   //   result += -1.0.
01789 
01790   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
01791 
01792   const SDValue Zero = DAG.getConstantFP(0.0, MVT::f64);
01793   const SDValue NegOne = DAG.getConstantFP(-1.0, MVT::f64);
01794 
01795   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f64);
01796 
01797   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
01798   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
01799   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
01800 
01801   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
01802   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
01803 }
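
// The scalar equivalent of the select sequence above (std::trunc from
// <cmath>; a sketch only):
static double ffloorRef(double Src) {
  double Trunc = std::trunc(Src);
  // Truncation rounds toward zero, so a negative, non-integral input must
  // be pushed down by one.
  return (Src < 0.0 && Src != Trunc) ? Trunc - 1.0 : Trunc;
}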
01804 
01805 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
01806                                                SelectionDAG &DAG) const {
01807   SDValue S0 = Op.getOperand(0);
01808   SDLoc DL(Op);
01809   if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
01810     return SDValue();
01811 
01812   // f32 uint_to_fp i64
01813   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
01814                            DAG.getConstant(0, MVT::i32));
01815   SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
01816   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
01817                            DAG.getConstant(1, MVT::i32));
01818   SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
01819   FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
01820                         DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
01821   return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
01822 }
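
// The lowering above in scalar form: convert the two 32-bit halves
// separately and recombine them, scaling the high half by 2^32. A sketch;
// it rounds twice, exactly as the expanded DAG does.
static float uint64ToF32Ref(uint64_t S0) {
  float FloatLo = (float)(uint32_t)S0;          // low 32 bits
  float FloatHi = (float)(uint32_t)(S0 >> 32);  // high 32 bits
  return FloatLo + FloatHi * 4294967296.0f;     // FloatHi * 2^32
}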
01823 
01824 SDValue AMDGPUTargetLowering::ExpandSIGN_EXTEND_INREG(SDValue Op,
01825                                                       unsigned BitsDiff,
01826                                                       SelectionDAG &DAG) const {
01827   MVT VT = Op.getSimpleValueType();
01828   SDLoc DL(Op);
01829   SDValue Shift = DAG.getConstant(BitsDiff, VT);
01830   // Shift left by 'Shift' bits.
01831   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Op.getOperand(0), Shift);
01832   // Arithmetic shift right by 'Shift' bits.
01833   return DAG.getNode(ISD::SRA, DL, VT, Shl, Shift);
01834 }
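
// The shift pair above in scalar form for a 32-bit type: shifting left and
// then arithmetic-shifting right by the same amount sign-extends the low
// (32 - BitsDiff) bits. A sketch; the detour through uint32_t sidesteps the
// undefined behavior of left-shifting a negative value in C++.
static int32_t signExtendInRegRef(int32_t X, unsigned BitsDiff) {
  return (int32_t)((uint32_t)X << BitsDiff) >> BitsDiff;
}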
01835 
01836 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
01837                                                      SelectionDAG &DAG) const {
01838   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
01839   MVT VT = Op.getSimpleValueType();
01840   MVT ScalarVT = VT.getScalarType();
01841 
01842   if (!VT.isVector())
01843     return SDValue();
01844 
01845   SDValue Src = Op.getOperand(0);
01846   SDLoc DL(Op);
01847 
01848   // TODO: Don't scalarize on Evergreen?
01849   unsigned NElts = VT.getVectorNumElements();
01850   SmallVector<SDValue, 8> Args;
01851   DAG.ExtractVectorElements(Src, Args, 0, NElts);
01852 
01853   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
01854   for (unsigned I = 0; I < NElts; ++I)
01855     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
01856 
01857   return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
01858 }
01859 
01860 //===----------------------------------------------------------------------===//
01861 // Custom DAG optimizations
01862 //===----------------------------------------------------------------------===//
01863 
01864 static bool isU24(SDValue Op, SelectionDAG &DAG) {
01865   APInt KnownZero, KnownOne;
01866   EVT VT = Op.getValueType();
01867   DAG.computeKnownBits(Op, KnownZero, KnownOne);
01868 
01869   return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
01870 }
01871 
01872 static bool isI24(SDValue Op, SelectionDAG &DAG) {
01873   EVT VT = Op.getValueType();
01874 
01875   // In order for this to be a signed 24-bit value, bit 23 must be
01876   // a sign bit.
01877   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
01878                                      // as unsigned 24-bit values.
01879          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
01880 }
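
// The two predicates above, restated over concrete 32-bit constants: a value
// is a u24 if its top 8 bits are known zero, and an i24 if its top 9 bits
// are all copies of the sign bit. A sketch for illustration only.
static bool isU24Const(uint32_t V) {
  return (V >> 24) == 0;
}
static bool isI24Const(int32_t V) {
  return V == ((int32_t)((uint32_t)V << 8) >> 8);
}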
01881 
01882 static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
01883 
01884   SelectionDAG &DAG = DCI.DAG;
01885   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
01886   EVT VT = Op.getValueType();
01887 
01888   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
01889   APInt KnownZero, KnownOne;
01890   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
01891   if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
01892     DCI.CommitTargetLoweringOpt(TLO);
01893 }
01894 
01895 template <typename IntTy>
01896 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
01897                                uint32_t Offset, uint32_t Width) {
01898   if (Width + Offset < 32) {
01899     IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
01900     return DAG.getConstant(Result, MVT::i32);
01901   }
01902 
01903   return DAG.getConstant(Src0 >> Offset, MVT::i32);
01904 }
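
// For example, bfe_i32(0xff00, 8, 8) takes the Width + Offset < 32 path:
// (0xff00 << 16) >> 24 shifts the extracted byte up to the sign bit and
// back down, yielding the sign-extended result -1.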
01905 
01906 static bool usesAllNormalStores(SDNode *LoadVal) {
01907   for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
01908     if (!ISD::isNormalStore(*I))
01909       return false;
01910   }
01911 
01912   return true;
01913 }
01914 
01915 // If we have a copy of an illegal type, replace it with a load / store of an
01916 // equivalently sized legal type. This avoids intermediate bit pack / unpack
01917 // instructions emitted when handling extloads and truncstores. Ideally we could
01918 // recognize the pack / unpack pattern to eliminate it.
01919 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
01920                                                   DAGCombinerInfo &DCI) const {
01921   if (!DCI.isBeforeLegalize())
01922     return SDValue();
01923 
01924   StoreSDNode *SN = cast<StoreSDNode>(N);
01925   SDValue Value = SN->getValue();
01926   EVT VT = Value.getValueType();
01927 
01928   if (isTypeLegal(VT) || SN->isVolatile() || !ISD::isNormalLoad(Value.getNode()))
01929     return SDValue();
01930 
01931   LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
01932   if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
01933     return SDValue();
01934 
01935   EVT MemVT = LoadVal->getMemoryVT();
01936 
01937   SDLoc SL(N);
01938   SelectionDAG &DAG = DCI.DAG;
01939   EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
01940 
01941   SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
01942                                 LoadVT, SL,
01943                                 LoadVal->getChain(),
01944                                 LoadVal->getBasePtr(),
01945                                 LoadVal->getOffset(),
01946                                 LoadVT,
01947                                 LoadVal->getMemOperand());
01948 
01949   SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
01950   DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
01951 
01952   return DAG.getStore(SN->getChain(), SL, NewLoad,
01953                       SN->getBasePtr(), SN->getMemOperand());
01954 }
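
// For example, a <4 x i8> value that is loaded and then only stored is
// reloaded as a plain i32 (its equivalent 32-bit memory type), bitcast back
// for any remaining uses of the vector value, and stored as an i32, avoiding
// the pack/unpack sequence the illegal type would otherwise require.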
01955 
01956 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
01957                                                 DAGCombinerInfo &DCI) const {
01958   EVT VT = N->getValueType(0);
01959 
01960   if (VT.isVector() || VT.getSizeInBits() > 32)
01961     return SDValue();
01962 
01963   SelectionDAG &DAG = DCI.DAG;
01964   SDLoc DL(N);
01965 
01966   SDValue N0 = N->getOperand(0);
01967   SDValue N1 = N->getOperand(1);
01968   SDValue Mul;
01969 
01970   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
01971     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
01972     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
01973     Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
01974   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
01975     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
01976     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
01977     Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
01978   } else {
01979     return SDValue();
01980   }
01981 
01982   // We need to use sext even for MUL_U24, because MUL_U24 is used
01983   // for signed multiply of 8 and 16-bit types.
01984   return DAG.getSExtOrTrunc(Mul, DL, VT);
01985 }
01986 
01987 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
01988                                                 DAGCombinerInfo &DCI) const {
01989   SelectionDAG &DAG = DCI.DAG;
01990   SDLoc DL(N);
01991 
01992   switch (N->getOpcode()) {
01993   default: break;
01994   case ISD::MUL:
01995     return performMulCombine(N, DCI);
01996   case AMDGPUISD::MUL_I24:
01997   case AMDGPUISD::MUL_U24: {
01998     SDValue N0 = N->getOperand(0);
01999     SDValue N1 = N->getOperand(1);
02000     simplifyI24(N0, DCI);
02001     simplifyI24(N1, DCI);
02002     return SDValue();
02003   }
02004   case ISD::SELECT_CC: {
02005     return CombineMinMax(N, DAG);
02006   }
02007   case AMDGPUISD::BFE_I32:
02008   case AMDGPUISD::BFE_U32: {
02009     assert(!N->getValueType(0).isVector() &&
02010            "Vector handling of BFE not implemented");
02011     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
02012     if (!Width)
02013       break;
02014 
02015     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
02016     if (WidthVal == 0)
02017       return DAG.getConstant(0, MVT::i32);
02018 
02019     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
02020     if (!Offset)
02021       break;
02022 
02023     SDValue BitsFrom = N->getOperand(0);
02024     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
02025 
02026     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
02027 
02028     if (OffsetVal == 0) {
02029       // This is already sign / zero extended, so try to fold away extra BFEs.
02030       unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
02031 
02032       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
02033       if (OpSignBits >= SignBits)
02034         return BitsFrom;
02035 
02036       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
02037       if (Signed) {
02038         // This is a sign_extend_inreg. Replace it to take advantage of existing
02039         // DAG Combines. If not eliminated, we will match back to BFE during
02040         // selection.
02041 
02042         // TODO: The sext_inreg of extended types ends up here, although we
02043         // could handle them in a single BFE.
02044         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
02045                            DAG.getValueType(SmallVT));
02046       }
02047 
02048       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
02049     }
02050 
02051     if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
02052       if (Signed) {
02053         return constantFoldBFE<int32_t>(DAG,
02054                                         Val->getSExtValue(),
02055                                         OffsetVal,
02056                                         WidthVal);
02057       }
02058 
02059       return constantFoldBFE<uint32_t>(DAG,
02060                                        Val->getZExtValue(),
02061                                        OffsetVal,
02062                                        WidthVal);
02063     }
02064 
02065     APInt Demanded = APInt::getBitsSet(32,
02066                                        OffsetVal,
02067                                        OffsetVal + WidthVal);
02068 
02069     if ((OffsetVal + WidthVal) >= 32) {
02070       SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
02071       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
02072                          BitsFrom, ShiftVal);
02073     }
02074 
02075     APInt KnownZero, KnownOne;
02076     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
02077                                           !DCI.isBeforeLegalizeOps());
02078     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
02079     if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
02080         TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
02081       DCI.CommitTargetLoweringOpt(TLO);
02082     }
02083 
02084     break;
02085   }
02086 
02087   case ISD::STORE:
02088     return performStoreCombine(N, DCI);
02089   }
02090   return SDValue();
02091 }
02092 
02093 //===----------------------------------------------------------------------===//
02094 // Helper functions
02095 //===----------------------------------------------------------------------===//
02096 
02097 void AMDGPUTargetLowering::getOriginalFunctionArgs(
02098                                SelectionDAG &DAG,
02099                                const Function *F,
02100                                const SmallVectorImpl<ISD::InputArg> &Ins,
02101                                SmallVectorImpl<ISD::InputArg> &OrigIns) const {
02102 
02103   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
02104     if (Ins[i].ArgVT == Ins[i].VT) {
02105       OrigIns.push_back(Ins[i]);
02106       continue;
02107     }
02108 
02109     EVT VT;
02110     if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
02111       // Vector has been split into scalars.
02112       VT = Ins[i].ArgVT.getVectorElementType();
02113     } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
02114                Ins[i].ArgVT.getVectorElementType() !=
02115                Ins[i].VT.getVectorElementType()) {
02116       // Vector elements have been promoted
02117       VT = Ins[i].ArgVT;
02118     } else {
02119       // Vector has been split into smaller vectors.
02120       VT = Ins[i].VT;
02121     }
02122 
02123     ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
02124                       Ins[i].OrigArgIndex, Ins[i].PartOffset);
02125     OrigIns.push_back(Arg);
02126   }
02127 }
02128 
02129 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
02130   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
02131     return CFP->isExactlyValue(1.0);
02132   }
02133   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
02134     return C->isAllOnesValue();
02135   }
02136   return false;
02137 }
02138 
02139 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
02140   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
02141     return CFP->getValueAPF().isZero();
02142   }
02143   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
02144     return C->isNullValue();
02145   }
02146   return false;
02147 }
02148 
02149 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
02150                                                   const TargetRegisterClass *RC,
02151                                                    unsigned Reg, EVT VT) const {
02152   MachineFunction &MF = DAG.getMachineFunction();
02153   MachineRegisterInfo &MRI = MF.getRegInfo();
02154   unsigned VirtualRegister;
02155   if (!MRI.isLiveIn(Reg)) {
02156     VirtualRegister = MRI.createVirtualRegister(RC);
02157     MRI.addLiveIn(Reg, VirtualRegister);
02158   } else {
02159     VirtualRegister = MRI.getLiveInVirtReg(Reg);
02160   }
02161   return DAG.getRegister(VirtualRegister, VT);
02162 }
02163 
02164 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
02165 
02166 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
02167   switch (Opcode) {
02168   default: return nullptr;
02169   // AMDIL DAG nodes
02170   NODE_NAME_CASE(CALL)
02171   NODE_NAME_CASE(UMUL)
02172   NODE_NAME_CASE(RET_FLAG)
02173   NODE_NAME_CASE(BRANCH_COND)
02174 
02175   // AMDGPU DAG nodes
02176   NODE_NAME_CASE(DWORDADDR)
02177   NODE_NAME_CASE(FRACT)
02178   NODE_NAME_CASE(CLAMP)
02179   NODE_NAME_CASE(MAD)
02180   NODE_NAME_CASE(FMAX)
02181   NODE_NAME_CASE(SMAX)
02182   NODE_NAME_CASE(UMAX)
02183   NODE_NAME_CASE(FMIN)
02184   NODE_NAME_CASE(SMIN)
02185   NODE_NAME_CASE(UMIN)
02186   NODE_NAME_CASE(URECIP)
02187   NODE_NAME_CASE(DIV_SCALE)
02188   NODE_NAME_CASE(DIV_FMAS)
02189   NODE_NAME_CASE(DIV_FIXUP)
02190   NODE_NAME_CASE(TRIG_PREOP)
02191   NODE_NAME_CASE(RCP)
02192   NODE_NAME_CASE(RSQ)
02193   NODE_NAME_CASE(RSQ_LEGACY)
02194   NODE_NAME_CASE(RSQ_CLAMPED)
02195   NODE_NAME_CASE(LDEXP)
02196   NODE_NAME_CASE(DOT4)
02197   NODE_NAME_CASE(BFE_U32)
02198   NODE_NAME_CASE(BFE_I32)
02199   NODE_NAME_CASE(BFI)
02200   NODE_NAME_CASE(BFM)
02201   NODE_NAME_CASE(BREV)
02202   NODE_NAME_CASE(MUL_U24)
02203   NODE_NAME_CASE(MUL_I24)
02204   NODE_NAME_CASE(MAD_U24)
02205   NODE_NAME_CASE(MAD_I24)
02206   NODE_NAME_CASE(EXPORT)
02207   NODE_NAME_CASE(CONST_ADDRESS)
02208   NODE_NAME_CASE(REGISTER_LOAD)
02209   NODE_NAME_CASE(REGISTER_STORE)
02210   NODE_NAME_CASE(LOAD_CONSTANT)
02211   NODE_NAME_CASE(LOAD_INPUT)
02212   NODE_NAME_CASE(SAMPLE)
02213   NODE_NAME_CASE(SAMPLEB)
02214   NODE_NAME_CASE(SAMPLED)
02215   NODE_NAME_CASE(SAMPLEL)
02216   NODE_NAME_CASE(CVT_F32_UBYTE0)
02217   NODE_NAME_CASE(CVT_F32_UBYTE1)
02218   NODE_NAME_CASE(CVT_F32_UBYTE2)
02219   NODE_NAME_CASE(CVT_F32_UBYTE3)
02220   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
02221   NODE_NAME_CASE(CONST_DATA_PTR)
02222   NODE_NAME_CASE(STORE_MSKOR)
02223   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
02224   }
02225 }
02226 
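// The result of a min or max is always one of its two operands, so any bit
// known in both operands is known in the result; intersecting the known-bit
// masks is therefore conservatively correct.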
02227 static void computeKnownBitsForMinMax(const SDValue Op0,
02228                                       const SDValue Op1,
02229                                       APInt &KnownZero,
02230                                       APInt &KnownOne,
02231                                       const SelectionDAG &DAG,
02232                                       unsigned Depth) {
02233   APInt Op0Zero, Op0One;
02234   APInt Op1Zero, Op1One;
02235   DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
02236   DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
02237 
02238   KnownZero = Op0Zero & Op1Zero;
02239   KnownOne = Op0One & Op1One;
02240 }
02241 
02242 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
02243   const SDValue Op,
02244   APInt &KnownZero,
02245   APInt &KnownOne,
02246   const SelectionDAG &DAG,
02247   unsigned Depth) const {
02248 
02249   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
02250 
02251   APInt KnownZero2;
02252   APInt KnownOne2;
02253   unsigned Opc = Op.getOpcode();
02254 
02255   switch (Opc) {
02256   default:
02257     break;
02258   case ISD::INTRINSIC_WO_CHAIN: {
02259     // FIXME: The intrinsic should just use the node.
02260     switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
02261     case AMDGPUIntrinsic::AMDGPU_imax:
02262     case AMDGPUIntrinsic::AMDGPU_umax:
02263     case AMDGPUIntrinsic::AMDGPU_imin:
02264     case AMDGPUIntrinsic::AMDGPU_umin:
02265       computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
02266                                 KnownZero, KnownOne, DAG, Depth);
02267       break;
02268     default:
02269       break;
02270     }
02271 
02272     break;
02273   }
02274   case AMDGPUISD::SMAX:
02275   case AMDGPUISD::UMAX:
02276   case AMDGPUISD::SMIN:
02277   case AMDGPUISD::UMIN:
02278     computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
02279                               KnownZero, KnownOne, DAG, Depth);
02280     break;
02281 
02282   case AMDGPUISD::BFE_I32:
02283   case AMDGPUISD::BFE_U32: {
02284     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
02285     if (!CWidth)
02286       return;
02287 
02288     unsigned BitWidth = 32;
02289     uint32_t Width = CWidth->getZExtValue() & 0x1f;
02290     if (Width == 0) {
02291       KnownZero = APInt::getAllOnesValue(BitWidth);
02292       KnownOne = APInt::getNullValue(BitWidth);
02293       return;
02294     }
02295 
02296     // FIXME: This could do a lot more. If the offset is 0, this should match
02297     // the sign_extend_inreg implementation, but that involves duplicating it.
02298     if (Opc == AMDGPUISD::BFE_I32)
02299       KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
02300     else
02301       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
02302 
02303     break;
02304   }
02305   }
02306 }
02307 
02308 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
02309   SDValue Op,
02310   const SelectionDAG &DAG,
02311   unsigned Depth) const {
02312   switch (Op.getOpcode()) {
02313   case AMDGPUISD::BFE_I32: {
02314     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
02315     if (!Width)
02316       return 1;
02317 
02318     unsigned SignBits = 32 - Width->getZExtValue() + 1;
02319     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
02320     if (!Offset || !Offset->isNullValue())
02321       return SignBits;
02322 
02323     // TODO: Could probably figure something out with non-0 offsets.
02324     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
02325     return std::max(SignBits, Op0SignBits);
02326   }
02327 
02328   case AMDGPUISD::BFE_U32: {
02329     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
02330     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
02331   }
02332 
02333   default:
02334     return 1;
02335   }
02336 }