SIISelLowering.cpp

00001 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 /// \file
00011 /// \brief Custom DAG lowering for SI
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #ifdef _MSC_VER
00016 // Provide M_PI.
00017 #define _USE_MATH_DEFINES
00018 #include <cmath>
00019 #endif
00020 
00021 #include "SIISelLowering.h"
00022 #include "AMDGPU.h"
00023 #include "AMDGPUIntrinsicInfo.h"
00024 #include "AMDGPUSubtarget.h"
00025 #include "SIInstrInfo.h"
00026 #include "SIMachineFunctionInfo.h"
00027 #include "SIRegisterInfo.h"
00028 #include "llvm/ADT/BitVector.h"
00029 #include "llvm/CodeGen/CallingConvLower.h"
00030 #include "llvm/CodeGen/MachineInstrBuilder.h"
00031 #include "llvm/CodeGen/MachineRegisterInfo.h"
00032 #include "llvm/CodeGen/SelectionDAG.h"
00033 #include "llvm/IR/Function.h"
00034 #include "llvm/ADT/SmallString.h"
00035 
00036 using namespace llvm;
00037 
00038 SITargetLowering::SITargetLowering(TargetMachine &TM) :
00039     AMDGPUTargetLowering(TM) {
00040   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
00041   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
00042 
00043   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
00044   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
00045 
00046   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
00047   addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
00048 
00049   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
00050   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
00051   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
00052 
00053   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
00054   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
00055 
00056   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
00057   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
00058 
00059   addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
00060   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
00061 
00062   computeRegisterProperties();
00063 
00064   // Condition Codes
00065   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
00066   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
00067   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
00068   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
00069   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
00070   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
00071 
00072   setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
00073   setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
00074   setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
00075   setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
00076   setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
00077   setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
00078 
00079   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
00080   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
00081   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
00082   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
00083 
00084   setOperationAction(ISD::ADD, MVT::i32, Legal);
00085   setOperationAction(ISD::ADDC, MVT::i32, Legal);
00086   setOperationAction(ISD::ADDE, MVT::i32, Legal);
00087   setOperationAction(ISD::SUBC, MVT::i32, Legal);
00088   setOperationAction(ISD::SUBE, MVT::i32, Legal);
00089 
00090   setOperationAction(ISD::FSIN, MVT::f32, Custom);
00091   setOperationAction(ISD::FCOS, MVT::f32, Custom);
00092 
00093   // We need to custom lower vector loads and stores from local memory
00094   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
00095   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
00096   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
00097 
00098   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
00099   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
00100 
00101   setOperationAction(ISD::STORE, MVT::i1, Custom);
00102   setOperationAction(ISD::STORE, MVT::i32, Custom);
00103   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
00104   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
00105 
00106   setOperationAction(ISD::SELECT, MVT::f32, Promote);
00107   AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
00108   setOperationAction(ISD::SELECT, MVT::i64, Custom);
00109   setOperationAction(ISD::SELECT, MVT::f64, Promote);
00110   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
00111 
00112   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
00113   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
00114   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
00115   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
00116 
00117   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
00118   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
00119 
00120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
00121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
00122   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
00123 
00124   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
00125   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
00126   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
00127 
00128   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
00129   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
00130   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
00131 
00132   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
00133 
00134   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
00135 
00136   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
00137   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
00138   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
00139   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
00140 
00141   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
00142   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
00143 
00144   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00145   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
00146   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
00147   setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
00148   setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
00149   setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
00150 
00151   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
00152   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
00153   setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
00154   setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
00155 
00156   setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
00157   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
00158   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
00159   setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
00160   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
00161 
00162   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
00163   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
00164   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00165   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00166   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
00167   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
00168 
00169   setOperationAction(ISD::LOAD, MVT::i1, Custom);
00170 
00171   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
00172   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
00173 
00174   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
00175   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
00176   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
00177 
00178   // These should use UDIVREM, so set them to expand
00179   setOperationAction(ISD::UDIV, MVT::i64, Expand);
00180   setOperationAction(ISD::UREM, MVT::i64, Expand);
00181 
00182   // We only support LOAD/STORE and vector manipulation ops for vectors
00183   // with > 4 elements.
00184   MVT VecTypes[] = {
00185     MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
00186   };
00187 
00188   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
00189   setOperationAction(ISD::SELECT, MVT::i1, Promote);
00190 
00191   for (MVT VT : VecTypes) {
00192     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
00193       switch(Op) {
00194       case ISD::LOAD:
00195       case ISD::STORE:
00196       case ISD::BUILD_VECTOR:
00197       case ISD::BITCAST:
00198       case ISD::EXTRACT_VECTOR_ELT:
00199       case ISD::INSERT_VECTOR_ELT:
00200       case ISD::INSERT_SUBVECTOR:
00201       case ISD::EXTRACT_SUBVECTOR:
00202         break;
00203       case ISD::CONCAT_VECTORS:
00204         setOperationAction(Op, VT, Custom);
00205         break;
00206       default:
00207         setOperationAction(Op, VT, Expand);
00208         break;
00209       }
00210     }
00211   }
00212 
00213   for (int I = MVT::v1f64; I <= MVT::v8f64; ++I) {
00214     MVT::SimpleValueType VT = static_cast<MVT::SimpleValueType>(I);
00215     setOperationAction(ISD::FTRUNC, VT, Expand);
00216     setOperationAction(ISD::FCEIL, VT, Expand);
00217     setOperationAction(ISD::FFLOOR, VT, Expand);
00218   }
00219 
00220   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
00221     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
00222     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
00223     setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
00224     setOperationAction(ISD::FRINT, MVT::f64, Legal);
00225   }
00226 
00227   setOperationAction(ISD::FDIV, MVT::f32, Custom);
00228 
00229   setTargetDAGCombine(ISD::FSUB);
00230   setTargetDAGCombine(ISD::SELECT_CC);
00231   setTargetDAGCombine(ISD::SETCC);
00232 
00233   setTargetDAGCombine(ISD::UINT_TO_FP);
00234 
00235   // All memory operations. Some folding on the pointer operand is done to help
00236   // match the constant offsets in the addressing modes.
00237   setTargetDAGCombine(ISD::LOAD);
00238   setTargetDAGCombine(ISD::STORE);
00239   setTargetDAGCombine(ISD::ATOMIC_LOAD);
00240   setTargetDAGCombine(ISD::ATOMIC_STORE);
00241   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
00242   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
00243   setTargetDAGCombine(ISD::ATOMIC_SWAP);
00244   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
00245   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
00246   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
00247   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
00248   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
00249   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
00250   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
00251   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
00252   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
00253   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
00254 
00255   setSchedulingPreference(Sched::RegPressure);
00256 }
00257 
00258 //===----------------------------------------------------------------------===//
00259 // TargetLowering queries
00260 //===----------------------------------------------------------------------===//
00261 
00262 // FIXME: This really needs an address space argument. The immediate offset
00263 // size is different for the different memory instruction sets.
00264 
00265 // The single offset DS instructions have a 16-bit unsigned byte offset.
00266 //
00267 // MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
00268 // r + i with addr64. 32-bit has more addressing mode options. Depending on the
00269 // resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
00270 //
00271 // SMRD instructions have an 8-bit, dword offset.
00272 //
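// For example (illustrative): under the rules below, "base + 0x1234" (r+i with
// an unsigned 16-bit immediate) and "base + index" (r+r) are accepted, while
// "base + 4*index" (Scale > 2) or any GlobalValue base is rejected.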
00273 bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
00274                                              Type *Ty) const {
00275   // No global is ever allowed as a base.
00276   if (AM.BaseGV)
00277     return false;
00278 
00279   // Allow a 16-bit unsigned immediate field, since this is what DS instructions
00280   // use.
00281   if (!isUInt<16>(AM.BaseOffs))
00282     return false;
00283 
00284   // Only support r+r,
00285   switch (AM.Scale) {
00286   case 0:  // "r+i" or just "i", depending on HasBaseReg.
00287     break;
00288   case 1:
00289     if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
00290       return false;
00291     // Otherwise we have r+r or r+i.
00292     break;
00293   case 2:
00294     if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
00295       return false;
00296     // Allow 2*r as r+r.
00297     break;
00298   default: // Don't allow n * r
00299     return false;
00300   }
00301 
00302   return true;
00303 }
00304 
00305 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
00306                                                       unsigned AddrSpace,
00307                                                       unsigned Align,
00308                                                       bool *IsFast) const {
00309   if (IsFast)
00310     *IsFast = false;
00311 
00312   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
00313   // which isn't a simple VT.
00314   if (!VT.isSimple() || VT == MVT::Other)
00315     return false;
00316 
00317   // XXX - The CI changes say "Support for unaligned memory accesses", but I
00318   // don't see specifically what that refers to. The wording everywhere else
00319   // seems to be the same.
00320 
00321   // XXX - The only mention I see of this in the ISA manual is for LDS direct
00322   // reads, where the byte address "must be dword aligned". Is this also true
00323   // for the normal loads and stores?
00324   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
00325     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
00326     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
00327     // with adjacent offsets.
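    // Sketch (assuming the DS offset fields behave as described above): an
    // 8-byte access at a 4-byte aligned address A can be split into dwords at
    // A and A+4, expressed through the two independent offset0/offset1 fields
    // of ds_read2_b32 / ds_write2_b32.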
00328     return Align % 4 == 0;
00329   }
00330 
00331   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
00332   // byte-address are ignored, thus forcing Dword alignment.
00333   // This applies to private, global, and constant memory.
00334   if (IsFast)
00335     *IsFast = true;
00336   return VT.bitsGT(MVT::i32);
00337 }
00338 
00339 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
00340                                           unsigned SrcAlign, bool IsMemset,
00341                                           bool ZeroMemset,
00342                                           bool MemcpyStrSrc,
00343                                           MachineFunction &MF) const {
00344   // FIXME: Should account for address space here.
00345 
00346   // The default fallback uses the private pointer size as a guess for a type to
00347   // use. Make sure we switch these to 64-bit or wider accesses.
00348 
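  // For example (illustrative): a 32-byte, 4-byte aligned copy would be emitted
  // as v4i32 (16-byte) chunks under this heuristic, with any remainder handled
  // by the generic expansion.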
00349   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
00350     return MVT::v4i32;
00351 
00352   if (Size >= 8 && DstAlign >= 4)
00353     return MVT::v2i32;
00354 
00355   // Use the default.
00356   return MVT::Other;
00357 }
00358 
00359 TargetLoweringBase::LegalizeTypeAction
00360 SITargetLowering::getPreferredVectorAction(EVT VT) const {
00361   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
00362     return TypeSplitVector;
00363 
00364   return TargetLoweringBase::getPreferredVectorAction(VT);
00365 }
00366 
00367 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
00368                                                          Type *Ty) const {
00369   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
00370       getTargetMachine().getSubtargetImpl()->getInstrInfo());
00371   return TII->isInlineConstant(Imm);
00372 }
00373 
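/// \brief Load a kernel argument from the constant address space.
///
/// The value of memory type \p MemVT is read at (argument-list pointer in
/// SGPR0_SGPR1) + \p Offset and extended to \p VT (sign-extended when
/// \p Signed is set). The load is marked invariant and non-temporal so it can
/// be freely rescheduled.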
00374 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
00375                                          SDLoc SL, SDValue Chain,
00376                                          unsigned Offset, bool Signed) const {
00377   const DataLayout *DL = getDataLayout();
00378 
00379   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
00380 
00381   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
00382   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
00383   SDValue BasePtr =  DAG.getCopyFromReg(Chain, SL,
00384                            MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
00385   SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, BasePtr,
00386                                              DAG.getConstant(Offset, MVT::i64));
00387   SDValue PtrOffset = DAG.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
00388   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
00389 
00390   return DAG.getLoad(ISD::UNINDEXED, Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD,
00391                      VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
00392                      false, // isVolatile
00393                      true, // isNonTemporal
00394                      true, // isInvariant
00395                      DL->getABITypeAlignment(Ty)); // Alignment
00396 }
00397 
00398 SDValue SITargetLowering::LowerFormalArguments(
00399                                       SDValue Chain,
00400                                       CallingConv::ID CallConv,
00401                                       bool isVarArg,
00402                                       const SmallVectorImpl<ISD::InputArg> &Ins,
00403                                       SDLoc DL, SelectionDAG &DAG,
00404                                       SmallVectorImpl<SDValue> &InVals) const {
00405 
00406   const TargetRegisterInfo *TRI =
00407       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
00408 
00409   MachineFunction &MF = DAG.getMachineFunction();
00410   FunctionType *FType = MF.getFunction()->getFunctionType();
00411   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
00412 
00413   assert(CallConv == CallingConv::C);
00414 
00415   SmallVector<ISD::InputArg, 16> Splits;
00416   BitVector Skipped(Ins.size());
00417 
00418   for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
00419     const ISD::InputArg &Arg = Ins[i];
00420 
00421     // First, check if it's a PS input addr
00422     if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
00423         !Arg.Flags.isByVal()) {
00424 
00425       assert((PSInputNum <= 15) && "Too many PS inputs!");
00426 
00427       if (!Arg.Used) {
00428         // We can safely skip PS inputs
00429         Skipped.set(i);
00430         ++PSInputNum;
00431         continue;
00432       }
00433 
00434       Info->PSInputAddr |= 1 << PSInputNum++;
00435     }
00436 
00437     // Second, split vertices into their elements
00438     if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
00439       ISD::InputArg NewArg = Arg;
00440       NewArg.Flags.setSplit();
00441       NewArg.VT = Arg.VT.getVectorElementType();
00442 
00443       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
00444       // three or five element vertex only needs three or five registers,
00445       // NOT four or eight.
00446       Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
00447       unsigned NumElements = ParamType->getVectorNumElements();
00448 
00449       for (unsigned j = 0; j != NumElements; ++j) {
00450         Splits.push_back(NewArg);
00451         NewArg.PartOffset += NewArg.VT.getStoreSize();
00452       }
00453 
00454     } else if (Info->getShaderType() != ShaderType::COMPUTE) {
00455       Splits.push_back(Arg);
00456     }
00457   }
00458 
00459   SmallVector<CCValAssign, 16> ArgLocs;
00460   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
00461                  *DAG.getContext());
00462 
00463   // At least one interpolation mode must be enabled or else the GPU will hang.
00464   if (Info->getShaderType() == ShaderType::PIXEL &&
00465       (Info->PSInputAddr & 0x7F) == 0) {
00466     Info->PSInputAddr |= 1;
00467     CCInfo.AllocateReg(AMDGPU::VGPR0);
00468     CCInfo.AllocateReg(AMDGPU::VGPR1);
00469   }
00470 
00471   // The pointer to the list of arguments is stored in SGPR0, SGPR1
00472   // The pointer to the scratch buffer is stored in SGPR2, SGPR3
00473   if (Info->getShaderType() == ShaderType::COMPUTE) {
00474     Info->NumUserSGPRs = 4;
00475     CCInfo.AllocateReg(AMDGPU::SGPR0);
00476     CCInfo.AllocateReg(AMDGPU::SGPR1);
00477     CCInfo.AllocateReg(AMDGPU::SGPR2);
00478     CCInfo.AllocateReg(AMDGPU::SGPR3);
00479     MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
00480     MF.addLiveIn(AMDGPU::SGPR2_SGPR3, &AMDGPU::SReg_64RegClass);
00481   }
00482 
00483   if (Info->getShaderType() == ShaderType::COMPUTE) {
00484     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
00485                             Splits);
00486   }
00487 
00488   AnalyzeFormalArguments(CCInfo, Splits);
00489 
00490   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
00491 
00492     const ISD::InputArg &Arg = Ins[i];
00493     if (Skipped[i]) {
00494       InVals.push_back(DAG.getUNDEF(Arg.VT));
00495       continue;
00496     }
00497 
00498     CCValAssign &VA = ArgLocs[ArgIdx++];
00499     EVT VT = VA.getLocVT();
00500 
00501     if (VA.isMemLoc()) {
00502       VT = Ins[i].VT;
00503       EVT MemVT = Splits[i].VT;
00504       // The first 36 bytes of the input buffer contain information about
00505       // thread group and global sizes.
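      // (The r600_read_* intrinsics lowered in LowerINTRINSIC_WO_CHAIN below
      // read those group/global/local sizes at byte offsets 0-32, which is why
      // user arguments start at byte offset 36 here.)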
00506       SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
00507                                    36 + VA.getLocMemOffset(),
00508                                    Ins[i].Flags.isSExt());
00509 
00510       const PointerType *ParamTy =
00511           dyn_cast<PointerType>(FType->getParamType(Ins[i].OrigArgIndex));
00512       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
00513           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
00514         // On SI local pointers are just offsets into LDS, so they are always
00515         // less than 16-bits.  On CI and newer they could potentially be
00516         // real pointers, so we can't guarantee their size.
00517         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
00518                           DAG.getValueType(MVT::i16));
00519       }
00520 
00521       InVals.push_back(Arg);
00522       continue;
00523     }
00524     assert(VA.isRegLoc() && "Parameter must be in a register!");
00525 
00526     unsigned Reg = VA.getLocReg();
00527 
00528     if (VT == MVT::i64) {
00529       // For now assume it is a pointer
00530       Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
00531                                      &AMDGPU::SReg_64RegClass);
00532       Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
00533       InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
00534       continue;
00535     }
00536 
00537     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
00538 
00539     Reg = MF.addLiveIn(Reg, RC);
00540     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
00541 
00542     if (Arg.VT.isVector()) {
00543 
00544       // Build a vector from the registers
00545       Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
00546       unsigned NumElements = ParamType->getVectorNumElements();
00547 
00548       SmallVector<SDValue, 4> Regs;
00549       Regs.push_back(Val);
00550       for (unsigned j = 1; j != NumElements; ++j) {
00551         Reg = ArgLocs[ArgIdx++].getLocReg();
00552         Reg = MF.addLiveIn(Reg, RC);
00553         Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
00554       }
00555 
00556       // Fill up the missing vector elements
00557       NumElements = Arg.VT.getVectorNumElements() - NumElements;
00558       for (unsigned j = 0; j != NumElements; ++j)
00559         Regs.push_back(DAG.getUNDEF(VT));
00560 
00561       InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
00562       continue;
00563     }
00564 
00565     InVals.push_back(Val);
00566   }
00567   return Chain;
00568 }
00569 
00570 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
00571     MachineInstr * MI, MachineBasicBlock * BB) const {
00572 
00573   MachineBasicBlock::iterator I = *MI;
00574   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
00575       getTargetMachine().getSubtargetImpl()->getInstrInfo());
00576   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
00577 
00578   switch (MI->getOpcode()) {
00579   default:
00580     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
00581   case AMDGPU::BRANCH: return BB;
00582   case AMDGPU::SI_ADDR64_RSRC: {
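    // Builds a 128-bit buffer resource descriptor: the low 64 bits come from
    // operand 1 (the 64-bit address), and the high 64 bits are assembled from
    // a zero low dword plus the upper half of RSRC_DATA_FORMAT.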
00583     unsigned SuperReg = MI->getOperand(0).getReg();
00584     unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
00585     unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
00586     unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
00587     unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
00588     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
00589             .addOperand(MI->getOperand(1));
00590     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
00591             .addImm(0);
00592     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
00593             .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
00594     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
00595             .addReg(SubRegHiLo)
00596             .addImm(AMDGPU::sub0)
00597             .addReg(SubRegHiHi)
00598             .addImm(AMDGPU::sub1);
00599     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
00600             .addReg(SubRegLo)
00601             .addImm(AMDGPU::sub0_sub1)
00602             .addReg(SubRegHi)
00603             .addImm(AMDGPU::sub2_sub3);
00604     MI->eraseFromParent();
00605     break;
00606   }
00607   case AMDGPU::SI_BUFFER_RSRC: {
00608     unsigned SuperReg = MI->getOperand(0).getReg();
00609     unsigned Args[4];
00610     for (unsigned i = 0, e = 4; i < e; ++i) {
00611       MachineOperand &Arg = MI->getOperand(i + 1);
00612 
00613       if (Arg.isReg()) {
00614         Args[i] = Arg.getReg();
00615         continue;
00616       }
00617 
00618       assert(Arg.isImm());
00619       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
00620       BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), Reg)
00621               .addImm(Arg.getImm());
00622       Args[i] = Reg;
00623     }
00624     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE),
00625             SuperReg)
00626             .addReg(Args[0])
00627             .addImm(AMDGPU::sub0)
00628             .addReg(Args[1])
00629             .addImm(AMDGPU::sub1)
00630             .addReg(Args[2])
00631             .addImm(AMDGPU::sub2)
00632             .addReg(Args[3])
00633             .addImm(AMDGPU::sub3);
00634     MI->eraseFromParent();
00635     break;
00636   }
00637   case AMDGPU::V_SUB_F64: {
00638     unsigned DestReg = MI->getOperand(0).getReg();
00639     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), DestReg)
00640       .addImm(0)  // SRC0 modifiers
00641       .addReg(MI->getOperand(1).getReg())
00642       .addImm(1)  // SRC1 modifiers
00643       .addReg(MI->getOperand(2).getReg())
00644       .addImm(0)  // CLAMP
00645       .addImm(0); // OMOD
00646     MI->eraseFromParent();
00647     break;
00648   }
00649   case AMDGPU::SI_RegisterStorePseudo: {
00650     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
00651     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
00652     MachineInstrBuilder MIB =
00653         BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
00654                 Reg);
00655     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
00656       MIB.addOperand(MI->getOperand(i));
00657 
00658     MI->eraseFromParent();
00659     break;
00660   }
00661   case AMDGPU::FCLAMP_SI: {
00662     const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
00663         getTargetMachine().getSubtargetImpl()->getInstrInfo());
00664     DebugLoc DL = MI->getDebugLoc();
00665     unsigned DestReg = MI->getOperand(0).getReg();
00666     BuildMI(*BB, I, DL, TII->get(AMDGPU::V_ADD_F32_e64), DestReg)
00667       .addImm(0) // SRC0 modifiers
00668       .addOperand(MI->getOperand(1))
00669       .addImm(0) // SRC1 modifiers
00670       .addImm(0) // SRC1
00671       .addImm(1) // CLAMP
00672       .addImm(0); // OMOD
00673     MI->eraseFromParent();
00674   }
00675   }
00676   return BB;
00677 }
00678 
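// For example, a setcc on v4f32 operands is given the result type v4i1 below,
// while scalar compares get i1.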
00679 EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
00680   if (!VT.isVector()) {
00681     return MVT::i1;
00682   }
00683   return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
00684 }
00685 
00686 MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
00687   return MVT::i32;
00688 }
00689 
00690 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
00691   VT = VT.getScalarType();
00692 
00693   if (!VT.isSimple())
00694     return false;
00695 
00696   switch (VT.getSimpleVT().SimpleTy) {
00697   case MVT::f32:
00698     return false; /* There is V_MAD_F32 for f32 */
00699   case MVT::f64:
00700     return true;
00701   default:
00702     break;
00703   }
00704 
00705   return false;
00706 }
00707 
00708 //===----------------------------------------------------------------------===//
00709 // Custom DAG Lowering Operations
00710 //===----------------------------------------------------------------------===//
00711 
00712 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
00713   switch (Op.getOpcode()) {
00714   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
00715   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
00716   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
00717   case ISD::LOAD: {
00718     SDValue Result = LowerLOAD(Op, DAG);
00719     assert((!Result.getNode() ||
00720             Result.getNode()->getNumValues() == 2) &&
00721            "Load should return a value and a chain");
00722     return Result;
00723   }
00724 
00725   case ISD::FSIN:
00726   case ISD::FCOS:
00727     return LowerTrig(Op, DAG);
00728   case ISD::SELECT: return LowerSELECT(Op, DAG);
00729   case ISD::FDIV: return LowerFDIV(Op, DAG);
00730   case ISD::STORE: return LowerSTORE(Op, DAG);
00731   case ISD::GlobalAddress: {
00732     MachineFunction &MF = DAG.getMachineFunction();
00733     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
00734     return LowerGlobalAddress(MFI, Op, DAG);
00735   }
00736   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
00737   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
00738   }
00739   return SDValue();
00740 }
00741 
00742 /// \brief Helper function for LowerBRCOND
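/// Returns a user of \p Value whose opcode is \p Opcode, or nullptr if there
/// is none.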
00743 static SDNode *findUser(SDValue Value, unsigned Opcode) {
00744 
00745   SDNode *Parent = Value.getNode();
00746   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
00747        I != E; ++I) {
00748 
00749     if (I.getUse().get() != Value)
00750       continue;
00751 
00752     if (I->getOpcode() == Opcode)
00753       return *I;
00754   }
00755   return nullptr;
00756 }
00757 
00758 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
00759 
00760   FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
00761   unsigned FrameIndex = FINode->getIndex();
00762 
00763   return DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
00764 }
00765 
00766 /// This transforms the control flow intrinsics to get the branch destination as
00767 /// the last parameter; it also switches the branch target with BR if the need arises.
00768 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
00769                                       SelectionDAG &DAG) const {
00770 
00771   SDLoc DL(BRCOND);
00772 
00773   SDNode *Intr = BRCOND.getOperand(1).getNode();
00774   SDValue Target = BRCOND.getOperand(2);
00775   SDNode *BR = nullptr;
00776 
00777   if (Intr->getOpcode() == ISD::SETCC) {
00778     // As long as we negate the condition everything is fine
00779     SDNode *SetCC = Intr;
00780     assert(SetCC->getConstantOperandVal(1) == 1);
00781     assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
00782            ISD::SETNE);
00783     Intr = SetCC->getOperand(0).getNode();
00784 
00785   } else {
00786     // Get the target from BR if we don't negate the condition
00787     BR = findUser(BRCOND, ISD::BR);
00788     Target = BR->getOperand(1);
00789   }
00790 
00791   assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
00792 
00793   // Build the result value types of the new intrinsic call
00794   SmallVector<EVT, 4> Res;
00795   for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
00796     Res.push_back(Intr->getValueType(i));
00797 
00798   // Build the operands of the new intrinsic call
00799   SmallVector<SDValue, 4> Ops;
00800   Ops.push_back(BRCOND.getOperand(0));
00801   for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
00802     Ops.push_back(Intr->getOperand(i));
00803   Ops.push_back(Target);
00804 
00805   // build the new intrinsic call
00806   SDNode *Result = DAG.getNode(
00807     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
00808     DAG.getVTList(Res), Ops).getNode();
00809 
00810   if (BR) {
00811     // Give the branch instruction our target
00812     SDValue Ops[] = {
00813       BR->getOperand(0),
00814       BRCOND.getOperand(2)
00815     };
00816     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
00817     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
00818     BR = NewBR.getNode();
00819   }
00820 
00821   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
00822 
00823   // Copy the intrinsic results to registers
00824   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
00825     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
00826     if (!CopyToReg)
00827       continue;
00828 
00829     Chain = DAG.getCopyToReg(
00830       Chain, DL,
00831       CopyToReg->getOperand(1),
00832       SDValue(Result, i - 1),
00833       SDValue());
00834 
00835     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
00836   }
00837 
00838   // Remove the old intrinsic from the chain
00839   DAG.ReplaceAllUsesOfValueWith(
00840     SDValue(Intr, Intr->getNumValues() - 1),
00841     Intr->getOperand(0));
00842 
00843   return Chain;
00844 }
00845 
00846 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
00847                                              SDValue Op,
00848                                              SelectionDAG &DAG) const {
00849   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
00850 
00851   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
00852     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
00853 
00854   SDLoc DL(GSD);
00855   const GlobalValue *GV = GSD->getGlobal();
00856   MVT PtrVT = getPointerTy(GSD->getAddressSpace());
00857 
00858   SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT);
00859   SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
00860 
00861   SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
00862                               DAG.getConstant(0, MVT::i32));
00863   SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr,
00864                               DAG.getConstant(1, MVT::i32));
00865 
00866   SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue),
00867                            PtrLo, GA);
00868   SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue),
00869                            PtrHi, DAG.getConstant(0, MVT::i32),
00870                            SDValue(Lo.getNode(), 1));
00871   return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
00872 }
00873 
00874 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
00875                                                   SelectionDAG &DAG) const {
00876   MachineFunction &MF = DAG.getMachineFunction();
00877   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
00878 
00879   EVT VT = Op.getValueType();
00880   SDLoc DL(Op);
00881   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
00882 
00883   switch (IntrinsicID) {
00884   case Intrinsic::r600_read_ngroups_x:
00885     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
00886   case Intrinsic::r600_read_ngroups_y:
00887     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
00888   case Intrinsic::r600_read_ngroups_z:
00889     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
00890   case Intrinsic::r600_read_global_size_x:
00891     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
00892   case Intrinsic::r600_read_global_size_y:
00893     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
00894   case Intrinsic::r600_read_global_size_z:
00895     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
00896   case Intrinsic::r600_read_local_size_x:
00897     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
00898   case Intrinsic::r600_read_local_size_y:
00899     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
00900   case Intrinsic::r600_read_local_size_z:
00901     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
00902   case Intrinsic::r600_read_tgid_x:
00903     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
00904       AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0), VT);
00905   case Intrinsic::r600_read_tgid_y:
00906     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
00907       AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1), VT);
00908   case Intrinsic::r600_read_tgid_z:
00909     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
00910       AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2), VT);
00911   case Intrinsic::r600_read_tidig_x:
00912     return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
00913                                 AMDGPU::VGPR0, VT);
00914   case Intrinsic::r600_read_tidig_y:
00915     return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
00916                                 AMDGPU::VGPR1, VT);
00917   case Intrinsic::r600_read_tidig_z:
00918     return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
00919                                 AMDGPU::VGPR2, VT);
00920   case AMDGPUIntrinsic::SI_load_const: {
00921     SDValue Ops[] = {
00922       Op.getOperand(1),
00923       Op.getOperand(2)
00924     };
00925 
00926     MachineMemOperand *MMO = MF.getMachineMemOperand(
00927       MachinePointerInfo(),
00928       MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
00929       VT.getStoreSize(), 4);
00930     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
00931                                    Op->getVTList(), Ops, VT, MMO);
00932   }
00933   case AMDGPUIntrinsic::SI_sample:
00934     return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
00935   case AMDGPUIntrinsic::SI_sampleb:
00936     return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
00937   case AMDGPUIntrinsic::SI_sampled:
00938     return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
00939   case AMDGPUIntrinsic::SI_samplel:
00940     return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
00941   case AMDGPUIntrinsic::SI_vs_load_input:
00942     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
00943                        Op.getOperand(1),
00944                        Op.getOperand(2),
00945                        Op.getOperand(3));
00946   default:
00947     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
00948   }
00949 }
00950 
00951 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
00952                                               SelectionDAG &DAG) const {
00953   MachineFunction &MF = DAG.getMachineFunction();
00954   SDValue Chain = Op.getOperand(0);
00955   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
00956 
00957   switch (IntrinsicID) {
00958   case AMDGPUIntrinsic::SI_tbuffer_store: {
00959     SDLoc DL(Op);
00960     SDValue Ops[] = {
00961       Chain,
00962       Op.getOperand(2),
00963       Op.getOperand(3),
00964       Op.getOperand(4),
00965       Op.getOperand(5),
00966       Op.getOperand(6),
00967       Op.getOperand(7),
00968       Op.getOperand(8),
00969       Op.getOperand(9),
00970       Op.getOperand(10),
00971       Op.getOperand(11),
00972       Op.getOperand(12),
00973       Op.getOperand(13),
00974       Op.getOperand(14)
00975     };
00976 
00977     EVT VT = Op.getOperand(3).getValueType();
00978 
00979     MachineMemOperand *MMO = MF.getMachineMemOperand(
00980       MachinePointerInfo(),
00981       MachineMemOperand::MOStore,
00982       VT.getStoreSize(), 4);
00983     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
00984                                    Op->getVTList(), Ops, VT, MMO);
00985   }
00986   default:
00987     return SDValue();
00988   }
00989 }
00990 
00991 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
00992   SDLoc DL(Op);
00993   LoadSDNode *Load = cast<LoadSDNode>(Op);
00994 
00995   if (Op.getValueType().isVector()) {
00996     assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
00997            "Custom lowering for non-i32 vectors hasn't been implemented.");
00998     unsigned NumElements = Op.getValueType().getVectorNumElements();
00999     assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
01000     switch (Load->getAddressSpace()) {
01001       default: break;
01002       case AMDGPUAS::GLOBAL_ADDRESS:
01003       case AMDGPUAS::PRIVATE_ADDRESS:
01004         // v4 loads are supported for private and global memory.
01005         if (NumElements <= 4)
01006           break;
01007         // fall-through
01008       case AMDGPUAS::LOCAL_ADDRESS:
01009         return ScalarizeVectorLoad(Op, DAG);
01010     }
01011   }
01012 
01013   return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
01014 }
01015 
01016 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
01017                                                const SDValue &Op,
01018                                                SelectionDAG &DAG) const {
01019   return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
01020                      Op.getOperand(2),
01021                      Op.getOperand(3),
01022                      Op.getOperand(4));
01023 }
01024 
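// i64 selects are lowered by bitcasting both operands to v2i32 and selecting
// the low and high 32-bit halves independently, then recombining the two
// results into an i64.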
01025 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
01026   if (Op.getValueType() != MVT::i64)
01027     return SDValue();
01028 
01029   SDLoc DL(Op);
01030   SDValue Cond = Op.getOperand(0);
01031 
01032   SDValue Zero = DAG.getConstant(0, MVT::i32);
01033   SDValue One = DAG.getConstant(1, MVT::i32);
01034 
01035   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
01036   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
01037 
01038   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
01039   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
01040 
01041   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
01042 
01043   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
01044   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
01045 
01046   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
01047 
01048   SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
01049   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
01050 }
01051 
01052 // Catch division cases where we can use shortcuts with rcp and rsq
01053 // instructions.
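// Sketch of the shortcuts applied below, valid only when unsafe FP math is
// enabled or the type is f32 without denormal support:
//   1.0 / x       --> RCP(x)
//   1.0 / sqrt(x) --> RSQ(x)
// and, under unsafe FP math only:
//   x / y         --> x * RCP(y)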
01054 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
01055   SDLoc SL(Op);
01056   SDValue LHS = Op.getOperand(0);
01057   SDValue RHS = Op.getOperand(1);
01058   EVT VT = Op.getValueType();
01059   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
01060 
01061   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
01062     if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
01063         CLHS->isExactlyValue(1.0)) {
01064       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
01065       // the CI documentation have a worst-case error of 1 ulp.
01066       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
01067       // use it as long as we aren't trying to use denormals.
01068 
01069       // 1.0 / sqrt(x) -> rsq(x)
01070       //
01071       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
01072       // error seems really high at 2^29 ULP.
01073       if (RHS.getOpcode() == ISD::FSQRT)
01074         return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
01075 
01076       // 1.0 / x -> rcp(x)
01077       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
01078     }
01079   }
01080 
01081   if (Unsafe) {
01082     // Turn into multiply by the reciprocal.
01083     // x / y -> x * (1.0 / y)
01084     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
01085     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip);
01086   }
01087 
01088   return SDValue();
01089 }
01090 
01091 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
01092   SDValue FastLowered = LowerFastFDIV(Op, DAG);
01093   if (FastLowered.getNode())
01094     return FastLowered;
01095 
01096   // This uses v_rcp_f32 which does not handle denormals. Let this hit a
01097   // selection error for now rather than do something incorrect.
01098   if (Subtarget->hasFP32Denormals())
01099     return SDValue();
01100 
01101   SDLoc SL(Op);
01102   SDValue LHS = Op.getOperand(0);
01103   SDValue RHS = Op.getOperand(1);
01104 
01105   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
01106 
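  // K0 below is 2^+96 and K1 is 2^-32 (IEEE-754 single-precision bit patterns).
  // The intent appears to be: when |RHS| exceeds 2^96, pre-scale it by 2^-32
  // before taking the reciprocal and multiply the quotient by the same factor
  // afterwards, keeping the intermediate RCP result in range; otherwise the
  // scale factor is 1.0.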
01107   const APFloat K0Val(BitsToFloat(0x6f800000));
01108   const SDValue K0 = DAG.getConstantFP(K0Val, MVT::f32);
01109 
01110   const APFloat K1Val(BitsToFloat(0x2f800000));
01111   const SDValue K1 = DAG.getConstantFP(K1Val, MVT::f32);
01112 
01113   const SDValue One = DAG.getTargetConstantFP(1.0, MVT::f32);
01114 
01115   EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32);
01116 
01117   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
01118 
01119   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
01120 
01121   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
01122 
01123   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
01124 
01125   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
01126 
01127   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
01128 }
01129 
01130 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
01131   return SDValue();
01132 }
01133 
01134 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
01135   EVT VT = Op.getValueType();
01136 
01137   if (VT == MVT::f32)
01138     return LowerFDIV32(Op, DAG);
01139 
01140   if (VT == MVT::f64)
01141     return LowerFDIV64(Op, DAG);
01142 
01143   llvm_unreachable("Unexpected type for fdiv");
01144 }
01145 
01146 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
01147   SDLoc DL(Op);
01148   StoreSDNode *Store = cast<StoreSDNode>(Op);
01149   EVT VT = Store->getMemoryVT();
01150 
01151   // These stores are legal.
01152   if (Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
01153       VT.isVector() && VT.getVectorNumElements() == 2 &&
01154       VT.getVectorElementType() == MVT::i32)
01155     return SDValue();
01156 
01157   if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
01158     if (VT.isVector() && VT.getVectorNumElements() > 4)
01159       return ScalarizeVectorStore(Op, DAG);
01160     return SDValue();
01161   }
01162 
01163   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
01164   if (Ret.getNode())
01165     return Ret;
01166 
01167   if (VT.isVector() && VT.getVectorNumElements() >= 8)
01168       return ScalarizeVectorStore(Op, DAG);
01169 
01170   if (VT == MVT::i1)
01171     return DAG.getTruncStore(Store->getChain(), DL,
01172                         DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
01173                         Store->getBasePtr(), MVT::i1, Store->getMemOperand());
01174 
01175   return SDValue();
01176 }
01177 
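// The input is scaled by 1/(2*pi) and reduced to its fractional part because
// the hardware COS_HW / SIN_HW nodes operate on a normalized angle where 1.0
// corresponds to one full period (2*pi radians).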
01178 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
01179   EVT VT = Op.getValueType();
01180   SDValue Arg = Op.getOperand(0);
01181   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
01182         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
01183           DAG.getConstantFP(0.5 / M_PI, VT)));
01184 
01185   switch (Op.getOpcode()) {
01186   case ISD::FCOS:
01187     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
01188   case ISD::FSIN:
01189     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
01190   default:
01191     llvm_unreachable("Wrong trig opcode");
01192   }
01193 }
01194 
01195 //===----------------------------------------------------------------------===//
01196 // Custom DAG optimizations
01197 //===----------------------------------------------------------------------===//
01198 
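// performUCharToFloatCombine folds a uint_to_fp whose source is known to have
// its upper 24 bits clear into the dedicated CVT_F32_UBYTE0 conversion, and
// rewrites i8-vector -> f32-vector conversions of a load into a single integer
// load followed by per-byte CVT_F32_UBYTEn conversions.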
01199 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
01200                                                      DAGCombinerInfo &DCI) {
01201   EVT VT = N->getValueType(0);
01202   EVT ScalarVT = VT.getScalarType();
01203   if (ScalarVT != MVT::f32)
01204     return SDValue();
01205 
01206   SelectionDAG &DAG = DCI.DAG;
01207   SDLoc DL(N);
01208 
01209   SDValue Src = N->getOperand(0);
01210   EVT SrcVT = Src.getValueType();
01211 
01212   // TODO: We could try to match extracting the higher bytes, which would be
01213   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
01214   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
01215   // about in practice.
01216   if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
01217     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
01218       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
01219       DCI.AddToWorklist(Cvt.getNode());
01220       return Cvt;
01221     }
01222   }
01223 
01224   // We are primarily trying to catch operations on illegal vector types
01225   // before they are expanded.
01226   // For scalars, we can use the more flexible method of checking masked bits
01227   // after legalization.
01228   if (!DCI.isBeforeLegalize() ||
01229       !SrcVT.isVector() ||
01230       SrcVT.getVectorElementType() != MVT::i8) {
01231     return SDValue();
01232   }
01233 
01234   assert(DCI.isBeforeLegalize() && "Unexpected legal type");
01235 
01236   // Weird sized vectors are a pain to handle, but we know 3 is really the same
01237   // size as 4.
01238   unsigned NElts = SrcVT.getVectorNumElements();
01239   if (!SrcVT.isSimple() && NElts != 3)
01240     return SDValue();
01241 
01242   // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
01243   // prevent a mess from expanding to v4i32 and repacking.
01244   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
01245     EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
01246     EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
01247     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
01248 
01249     LoadSDNode *Load = cast<LoadSDNode>(Src);
01250     SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
01251                                      Load->getChain(),
01252                                      Load->getBasePtr(),
01253                                      LoadVT,
01254                                      Load->getMemOperand());
01255 
01256     // Make sure successors of the original load stay after it by updating
01257     // them to use the new Chain.
01258     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
01259 
01260     SmallVector<SDValue, 4> Elts;
01261     if (RegVT.isVector())
01262       DAG.ExtractVectorElements(NewLoad, Elts);
01263     else
01264       Elts.push_back(NewLoad);
01265 
01266     SmallVector<SDValue, 4> Ops;
01267 
01268     unsigned EltIdx = 0;
01269     for (SDValue Elt : Elts) {
01270       unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
01271       for (unsigned I = 0; I < ComponentsInElt; ++I) {
01272         unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
01273         SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
01274         DCI.AddToWorklist(Cvt.getNode());
01275         Ops.push_back(Cvt);
01276       }
01277 
01278       ++EltIdx;
01279     }
01280 
01281     assert(Ops.size() == NElts);
01282 
01283     return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
01284   }
01285 
01286   return SDValue();
01287 }
01288 
01289 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
01290 
01291 // This is a variant of
01292 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
01293 //
01294 // The normal DAG combiner will do this, but only if the add has one use since
01295 // that would increase the number of instructions.
01296 //
01297 // This prevents us from seeing a constant offset that can be folded into a
01298 // memory instruction's addressing mode. If we know the resulting add offset of
01299 // a pointer can be folded into an addressing offset, we can replace the pointer
01300 // operand with the add of new constant offset. This eliminates one of the uses,
01301 // and may allow the remaining use to also be simplified.
01302 //
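// For example (illustrative): for a DS access whose pointer is
// (shl (add %base, 16), 2), the combine below produces
// (add (shl %base, 2), 64); the constant 64 can then be folded into the
// instruction's immediate offset field whenever canFoldOffset() allows it.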
01303 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
01304                                                unsigned AddrSpace,
01305                                                DAGCombinerInfo &DCI) const {
01306   SDValue N0 = N->getOperand(0);
01307   SDValue N1 = N->getOperand(1);
01308 
01309   if (N0.getOpcode() != ISD::ADD)
01310     return SDValue();
01311 
01312   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
01313   if (!CN1)
01314     return SDValue();
01315 
01316   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
01317   if (!CAdd)
01318     return SDValue();
01319 
01320   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01321       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01322 
01323   // If the resulting offset is too large, we can't fold it into the addressing
01324   // mode offset.
01325   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
01326   if (!TII->canFoldOffset(Offset.getZExtValue(), AddrSpace))
01327     return SDValue();
01328 
01329   SelectionDAG &DAG = DCI.DAG;
01330   SDLoc SL(N);
01331   EVT VT = N->getValueType(0);
01332 
01333   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
01334   SDValue COffset = DAG.getConstant(Offset, MVT::i32);
01335 
01336   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
01337 }
01338 
01339 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
01340                                             DAGCombinerInfo &DCI) const {
01341   SelectionDAG &DAG = DCI.DAG;
01342   SDLoc DL(N);
01343   EVT VT = N->getValueType(0);
01344 
01345   switch (N->getOpcode()) {
01346     default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
01347     case ISD::SETCC: {
01348       SDValue Arg0 = N->getOperand(0);
01349       SDValue Arg1 = N->getOperand(1);
01350       SDValue CC = N->getOperand(2);
01351       ConstantSDNode * C = nullptr;
01352       ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
01353 
01354       // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
01355       if (VT == MVT::i1
01356           && Arg0.getOpcode() == ISD::SIGN_EXTEND
01357           && Arg0.getOperand(0).getValueType() == MVT::i1
01358           && (C = dyn_cast<ConstantSDNode>(Arg1))
01359           && C->isNullValue()
01360           && CCOp == ISD::SETNE) {
01361         return SimplifySetCC(VT, Arg0.getOperand(0),
01362                              DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL);
01363       }
01364       break;
01365     }
01366 
01367   case AMDGPUISD::CVT_F32_UBYTE0:
01368   case AMDGPUISD::CVT_F32_UBYTE1:
01369   case AMDGPUISD::CVT_F32_UBYTE2:
01370   case AMDGPUISD::CVT_F32_UBYTE3: {
01371     unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
01372 
01373     SDValue Src = N->getOperand(0);
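          // Only one byte of Src is actually read; e.g. CVT_F32_UBYTE1 demands
          // just bits [8, 16), so SimplifyDemandedBits below can strip away
          // operations that only affect the other three bytes.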
01374     APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
01375 
01376     APInt KnownZero, KnownOne;
01377     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
01378                                           !DCI.isBeforeLegalizeOps());
01379     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
01380     if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
01381         TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
01382       DCI.CommitTargetLoweringOpt(TLO);
01383     }
01384 
01385     break;
01386   }
01387 
01388   case ISD::UINT_TO_FP:
01389     return performUCharToFloatCombine(N, DCI);
01390 
01391   case ISD::FSUB: {
01392     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
01393       break;
01394 
01395     EVT VT = N->getValueType(0);
01396 
01397     // Try to get the fneg to fold into the source modifier. This undoes generic
01398     // DAG combines and folds them into the mad.
01399     if (VT == MVT::f32) {
01400       SDValue LHS = N->getOperand(0);
01401       SDValue RHS = N->getOperand(1);
01402 
01403       if (LHS.getOpcode() == ISD::FMUL) {
01404         // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
01405 
01406         SDValue A = LHS.getOperand(0);
01407         SDValue B = LHS.getOperand(1);
01408         SDValue C = DAG.getNode(ISD::FNEG, DL, VT, RHS);
01409 
01410         return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
01411       }
01412 
01413       if (RHS.getOpcode() == ISD::FMUL) {
01414         // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
01415 
01416         SDValue A = DAG.getNode(ISD::FNEG, DL, VT, RHS.getOperand(0));
01417         SDValue B = RHS.getOperand(1);
01418         SDValue C = LHS;
01419 
01420         return DAG.getNode(AMDGPUISD::MAD, DL, VT, A, B, C);
01421       }
01422     }
01423 
01424     break;
01425   }
01427   case ISD::LOAD:
01428   case ISD::STORE:
01429   case ISD::ATOMIC_LOAD:
01430   case ISD::ATOMIC_STORE:
01431   case ISD::ATOMIC_CMP_SWAP:
01432   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
01433   case ISD::ATOMIC_SWAP:
01434   case ISD::ATOMIC_LOAD_ADD:
01435   case ISD::ATOMIC_LOAD_SUB:
01436   case ISD::ATOMIC_LOAD_AND:
01437   case ISD::ATOMIC_LOAD_OR:
01438   case ISD::ATOMIC_LOAD_XOR:
01439   case ISD::ATOMIC_LOAD_NAND:
01440   case ISD::ATOMIC_LOAD_MIN:
01441   case ISD::ATOMIC_LOAD_MAX:
01442   case ISD::ATOMIC_LOAD_UMIN:
01443   case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
01444     if (DCI.isBeforeLegalize())
01445       break;
01446 
01447     MemSDNode *MemNode = cast<MemSDNode>(N);
01448     SDValue Ptr = MemNode->getBasePtr();
01449 
01450     // TODO: We could also do this for multiplies.
01451     unsigned AS = MemNode->getAddressSpace();
01452     if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
01453       SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
01454       if (NewPtr) {
01455         SmallVector<SDValue, 8> NewOps;
01456         for (unsigned I = 0, E = MemNode->getNumOperands(); I != E; ++I)
01457           NewOps.push_back(MemNode->getOperand(I));
01458 
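              // The base pointer is operand 2 of a normal store and operand 1 of
              // the other memory nodes handled here (operand 0 is the chain).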
01459         NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
01460         return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
01461       }
01462     }
01463     break;
01464   }
01465   }
01466   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
01467 }
01468 
01469 /// \brief Test if RegClass is one of the VSrc classes
01470 static bool isVSrc(unsigned RegClass) {
01471   return AMDGPU::VSrc_32RegClassID == RegClass ||
01472          AMDGPU::VSrc_64RegClassID == RegClass;
01473 }
01474 
01475 /// \brief Test if RegClass is one of the SSrc classes
01476 static bool isSSrc(unsigned RegClass) {
01477   return AMDGPU::SSrc_32RegClassID == RegClass ||
01478          AMDGPU::SSrc_64RegClassID == RegClass;
01479 }
01480 
01481 /// \brief Analyze the possible immediate value Op
01482 ///
01483 /// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
01484 /// and the immediate value if it's a literal immediate.
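      ///
      /// For example, i32 1 or f32 -2.0 are inline immediates (0 is returned),
      /// i32 1000 is a literal immediate (1000 is returned), and a constant with
      /// any of bits 63..32 set cannot be encoded at all (-1 is returned).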
01485 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
01486 
01487   union {
01488     int32_t I;
01489     float F;
01490   } Imm;
01491 
01492   if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
01493     if (Node->getZExtValue() >> 32) {
01494         return -1;
01495     }
01496     Imm.I = Node->getSExtValue();
01497   } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
01498     if (N->getValueType(0) != MVT::f32)
01499       return -1;
01500     Imm.F = Node->getValueAPF().convertToFloat();
01501   } else
01502     return -1; // It isn't an immediate
01503 
01504   if ((Imm.I >= -16 && Imm.I <= 64) ||
01505       Imm.F == 0.5f || Imm.F == -0.5f ||
01506       Imm.F == 1.0f || Imm.F == -1.0f ||
01507       Imm.F == 2.0f || Imm.F == -2.0f ||
01508       Imm.F == 4.0f || Imm.F == -4.0f)
01509     return 0; // It's an inline immediate
01510 
01511   return Imm.I; // It's a literal immediate
01512 }
01513 
01514 /// \brief Try to fold an immediate directly into an instruction
01515 bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
01516                                bool &ScalarSlotUsed) const {
01517 
01518   MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
01519   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01520       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01521   if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
01522     return false;
01523 
01524   const SDValue &Op = Mov->getOperand(0);
01525   int32_t Value = analyzeImmediate(Op.getNode());
01526   if (Value == -1) {
01527     // Not an immediate at all
01528     return false;
01529 
01530   } else if (Value == 0) {
01531     // Inline immediates can always be folded
01532     Operand = Op;
01533     return true;
01534 
01535   } else if (Value == Immediate) {
01536     // This literal immediate has already been folded; reuse it
01537     Operand = Op;
01538     return true;
01539 
01540   } else if (!ScalarSlotUsed && !Immediate) {
01541     // Fold this literal immediate
01542     ScalarSlotUsed = true;
01543     Immediate = Value;
01544     Operand = Op;
01545     return true;
01546 
01547   }
01548 
01549   return false;
01550 }
01551 
01552 const TargetRegisterClass *SITargetLowering::getRegClassForNode(
01553                                    SelectionDAG &DAG, const SDValue &Op) const {
01554   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01555       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01556   const SIRegisterInfo &TRI = TII->getRegisterInfo();
01557 
01558   if (!Op->isMachineOpcode()) {
01559     switch(Op->getOpcode()) {
01560     case ISD::CopyFromReg: {
01561       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
01562       unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
01563       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
01564         return MRI.getRegClass(Reg);
01565       }
01566       return TRI.getPhysRegClass(Reg);
01567     }
01568     default:  return nullptr;
01569     }
01570   }
01571   const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
01572   int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
01573   if (OpClassID != -1) {
01574     return TRI.getRegClass(OpClassID);
01575   }
01576   switch(Op.getMachineOpcode()) {
01577   case AMDGPU::COPY_TO_REGCLASS:
01578     // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
01579     OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
01580 
01581     // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
01582     // class, then the register class for the value could be either a VReg
01583     // or an SReg.  To get a more accurate class, recurse on the source operand.
01584     if (OpClassID == AMDGPU::VSrc_32RegClassID ||
01585         OpClassID == AMDGPU::VSrc_64RegClassID) {
01586       return getRegClassForNode(DAG, Op.getOperand(0));
01587     }
01588     return TRI.getRegClass(OpClassID);
01589   case AMDGPU::EXTRACT_SUBREG: {
01590     int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
01591     const TargetRegisterClass *SuperClass =
01592       getRegClassForNode(DAG, Op.getOperand(0));
01593     return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
01594   }
01595   case AMDGPU::REG_SEQUENCE:
01596     // Operand 0 is the register class id for REG_SEQUENCE instructions.
01597     return TRI.getRegClass(
01598       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
01599   default:
01600     return getRegClassFor(Op.getSimpleValueType());
01601   }
01602 }
01603 
01604 /// \brief Does "Op" fit into register class "RegClass" ?
01605 bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
01606                                     unsigned RegClass) const {
01607   const TargetRegisterInfo *TRI =
01608       getTargetMachine().getSubtargetImpl()->getRegisterInfo();
01609   const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
01610   if (!RC) {
01611     return false;
01612   }
01613   return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
01614 }
01615 
01616 /// \brief Make sure that we don't exceed the number of allowed scalars
01617 void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
01618                                        unsigned RegClass,
01619                                        bool &ScalarSlotUsed) const {
01620 
01621   // First map the operand's register class to a destination class
01622   if (RegClass == AMDGPU::VSrc_32RegClassID)
01623     RegClass = AMDGPU::VReg_32RegClassID;
01624   else if (RegClass == AMDGPU::VSrc_64RegClassID)
01625     RegClass = AMDGPU::VReg_64RegClassID;
01626   else
01627     return;
01628 
01629   // Nothing to do if the operand already fits naturally
01630   if (fitsRegClass(DAG, Operand, RegClass))
01631     return;
01632 
01633   // If the scalar slot isn't used yet, use it now
01634   if (!ScalarSlotUsed) {
01635     ScalarSlotUsed = true;
01636     return;
01637   }
01638 
01639   // This is a conservative approach. It is possible that we can't determine the
01640   // correct register class and copy too often, but better safe than sorry.
01641 
01642   SDNode *Node;
01643   // We can't use COPY_TO_REGCLASS with FrameIndex arguments.
01644   if (isa<FrameIndexSDNode>(Operand) ||
01645       isa<GlobalAddressSDNode>(Operand)) {
01646     unsigned Opcode = Operand.getValueType() == MVT::i32 ?
01647                       AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
01648     Node = DAG.getMachineNode(Opcode, SDLoc(), Operand.getValueType(),
01649                               Operand);
01650   } else {
01651     SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
01652     Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
01653                               Operand.getValueType(), Operand, RC);
01654   }
01655   Operand = SDValue(Node, 0);
01656 }
01657 
01658 /// \returns true if \p Node's operands are different from the SDValue list
01659 /// \p Ops
01660 static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
01661   for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
01662     if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
01663       return true;
01664     }
01665   }
01666   return false;
01667 }
01668 
01669 /// \brief Try to commute instructions and insert copies in order to satisfy the
01670 /// operand constraints.
01671 SDNode *SITargetLowering::legalizeOperands(MachineSDNode *Node,
01672                                            SelectionDAG &DAG) const {
01673   // Original encoding (either e32 or e64)
01674   int Opcode = Node->getMachineOpcode();
01675   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01676       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01677   const MCInstrDesc *Desc = &TII->get(Opcode);
01678 
01679   unsigned NumDefs = Desc->getNumDefs();
01680   unsigned NumOps = Desc->getNumOperands();
01681 
01682   // Commuted opcode if available
01683   int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
01684   const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
01685 
01686   assert(!DescRev || DescRev->getNumDefs() == NumDefs);
01687   assert(!DescRev || DescRev->getNumOperands() == NumOps);
01688 
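        // Only the 4-byte (e32) encodings can take an additional 32-bit literal
        // constant; start with the literal slot free (0) for those, otherwise
        // mark it unavailable (-1).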
01689   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
01690   bool HaveVSrc = false, HaveSSrc = false;
01691 
01692   // First figure out what we already have in this instruction.
01693   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
01694        i != e && Op < NumOps; ++i, ++Op) {
01695 
01696     unsigned RegClass = Desc->OpInfo[Op].RegClass;
01697     if (isVSrc(RegClass))
01698       HaveVSrc = true;
01699     else if (isSSrc(RegClass))
01700       HaveSSrc = true;
01701     else
01702       continue;
01703 
01704     int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode());
01705     if (Imm != -1 && Imm != 0) {
01706       // Literal immediate
01707       Immediate = Imm;
01708     }
01709   }
01710 
01711   // If we have neither a VSrc nor an SSrc operand, it makes no sense to continue.
01712   if (!HaveVSrc && !HaveSSrc)
01713     return Node;
01714 
01715   // No scalar allowed when we have both VSrc and SSrc
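        // (A VSrc operand may be a VGPR, an SGPR or an immediate, while an SSrc
        // operand may only be an SGPR or an immediate; at most one scalar value,
        // SGPR or literal, can be read per instruction, which is what
        // ScalarSlotUsed tracks.)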
01716   bool ScalarSlotUsed = HaveVSrc && HaveSSrc;
01717 
01718   // Second go over the operands and try to fold them
01719   std::vector<SDValue> Ops;
01720   for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs;
01721        i != e && Op < NumOps; ++i, ++Op) {
01722 
01723     const SDValue &Operand = Node->getOperand(i);
01724     Ops.push_back(Operand);
01725 
01726     // Already folded immediate?
01727     if (isa<ConstantSDNode>(Operand.getNode()) ||
01728         isa<ConstantFPSDNode>(Operand.getNode()))
01729       continue;
01730 
01731     // Is this a VSrc or SSrc operand?
01732     unsigned RegClass = Desc->OpInfo[Op].RegClass;
01733     if (isVSrc(RegClass) || isSSrc(RegClass)) {
01734       // Try to fold the immediates
01735       if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) {
01736         // Folding didn't work, make sure we don't hit the SReg limit.
01737         ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed);
01738       }
01739       continue;
01740     } else {
01741       // If it's not a VSrc or SSrc operand, check if we have a GlobalAddress.
01742       // These will be lowered to immediates, so we will need to insert a MOV.
01743       if (isa<GlobalAddressSDNode>(Ops[i])) {
01744         SDNode *Mov = DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(),
01745                                          Operand.getValueType(), Operand);
01746         Ops[i] = SDValue(Mov, 0);
01747       }
01748     }
01749 
01750     if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) {
01751 
01752       unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass;
01753       assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass));
01754 
01755       // Test if it makes sense to swap operands
01756       if (foldImm(Ops[1], Immediate, ScalarSlotUsed) ||
01757           (!fitsRegClass(DAG, Ops[1], RegClass) &&
01758            fitsRegClass(DAG, Ops[1], OtherRegClass))) {
01759 
01760         // Swap commutable operands
01761         std::swap(Ops[0], Ops[1]);
01762 
01763         Desc = DescRev;
01764         DescRev = nullptr;
01765         continue;
01766       }
01767     }
01768   }
01769 
01770   // Add optional chain and glue
01771   for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
01772     Ops.push_back(Node->getOperand(i));
01773 
01774   // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
01775 // this case a brand new node will always be created, even if the operands
01776   // are the same as before.  So, manually check if anything has been changed.
01777   if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
01778     return Node;
01779   }
01780 
01781   // Create a completely new instruction
01782   return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
01783 }
01784 
01785 /// \brief Helper function for adjustWritemask
01786 static unsigned SubIdx2Lane(unsigned Idx) {
01787   switch (Idx) {
01788   default: return 0;
01789   case AMDGPU::sub0: return 0;
01790   case AMDGPU::sub1: return 1;
01791   case AMDGPU::sub2: return 2;
01792   case AMDGPU::sub3: return 3;
01793   }
01794 }
01795 
01796 /// \brief Adjust the writemask of MIMG instructions
01797 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
01798                                        SelectionDAG &DAG) const {
01799   SDNode *Users[4] = { };
01800   unsigned Lane = 0;
01801   unsigned OldDmask = Node->getConstantOperandVal(0);
01802   unsigned NewDmask = 0;
01803 
01804   // Try to figure out the used register components
01805   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
01806        I != E; ++I) {
01807 
01808     // Abort if we can't understand the usage
01809     if (!I->isMachineOpcode() ||
01810         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
01811       return;
01812 
01813     // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
01814     // Note that subregs are packed, i.e. Lane==0 is the first bit set
01815     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
01816     // set, etc.
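          // For example, if OldDmask is 0b1010 (components Y and W are written),
          // Lane 0 maps to component 1 (Y) and Lane 1 maps to component 3 (W).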
01817     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
01818 
01819     // Set which texture component corresponds to the lane.
01820     unsigned Comp;
01821     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
01822       assert(Dmask);
01823       Comp = countTrailingZeros(Dmask);
01824       Dmask &= ~(1 << Comp);
01825     }
01826 
01827     // Abort if we have more than one user per component
01828     if (Users[Lane])
01829       return;
01830 
01831     Users[Lane] = *I;
01832     NewDmask |= 1 << Comp;
01833   }
01834 
01835   // Abort if there's no change
01836   if (NewDmask == OldDmask)
01837     return;
01838 
01839   // Adjust the writemask in the node
01840   std::vector<SDValue> Ops;
01841   Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
01842   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
01843     Ops.push_back(Node->getOperand(i));
01844   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
01845 
01846   // If only one lane is used, i.e. NewDmask has exactly one bit set,
01847   // replace its single user with a plain copy of the result.
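        // (NewDmask & (NewDmask - 1)) clears the lowest set bit, so the test is
        // true exactly when NewDmask is a power of two, i.e. a single component.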
01848   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
01849     SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
01850     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
01851                                       SDLoc(), Users[Lane]->getValueType(0),
01852                                       SDValue(Node, 0), RC);
01853     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
01854     return;
01855   }
01856 
01857   // Update the users of the node with the new indices
01858   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
01859 
01860     SDNode *User = Users[i];
01861     if (!User)
01862       continue;
01863 
01864     SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
01865     DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
01866 
01867     switch (Idx) {
01868     default: break;
01869     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
01870     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
01871     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
01872     }
01873   }
01874 }
01875 
01876 /// \brief Fold the instructions after selecting them.
01877 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
01878                                           SelectionDAG &DAG) const {
01879   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01880       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01881   Node = AdjustRegClass(Node, DAG);
01882 
01883   if (TII->isMIMG(Node->getMachineOpcode()))
01884     adjustWritemask(Node, DAG);
01885 
01886   return legalizeOperands(Node, DAG);
01887 }
01888 
01889 /// \brief Assign the register class depending on the number of
01890 /// bits set in the writemask
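      /// (for example, a writemask of 0b0101 has two bits set, so the result
      /// register is constrained to the VReg_64 class)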
01891 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
01892                                                      SDNode *Node) const {
01893   const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
01894       getTargetMachine().getSubtargetImpl()->getInstrInfo());
01895 
01896   if (TII->isMIMG(MI->getOpcode())) {
01897     unsigned VReg = MI->getOperand(0).getReg();
01898     unsigned Writemask = MI->getOperand(1).getImm();
01899     unsigned BitsSet = 0;
01900     for (unsigned i = 0; i < 4; ++i)
01901       BitsSet += Writemask & (1 << i) ? 1 : 0;
01902 
01903     const TargetRegisterClass *RC;
01904     switch (BitsSet) {
01905     default: return;
01906     case 1:  RC = &AMDGPU::VReg_32RegClass; break;
01907     case 2:  RC = &AMDGPU::VReg_64RegClass; break;
01908     case 3:  RC = &AMDGPU::VReg_96RegClass; break;
01909     }
01910 
01911     unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
01912     MI->setDesc(TII->get(NewOpcode));
01913     MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
01914     MRI.setRegClass(VReg, RC);
01915     return;
01916   }
01917 
01918   // Replace unused atomics with the no return version.
01919   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
01920   if (NoRetAtomicOp != -1) {
01921     if (!Node->hasAnyUseOfValue(0)) {
01922       MI->setDesc(TII->get(NoRetAtomicOp));
01923       MI->RemoveOperand(0);
01924     }
01925 
01926     return;
01927   }
01928 }
01929 
01930 MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
01931                                                 SelectionDAG &DAG) const {
01932 
01933   SDLoc DL(N);
01934   unsigned NewOpcode = N->getMachineOpcode();
01935 
01936   switch (N->getMachineOpcode()) {
01937   default: return N;
01938   case AMDGPU::S_LOAD_DWORD_IMM:
01939     NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
01940     // Fall-through
01941   case AMDGPU::S_LOAD_DWORDX2_SGPR:
01942     if (NewOpcode == N->getMachineOpcode()) {
01943       NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
01944     }
01945     // Fall-through
01946   case AMDGPU::S_LOAD_DWORDX4_IMM:
01947   case AMDGPU::S_LOAD_DWORDX4_SGPR: {
01948     if (NewOpcode == N->getMachineOpcode()) {
01949       NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
01950     }
01951     if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
01952       return N;
01953     }
01954     ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
01955     MachineSDNode *RSrc = DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL,
01956                                              MVT::i128,
01957                                              DAG.getConstant(0, MVT::i64));
01958 
01959     SmallVector<SDValue, 8> Ops;
01960     Ops.push_back(SDValue(RSrc, 0));
01961     Ops.push_back(N->getOperand(0));
01962     Ops.push_back(DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32));
01963 
01964     // Copy remaining operands so we keep any chain and glue nodes that follow
01965     // the normal operands.
01966     for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
01967       Ops.push_back(N->getOperand(I));
01968 
01969     return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
01970   }
01971   }
01972 }
01973 
01974 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
01975                                                const TargetRegisterClass *RC,
01976                                                unsigned Reg, EVT VT) const {
01977   SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
01978 
01979   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
01980                             cast<RegisterSDNode>(VReg)->getReg(), VT);
01981 }