LLVM API Documentation

R600ISelLowering.cpp
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
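  // Condition codes marked Expand are not supported natively; the DAG
  // legalizer rewrites them in terms of the remaining legal codes, e.g. by
  // swapping operands or inverting the predicate.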
  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::SUB, MVT::i64, Expand);
  // These should be replaced by UDIVREM, but it does not happen automatically
  // during Type Legalization
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
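  // Hardware compare results are 0 / -1 (all bits set), matching the SET*
  // patterns described in LowerSELECT_CC below.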
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instructions.
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
           MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
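    // Materialize the FP immediate by moving its raw IEEE-754 bit pattern
    // into the destination register.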
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(SrcX)
            .addImm(SrcY)
            .addImm(SrcZ)
            .addImm(SrcW)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(0)
            .addImm(1)
            .addImm(2)
            .addImm(3)
            .addOperand(RID)
            .addOperand(SID)
            .addImm(CTX)
            .addImm(CTY)
            .addImm(CTZ)
            .addImm(CTW)
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO_INT)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
            MF.getSubtarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(0, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(1, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(2, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
          DAG.getConstant(3, MVT::i32)),
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
          DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
      N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    SDValue one = DAG.getConstant(1, HalfVT);
    SDValue zero = DAG.getConstant(0, HalfVT);

    // HiLo split
    SDValue LHS = N->getOperand(0);
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

    SDValue RHS = N->getOperand(1);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

    // Get Speculative values
    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

    SDValue REM_Hi = zero;
    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);

    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
    SDValue DIV_Lo = zero;

    const unsigned halfBitWidth = HalfVT.getSizeInBits();
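
    // What follows is a bit-by-bit restoring long division: each iteration
    // shifts the partial remainder left by one, brings in the next bit of
    // LHS_Lo, and, if the remainder is >= RHS, subtracts RHS and sets the
    // corresponding bit of the low quotient word.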
    for (unsigned i = 0; i < halfBitWidth; ++i) {
      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
      // Get Value of high bit
      SDValue HBit;
      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
      } else {
        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
      }

      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
        DAG.getConstant(halfBitWidth - 1, HalfVT));
      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);

      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);

      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);

      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);

      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

      // Update REM
      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);

      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
    }

    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
    Results.push_back(DIV);
    Results.push_back(REM);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
                               Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
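  // FRACT(x / 2Pi + 0.5) - 0.5 reduces the argument to [-0.5, 0.5), i.e. the
  // angle expressed in full turns, matching the >= R700 input convention;
  // R600 uses a [-Pi, Pi] convention instead (handled below).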
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
          DAG.getConstantFP(0.15915494309, MVT::f32)),
        DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
        DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.
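  // E.g. with Shift == 0: CompShift == 31, so Overflow == (Lo >> 31) >> 1 == 0,
  // whereas a single shift of a 32-bit value by 32 bits would be undefined.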

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One  = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift could be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative would be to
  // add a conditional to filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
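  // For an i1 result, any non-zero FP input maps to true, so a single
  // SETNE compare against 0.0 is sufficient.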
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
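  // Implicit kernel parameters (ngroups, global size, local size) are laid
  // out as consecutive dwords in CONSTANT_BUFFER_0; DwordOffset selects which
  // one to load (see the r600_read_* cases in LowerOperation above).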
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}
/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }
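  // The shift converts a byte address into a register index by dividing by
  // the bytes consumed per register slot: 4, 8, or 16 bytes for stack widths
  // of 1, 2, or 4 sub-registers respectively.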
01327 
01328   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
01329                      DAG.getConstant(SRLPad, MVT::i32));
01330 }
01331 
01332 void R600TargetLowering::getStackAddress(unsigned StackWidth,
01333                                          unsigned ElemIdx,
01334                                          unsigned &Channel,
01335                                          unsigned &PtrIncr) const {
01336   switch (StackWidth) {
01337   default:
01338   case 1:
01339     Channel = 0;
01340     if (ElemIdx > 0) {
01341       PtrIncr = 1;
01342     } else {
01343       PtrIncr = 0;
01344     }
01345     break;
01346   case 2:
01347     Channel = ElemIdx % 2;
01348     if (ElemIdx == 2) {
01349       PtrIncr = 1;
01350     } else {
01351       PtrIncr = 0;
01352     }
01353     break;
01354   case 4:
01355     Channel = ElemIdx;
01356     PtrIncr = 0;
01357     break;
01358   }
01359 }
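
// For a 4-element vector with StackWidth == 2, the (Channel, PtrIncr)
// pairs come out as (0,0), (1,0), (0,1), (1,0): elements 0 and 1 share one
// register row, elements 2 and 3 the next. Note that PtrIncr is cumulative;
// the caller adds it to the pointer before each element is processed.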
01360 
01361 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
01362   SDLoc DL(Op);
01363   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
01364   SDValue Chain = Op.getOperand(0);
01365   SDValue Value = Op.getOperand(1);
01366   SDValue Ptr = Op.getOperand(2);
01367 
01368   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
01369   if (Result.getNode()) {
01370     return Result;
01371   }
01372 
01373   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
01374     if (StoreNode->isTruncatingStore()) {
01375       EVT VT = Value.getValueType();
01376       assert(VT.bitsLE(MVT::i32));
01377       EVT MemVT = StoreNode->getMemoryVT();
01378       SDValue MaskConstant;
01379       if (MemVT == MVT::i8) {
01380         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
01381       } else {
01382         assert(MemVT == MVT::i16);
01383         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
01384       }
01385       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
01386                                       DAG.getConstant(2, MVT::i32));
01387       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
01388                                       DAG.getConstant(0x00000003, VT));
01389       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
01390       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
01391                                    DAG.getConstant(3, VT));
01392       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
01393       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
01394       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
01395       // vector instead.
01396       SDValue Src[4] = {
01397         ShiftedValue,
01398         DAG.getConstant(0, MVT::i32),
01399         DAG.getConstant(0, MVT::i32),
01400         Mask
01401       };
01402       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
01403       SDValue Args[3] = { Chain, Input, DWordAddr };
01404       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
01405                                      Op->getVTList(), Args, MemVT,
01406                                      StoreNode->getMemOperand());
01407     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
01408                Value.getValueType().bitsGE(MVT::i32)) {
01409       // Convert pointer from byte address to dword address.
01410       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
01411                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
01412                                     Ptr, DAG.getConstant(2, MVT::i32)));
01413 
01414       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
01415         llvm_unreachable("Truncated and indexed stores not supported yet");
01416       } else {
01417         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
01418       }
01419       return Chain;
01420     }
01421   }
01422 
01423   EVT ValueVT = Value.getValueType();
01424 
01425   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
01426     return SDValue();
01427   }
01428 
01433   // Lowering for indirect addressing
01434 
01435   const MachineFunction &MF = DAG.getMachineFunction();
01436   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
01437       getTargetMachine().getSubtargetImpl()->getFrameLowering());
01438   unsigned StackWidth = TFL->getStackWidth(MF);
01439 
01440   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
01441 
01442   if (ValueVT.isVector()) {
01443     unsigned NumElemVT = ValueVT.getVectorNumElements();
01444     EVT ElemVT = ValueVT.getVectorElementType();
01445     SmallVector<SDValue, 4> Stores(NumElemVT);
01446 
01447     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
01448                                       "vector width in store");
01449 
01450     for (unsigned i = 0; i < NumElemVT; ++i) {
01451       unsigned Channel, PtrIncr;
01452       getStackAddress(StackWidth, i, Channel, PtrIncr);
01453       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
01454                         DAG.getConstant(PtrIncr, MVT::i32));
01455       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
01456                                  Value, DAG.getConstant(i, MVT::i32));
01457 
01458       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
01459                               Chain, Elem, Ptr,
01460                               DAG.getTargetConstant(Channel, MVT::i32));
01461     }
01462     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
01463   } else {
01464     if (ValueVT == MVT::i8) {
01465       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
01466     }
01467     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
01468                         Value, Ptr, DAG.getTargetConstant(0, MVT::i32)); // Channel
01469   }
01470 
01471   return Chain;
01472 }
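
// To illustrate the truncating-store path above: an i8 store of 0xAB to
// byte address 5 yields DWordAddr = 5 >> 2 = 1, ByteIndex = 5 & 3 = 1 and
// Shift = 1 << 3 = 8, so ShiftedValue = 0xAB00 and Mask = 0xFF00, which
// STORE_MSKOR then applies read-modify-write style to dword 1.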
01473 
01474 // Returns 512 + (kc_bank << 12).
01475 static int
01476 ConstantAddressBlock(unsigned AddressSpace) {
01477   switch (AddressSpace) {
01478   case AMDGPUAS::CONSTANT_BUFFER_0:
01479     return 512;
01480   case AMDGPUAS::CONSTANT_BUFFER_1:
01481     return 512 + 4096;
01482   case AMDGPUAS::CONSTANT_BUFFER_2:
01483     return 512 + 4096 * 2;
01484   case AMDGPUAS::CONSTANT_BUFFER_3:
01485     return 512 + 4096 * 3;
01486   case AMDGPUAS::CONSTANT_BUFFER_4:
01487     return 512 + 4096 * 4;
01488   case AMDGPUAS::CONSTANT_BUFFER_5:
01489     return 512 + 4096 * 5;
01490   case AMDGPUAS::CONSTANT_BUFFER_6:
01491     return 512 + 4096 * 6;
01492   case AMDGPUAS::CONSTANT_BUFFER_7:
01493     return 512 + 4096 * 7;
01494   case AMDGPUAS::CONSTANT_BUFFER_8:
01495     return 512 + 4096 * 8;
01496   case AMDGPUAS::CONSTANT_BUFFER_9:
01497     return 512 + 4096 * 9;
01498   case AMDGPUAS::CONSTANT_BUFFER_10:
01499     return 512 + 4096 * 10;
01500   case AMDGPUAS::CONSTANT_BUFFER_11:
01501     return 512 + 4096 * 11;
01502   case AMDGPUAS::CONSTANT_BUFFER_12:
01503     return 512 + 4096 * 12;
01504   case AMDGPUAS::CONSTANT_BUFFER_13:
01505     return 512 + 4096 * 13;
01506   case AMDGPUAS::CONSTANT_BUFFER_14:
01507     return 512 + 4096 * 14;
01508   case AMDGPUAS::CONSTANT_BUFFER_15:
01509     return 512 + 4096 * 15;
01510   default:
01511     return -1;
01512   }
01513 }
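
// E.g. CONSTANT_BUFFER_2 maps to 512 + 4096 * 2 == 512 + (2 << 12),
// matching the kc_bank formula above.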
01514 
01515 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
01517   EVT VT = Op.getValueType();
01518   SDLoc DL(Op);
01519   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
01520   SDValue Chain = Op.getOperand(0);
01521   SDValue Ptr = Op.getOperand(1);
01522   SDValue LoweredLoad;
01523 
01524   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
01525   if (Ret.getNode()) {
01526     SDValue Ops[2] = {
01527       Ret,
01528       Chain
01529     };
01530     return DAG.getMergeValues(Ops, DL);
01531   }
01532 
01533   // Lower constant address space loads of global variables.
01534   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
01535       isa<GlobalVariable>(
01536           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
01537 
01538     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
01539         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
01540     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
01541         DAG.getConstant(2, MVT::i32));
01542     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
01543                        LoadNode->getChain(), Ptr,
01544                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
01545   }
01546 
01547   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
01548     SDValue MergedValues[2] = {
01549       ScalarizeVectorLoad(Op, DAG),
01550       Chain
01551     };
01552     return DAG.getMergeValues(MergedValues, DL);
01553   }
01554 
01555   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
01556   if (ConstantBlock > -1 &&
01557       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
01558        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
01559     SDValue Result;
01560     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
01561         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
01562         isa<ConstantSDNode>(Ptr)) {
01563       SDValue Slots[4];
01564       for (unsigned i = 0; i < 4; i++) {
01565         // We want the constant position encoded with the following formula:
01566         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
01567         // const_index is the Ptr computed by LLVM using an alignment of 16.
01568         // Thus we add ((512 + (kc_bank << 12)) * 16 + chan * 4) here and
01569         // then divide by 4 at the ISel step.
01570         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
01571             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
01572         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
01573       }
01574       EVT NewVT = MVT::v4i32;
01575       unsigned NumElements = 4;
01576       if (VT.isVector()) {
01577         NewVT = VT;
01578         NumElements = VT.getVectorNumElements();
01579       }
01580       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
01581                            makeArrayRef(Slots, NumElements));
01582     } else {
01583       // A non-constant pointer can't be folded; keep it as a v4i32 load.
01584       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
01585           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
01586           DAG.getConstant(LoadNode->getAddressSpace() -
01587                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
01588           );
01589     }
01590 
01591     if (!VT.isVector()) {
01592       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
01593           DAG.getConstant(0, MVT::i32));
01594     }
01595 
01596     SDValue MergedValues[2] = {
01597       Result,
01598       Chain
01599     };
01600     return DAG.getMergeValues(MergedValues, DL);
01601   }
01602 
01603   // For most operations returning SDValue() will result in the node being
01604   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
01605   // need to manually expand loads that may be legal in some address spaces and
01606   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
01607   // compute shaders, since the data is sign extended when it is uploaded to the
01608   // buffer. However, SEXT loads from other address spaces are not supported, so
01609   // we need to expand them here.
01610   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
01611     EVT MemVT = LoadNode->getMemoryVT();
01612     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
01613     SDValue ShiftAmount =
01614           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
01615     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
01616                                   LoadNode->getPointerInfo(), MemVT,
01617                                   LoadNode->isVolatile(),
01618                                   LoadNode->isNonTemporal(),
01619                                   LoadNode->isInvariant(),
01620                                   LoadNode->getAlignment());
01621     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
01622     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
01623 
01624     SDValue MergedValues[2] = { Sra, Chain };
01625     return DAG.getMergeValues(MergedValues, DL);
01626   }
01627 
01628   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
01629     return SDValue();
01630   }
01631 
01632   // Lowering for indirect addressing
01633   const MachineFunction &MF = DAG.getMachineFunction();
01634   const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
01635       getTargetMachine().getSubtargetImpl()->getFrameLowering());
01636   unsigned StackWidth = TFL->getStackWidth(MF);
01637 
01638   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
01639 
01640   if (VT.isVector()) {
01641     unsigned NumElemVT = VT.getVectorNumElements();
01642     EVT ElemVT = VT.getVectorElementType();
01643     SDValue Loads[4];
01644 
01645     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
01646                                       "vector width in load");
01647 
01648     for (unsigned i = 0; i < NumElemVT; ++i) {
01649       unsigned Channel, PtrIncr;
01650       getStackAddress(StackWidth, i, Channel, PtrIncr);
01651       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
01652                         DAG.getConstant(PtrIncr, MVT::i32));
01653       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
01654                              Chain, Ptr,
01655                              DAG.getTargetConstant(Channel, MVT::i32),
01656                              Op.getOperand(2));
01657     }
01658     for (unsigned i = NumElemVT; i < 4; ++i) {
01659       Loads[i] = DAG.getUNDEF(ElemVT);
01660     }
01661     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
01662     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
01663   } else {
01664     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
01665                               Chain, Ptr,
01666                               DAG.getTargetConstant(0, MVT::i32), // Channel
01667                               Op.getOperand(2));
01668   }
01669 
01670   SDValue Ops[2] = {
01671     LoweredLoad,
01672     Chain
01673   };
01674 
01675   return DAG.getMergeValues(Ops, DL);
01676 }
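
// To illustrate the constant-buffer path: a scalar load at Ptr == 16 from
// CONSTANT_BUFFER_0 (ConstantBlock == 512) builds CONST_ADDRESS slots at
// byte offsets 16 + 512 * 16 + {0, 4, 8, 12}; dividing by 4 at ISel gives
// ((512 + 1) << 2) + chan, i.e. const_index == 16 / 16 == 1, and the
// EXTRACT_VECTOR_ELT keeps only channel 0 for the scalar result.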
01677 
01678 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
01679   SDValue Chain = Op.getOperand(0);
01680   SDValue Cond  = Op.getOperand(1);
01681   SDValue Jump  = Op.getOperand(2);
01682 
01683   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
01684                      Chain, Jump, Cond);
01685 }
01686 
01687 /// XXX Only kernel functions are supported, so we can assume for now that
01688 /// every function is a kernel function, but in the future we should use
01689 /// separate calling conventions for kernel and non-kernel functions.
01690 SDValue R600TargetLowering::LowerFormalArguments(
01691                                       SDValue Chain,
01692                                       CallingConv::ID CallConv,
01693                                       bool isVarArg,
01694                                       const SmallVectorImpl<ISD::InputArg> &Ins,
01695                                       SDLoc DL, SelectionDAG &DAG,
01696                                       SmallVectorImpl<SDValue> &InVals) const {
01697   SmallVector<CCValAssign, 16> ArgLocs;
01698   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
01699                  *DAG.getContext());
01700   MachineFunction &MF = DAG.getMachineFunction();
01701   unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();
01702 
01703   SmallVector<ISD::InputArg, 8> LocalIns;
01704 
01705   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
01706 
01707   AnalyzeFormalArguments(CCInfo, LocalIns);
01708 
01709   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
01710     CCValAssign &VA = ArgLocs[i];
01711     const ISD::InputArg &In = Ins[i];
01712     EVT VT = In.VT;
01713     EVT MemVT = VA.getLocVT();
01714     if (!VT.isVector() && MemVT.isVector()) {
01715       // Get load source type if scalarized.
01716       MemVT = MemVT.getVectorElementType();
01717     }
01718 
01719     if (ShaderType != ShaderType::COMPUTE) {
01720       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
01721       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
01722       InVals.push_back(Register);
01723       continue;
01724     }
01725 
01726     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
01727                                           AMDGPUAS::CONSTANT_BUFFER_0);
01728 
01729     // i64 isn't a legal type, so the register type used ends up as i32, which
01730     // isn't expected here. It attempts to create this sextload, but it ends up
01731     // being invalid. Somehow this seems to work with i64 arguments, but breaks
01732     // for <1 x i64>.
01733 
01734     // The first 36 bytes of the input buffer contain information about
01735     // thread group and global sizes.
01736     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
01737     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
01738       // FIXME: This should really check the extload type, but the handling of
01739       // extload vector parameters seems to be broken.
01740 
01741       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
01742       Ext = ISD::SEXTLOAD;
01743     }
01744 
01745     // Compute the offset from the value.
01746     // XXX - I think PartOffset should give you this, but it seems to give the
01747     // size of the register which isn't useful.
01748 
01749     unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
01750     unsigned PartOffset = VA.getLocMemOffset();
01751 
01752     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
01753     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
01754                               DAG.getConstant(36 + PartOffset, MVT::i32),
01755                               DAG.getUNDEF(MVT::i32),
01756                               PtrInfo,
01757                               MemVT, false, true, true, 4);
01758 
01759     // 4 is the preferred alignment for the CONSTANT memory space.
01760     InVals.push_back(Arg);
01761   }
01762   return Chain;
01763 }
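
// In other words, a compute kernel's argument bytes are fetched from
// constant-buffer offset 36 + PartOffset, i.e. offset 36 for the first
// argument: bytes 0-35 of the input buffer are reserved for the thread
// group and global size data mentioned above.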
01764 
01765 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01766    if (!VT.isVector())
01767      return MVT::i32;
01768    return VT.changeVectorElementTypeToInteger();
01769 }
01770 
01771 static SDValue CompactSwizzlableVector(
01772   SelectionDAG &DAG, SDValue VectorEntry,
01773   DenseMap<unsigned, unsigned> &RemapSwizzle) {
01774   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
01775   assert(RemapSwizzle.empty());
01776   SDValue NewBldVec[4] = {
01777     VectorEntry.getOperand(0),
01778     VectorEntry.getOperand(1),
01779     VectorEntry.getOperand(2),
01780     VectorEntry.getOperand(3)
01781   };
01782 
01783   for (unsigned i = 0; i < 4; i++) {
01784     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
01785       // We mask the write here to teach later passes that the ith element of
01786       // this vector is undef, which reduces 128-bit register usage, breaks
01787       // false dependencies and additionally makes the assembly easier to read.
01788       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
01789     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
01790       if (C->isZero()) {
01791         RemapSwizzle[i] = 4; // SEL_0
01792         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
01793       } else if (C->isExactlyValue(1.0)) {
01794         RemapSwizzle[i] = 5; // SEL_1
01795         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
01796       }
01797     }
01798 
01799     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
01800       continue;
01801     for (unsigned j = 0; j < i; j++) {
01802       if (NewBldVec[i] == NewBldVec[j]) {
01803         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
01804         RemapSwizzle[i] = j;
01805         break;
01806       }
01807     }
01808   }
01809 
01810   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
01811                      VectorEntry.getValueType(), NewBldVec);
01812 }
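
// For example, build_vector (x, 1.0, x, undef) compacts to
// build_vector (x, undef, undef, undef) with remaps 1 -> SEL_1 (5),
// 2 -> 0 (duplicate of element 0) and 3 -> SEL_MASK_WRITE (7), so every
// live lane is read from a single register channel.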
01813 
01814 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
01815                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
01816   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
01817   assert(RemapSwizzle.empty());
01818   SDValue NewBldVec[4] = {
01819       VectorEntry.getOperand(0),
01820       VectorEntry.getOperand(1),
01821       VectorEntry.getOperand(2),
01822       VectorEntry.getOperand(3)
01823   };
01824   bool isUnmovable[4] = { false, false, false, false };
01825   for (unsigned i = 0; i < 4; i++) {
01826     RemapSwizzle[i] = i;
01827     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
01828       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
01829           ->getZExtValue();
01830       if (i == Idx)
01831         isUnmovable[Idx] = true;
01832     }
01833   }
01834 
01835   for (unsigned i = 0; i < 4; i++) {
01836     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
01837       unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
01838           ->getZExtValue();
01839       if (isUnmovable[Idx])
01840         continue;
01841       // Swap i and Idx
01842       std::swap(NewBldVec[Idx], NewBldVec[i]);
01843       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
01844       break;
01845     }
01846   }
01847 
01848   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
01849                      VectorEntry.getValueType(), NewBldVec);
01850 }
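
// For example, if element 1 is extract_vector_elt(V, 0) and lane 0 is not
// pinned by isUnmovable, the element is swapped into lane 0 and the
// 0 <-> 1 exchange is recorded in RemapSwizzle, so the value ends up in
// the channel it was originally extracted from.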
01851 
01852 
01853 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
01854                                             SelectionDAG &DAG) const {
01855   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
01856   // Old -> New swizzle values
01857   DenseMap<unsigned, unsigned> SwizzleRemap;
01858 
01859   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
01860   for (unsigned i = 0; i < 4; i++) {
01861     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
01862     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
01863       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
01864   }
01865 
01866   SwizzleRemap.clear();
01867   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
01868   for (unsigned i = 0; i < 4; i++) {
01869     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
01870     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
01871       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
01872   }
01873 
01874   return BuildVector;
01875 }
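
// The two passes above are deliberately sequential: the swizzle operands in
// Swz are translated through each Old -> New map in turn, so EXPORT and
// TEXTURE_FETCH keep addressing the values they originally selected.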
01876 
01877 
01878 //===----------------------------------------------------------------------===//
01879 // Custom DAG Optimizations
01880 //===----------------------------------------------------------------------===//
01881 
01882 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
01883                                               DAGCombinerInfo &DCI) const {
01884   SelectionDAG &DAG = DCI.DAG;
01885 
01886   switch (N->getOpcode()) {
01887   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
01888   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
01889   case ISD::FP_ROUND: {
01890     SDValue Arg = N->getOperand(0);
01891     if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
01892       return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
01893                          Arg.getOperand(0));
01894     }
01895     break;
01896   }
01897 
01898   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
01899   // (i32 select_cc f32, f32, -1, 0 cc)
01900   //
01901   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
01902   // this to one of the SET*_DX10 instructions.
01903   case ISD::FP_TO_SINT: {
01904     SDValue FNeg = N->getOperand(0);
01905     if (FNeg.getOpcode() != ISD::FNEG) {
01906       return SDValue();
01907     }
01908     SDValue SelectCC = FNeg.getOperand(0);
01909     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
01910         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
01911         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
01912         !isHWTrueValue(SelectCC.getOperand(2)) ||
01913         !isHWFalseValue(SelectCC.getOperand(3))) {
01914       return SDValue();
01915     }
01916 
01917     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
01918                            SelectCC.getOperand(0), // LHS
01919                            SelectCC.getOperand(1), // RHS
01920                            DAG.getConstant(-1, MVT::i32), // True
01921                            DAG.getConstant(0, MVT::i32),  // False
01922                            SelectCC.getOperand(4)); // CC
01925   }
01926 
01927   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
01928   // => build_vector elt0, ... , NewEltIdx, ... , eltN
01929   case ISD::INSERT_VECTOR_ELT: {
01930     SDValue InVec = N->getOperand(0);
01931     SDValue InVal = N->getOperand(1);
01932     SDValue EltNo = N->getOperand(2);
01933     SDLoc dl(N);
01934 
01935     // If the inserted element is an UNDEF, just use the input vector.
01936     if (InVal.getOpcode() == ISD::UNDEF)
01937       return InVec;
01938 
01939     EVT VT = InVec.getValueType();
01940 
01941     // If we can't generate a legal BUILD_VECTOR, exit
01942     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
01943       return SDValue();
01944 
01945     // Check that we know which element is being inserted
01946     if (!isa<ConstantSDNode>(EltNo))
01947       return SDValue();
01948     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
01949 
01950     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
01951     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
01952     // vector elements.
01953     SmallVector<SDValue, 8> Ops;
01954     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
01955       Ops.append(InVec.getNode()->op_begin(),
01956                  InVec.getNode()->op_end());
01957     } else if (InVec.getOpcode() == ISD::UNDEF) {
01958       unsigned NElts = VT.getVectorNumElements();
01959       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
01960     } else {
01961       return SDValue();
01962     }
01963 
01964     // Insert the element
01965     if (Elt < Ops.size()) {
01966       // All the operands of BUILD_VECTOR must have the same type;
01967       // we enforce that here.
01968       EVT OpVT = Ops[0].getValueType();
01969       if (InVal.getValueType() != OpVT)
01970         InVal = OpVT.bitsGT(InVal.getValueType()) ?
01971           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
01972           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
01973       Ops[Elt] = InVal;
01974     }
01975 
01976     // Return the new vector
01977     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
01978   }
01979 
01980   // An extract_vector_elt of a build_vector generated by custom lowering
01981   // also needs to be custom combined.
01982   case ISD::EXTRACT_VECTOR_ELT: {
01983     SDValue Arg = N->getOperand(0);
01984     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
01985       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
01986         unsigned Element = Const->getZExtValue();
01987         return Arg->getOperand(Element);
01988       }
01989     }
01990     if (Arg.getOpcode() == ISD::BITCAST &&
01991         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
01992       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
01993         unsigned Element = Const->getZExtValue();
01994         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
01995             Arg->getOperand(0).getOperand(Element));
01996       }
01997     }
01998     break;
01999   }

02000   case ISD::SELECT_CC: {
02001     // Try common optimizations
02002     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
02003     if (Ret.getNode())
02004       return Ret;
02005 
02006     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
02007     //      selectcc x, y, a, b, inv(cc)
02008     //
02009     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
02010     //      selectcc x, y, a, b, cc
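    //
    // E.g. with cc == SETLT the first fold yields selectcc x, y, a, b, SETGE:
    // the outer SETEQ against b fires exactly when the inner condition was
    // false, which the inverted condition code expresses directly.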
02011     SDValue LHS = N->getOperand(0);
02012     if (LHS.getOpcode() != ISD::SELECT_CC) {
02013       return SDValue();
02014     }
02015 
02016     SDValue RHS = N->getOperand(1);
02017     SDValue True = N->getOperand(2);
02018     SDValue False = N->getOperand(3);
02019     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
02020 
02021     if (LHS.getOperand(2).getNode() != True.getNode() ||
02022         LHS.getOperand(3).getNode() != False.getNode() ||
02023         RHS.getNode() != False.getNode()) {
02024       return SDValue();
02025     }
02026 
02027     switch (NCC) {
02028     default: return SDValue();
02029     case ISD::SETNE: return LHS;
02030     case ISD::SETEQ: {
02031       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
02032       LHSCC = ISD::getSetCCInverse(LHSCC,
02033                                   LHS.getOperand(0).getValueType().isInteger());
02034       if (DCI.isBeforeLegalizeOps() ||
02035           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
02036         return DAG.getSelectCC(SDLoc(N),
02037                                LHS.getOperand(0),
02038                                LHS.getOperand(1),
02039                                LHS.getOperand(2),
02040                                LHS.getOperand(3),
02041                                LHSCC);
02042       break;
02043     }
02044     }
02045     return SDValue();
02046   }
02047 
02048   case AMDGPUISD::EXPORT: {
02049     SDValue Arg = N->getOperand(1);
02050     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
02051       break;
02052 
02053     SDValue NewArgs[8] = {
02054       N->getOperand(0), // Chain
02055       SDValue(),
02056       N->getOperand(2), // ArrayBase
02057       N->getOperand(3), // Type
02058       N->getOperand(4), // SWZ_X
02059       N->getOperand(5), // SWZ_Y
02060       N->getOperand(6), // SWZ_Z
02061       N->getOperand(7) // SWZ_W
02062     };
02063     SDLoc DL(N);
02064     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
02065     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
02066   }
02067   case AMDGPUISD::TEXTURE_FETCH: {
02068     SDValue Arg = N->getOperand(1);
02069     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
02070       break;
02071 
02072     SDValue NewArgs[19] = {
02073       N->getOperand(0),
02074       N->getOperand(1),
02075       N->getOperand(2),
02076       N->getOperand(3),
02077       N->getOperand(4),
02078       N->getOperand(5),
02079       N->getOperand(6),
02080       N->getOperand(7),
02081       N->getOperand(8),
02082       N->getOperand(9),
02083       N->getOperand(10),
02084       N->getOperand(11),
02085       N->getOperand(12),
02086       N->getOperand(13),
02087       N->getOperand(14),
02088       N->getOperand(15),
02089       N->getOperand(16),
02090       N->getOperand(17),
02091       N->getOperand(18),
02092     };
02093     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
02094     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
02095         NewArgs);
02096   }
02097   }
02098 
02099   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
02100 }
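
// As a concrete case of the FP_TO_SINT combine above: Mesa lowers a GLSL
// comparison to (fneg (select_cc a, b, 1.0, 0.0, cc)), and fp_to_sint of
// that value is exactly (select_cc a, b, -1, 0, cc), i.e. the 0 / -1
// integer mask that one of the SET*_DX10 instructions produces directly.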
02101 
02102 static bool
02103 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
02104             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
02105   const R600InstrInfo *TII =
02106       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
02107   if (!Src.isMachineOpcode())
02108     return false;
02109   switch (Src.getMachineOpcode()) {
02110   case AMDGPU::FNEG_R600:
02111     if (!Neg.getNode())
02112       return false;
02113     Src = Src.getOperand(0);
02114     Neg = DAG.getTargetConstant(1, MVT::i32);
02115     return true;
02116   case AMDGPU::FABS_R600:
02117     if (!Abs.getNode())
02118       return false;
02119     Src = Src.getOperand(0);
02120     Abs = DAG.getTargetConstant(1, MVT::i32);
02121     return true;
02122   case AMDGPU::CONST_COPY: {
02123     unsigned Opcode = ParentNode->getMachineOpcode();
02124     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
02125 
02126     if (!Sel.getNode())
02127       return false;
02128 
02129     SDValue CstOffset = Src.getOperand(0);
02130     if (ParentNode->getValueType(0).isVector())
02131       return false;
02132 
02133     // Gather the constant values already used by this instruction.
02134     int SrcIndices[] = {
02135       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
02136       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
02137       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
02138       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
02139       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
02140       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
02141       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
02142       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
02143       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
02144       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
02145       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
02146     };
02147     std::vector<unsigned> Consts;
02148     for (int OtherSrcIdx : SrcIndices) {
02149       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
02150       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
02151         continue;
02152       if (HasDst) {
02153         OtherSrcIdx--;
02154         OtherSelIdx--;
02155       }
02156       if (RegisterSDNode *Reg =
02157           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
02158         if (Reg->getReg() == AMDGPU::ALU_CONST) {
02159           ConstantSDNode *Cst
02160             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
02161           Consts.push_back(Cst->getZExtValue());
02162         }
02163       }
02164     }
02165 
02166     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
02167     Consts.push_back(Cst->getZExtValue());
02168     if (!TII->fitsConstReadLimitations(Consts)) {
02169       return false;
02170     }
02171 
02172     Sel = CstOffset;
02173     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
02174     return true;
02175   }
02176   case AMDGPU::MOV_IMM_I32:
02177   case AMDGPU::MOV_IMM_F32: {
02178     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
02179     uint64_t ImmValue = 0;
02180 
02182     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
02183       ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
02184       float FloatValue = FPC->getValueAPF().convertToFloat();
02185       if (FloatValue == 0.0) {
02186         ImmReg = AMDGPU::ZERO;
02187       } else if (FloatValue == 0.5) {
02188         ImmReg = AMDGPU::HALF;
02189       } else if (FloatValue == 1.0) {
02190         ImmReg = AMDGPU::ONE;
02191       } else {
02192         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
02193       }
02194     } else {
02195       ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
02196       uint64_t Value = C->getZExtValue();
02197       if (Value == 0) {
02198         ImmReg = AMDGPU::ZERO;
02199       } else if (Value == 1) {
02200         ImmReg = AMDGPU::ONE_INT;
02201       } else {
02202         ImmValue = Value;
02203       }
02204     }
02205 
02206     // Check that we aren't already using an immediate.
02207     // XXX: It's possible for an instruction to have more than one
02208     // immediate operand, but this is not supported yet.
02209     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
02210       if (!Imm.getNode())
02211         return false;
02212       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
02213       assert(C);
02214       if (C->getZExtValue())
02215         return false;
02216       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
02217     }
02218     Src = DAG.getRegister(ImmReg, MVT::i32);
02219     return true;
02220   }
02221   default:
02222     return false;
02223   }
02224 }
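
// For example, a MOV_IMM_F32 of 1.0 folds to the dedicated AMDGPU::ONE
// register and a MOV_IMM_I32 of 0 to AMDGPU::ZERO, while a value such as
// 2.5f has no inline register and is routed through ALU_LITERAL_X with its
// raw bits written into the instruction's literal operand.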
02225 
02226 
02227 /// \brief Fold the instructions after selecting them.
02228 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
02229                                             SelectionDAG &DAG) const {
02230   const R600InstrInfo *TII =
02231       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
02232   if (!Node->isMachineOpcode())
02233     return Node;
02234   unsigned Opcode = Node->getMachineOpcode();
02235   SDValue FakeOp;
02236 
02237   std::vector<SDValue> Ops;
02238   for (const SDUse &I : Node->ops())
02239     Ops.push_back(I);
02240 
02241   if (Opcode == AMDGPU::DOT_4) {
02242     int OperandIdx[] = {
02243       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
02244       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
02245       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
02246       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
02247       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
02248       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
02249       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
02250       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
02251         };
02252     int NegIdx[] = {
02253       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
02254       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
02255       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
02256       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
02257       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
02258       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
02259       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
02260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
02261     };
02262     int AbsIdx[] = {
02263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
02264       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
02265       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
02266       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
02267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
02268       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
02269       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
02270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
02271     };
02272     for (unsigned i = 0; i < 8; i++) {
02273       if (OperandIdx[i] < 0)
02274         return Node;
02275       SDValue &Src = Ops[OperandIdx[i] - 1];
02276       SDValue &Neg = Ops[NegIdx[i] - 1];
02277       SDValue &Abs = Ops[AbsIdx[i] - 1];
02278       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
02279       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
02280       if (HasDst)
02281         SelIdx--;
02282       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
02283       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
02284         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
02285     }
02286   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
02287     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
02288       SDValue &Src = Ops[i];
02289       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
02290         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
02291     }
02292   } else if (Opcode == AMDGPU::CLAMP_R600) {
02293     SDValue Src = Node->getOperand(0);
02294     if (!Src.isMachineOpcode() ||
02295         !TII->hasInstrModifiers(Src.getMachineOpcode()))
02296       return Node;
02297     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
02298         AMDGPU::OpName::clamp);
02299     if (ClampIdx < 0)
02300       return Node;
02301     std::vector<SDValue> SrcOps; // Avoid shadowing the outer Ops vector.
02302     unsigned NumOp = Src.getNumOperands();
02303     for (unsigned i = 0; i < NumOp; ++i)
02304       SrcOps.push_back(Src.getOperand(i));
02305     SrcOps[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
02306     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
02307         Node->getVTList(), SrcOps);
02308   } else {
02309     if (!TII->hasInstrModifiers(Opcode))
02310       return Node;
02311     int OperandIdx[] = {
02312       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
02313       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
02314       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
02315     };
02316     int NegIdx[] = {
02317       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
02318       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
02319       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
02320     };
02321     int AbsIdx[] = {
02322       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
02323       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
02324       -1
02325     };
02326     for (unsigned i = 0; i < 3; i++) {
02327       if (OperandIdx[i] < 0)
02328         return Node;
02329       SDValue &Src = Ops[OperandIdx[i] - 1];
02330       SDValue &Neg = Ops[NegIdx[i] - 1];
02331       SDValue FakeAbs;
02332       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
02333       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
02334       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
02335       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
02336       if (HasDst) {
02337         SelIdx--;
02338         ImmIdx--;
02339       }
02340       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
02341       SDValue &Imm = Ops[ImmIdx];
02342       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
02343         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
02344     }
02345   }
02346 
02347   return Node;
02348 }
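
// For instance, if src0 of an ALU instruction with modifier operands was
// selected as (FNEG_R600 a), FoldOperand rewrites src0 to a and sets the
// matching src0_neg flag, so the negation rides along as an input modifier
// instead of costing a separate instruction.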