//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
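  // For reference, the legalization actions requested throughout this
  // constructor have the usual LLVM meanings:
  //   Legal  - the node is natively supported and left alone.
  //   Expand - the legalizer rewrites the node in terms of other nodes.
  //   Custom - this target's LowerOperation() is called to lower the node.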
  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
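  // Note: setLoadExtAction keys on the extension kind and the memory type of
  // the extending load; marking these Custom routes them through LowerLOAD.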
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::SUB, MVT::i64, Expand);

  // These should be replaced by UDIVREM, but it does not happen automatically
  // during Type Legalization.
  setOperationAction(ISD::UDIV, MVT::i64, Custom);
  setOperationAction(ISD::UREM, MVT::i64, Custom);
  setOperationAction(ISD::SDIV, MVT::i64, Custom);
  setOperationAction(ISD::SREM, MVT::i64, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
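    // (The _RET variants return a value through the LDS output queue; when
    // nothing reads that result, the _NORET form is cheaper.)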
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
          MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
                       MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // The instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
                                SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
            MF.getSubtarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    case Intrinsic::AMDGPU_rsq:
      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values
    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
    // considers some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::UDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM);
    break;
  }
  case ISD::UREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(UDIVREM.getValue(1));
    break;
  }
  case ISD::SDIV: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM);
    break;
  }
  case ISD::SREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
        N->getOperand(0), N->getOperand(1));
    Results.push_back(SDIVREM.getValue(1));
    break;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

    SDValue one = DAG.getConstant(1, HalfVT);
    SDValue zero = DAG.getConstant(0, HalfVT);

    // Hi/Lo split
    SDValue LHS = N->getOperand(0);
    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);

    SDValue RHS = N->getOperand(1);
    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);

    // Get Speculative values
    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

    SDValue REM_Hi = zero;
    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);

    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
    SDValue DIV_Lo = zero;

    const unsigned halfBitWidth = HalfVT.getSizeInBits();

    for (unsigned i = 0; i < halfBitWidth; ++i) {
      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
      // Get value of high bit
      SDValue HBit;
      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
      } else {
        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
      }

      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
          DAG.getConstant(halfBitWidth - 1, HalfVT));
      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);

      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);

      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);

      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

      // Update REM
      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);

      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
    }

    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
    Results.push_back(DIV);
    Results.push_back(REM);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements();
       i != e; ++i) {
    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
        Vector, DAG.getConstant(i, getVectorIdxTy())));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)),
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter out the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue One = DAG.getConstant(1, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the Shift == 0 special case.
  // Without it, CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps; the alternative is to
  // add a conditional to filter out the special case.
  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  //   select_cc f32, f32, -1,  0, cc_supported
  //   select_cc f32, f32, 1.0f, 0.0f, cc_supported
  //   select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
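  // If True/False are reversed relative to what SET* produces, invert the
  // condition code (and possibly swap LHS/RHS) so the patterns above still
  // apply with the operands in the right slots.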
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  //   select_cc f32, 0.0, f32, f32, cc_supported
  //   select_cc f32, 0.0, i32, i32, cc_supported
  //   select_cc i32, 0,   f32, f32, cc_supported
  //   select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
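  // Strategy: first materialize the comparison result as a hardware boolean
  // (-1/0 for i32, 1.0f/0.0f for f32) with one SELECT_CC, then use a second
  // SELECT_CC on that boolean to pick between the original True/False values.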
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Ret.getNode()) {
    return Ret;
  }
  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns the constant buffer block offset: 512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, DL);
  }

  // Lower constant address space loads from global variables.
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      ScalarizeVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
  if (Ret.getNode()) {
    SDValue Ops[2] = {
      Ret,
      Chain
    };
    return DAG.getMergeValues(Ops, DL);
  }

  // Lower loads of global variables in the constant address space.
  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
      isa<GlobalVariable>(
          GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {

    SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
        getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
        DAG.getConstant(2, MVT::i32));
    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
                       LoadNode->getChain(), Ptr,
                       DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
  }

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      ScalarizeVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the constant position encoded with the following formula:
        //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr, computed by LLVM using an alignment of 16.
        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and then
        // divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
                           makeArrayRef(Slots, NumElements));
    } else {
      // A non-constant pointer can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }
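  // Worked instance of the encoding above (illustrative): a scalar load from
  // CONSTANT_BUFFER_1 (kc_bank == 1, so ConstantBlock == 512 + 4096) with
  // Ptr == 32 (const_index == 2 at 16-byte alignment) yields, for channel 0,
  //   NewPtr = 32 + (512 + 4096) * 16 = 73760,
  // and 73760 / 4 == 18440 == ((512 + (1 << 12) + 2) << 2) + 0,
  // which is exactly the position the formula asks for after the divide-by-4
  // at the ISel step.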
  // For most operations, returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to
  // the buffer. However, SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->isInvariant(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }
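  // Illustrative expansion: an i8 SEXTLOAD producing an i32 result uses
  // ShiftAmount == 32 - 8 == 24, i.e.
  //   (sextload i8 %p)  ->  (sra (shl (extload i8 %p), 24), 24)
  // The left shift moves the loaded byte's sign bit into bit 31, and the
  // arithmetic right shift replicates it across the upper 24 bits.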
  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
      getTargetMachine().getSubtargetImpl()->getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}
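// Note on operand order in the lowering above: ISD::BRCOND carries its
// operands as (chain, condition, jump target), while AMDGPUISD::BRANCH_COND
// as built here takes (chain, jump target, condition), so the last two
// operands are swapped rather than forwarded in order.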
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->getShaderType();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get the load source type if this argument has been scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
                              DAG.getConstant(36 + PartOffset, MVT::i32),
                              DAG.getUNDEF(MVT::i32),
                              PtrInfo,
                              MemVT, false, true, true, 4);
    // 4 is the preferred alignment for the CONSTANT memory space.

    InVals.push_back(Arg);
  }
  return Chain;
}
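// Illustrative layout following from the 36 + PartOffset computation above
// (a sketch, assuming the calling convention assigns consecutive 4-byte slots
// to i32 parts): for a compute kernel `void f(int a, int b)`, dispatch
// information occupies bytes [0, 36) of CONSTANT_BUFFER_0, so `a` would be
// loaded from byte offset 36 and `b` from byte offset 40.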
EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static SDValue CompactSwizzlableVector(
    SelectionDAG &DAG, SDValue VectorEntry,
    DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies, and additionally make the assembly
      // easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}
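// Illustrative run of CompactSwizzlableVector: for
//   build_vector %a, 0.0, 1.0, %a
// the constants are replaced with the hardware selects SEL_0 and SEL_1, and
// the duplicate of %a in lane 3 is remapped onto lane 0, giving
//   RemapSwizzle = { 1 -> 4 (SEL_0), 2 -> 5 (SEL_1), 3 -> 0 }
// and (build_vector %a, undef, undef, undef) as the compacted vector.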
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
                         ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
                         ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx.
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values.
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}
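// Illustrative run of ReorganizeVector: for
//   build_vector (extractelt %v, 1), (extractelt %v, 0), %x, %y
// no lane extracts its own index, so nothing is marked unmovable; the first
// pass of the second loop then swaps lanes 0 and 1 so that each extract lands
// in the lane matching its element index, recording
//   RemapSwizzle = { 0 -> 1, 1 -> 0 }.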
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0),        // LHS
                       SelectCC.getOperand(1),        // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4));       // CC
  }

  // insert_vector_elt (build_vector elt0, ..., eltN), NewEltIdx, idx
  //   => build_vector elt0, ..., NewEltIdx, ..., eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit.
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted.
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element.
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector.
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    // Note: this case has no terminating break and falls through to the
    // SELECT_CC combine below.
  }

  case ISD::SELECT_CC: {
    // Try common optimizations first.
    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
    if (Ret.getNode())
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //   selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //   selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
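// Illustrative instance of the selectcc fold above (for an integer compare):
//   (selectcc (selectcc x, y, a, b, setlt), b, a, b, setne)
// compares the inner result against b, so the setne form simply returns the
// inner node unchanged, while the seteq form inverts the inner condition and
// becomes (selectcc x, y, a, b, setge).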
static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values.
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst =
              cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}
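// Illustrative fold (a sketch of what FoldOperand does for source modifiers;
// the MUL_IEEE parent is just an example opcode): given a machine node such as
//   MUL_IEEE (FNEG_R600 %a), %b
// the FNEG_R600 copy is stripped, %a is wired directly into src0, and the
// instruction's src0_neg operand is set to 1, so the negation is carried by
// a source modifier instead of a separate instruction.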
/// \brief Fold the instructions after selecting them.
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (const SDUse &I : Node->ops())
    Ops.push_back(I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> Ops;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      Ops.push_back(Src.getOperand(i));
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}